print "Seed = %d" % RANDOM_SEED np .random.seed (RANDOM_SEED) random.seed (RANDOM_SEED) #env = Pendulum(2,withDisplay=True) # Continuous pendulum env = Pendulum(2,length=.5,mass=3.0,armature=.2,withDisplay=False) env.withSinCos = False # State is dim-3: (cosq,sinq,qdot) ... NX = env.nobs # ... training converges with q,qdot with 2x more neurones. NU = env.nu # Control is dim-1: joint torque env.vmax = 100. env.Kf = np.diagflat([ 0.2, 2. ]) env.modulo = False env.DT = 0.15 env.NDT = 1 #env.umax = 15. #env.umax = (15.,15.) env.umax = np.matrix([5.,10.]).T NSTEPS = 32 env.qlow[1] = -np.pi env.qup [1] = np.pi # Shortcut function to convert SE3 to 7-dof vector. M2gv = lambda M: XYZQUATToViewerConfiguration(se3ToXYZQUAT(M)) def place(objectId,M): robot.viewer.gui.applyConfiguration(objectId, M2gv(M)) robot.viewer.gui.refresh() # Refresh the window.
DECAY_RATE  = 0.99                     # Discount factor
UPDATE_RATE = 0.01                     # Homotopy rate to update the networks
REPLAY_SIZE = 10000                    # Size of the replay buffer
BATCH_SIZE  = 64                       # Number of points fed to each stochastic-gradient step
NH1 = NH2   = 250                      # Hidden-layer sizes
RESTORE     = "netvalues/actorcritic"  # Previously optimized net weights
                                       # (set to the empty string for none)

### --- Environment
env = Pendulum(1)         # Continuous pendulum
env.withSinCos = True     # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs             # ... training also converges with q,qdot but needs 2x more neurons.
NU = env.nu               # Control is dim-1: joint torque
env.vmax = 100.
env.DT = .15
env.NDT = 2
env.Kf = 0.2
NSTEPS = 30

### --- Q-value and policy networks
class QValueNetwork:
    def __init__(self):
        # Count the variables existing before this net is built, so its own
        # trainable variables can be sliced out later.
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])
        netx1 = tflearn.fully_connected(x, NH1,
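
# --- Sketch (not from the original script): the usual role of UPDATE_RATE in
# actor-critic training is a "soft" (homotopy) update of the target-network
# weights, theta_target <- tau*theta + (1-tau)*theta_target. The argument
# lists vars_main/vars_target are hypothetical stand-ins for the trainable
# variables of the main and target networks.
def soft_update_ops(vars_main, vars_target, tau=UPDATE_RATE):
    return [vt.assign(tau * vm + (1. - tau) * vt)
            for vm, vt in zip(vars_main, vars_target)]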
BATCH_SIZE = 64        # Number of points fed to each stochastic-gradient step
NH1 = NH2  = 250       # Hidden-layer sizes
RESTORE    = ""        # "netvalues/actorcritic.15.kf2"  # Previously optimized net weights
                       # (set to the empty string for none)
RENDERRATE = 20        # Render rate (rollout and plot) during training (0 = no rendering)
# RENDERACTION = ['saveweights', 'draw', 'rollout']
REGULAR = True         # Render on a regular grid rather than a random grid

### --- Environment
env = Pendulum(1)         # Continuous pendulum
env.withSinCos = True     # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs             # ... training also converges with q,qdot but needs 2x more neurons.
NU = env.nu               # Control is dim-1: joint torque
env.DT = .15
env.NDT = 2
env.Kf = 0.2
env.vmax = 100
RENDERACTION = ['draw', ]

'''
env = Pendulum(2, length=.5, mass=3.0, armature=10.)
env.withSinCos = True     # State is sin/cos encoded ...
NX = env.nobs             # ... training also converges with q,qdot but needs 2x more neurons.
NU = env.nu               # Control: one torque per joint
env.DT = 0.2
env.NDT = 1
env.Kf = 10.0   # 1.0
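
# --- Sketch (not from the original script): one plausible way RENDERRATE and
# RENDERACTION gate the display during training. maybe_render and the callback
# dictionary are hypothetical; only the gating logic is the point.
def maybe_render(episode, callbacks):
    if RENDERRATE and episode % RENDERRATE == 0:
        for action in RENDERACTION:    # e.g. ['draw'] as configured above
            callbacks[action]()        # e.g. callbacks = {'draw': plot_policy}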
BATCH_SIZE = 64        # Number of points fed to each stochastic-gradient step
NH1 = NH2  = 250       # Hidden-layer sizes
RESTORE = "netvalues/actorcritic.dt015.kf02.ep1300"  # Previously optimized net weights
                                                     # (set to the empty string for none)

### --- Environment
env = Pendulum(1)         # Continuous pendulum
env.withSinCos = True     # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs             # ... training also converges with q,qdot but needs 2x more neurons.
NU = env.nu               # Control is dim-1: joint torque
env.vmax = 100.
env.Kf = 0.2
env.modulo = False
env.DT = 0.15
env.NDT = 1

NSTEPS = 32                  # Number of integration steps in the horizon
NNODES = 8                   # Number of shooting nodes
FNODES = NSTEPS // NNODES    # Number of integration steps per shooting interval ...
assert not NSTEPS % NNODES   # ... which should be an integer

### --- Q-value and policy networks
class QValueNetwork:
    def __init__(self):
        # Count the variables existing before this net is built, so its own
        # trainable variables can be sliced out later.
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])
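
# --- Sketch (not from the original script): how the NSTEPS-long horizon is
# partitioned into NNODES shooting intervals of FNODES integration steps each.
# With NSTEPS=32 and NNODES=8, node 0 covers steps 0..3 and node 7 covers
# steps 28..31; the index arithmetic below is the only point being illustrated.
for k in range(NNODES):
    first = k * FNODES            # First integration step of shooting node k.
    last = first + FNODES - 1     # Last integration step of shooting node k.
    print("shooting node %d: steps %d..%d" % (k, first, last))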