UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate to update the networks
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of replay buffer
BATCH_SIZE = tc.BATCH_SIZE    # Number of points to be fed in stochastic gradient
NH1 = tc.NH1
NH2 = tc.NH2                  # Hidden layer sizes
range_esp = tc.range_esp
time_step = tc.time_step

reward_weights = [1., 0.0, 0.00]

sim_number = 2
RANDSET = 0
env = Robot("single_pendulum.urdf")
env_rend = Robot("single_pendulum.urdf", sim_number=sim_number)  # for rendering
env.RANDSET = 0

step_expl = 0.
epi_expl = 0.

NX = 3  # State is the sin/cos encoding of q plus qdot; training also converges with raw (q, qdot) but needs roughly 2x more neurons
NU = 1  # Control is dim-1: joint torque


def angle_normalize(x):
    # Absolute angular distance to 0 (the upright target), wrapped into [0, pi]
    return min(x % (2 * np.pi), abs(x % (2 * np.pi) - 2 * np.pi))


class QValueNetwork:
    def __init__(self):
        nvars = len(tf.compat.v1.trainable_variables())
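        # --- Illustrative continuation (an assumption, not the original body) ---
        # `nvars` records how many trainable variables already exist so that the
        # variables created by this network can be sliced out afterwards.  A
        # typical DDPG critic with hidden sizes NH1/NH2 takes the state x and the
        # control u and outputs a scalar Q-value; the placeholder and layer
        # wiring below is only a sketch of that standard architecture.
        x = tf.compat.v1.placeholder(tf.float32, [None, NX], name="state")
        u = tf.compat.v1.placeholder(tf.float32, [None, NU], name="control")
        h1 = tf.compat.v1.layers.dense(x, NH1, activation=tf.nn.relu)
        h2 = tf.compat.v1.layers.dense(tf.concat([h1, u], axis=1), NH2,
                                       activation=tf.nn.relu)
        qvalue = tf.compat.v1.layers.dense(h2, 1, activation=None)

        self.x = x
        self.u = u
        self.qvalue = qvalue
        self.variables = tf.compat.v1.trainable_variables()[nvars:]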
# tip of Acrobot is considered to be in the center of gravity of link2
# -> goal position
length = .035 + .01 + 0.1088
goal = np.array([0, 0, length])

env = Robot("double_pendulum.urdf")
env_rend = Robot("double_pendulum.urdf", sim_number=SIM_NUMBER)  # for rendering
env.GUI_ENABLED = 0
env.RANDSET = 0
env.SINCOS = 1
env.time_step = time_step
env.actuated_index = [2]
env.max_torque = 5.0
env_rend.SINCOS = 1
env_rend.GUI_ENABLED = 1
env_rend.actuated_index = [2]

step_expl = 0.
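# Illustrative use of `goal` (an assumption, not code from the original script):
# a typical reward-shaping term for the Acrobot penalizes the Euclidean distance
# between the tip of link 2 and the goal point defined above.  `tip_position` is
# a hypothetical 3-vector that would come from the simulator state.
def distance_to_goal(tip_position):
    return np.linalg.norm(np.asarray(tip_position) - goal)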
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE                      # Discount factor
UPDATE_RATE = tc.UPDATE_RATE                    # Homotopy rate to update the networks
REPLAY_SIZE = tc.REPLAY_SIZE                    # Size of replay buffer
BATCH_SIZE = tc.BATCH_SIZE                      # Number of points to be fed in stochastic gradient
NH1 = NH2 = tc.NH1                              # Hidden layer size
range_esp = tc.range_esp
time_step = tc.time_step

SIM_NUMBER = 1.2
model = DDPG.load("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))

robot = Robot("single_pendulum.urdf")
robot.sim_number = 1
robot.RANDSET = 0
robot.GUI_ENABLED = 1
robot.SINCOS = 1
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

# Evaluate policy
# env.robot.stopSim()
# env = PendulumPyB()

# Check convergence
# compare
# convergence
# training time
# average reward
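# Sketch of the convergence / average-reward check hinted at in the notes above
# (an assumption, not the original evaluation code).  stable-baselines provides
# evaluate_policy, which rolls the loaded model out on a Gym environment such as
# the PendulumPyB wrapper referenced in the commented-out lines.
from stable_baselines.common.evaluation import evaluate_policy

def average_reward(model, gym_env, n_eval_episodes=10):
    # Mean and standard deviation of the undiscounted episode return.
    mean_reward, std_reward = evaluate_policy(model, gym_env,
                                              n_eval_episodes=n_eval_episodes)
    return mean_reward, std_reward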
        rsum += reward
        # print(rsum)
    print(x)
    if verbose:
        print('Lasted ', i, ' timestep -- total reward:', rsum)

signal.signal(signal.SIGTSTP, lambda x, y: rendertrial())  # Roll-out when CTRL-Z is pressed

h_rwd = []
h_qva = []
h_ste = []

## Environment Configuration ##
env.GUI_ENABLED = 0
env.RANDSET = RANDSET
env.SINCOS = 1
env.time_step = time_step
env.setupSim()

## Training Loop ##
start_time = time.time()
for episode in range(1, NEPISODES):
    env.resetRobot()
    x = np.array([[env.states_sincos[1][0], env.states_sincos[1][1],
                   env.states_dot[1][3]]])  # remove .T
    rsum = 0.0
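    # --- Illustrative outline of the rest of the episode (an assumption) -----
    # The remainder of the original loop is not shown here.  A standard DDPG
    # episode proceeds roughly as follows (helper names are hypothetical):
    #   for step in range(NSTEPS):
    #       u = policy(x) + exploration_noise(range_esp)   # actor output + exploration
    #       x2, reward = env_step(env, u)                  # advance the PyBullet sim
    #       replay_buffer.append((x, u, reward, x2))       # store the transition
    #       sample BATCH_SIZE transitions, update the critic and the actor
    #       soft-update targets: theta_t <- UPDATE_RATE*theta + (1-UPDATE_RATE)*theta_t
    #       x = x2; rsum += reward
    #   h_rwd.append(rsum)                                 # logged for the learning curve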