def __init__(self):
    """Build the single-pendulum gym-style environment.

    Creates a headless simulated robot and the matching
    action/observation Box spaces.
    """
    # Physical limits used to size the spaces below.
    self.max_speed = 1000.
    self.max_torque = 2.

    # Headless simulation of the single pendulum, 10 ms step,
    # with (sin, cos) state encoding enabled.
    self.robot = Robot("single_pendulum.urdf")
    self.robot.time_step = .01
    self.robot.GUI_ENABLED = 0
    self.robot.setupSim()
    self.robot.SINCOS = 1

    # Reward weighting; only the first term is active.
    self.reward_weights = [1., 0.0, 0.0]

    # Episode bookkeeping; max length comes from the tc config module.
    self.step_count = 0
    self.episode_duration = tc.NSTEPS

    # Observation is presumably (cos q, sin q, qdot) given SINCOS=1
    # -- TODO confirm against Robot.
    high = np.array([1., 1., self.max_speed], dtype=np.float32)
    self.action_space = spaces.Box(
        low=-self.max_torque,
        high=self.max_torque,
        shape=(1,),
        dtype=np.float32,
    )
    self.observation_space = spaces.Box(
        low=-high,
        high=high,
        dtype=np.float32,
    )
# --- Hyper-parameters, mirrored from the tc config module ---
QVALUE_LEARNING_RATE = tc.QVALUE_LEARNING_RATE  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE      # Discount factor
UPDATE_RATE = tc.UPDATE_RATE    # Homotopy rate to update the networks
REPLAY_SIZE = tc.REPLAY_SIZE    # Size of replay buffer
BATCH_SIZE = tc.BATCH_SIZE      # Number of points fed per stochastic-gradient step
NH1 = tc.NH1
NH2 = tc.NH2                    # Hidden layer sizes
range_esp = tc.range_esp
time_step = tc.time_step

reward_weights = [1., 0.0, 0.00]
sim_number = 2
RANDSET = 0

# Two simulations: a headless one for training and a second
# instance used only for rendering.
env = Robot("single_pendulum.urdf")
env_rend = Robot("single_pendulum.urdf", sim_number=sim_number)  # for rendering
env.RANDSET = 0

step_expl = 0.
epi_expl = 0.

NX = 3  # State dim: training converges with q,qdot with 2x more neurones.
NU = 1  # Control is dim-1: joint torque


def angle_normalize(x):
    """Distance (radians) from ``x`` to the nearest multiple of 2*pi, in [0, pi]."""
    wrapped = x % (2 * np.pi)
    return min(wrapped, abs(wrapped - 2 * np.pi))
# --- Hyper-parameters, mirrored from the tc config module ---
NSTEPS = tc.NSTEPS  # Max episode length
QVALUE_LEARNING_RATE = tc.QVALUE_LEARNING_RATE  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE    # Discount factor
UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate to update the networks
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of replay buffer
BATCH_SIZE = tc.BATCH_SIZE    # Number of points fed per stochastic-gradient step
# FIX: both sizes were read from tc.NH1 (`NH1 = NH2 = tc.NH1`); take each
# hidden-layer size from its own config entry, consistent with the
# training-side configuration elsewhere in this project.
NH1 = tc.NH1
NH2 = tc.NH2  # Hidden layer sizes
range_esp = tc.range_esp
time_step = tc.time_step

# Load the trained policy for this simulation run.
SIM_NUMBER = 1.2
model = DDPG.load("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))

# GUI-enabled robot used to replay/evaluate the learned policy.
robot = Robot("single_pendulum.urdf")
robot.sim_number = 1
robot.RANDSET = 0
robot.GUI_ENABLED = 1
robot.SINCOS = 1
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

# Evaluate policy / check convergence (see below).
# --- Train, time the run, and persist the model ---
start_time = time.time()
model.learn(total_timesteps=NSTEPS * NEPISODES)
end_time = time.time()
elapsed_time = end_time - start_time
model.save("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))
print('elapsed ' + str(elapsed_time) + 's')

# Evaluate the trained policy, then release the simulation.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=21)
env.robot.stopSim()

### save video
# Logging/video-capturing robot for the 2-link acrobot.
robot = Robot("double_pendulum.urdf")
robot.sim_number = SIM_NUMBER
# FIX: was a bare `RANDSET = 0`, which only rebinds a module-level name and
# leaves the robot's flag untouched; the sibling evaluation script sets
# `robot.RANDSET = 0`, so the attribute is set on the robot here as well.
robot.RANDSET = 0
robot.LOGDATA = 1
robot.SINCOS = 1
robot.video_path = "/home/pasquale/Desktop/thesis/thesis-code/2D_Acrobot/stable_baselines/Video"
path_log = "/home/pasquale/Desktop/thesis/thesis-code/2D_Acrobot/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

for i in range(NSTEPS):
    # Observation for the 2-link robot: (cos, sin, qdot) per joint.
    # NOTE(review): assumes states_sincos/states_dot indices [1] and [2]
    # address the two revolute links -- confirm against the Robot class.
    obs = np.array([
        robot.states_sincos[1][0],
        robot.states_sincos[1][1],
        robot.states_dot[1][3],
        robot.states_sincos[2][0],
        robot.states_sincos[2][1],
        robot.states_dot[2][3],
    ])
# Geometry taken from the URDF:
#   second revolute joint at [0.023, 0., 0.1]
#   center of gravity of link 2 at [-0.0050107, 1.9371E-10, 0.10088]
# The rotation is about x and all joint reference frames share the same
# orientation (as does the link-2 CoG), so only y and z contribute to the
# length -- and y is always 0, leaving just z.
# The Acrobot tip is taken at the CoG of link 2 -> goal position below.
# NOTE(review): the comment above lists link-2 CoG z = 0.10088 while the
# code uses 0.1088 -- confirm which value matches the URDF.
length = .035 + .01 + 0.1088
goal = np.array([0, 0, length])

# Headless training environment plus a GUI twin used only for rendering.
env = Robot("double_pendulum.urdf")
env_rend = Robot("double_pendulum.urdf", sim_number=SIM_NUMBER)  # for rendering

env.GUI_ENABLED = 0
env.RANDSET = 0
env.SINCOS = 1
env.time_step = time_step
env.actuated_index = [2]
env.max_torque = 5.0

env_rend.SINCOS = 1
env_rend.GUI_ENABLED = 1
env_rend.actuated_index = [2]