QVALUE_LEARNING_RATE = tc.QVALUE_LEARNING_RATE  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE    # Discount factor
UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate for the target-network updates
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of the replay buffer
BATCH_SIZE = tc.BATCH_SIZE    # Number of samples fed to each stochastic-gradient step
NH1 = tc.NH1
NH2 = tc.NH2                  # Hidden layer sizes
range_esp = tc.range_esp      # Exploration noise range
time_step = tc.time_step
reward_weights = [1., 0.0, 0.00]

sim_number = 2
RANDSET = 0
env = Robot("single_pendulum.urdf")
env_rend = Robot("single_pendulum.urdf", sim_number=sim_number)  # for rendering
env.RANDSET = 0
step_expl = 0.
epi_expl = 0.

NX = 3  # State is [sin(q), cos(q), qdot]; training also converges with (q, qdot) given 2x more neurons
NU = 1  # Control is dim-1: joint torque


def angle_normalize(x):
    # Absolute angular distance from the upright position, mapped to [0, pi]
    return min(x % (2 * np.pi), abs(x % (2 * np.pi) - 2 * np.pi))
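# Quick sanity check of angle_normalize (illustrative, not part of the
# training script): it returns the absolute angular distance to 0 rad,
# so q and -q map to the same value and the result stays in [0, pi].
if __name__ == "__main__":
    assert abs(angle_normalize(0.0)) < 1e-9
    assert abs(angle_normalize(np.pi) - np.pi) < 1e-9
    assert abs(angle_normalize(2 * np.pi)) < 1e-9
    assert abs(angle_normalize(-np.pi / 2) - np.pi / 2) < 1e-9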
NSTEPS = tc.NSTEPS  # Max episode length
QVALUE_LEARNING_RATE = tc.QVALUE_LEARNING_RATE  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE    # Discount factor
UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate for the target-network updates
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of the replay buffer
BATCH_SIZE = tc.BATCH_SIZE    # Number of samples fed to each stochastic-gradient step
NH1 = NH2 = tc.NH1            # Hidden layer size
range_esp = tc.range_esp
time_step = tc.time_step

SIM_NUMBER = 1.2
model = DDPG.load("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))

robot = Robot("single_pendulum.urdf")
robot.sim_number = 1
robot.RANDSET = 0
robot.GUI_ENABLED = 1
robot.SINCOS = 1
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

# Evaluate policy
#env.robot.stopSim()
#env = PendulumPyB()
# Check convergence
# compare
# convergence
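# A minimal rollout sketch (an assumption, mirroring the playback loop used
# elsewhere in this repo) to eyeball convergence of the loaded policy in the
# GUI: build the [sin(q), cos(q), qdot] observation from the robot state and
# step the deterministic policy.
for _ in range(NSTEPS):
    robot.observeState()
    obs = np.array([robot.states_sincos[1][0],
                    robot.states_sincos[1][1],
                    robot.states_dot[1][3]])
    action, _ = model.predict(obs, deterministic=True)
    robot.simulateDyn(action.tolist())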
end_time = time.time()
elapsed_time = end_time - start_time
model.save("ddpg_pendulum_stb_baselines_" + str(SIM_NUMBER))
print('elapsed ' + str(elapsed_time) + 's')

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=21)
env.robot.stopSim()
#mean_reward, std_reward = evaluate_policy(model, env_eval, n_eval_episodes=10)
#print(f'Mean reward: {mean_reward} +/- {std_reward:.2f}')

### save video
# model = DDPG.load("ddpg_pendulum_stb_baselines")
robot = Robot("single_pendulum.urdf")
robot.sim_number = SIM_NUMBER
robot.RANDSET = 0
robot.LOGDATA = 1
robot.SINCOS = 1
robot.video_path = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/Video"
path_log = "/home/pasquale/Desktop/thesis/thesis-code/1D_pendulum/stable_baselines/"
robot.time_step = time_step
robot.setupSim()

for i in range(NSTEPS):
    obs = np.array([robot.states_sincos[1][0],
                    robot.states_sincos[1][1],
                    robot.states_dot[1][3]])
    action, _states = model.predict(obs)
    action = action.tolist()
    robot.simulateDyn(action)
    time.sleep(0.05)
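# A sketch (an assumption, reusing angle_normalize and reward_weights defined
# earlier in the repo) of recomputing the training reward during playback, to
# cross-check the evaluate_policy estimate printed above.
episode_return = 0.0
for i in range(NSTEPS):
    robot.observeState()
    q = robot.states[1][3]
    qdot = robot.states_dot[1][3]
    obs = np.array([robot.states_sincos[1][0], robot.states_sincos[1][1], qdot])
    u, _ = model.predict(obs)
    robot.simulateDyn(u.tolist())
    episode_return += (-angle_normalize(q)**2 * reward_weights[0]
                       - qdot**2 * reward_weights[1]
                       - reward_weights[2] * float(u[0])**2)
print('playback return: ' + str(episode_return))
robot.stopSim()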
# second revolute joint        [0.023, 0., 0.1]
# center of gravity of link 2  [-0.0050107, 1.9371E-10, 0.10088]
#
# Since the rotation is about x and all joint reference frames have the same
# orientation (as does the link-2 CoG), only y and z matter for the length
# (actually only z, since y is always 0).
#
# The tip of the Acrobot is taken to be the center of gravity of link 2
# -> goal position
length = .035 + .01 + 0.1088
goal = np.array([0, 0, length])

env = Robot("double_pendulum.urdf")
env_rend = Robot("double_pendulum.urdf", sim_number=SIM_NUMBER)  # for rendering
env.GUI_ENABLED = 0
env.RANDSET = 0
env.SINCOS = 1
env.time_step = time_step
env.actuated_index = [2]
env.max_torque = 5.0

env_rend.SINCOS = 1
env_rend.GUI_ENABLED = 1
env_rend.actuated_index = [2]
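# A minimal sketch (an assumption, not the training reward) of how the goal
# position above could be used: a dense reward from the Euclidean distance
# between the Acrobot tip and the upright goal. `tip_pos` is a hypothetical
# input that would come from the simulator's forward kinematics.
def goal_distance_reward(tip_pos, goal_pos=goal):
    # Largest (zero) when the tip coincides with the goal
    return -np.linalg.norm(np.asarray(tip_pos) - goal_pos)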
UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate for the target-network updates
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of the replay buffer
BATCH_SIZE = tc.BATCH_SIZE    # Number of samples fed to each stochastic-gradient step
NH1 = tc.NH1
NH2 = tc.NH2                  # Hidden layer sizes
range_esp = tc.range_esp      # Exploration noise range
time_step = tc.time_step
reward_weights = [1., 0.0, 0.00]

SIM_NUMBER = 0
RANDSET = 0
env = Robot("single_pendulum.urdf")
env_rend = Robot("single_pendulum.urdf", sim_number=SIM_NUMBER)  # for rendering
step_expl = 0.
epi_expl = 0.

NX = 3  # State is [sin(q), cos(q), qdot]; training also converges with (q, qdot) given 2x more neurons
NU = 1  # Control is dim-1: joint torque


def angle_normalize(x):
    # Absolute angular distance from the upright position, mapped to [0, pi]
    return min(x % (2 * np.pi), abs(x % (2 * np.pi) - 2 * np.pi))
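# A minimal sketch of the soft ("homotopy") target update that UPDATE_RATE
# parameterizes in DDPG: target <- tau * online + (1 - tau) * target.
# The function and variable names here are illustrative, not from the repo.
def soft_update(target_weights, online_weights, tau=UPDATE_RATE):
    return [tau * w + (1.0 - tau) * wt
            for w, wt in zip(online_weights, target_weights)]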
NSTEPS = tc.NSTEPS  # Max episode length
QVALUE_LEARNING_RATE = tc.QVALUE_LEARNING_RATE  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = tc.POLICY_LEARNING_RATE  # Base learning rate for the policy network
DECAY_RATE = tc.DECAY_RATE    # Discount factor
UPDATE_RATE = tc.UPDATE_RATE  # Homotopy rate for the target-network updates
REPLAY_SIZE = tc.REPLAY_SIZE  # Size of the replay buffer
BATCH_SIZE = tc.BATCH_SIZE    # Number of samples fed to each stochastic-gradient step
NH1 = tc.NH1
NH2 = tc.NH2                  # Hidden layer sizes
range_esp = tc.range_esp      # Exploration noise range
reward_weights = [1., 0.0, 0.00]

sim_number = 12
RANDSET = 0
env = Robot("single_pendulum.urdf")
env_rend = Robot("single_pendulum.urdf", sim_number=sim_number)  # for rendering
env.RANDSET = 0
step_expl = 0.
epi_expl = 0.

NX = 3  # State is [sin(q), cos(q), qdot]; training also converges with (q, qdot) given 2x more neurons
NU = 1  # Control is dim-1: joint torque


def angle_normalize(x):
    # Absolute angular distance from the upright position, mapped to [0, pi]
    return min(x % (2 * np.pi), abs(x % (2 * np.pi) - 2 * np.pi))
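# A sketch of the uniform exploration noise that range_esp, step_expl and
# epi_expl suggest: noise of magnitude range_esp, shrunk as steps and
# episodes accumulate. The decay form is an assumption, not from the repo.
def exploration_noise(step, episode):
    scale = range_esp / (1.0 + step_expl * step + epi_expl * episode)
    return np.random.uniform(-scale, scale, size=NU)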
class PendulumPyB(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self):
        self.max_speed = 1000.
        self.max_torque = 2.
        #self.dt = .05
        #self.viewer = None
        self.robot = Robot("single_pendulum.urdf")
        self.robot.time_step = .01
        self.robot.GUI_ENABLED = 0
        self.robot.setupSim()
        self.robot.SINCOS = 1
        self.reward_weights = [1., 0.0, 0.0]
        self.step_count = 0
        self.episode_duration = tc.NSTEPS
        high = np.array([1., 1., self.max_speed], dtype=np.float32)
        self.action_space = spaces.Box(low=-self.max_torque,
                                       high=self.max_torque,
                                       shape=(1,),
                                       dtype=np.float32)
        self.observation_space = spaces.Box(low=-high, high=high,
                                            dtype=np.float32)

    def step(self, u):
        #dt = self.dt
        self.step_count += 1
        end_episode = False
        #u = np.clip(u, -self.max_torque, self.max_torque)[0]
        self.robot.simulateDyn([u[0]])
        # Quadratic cost on angle error, velocity and torque, weighted by reward_weights
        reward = (-angle_normalize(self.robot.states[1][3])**2 * self.reward_weights[0]
                  - self.robot.states_dot[1][3]**2 * self.reward_weights[1]
                  - self.reward_weights[2] * (u[0]**2))
        if not self.step_count % self.episode_duration:
            end_episode = True
        return self._get_obs(), reward, end_episode, {}

    def reset(self):
        self.robot.resetRobot()
        return self._get_obs()

    def _get_obs(self):
        self.robot.observeState()
        return np.array([self.robot.states_sincos[1][0],
                         self.robot.states_sincos[1][1],
                         self.robot.states_dot[1][3]])

    """
    def render(self, mode='human'):
        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(500, 500)
            self.viewer.set_bounds(-2.2, 2.2, -2.2, 2.2)
            rod = rendering.make_capsule(1, .2)
            rod.set_color(.8, .3, .3)
            self.pole_transform = rendering.Transform()
            rod.add_attr(self.pole_transform)
            self.viewer.add_geom(rod)
            axle = rendering.make_circle(.05)
            axle.set_color(0, 0, 0)
            self.viewer.add_geom(axle)
            fname = path.join(path.dirname(__file__), "assets/clockwise.png")
            self.img = rendering.Image(fname, 1., 1.)
            self.imgtrans = rendering.Transform()
            self.img.add_attr(self.imgtrans)
        self.viewer.add_onetime(self.img)
        self.pole_transform.set_rotation(self.state[0] + np.pi / 2)
        if self.last_u:
            self.imgtrans.scale = (-self.last_u / 2, np.abs(self.last_u) / 2)
        return self.viewer.render(return_rgb_array=mode == 'rgb_array')
    """

    """
    def close(self):
        if self.viewer:
            self.viewer.close()
            self.viewer = None
    """
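# A minimal usage sketch (illustrative, not part of the training scripts):
# run the environment with random actions to check that observation shapes,
# rewards and episode termination behave as expected before training on it.
if __name__ == "__main__":
    env = PendulumPyB()
    obs = env.reset()
    total = 0.0
    done = False
    while not done:
        obs, reward, done, info = env.step(env.action_space.sample())
        total += float(reward)
    print('random-policy episode return: ' + str(total))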