def main(control="gym-like"):
    """Drive the Kuka arm from GUI debug sliders.

    "gym-like" mode feeds the 3-component action to env.step();
    any other mode feeds the 5-component action to env.step2().
    """
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    motorsIds = []
    if control == "gym-like":
        dv = 1
        motorsIds.append(env._p.addUserDebugParameter("posX", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("posY", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("yaw", -dv, dv, 0))
    else:
        dv = 0.01
        motorsIds.append(env._p.addUserDebugParameter("posX", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("posY", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("posZ", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("yaw", -dv, dv, 0))
        motorsIds.append(env._p.addUserDebugParameter("fingerAngle", 0, 0.3, 0.3))

    done = False
    i = 0
    while not done:
        time.sleep(0.01)
        action = []
        for motorId in motorsIds:
            action.append(env._p.readUserDebugParameter(motorId))
        if control == "gym-like":
            state, reward, done, info = env.step(action)
        else:
            state, reward, done, info = env.step2(action)
        obs = env.getExtendedObservation()
        print(i)
        print(f"Action: {action}")
        print(f"Observation: {obs}")
        i += 1
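# Minimal entry point for the slider demo above (a sketch; the original file's own
# __main__ guard is not shown in this snippet). Assumes `time` and KukaGymEnv are
# already imported at module level.
if __name__ == "__main__":
    main(control="gym-like")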
def main():
    environment = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    motorsIds = []
    #motorsIds.append(environment._p.addUserDebugParameter("posX",0.4,0.75,0.537))
    #motorsIds.append(environment._p.addUserDebugParameter("posY",-.22,.3,0.0))
    #motorsIds.append(environment._p.addUserDebugParameter("posZ",0.1,1,0.2))
    #motorsIds.append(environment._p.addUserDebugParameter("yaw",-3.14,3.14,0))
    #motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))
    dv = 0.01
    motorsIds.append(environment._p.addUserDebugParameter("posX", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("posY", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("posZ", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("yaw", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))

    done = False
    while not done:
        action = []
        for motorId in motorsIds:
            action.append(environment._p.readUserDebugParameter(motorId))
        state, reward, done, info = environment.step2(action)
        obs = environment.getExtendedObservation()
def main():
    environment = KukaGymEnv(renders=True)
    motorsIds = []
    #motorsIds.append(environment._p.addUserDebugParameter("posX",0.4,0.75,0.537))
    #motorsIds.append(environment._p.addUserDebugParameter("posY",-.22,.3,0.0))
    #motorsIds.append(environment._p.addUserDebugParameter("posZ",0.1,1,0.2))
    #motorsIds.append(environment._p.addUserDebugParameter("yaw",-3.14,3.14,0))
    #motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))
    dv = 0.001
    motorsIds.append(environment._p.addUserDebugParameter("posX", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("posY", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("posZ", -dv, dv, -dv))
    motorsIds.append(environment._p.addUserDebugParameter("yaw", -dv, dv, 0))
    motorsIds.append(environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))

    done = False
    while not done:
        action = []
        for motorId in motorsIds:
            action.append(environment._p.readUserDebugParameter(motorId))
        state, reward, done, info = environment.step2(action)
        obs = environment.getExtendedObservation()
def test_timing_profile(self):
    from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
    import time
    kuka_env = KukaGymEnv(renders=False, isDiscrete=False)  # operates at 240 Hz
    task = generate_task(task_generator_id="pushing")
    causal_rl_env = CausalWorld(task=task,
                                enable_visualization=False,
                                seed=0,
                                skip_frame=10,
                                normalize_actions=False,
                                normalize_observations=False)  # operates at 250 Hz

    start = time.time()
    kuka_env.reset()
    end = time.time()
    kuka_reset_time = end - start

    start = time.time()
    causal_rl_env.reset()
    end = time.time()
    causal_rl_reset_time = end - start
    self.assertLess(causal_rl_reset_time, kuka_reset_time * 1.25)

    start = time.time()
    kuka_env.step(kuka_env.action_space.sample())
    end = time.time()
    kuka_step_time = end - start

    start = time.time()
    causal_rl_env.step(causal_rl_env.action_space.sample())
    end = time.time()
    causal_rl_step_time = end - start
    print("causal_rl step time", causal_rl_step_time)
    print("kuka step time", kuka_step_time)
    self.assertLess(causal_rl_step_time, kuka_step_time * 10)

    start = time.time()
    kuka_env.render()
    end = time.time()
    kuka_render_time = end - start

    start = time.time()
    causal_rl_env.render()
    end = time.time()
    causal_rl_render_time = end - start
    self.assertLess(causal_rl_render_time, kuka_render_time * 1.25)

    causal_rl_env.close()
    kuka_env.close()
    return
def main():
    env = KukaGymEnv(renders=True)
    act = deepq.load("kuka_model.pkl")
    print(act)
    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def initialize(self, render=False):
    self.initialized = True
    self._env = KukaGymEnv(renders=render)
    self.observation_space = self._env.observation_space.shape
    self.action_space = self._env.action_space.shape
    self.state = {}
    initial_obs = self.reset()
    return initial_obs
def main_usingEnvOnly():
    environment = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    randomObjs = add_random_objs_to_scene(10)
    motorsIds = []
    motorsIds.append(environment._p.addUserDebugParameter("posX", 0.4, 0.75, 0.537))
    motorsIds.append(environment._p.addUserDebugParameter("posY", -.22, .3, 0.0))
    motorsIds.append(environment._p.addUserDebugParameter("posZ", 0.1, 1, 0.2))
    motorsIds.append(environment._p.addUserDebugParameter("yaw", -3.14, 3.14, 0))
    motorsIds.append(environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))
    # dv = 0.01
    # motorsIds.append(environment._p.addUserDebugParameter("posX",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("posY",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("posZ",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("yaw",-dv,dv,0))
    # motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))

    done = False
    # According to the hand-eye coordination paper, the camera is mounted over the shoulder of the arm
    camEyePos = [0.03, 0.236, 0.54]
    targetPos = [0.640000, 0.075000, -0.190000]
    cameraUp = [0, 0, 1]
    viewMat = p.computeViewMatrix(camEyePos, targetPos, cameraUp)
    camInfo = p.getDebugVisualizerCamera()
    projMatrix = camInfo[3]
    # viewMat = [-0.5120397806167603, 0.7171027660369873, -0.47284144163131714, 0.0, -0.8589617609977722, -0.42747554183006287, 0.28186774253845215, 0.0, 0.0, 0.5504802465438843, 0.8348482847213745, 0.0, 0.1925382763147354, -0.24935829639434814, -0.4401884973049164, 1.0]
    # projMatrix = [0.75, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, -1.0000200271606445, -1.0, 0.0, 0.0, -0.02000020071864128, 0.0]
    img_arr = p.getCameraImage(640, 512, viewMatrix=viewMat, projectionMatrix=projMatrix)  # 640*512*3
    # write_from_imgarr(img_arr, 1)

    while not done:
        action = []
        for motorId in motorsIds:
            action.append(environment._p.readUserDebugParameter(motorId))
        state, reward, done, info = environment.step2(action)
        obs = environment.getExtendedObservation()
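# Hypothetical sketch of the add_random_objs_to_scene() helper called above; its real
# implementation is not part of this snippet. Assumes the random_urdfs set shipped with
# pybullet_data and drops num_objects of them at random poses over the tray area.
import os
import random
import pybullet as p
import pybullet_data


def add_random_objs_to_scene(num_objects):
    object_ids = []
    for _ in range(num_objects):
        idx = random.randint(0, 899)  # random_urdfs/000 ... random_urdfs/899
        urdf_path = os.path.join(pybullet_data.getDataPath(),
                                 "random_urdfs/%03d/%03d.urdf" % (idx, idx))
        pos = [0.4 + 0.2 * random.random(), 0.3 * (random.random() - 0.5), 0.15]
        orn = p.getQuaternionFromEuler([0, 0, 3.14 * (random.random() - 0.5)])
        object_ids.append(p.loadURDF(urdf_path, pos, orn))
    return object_ids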
def main():
    env = KukaGymEnv(renders=False, isDiscrete=True)
    model = deepq.models.mlp([64])
    act = deepq.learn(env,
                      q_func=model,
                      lr=1e-3,
                      max_timesteps=10000000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    print("Saving model to kuka_model.pkl")
    act.save("kuka_model.pkl")
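# The deepq training scripts above and below pass a `callback` that is not defined in
# these snippets. A minimal sketch, assuming the OpenAI Baselines convention in which the
# callback receives the local/global namespaces and returns True to stop training early;
# the reward threshold used here is only a placeholder.
def callback(lcl, _glb):
    is_solved = (lcl['t'] > 100 and
                 sum(lcl['episode_rewards'][-101:-1]) / 100 >= 50)
    return is_solved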
def main():
    env = KukaGymEnv(renders=False, isDiscrete=True)
    act = deepq.learn(env,
                      network=models.mlp(num_hidden=64, num_layers=1),
                      lr=1e-3,
                      max_timesteps=100000000,
                      buffer_size=50000,
                      exploration_fraction=0.1,
                      exploration_final_eps=0.02,
                      print_freq=10,
                      callback=callback)
    print("Saving model to kuka_model.pkl")
    act.save("kuka_model.pkl")
def main():
    env = KukaGymEnv(renders=True, isDiscrete=True)
    act = deepq.load("kuka_model.pkl")
    print(act)
    while True:
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def main():
    env = KukaGymEnv(renders=True, isDiscrete=False, maxSteps=10000000)
    save_path = os.path.join("saves", "ddpg-")
    os.makedirs(save_path, exist_ok=True)
    device = torch.device("cuda")

    act_net = model.DDPGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0],
                               N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = common.TargetNet(act_net)
    tgt_crt_net = common.TargetNet(crt_net)

    writer = SummaryWriter(comment="-d4pg_")
    agent = model.AgentDDPG(act_net, device=device)
    exp_source = experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA,
                                                      steps_count=REWARD_STEPS)
    buffer = experience.ExperienceReplayBuffer(exp_source, buffer_size=REPLAY_SIZE)
    act_opt = optim.Adam(act_net.parameters(), lr=LEARNING_RATE)
    crt_opt = optim.Adam(crt_net.parameters(), lr=LEARNING_RATE)

    frame_idx = 0
    best_reward = None
    with common.RewardTracker(writer) as tracker:
        with common.TBMeanTracker(writer, batch_size=10) as tb_tracker:
            while True:
                frame_idx += 1
                buffer.populate(1)
                rewards_steps = exp_source.pop_rewards_steps()
                if rewards_steps:
                    rewards, steps = zip(*rewards_steps)
                    tb_tracker.track("episode_steps", steps[0], frame_idx)
                    tracker.reward(rewards[0], frame_idx)

                if len(buffer) < 100:
                    continue

                batch = buffer.sample(BATCH_SIZE)
                states_v, actions_v, rewards_v, dones_mask, last_states_v = \
                    common.unpack_batch_ddqn(batch, device)

                # train critic
                crt_opt.zero_grad()
                crt_distr_v = crt_net(states_v, actions_v)
                last_act_v = tgt_act_net.target_model(last_states_v)
                last_distr_v = F.softmax(tgt_crt_net.target_model(last_states_v,
                                                                  last_act_v), dim=1)
                proj_distr_v = distr_projection(last_distr_v, rewards_v, dones_mask,
                                                gamma=GAMMA**REWARD_STEPS,
                                                device=device)
                prob_dist_v = -F.log_softmax(crt_distr_v, dim=1) * proj_distr_v
                critic_loss_v = prob_dist_v.sum(dim=1).mean()
                critic_loss_v.backward()
                crt_opt.step()
                tb_tracker.track("loss_critic", critic_loss_v, frame_idx)

                # train actor
                act_opt.zero_grad()
                cur_actions_v = act_net(states_v)
                crt_distr_v = crt_net(states_v, cur_actions_v)
                actor_loss_v = -crt_net.distr_to_q(crt_distr_v)
                actor_loss_v = actor_loss_v.mean()
                actor_loss_v.backward()
                act_opt.step()
                tb_tracker.track("loss_actor", actor_loss_v, frame_idx)

                # soft-update target networks
                tgt_act_net.alpha_sync(alpha=1 - 1e-3)
                tgt_crt_net.alpha_sync(alpha=1 - 1e-3)

                if frame_idx % TEST_ITERS == 0:
                    print("testing")
                    env.reset()
                    ts = time.time()
                    rewards, steps = test_net(act_net, env, device=device)
                    print("Test done in %.2f sec, reward %.3f, steps %d" % (
                        time.time() - ts, rewards, steps))
                    writer.add_scalar("test_reward", rewards, frame_idx)
                    writer.add_scalar("test_steps", steps, frame_idx)
                    if best_reward is None or best_reward < rewards:
                        if best_reward is not None:
                            print("Best reward updated: %.3f -> %.3f" % (best_reward,
                                                                         rewards))
                        name = "best_%+.3f_%d.dat" % (rewards, frame_idx)
                        fname = os.path.join(save_path, name)
                        torch.save(act_net.state_dict(), fname)
                        best_reward = rewards
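# test_net() is called in the training loop above but not defined in this snippet.
# A minimal sketch, assuming the usual PTAN-style evaluation helper: run a few
# deterministic episodes with the current actor and report mean reward and steps.
# Assumes numpy is imported as np alongside torch.
def test_net(net, env, count=10, device="cpu"):
    rewards = 0.0
    steps = 0
    for _ in range(count):
        obs = env.reset()
        while True:
            obs_v = torch.FloatTensor([obs]).to(device)
            mu_v = net(obs_v)  # deterministic action from the actor
            action = mu_v.squeeze(dim=0).data.cpu().numpy()
            action = np.clip(action, -1, 1)  # keep the action inside the env bounds
            obs, reward, done, _ = env.step(action)
            rewards += reward
            steps += 1
            if done:
                break
    return rewards / count, steps / count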
from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv

max_step = 1000
episode_count = 0
max_episodes = 50

env = KukaGymEnv(renders=True)
observation = env.reset()

while episode_count < max_episodes:
    env.render()
    action = env.action_space.sample()  # your agent here (this takes random actions)
    print(action)
    observation, reward, done, info = env.step(action)

    # Reset environment if done
    if done:
        observation = env.reset()
        episode_count += 1

env.close()
# add parent dir to find package. Only needed for source code build, pip install doesn't need it.
import os, inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(os.path.dirname(currentdir))
os.sys.path.insert(0, parentdir)

from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
import time

environment = KukaGymEnv(renders=True)
motorsIds = []
#motorsIds.append(environment._p.addUserDebugParameter("posX",0.4,0.75,0.537))
#motorsIds.append(environment._p.addUserDebugParameter("posY",-.22,.3,0.0))
#motorsIds.append(environment._p.addUserDebugParameter("posZ",0.1,1,0.2))
#motorsIds.append(environment._p.addUserDebugParameter("yaw",-3.14,3.14,0))
#motorsIds.append(environment._p.addUserDebugParameter("fingerAngle",0,0.3,.3))
dv = 0.001
motorsIds.append(environment._p.addUserDebugParameter("posX", -dv, dv, 0))
motorsIds.append(environment._p.addUserDebugParameter("posY", -dv, dv, 0))
motorsIds.append(environment._p.addUserDebugParameter("posZ", -dv, dv, -dv))
motorsIds.append(environment._p.addUserDebugParameter("yaw", -dv, dv, 0))
motorsIds.append(environment._p.addUserDebugParameter("fingerAngle", 0, 0.3, .3))

done = False
while not done:
    action = []
    # Loop body completed to match the identical slider demos above:
    # read the sliders and feed them to step2().
    for motorId in motorsIds:
        action.append(environment._p.readUserDebugParameter(motorId))
    state, reward, done, info = environment.step2(action)
    obs = environment.getExtendedObservation()
config['trainBatchSize'] = 32
config['gamma'] = 0.9
config['learningRate'] = 0.001
config['netGradClip'] = 1
config['logFlag'] = True
config['logFileName'] = 'StabilizerOneDLog/traj'
config['logFrequency'] = 1000
config['priorityMemoryOption'] = False
config['netUpdateOption'] = 'doubleQ'
config['netUpdateFrequency'] = 1
config['priorityMemory_absErrUpper'] = 5

import gym
from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv

env = KukaGymEnv(renders=True, isDiscrete=True)

N_S = env.observation_space.shape[0]
N_A = env.action_space.n

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])
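# The agent class that consumes policyNet/targetNet is not shown in this snippet. As an
# illustrative sketch only (not the project's actual agent), epsilon-greedy action
# selection against the policyNet defined above could look like this; `epsilon` and the
# torch/random imports are assumptions.
import random
import torch


def select_action(policyNet, state, epsilon, n_actions=N_A):
    if random.random() < epsilon:
        return random.randrange(n_actions)  # explore
    with torch.no_grad():
        state_v = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        q_values = policyNet(state_v)
        return int(q_values.argmax(dim=1).item())  # exploit: greedy w.r.t. Q-values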
class AuxBulletEnv(gym.Env, AuxEnv):

    PTCLOUD_BOX_SIZE = 2.0  # 2m cube cut off for point cloud observations

    def __init__(self,
                 base_env_name,
                 env_v=0,
                 obs_resolution=64,
                 obs_ptcloud=False,
                 random_colors=False,
                 debug=False,
                 visualize=False):
        self._base_env_name = base_env_name
        self._random_colors = random_colors
        self._mobile = False
        if base_env_name.startswith(
            ('Hopper', 'Walker2D', 'Ant', 'Humanoid', 'HalfCheetah',
             'InvertedPendulumSwingup')):
            self._mobile = True  # update base pos when rendering
        # If we created a bullet sim env from scratch, would init sim as:
        # conn_mode = pybullet.GUI if visualize else pybullet.DIRECT
        # self._p = bullet_client.BulletClient(connection_mode=conn_mode)
        # But all bullet gym envs init bullet client in constructor or reset.
        # Kuka creates bullet client in the constructor, so need to call
        # its constructor explicitly. All other envs derive from classes in
        # pybullet_envs.env_bases, which init bullet client in reset().
        # We set the corresponding render flag for them after their creation.
        if 'Kuka' in base_env_name:
            max_episode_steps = 1000
            self._env = KukaGymEnv(renders=visualize, maxSteps=max_episode_steps)
            self._env._max_episode_steps = max_episode_steps
            # Appropriate camera vals from KukaCamGymEnv
            self._base_pos = [0.52, -0.2, -0.33]
            self._env.unwrapped._cam_dist = 1.3
            self._env.unwrapped._cam_yaw = 180
            self._env.unwrapped._cam_pitch = -41
        else:
            self._env = gym.make(base_env_name + 'BulletEnv-v' + str(env_v))
            max_episode_steps = self._env._max_episode_steps
            if 'CartPole' in base_env_name:
                self._env.render(mode='rgb_array')  # init cam dist vars
            if visualize:
                self._env.render(mode='human')  # turn on debug viz
        super(AuxBulletEnv, self).__init__(max_episode_steps, obs_resolution,
                                           obs_ptcloud, debug, visualize)
        # Zoom in camera.
        assert (hasattr(self._env.unwrapped, '_cam_dist'))
        if base_env_name.startswith('Reacher'):
            self._env.unwrapped._cam_dist = 0.55
            self._env.unwrapped._cam_pitch = -89  # -90 doesn't look ok on debug
        elif base_env_name.startswith('InvertedPendulum'):
            self._env.unwrapped._cam_dist = 2.2  # was: 1.8
        elif base_env_name.startswith('InvertedDoublePendulum'):
            self._env.unwrapped._cam_dist = 2.8
            self._env.unwrapped._cam_pitch = -1.0
            self._part_nms = ['cart', 'pole', 'pole2']
        elif base_env_name.startswith(('Hopper', 'Walker2D')):
            self._env.unwrapped._cam_dist = 1.5
            self._env.unwrapped._cam_pitch = -40
        elif base_env_name.startswith('Humanoid'):
            self._env.unwrapped._cam_dist = 1.8
            self._env.unwrapped._cam_pitch = -40
        self._env.reset()  # load URDF files, init _p sim pointer
        self._sim = self._env.unwrapped._p  # easy access to bullet client
        # Set reasonable colors and env defaults.
        if 'CartPole' in base_env_name:
            cartpole = self._env.unwrapped.cartpole
            self._sim.changeVisualShape(cartpole, 0, rgbaColor=(1, 1, 0, 1))
            self._sim.changeVisualShape(cartpole, 1, rgbaColor=(0, 0, 1, 1))
        # Compute camera info. This has to be done after self._env.reset()
        self._view_mat, self._proj_mat, base_pos = self.compute_cam_vals()
        # Specify camera objects for point cloud processing.
        self._cam_object_ids = None
        if self.obs_ptcloud:
            assert (self._base_env_name == 'CartPole')
            self._cam_object_ids = [self._env.unwrapped.cartpole]
        # Note: PyBullet envs with XML(MJCF)-based robots seem to have
        # issues with reporting body ids and names correctly.
        #for tmpi in range(self._sim.getNumBodies()):
        #    robot_name, scene_name = self._sim.getBodyInfo(tmpi)
        #    robot_name = robot_name.decode("utf8")
        #    body_id = self._sim.getBodyUniqueId(tmpi)
        #    print('body id:', body_id, robot_name, scene_name)
        # Specify observation and action spaces.
        self._ld_names = self.compute_low_dim_state_names()
        self._ld_names_ids_dict = \
            {k: v for v, k in enumerate(self.low_dim_state_names)}
        if obs_resolution is None:
            self.observation_space = self._env.observation_space  # low dim
        elif self.obs_ptcloud:
            state_sz = obs_resolution * 3  # 3D points in point cloud
            self.observation_space = gym.spaces.Box(
                -1.0 * AuxBulletEnv.PTCLOUD_BOX_SIZE * np.ones(state_sz),
                AuxBulletEnv.PTCLOUD_BOX_SIZE * np.ones(state_sz))
        else:  # RGB images
            self.observation_space = gym.spaces.Box(
                low=0.0, high=1.0,
                shape=[3, obs_resolution, obs_resolution],  # HxWxRGB
                dtype=np.float32)
        self.action_space = self._env.action_space
        # Turn on visualization if needed.
        if visualize:
            self._sim.resetDebugVisualizerCamera(
                self._env.unwrapped._cam_dist, self._env.unwrapped._cam_yaw,
                self._env.unwrapped._cam_pitch, base_pos)
            if self.obs_resolution is not None:
                pybullet.configureDebugVisualizer(pybullet.COV_ENABLE_GUI, 1)
                pybullet.configureDebugVisualizer(
                    pybullet.COV_ENABLE_RGB_BUFFER_PREVIEW, 1)
                pybullet.configureDebugVisualizer(
                    pybullet.COV_ENABLE_DEPTH_BUFFER_PREVIEW, 1)
                pybullet.configureDebugVisualizer(
                    pybullet.COV_ENABLE_SEGMENTATION_MARK_PREVIEW, 1)
        if self.debug:
            print('Created Aux', base_env_name, 'with observation_space',
                  self.observation_space, 'action_space', self.action_space,
                  'max_episode_steps', self.max_episode_steps)

    @property
    def low_dim_state_space(self):
        return self._env.observation_space

    @property
    def low_dim_state_names(self):
        return self._ld_names

    def seed(self, seed):
        np.random.seed(seed)

    def close(self):
        self._env.unwrapped.close()

    def reset(self):
        self.reset_aggregators()
        if self._random_colors:
            self.reset_colors()
        state = self._env.reset()
        if self.obs_resolution is None:  # low dim state
            obs = np.clip(state, self.observation_space.low,
                          self.observation_space.high)
        else:  # pixel or ptcloud state
            obs = self.render_obs()
        # For mobile envs: could look at the robot from different angles.
        #if self._mobile:
        #    self._env.unwrapped._cam_yaw = (np.random.rand()-0.5)*360
        if self.visualize and self._mobile:
            self._sim.resetDebugVisualizerCamera(
                self._env.unwrapped._cam_dist, self._env.unwrapped._cam_yaw,
                self._env.unwrapped._cam_pitch, self.get_base_pos())
        return obs

    def step(self, action):
        state, rwd, done, info = self._env.step(action)  # apply action
        if self.obs_resolution is None:
            obs = np.clip(state, self.observation_space.low,
                          self.observation_space.high)
        else:
            obs = self.render_obs()
        done, more_info = self.update_aggregators(rwd, done)
        info.update(more_info)
        if self.debug:  # print low-dim state
            msg = f'pid {os.getpid():d} step {self.stepnum:d} act '
            if isinstance(action, np.ndarray):
                msg += np.array2string(
                    action, precision=2,
                    formatter={'float_kind': '{:0.2f}'.format},
                )
            else:
                msg += str(action)
            for i in range(len(self.low_dim_state_names)):
                msg += f' {self.low_dim_state_names[i]:s} {state[i]:0.4f}'
            print(msg)
        #if self.debug:
        #    # sanity check: aux should be same as low-dim state reported by env
        #    # except for envs that modify state after applying action (Hopper).
        #    low_dim_state = self.compute_low_dim_state()
        #    if np.abs(state - low_dim_state).sum() > 1e-6:
        #        print('        state', state)
        #        print('low_dim_state', low_dim_state)
        #        assert(False)
        if self.visualize and self._mobile:
            self._sim.resetDebugVisualizerCamera(
                self._env.unwrapped._cam_dist, self._env.unwrapped._cam_yaw,
                self._env.unwrapped._cam_pitch, self.get_base_pos())
        return obs, rwd, done, info

    def render(self, mode='rgb_array'):
        pass  # done automatically with PyBullet visualization

    def render_obs(self, override_resolution=None, debug_out_dir=None):
        if self._mobile and not self.obs_ptcloud:
            self._view_mat, self._proj_mat, _ = self.compute_cam_vals()
        obs = render_utils.render_obs(
            self._sim, self.stepnum, self.obs_resolution, override_resolution,
            self._env.unwrapped._cam_dist, self.get_base_pos(),
            self._env.unwrapped._cam_pitch, self._env.unwrapped._cam_yaw,
            self.obs_ptcloud, self._cam_object_ids, self._base_env_name,
            AuxBulletEnv.PTCLOUD_BOX_SIZE, debug_out_dir,
            self._view_mat, self._proj_mat)
        return obs

    def normalize_reward(self, rwd):
        # This could be a useful function for algorithms that expect episode
        # rewards to be closer to a small range, like [0,1] or [-1,1].
        #
        # Rewards in [0,1] for CartPole, InvertedPendulum
        # Rewards close to [-?,10] for InvertedDoublePendulum
        # (bonus for 'alive', penalty for dist from vertical)
        # Rewards close to [-1,1] for InvertedPendulumSwingup
        #
        # Locomotion envs have several non-trivial components contributing to
        # the reward. It still might make sense to divide by total number of
        # steps, but the resulting range is not that simple to compute.
        #
        return rwd / self.max_episode_steps

    def compute_low_dim_state_names(self):
        if 'CartPole' in self._base_env_name:
            nms = ['theta', 'theta_vel', 'x', 'x_vel']
        elif 'InvertedPendulum' in self._base_env_name:
            nms = ['x', 'x_vel', 'theta_cos', 'theta_sin', 'theta_vel']
        elif 'InvertedDoublePendulum' in self._base_env_name:
            nms = [
                'x', 'x_vel', 'elbow_x', 'theta_cos', 'theta_sin', 'theta_vel',
                'theta1_cos', 'theta1_sin', 'theta1_vel'
            ]
        elif self._base_env_name.startswith(
            ('Hopper', 'Walker', 'HalfCheetah', 'Ant', 'Humanoid')):
            # Note: _vel are scaled by 0.3 in these envs to be in ~[-1,1];
            # the reported state is clipped to [-5,5]
            nms = [
                'z_delta', 'tgt_theta_sin', 'tgt_theta_cos', 'x_vel', 'y_vel',
                'z_vel', 'body_roll', 'body_pitch'
            ]
            njoints = len(self._env.unwrapped.robot.ordered_joints)
            for j in range(njoints):
                pfx = 'j' + str(j)
                nms.append(pfx + '_pos')
                nms.append(pfx + '_vel')
            nfeet = len(self._env.unwrapped.robot.foot_list)
            for ft in range(nfeet):
                nms.append('ft' + str(ft) + '_contact')
        elif 'Reacher' in self._base_env_name:
            nms = [
                'tgt_x', 'tgt_y', 'to_tgt_vec_x', 'to_tgt_vec_y', 'theta_cos',
                'theta_sin', 'theta_vel', 'theta1', 'theta1_vel'
            ]
        elif 'Kuka' in self._base_env_name:
            nms = [
                'grp_x', 'grp_y', 'grp_z', 'grp_r', 'grp_p', 'grp_y',
                'blk_rel_x', 'blk_rel_y', 'blk_rel_eulz'
            ]
        else:
            assert (False), f'Unknown BulletEnv {self._base_env_name:s}'
        return nms

    def compute_low_dim_state(self):
        # Get low-dim state. Since there is no unified interface for bullet
        # gym envs to get the low-dim state, here handle special cases.
        # Code comes from step() method of the underlying bullet gym envs.
        # Note: some envs modify state within step() call, so this function
        # is only useful for getting initial low dim state (e.g. after reset).
        if 'CartPole' in self._base_env_name:
            # state = theta, theta_dot, x, x_dot
            cartpole = self._env.unwrapped.cartpole
            state = list(self._sim.getJointState(cartpole, 1)[0:2])
            state.extend(self._sim.getJointState(cartpole, 0)[0:2])
        elif self._base_env_name.startswith('Kuka'):
            # Kuka state = grp_pos, grp_euler, block_rel_pos
            # pos,euler are 3D; block_rel_pos is XYEulZ:
            # relative x,y position and euler angle of block in gripper space
            # Racecar state = ball_rel_x, ball_rel_y
            # Racecar env is a emulation of the MIT RC Racecar: reward is based
            # on distance to the randomly placed ball; observations are ball
            # position (x,y) in camera frame; camera follows car's body frame.
            state = self._env.unwrapped.getExtendedObservation()
        else:
            # envs in pybullet_envs.gym_*_envs.py use calc_state()
            state = self._env.unwrapped.robot.calc_state()
        return np.array(state)

    def compute_cam_vals(self):
        base_pos = self.get_base_pos()
        view_matrix = self._sim.computeViewMatrixFromYawPitchRoll(
            cameraTargetPosition=base_pos,
            distance=self._env.unwrapped._cam_dist,
            yaw=self._env.unwrapped._cam_yaw,
            pitch=self._env.unwrapped._cam_pitch,
            roll=0, upAxisIndex=2)
        proj_matrix = self._sim.computeProjectionMatrixFOV(fov=60,
                                                           aspect=1.0,
                                                           nearVal=0.1,
                                                           farVal=100.0)
        return view_matrix, proj_matrix, base_pos

    def get_base_pos(self):
        pos = [0, 0, 0]
        if hasattr(self, '_base_pos'):
            return self._base_pos
        env = self._env.unwrapped
        if hasattr(env, 'robot'):
            if hasattr(env.robot, 'body_xyz'):  # for pybullet_envs/env_bases.py
                pos = env.robot.body_xyz
            elif hasattr(env.robot, 'slider'):  # for pendulum envs
                x, _ = env.robot.slider.current_position()
                pos = [x, 0, 0]
        return pos

    def reset_colors(self):
        # Randomly select a color scheme.
        base_clrs = [(1, 1, 0, 1), (0, 1, 1, 1), (1, 0, 1, 1), (0, 0, 1, 1)]
        pole_clrs = [(0, 0, 1, 1), (0, 0, 0, 1), (0, 1, 0, 1), (1, 0, 0, 1)]
        pole2_clrs = [(1, 0, 0, 1), (0, 0, 1, 1), (1, 1, 0, 1), (0, 0, 0, 1)]
        pole3_clrs = [(0.8, 0.8, 0.8, 1), (0.7, 0, 0.7, 1), (0.7, 0, 0, 1),
                      (0, 0.8, 0.8, 1)]
        clr_scheme_id = np.random.randint(len(base_clrs))
        base_clr = base_clrs[clr_scheme_id]
        pole_clr = pole_clrs[clr_scheme_id]
        pole2_clr = pole2_clrs[clr_scheme_id]
        pole3_clr = pole3_clrs[clr_scheme_id]
        #print('base_clrs', base_clrs, 'pole_clr', pole_clr)
        if 'CartPole' in self._base_env_name:
            cartpole = self._env.unwrapped.cartpole
            self._sim.changeVisualShape(cartpole, 0, rgbaColor=base_clr)
            self._sim.changeVisualShape(cartpole, 1, rgbaColor=pole_clr)
        elif 'Pendulum' in self._base_env_name:
            base = self._env.unwrapped.robot.parts["cart"]
            pole = self._env.unwrapped.robot.parts["pole"]
            self._sim.changeVisualShape(base.bodies[base.bodyIndex],
                                        base.bodyPartIndex, rgbaColor=base_clr)
            self._sim.changeVisualShape(pole.bodies[pole.bodyIndex],
                                        pole.bodyPartIndex, rgbaColor=pole_clr)
            if 'Double' in self._base_env_name:
                pole2 = self._env.unwrapped.robot.parts["pole2"]
                #print('pole2_clr', pole2_clr)
                self._sim.changeVisualShape(pole2.bodies[pole2.bodyIndex],
                                            pole2.bodyPartIndex,
                                            rgbaColor=pole2_clr)
        elif self._base_env_name.startswith(('Walker2D', 'HalfCheetah', 'Ant')):
            if self._base_env_name.startswith('Walker2D'):
                part_nms = [
                    'torso', 'thigh', 'leg', 'foot', 'thigh_left', 'leg_left',
                    'foot_left'
                ]
            elif self._base_env_name.startswith('HalfCheetah'):
                part_nms = [
                    'torso', 'bthigh', 'bshin', 'bfoot', 'fthigh', 'fshin',
                    'ffoot'
                ]
            elif self._base_env_name.startswith('Ant'):
                part_nms = [
                    'torso', 'front_left_foot', 'front_right_foot',
                    'left_back_foot', 'right_back_foot'
                ]
            else:
                assert (False)  # clrs for env not specified
            clrs = [
                pole_clr, pole2_clr,
                base_clr, pole_clr, base_clr, pole2_clr, pole3_clr
            ]
            for i, part_nm in enumerate(part_nms):
                part = self._env.unwrapped.robot.parts[part_nm]
                self._sim.changeVisualShape(part.bodies[part.bodyIndex],
                                            part.bodyPartIndex,
                                            rgbaColor=clrs[i])
        # pitches = [-60, -35, -40, -20]; yaws = [-180, -20, 120, 270]
        # self._env.unwrapped._cam_pitch = pitches[clr_scheme_id]
        # self._env.unwrapped._cam_yaw = yaws[clr_scheme_id]

    def override_state(self, low_dim_state):
        # TODO: add support for more envs in the future.
        assert ('CartPole' in self._base_env_name or
                'Pendulum' in self._base_env_name)
        unwrp_env = self._env.unwrapped
        ids_dict = self._ld_names_ids_dict
        assert (len(ids_dict.keys()) == len(low_dim_state))
        if self._base_env_name.startswith('CartPole'):
            self._sim.resetJointState(  # set pole pos,vel
                unwrp_env.cartpole, 1,
                low_dim_state[ids_dict['theta']],
                low_dim_state[ids_dict['theta_vel']])
            self._sim.resetJointState(  # set cart pos,vel
                unwrp_env.cartpole, 0,
                low_dim_state[ids_dict['x']],
                low_dim_state[ids_dict['x_vel']])
        elif 'Pendulum' in self._base_env_name:
            unwrp_env.robot.slider.reset_current_position(
                low_dim_state[ids_dict['x']], low_dim_state[ids_dict['x_vel']])
            theta = np.arctan2(low_dim_state[ids_dict['theta_sin']],
                               low_dim_state[ids_dict['theta_cos']])
            unwrp_env.robot.j1.reset_current_position(
                theta, low_dim_state[ids_dict['theta_vel']])
            if 'Double' in self._base_env_name:
                theta1 = np.arctan2(low_dim_state[ids_dict['theta1_sin']],
                                    low_dim_state[ids_dict['theta1_cos']])
                unwrp_env.robot.j2.reset_current_position(
                    theta1, low_dim_state[ids_dict['theta1_vel']])
        unwrp_env.state = self.compute_low_dim_state()
        return None
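# Hedged usage sketch for AuxBulletEnv (not part of the original file). Assumes the rest
# of the package (AuxEnv, render_utils, imports for gym/np/pybullet) is available and
# exercises the 'Kuka' branch with image observations.
if __name__ == '__main__':
    env = AuxBulletEnv('Kuka', obs_resolution=64, debug=True, visualize=False)
    obs = env.reset()
    for _ in range(10):
        obs, rwd, done, info = env.step(env.action_space.sample())
        if done:
            obs = env.reset()
    env.close()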
class PPO:

    def __init__(self, hyperparameters):
        """
        Args:
            hyperparameters (dict): a dictionary of hyperparameters
        """
        # Extract hyperparameters
        self.lr = hyperparameters['learning_rate']
        self.discount = hyperparameters['discount_rate']
        self.num_batch_transitions = hyperparameters['num_batch_transitions']
        self.state_dim = hyperparameters['state_dim']
        self.action_dim = hyperparameters['action_dim']
        self.total_train_steps = hyperparameters['total_train_steps']
        self.max_episode_length = hyperparameters['max_episode_length']
        self.num_train_epochs = hyperparameters['num_train_epochs']
        self.device = hyperparameters['device']
        # batch_size and clip are used below but were never extracted in the original
        # snippet; pulled from the same dict here (key names are assumptions).
        self.batch_size = hyperparameters['batch_size']
        self.clip = hyperparameters['clip']

        # Initialize actor/critic networks
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim, self.action_dim)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=self.lr)

        # Initialize replay buffer and environment
        self.replay_buffer = ReplayBuffer(self.batch_size)
        self.environment = KukaGymEnv(renders=True)

    def sample_policy(self, total_time_steps=None):
        """
        Uses the policy to interact with the environment and fills the replay buffer

        Args:
            total_time_steps (int): the minimum total number of time steps to sample
        """
        if total_time_steps is None:
            total_time_steps = self.num_batch_transitions

        # Sample until we have at least total_time_steps samples
        num_samples = 0
        while num_samples < total_time_steps:
            done = False
            cur_state = self.environment.reset()

            # Run full episodes until we get enough samples
            for i in range(self.max_episode_length):
                # Sample and perform action from the policy
                action, log_prob = self.actor.sample_action()
                next_state, reward, done, info = self.environment.step(action)

                # Store transition in replay buffer
                num_samples += 1
                self.replay_buffer.append(cur_state, action, reward, next_state,
                                          done, log_prob)
                cur_state = next_state
                if done:
                    break

    def rewards_to_go(self, sampled_rewards, sampled_dones, normalize=False):
        """
        Computes discounted returns for each episode from the rewards, with the
        option of normalizing them

        Args:
            sampled_rewards (Iterable[Float]): an iterable containing the rewards
            sampled_dones (Iterable[Bool]): an iterable indicating terminal time steps
            normalize (Bool): whether to normalize the returns
        """
        discounted_returns = []
        disc_return = 0

        # Compute discounted return for each episode
        for reward, done in zip(reversed(sampled_rewards), reversed(sampled_dones)):
            if done:
                disc_return = 0
            disc_return = reward + self.discount * disc_return
            discounted_returns.insert(0, disc_return)  # TODO: Try to append and then reverse

        discounted_returns = torch.tensor(discounted_returns,
                                          dtype=torch.float32).to(self.device)

        # Normalize discounted_returns
        if normalize:
            discounted_returns = (discounted_returns - discounted_returns.mean()) / (
                discounted_returns.std() + 1e-8)
        return discounted_returns

    def train(self):
        """
        Trains the actor and critic networks using PPO
        """
        print("Training PPO for {} steps".format(self.total_train_steps))
        num_time_steps = 0
        num_train_iterations = 0
        while num_time_steps < self.total_train_steps:
            # Sample policy for transitions
            self.sample_policy()
            # sample_transitions is not defined in this snippet; presumably provided
            # elsewhere (e.g., by the replay buffer).
            state, action, reward, next_state, done, action_log_prob = \
                self.sample_transitions(return_type="torch_tensor")

            # Compute advantage and value
            returns = self.rewards_to_go(reward, done)
            value = self.critic(state, action)
            advantage = returns - value.detach()

            # Normalize advantage
            advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-10)

            # Update actor and critic networks
            for train_epoch in range(self.num_train_epochs):
                # Compute value and action prob ratio
                value = self.critic(state, action)  # TODO: See if you can just reuse value outside for loop
                action, cur_action_log_prob = self.actor.sample_action()
                action_prob_ratio = torch.exp(cur_action_log_prob - action_log_prob)

                # Compute actor loss (clipped surrogate objective)
                loss1 = action_prob_ratio * advantage
                loss2 = torch.clamp(action_prob_ratio, 1 - self.clip,
                                    1 + self.clip) * advantage
                actor_loss = -torch.min(loss1, loss2).mean()

                # Compute critic loss
                critic_loss = F.mse_loss(returns, value)

                # Update actor network
                self.actor_optim.zero_grad()
                actor_loss.backward(retain_graph=True)  # retain_graph keeps the graph for the critic backward pass
                self.actor_optim.step()

                # Update critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

            self.replay_buffer.clear()  # was self.buffer.clear(), which is undefined
            num_time_steps += len(state)

    def test(self):
        return
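# Hedged usage sketch (not from the original file). The keys below are exactly the ones
# __init__ reads, plus the 'batch_size' and 'clip' keys added above; the numeric values
# are illustrative assumptions, including the Kuka state/action sizes.
if __name__ == "__main__":
    hyperparameters = {
        'learning_rate': 3e-4,
        'discount_rate': 0.99,
        'num_batch_transitions': 2048,
        'state_dim': 9,    # size of KukaGymEnv's extended observation (assumed)
        'action_dim': 3,   # size of KukaGymEnv's continuous action (assumed)
        'total_train_steps': 1000000,
        'max_episode_length': 1000,
        'num_train_epochs': 10,
        'batch_size': 64,
        'clip': 0.2,
        'device': 'cpu',
    }
    agent = PPO(hyperparameters)
    agent.train()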
parser.add_argument('--output', default='output', type=str,
                    help='Resuming model path for testing')
parser.add_argument('--test', action='store_true', help='test or not')
parser.add_argument('--vis', action='store_true',
                    help='visualize each action or not')
parser.add_argument('--discrete', dest='discrete', action='store_true',
                    help='the actions are discrete or not')
parser.add_argument('--cuda', dest='cuda', action='store_true')
parser.add_argument('--l2norm', default=0.01, type=float,
                    help='l2 weight decay')  # TODO

args = parser.parse_args()

# StrCat args.output with args.env
if args.resume is None:
    args.output = get_output_folder(args.output, args.env)
else:
    args.output = args.resume

if args.env == "KukaGym":
    env = KukaGymEnv(renders=False, isDiscrete=True)
elif args.discrete:
    env = gym.make(args.env)
    env = env.unwrapped
else:
    env = NormalizedEnv(gym.make(args.env))

# input random seed
if args.seed > 0:
    np.random.seed(args.seed)
    env.seed(args.seed)

# input states count & actions count
print(env.observation_space.shape, env.action_space.shape)
nb_states = env.observation_space.shape[0]
if args.discrete: