def fetch_push_gym_demo():
    env = GymWrapper(
        suite.make(
            'FetchPush',
            robots='Panda',
            controller_configs=None,
            gripper_types='UltrasoundProbeGripper',
            has_renderer=True,
            has_offscreen_renderer=False,
            use_camera_obs=False,
            use_object_obs=True,
            control_freq=50,
            render_camera=None,
        )
    )

    for i_episode in range(20):
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
def main():
    parser = custom_arg_parser()
    args = parser.parse_args()
    load_defaults(args)
    print("Arguments: {}".format(args))

    # Create the model name from all the parameters
    model_dir_name = serialize_args(args)
    print("Model name: {}".format(model_dir_name))

    if args.model is not None:
        model_save_path = os.path.dirname(args.model) + "/"
        tb_save_path = model_save_path.replace("learned_models", "tb_logs")
    else:
        model_save_path = "../../learned_models/" + model_dir_name + "/"
        tb_save_path = "../../tb_logs/" + model_dir_name + "/"
    print("Model save path: {}".format(model_save_path))
    print("TB logs save path: {}".format(tb_save_path))

    final_model_path = model_save_path + "final_" + model_dir_name
    model_load_path = args.model
    show_render = args.visualize

    # Save args to json so training can resume from checkpoints
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    with open(model_save_path + "args.json", 'w+') as f:
        json.dump(vars(args), f, indent=2, sort_keys=True)

    env = GymWrapper(
        suite.make(
            "JR2Door",
            has_renderer=show_render,
            use_camera_obs=False,
            ignore_done=False,
            control_freq=args.control_freq,
            horizon=args.horizon,
            door_type=args.door_type,
            bot_motion=args.bot_motion,
            robot_pos=args.robot_pos,
            robot_theta=args.robot_theta,
            dist_to_handle_coef=args.rcoef_dist_to_handle,
            door_angle_coef=args.rcoef_door_angle,
            handle_con_coef=args.rcoef_handle_con,
            body_door_con_coef=args.rcoef_body_door_con,
            self_con_coef=args.rcoef_self_con,
            arm_handle_con_coef=args.rcoef_arm_handle_con,
            arm_door_con_coef=args.rcoef_arm_door_con,
            force_coef=args.rcoef_force,
            gripper_touch_coef=args.rcoef_gripper_touch,
            dist_to_door_coef=args.rcoef_dist_to_door,
            wall_con_coef=args.rcoef_wall_con,
            reset_on_large_force=args.reset_on_large_force,
            debug_print=args.print_info,
            eef_type=args.eef_type,
            door_init_qpos=args.door_init_qpos,
            goal_offset=args.goal_offset,
        )
    )

    if args.slurm:
        # Note: every lambda here returns the same env instance; each worker really
        # needs its own environment (see the factory sketch below).
        env = SubprocVecEnv([lambda: env for i in range(args.n_cpu)])
    else:
        env = DummyVecEnv([lambda: env])

    # Load the specified model, if there is one
    if args.model is not None:
        # Training from a checkpoint, so do not reset timesteps for tensorboard
        reset_num_timesteps = False
        if args.rl_alg == "ppo2":
            model = PPO2.load(model_load_path, env=env)
            print("Successfully loaded PPO2 model")
        elif args.rl_alg == "ppo1":
            model = PPO1.load(model_load_path, env=env)
            print("Successfully loaded PPO1 model")
    else:
        # New model, so reset timesteps for tensorboard
        reset_num_timesteps = True
        if args.rl_alg == "ppo2":
            model = PPO2(
                args.policy,
                env,
                verbose=args.verbose,
                n_steps=args.n_steps,
                nminibatches=args.minibatches,
                noptepochs=args.opt_epochs,
                cliprange=args.clip_range,
                ent_coef=args.ent_coef,
                tensorboard_log=tb_save_path,
                # full_tensorboard_log=True,
            )
        elif args.rl_alg == "ppo1":
            model = PPO1(
                args.policy,
                env,
                verbose=args.verbose,
                timesteps_per_actorbatch=args.n_steps,
                optim_epochs=args.opt_epochs,
                tensorboard_log=tb_save_path,
            )

    if args.replay:
        # Replay a policy
        obs = env.reset()
        count = 0
        with open('episode-reward.csv', mode='w') as fid:
            writer = csv.writer(fid, delimiter=',')
            writer.writerow(["reward"])  # header row
        while count < 1000:
            env.render()
            count += 1
            print(count)
        while True:
            if args.model is None:
                print("Error: No model has been specified")
            action, _states = model.predict(obs, deterministic=True)
            # print("action {}".format(action))
            obs, reward, done, info = env.step(action)
            env.render()
            # print(obs)
            # print(env.sim.data.qpos[env._ref_joint_vel_indexes])
            # time.sleep(0.1)
            with open('episode-reward.csv', mode='a') as fid:
                writer = csv.writer(fid, delimiter=',')
                writer.writerow(reward)
            # if done:
            #     quit()
    else:
        # Train
        model.learn(
            total_timesteps=args.total_timesteps,
            save_dir=model_save_path,
            render=show_render,
            reset_num_timesteps=reset_num_timesteps,
        )
        model.save(final_model_path)
        print("Done training")
        obs = env.reset()
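# A minimal sketch of an environment factory for SubprocVecEnv, since wrapping the same
# `env` object in every lambda gives all workers one shared instance. `make_jr2door_env`
# is a hypothetical helper name; the suite.make kwargs would mirror the full call above.
import robosuite as suite
from robosuite.wrappers import GymWrapper
from stable_baselines.common.vec_env import SubprocVecEnv

def make_jr2door_env(args):
    def _init():
        # Each worker process builds its own GymWrapper-wrapped environment.
        return GymWrapper(suite.make("JR2Door",
                                     has_renderer=False,
                                     use_camera_obs=False,
                                     control_freq=args.control_freq,
                                     horizon=args.horizon))
    return _init

# Usage: env = SubprocVecEnv([make_jr2door_env(args) for _ in range(args.n_cpu)])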
We demonstrate equivalent functionality below.
"""

import robosuite as suite
from robosuite.wrappers import GymWrapper

if __name__ == "__main__":

    # Notice how the environment is wrapped by the wrapper
    env = GymWrapper(
        suite.make(
            "SawyerLift",
            use_camera_obs=False,  # do not use pixel observations
            has_offscreen_renderer=False,  # not needed since not using pixel obs
            has_renderer=True,  # make sure we can render to the screen
            reward_shaping=True,  # use dense rewards
            control_freq=100,  # control should happen fast enough so that simulation looks smooth
        )
    )

    for i_episode in range(20):
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
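# The random-action rollout above duplicates the loop in fetch_push_gym_demo(). A minimal
# sketch of a shared helper, assuming the Gym-style 4-tuple step API used throughout these
# scripts; `run_random_episodes` is a hypothetical name, not part of the original demos.
def run_random_episodes(env, n_episodes=20, max_steps=500, render=True):
    for i_episode in range(n_episodes):
        env.reset()
        for t in range(max_steps):
            if render:
                env.render()
            action = env.action_space.sample()  # uniform random action
            _, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break

# Usage: run_random_episodes(env)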
class NutAssemblyDenseHand(gym.Env):
    """
    TODO: change obs and reward setting

    NutAssemblyHand: 'FetchPickAndPlace-v1' with a perfect controller.
    Action taken as: pi_theta(s) = pi(s) + f_theta(s)

    Parameters
    ----------
    kp: float
        Scaling factor for position control
    """

    def __init__(self, kp: float = 20, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)
        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round peg
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering
                reward_shaping=False,  # use sparse rewards
                control_freq=20,  # control should happen fast enough so that simulation looks smooth
            ))
        self.max_episode_steps = 500
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065

    def step(self, residual_action: np.ndarray):
        controller_action = np.array(
            self.controller_action(self.last_observation))
        if (controller_action > 1).any() or (controller_action < -1).any():
            print(controller_action)
        action = np.add(controller_action, residual_action)
        action = np.clip(action, -1, 1)
        ob, reward, done, info = self.env.step(action)
        ob = self.env.env._get_observation()
        observation = {}
        observation['observation'] = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat']))
        observation['desired_goal'] = np.array(
            self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation['achieved_goal'] = ob['RoundNut0_pos']
        self.last_observation = observation.copy()
        info['is_success'] = reward
        return observation, reward, done, info

    def reset(self):
        self.env.reset()  # reset according to task defaults
        ob = self.env.env._get_observation()
        observation = {}
        observation['observation'] = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat']))
        observation['desired_goal'] = np.array(
            self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation['achieved_goal'] = ob['RoundNut0_pos']
        self.last_observation = observation.copy()
        self.object_in_hand = False
        self.object_below_hand = False
        self.gripper_reoriented = 0
        return observation

    def seed(self, seed: int = 0):
        self.np_random, seed = seeding.np_random(seed)
        return self.env.seed(seed=seed)

    def render(self, mode: str = "human", *args, **kwargs):
        return self.env.render()

    def close(self):
        return self.env.close()

    def goal_distance(self, achieved_goal, desired_goal):
        return np.linalg.norm(achieved_goal - desired_goal, axis=1)

    def compute_reward(self, achieved_goal, goal, info):
        # Compute distance between the goal and the achieved goal.
        d = self.goal_distance(achieved_goal, goal)
        if self.reward_type == 'sparse':
            return -(d > self.distance_threshold).astype(np.float32)
        else:
            return -d

    def controller_action(self,
                          obs: dict,
                          take_action: bool = True,
                          DEBUG: bool = False):
        observation = obs['observation']
        goal_pos = obs['desired_goal']
        achieved_goal = obs['achieved_goal']

        gripper_pos = observation[:3]
        gripper_quat = observation[3:7]
        object_pos = observation[7:10]
        object_quat = observation[10:]

        z_table = 0.8610982

        object_axang = T.quat2axisangle(object_quat)
        if abs(object_axang[-1] - 1.24) < 0.2:
            object_axang_touse = [
                0, 0, object_axang[-1] % (2 * pi / 8) + (2 * pi / 8)
            ]
        else:
            object_axang_touse = [0, 0, object_axang[-1] % (2 * pi / 8)]
        gripper_axang = T.quat2axisangle(gripper_quat)
        # print('object axang to use ' + str(object_axang_touse))

        if self.gripper_reoriented == 0:
            self.gripper_init_quat = gripper_quat
            self.gripper_reoriented = 1

        init_inv = T.quat_inverse(self.gripper_init_quat)
        changing_wf = T.quat_multiply(init_inv, gripper_quat)
        changing_wf_axang = T.quat2axisangle(changing_wf)

        # gripper_quat_inv = T.quat_inverse(gripper_quat)
        # changing_wf = T.quat_multiply(gripper_quat_inv, self.gripper_init_quat)
        # changing_wf_axang = T.quat2axisangle(changing_wf)
        # print('changing wf axis ' + str(changing_wf_axang))

        if not self.object_below_hand or self.gripper_reoriented < 5:
            # Stage 1: move the gripper above the nut and align it about z
            self.nut_p = T.quat2axisangle(object_quat)[-1]
            # print(self.nut_p)
            if not self.object_below_hand:
                action = 20 * (object_pos[:2] - gripper_pos[:2])
            else:
                action = [0, 0]
            action = 20 * (object_pos[:2] - gripper_pos[:2])

            # frac = 0.2  # Rate at which to rotate gripper about z.
            # ang_goal = frac * self.nut_p  # nut_p is the nut's random initial perturbation about z.
            # if self.gripper_reoriented == 0:
            #     self.gripper_init_quat = gripper_quat
            # if self.gripper_reoriented < 5:  # Gripper should be aligned with nut after 5 action steps
            #     action_angle = [0, 0, ang_goal]
            #     # print('object ' + str(object_axang))
            #     # print('gripper ' + str(gripper_axang))
            #     init_inv = T.quat_inverse(self.gripper_init_quat)
            #     init_inv_mat = T.quat2mat(init_inv)
            #     rel = T.quat_multiply(gripper_quat, init_inv)
            #     rel_axang = T.quat2axisangle(rel)
            #     # print('rel_axang ' + str(rel_axang))
            #     rel_axang_WF = np.matmul(init_inv_mat, rel_axang)
            #     # print('rel_axang_WF ' + str(rel_axang_WF))
            #     if take_action:
            #         self.gripper_reoriented += 1
            # else:  # After 5 action steps, don't rotate gripper any more
            #     action_angle = [0, 0, 0]

            action_angle = 0.2 * (object_axang_touse - changing_wf_axang)
            action_angle = [0, 0, action_angle[-1]]
            # action_angle = [0, 0, 0]
            # print('action angle ' + str(action_angle))

            if np.linalg.norm(object_axang_touse - changing_wf_axang) < 0.1:
                if take_action:
                    self.gripper_reoriented = 5

            action = np.hstack((action, [0], action_angle, [-1]))
            if np.linalg.norm((object_pos[:2] - gripper_pos[:2])) < 0.01:
                if take_action:
                    self.object_below_hand = True
                    # self.gripper_reoriented = 5

        elif not self.object_in_hand:
            # Stage 2: descend and close the gripper
            action = [0, 0, -1, 0, 0, 0, -1]
            if np.linalg.norm((object_pos[2] - gripper_pos[2])) < 0.01:
                action = [0, 0, 0, 0, 0, 0, 1]
                if take_action:
                    self.object_in_hand = True

        else:
            # Stage 3: move the gripper up and toward the goal position
            action = [0, 0, 1, 0, 0, 0, 1]
            if object_pos[2] - z_table > 0.1:
                action = 20 * (goal_pos[:2] - object_pos[:2])
                action = np.hstack((action, [0, 0, 0, 0, 1]))
                if np.linalg.norm((goal_pos[:2] - object_pos[:2])) < 0.0225:
                    action = [0, 0, 0, 0, 0, 0, -1]  # Drop nut once it's close enough to the peg

        action = np.clip(action, -1, 1)
        return action
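# A minimal usage sketch (not part of the original file): stepping NutAssemblyDenseHand
# with a zero residual action runs the hand-coded controller alone, which is a quick
# sanity check of the scripted grasp. `run_scripted_controller_demo` is a hypothetical helper.
def run_scripted_controller_demo(n_steps: int = 500):
    env = NutAssemblyDenseHand()
    obs = env.reset()
    reward = 0.0
    for _ in range(n_steps):
        residual = np.zeros(env.action_space.shape)  # zero residual => pure scripted controller
        obs, reward, done, info = env.step(residual)
        if done:
            break
    env.close()
    return reward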
class NutAssemblyDense(gym.Env):
    """
    NutAssembly: NutAssembly task from robosuite with no hand-coded controller.
    Can be used for learning from scratch.
    """

    def __init__(self, horizon=500, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)
        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round peg
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering
                reward_shaping=True,  # use dense rewards
                reward_scale=1.0,
                control_freq=20,  # control should happen fast enough so that simulation looks smooth
                horizon=horizon,  # number of timesteps before the episode ends
            ))
        self.max_episode_steps = horizon
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        ob = self.env.env._get_observation()
        peg_pos = np.array(self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat'], peg_pos))
        # info['is_success'] = self.get_success(ob['RoundNut0_pos'], peg_pos)
        info['is_success'] = self.env.env._check_success()
        return observation, reward, done, info

    def reset(self):
        ob = self.env.reset()
        ob = self.env.env._get_observation()
        peg_pos = np.array(self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat'], peg_pos))
        return observation

    def seed(self, seed=0):
        self.np_random, seed = seeding.np_random(seed)
        return self.env.seed(seed=seed)

    def render(self, mode="human", *args, **kwargs):
        return self.env.render()

    def close(self):
        return self.env.close()

    def goal_distance(self, achieved_goal, desired_goal):
        return np.linalg.norm(achieved_goal - desired_goal)

    def get_success(self, object_pos, goal):
        # Compute distance between the object and the goal.
        d = self.goal_distance(object_pos, goal)
        return (d < self.distance_threshold).astype(np.float32)
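# A minimal training sketch (an assumption, not part of the original file): NutAssemblyDense
# is a plain gym.Env, so it can be driven by the same stable-baselines PPO2 pipeline used
# earlier in this document. Hyperparameters and the save path are placeholders, not tuned values.
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

def train_nut_assembly_dense(total_timesteps=1_000_000):
    vec_env = DummyVecEnv([lambda: NutAssemblyDense(horizon=500)])
    model = PPO2("MlpPolicy", vec_env, verbose=1, n_steps=2048)
    model.learn(total_timesteps=total_timesteps)
    model.save("ppo2_nut_assembly_dense")  # hypothetical output path
    return model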
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    # ------- Run policy for reaching --------- #
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        # box_pos=[0.63522776, -0.3287869, 0.82162434], box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
        box_pos=[0.23522776, 0.2287869, 0.82162434],  # shift3
        box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  # shift3
        # box_pos=[0.53522776, 0.3287869, 0.82162434], box_quat=[0.5775825618903728, 0, 0, 0.679425538604203],  # shift4
        # box_pos=[0.53522776, 0.1287869, 0.82162434], box_quat=[0.4775825618903728, 0, 0, 0.679425538604203],  # shift5
        # box_pos=[0.48522776, -0.187869, 0.82162434], box_quat=[0.8775825618903728, 0, 0, 0.679425538604203],  # shift6
        # box_pos=[0.43522776, -0.367869, 0.82162434], box_quat=[0.2775825618903728, 0, 0, 0.679425538604203],  # shift7
    )
    play_env = GymWrapper(play_env, keys=None, generalized_goal=False)  # False for loading previous weights

    # Weights are loaded from reach model grasp_strange
    # play_env.viewer.set_camera(camera_id=2)  # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = play_env.observation_space
    ac_space = play_env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load checkpoint for the reach policy
        # ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        # ckpt_path = './reach_and_grasp_weights/reach_shift2/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt_path = './reach_and_grasp_weights/reach_3_almost/trpo_gail.transition_limitation_1100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        # ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'  # problem child 2
        # ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'  # problem child 1
        # ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        # ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Run one reach trajectory in the playback environment
        _, _, last_ob, last_jpos, obs_array, jpos_array = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    # Grasping: load the grasp policy and set up the last observation
    play_ob_dim = play_env.observation_space.shape[0]
    play_ac_dim = play_env.action_space.shape[0]
    grip_policy = GaussianMlpPolicy(play_ob_dim, play_ac_dim)
    grip_vf = NeuralNetValueFunction(play_ob_dim, play_ac_dim)
    grip_saver = tf.compat.v1.train.Saver(max_to_keep=5)

    unchanged_ob = np.float32(np.zeros(play_ob_dim))

    with tf.compat.v1.Session() as sess2:
        sess2.run(init_op)
        # Load checkpoint for the grasp policy
        # grip_ckpt_path = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
        grip_ckpt_path = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/'  # 3rd grasp
        grip_ckpt = tf.compat.v1.train.get_checkpoint_state(grip_ckpt_path)
        # print(grip_ckpt)
        grip_saver.restore(sess2, grip_ckpt.model_checkpoint_path)

        tt = 0
        cum_rew = 0
        # ob = last_ob
        # prev_ob = np.float32(np.zeros(ob.shape))  # check if it indeed starts at all zeros
        obfilter = ZFilter(play_env.observation_space.shape)

        # Restore the observation filter statistics saved with the grasp checkpoint
        # statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz")  # shift 2, is a problem?
        statsu = np.load(
            "./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz"
        )  # shift 3
        # statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz")  # shift 4
        # statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz")  # shift 5
        # statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz")  # shift 6
        # statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz")  # shift 7

        print("load n: ", statsu["n"])
        print("load M: ", statsu["M"])
        print("load S: ", statsu["S"])
        obfilter.rs._n = statsu["n"]
        obfilter.rs._M = statsu["M"]
        obfilter.rs._S = statsu["S"]
        print("obf n: ", obfilter.rs._n)
        print("obf M: ", obfilter.rs._M)
        print("obf S: ", obfilter.rs._S)

        # env.set_robot_joint_positions(last_jpos)
        # ob = np.concatenate((last_ob, env.box_end), axis=0)
        ob = last_ob  # Will this change the behavior of loading?
        play_env.set_robot_joint_positions(last_jpos)
        prev_ob = np.float32(np.zeros(ob.shape))  # check if it indeed starts at all zeros
        unchanged_ob = ob
        ob = obfilter(ob)

        while True:
            s = np.concatenate([ob, prev_ob], -1)
            ac, _, _ = grip_policy.act(s)
            prev_ob = np.copy(ob)
            scaled_ac = play_env.action_space.low + (ac + 1.) * 0.5 * (
                play_env.action_space.high - play_env.action_space.low)
            scaled_ac = np.clip(scaled_ac, play_env.action_space.low,
                                play_env.action_space.high)
            ob, rew, new, _ = play_env.step(scaled_ac)
            unchanged_ob = ob
            ob = obfilter(ob)
            cum_rew += rew
            play_env.render()  # check the running in for the first part
            # logger.log("rendering for reach policy")
            if new or tt >= 200:
                break
            tt += 1

        print("Cumulative reward over session: " + str(cum_rew))

    # obs_array = None
    # jpos_array = None
    # print(last_jpos)
    # print(last_ob)
    last_joint_after_grip = play_env._joint_positions
    last_after_grip = unchanged_ob

    # Change the environment box position to start from the last position in play_env
    # env.model.box_pos_array = np.array(play_env.sim.data.body_xpos[play_env.cube_body_id])
    # env.model.box_quat_array = convert_quat(
    #     np.array(play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw")
    # env.box_pos = play_env.model.box_pos_array
    # env.box_quat = play_env.model.box_quat_array

    # Set up the environment for loading or training
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        rob_init=last_joint_after_grip,
        box_pos=np.array(
            play_env.sim.data.body_xpos[play_env.cube_body_id]),  # shift3
        box_quat=convert_quat(np.array(
            play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw"),  # shift3
        box_end=[0.3, 0.1, 1.0],  # shift 3
        # (fixed box_pos/box_quat alternatives for shifts 2 and 4-7 are the same as in the
        #  play_env call above; shift4 note: try to increase traj limit to 2000)
    )

    # Switch from gym to robosuite, also add reward shaping to see reach goal
    env = GymWrapper(env, keys=None, generalized_goal=True)  # wrap in the gym environment

    task = 'train'
    # task = 'evaluate'
    # task = 'cont_training'

    # env = bench.Monitor(env, logger.get_dir() and
    #                     osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    # Note: taking away the bench monitor wrapping allows rendering
    # env.seed(args.seed)  # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ", env.action_space)  # logging the action space

    if task == 'train':
        play_env.close()

        # init_op2 = tf.compat.v1.global_variables_initializer()
        # sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        # with tf.compat.v1.Session(config=tf.ConfigProto()) as sess2:
        #     sess2.run(init_op)
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")
        # sess2.run(init_op2)

        old_acktr_learn(
            env,
            policy=lift_policy,
            vf=lift_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=[last_after_grip],
            last_jpos=[last_joint_after_grip],
            animate=True,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'cont_training':
        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        # with tf.compat.v1.variable_scope("vf_aktr"):
        cont_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        # with tf.compat.v1.variable_scope("pi_aktr"):
        cont_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        ckpt_path_cont = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
        stat_cont = "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_19713000.npz"

        old_acktr_learn(
            env,
            policy=cont_policy,
            vf=cont_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=obs_array,
            last_jpos=jpos_array,
            animate=True,
            cont_training=True,
            load_path=ckpt_path_cont,
            obfilter_load=stat_cont,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'evaluate':
        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        eval_lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        eval_lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        # sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        # with tf.compat.v1.Session(config=tf.ConfigProto()) as sess3:
        with tf.compat.v1.Session() as sess3:
            sess3.run(init_op)

            ckpt_path_2 = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
            # ckpt_path_2 = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift'  # shift 2
            # ckpt_path_2 = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift'  # shift 3
            # ckpt_path_2 = './reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift'  # shift 4
            # ckpt_path_2 = './reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift'  # shift 5
            # ckpt_path_2 = './reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift'  # shift 6
            # ckpt_path_2 = './reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift'  # shift 7
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess3, ckpt_2.model_checkpoint_path)

            tt = 0
            cum_rew = 0
            # ob = last_ob
            # prev_ob = np.float32(np.zeros(ob.shape))  # check if it indeed starts at all zeros
            obfilter = ZFilter(env.observation_space.shape)

            statsu = np.load(
                "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_23493000.npz"
            )
            # (the same per-shift filter_stats alternatives as listed in the grasp-playback
            #  section above can be substituted here)

            print("load n: ", statsu["n"])
            print("load M: ", statsu["M"])
            print("load S: ", statsu["S"])
            obfilter.rs._n = statsu["n"]
            obfilter.rs._M = statsu["M"]
            obfilter.rs._S = statsu["S"]
            print("obf n: ", obfilter.rs._n)
            print("obf M: ", obfilter.rs._M)
            print("obf S: ", obfilter.rs._S)

            env.set_robot_joint_positions(last_jpos)
            ob = np.concatenate((last_ob, env.box_end), axis=0)
            prev_ob = np.float32(np.zeros(ob.shape))  # check if it indeed starts at all zeros
            ob = obfilter(ob)

            while True:
                s = np.concatenate([ob, prev_ob], -1)
                ac, _, _ = eval_lift_policy.act(s)
                prev_ob = np.copy(ob)
                scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (
                    env.action_space.high - env.action_space.low)
                scaled_ac = np.clip(scaled_ac, env.action_space.low,
                                    env.action_space.high)
                ob, rew, new, _ = env.step(scaled_ac)
                ob = obfilter(ob)
                cum_rew += rew
                env.render()  # check the running in for the first part
                # logger.log("rendering for reach policy")
                if new or tt >= args.traj_limitation:
                    break
                tt += 1

            print("Cumulative reward over session: " + str(cum_rew))

        env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        # box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
        # box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
    )
    # Switch from gym to robosuite, also add reward shaping to see reach goal
    env = GymWrapper(env)  # wrap in the gym environment

    # task = 'train'
    task = 'evaluate'

    # Expert path: path for 100 trajectories
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'
    # parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)
    # Note: taking away the bench monitor wrapping allows rendering
    # env.seed(args.seed)  # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ", env.action_space)  # logging the action space

    # ------- Run policy for reaching --------- #
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        # box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
        # box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
    )
    play_env = GymWrapper(play_env)

    # Weights are loaded from reach model grasp_strange
    # play_env.viewer.set_camera(camera_id=2)  # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Run one reach trajectory in the playback environment
        _, _, last_ob, last_jpos = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    if task == 'train':
        play_env.close()
        dataset = Mujoco_Dset(expert_path=expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn, reward_giver,
                    dataset, args.algo, args.g_step, args.d_step,
                    args.policy_entcoeff, args.num_timesteps, args.save_per_iter,
                    args.checkpoint_dir, args.log_dir, args.pretrained,
                    args.BC_max_iter, task_name)

    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            tt = 0
            ob = last_ob
            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)
                play_env.render()  # check the running in for the first part
                # logger.log("rendering for reach policy")
                if new or tt >= args.traj_limitation:
                    break
                tt += 1

        play_env.close()
        env.close()
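# The checkpoint-state lookup plus saver.restore pattern recurs throughout these scripts.
# A minimal helper sketch using only the tf.compat.v1 calls already shown above;
# `restore_latest_checkpoint` is a hypothetical name, not part of the original code.
def restore_latest_checkpoint(saver, sess, ckpt_dir):
    ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_dir)
    if ckpt is None or ckpt.model_checkpoint_path is None:
        raise FileNotFoundError("No checkpoint found in {}".format(ckpt_dir))
    saver.restore(sess, ckpt.model_checkpoint_path)
    return ckpt.model_checkpoint_path

# Usage: restore_latest_checkpoint(saver_2, sess, ckpt_path_2)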