Example #1
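The imports for this example are not shown in the original listing. A plausible preamble, inferred from the names used below (OpenAI baselines GAIL utilities plus robosuite and a project-local mlp_policy_sawyer module; exact paths are assumptions), might look like this:

# Assumed imports (not part of the original snippet; module paths inferred from usage).
import logging
import os.path as osp

import gym
import robosuite
from robosuite.wrappers import GymWrapper

from baselines import bench, logger
from baselines.common import set_global_seeds, tf_util as U
from baselines.gail.adversary import TransitionClassifier
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset

# Project-local modules assumed to exist next to this script:
# mlp_policy_sawyer (MlpPolicy), plus train(), runner() and get_task_name() helpers.
import mlp_policy_sawyer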
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(args.env_id,
            ignore_done=True,
            use_camera_obs=False,
            has_renderer=True,
            control_freq=100,
            gripper_visualization=True,
            reward_shaping=True,
            #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
            #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
            #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
            #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
            #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
            #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
            #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
            #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
            #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
            #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
            box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
            box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
            ) # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env) # wrap in the gym environment

    # Sawyer joint actions should be clipped to the range [-1, 1]

    # Task
    #task = 'train'
    task = 'evaluate'
    # parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')

    # Expert Path
    #expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/ac100/combined/combined_0.npz' # path for 100 trajectories
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/120_shift7/combined/combined_0.npz' # expert demonstrations for the 120_shift7 set

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    
    #env.seed(args.seed) # Sawyer does not have seed 

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    #if not os.path.isdir(args.log_dir):
    #    os.makedirs(args.log_dir)

    logger.log("log_directories: ",args.log_dir)
    
    logger.log("environment action space range: ", env.action_space) #logging the action space

    if task == 'train':
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)

        # Check dimensions of the dataset
        #print("dimension of inputs", dataset.dset.inputs.shape) # dims seem correct
        #print("dimension of inputs", dataset.dset.labels.shape) # dims seem correct

        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif task == 'evaluate':
        # Create the playback environment
        play_env = robosuite.make(args.env_id,
                ignore_done=True,
                use_camera_obs=False,
                has_renderer=True,
                control_freq=100,
                gripper_visualization=True,
                #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
                #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
                #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
                #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
                #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
                #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
                #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
                #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
                #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
                #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
                box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
                box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
                )

        #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

        runner(env,
                play_env,
                policy_fn,
                args.load_model_path,
                timesteps_per_batch=4000, # Change time step per batch to be more reasonable
                number_trajs=20, # number of trajectories to roll out during evaluation
                stochastic_policy=args.stochastic_policy,
                save=args.save_sample
                )
    else:
        raise NotImplementedError
    env.close()
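A hypothetical sketch of the CLI wiring this main() appears to expect (the commented-out parser.add_argument lines above hint at argparse); every flag below mirrors an attribute read from args, while the defaults are placeholders:

# Hypothetical argparse wiring (not in the original listing); defaults are assumptions.
def argsparser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='SawyerLift')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--traj_limitation', type=int, default=-1)
    parser.add_argument('--policy_hidden_size', type=int, default=100)
    parser.add_argument('--adversary_hidden_size', type=int, default=100)
    parser.add_argument('--policy_entcoeff', type=float, default=0)
    parser.add_argument('--adversary_entcoeff', type=float, default=1e-3)
    parser.add_argument('--algo', type=str, default='trpo')
    parser.add_argument('--g_step', type=int, default=1)
    parser.add_argument('--d_step', type=int, default=1)
    parser.add_argument('--num_timesteps', type=int, default=int(5e6))
    parser.add_argument('--save_per_iter', type=int, default=100)
    parser.add_argument('--checkpoint_dir', type=str, default='checkpoint')
    parser.add_argument('--log_dir', type=str, default='log')
    parser.add_argument('--load_model_path', type=str, default=None)
    parser.add_argument('--pretrained', action='store_true')
    parser.add_argument('--BC_max_iter', type=int, default=int(1e4))
    parser.add_argument('--stochastic_policy', action='store_true')
    parser.add_argument('--save_sample', action='store_true')
    return parser.parse_args()

# main(argsparser()) would then run training or evaluation depending on the task flag.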
class NutAssemblyDenseHand(gym.Env):
    """
        TODO: change obs and reward setting
        NutAssemblyHand:
            'FetchPickAndPlace-v1' with a perfect controller
            Action taken as:
                pi_theta(s) = pi(s) + f_theta(s)
        Parameters:
        -----------
        kp: float
            Scaling factor for position control
    """
    def __init__(self, kp: float = 20, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)

        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round peg
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering for this environment
                reward_shaping=False,  # sparse task reward from robosuite
                control_freq=20,  # control should happen fast enough so that simulation looks smooth
            ))

        self.max_episode_steps = 500
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065

    def step(self, residual_action: np.ndarray):
        controller_action = np.array(
            self.controller_action(self.last_observation))
        if (controller_action > 1).any() or (controller_action < -1).any():
            print(controller_action)
        action = np.add(controller_action, residual_action)
        action = np.clip(action, -1, 1)
        ob, reward, done, info = self.env.step(action)
        ob = self.env.env._get_observation()
        observation = {}
        observation['observation'] = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat']))
        observation['desired_goal'] = np.array(
            self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation['achieved_goal'] = ob['RoundNut0_pos']
        self.last_observation = observation.copy()
        info['is_success'] = reward
        return observation, reward, done, info

    def reset(self):
        self.env.reset()  # reset according to task defaults
        ob = self.env.env._get_observation()
        observation = {}
        observation['observation'] = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat']))
        observation['desired_goal'] = np.array(
            self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation['achieved_goal'] = ob['RoundNut0_pos']
        self.last_observation = observation.copy()
        self.object_in_hand = False
        self.object_below_hand = False
        self.gripper_reoriented = 0
        return observation

    def seed(self, seed: int = 0):
        self.np_random, seed = seeding.np_random(seed)
        return self.env.seed(seed=seed)

    def render(self, mode: str = "human", *args, **kwargs):
        return self.env.render()

    def close(self):
        return self.env.close()

    def goal_distance(self, achieved_goal, desired_goal):
        return np.linalg.norm(achieved_goal - desired_goal, axis=1)

    def compute_reward(self, achieved_goal, goal, info):
        # Compute distance between goal and the achieved goal.
        d = self.goal_distance(achieved_goal, goal)
        if self.reward_type == 'sparse':
            return -(d > self.distance_threshold).astype(np.float32)
        else:
            return -d

    def controller_action(self,
                          obs: dict,
                          take_action: bool = True,
                          DEBUG: bool = False):
        observation = obs['observation']
        goal_pos = obs['desired_goal']
        achieved_goal = obs['achieved_goal']

        gripper_pos = observation[:3]
        gripper_quat = observation[3:7]
        object_pos = observation[7:10]
        object_quat = observation[10:]

        z_table = 0.8610982

        object_axang = T.quat2axisangle(object_quat)
        if abs(object_axang[-1] - 1.24) < 0.2:
            object_axang_touse = [
                0, 0, object_axang[-1] % (2 * pi / 8) + (2 * pi / 8)
            ]
        else:
            object_axang_touse = [0, 0, object_axang[-1] % (2 * pi / 8)]
        gripper_axang = T.quat2axisangle(gripper_quat)
        # print('object axang to use ' + str(object_axang_touse))

        if self.gripper_reoriented == 0:
            self.gripper_init_quat = gripper_quat
            self.gripper_reoriented = 1

        init_inv = T.quat_inverse(self.gripper_init_quat)
        changing_wf = T.quat_multiply(init_inv, gripper_quat)
        changing_wf_axang = T.quat2axisangle(changing_wf)

        # gripper_quat_inv = T.quat_inverse(gripper_quat)
        # changing_wf = T.quat_multiply(gripper_quat_inv,self.gripper_init_quat)
        # changing_wf_axang = T.quat2axisangle(changing_wf)

        # print('changing wf axis ' +str(changing_wf_axang))

        if not self.object_below_hand or self.gripper_reoriented < 5:
            self.nut_p = T.quat2axisangle(object_quat)[-1]
            # print(self.nut_p)
            # Proportional xy control toward the nut (applied whether or not the
            # nut is already below the hand).
            action = 20 * (object_pos[:2] - gripper_pos[:2])

            # frac = 0.2 # Rate @ which to rotate gripper about z.
            # ang_goal = frac*self.nut_p # Nut p is the nut's random intial pertubation about z.

            # if self.gripper_reoriented == 0:
            #     self.gripper_init_quat = gripper_quat
            # if self.gripper_reoriented < 5: # Gripper should be aligned with nut after 5 action steps
            #     action_angle= [0,0,ang_goal]

            #     #print('object ' + str(object_axang))
            #     #print('gripper ' + str(gripper_axang))

            #     init_inv = T.quat_inverse(self.gripper_init_quat)
            #     init_inv_mat = T.quat2mat(init_inv)

            #     rel = T.quat_multiply(gripper_quat,init_inv)
            #     rel_axang = T.quat2axisangle(rel)
            #     #print('rel_axang ' +str(rel_axang))

            #     rel_axang_WF = np.matmul(init_inv_mat,rel_axang)

            #     #print('rel_axang_WF ' +str(rel_axang_WF))

            #     if take_action:
            #         self.gripper_reoriented+=1
            # else: # After 5 action steps, don't rotate gripper any more
            #     action_angle=[0,0,0]

            action_angle = 0.2 * (object_axang_touse - changing_wf_axang)
            action_angle = [0, 0, action_angle[-1]]
            #action_angle = [0,0,0]
            #print('action angle ' +str(action_angle))

            if np.linalg.norm(object_axang_touse - changing_wf_axang) < 0.1:
                if take_action:
                    self.gripper_reoriented = 5

            action = np.hstack((action, [0], action_angle, [-1]))
            if np.linalg.norm((object_pos[:2] - gripper_pos[:2])) < 0.01:
                if take_action:
                    self.object_below_hand = True
                    #self.gripper_reoriented = 5

        elif not self.object_in_hand:  # Close gripper
            action = [0, 0, -1, 0, 0, 0, -1]
            if np.linalg.norm((object_pos[2] - gripper_pos[2])) < 0.01:
                action = [0, 0, 0, 0, 0, 0, 1]
                if take_action:
                    self.object_in_hand = True

        else:  # Move gripper up and toward goal position
            action = [0, 0, 1, 0, 0, 0, 1]
            if object_pos[2] - z_table > 0.1:
                action = 20 * (goal_pos[:2] - object_pos[:2])
                action = np.hstack((action, [0, 0, 0, 0, 1]))
                if np.linalg.norm((goal_pos[:2] - object_pos[:2])) < 0.0225:
                    action = [0, 0, 0, 0, 0, 0,
                              -1]  # Drop nut once it's close enough to the peg

        action = np.clip(action, -1, 1)
        return action
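For illustration (not part of the original listing), a minimal usage sketch of NutAssemblyDenseHand, assuming the imports its code relies on are in scope (numpy as np, gym, robosuite as suite, GymWrapper, robosuite.utils.transform_utils as T, gym.utils.seeding, and pi from math). Passing a zero residual action simply replays the scripted controller_action() policy:

# Minimal sketch: roll out the scripted controller by sending a zero residual action.
import numpy as np

env = NutAssemblyDenseHand()
obs = env.reset()
for _ in range(env.max_episode_steps):
    residual = np.zeros(env.action_space.shape)  # no learned correction
    obs, reward, done, info = env.step(residual)
    if done:
        break
env.close()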
class NutAssemblyDense(gym.Env):
    """
        NutAssembly:
        NutAssembly task from robosuite with no controller. Can be used for learning from scratch.
    """
    def __init__(self, horizon=500, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)

        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round peg
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering for this environment
                reward_shaping=True,  # use dense rewards
                reward_scale=1.0,
                control_freq=20,  # control should happen fast enough so that simulation looks smooth
                horizon=horizon,  # number of timesteps before the episode ends
            ))
        self.max_episode_steps = horizon
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        ob = self.env.env._get_observation()
        peg_pos = np.array(self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat'], peg_pos))
        # info['is_success'] = self.get_success(ob['RoundNut0_pos'], peg_pos)
        info['is_success'] = self.env.env._check_success()
        return observation, reward, done, info

    def reset(self):
        ob = self.env.reset()
        ob = self.env.env._get_observation()
        peg_pos = np.array(self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat'], peg_pos))
        return observation

    def seed(self, seed=0):
        self.np_random, seed = seeding.np_random(seed)
        return self.env.seed(seed=seed)

    def render(self, mode="human", *args, **kwargs):
        return self.env.render()

    def close(self):
        return self.env.close()

    def goal_distance(self, achieved_goal, desired_goal):
        return np.linalg.norm(achieved_goal - desired_goal)

    def get_success(self, object_pos, goal):
        # Compute distance between goal and the achieved goal.
        d = self.goal_distance(object_pos, goal)
        return (d < self.distance_threshold).astype(np.float32)
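Similarly, a short hedged sketch of NutAssemblyDense driven by a random policy (same assumed imports as above); this mirrors what Example #6 below does for SawyerPickPlace:

# Minimal sketch: random actions on the dense-reward, controller-free environment.
env = NutAssemblyDense(horizon=200)
obs = env.reset()
for t in range(env.max_episode_steps):
    obs, reward, done, info = env.step(env.action_space.sample())
    if done or info.get('is_success'):
        break
env.close()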
Example #4
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
        box_pos=[0.23522776, 0.2287869, 0.82162434],  #shift3
        box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  #shift3
        #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
        #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
        #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5
        #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
        #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
        #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
        #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
        #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
    )

    play_env = GymWrapper(play_env, keys=None,
                          generalized_goal=False)  # false for loading prevs

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = play_env.observation_space
    ac_space = play_env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_shift2/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt_path = './reach_and_grasp_weights/reach_3_almost/trpo_gail.transition_limitation_1100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2
        #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1
        #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos, obs_array, jpos_array = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    # Load the grasping policy and set up the last observation
    play_ob_dim = play_env.observation_space.shape[0]
    play_ac_dim = play_env.action_space.shape[0]
    grip_policy = GaussianMlpPolicy(play_ob_dim, play_ac_dim)
    grip_vf = NeuralNetValueFunction(play_ob_dim, play_ac_dim)
    grip_saver = tf.compat.v1.train.Saver(max_to_keep=5)

    unchanged_ob = np.float32(np.zeros(play_ob_dim))

    with tf.compat.v1.Session() as sess2:
        sess2.run(init_op)
        # Load Checkpoint
        #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #grip_ckpt_path = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
        grip_ckpt_path = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/'  #3rd grasp
        #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2
        #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1
        #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        grip_ckpt = tf.compat.v1.train.get_checkpoint_state(grip_ckpt_path)

        #print(grip_ckpt)

        grip_saver.restore(sess2, grip_ckpt.model_checkpoint_path)

        tt = 0

        cum_rew = 0

        #ob = last_ob
        #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros

        obfilter = ZFilter(play_env.observation_space.shape)

        #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2, is a problem?

        #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2

        statsu = np.load(
            "./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz"
        )  # shift 3

        #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4
        #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5
        #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6
        #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7

        print("load n: ", statsu["n"])
        print("load M: ", statsu["M"])
        print("load S: ", statsu["S"])

        obfilter.rs._n = statsu["n"]
        obfilter.rs._M = statsu["M"]
        obfilter.rs._S = statsu["S"]

        print("obf n: ", obfilter.rs._n)
        print("obf M: ", obfilter.rs._M)
        print("obf S: ", obfilter.rs._S)

        #env.set_robot_joint_positions(last_jpos)
        #ob = np.concatenate((last_ob,env.box_end),axis=0)
        ob = last_ob

        # Will this change the behavior of loading?
        play_env.set_robot_joint_positions(last_jpos)

        prev_ob = np.float32(np.zeros(
            ob.shape))  # check if indeed starts at all zeros

        unchanged_ob = ob

        ob = obfilter(ob)

        while True:
            s = np.concatenate([ob, prev_ob], -1)
            ac, _, _ = grip_policy.act(s)

            prev_ob = np.copy(ob)

            scaled_ac = play_env.action_space.low + (ac + 1.) * 0.5 * (
                play_env.action_space.high - play_env.action_space.low)
            scaled_ac = np.clip(scaled_ac, play_env.action_space.low,
                                play_env.action_space.high)

            ob, rew, new, _ = play_env.step(scaled_ac)

            unchanged_ob = ob

            ob = obfilter(ob)

            cum_rew += rew

            play_env.render()  # check the running in for the first part
            #logger.log("rendering for reach policy")

            if new or tt >= 200:
                break
            tt += 1

        print("Cumulative reward over session: " + str(cum_rew))

    #obs_array = None
    #jpos_array = None

    #print(last_jpos)
    #print(last_ob)

    last_joint_after_grip = play_env._joint_positions
    last_after_grip = unchanged_ob
    # Change environment box position to start from the last position on playenv
    #env.model.box_pos_array = np.array(play_env.sim.data.body_xpos[play_env.cube_body_id])
    #env.model.box_quat_array = convert_quat(
    #        np.array(play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw"
    #    )
    #env.box_pos = play_env.model.box_pos_array
    #env.box_quat = play_env.model.box_quat_array

    # set up the environment for loading or training
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        rob_init=last_joint_after_grip,
        box_pos=np.array(
            play_env.sim.data.body_xpos[play_env.cube_body_id]),  #shift3
        box_quat=convert_quat(np.array(
            play_env.sim.data.body_xquat[play_env.cube_body_id]),
                              to="xyzw"),  #shift3
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
        #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
        #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
        box_end=[0.3, 0.1, 1.0]  # shift 3
        #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4, try to increase traj limit to 2000
        #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
        #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5
        #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
        #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
        #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
        #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
        #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env, keys=None,
                     generalized_goal=True)  # wrap in the gym environment

    task = 'train'
    #task = 'evaluate'
    #task = 'cont_training'

    #env = bench.Monitor(env, logger.get_dir() and
    #                    osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    if task == 'train':
        play_env.close()

        #init_op2 = tf.compat.v1.global_variables_initializer()

        #sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess2:
        #sess2.run(init_op)
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        #sess2.run(init_op2)

        old_acktr_learn(
            env,
            policy=lift_policy,
            vf=lift_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=[last_after_grip],
            last_jpos=[last_joint_after_grip],
            animate=True,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'cont_training':

        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]

        #with tf.compat.v1.variable_scope("vf_aktr"):
        cont_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        #with tf.compat.v1.variable_scope("pi_aktr"):
        cont_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        ckpt_path_cont = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'

        stat_cont = "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_19713000.npz"

        old_acktr_learn(
            env,
            policy=cont_policy,
            vf=cont_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=obs_array,
            last_jpos=jpos_array,
            animate=True,
            cont_training=True,
            load_path=ckpt_path_cont,
            obfilter_load=stat_cont,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'evaluate':

        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]

        eval_lift_vf = NeuralNetValueFunction(ob_dim,
                                              ac_dim,
                                              name="lift_vf_aktr")
        eval_lift_policy = GaussianMlpPolicy(ob_dim,
                                             ac_dim,
                                             name="lift_pi_aktr")

        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)

        #sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess3:
        with tf.compat.v1.Session() as sess3:
            sess3.run(init_op)

            ckpt_path_2 = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift' # shift 2
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift' # shift 3
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 4
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 5
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 6
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 7
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess3, ckpt_2.model_checkpoint_path)

            tt = 0

            cum_rew = 0

            #ob = last_ob
            #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros

            obfilter = ZFilter(env.observation_space.shape)

            statsu = np.load(
                "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_23493000.npz"
            )

            #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2
            #statsu = np.load("./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz") # shift 3
            #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4
            #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5
            #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6
            #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7

            print("load n: ", statsu["n"])
            print("load M: ", statsu["M"])
            print("load S: ", statsu["S"])

            obfilter.rs._n = statsu["n"]
            obfilter.rs._M = statsu["M"]
            obfilter.rs._S = statsu["S"]

            print("obf n: ", obfilter.rs._n)
            print("obf M: ", obfilter.rs._M)
            print("obf S: ", obfilter.rs._S)

            env.set_robot_joint_positions(last_jpos)
            ob = np.concatenate((last_ob, env.box_end), axis=0)
            prev_ob = np.float32(np.zeros(
                ob.shape))  # check if indeed starts at all zeros

            ob = obfilter(ob)

            while True:
                s = np.concatenate([ob, prev_ob], -1)
                ac, _, _ = eval_lift_policy.act(s)

                prev_ob = np.copy(ob)

                scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (
                    env.action_space.high - env.action_space.low)
                scaled_ac = np.clip(scaled_ac, env.action_space.low,
                                    env.action_space.high)

                ob, rew, new, _ = env.step(scaled_ac)

                ob = obfilter(ob)

                cum_rew += rew

                env.render()  # check the running in for the first part
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

            print("Cumulative reward over session: " + str(cum_rew))

    env.close()
Example #5
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env)  # wrap in the gym environment

    #task = 'train'
    task = 'evaluate'

    # Expert Path
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'  # expert demonstrations for the 150_grasp_shift2 set

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )

    play_env = GymWrapper(play_env)

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    if task == 'train':
        play_env.close()

        dataset = Mujoco_Dset(expert_path=expert_path,
                              traj_limitation=args.traj_limitation)

        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn,
                    reward_giver, dataset, args.algo, args.g_step, args.d_step,
                    args.policy_entcoeff, args.num_timesteps,
                    args.save_per_iter, args.checkpoint_dir, args.log_dir,
                    args.pretrained, args.BC_max_iter, task_name)

    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            tt = 0
            ob = last_ob

            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)

                play_env.render()  # check the running in for the first part
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

        play_env.close()

    env.close()
Example #6
    grippers = ["TwoFingerGripper", "PR2Gripper", "RobotiqGripper"]

    for gripper in grippers:

        # create environment with selected grippers
        env = GymWrapper(
            suite.make(
                "SawyerPickPlace",
                gripper_type=gripper,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=True,  # make sure we can render to the screen
                reward_shaping=True,  # use dense rewards
                control_freq=100,  # control should happen fast enough so that simulation looks smooth
            ))

        # run a random policy
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break

        # close window
        env.close()