def __init__(self, horizon=500, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)

        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round peg
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering
                reward_shaping=True,  # use dense rewards
                reward_scale=1.0,
                control_freq=20,  # control should happen fast enough so that simulation looks smooth
                horizon=horizon,  # number of timesteps for ending episode
            ))
        self.max_episode_steps = horizon
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065
Example #2
def evaluate_policy(env_config, model_path, n_eval, printout=False):
    if printout:
        print("Loading policy...")

    # Load trained model and corresponding policy
    data = torch.load(model_path)
    policy = data['evaluation/policy']

    if printout:
        print("Policy loaded")

    # Load controller
    controller = env_config.pop("controller")
    if controller in set(ALL_CONTROLLERS):
        # This is a default controller
        controller_config = load_controller_config(
            default_controller=controller)
    else:
        # This is a string to the custom controller
        controller_config = load_controller_config(custom_fpath=controller)

    # Create robosuite env
    env = suite.make(**env_config,
                     has_renderer=False,
                     has_offscreen_renderer=False,
                     use_object_obs=True,
                     use_camera_obs=False,
                     reward_shaping=True,
                     controller_configs=controller_config)
    env = GymWrapper(env)

    # Use CUDA if available
    if torch.cuda.is_available():
        set_gpu_mode(True)
        if isinstance(policy, MakeDeterministic):
            policy.stochastic_policy.cuda()
        else:
            policy.cuda()

    if printout:
        print("Evaluating policy over {} simulations...".format(n_eval))

    # Create variable to hold rewards to be averaged
    returns = []

    # Loop through simulation n_eval times and take average returns each time
    for i in range(n_eval):
        path = rollout(
            env,
            policy,
            max_path_length=env_config["horizon"],
            render=False,
        )

        # Determine total summed rewards from episode and append to 'returns'
        returns.append(sum(path["rewards"]))

    # Average the episode returns and normalize by the reward scale
    return sum(returns) / (env_config["reward_scale"] * n_eval)
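
As a usage note for the snippet above, a minimal hypothetical call might look like the sketch below. The concrete values (task name, robot, checkpoint path) are assumptions, not taken from the original; the keys mirror what evaluate_policy actually reads, since it pops "controller" and indexes "horizon" and "reward_scale" while forwarding the rest to suite.make.

# Hypothetical configuration; keys mirror what evaluate_policy() reads above.
env_config = {
    "env_name": "Lift",        # forwarded to suite.make (placeholder task)
    "robots": "Panda",         # forwarded to suite.make (placeholder robot)
    "controller": "OSC_POSE",  # popped and resolved via load_controller_config
    "horizon": 500,            # robosuite horizon, also used as max_path_length
    "reward_scale": 1.0,       # divisor when normalizing the averaged return
}
avg_return = evaluate_policy(env_config, model_path="params.pkl", n_eval=10, printout=True)
print("Normalized average return: {}".format(avg_return))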
Example #3
def make_env():
    controller = load_controller_config(default_controller="OSC_POSE")
    env = GymWrapper(
        suite.make(
            "Door",
            robots="Panda",  # use Sawyer robot
            use_camera_obs=False,  # do not use pixel observations
            has_offscreen_renderer=False,  # not needed since not using pixel obs
            has_renderer=False,  # no on-screen rendering
            reward_shaping=True,  # use dense rewards
            reward_scale=1.0,  # scale max 1 per timestep
            control_freq=20,  # control should happen fast enough so that simulation looks smooth
            horizon=500,
            ignore_done=True,
            hard_reset=False,
            controller_configs=controller))
    return env
Example #4
    def env_trans_fn(self, env, set_eval):

        if self.args.rs_demo_wrapper:
            env = DemoSamplerWrapper(
                env,
                demo_path=self.args.demo_path,
                need_xml=True,
                num_traj=-1,
                sampling_schemes=["uniform", "random"],
                scheme_ratios=[0.9, 0.1],
            )
            # Re-expose the wrapped env's action_spec on the outer wrapper
            env.action_spec = env.env.action_spec

        # Wrap with GymWrapper and clear Gym metadata attributes.
        env = GymWrapper(env)
        env.reward_range = None
        env.metadata = None
        env.spec = None

        return RoboSuiteWrapper(env)
Example #5
def fetch_push_gym_demo():
    env = GymWrapper(
        suite.make(
            'FetchPush',
            robots='Panda',
            controller_configs=None,
            gripper_types='UltrasoundProbeGripper',
            has_renderer=True,
            has_offscreen_renderer=False,
            use_camera_obs=False,
            use_object_obs=True,
            control_freq=50,
            render_camera=None,
        ))
    for i_episode in range(20):
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
                print("Episode finished after {} timesteps".format(t+1))
                break

We demonstrate equivalent functionality below.
"""

import robosuite as suite
from robosuite.wrappers import GymWrapper

if __name__ == "__main__":

    # Notice how the environment is wrapped by the wrapper
    env = GymWrapper(
        suite.make(
            "SawyerLift",
            use_camera_obs=False,  # do not use pixel observations
            has_offscreen_renderer=False,  # not needed since not using pixel obs
            has_renderer=True,  # make sure we can render to the screen
            reward_shaping=True,  # use dense rewards
            control_freq=100,  # control should happen fast enough so that simulation looks smooth
        )
    )

    for i_episode in range(20):
        observation = env.reset()
        for t in range(500):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
Example #7
def _init():
    register_gripper(UltrasoundProbeGripper)
    env = GymWrapper(suite.make(env_id, **options))
    env = Monitor(env)
    env.seed(seed + rank)
    return env
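
The _init closure above follows the per-worker factory pattern commonly used with Stable-Baselines3 vectorized environments; env_id, options, seed, and rank are captured from an enclosing helper that this listing omits. A sketch of such a helper, under that assumption (the name make_env and the example arguments are not from the original):

from stable_baselines3.common.monitor import Monitor


def make_env(env_id, options, rank, seed=0):
    # Returns a thunk so each worker process builds its own environment instance.
    def _init():
        register_gripper(UltrasoundProbeGripper)         # from the snippet above
        env = GymWrapper(suite.make(env_id, **options))
        env = Monitor(env)
        env.seed(seed + rank)                            # distinct seed per worker
        return env
    return _init


# Hypothetical usage with SubprocVecEnv (stable_baselines3.common.vec_env):
# vec_env = SubprocVecEnv([make_env("Lift", {}, rank=i) for i in range(4)])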
Example #8
        eval_env = DummyVecEnv([eval_env_func])
        eval_env = VecNormalize(eval_env)

        eval_callback = EvalCallback(eval_env, best_model_save_path='./best_models/',
                             log_path='./logs_best_model/',
                             deterministic=True, render=False, n_eval_episodes=10)

        model.learn(total_timesteps=training_timesteps, tb_log_name=tb_log_name, callback=eval_callback)

        model.save(save_model_path)
        env.save(save_vecnormalize_path)

    else:
        options['has_renderer'] = True
        register_gripper(UltrasoundProbeGripper)
        env_gym = GymWrapper(suite.make(env_id, **options))
        env = DummyVecEnv([lambda : env_gym])

        model = PPO.load(load_model_path)
        env = VecNormalize.load(load_vecnormalize_path, env)

        env.training = False
        env.norm_reward = False

        obs = env.reset()
        eprew = 0

        while True:
            action, _states = model.predict(obs, deterministic=True)
            #action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
class NutAssemblyDenseHand(gym.Env):
    """
        TODO: change obs and reward setting
        NutAssemblyDenseHand:
            analogous to 'FetchPickAndPlace-v1', with a hand-coded (near-perfect) base controller
            Action taken as:
                pi_theta(s) = pi(s) + f_theta(s)
        Parameters:
        -----------
        kp: float
            Scaling factor for position control
    """
    def __init__(self, kp: float = 20, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)

        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round peg
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering
                reward_shaping=False,  # sparse task reward (shaping disabled)
                control_freq=20,  # control should happen fast enough so that simulation looks smooth
            ))

        self.max_episode_steps = 500
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065

    def step(self, residual_action: np.ndarray):
        controller_action = np.array(
            self.controller_action(self.last_observation))
        # Flag cases where the hand-coded controller leaves the [-1, 1] action range
        if (controller_action > 1).any() or (controller_action < -1).any():
            print(controller_action)
        action = np.add(controller_action, residual_action)
        action = np.clip(action, -1, 1)
        ob, reward, done, info = self.env.step(action)
        ob = self.env.env._get_observation()
        observation = {}
        observation['observation'] = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat']))
        observation['desired_goal'] = np.array(
            self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation['achieved_goal'] = ob['RoundNut0_pos']
        self.last_observation = observation.copy()
        info['is_success'] = reward
        return observation, reward, done, info

    def reset(self):
        self.env.reset()  # reset according to task defaults
        ob = self.env.env._get_observation()
        observation = {}
        observation['observation'] = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat']))
        observation['desired_goal'] = np.array(
            self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation['achieved_goal'] = ob['RoundNut0_pos']
        self.last_observation = observation.copy()
        self.object_in_hand = False
        self.object_below_hand = False
        self.gripper_reoriented = 0
        return observation

    def seed(self, seed: int = 0):
        self.np_random, seed = seeding.np_random(seed)
        return self.env.seed(seed=seed)

    def render(self, mode: str = "human", *args, **kwargs):
        return self.env.render()

    def close(self):
        return self.env.close()

    def goal_distance(self, achieved_goal, desired_goal):
        return np.linalg.norm(achieved_goal - desired_goal, axis=-1)

    def compute_reward(self, achieved_goal, goal, info):
        # Compute distance between goal and the achieved goal.
        d = self.goal_distance(achieved_goal, goal)
        if self.reward_type == 'sparse':
            return -(d > self.distance_threshold).astype(np.float32)
        else:
            return -d
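        # Worked example (hypothetical distances): with distance_threshold = 0.065,
        # d = 0.05 gives reward -0.0 (success) and d = 0.10 gives -1.0 under the
        # 'sparse' setting; the dense branch would instead return -0.05 and -0.10.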

    def controller_action(self,
                          obs: dict,
                          take_action: bool = True,
                          DEBUG: bool = False):
        observation = obs['observation']
        goal_pos = obs['desired_goal']
        achieved_goal = obs['achieved_goal']

        gripper_pos = observation[:3]
        gripper_quat = observation[3:7]
        object_pos = observation[7:10]
        object_quat = observation[10:]

        z_table = 0.8610982

        object_axang = T.quat2axisangle(object_quat)
        if abs(object_axang[-1] - 1.24) < 0.2:
            object_axang_touse = [
                0, 0, object_axang[-1] % (2 * pi / 8) + (2 * pi / 8)
            ]
        else:
            object_axang_touse = [0, 0, object_axang[-1] % (2 * pi / 8)]
        gripper_axang = T.quat2axisangle(gripper_quat)
        # print('object axang to use ' + str(object_axang_touse))

        if self.gripper_reoriented == 0:
            self.gripper_init_quat = gripper_quat
            self.gripper_reoriented = 1

        init_inv = T.quat_inverse(self.gripper_init_quat)
        changing_wf = T.quat_multiply(init_inv, gripper_quat)
        changing_wf_axang = T.quat2axisangle(changing_wf)

        # gripper_quat_inv = T.quat_inverse(gripper_quat)
        # changing_wf = T.quat_multiply(gripper_quat_inv,self.gripper_init_quat)
        # changing_wf_axang = T.quat2axisangle(changing_wf)

        # print('changing wf axis ' +str(changing_wf_axang))

        if not self.object_below_hand or self.gripper_reoriented < 5:
            self.nut_p = T.quat2axisangle(object_quat)[-1]
            # print(self.nut_p)
            # Drive the gripper toward the nut in the xy-plane
            action = 20 * (object_pos[:2] - gripper_pos[:2])

            # frac = 0.2 # Rate @ which to rotate gripper about z.
            # ang_goal = frac*self.nut_p # Nut p is the nut's random intial pertubation about z.

            # if self.gripper_reoriented == 0:
            #     self.gripper_init_quat = gripper_quat
            # if self.gripper_reoriented < 5: # Gripper should be aligned with nut after 5 action steps
            #     action_angle= [0,0,ang_goal]

            #     #print('object ' + str(object_axang))
            #     #print('gripper ' + str(gripper_axang))

            #     init_inv = T.quat_inverse(self.gripper_init_quat)
            #     init_inv_mat = T.quat2mat(init_inv)

            #     rel = T.quat_multiply(gripper_quat,init_inv)
            #     rel_axang = T.quat2axisangle(rel)
            #     #print('rel_axang ' +str(rel_axang))

            #     rel_axang_WF = np.matmul(init_inv_mat,rel_axang)

            #     #print('rel_axang_WF ' +str(rel_axang_WF))

            #     if take_action:
            #         self.gripper_reoriented+=1
            # else: # After 5 action steps, don't rotate gripper any more
            #     action_angle=[0,0,0]

            action_angle = 0.2 * (object_axang_touse - changing_wf_axang)
            action_angle = [0, 0, action_angle[-1]]
            #action_angle = [0,0,0]
            #print('action angle ' +str(action_angle))

            if np.linalg.norm(object_axang_touse - changing_wf_axang) < 0.1:
                if take_action:
                    self.gripper_reoriented = 5

            action = np.hstack((action, [0], action_angle, [-1]))
            if np.linalg.norm((object_pos[:2] - gripper_pos[:2])) < 0.01:
                if take_action:
                    self.object_below_hand = True
                    #self.gripper_reoriented = 5

        elif not self.object_in_hand:  # Descend with the gripper open, then close it
            action = [0, 0, -1, 0, 0, 0, -1]
            if np.linalg.norm((object_pos[2] - gripper_pos[2])) < 0.01:
                action = [0, 0, 0, 0, 0, 0, 1]
                if take_action:
                    self.object_in_hand = True

        else:  # Move gripper up and toward goal position
            action = [0, 0, 1, 0, 0, 0, 1]
            if object_pos[2] - z_table > 0.1:
                action = 20 * (goal_pos[:2] - object_pos[:2])
                action = np.hstack((action, [0, 0, 0, 0, 1]))
                if np.linalg.norm((goal_pos[:2] - object_pos[:2])) < 0.0225:
                    action = [0, 0, 0, 0, 0, 0,
                              -1]  # Drop nut once it's close enough to the peg

        action = np.clip(action, -1, 1)
        return action
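
A rough rollout sketch for NutAssemblyDenseHand: inside step(), the scripted controller_action supplies the base command and the residual action passed in perturbs it. The zero residual below is purely illustrative (standing in for a learned f_theta(s)), and the sketch assumes the class and its imports are available as defined above.

import numpy as np

env = NutAssemblyDenseHand()
obs = env.reset()
for _ in range(env.max_episode_steps):
    residual = np.zeros(env.action_space.shape)  # placeholder for a learned residual
    obs, reward, done, info = env.step(residual)
    if done:
        break
env.close()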
Example #10
    do_render = False
    use_shape = 1

    env = robosuite.make(
        env_name,
        has_offscreen_renderer=False,  # not needed since not using pixel obs
        has_renderer=do_render,
        ignore_done=True,
        use_camera_obs=False,
        gripper_visualization=False,
        reward_shaping=use_shape,
        control_freq=100,
    )

    # env = IKWrapper(env)
    env = GymWrapper(env)

    # list of all demonstrations episodes
    demos = list(f["data"].keys())
    """ we will save 1 file for 1 demo trajectory. """
    for ep_i in range(1, len(demos) + 1):

        # ep = demos[ep_i]  ## The demos list variable does not sort 1, 2, 3, ... Instead it sorts 1, 10, 100, ...
        ep = "demo_%d" % ep_i

        model_file = f["data/{}".format(ep)].attrs["model_file"]
        model_path = os.path.join(demo_path, "models", model_file)
        with open(model_path, "r") as model_f:
            model_xml = model_f.read()

        ## reset model for each episode
[2] JOINT_POSITION - Joint Position
[3] OSC_POSITION - Operational Space Control (Position Only)
[4] OSC_POSE - Operational Space Control (Position + Orientation)
[5] IK_POSE - Inverse Kinematics Control (Position + Orientation) (Note: must have PyBullet installed)
"""
options = {}
controller_name = 'OSC_POSITION'
options["controller_configs"] = suite.load_controller_config(
    default_controller=controller_name)

env = GymWrapper(
    suite.make(
        "PickPlaceMilk",  # pickPLaceMilk task
        robots="IIWA",  # use IIWA robot
        **options,  # controller options
        use_camera_obs=False,  # do not use pixel observations
        has_offscreen_renderer=False,  # not needed since not using pixel obs
        has_renderer=True,  # make sure we can render to the screen
        reward_shaping=True,  # use dense rewards
        control_freq=20,  # control should happen fast enough so that simulation looks smooth
    ))


def controller(obs: dict, object_in_hand: bool = False):
    gripper_pos = obs['robot0_eef_pos']
    try:
        object_pos = obs['Can0_pos']
    except KeyError:
        object_pos = obs['Milk0_pos']
    z_table = 0.86109826
    # print(object_pos)
Example #12
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(args.env_id,
            ignore_done=True,
            use_camera_obs=False,
            has_renderer=True,
            control_freq=100,
            gripper_visualization=True,
            reward_shaping=True,
            #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
            #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
            #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
            #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
            #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
            #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
            #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
            #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
            #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
            #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
            box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
            box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
            ) # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env) # wrap in the gym environment

    # Environment joints should be clipped at 1 and -1 for sawyer

    
    # Task
    #task = 'train'
    task = 'evaluate'
    # parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')

    # Expert Path
    #expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/ac100/combined/combined_0.npz' # path for 100 trajectories
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/120_shift7/combined/combined_0.npz' # path for 100 trajectories

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    
    #env.seed(args.seed) # Sawyer does not have seed 

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    #if not os.path.isdir(args.log_dir):
    #    os.makedirs(args.log_dir)

    logger.log("log_directories: ",args.log_dir)
    
    logger.log("environment action space range: ", env.action_space) #logging the action space

    if task == 'train':
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)

        # Check dimensions of the dataset
        #print("dimension of inputs", dataset.dset.inputs.shape) # dims seem correct
        #print("dimension of inputs", dataset.dset.labels.shape) # dims seem correct

        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif task == 'evaluate':
        # Create the playback environment
        play_env = robosuite.make(args.env_id,
                ignore_done=True,
                use_camera_obs=False,
                has_renderer=True,
                control_freq=100,
                gripper_visualization=True,
                #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
                #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
                #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
                #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
                #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
                #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
                #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
                #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
                #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
                #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
                box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
                box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
                )

        #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

        runner(env,
                play_env,
                policy_fn,
                args.load_model_path,
                timesteps_per_batch=4000, # Change time step per batch to be more reasonable
                number_trajs=20, # change from 10 to 1 for evaluation
                stochastic_policy=args.stochastic_policy,
                save=args.save_sample
                )
    else:
        raise NotImplementedError
    env.close()
Example #13
def experiment(variant, agent="SAC"):

    # Make sure agent is a valid choice
    assert agent in AGENTS, "Invalid agent selected. Selected: {}. Valid options: {}".format(
        agent, AGENTS)

    # Get environment configs for expl and eval envs and create the appropriate envs
    # suites[0] is expl and suites[1] is eval
    suites = []
    for env_config in (variant["expl_environment_kwargs"],
                       variant["eval_environment_kwargs"]):
        # Load controller
        controller = env_config.pop("controller")
        if controller in set(ALL_CONTROLLERS):
            # This is a default controller
            controller_config = load_controller_config(
                default_controller=controller)
        else:
            # This is a string to the custom controller
            controller_config = load_controller_config(custom_fpath=controller)
        # Create robosuite env and append to our list
        suites.append(
            suite.make(
                **env_config,
                has_renderer=False,
                has_offscreen_renderer=False,
                use_object_obs=True,
                use_camera_obs=False,
                reward_shaping=True,
                controller_configs=controller_config,
            ))
    # Create gym-compatible envs
    expl_env = NormalizedBoxEnv(GymWrapper(suites[0]))
    eval_env = NormalizedBoxEnv(GymWrapper(suites[1]))

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs'],
    )

    # Define references to variables that are agent-specific
    trainer = None
    eval_policy = None
    expl_policy = None

    # Instantiate trainer with appropriate agent
    if agent == "SAC":
        expl_policy = TanhGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            **variant['policy_kwargs'],
        )
        eval_policy = MakeDeterministic(expl_policy)
        trainer = SACTrainer(env=eval_env,
                             policy=expl_policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **variant['trainer_kwargs'])
    elif agent == "TD3":
        eval_policy = TanhMlpPolicy(input_size=obs_dim,
                                    output_size=action_dim,
                                    **variant['policy_kwargs'])
        target_policy = TanhMlpPolicy(input_size=obs_dim,
                                      output_size=action_dim,
                                      **variant['policy_kwargs'])
        es = GaussianStrategy(
            action_space=expl_env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
        expl_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=eval_policy,
        )
        trainer = TD3Trainer(policy=eval_policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             target_policy=target_policy,
                             **variant['trainer_kwargs'])
    else:
        print("Error: No valid agent chosen!")

    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )

    # Define algorithm
    algorithm = CustomTorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
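
For orientation, a hypothetical skeleton of the variant dict that experiment() consumes; the keys match the ones read above ("expl_environment_kwargs", "eval_environment_kwargs", "qf_kwargs", "policy_kwargs", "trainer_kwargs", "replay_buffer_size", "algorithm_kwargs"), while every value is a placeholder rather than a setting from the original configuration.

# Hypothetical variant skeleton; real values depend on the experiment configs.
env_kwargs = {
    "env_name": "Lift",        # forwarded to suite.make (placeholder task)
    "robots": "Panda",         # forwarded to suite.make (placeholder robot)
    "controller": "OSC_POSE",  # popped and resolved to a controller config
    "horizon": 500,
    "reward_scale": 1.0,
}
variant = {
    "expl_environment_kwargs": dict(env_kwargs),
    "eval_environment_kwargs": dict(env_kwargs),
    "qf_kwargs": {"hidden_sizes": [256, 256]},      # placeholder network sizes
    "policy_kwargs": {"hidden_sizes": [256, 256]},
    "trainer_kwargs": {"discount": 0.99},           # placeholder trainer settings
    "replay_buffer_size": int(1e6),
    "algorithm_kwargs": {"num_epochs": 100, "batch_size": 256},
}
# experiment(variant, agent="SAC")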
Example #14
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env)  # wrap in the gym environment

    #task = 'train'
    task = 'evaluate'

    # Expert Path
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'  # path for 100 trajectories

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )

    play_env = GymWrapper(play_env)

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    if task == 'train':
        play_env.close()

        dataset = Mujoco_Dset(expert_path=expert_path,
                              traj_limitation=args.traj_limitation)

        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn,
                    reward_giver, dataset, args.algo, args.g_step, args.d_step,
                    args.policy_entcoeff, args.num_timesteps,
                    args.save_per_iter, args.checkpoint_dir, args.log_dir,
                    args.pretrained, args.BC_max_iter, task_name)

    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            tt = 0
            ob = last_ob

            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)

                play_env.render()  # check the running in for the first part
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

        play_env.close()

    env.close()
Example #15
    args = parser.parse_args()

    # create original environment
    env = robosuite.make(
        args.environment,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
    )
    data_directory = args.directory

    # wrap the environment with data collection wrapper
    #env = DataCollectionWrapper(env, data_directory)
    #env = IKWrapper(env)
    env = GymWrapper(env)

    # testing to make sure multiple env.reset calls don't create multiple directories
    env.reset()
    env.reset()
    env.reset()

    # collect some data
    #print("Collecting some random data...")
    #collect_random_trajectory(env, timesteps=args.timesteps)

    # playback some data
    _ = input("Press any key to begin the playback...")
    print("Playing back the data...")
    #data_directory = env.ep_directory
    playback_trajectory(env, data_directory)
    # Create env
    env_suite = suite.make(**env_args,
                           controller_configs=controller_config,
                           has_renderer=not args.record_video,
                           has_offscreen_renderer=args.record_video,
                           use_object_obs=True,
                           use_camera_obs=args.record_video,
                           reward_shaping=True
                           )
    
    # Make sure we only pass in the proprio and object obs (no images)
    keys = ["object-state"]
    for idx in range(len(env_suite.robots)):
        keys.append(f"robot{idx}_proprio-state")
    
    # Wrap environment so it's compatible with Gym API
    env = GymWrapper(env_suite, keys=keys)

    # Run rollout
    simulate_policy(
        env=env,
        model_path=os.path.join(args.load_dir, "params.pkl"),
        horizon=env_args["horizon"],
        render=not args.record_video,
        video_writer=video_writer,
        num_episodes=args.num_episodes,
        printout=True,
        use_gpu=args.gpu,
    )
Example #17
    do_render = args.render
    use_shape = 1

    env = robosuite.make(
        env_name,
        has_offscreen_renderer=False,  # not needed since not using pixel obs
        has_renderer=do_render,
        ignore_done=True,
        use_camera_obs=False,
        gripper_visualization=False,
        reward_shaping=use_shape,
        control_freq=100,
    )

    # env = IKWrapper(env)
    env = GymWrapper(env)

    # list of all demonstrations episodes
    demos = list(f["data"].keys())
    """ we will save 1 file for 1 demo trajectory. """
    for ep_i in range(1, len(demos) + 1):

        # ep = demos[ep_i]  ## The demos list variable does not sort 1, 2, 3, ... Instead it sorts 1, 10, 100, ...
        ep = "demo_%d" % ep_i

        model_file = f["data/{}".format(ep)].attrs["model_file"]
        model_path = os.path.join(demo_path, "models", model_file)
        with open(model_path, "r") as model_f:
            model_xml = model_f.read()

        ## reset model for each episode
        box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
    )

    pre_env = robosuite.make(
        args.environment,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        box_pos=[0.63522776, -0.3287869, 0.82162434],  # shift2
        box_quat=[0.6775825618903728, 0, 0, 0.679425538604203],  # shift2
    )

    # enable controlling the end effector directly instead of using joint velocities
    pre_env = GymWrapper(pre_env)
    env = IKWrapper(env)  # note cannot disable this or things go wack

    # wrap the environment with data collection wrapper
    tmp_directory = "~/Robotics/{}".format(str(time.time()).replace(
        ".", "_"))  # Change from temp to Robotics folder
    env = DataCollectionWrapperBaseline(env, tmp_directory)

    # initialize device
    if args.device == "keyboard":
        from robosuite.devices import Keyboard

        device = Keyboard()
        env.viewer.add_keypress_callback("any", device.on_press)
        env.viewer.add_keyup_callback("any", device.on_release)
        env.viewer.add_keyrepeat_callback("any", device.on_press)
Example #19
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
        box_pos=[0.23522776, 0.2287869, 0.82162434],  #shift3
        box_quat=[0.3775825618903728, 0, 0, 0.679425538604203],  #shift3
        #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
        #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
        #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5
        #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
        #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
        #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
        #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
        #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
    )

    play_env = GymWrapper(play_env, keys=None,
                          generalized_goal=False)  # false for loading prevs

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = play_env.observation_space
    ac_space = play_env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_shift2/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt_path = './reach_and_grasp_weights/reach_3_almost/trpo_gail.transition_limitation_1100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2
        #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1
        #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos, obs_array, jpos_array = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)

    # Gripping load + setting up the last observation
    play_ob_dim = play_env.observation_space.shape[0]
    play_ac_dim = play_env.action_space.shape[0]
    grip_policy = GaussianMlpPolicy(play_ob_dim, play_ac_dim)
    grip_vf = NeuralNetValueFunction(play_ob_dim, play_ac_dim)
    grip_saver = tf.compat.v1.train.Saver(max_to_keep=5)

    unchanged_ob = np.float32(np.zeros(play_ob_dim))

    with tf.compat.v1.Session() as sess2:
        sess2.run(init_op)
        # Load Checkpoint
        #ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #grip_ckpt_path = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
        grip_ckpt_path = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/'  #3rd grasp
        #ckpt_path = './reach_and_grasp_weights/reach_4/trpo_gail.transition_limitation_2400.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' # problem child 2
        #ckpt_path = './reach_and_grasp_weights/reach_5/trpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/' #problem child 1
        #ckpt_path = './reach_and_grasp_weights/reach_6/trpo_gail.transition_limitation_2500.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        #ckpt_path = './reach_and_grasp_weights/reach_7/trpo_gail.transition_limitation_3000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        grip_ckpt = tf.compat.v1.train.get_checkpoint_state(grip_ckpt_path)

        #print(grip_ckpt)

        grip_saver.restore(sess2, grip_ckpt.model_checkpoint_path)

        tt = 0

        cum_rew = 0

        #ob = last_ob
        #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros

        obfilter = ZFilter(play_env.observation_space.shape)

        #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2, is a problem?

        #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2

        statsu = np.load(
            "./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz"
        )  # shift 3

        #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4
        #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5
        #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6
        #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7

        print("load n: ", statsu["n"])
        print("load M: ", statsu["M"])
        print("load S: ", statsu["S"])

        obfilter.rs._n = statsu["n"]
        obfilter.rs._M = statsu["M"]
        obfilter.rs._S = statsu["S"]

        print("obf n: ", obfilter.rs._n)
        print("obf M: ", obfilter.rs._M)
        print("obf S: ", obfilter.rs._S)

        #env.set_robot_joint_positions(last_jpos)
        #ob = np.concatenate((last_ob,env.box_end),axis=0)
        ob = last_ob

        # Will this change the behavior of loading?
        play_env.set_robot_joint_positions(last_jpos)

        prev_ob = np.float32(np.zeros(
            ob.shape))  # check if indeed starts at all zeros

        unchanged_ob = ob

        ob = obfilter(ob)

        while True:
            s = np.concatenate([ob, prev_ob], -1)
            ac, _, _ = grip_policy.act(s)

            prev_ob = np.copy(ob)

            scaled_ac = play_env.action_space.low + (ac + 1.) * 0.5 * (
                play_env.action_space.high - play_env.action_space.low)
            scaled_ac = np.clip(scaled_ac, play_env.action_space.low,
                                play_env.action_space.high)

            ob, rew, new, _ = play_env.step(scaled_ac)

            unchanged_ob = ob

            ob = obfilter(ob)

            cum_rew += rew

            play_env.render()  # check the running in for the first part
            #logger.log("rendering for reach policy")

            if new or tt >= 200:
                break
            tt += 1

        print("Cumulative reward over session: " + str(cum_rew))

    #obs_array = None
    #jpos_array = None

    #print(last_jpos)
    #print(last_ob)

    last_joint_after_grip = play_env._joint_positions
    last_after_grip = unchanged_ob
    # Change environment box position to start from the last position on playenv
    #env.model.box_pos_array = np.array(play_env.sim.data.body_xpos[play_env.cube_body_id])
    #env.model.box_quat_array = convert_quat(
    #        np.array(play_env.sim.data.body_xquat[play_env.cube_body_id]), to="xyzw"
    #    )
    #env.box_pos = play_env.model.box_pos_array
    #env.box_quat = play_env.model.box_quat_array

    # set up the environment for loading or training
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        rob_init=last_joint_after_grip,
        box_pos=np.array(
            play_env.sim.data.body_xpos[play_env.cube_body_id]),  #shift3
        box_quat=convert_quat(np.array(
            play_env.sim.data.body_xquat[play_env.cube_body_id]),
                              to="xyzw"),  #shift3
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
        #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
        #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
        box_end=[0.3, 0.1, 1.0]  # shift 3
        #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4, try to increase traj limit to 2000
        #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
        #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5
        #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
        #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
        #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
        #box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
        #box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env, keys=None,
                     generalized_goal=True)  # wrap in the gym environment

    task = 'train'
    #task = 'evaluate'
    #task = 'cont_training'

    #env = bench.Monitor(env, logger.get_dir() and
    #                    osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    if task == 'train':
        play_env.close()

        #init_op2 = tf.compat.v1.global_variables_initializer()

        #sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess2:
        #sess2.run(init_op)
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        lift_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        lift_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        #sess2.run(init_op2)

        old_acktr_learn(
            env,
            policy=lift_policy,
            vf=lift_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=[last_after_grip],
            last_jpos=[last_joint_after_grip],
            animate=True,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'cont_training':

        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]

        #with tf.compat.v1.variable_scope("vf_aktr"):
        cont_vf = NeuralNetValueFunction(ob_dim, ac_dim, name="lift_vf_aktr")
        #with tf.compat.v1.variable_scope("pi_aktr"):
        cont_policy = GaussianMlpPolicy(ob_dim, ac_dim, name="lift_pi_aktr")

        ckpt_path_cont = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'

        stat_cont = "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_19713000.npz"

        old_acktr_learn(
            env,
            policy=cont_policy,
            vf=cont_vf,
            gamma=0.99,
            lam=0.97,
            timesteps_per_batch=1500,
            desired_kl=0.001,
            num_timesteps=args.num_timesteps,
            save_per_iter=args.save_per_iter,
            ckpt_dir=args.checkpoint_dir,
            traj_limitation=args.traj_limitation,
            last_ob=obs_array,
            last_jpos=jpos_array,
            animate=True,
            cont_training=True,
            load_path=ckpt_path_cont,
            obfilter_load=stat_cont,
            pi_name="lift_pi_aktr",
        )

        env.close()

    elif task == 'evaluate':

        play_env.close()

        # with tf.compat.v1.Session(config=tf.ConfigProto()):
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]

        eval_lift_vf = NeuralNetValueFunction(ob_dim,
                                              ac_dim,
                                              name="lift_vf_aktr")
        eval_lift_policy = GaussianMlpPolicy(ob_dim,
                                             ac_dim,
                                             name="lift_pi_aktr")

        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)

        #sess2 = tf.compat.v1.Session(config=tf.ConfigProto())
        #with tf.compat.v1.Session(config=tf.ConfigProto()) as sess3:
        with tf.compat.v1.Session() as sess3:
            sess3.run(init_op)

            ckpt_path_2 = './checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift'
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift' # shift 2
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift' # shift 3
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 4
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift' # shift 5
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 6
            #ckpt_path_2 = './reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift' #shift 7
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess3, ckpt_2.model_checkpoint_path)

            tt = 0

            cum_rew = 0

            #ob = last_ob
            #prev_ob = np.float32(np.zeros(ob.shape)) # check if indeed starts at all zeros

            obfilter = ZFilter(env.observation_space.shape)

            statsu = np.load(
                "./checkpoint/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_23493000.npz"
            )

            #statsu = np.load("./reach_and_grasp_weights/grasp_and_pickup2/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22953000.npz") # shift 2
            #statsu = np.load("./reach_and_grasp_weights/grasp_3/grasp_acktr_rl.transition_limitation_1000.SawyerLift/filter_stats_21002000.npz") # shift 3
            #statsu = np.load("./reach_and_grasp_weights/grasp_4/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_20162400.npz") # shift 4
            #statsu = np.load("./reach_and_grasp_weights/grasp_5/grasp_acktr_rl.transition_limitation_1200.SawyerLift/filter_stats_26066400.npz") #shift 5
            #statsu = np.load("./reach_and_grasp_weights/grasp_and_then_throws_somehow_6/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_27363000.npz") #shift 6
            #statsu = np.load("./reach_and_grasp_weights/grasp_pickup_7/grasp_acktr_rl.transition_limitation_1500.SawyerLift/filter_stats_22773000.npz") #shift 7

            print("load n: ", statsu["n"])
            print("load M: ", statsu["M"])
            print("load S: ", statsu["S"])

            obfilter.rs._n = statsu["n"]
            obfilter.rs._M = statsu["M"]
            obfilter.rs._S = statsu["S"]

            print("obf n: ", obfilter.rs._n)
            print("obf M: ", obfilter.rs._M)
            print("obf S: ", obfilter.rs._S)

            env.set_robot_joint_positions(last_jpos)
            ob = np.concatenate((last_ob, env.box_end), axis=0)
            prev_ob = np.float32(np.zeros(
                ob.shape))  # check if indeed starts at all zeros

            ob = obfilter(ob)

            while True:
                s = np.concatenate([ob, prev_ob], -1)
                ac, _, _ = eval_lift_policy.act(s)

                prev_ob = np.copy(ob)

                scaled_ac = env.action_space.low + (ac + 1.) * 0.5 * (
                    env.action_space.high - env.action_space.low)
                scaled_ac = np.clip(scaled_ac, env.action_space.low,
                                    env.action_space.high)

                ob, rew, new, _ = env.step(scaled_ac)

                ob = obfilter(ob)

                cum_rew += rew

                env.render()  # render on-screen to visualize the lift policy rollout
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

            print("Cumulative reward over session: " + str(cum_rew))

    env.close()
Beispiel #20
0
def main():

  parser = custom_arg_parser()
  args = parser.parse_args()
  load_defaults(args)
  print("Arguments:{}".format(args))
  # Create the model name with all the parameters
  
  model_dir_name = serialize_args(args)
  print("Model name: {}".format(model_dir_name))
  if args.model is not None:
    model_save_path = os.path.dirname(args.model) + "/"
    tb_save_path = model_save_path.replace("learned_models","tb_logs")
  else:
    model_save_path = "../../learned_models/" + model_dir_name + "/"
    tb_save_path = "../../tb_logs/" +  model_dir_name + "/"
  print("Model save path:{}".format(model_save_path))
  print("TB logs save path:{}".format(tb_save_path))
  final_model_path = model_save_path + "final_" + model_dir_name
  model_load_path = args.model
  show_render = args.visualize

  # Save args to json for training from checkpoints
  if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)
    with open(model_save_path + "args.json", 'w+') as f:
      json.dump(vars(args),f,indent=2,sort_keys=True)

  env = GymWrapper(
      suite.make(
      "JR2Door",
      has_renderer        = show_render,
      use_camera_obs      = False,
      ignore_done         = False,
      control_freq        = args.control_freq,
      horizon             = args.horizon,
      door_type           = args.door_type,
      bot_motion          = args.bot_motion,
      robot_pos           = args.robot_pos,
      robot_theta         = args.robot_theta,
      dist_to_handle_coef = args.rcoef_dist_to_handle,
      door_angle_coef     = args.rcoef_door_angle,
      handle_con_coef     = args.rcoef_handle_con,
      body_door_con_coef  = args.rcoef_body_door_con,
      self_con_coef       = args.rcoef_self_con,
      arm_handle_con_coef = args.rcoef_arm_handle_con,
      arm_door_con_coef   = args.rcoef_arm_door_con,
      force_coef          = args.rcoef_force,
      gripper_touch_coef  = args.rcoef_gripper_touch,
      dist_to_door_coef   = args.rcoef_dist_to_door,
      wall_con_coef       = args.rcoef_wall_con,
      reset_on_large_force= args.reset_on_large_force,
      debug_print         = args.print_info,
      eef_type            = args.eef_type,
      door_init_qpos      = args.door_init_qpos,
      goal_offset         = args.goal_offset,
    )
  )
  
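  # Note: SubprocVecEnv expects a list of independent env-constructor callables; the
  # lambdas below all return the same pre-built env instance, so this may not
  # parallelize (or pickle) as intended; a per-worker factory that builds a fresh env
  # is usually preferred.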
  if args.slurm:
    env = SubprocVecEnv([lambda: env for i in range(args.n_cpu)])
  else:
    env = DummyVecEnv([lambda: env])

  # Load the specified model, if there is one
  if args.model is not None:
    # Training from checkpoint, so need to reset timesteps for tb
    reset_num_timesteps = False
    if args.rl_alg == "ppo2":
      model = PPO2.load(model_load_path,env=env)
      print("Succesfully loaded PPO2 model")
    if args.rl_alg == "ppo1":
      model = PPO1.load(model_load_path,env=env)
      print("Succesfully loaded PPO1 model")
  else: 
    # New model, so need to reset timesteps for tb
    reset_num_timesteps = True
    if args.rl_alg == "ppo2":
      model = PPO2(
                  args.policy,
                  env,
                  verbose=args.verbose,
                  n_steps=args.n_steps,
                  nminibatches=args.minibatches,
                  noptepochs=args.opt_epochs,
                  cliprange=args.clip_range,
                  ent_coef=args.ent_coef,
                  tensorboard_log=tb_save_path,
                  #full_tensorboard_log=True
                  )

    elif args.rl_alg == "ppo1":
      model = PPO1(
                  args.policy,
                  env,
                  verbose=args.verbose,
                  timesteps_per_actorbatch=args.n_steps,
                  optim_epochs=args.opt_epochs,
                  tensorboard_log=tb_save_path,
                  )
  if args.replay:
    # Replay a policy
    obs = env.reset()
    count = 0
    with open('episode-reward.csv', mode='w') as fid:
      writer = csv.writer(fid, delimiter=',')
      writer.writerow(["reward"])  # header row; csv writerow expects a sequence of fields
    while(count < 1000):
      env.render()
      count += 1
      print(count)
    if args.model is None:
      print("Error: No model has been specified for replay")
      return
    while True:
      action, _states = model.predict(obs, deterministic=True)
      #print("action {}".format(action))
      obs, reward, done, info = env.step(action)
      env.render()
      #print(obs)
      #print(env.sim.data.qpos[env._ref_joint_vel_indexes])
      #time.sleep(0.1)

      with open('episode-reward.csv', mode='a') as fid:
        writer = csv.writer(fid, delimiter=',')
        writer.writerow(reward)

      #if done:
      #  quit()
  else:
    # Train
    model.learn(
                total_timesteps = args.total_timesteps,
                save_dir = model_save_path,
                render=show_render,
                reset_num_timesteps=reset_num_timesteps,
                )

    model.save(final_model_path)
  
    print("Done training")
    obs = env.reset()
class NutAssemblyDense(gym.Env):
    """
        NutAssembly:
        NutAssembly task from robosuite with no controller. Can be used for learning from scratch.
    """
    def __init__(self, horizon=500, *args, **kwargs):
        options = {}
        controller_name = 'OSC_POSE'
        options["controller_configs"] = suite.load_controller_config(
            default_controller=controller_name)

        self.env = GymWrapper(
            suite.make(
                "NutAssemblyRound",  # Nut Assembly task with the round nut
                robots="IIWA",  # use IIWA robot
                **options,  # controller options
                use_object_obs=True,  # use object state observations
                use_camera_obs=False,  # do not use pixel observations
                has_offscreen_renderer=False,  # not needed since not using pixel obs
                has_renderer=False,  # no on-screen rendering
                reward_shaping=True,  # use dense rewards
                reward_scale=1.0,
                control_freq=20,  # control frequency high enough for smooth simulation
                horizon=horizon,  # number of timesteps before the episode ends
            ))
        self.max_episode_steps = horizon
        self.action_space = self.env.action_space
        self.observation_space = self.env.observation_space
        self.reward_type = 'sparse'
        self.distance_threshold = 0.065

    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        ob = self.env.env._get_observation()
        peg_pos = np.array(self.env.sim.data.body_xpos[self.env.peg2_body_id])
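        # Flatten the dict observation into [eef pos, eef quat, nut pos, nut quat, target peg pos]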
        observation = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat'], peg_pos))
        # info['is_success'] = self.get_success(ob['RoundNut0_pos'], peg_pos)
        info['is_success'] = self.env.env._check_success()
        return observation, reward, done, info

    def reset(self):
        ob = self.env.reset()
        ob = self.env.env._get_observation()
        peg_pos = np.array(self.env.sim.data.body_xpos[self.env.peg2_body_id])
        observation = np.hstack(
            (ob['robot0_eef_pos'], ob['robot0_eef_quat'], ob['RoundNut0_pos'],
             ob['RoundNut0_quat'], peg_pos))
        return observation

    def seed(self, seed=0):
        self.np_random, seed = seeding.np_random(seed)
        return self.env.seed(seed=seed)

    def render(self, mode="human", *args, **kwargs):
        return self.env.render()

    def close(self):
        return self.env.close()

    def goal_distance(self, achieved_goal, desired_goal):
        return np.linalg.norm(achieved_goal - desired_goal)

    def get_success(self, object_pos, goal):
        # Compute distance between goal and the achieved goal.
        d = self.goal_distance(object_pos, goal)
        return (d < self.distance_threshold).astype(np.float32)
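
# A minimal usage sketch for NutAssemblyDense (not part of the original example):
# assuming the imports used above (robosuite as suite, GymWrapper, numpy as np,
# gym.utils.seeding) are available, it rolls out random actions for one episode to
# sanity-check the 17-dimensional observation layout returned by reset()/step().
#
#     env = NutAssemblyDense(horizon=200)
#     obs = env.reset()  # [eef_pos(3), eef_quat(4), nut_pos(3), nut_quat(4), peg_pos(3)]
#     done = False
#     while not done:
#         action = env.action_space.sample()  # random OSC_POSE action
#         obs, reward, done, info = env.step(action)
#     env.close()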
    # Pop the controller
    controller = env_args.pop("controller")
    if controller in ALL_CONTROLLERS:
        controller_config = load_controller_config(
            default_controller=controller)
    else:
        controller_config = load_controller_config(custom_fpath=controller)

    # Create env
    env_suite = suite.make(**env_args,
                           controller_configs=controller_config,
                           has_renderer=not args.record_video,
                           has_offscreen_renderer=args.record_video,
                           use_object_obs=True,
                           use_camera_obs=args.record_video,
                           reward_shaping=True)
    env = GymWrapper(env_suite)

    # Run rollout
    simulate_policy(
        env=env,
        model_path=os.path.join(args.load_dir, "params.pkl"),
        horizon=env_args["horizon"],
        render=not args.record_video,
        video_writer=video_writer,
        num_episodes=args.num_episodes,
        printout=True,
        use_gpu=args.gpu,
    )
    env_name = f["data"].attrs["env"]

    do_render = args.render

    env = robosuite.make(
        env_name,
        has_offscreen_renderer=False,  # not needed since not using pixel obs
        has_renderer=do_render,
        ignore_done=True,
        use_camera_obs=False,
        gripper_visualization=False,
        reward_shaping=1,
        control_freq=100,
    )
    # env = IKWrapper(env)
    env = GymWrapper(env)

    # if args.demo_list[0] is None:
    #     args.demo_list = range(0, len(demos))

    # start = 700
    # if args.env_id == 24:
    #     start = 630
    # sort_list = np.arange(start, start + 20)

    # ## load from sorted list
    # if args.demo_list[0] is None:
    #     args.demo_list = []
    #     filename = "%s/Documents/Git/imitation_data/TRAJ_robo/%s/%s_sort.txt" % (pathlib.Path.home(), env_name, env_name)
    #     # filename = "%s/Documents/Git/imitation_data/TRAJ_robo/%s/%s_chosen.txt" % (pathlib.Path.home(), env_name, env_name)