Example #1
    def __init__(self, opt):
        self.opt = opt
        self.env = gym.make(opt.env)
        if "SawyerPush" in self.opt.env:
            self.env = SawyerECWrapper(self.env, opt.env)
            self.env._max_episode_steps = 70
        self.env.seed(0)
        random.seed(0)
        try:
            self.state_dim = self.env.observation_space.shape[0]
        except Exception:
            # Fall back to a fixed dimension when the observation space is not a flat Box.
            self.state_dim = 16
        try:
            self.action_dim = self.env.action_space.shape[0]
        except Exception:
            self.action_dim = 2
        self.max_action = float(self.env.action_space.high[0])
        self.log_root = opt.log_root
        self.episode_n = opt.episode_n
        self.policy_path = os.path.join(
            opt.log_root,
            '{}_base/models/TD3_{}_0_actor'.format(opt.env, opt.env))
        if opt.load_policy != "":
            print(self.policy_path)
            self.policy = TD3(opt.load_policy, self.state_dim, self.action_dim,
                              self.max_action)

        self.setup(opt)
        self.create_data()
        print('----------- Dataset initialized ---------------')
        print('-----------------------------------------------\n')
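Only the initializer is shown above; `opt` is an argparse-style namespace. A minimal sketch of the fields it reads follows (every default below is hypothetical and only illustrates the expected names; `data_type` and `data_id` are consumed by the `setup()` step shown in Example #6):

import argparse

# Hypothetical defaults -- only the field names matter here.
parser = argparse.ArgumentParser()
parser.add_argument("--env", default="Reacher-v2")
parser.add_argument("--log_root", default="./logs")
parser.add_argument("--episode_n", type=int, default=10000)
parser.add_argument("--load_policy", default="")   # empty string -> random actions
parser.add_argument("--data_type", default="base")
parser.add_argument("--data_id", type=int, default=0)
opt = parser.parse_args([])                        # parse the defaults only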
Example #2
    def __init__(self, opt):
        self.opt = opt
        self.env_name = opt.env
        self.policy_path = os.path.join(
            opt.log_root,
            '{}_base/models/TD3_{}_0_actor'.format(opt.env, opt.env))
        self.state_dim = opt.state_dim1
        self.action_dim = opt.action_dim1
        self.max_action = 1
        print(self.env_name, self.state_dim, self.action_dim)
        self.policy = TD3(self.policy_path, self.state_dim, self.action_dim,
                          self.max_action, self.opt)
        self.env = gym.make(self.env_name)
        if "SawyerPush" in self.opt.env:
            self.env = SawyerECWrapper(self.env, opt.env)
            self.env._max_episode_steps = 70
        self.env.seed(100)
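The checkpoint path assembled here mirrors the naming used by the training script in Example #4, which saves under `{env}_base/models` with the file name `TD3_{env}_{seed}` (the TD3 save routine appears to append an `_actor` suffix). Purely for illustration, with hypothetical values:

import os

# Hypothetical values, for illustration only.
log_root = "./logs"
env = "Reacher-v2"

policy_path = os.path.join(log_root, '{}_base/models/TD3_{}_0_actor'.format(env, env))
print(policy_path)  # ./logs/Reacher-v2_base/models/TD3_Reacher-v2_0_actor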
Example #3
def eval_policy(policy, env_name, seed, eval_episodes=10):
        eval_env = gym.make(env_name)
        if "SawyerPush" in args.env:
            eval_env = SawyerECWrapper(eval_env, args.env)
            eval_env._max_episode_steps = 70
        eval_env.seed(seed + 100)

        avg_reward = 0.
        success_rate = 0.
        for _ in range(eval_episodes):
                state, done = eval_env.reset(), False
                state = flatten_state(state)
                while not done:
                        action = policy.select_action(np.array(state))
                        state, reward, done, info = eval_env.step(action)
                        state = flatten_state(state)
                        if ("first_success" in info.keys() and info["first_success"]):
                            success_rate += 1
                        avg_reward += reward

        avg_reward /= eval_episodes
        success_rate /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}, {success_rate:.3f}")
        print("---------------------------------------")
        return avg_reward
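`flatten_state` is not defined in any of these snippets. A plausible reading is that it turns goal-style dict observations into a single flat vector and leaves flat arrays untouched; a minimal sketch under that assumption (not the original implementation):

import numpy as np

def flatten_state(state):
    # Assumption: dict observations (e.g. 'observation', 'achieved_goal',
    # 'desired_goal') are concatenated into one flat vector; anything else
    # is returned as a flattened array.
    if isinstance(state, dict):
        return np.concatenate([np.asarray(v).ravel() for v in state.values()])
    return np.asarray(state).ravel()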
Example #4
def main(args):
        file_name = f"{args.policy}_{args.env}_{args.seed}"
        print("---------------------------------------")
        print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
        print("---------------------------------------")

        log_path = safe_path(os.path.join(args.log_root, '{}_base'.format(args.env)))
        result_path = safe_path(os.path.join(log_path, 'results'))
        model_path = safe_path(os.path.join(log_path, 'models'))
        
        '''
        ### s2r hacks
        s2r_parser = argparse.ArgumentParser()
        s2r_parser.add_argument("--encoder_type", default="mlp")
        s2r_parser.add_argument("--end_effector", default=True)
        s2r_parser.add_argument("--screen_width", type=int, default=480)
        s2r_parser.add_argument("--screen_height", type=int, default=480)
        s2r_parser.add_argument("--action_repeat", type=int, default=1)
        s2r_parser.add_argument("--puck_friction", type=float, default=2.0)
        s2r_parser.add_argument("--puck_mass", type=float, default=0.01)
        s2r_parser.add_argument("--unity",  default=False)
        s2r_parser.add_argument("--unity_editor", default=False)
        s2r_parser.add_argument("--virtual_display",  default=None)
        s2r_parser.add_argument("--port", default=1050)
        s2r_parser.add_argument("--absorbing_state", default=False)
        s2r_parser.add_argument("--dr", default=False)
        s2r_parser.add_argument("--env", default=None)
        s2r_args = s2r_parser.parse_args()
        import ipdb;ipdb.set_trace()
        env = make_s2r_env(args.env, s2r_args, env_type="real")
        '''
        env = gym.make(args.env)
        if "SawyerPush" in args.env:
            env = SawyerECWrapper(env, args.env)
            env._max_episode_steps = 70
        # Set seeds
        env.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)

        try:
            state_dim = env.observation_space.shape[0]
        except Exception:
            # Fall back to a fixed dimension when the observation space is not a flat Box.
            state_dim = 16
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])

        kwargs = {
                "state_dim": state_dim,
                "action_dim": action_dim,
                "max_action": max_action,
                "discount": args.discount,
                "tau": args.tau,
        }

        # Initialize policy
        if args.policy == "TD3":
                # Target policy smoothing is scaled wrt the action scale
                kwargs["policy_noise"] = args.policy_noise * max_action
                kwargs["noise_clip"] = args.noise_clip * max_action
                kwargs["policy_freq"] = args.policy_freq
                policy = TD3.TD3(**kwargs)
        else:
                raise ValueError("Unsupported policy: {}".format(args.policy))

        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

        # Evaluate untrained policy
        evaluations = [eval_policy(policy, args.env, args.seed)]

        state, done = env.reset(), False
        episode_reward = 0
        episode_timesteps = 0
        episode_num = 0
        success = False
        reach_reward = 0
        push_reward = 0
        cylinder_to_target = 100
        for t in range(int(args.max_timesteps)):
                state = flatten_state(state)
                episode_timesteps += 1

                # Select action randomly or according to policy
                if t < args.start_timesteps:
                        action = env.action_space.sample()
                else:
                        action = (
                                        policy.select_action(np.array(state))
                                        + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
                        ).clip(-max_action, max_action)

                # Perform action
                next_state, reward, done, info = env.step(action)
                next_state = flatten_state(next_state)
                done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

                if ("first_success" in info.keys() and info["first_success"]):
                    success = True

                # reach_reward += info["reward_reach"]
                # push_reward += info["reward_push"]
                # cylinder_to_target = min(cylinder_to_target, info["cylinder_to_target"])

                # Store data in replay buffer
                replay_buffer.add(state, action, next_state, reward, done_bool)

                state = next_state
                episode_reward += reward

                # Train agent after collecting sufficient data
                if t >= args.start_timesteps:
                        policy.train(replay_buffer, args.batch_size)

                if done:
                        # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
                        # reach_reward /= episode_timesteps
                        # push_reward /= episode_timesteps
                        #  Reach Reward: {reach_reward:.3f} Push Reward: {push_reward:.3f} cylinder_to_target: {cylinder_to_target:.3f}
                        print(
                                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Success: {success}")
                        # Reset environment
                        success = False
                        state, done = env.reset(), False
                        episode_reward = 0
                        reach_reward, push_reward = 0, 0
                        cylinder_to_target = 100
                        episode_timesteps = 0
                        episode_num += 1

                # Evaluate episode
                if (t + 1) % args.eval_freq == 0:
                        evaluations.append(eval_policy(policy, args.env, args.seed))
                        np.save(os.path.join(result_path, '{}'.format(file_name)), evaluations)
                        if args.save_model: policy.save(os.path.join(model_path, '{}'.format(file_name)))
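`safe_path`, used above and again in Example #6, is likewise undefined in these snippets; it presumably creates the directory if needed and returns it. A minimal sketch under that assumption:

import os

def safe_path(path):
    # Assumption: make sure the directory exists, then hand the path back.
    os.makedirs(path, exist_ok=True)
    return path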
Example #5
class CrossImgPolicy:
    def __init__(self, opt):
        self.opt = opt
        self.env_name = opt.env
        self.policy_path = os.path.join(
            opt.log_root,
            '{}_base/models/TD3_{}_0_actor'.format(opt.env, opt.env))
        self.state_dim = opt.state_dim1
        self.action_dim = opt.action_dim1
        self.max_action = 1
        print(self.env_name, self.state_dim, self.action_dim)
        self.policy = TD3(self.policy_path, self.state_dim, self.action_dim,
                          self.max_action, self.opt)
        self.env = gym.make(self.env_name)
        if "SawyerPush" in self.opt.env:
            self.env = SawyerECWrapper(self.env, opt.env)
            self.env._max_episode_steps = 70
        self.env.seed(100)

    def eval_policy(self,
                    iter,
                    gxmodel=None,
                    axmodel=None,
                    imgpath=None,
                    eval_episodes=10):
        eval_env = self.env
        state_buffer = []
        action_buffer = []
        avg_reward, new_reward = 0., 0.
        success_rate = 0.
        save_flag = False
        if imgpath is not None:
            if not os.path.exists(imgpath):
                os.mkdir(imgpath)
            save_flag = True

        for i in tqdm(range(eval_episodes)):
            state, done = eval_env.reset(), False
            if save_flag:
                episode_path = os.path.join(
                    imgpath, 'iteration_{}_episode_{}.mp4'.format(iter, i))
                frames = []
            count = 0
            while not done:
                state = np.array(flatten_state(state))
                img, depth = self.env.sim.render(mode='offscreen',
                                                 width=100,
                                                 height=100,
                                                 depth=True)
                with torch.no_grad():
                    action = self.policy.select_cross_action(
                        img, gxmodel, axmodel)
                state_buffer.append(state)
                action_buffer.append(action)
                state, reward, done, info = eval_env.step(action)
                state = flatten_state(state)
                if ("first_success" in info.keys()
                        and info["first_success"] == 1):
                    success_rate += 1
                elif ("episode_success" in info.keys()
                      and info["episode_success"] == True):
                    success_rate += 1
                avg_reward += reward

                if save_flag:
                    img = eval_env.sim.render(mode='offscreen',
                                              camera_name='track',
                                              width=500,
                                              height=500)
                    frames.append(img[::-1, :, :])
                count += 1
            if save_flag:
                self._save_video(episode_path, frames)
                if i >= 3:
                    save_flag = False
        avg_reward /= eval_episodes
        success_rate /= eval_episodes

        print("-----------------------------------------------")
        print("Evaluation over {} episodes: {:.3f}, {:.3f}".format(
            eval_episodes, avg_reward, success_rate))
        print("-----------------------------------------------")

        return avg_reward, success_rate

    def _save_video(self, fname, frames, fps=15.0):
        """ Saves @frames into a video with file name @fname. """
        def f(t):
            frame_length = len(frames)
            new_fps = 1.0 / (1.0 / fps + 1.0 / frame_length)
            idx = min(int(t * new_fps), frame_length - 1)
            return frames[idx]

        video = mpy.VideoClip(f, duration=len(frames) / fps + 2)

        video.write_videofile(fname, fps, verbose=False)
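A hedged usage sketch of the evaluation entry point above. It is not self-contained: `opt` stands for the namespace described in Example #2, and `gx`/`ax` are placeholders for whatever trained observation- and action-translation networks `select_cross_action` expects.

# Hypothetical wiring, assuming opt, gx and ax already exist.
runner = CrossImgPolicy(opt)
avg_reward, success_rate = runner.eval_policy(
    iter=0,                   # tag used in the saved video file names
    gxmodel=gx,
    axmodel=ax,
    imgpath='./eval_videos',  # videos of the first few episodes land here
    eval_episodes=10)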
Example #6
class CycleData:
    def __init__(self, opt):
        self.opt = opt
        self.env = gym.make(opt.env)
        if "SawyerPush" in self.opt.env:
            self.env = SawyerECWrapper(self.env, opt.env)
            self.env._max_episode_steps = 70
        self.env.seed(0)
        random.seed(0)
        try:
            self.state_dim = self.env.observation_space.shape[0]
        except Exception:
            # Fall back to a fixed dimension when the observation space is not a flat Box.
            self.state_dim = 16
        try:
            self.action_dim = self.env.action_space.shape[0]
        except Exception:
            self.action_dim = 2
        self.max_action = float(self.env.action_space.high[0])
        self.log_root = opt.log_root
        self.episode_n = opt.episode_n
        self.policy_path = os.path.join(
            opt.log_root,
            '{}_base/models/TD3_{}_0_actor'.format(opt.env, opt.env))
        if opt.load_policy != "":
            print(self.policy_path)
            self.policy = TD3(opt.load_policy, self.state_dim, self.action_dim,
                              self.max_action)

        self.setup(opt)
        self.create_data()
        print('----------- Dataset initialized ---------------')
        print('-----------------------------------------------\n')

    def setup(self, opt):
        self.episode_n = opt.episode_n
        self.env_logs = safe_path(
            os.path.join(self.log_root, '{}_data'.format(self.opt.env)))
        self.data_root = safe_path(
            os.path.join(self.env_logs, '{}_{}'.format(self.opt.data_type,
                                                       self.opt.data_id)))
        self.img_path = safe_path(os.path.join(self.data_root, 'imgs'))

    def create_data(self):
        self.reset_buffer()
        total_samples = 0
        i_episode = 0
        # opt.episode_n is used here as a budget on the total number of collected transitions.
        while total_samples < self.episode_n:
            observation, done, t = self.env.reset(), False, 0
            observation = flatten_state(observation)
            self.add_observation(observation)
            # episode_path = os.path.join(self.img_path,'episode-{}'.format(i_episode))
            # if not os.path.exists(episode_path):
            #     os.mkdir(episode_path)
            # path = os.path.join(episode_path, 'img_{}_{}.jpg'.format(i_episode, 0))
            # self.check_and_save(path)
            i_episode += 1
            while not done:
                if self.opt.load_policy != "":
                    action = self.policy.select_action(observation)
                else:
                    action = self.env.action_space.sample()
                observation, reward, done, info = self.env.step(action)
                observation = flatten_state(observation)
                self.add_action(action)
                self.add_observation(observation)

                # path = os.path.join(episode_path, 'img_{}_{}.jpg'.format(i_episode, t + 1))
                # self.check_and_save(path)
                t += 1

                if done:
                    print("Episode {} finished after {} timesteps".format(
                        i_episode, t))
                    total_samples += t
                    break
            self.merge_buffer()
        print("{} total samples collected".format(total_samples))
        self.collect_data()

    def check_and_save(self, path):
        img = self.env.sim.render(mode='offscreen',
                                  camera_name='track',
                                  width=256,
                                  height=256,
                                  depth=False)
        img = Image.fromarray(img[::-1, :, :])
        img.save(path)

    def collect_data(self):
        self.env.close()
        self.norm_state()
        self.pair_n = self.now_state.shape[0]
        assert (self.pair_n == self.next_state.shape[0])
        assert (self.pair_n == self.action.shape[0])
        self.save_npy()

    def norm_state(self):
        self.now_state = np.vstack(self.now_state)
        self.next_state = np.vstack(self.next_state)
        self.action = np.vstack(self.action)

    def save_npy(self):
        np.save(os.path.join(self.data_root, 'now_state.npy'), self.now_state)
        np.save(os.path.join(self.data_root, 'next_state.npy'),
                self.next_state)
        np.save(os.path.join(self.data_root, 'action.npy'), self.action)

    def reset_buffer(self):
        self.joint_pose_buffer = []
        self.achieved_goal_buffer = []
        self.goal_pos_buffer = []
        self.action_buffer = []

        self.now_state = []
        self.next_state = []
        self.action = []

    def add_observation(self, observation):
        self.joint_pose_buffer.append(observation)

    def add_action(self, action):
        self.action_buffer.append(action)

    def merge_buffer(self):
        self.now_state += self.joint_pose_buffer[:-1]
        self.next_state += self.joint_pose_buffer[1:]
        self.action += self.action_buffer

        self.joint_pose_buffer = []
        self.achieved_goal_buffer = []
        self.goal_pos_buffer = []
        self.action_buffer = []
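End to end, `CycleData` rolls out either the loaded TD3 actor or random actions until roughly `opt.episode_n` transitions have been collected, then writes three aligned arrays. A sketch of loading the result, following the directory layout built in `setup()` (the concrete values are hypothetical):

import os
import numpy as np

# Hypothetical values matching '{env}_data/{data_type}_{data_id}' from setup().
log_root, env, data_type, data_id = './logs', 'Reacher-v2', 'base', 0
data_root = os.path.join(log_root, '{}_data'.format(env),
                         '{}_{}'.format(data_type, data_id))

now_state = np.load(os.path.join(data_root, 'now_state.npy'))    # s_t
next_state = np.load(os.path.join(data_root, 'next_state.npy'))  # s_{t+1}
action = np.load(os.path.join(data_root, 'action.npy'))          # a_t
assert now_state.shape[0] == next_state.shape[0] == action.shape[0]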