Example #1
    def eval(self):
        average_episode_reward = 0
        average_episode_success = 0

        video_recorder = VideoRecorder()
        video_recorder.init()

        for episode in range(self.num_eval_episodes):

            obs_dict = self.env.reset()
            obs = obs_dict[self.observation_key]
            obs_g = obs_dict[self.desired_goal_key]
            done = False
            episode_reward = 0
            episode_step = 0

            while not done:
                action = self.agent.act(obs, obs_g, sample=True)

                next_obs_dict, reward, done, info = self.env.step(action)

                done = float(done)
                episode_reward += reward

                achieved_goal = next_obs_dict[self.achieved_goal_key]

                obs = next_obs_dict[self.observation_key]
                obs_g = next_obs_dict[self.desired_goal_key]
                episode_step += 1

                video_recorder.record(next_obs_dict)

            average_episode_reward += episode_reward / self.num_eval_episodes
            average_episode_success += float(
                info['is_success']) / self.num_eval_episodes

        video_recorder.save(f'{self.step}.mp4')

        tune.report(
            eval_reward=average_episode_reward,
            eval_is_success=average_episode_success,
            timesteps_this_iter=0,
        )
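
The snippet above assumes a `VideoRecorder` with `init()`, `record(obs_dict)`, and `save(name)`; the original class is not shown. Below is a minimal stand-in sketch, assuming each `obs_dict` exposes an RGB frame under a hypothetical 'image' key and that `imageio` is available.

import os

import imageio
import numpy as np


class SimpleVideoRecorder:
    """Illustrative stand-in, not the recorder used by the snippet above."""

    def __init__(self, save_dir='eval_videos', fps=30):
        self.save_dir = save_dir
        self.fps = fps
        self.frames = []

    def init(self):
        # start a fresh recording
        self.frames = []

    def record(self, obs_dict):
        # 'image' is an assumed key, purely for illustration
        frame = obs_dict.get('image')
        if frame is not None:
            self.frames.append(np.asarray(frame, dtype=np.uint8))

    def save(self, file_name):
        if self.frames:
            os.makedirs(self.save_dir, exist_ok=True)
            imageio.mimsave(os.path.join(self.save_dir, file_name),
                            self.frames, fps=self.fps)
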
Example #2
class Workspace(object):
    def __init__(
            self,
            log_save_tb=True,
            log_frequency_step=10000,
            agent_name='drq',
            # device='cuda',
            device='cpu',
            env='cartpole_swingup',
            seed=1,
            image_size=84,
            action_repeat=8,
            frame_stack=3,
            replay_buffer_capacity=100000,
            image_pad=4,
            save_video=True):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.logger = Logger(self.work_dir,
                             save_tb=log_save_tb,
                             log_frequency=log_frequency_step,
                             agent=agent_name,
                             action_repeat=action_repeat)

        utils.set_seed_everywhere(seed)
        self.device = torch.device(device)
        self.env = make_env(env, seed, image_size, action_repeat, frame_stack)

        self.agent = DRQAgent(
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            action_range=(float(self.env.action_space.low.min()),
                          float(self.env.action_space.high.max())),
            device=self.device)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          replay_buffer_capacity, image_pad,
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if save_video else None)
        self.step = 0

    def evaluate(
        self,
        num_eval_episodes=10,
    ):
        average_episode_reward = 0
        for episode in range(num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self,
            num_train_steps=1000000,
            num_train_iters=1,
            num_seed_steps=1000,
            eval_frequency=5000):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step,
                                     save=(self.step > num_seed_steps))

                # evaluate agent periodically
                if self.step % eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= num_seed_steps:
                for _ in range(num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
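
Because this variant takes plain keyword arguments instead of a Hydra config, launching it is a single constructor call. A minimal sketch, assuming the class above lives in a module named `workspace.py` (a hypothetical name); the argument names come from the constructor and `run()` signatures shown above.

# Hypothetical launch script for the keyword-argument Workspace above.
from workspace import Workspace

if __name__ == '__main__':
    ws = Workspace(device='cpu', env='cartpole_swingup', save_video=False)
    ws.run(num_train_steps=10000, num_seed_steps=1000, eval_frequency=5000)
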
Example #3
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir + "_" + self.cfg.env +
                             "_eval2k_effective_{}_seed_{}".format(
                                 self.cfg.effective_aug, self.cfg.seed),
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        self.effective_aug = self.cfg.effective_aug
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device,
                                          self.effective_aug)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
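
This variant and the ones that follow receive a Hydra config object (`cfg`) and build the agent with `hydra.utils.instantiate`. A typical entry point, sketched in Hydra 1.x style; the config path and name are assumptions, not taken from the original repo.

# Hedged entry-point sketch for the Hydra-configured variants.
import hydra


@hydra.main(config_path='.', config_name='config')
def main(cfg):
    workspace = Workspace(cfg)
    workspace.run()


if __name__ == '__main__':
    main()
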
Example #4
class Workspace(object):
    def __init__(self, cfg):

        self.work_dir = os.getcwd()
        """Hack to adjust action_repeat"""
        adjust_action_repeat_hack(cfg)

        print(f"CFG:\n{'-'*100}\n{cfg}\n{'-'*100}")

        self.cfg = cfg
        experiment_name = f"{cfg.full_title}_{cfg.run_id}"

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             save_wb=cfg.log_save_wandb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             cfg=dict(flatten_cfg(cfg)),
                             plot_project="drqtest",
                             experiment=experiment_name)
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        print(f"ACTOR:\n{'-'*100}\n{self.agent.actor}\n{'-'*100}")
        print(f"CRITIC:\n{'-'*100}\n{self.agent.critic}\n{'-'*100}")

        self.replay_buffer = ReplayBuffer(
            self.env.observation_space.shape,
            self.env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            use_aug=cfg.replay_buffer_augmentation)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 1, True
        start_time = time.time()

        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
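
Throughout these training loops, `done` is stored alongside `done_no_max`, which is zeroed when an episode ends only because it hit `_max_episode_steps`; this is what the "allow infinite bootstrap" comment refers to. The critic update itself lives inside `agent.update` and is not shown, but here is a hedged sketch of how such a target typically uses `done_no_max`.

# Illustrative TD-target sketch; assumes the common convention that the critic
# bootstraps with (1 - done_no_max), so a time-limit cutoff is not treated as a
# true terminal state.
def td_target(reward, next_value, done_no_max, discount=0.99):
    return reward + discount * (1.0 - done_no_max) * next_value
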
Example #5
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)
        self.obs_shape = self.env.observation_space['observation'].shape
        self.goal_shape = self.env.observation_space['desired_goal'].shape

        cfg.agent.params.obs_dim = self.obs_shape[0]
        cfg.agent.params.goal_dim = self.goal_shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.obs_shape, self.goal_shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward

            self.video_recorder.save(f'{self.step}.mp4')
            self.logger.log('eval/episode_reward', episode_reward, self.step)
        self.logger.dump(self.step)

    def run_her(self, path_buffer):

        #first_obs = path_buffer[0][0]
        #last_obs = path_buffer[-1][0]
        #first_goal = first_obs['achieved_goal']
        #last_goal = last_obs['achieved_goal']
        #goal_changed = np.mean(last_goal - first_goal)**2 > 1e-6

        #if goal_changed:
        for n, ts in enumerate(path_buffer):
            # select goal id
            if self.cfg.her_strat == 'future':
                i = np.random.randint(n, len(path_buffer))
            elif self.cfg.her_strat == 'last':
                i = -1
            new_goal_obs = path_buffer[i][3]
            new_goal = new_goal_obs['achieved_goal']
            # relabel
            obs, action, reward, next_obs, done, done_no_max = ts
            obs['desired_goal'] = new_goal
            next_obs['desired_goal'] = new_goal
            reward = self.env.compute_reward(next_obs['achieved_goal'],
                                             new_goal, None)
            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        path_buffer = []
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    if self.cfg.save_model:
                        self.agent.save()
                        self.agent.load()
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

                # her
                if self.cfg.her_iters > 0 and len(path_buffer):
                    for k in range(self.cfg.her_iters):
                        self.run_her(path_buffer)
                path_buffer = []

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs['observation'],
                                            obs['desired_goal'],
                                            sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)
            path_buffer.append(
                [obs, action, reward, next_obs, done, done_no_max])

            obs = next_obs
            episode_step += 1
            self.step += 1
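
`run_her` relabels a finished trajectory with goals achieved later in the same episode. Below is a compact standalone sketch of the 'future' strategy, assuming a gym-style `compute_reward(achieved_goal, desired_goal, info)`; unlike the method above, it copies the observation dicts so transitions that were already stored unrelabeled are not mutated.

import copy

import numpy as np


def relabel_future(trajectory, compute_reward):
    """Hindsight relabeling, 'future' strategy (illustrative sketch).

    trajectory: list of (obs, action, reward, next_obs, done, done_no_max),
    where obs/next_obs are dicts with 'achieved_goal' and 'desired_goal'.
    """
    relabeled = []
    for n, (obs, action, _, next_obs, done, done_no_max) in enumerate(trajectory):
        # pick a goal that was actually achieved at or after step n
        i = np.random.randint(n, len(trajectory))
        new_goal = trajectory[i][3]['achieved_goal']
        obs, next_obs = copy.deepcopy(obs), copy.deepcopy(next_obs)
        obs['desired_goal'] = new_goal
        next_obs['desired_goal'] = new_goal
        reward = compute_reward(next_obs['achieved_goal'], new_goal, None)
        relabeled.append((obs, action, reward, next_obs, done, done_no_max))
    return relabeled
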
Example #6
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = dmc.make_meta(cfg.env, cfg.episode_length, cfg.seed)
        self.eval_env = dmc.make_meta(cfg.env, cfg.episode_length,
                                      cfg.seed + 1)

        obs_spec = self.env.observation_spec()['features']
        action_spec = self.env.action_spec()

        cfg.agent.params.obs_shape = obs_spec.shape
        cfg.agent.params.action_shape = action_spec.shape
        cfg.agent.params.action_range = [
            float(action_spec.minimum.min()),
            float(action_spec.maximum.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = MetaReplayBuffer(cfg.train_tasks, obs_spec.shape,
                                              action_spec.shape,
                                              cfg.replay_buffer_capacity,
                                              self.device)

        self.eval_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_total_reward = 0
        for task_id in self.cfg.eval_tasks:
            # adaptation phase
            # reset the agent once so its memory persists across episodes
            state = self.agent.reset()
            for episode in range(self.cfg.num_adapt_episodes):
                time_step = self.eval_env.reset(task_id)
                while not time_step.last():
                    with utils.eval_mode(self.agent):
                        obs = time_step.observation['features']
                        action = self.agent.act(obs, state, sample=False)
                    time_step = self.eval_env.step(action)
                    next_obs = time_step.observation['features']
                    # update agent's memory
                    state = self.agent.step(state, obs, action,
                                            time_step.reward, next_obs)

            # evaluation phase
            # agent's memory should be initialized by now
            average_episode_reward = 0
            for episode in range(self.cfg.num_eval_episodes):
                time_step = self.eval_env.reset(task_id)
                self.eval_video_recorder.init(enabled=(episode == 0))
                episode_reward = 0
                episode_success = 0
                episode_step = 0
                while not time_step.last():
                    with utils.eval_mode(self.agent):
                        obs = time_step.observation['features']
                        action = self.agent.act(obs, state, sample=False)
                    time_step = self.eval_env.step(action)
                    next_obs = time_step.observation['features']
                    # update agent's memory
                    state = self.agent.step(state, obs, action,
                                            time_step.reward, next_obs)
                    self.eval_video_recorder.record(self.eval_env)
                    episode_reward += time_step.reward
                    episode_step += 1

                average_episode_reward += episode_reward
                self.eval_video_recorder.save(
                    f'task_{task_id}_step_{self.step}.mp4')
            average_episode_reward /= self.cfg.num_eval_episodes
            average_total_reward += average_episode_reward
            self.logger.log(f'eval/task_{task_id}_episode_reward',
                            average_episode_reward / self.cfg.episode_length,
                            self.step)
        average_total_reward /= len(self.cfg.eval_tasks)
        self.logger.log('eval/episode_reward',
                        average_total_reward / self.cfg.episode_length,
                        self.step)
        self.logger.dump(self.step, ty='eval')

    def run(self):
        episode, episode_reward, episode_step = 0, 0, 0
        start_time = time.time()
        done = True
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    fps = episode_step / (time.time() - start_time)
                    self.logger.log('train/fps', fps, self.step)
                    start_time = time.time()

                    self.logger.log('train/episode_reward',
                                    episode_reward / self.cfg.episode_length,
                                    self.step)
                    self.logger.log('train/episode', episode, self.step)
                    self.logger.dump(
                        self.step,
                        save=(self.step > self.cfg.num_seed_steps),
                        ty='train')

                # initially try each task
                if episode < len(self.cfg.train_tasks):
                    task_id = self.cfg.train_tasks[episode]
                else:
                    task_id = np.random.choice(self.cfg.train_tasks)
                state = self.agent.reset()
                time_step = self.env.reset(task_id)
                obs = time_step.observation['features']
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # evaluate agent periodically
            if self.step % self.cfg.eval_frequency == 0:
                self.logger.log('eval/episode', episode, self.step)
                self.evaluate()

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                spec = self.env.action_spec()
                action = np.random.uniform(spec.minimum, spec.maximum,
                                           spec.shape)
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, state, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            time_step = self.env.step(action)
            next_obs = time_step.observation['features']

            # allow infinite bootstrap
            done = time_step.last()
            episode_reward += time_step.reward

            self.replay_buffer.add(task_id, obs, action, time_step.reward,
                                   next_obs, done)
            # update agent's memory
            state = self.agent.step(state, obs, action, time_step.reward,
                                    next_obs)

            obs = next_obs
            episode_step += 1
            self.step += 1
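
Both the adaptation and evaluation loops above follow the dm_env convention of driving episodes with `time_step.last()` rather than a `done` flag. A bare-bones rollout sketch for that interface, assuming an agent with the `act(obs, state, sample)` and `step(...)` methods used above:

def rollout(env, agent, task_id, state):
    """Illustrative dm_env-style rollout mirroring the loops above."""
    time_step = env.reset(task_id)
    episode_reward = 0.0
    while not time_step.last():
        obs = time_step.observation['features']
        action = agent.act(obs, state, sample=False)
        time_step = env.step(action)
        next_obs = time_step.observation['features']
        episode_reward += time_step.reward
        # keep the agent's recurrent memory up to date
        state = agent.step(state, obs, action, time_step.reward, next_obs)
    return episode_reward, state
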
Example #7
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.21/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/seed=0/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg
        self.log_eval_dir = self.work_dir + '/eval_standalone'
        # Use a separate eval dir to avoid overwriting training files
        if not os.path.exists(self.log_eval_dir):
            os.makedirs(self.log_eval_dir)
        self.logger = Logger(self.log_eval_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg,
                                            False,
                                            False,
                                            work_dir=self.work_dir)
        experiment_identifier = self.work_dir.split('runs')[1]
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)
        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]
        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0

        self.agent = hydra.utils.instantiate(cfg.agent)

        self.render_train_samples = True
        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.eval_envs.items():
                name = 'StandAloneEval_Unseen_Environment_' + str(
                    env_idx) + '.png'
                img_path = self.work_dir + name
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(img_path)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None, phase='eval_standalone')

        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval

        self.eval_trials = 100
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.eval_trials):
            print('Episode Trial ', episode)
            self.video_recorder.init(enabled=True)
            eval_env = self.eval_envs[random.sample(list(self.eval_envs),
                                                    1)[0]]
            obs = eval_env.reset()
            done = False
            episode_reward = 0
            episode_step = 0
            while (episode_step <= eval_env._max_episode_steps - 1):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = eval_env.step(action)
                self.video_recorder.record(eval_env)
                episode_reward += reward
                episode_step += 1
                self.step += 1
                if done: break
            average_episode_reward += episode_reward
            print('Episode Reward ', episode_reward)
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.eval_trials
        self.logger.log('eval_standalone/episode_reward',
                        average_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval_standalone')

    def run(self):
        if os.path.exists(self.model_dir):
            latest_step = utils.get_latest_file(self.model_dir)
            self.agent.load(self.model_dir, latest_step)
        else:
            raise ValueError('Could not reload weights!')

        self.evaluate()
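
`utils.get_latest_file` is not shown above. A hypothetical stand-in that returns the highest step number found among checkpoint files in the model directory, matching the `agent.load(model_dir, latest_step)` call:

import os
import re


def get_latest_file(model_dir):
    """Hypothetical helper: return the largest step number in checkpoint names."""
    steps = []
    for file_name in os.listdir(model_dir):
        match = re.search(r'(\d+)', file_name)
        if match:
            steps.append(int(match.group(1)))
    if not steps:
        raise FileNotFoundError(f'no checkpoints found in {model_dir}')
    return max(steps)
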
Example #8
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg
        config_file = self.work_dir.split('runs')[0] + 'configs/' \
             + cfg.env.replace('-', '_') + '.yaml'
        shutil.copy(config_file, self.work_dir)

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        experiment_identifier = self.work_dir.split('runs')[1]
        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Interventions
        interventions = cfg.internvention
        image_augmentation = False

        if 'type_1' in interventions:
            image_augmentation = True
        if 'type_2' in interventions:
            cfg.apply_mod = True

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.env_sampler = utils.EnvSampler(cfg, work_dir=self.work_dir)
        self.eval_envs = self.env_sampler.sample_eval_envs(
            experiment_identifier)
        self.train_envs = self.env_sampler.sample_all_train_envs(
            experiment_identifier)

        self.resample_envs = cfg.resample_env
        self.env_resample_rate = cfg.env_resample_rate

        self.render_train_samples = True

        if self.render_train_samples:
            if cfg.env.startswith('jaco'):
                height = 256
                width = 256
            else:
                height = width = 500
            from PIL import Image
            for env_idx, env in self.train_envs.items():
                name = 'Environment_' + str(env_idx) + '.png'
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(name)
            for env_idx, env in self.eval_envs.items():
                name = 'Eval_Unseen_Environment_' + str(env_idx) + '.png'
                env.reset()
                obs = env.render(mode='rgb_array', height=height, width=width)
                im = Image.fromarray(obs)
                im.save(name)

        env_sample_key = list(self.eval_envs.keys())[0]
        sample_env = self.eval_envs[env_sample_key]
        cfg.agent.params.obs_shape = sample_env.observation_space.shape
        cfg.agent.params.action_shape = sample_env.action_space.shape
        cfg.agent.params.action_range = [
            float(sample_env.action_space.low.min()),
            float(sample_env.action_space.high.max())
        ]
        state_append = cfg.lowobs_append
        if state_append:
            if cfg.env == 'window-open-v1':
                # Double check this
                cfg.agent.params.lstate_shape = 9
            elif cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0

        self.agent = hydra.utils.instantiate(cfg.agent)
        self.replay_buffer = MultiEnvReplayBuffer(
            sample_env.observation_space.shape,
            sample_env.action_space.shape,
            cfg.replay_buffer_capacity,
            self.cfg.image_pad,
            self.device,
            image_augmentation,
            num_envs=self.num_train_envs,
            state_append=state_append,
            state_lstate_shape=cfg.agent.params.lstate_shape)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.train_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None, phase='train')
        self.model_dir = self.work_dir + '/agent_model'
        self.step = [0] * self.num_train_envs
        self.reload_weights = cfg.reload_weights

        self.train_vid_interval = cfg.train_vid_interval

    def evaluate(self, phase, eval_env):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = eval_env.reset()
            if phase == 'unseen':
                self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            # `while not done` doesn't work for metaworld, so bound the loop by episode length
            while (episode_step <= eval_env._max_episode_steps - 1):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = eval_env.step(action)
                if phase == 'unseen':
                    self.video_recorder.record(eval_env)
                episode_reward += reward
                episode_step += 1
                if done: break
            average_episode_reward += episode_reward
            if phase == 'unseen':
                self.video_recorder.save(f'{self.step[0]}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        if phase == 'seen':
            self.logger.log('eval_seen/episode_reward', average_episode_reward,
                            self.step[0])
            self.logger.dump(self.step[0], ty='eval_seen')
        elif phase == 'unseen':
            self.logger.log('eval_unseen/episode_reward',
                            average_episode_reward, self.step[0])
            self.logger.dump(self.step[0], ty='eval_unseen')
        eval_env.reset()

    def run(self):
        init_env = None
        keys_to_sample = random.sample(list(self.train_envs),
                                       self.num_train_envs)
        sampled_train_envs = {
            key: self.train_envs[key]
            for key in keys_to_sample
        }
        # Better way to access first elem of OrderedDict?
        for env_idx, env in sampled_train_envs.items():
            init_env = env
            break
        episode, episode_reward, episode_step, done = [0] * self.num_train_envs, [0] * self.num_train_envs, \
            [0] * self.num_train_envs, [True] * self.num_train_envs
        obs, next_obs = [init_env.reset()] * self.num_train_envs, [
            init_env.reset()
        ] * self.num_train_envs
        start_time = time.time()

        train_recording = False
        env_to_rec = 0

        if self.reload_weights and os.path.exists(self.model_dir):
            # Continue training
            try:
                latest_step = utils.get_latest_file(self.model_dir)
                self.agent.load(self.model_dir, latest_step)
            except:
                print('Could not reload weights!')
        while self.step[0] < self.cfg.num_train_steps:

            if self.resample_envs and self.step[
                    0] > 0 and self.step[0] % self.env_resample_rate == 0:
                keys_to_sample = random.sample(list(self.train_envs),
                                               self.num_train_envs)
                sampled_train_envs = {
                    key: self.train_envs[key]
                    for key in keys_to_sample
                }

            for env_idx, (env_tag,
                          env) in enumerate(sampled_train_envs.items()):
                episode_step[env_idx] = 0
                while (episode_step[env_idx] <= env._max_episode_steps - 1):
                    if not train_recording and env_idx == env_to_rec and self.step[
                            env_idx] % self.train_vid_interval == 0:
                        train_recording = True
                        self.train_video_recorder.init(enabled=True)

                    if done[env_idx] or (episode_step[env_idx] >=
                                         env._max_episode_steps - 1):
                        if self.step[env_idx] > 0:
                            self.logger.log('train/duration',
                                            time.time() - start_time,
                                            self.step[env_idx])
                            start_time = time.time()

                        if self.step[
                                env_idx] > 0 and env_idx == env_to_rec and train_recording:
                            file_name = str(self.step[env_idx]) + '_' + env_tag
                            self.train_video_recorder.save(f'{file_name}.mp4')
                            self.train_video_recorder.frames = []
                            train_recording = False
                            env_to_rec = random.randint(
                                0,
                                len(sampled_train_envs) - 1)

                        # Evaluate agent periodically
                        if env_idx == 0 and episode[
                                env_idx] % self.cfg.eval_frequency == 0:
                            # Evaluate an env from training
                            self.logger.log('eval_seen/episode',
                                            episode[env_idx],
                                            self.step[env_idx])
                            eval_env = self.train_envs[random.sample(
                                list(self.train_envs), 1)[0]]
                            self.evaluate(phase='seen', eval_env=eval_env)
                            # Evaluate an unseen env
                            self.logger.log('eval_unseen/episode',
                                            episode[env_idx],
                                            self.step[env_idx])
                            eval_env = self.eval_envs[random.sample(
                                list(self.eval_envs), 1)[0]]
                            self.evaluate(phase='unseen', eval_env=eval_env)
                        if episode[env_idx] % self.cfg.ckpt_frequency == 0:
                            self.agent.save(self.model_dir, episode[env_idx])
                        self.logger.log('train/episode_reward',
                                        episode_reward[env_idx],
                                        self.step[env_idx])
                        obs[env_idx] = env.reset()
                        done[env_idx] = False
                        episode_reward[env_idx] = 0
                        episode[env_idx] += 1

                        self.logger.log('train/episode', episode[env_idx],
                                        self.step[env_idx])
                        self.logger.log('train/env_idx', env_tag,
                                        self.step[env_idx])

                    # sample action for data collection
                    if self.step[env_idx] < self.cfg.num_seed_steps:
                        action = env.action_space.sample()
                    else:
                        with utils.eval_mode(self.agent):
                            action = self.agent.act(obs[env_idx], sample=True)

                    next_obs[env_idx], reward, done[env_idx], _ = env.step(
                        action)
                    if train_recording and env_idx == env_to_rec:
                        self.train_video_recorder.record(env)

                    # allow infinite bootstrap
                    done[env_idx] = float(done[env_idx])
                    done_no_max = 0 if episode_step[
                        env_idx] + 1 == env._max_episode_steps - 1 else done[
                            env_idx]

                    episode_reward[env_idx] += reward

                    self.replay_buffer.add(env_idx, obs[env_idx], action,
                                           reward, next_obs[env_idx],
                                           done[env_idx], done_no_max)

                    obs[env_idx] = next_obs[env_idx]
                    episode_step[env_idx] += 1
                    self.step[env_idx] += 1

                    # Run training update
                    if self.step[env_idx] >= self.cfg.num_seed_steps:
                        #print('Running train update')
                        for _ in range(self.cfg.num_train_iters):
                            self.agent.update(self.replay_buffer,
                                              self.num_train_envs, self.logger,
                                              self.step[env_idx], env_tag,
                                              env_idx)
                # At the end of each episode, log
                self.logger.dump(
                    self.step[env_idx],
                    save=(self.step[env_idx] > self.cfg.num_seed_steps),
                    ty='train')
Example #9
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd().split('runs')[0] + 'runs/'
        self.work_dir = self.work_dir + \
            '2020.10.22/jaco_reach_site_features_drq_agent.cls=agents.drq_agent.DRQAgent,agent.name=drq,batch_size=64,lr=0.005/'
        self.model_dir = self.work_dir + '/agent_model'
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat,
                             overwrite=True)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)

        # Environment Sampler
        self.num_train_envs = cfg.num_envs
        self.frame_stack = 5
        self.env_sampler = utils.EnvSampler(cfg,
                                            False,
                                            False,
                                            work_dir=self.work_dir)
        self.eval_env_sim = self.env_sampler.make_env()

        self.jaco_real_env = JacoPhysics('j2s7s300',
                                         robot_server_ip='127.0.0.1',
                                         robot_server_port=9030,
                                         control_type='position')
        self.frame_size = 84
        self.jaco_real_env = utils.FrameStackJacoReal(self.jaco_real_env,
                                                      k=self.frame_stack,
                                                      frame_size=self.frame_size,
                                                      dummy_env=self.eval_env_sim)

        cfg.agent.params.obs_shape = self.eval_env_sim.observation_space.shape
        cfg.agent.params.action_shape = self.eval_env_sim.action_space.shape
        cfg.agent.params.action_range = [
            float(self.eval_env_sim.action_space.low.min()),
            float(self.eval_env_sim.action_space.high.max())
        ]
        if cfg.lowobs_append:
            if cfg.env == 'jaco_reach_site_features':
                cfg.agent.params.lstate_shape = 49
            else:
                cfg.agent.params.lstate_shape = 9
        else:
            cfg.agent.params.lstate_shape = 0

        self.agent = hydra.utils.instantiate(cfg.agent)

        self.sim_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None,
            dir_name='jaco_sim_video',
            phase='eval')
        self.real_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None,
            dir_name='jaco_real_video',
            phase='eval',
            height=640,
            width=480)

        self.reload_weights = cfg.reload_weights
        self.train_vid_interval = cfg.train_vid_interval

        self.num_eval_episodes = 1
        self.episode_max_step = 30
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for trial in range(self.num_eval_episodes):
            # This will send jaco to real home
            obs = self.jaco_real_env.reset()
            sim_obs = self.eval_env_sim.reset()
            obs['state_low_obs'] = sim_obs['state_low_obs']
            # Now lets go to sim home
            self.send_robot_to_sim_home()
            print('Done sending him home')
            self.sim_video_recorder.init(enabled=(trial == 0))
            self.real_video_recorder.init(enabled=(trial == 0))
            # What to do with done? Make sim to indicate done?
            # done = False
            episode_reward = 0
            episode_step = 0
            while (episode_step <= self.episode_max_step):
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                translated_act = self.translate_action_sim_to_real(action)

                obs = self.jaco_real_env.step(translated_act)
                obs['state_low_obs'] = sim_obs['state_low_obs']
                print('Translated Act ', translated_act)
                # Take a sim step with the original action
                sim_obs, reward, done, _ = self.eval_env_sim.step(action)

                self.sim_video_recorder.record(self.eval_env_sim)
                self.real_video_recorder.record(self.jaco_real_env, real_jaco=True)
                episode_reward += reward
                episode_step += 1
                # if done: break
            average_episode_reward += episode_reward
            self.sim_video_recorder.save(f'{trial}.mp4')
            self.real_video_recorder.save(f'{trial}.mp4')
        average_episode_reward /= self.num_eval_episodes
        print('Rewards ', average_episode_reward)

    def run(self):

        if os.path.exists(self.model_dir):
            latest_step = utils.get_latest_file(self.model_dir)
            self.agent.load(self.model_dir, latest_step)

        self.evaluate()

    def translate_action_sim_to_real(self, action):

        self.eval_env_sim.step(action)
        sim_qpos = self.eval_env_sim.physics.data.qpos
        return sim_qpos

    def send_robot_to_sim_home(self):
        self.eval_env_sim.reset()
        home_sim = self.eval_env_sim.physics.data.qpos
        self.jaco_real_env.step(home_sim)
Example #10
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')
        self.model_dir = utils.make_dir(self.work_dir, 'model')
        self.buffer_dir = utils.make_dir(self.work_dir, 'buffer')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             action_repeat=cfg.action_repeat,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                            cfg.seed)
        self.eval_env = dmc.make(cfg.env, cfg.frame_stack, cfg.action_repeat,
                                 cfg.seed + 1)

        obs_spec = self.env.observation_spec()['pixels']
        action_spec = self.env.action_spec()

        cfg.agent.params.obs_shape = obs_spec.shape
        cfg.agent.params.action_shape = action_spec.shape
        cfg.agent.params.action_range = [
            float(action_spec.minimum.min()),
            float(action_spec.maximum.max())
        ]
        # exploration agent uses intrinsic reward
        self.expl_agent = hydra.utils.instantiate(cfg.agent,
                                                  task_agnostic=True)
        # task agent uses extrinsic reward
        self.task_agent = hydra.utils.instantiate(cfg.agent,
                                                  task_agnostic=False)
        self.task_agent.assign_modules_from(self.expl_agent)

        if cfg.load_pretrained:
            pretrained_path = utils.find_pretrained_agent(
                cfg.pretrained_dir, cfg.env, cfg.seed, cfg.pretrained_step)
            print(f'snapshot is taken from: {pretrained_path}')
            pretrained_agent = utils.load(pretrained_path)
            self.task_agent.assign_modules_from(pretrained_agent)

        # buffer for the task-agnostic phase
        self.expl_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                        cfg.replay_buffer_capacity,
                                        self.device)
        # buffer for task-specific phase
        self.task_buffer = ReplayBuffer(obs_spec.shape, action_spec.shape,
                                        cfg.replay_buffer_capacity,
                                        self.device)

        self.eval_video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def get_agent(self):
        if self.step < self.cfg.num_expl_steps:
            return self.expl_agent
        return self.task_agent

    def get_buffer(self):
        if self.step < self.cfg.num_expl_steps:
            return self.expl_buffer
        return self.task_buffer

    def evaluate(self):
        avg_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            time_step = self.eval_env.reset()
            self.eval_video_recorder.init(enabled=(episode == 0))
            episode_reward = 0
            episode_success = 0
            episode_step = 0
            while not time_step.last():
                agent = self.get_agent()
                with utils.eval_mode(agent):
                    obs = time_step.observation['pixels']
                    action = agent.act(obs, sample=False)
                time_step = self.eval_env.step(action)
                self.eval_video_recorder.record(self.eval_env)
                episode_reward += time_step.reward
                episode_step += 1

            avg_episode_reward += episode_reward
            self.eval_video_recorder.save(f'{self.step}.mp4')
        avg_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', avg_episode_reward, self.step)
        self.logger.dump(self.step, ty='eval')

    def run(self):
        episode, episode_reward, episode_step = 0, 0, 0
        start_time = time.time()
        done = True
        while self.step <= self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    fps = episode_step / (time.time() - start_time)
                    self.logger.log('train/fps', fps, self.step)
                    start_time = time.time()
                    self.logger.log('train/episode_reward', episode_reward,
                                    self.step)
                    self.logger.log('train/episode', episode, self.step)
                    self.logger.dump(self.step, ty='train')

                time_step = self.env.reset()
                obs = time_step.observation['pixels']
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            agent = self.get_agent()
            replay_buffer = self.get_buffer()
            # evaluate agent periodically
            if self.step % self.cfg.eval_frequency == 0:
                self.logger.log('eval/episode', episode - 1, self.step)
                self.evaluate()

            # save agent periodically
            if self.cfg.save_model and self.step % self.cfg.save_frequency == 0:
                utils.save(
                    self.expl_agent,
                    os.path.join(self.model_dir, f'expl_agent_{self.step}.pt'))
                utils.save(
                    self.task_agent,
                    os.path.join(self.model_dir, f'task_agent_{self.step}.pt'))
            if self.cfg.save_buffer and self.step % self.cfg.save_frequency == 0:
                replay_buffer.save(self.buffer_dir, self.cfg.save_pixels)

            # sample action for data collection
            if self.step < self.cfg.num_random_steps:
                spec = self.env.action_spec()
                action = np.random.uniform(spec.minimum, spec.maximum,
                                           spec.shape)
            else:
                with utils.eval_mode(agent):
                    action = agent.act(obs, sample=True)

            agent.update(replay_buffer, self.step)

            time_step = self.env.step(action)
            next_obs = time_step.observation['pixels']

            # allow infinite bootstrap
            done = time_step.last()
            episode_reward += time_step.reward

            replay_buffer.add(obs, action, time_step.reward, next_obs, done)

            obs = next_obs
            episode_step += 1
            self.step += 1
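All of the workspaces in this listing rely on a VideoRecorder with an init / record / save interface, but the class itself never appears. The snippet below is only a minimal sketch of what such a recorder could look like, assuming a gym-style environment that supports render(mode='rgb_array') and using imageio for encoding; it is not the recorder these examples actually import.

import os

import imageio


class VideoRecorder:
    # Minimal sketch: mirrors the init/record/save calls used above.
    def __init__(self, root_dir, fps=30):
        # passing root_dir=None (the save_video=False case) disables recording
        self.save_dir = os.path.join(root_dir, 'video') if root_dir else None
        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)
        self.fps = fps
        self.enabled = False
        self.frames = []

    def init(self, enabled=True):
        self.frames = []
        self.enabled = self.save_dir is not None and enabled

    def record(self, env):
        if self.enabled:
            # assumes a gym-style env exposing an rgb_array render mode
            self.frames.append(env.render(mode='rgb_array'))

    def save(self, file_name):
        if self.enabled:
            path = os.path.join(self.save_dir, file_name)
            imageio.mimsave(path, self.frames, fps=self.fps)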
Example #11
0
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.train_envs, self.test_envs = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.train_envs[0].observation_space.shape[0] + cfg.noise_dims
        cfg.agent.params.action_dim = self.train_envs[0].action_space.shape[0]
        if cfg.agent.name != 'sac':
            cfg.agent.params.num_envs = cfg.num_train_envs
        cfg.agent.params.action_range = [
            float(self.train_envs[0].action_space.low.min()),
            float(self.train_envs[0].action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)
        self.agent.seq_len = cfg.seq_len

        self.replay_buffer = MultiEnvReplayBuffer(
            (cfg.agent.params.obs_dim,),  # hard coded
            self.train_envs[0].action_space.shape,
            int(cfg.replay_buffer_capacity),
            self.device,
            num_envs=cfg.num_train_envs,
            seq_len=cfg.seq_len)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = [0] * cfg.num_train_envs

    def evaluate(self, env, train=False):
        for episode in range(self.cfg.num_eval_episodes):
            obs = env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = env.step(action)
                self.video_recorder.record(env)
                episode_reward += reward

            self.video_recorder.save(f'{self.step[0]}.mp4')
            if train:
                self.logger.log('eval/train_episode_reward', episode_reward, self.step[0])
            else:
                self.logger.log('eval/eval_episode_reward', episode_reward, self.step[0])

    def run(self):
        num_envs = self.cfg.num_train_envs
        episode, episode_reward = [0] * num_envs, [0] * num_envs
        episode_step, done = [0] * num_envs, [True] * num_envs
        obs = [self.train_envs[0].reset()] * num_envs
        next_obs = [self.train_envs[0].reset()] * num_envs
        start_time = time.time()
        while self.step[0] < self.cfg.num_train_steps:
            for e_idx, env in enumerate(self.train_envs):
                if done[e_idx]:
                    if self.step[e_idx] > 0:
                        self.logger.log('train/duration',
                                        time.time() - start_time, self.step[e_idx])
                        start_time = time.time()
                        self.logger.dump(
                            self.step[e_idx], save=(self.step[e_idx] > self.cfg.num_seed_steps))

                    # evaluate agent periodically
                    if self.step[0] > 0 and self.step[0] % self.cfg.eval_frequency == 0:
                        self.logger.log('eval/episode', episode[e_idx], self.step[e_idx])
                        self.evaluate(env, train=True)
                        self.evaluate(self.test_envs[0], train=False)
                        self.logger.dump(self.step[e_idx])
                    self.logger.log('train/episode_reward', episode_reward[e_idx],
                                    self.step[e_idx])

                    obs[e_idx] = env.reset()
                    self.agent.reset()
                    done[e_idx] = False
                    episode_reward[e_idx] = 0
                    episode_step[e_idx] = 0
                    episode[e_idx] += 1

                    self.logger.log('train/episode', episode[e_idx], self.step[e_idx])

                # sample action for data collection
                if self.step[e_idx] < self.cfg.num_seed_steps:
                    action = env.action_space.sample()
                else:
                    with utils.eval_mode(self.agent):
                        action = self.agent.act(obs[e_idx], sample=True)

                # joint encoder + SAC updates during the encoder-training phase
                if self.cfg.num_seed_steps < self.step[e_idx] <= self.cfg.num_train_encoder_steps:
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step[e_idx],
                                      train_sac=True, train_encoder=True)
                # afterwards keep training SAC with the encoder frozen
                elif self.step[e_idx] > self.cfg.num_train_encoder_steps:
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step[e_idx],
                                      train_sac=True, train_encoder=False)

                try:
                    next_obs[e_idx], reward, done[e_idx], _ = env.step(action)
                except Exception:
                    # treat an invalid action as a terminal, zero-reward step
                    next_obs[e_idx] = obs[e_idx]
                    reward = 0
                    print('Invalid action. Terminating episode.')
                    done[e_idx] = True

                # allow infinite bootstrap
                done[e_idx] = float(done[e_idx])
                done_no_max = 0 if episode_step[e_idx] + 1 == env._max_episode_steps else done[e_idx]
                episode_reward[e_idx] += reward

                self.replay_buffer.add(e_idx, obs[e_idx], action, reward,
                                       next_obs[e_idx], done[e_idx],
                                       done_no_max)

                obs[e_idx] = next_obs[e_idx]
                episode_step[e_idx] += 1
                self.step[e_idx] += 1
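The done_no_max flag stored just above (and in several of the later examples) is the standard time-limit trick: when an episode ends only because env._max_episode_steps was reached, the transition is recorded as non-terminal so the critic keeps bootstrapping. Below is a hedged sketch of the target such a buffer ultimately feeds; the agent.update implementations are not part of this listing, so treat it as an illustration of the convention rather than the exact code.

def td_target(reward, discount, next_value, done_no_max):
    # done_no_max is 1 only for true terminations, so episodes cut off by the
    # time limit still bootstrap from the value of the next observation
    return reward + (1.0 - done_no_max) * discount * next_value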
Example #12
0
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = utils.make_env(cfg)

        cfg.agent.params.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.params.action_dim = self.env.action_space.shape[0]
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          int(cfg.replay_buffer_capacity),
                                          self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                # save agent periodically
                if self.step > 0 and self.step % self.cfg.save_frequency == 0:
                    self.agent.save(self.work_dir, self.step)

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)

                obs = self.env.reset()
                self.agent.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
                propensity = 1.0 / np.prod(self.env.action_space.high -
                                           self.env.action_space.low)
            else:
                with utils.eval_mode(self.agent):
                    if self.cfg.log_propensities:
                        action, propensity = self.agent.act(obs,
                                                            sample=True,
                                                            propensity=True)
                    else:
                        action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step)

            next_obs, reward, done, _ = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            if self.cfg.log_propensities:
                self.replay_buffer.add(obs, action, reward, next_obs, done,
                                       done_no_max, propensity)
            else:
                self.replay_buffer.add(obs, action, reward, next_obs, done,
                                       done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
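The propensity logged in the example above is the density of the behaviour policy that generated each action, which allows importance-weighted off-policy corrections later. During the seed phase actions are uniform over the Box action space, so the density is the constant computed above. The helpers below are purely illustrative (the names do not come from the original code) and assume the current policy can report its own density pi(a|s).

import numpy as np


def uniform_propensity(action_space):
    # density of a uniform draw over a gym Box action space,
    # matching the 1 / prod(high - low) expression used above
    return 1.0 / np.prod(action_space.high - action_space.low)


def importance_weight(policy_density, propensity):
    # hypothetical off-policy correction: likelihood of the logged action
    # under the current policy divided by its likelihood under the logger
    return policy_density / propensity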
Example #13
0
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')

        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency_step,
                             agent=cfg.agent.name,
                             action_repeat=cfg.action_repeat)

        utils.set_seed_everywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        self.env = make_env(cfg)

        cfg.agent.params.obs_shape = self.env.observation_space.shape
        cfg.agent.params.action_shape = self.env.action_space.shape
        cfg.agent.params.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        self.agent = hydra.utils.instantiate(cfg.agent)

        self.replay_buffer = ReplayBuffer(self.env.observation_space.shape,
                                          self.env.action_space.shape,
                                          cfg.replay_buffer_capacity,
                                          self.cfg.image_pad, self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

        if self.cfg.episode_dir:
            self.load_episodes(cfg.episode_dir)

    def load_episodes(self, directory):
        directory = pathlib.Path(directory).expanduser()
        print(f'Loading episodes from {directory}')
        num_loaded_episodes = 0
        for filename in directory.glob('*.npz'):
            try:
                with filename.open('rb') as f:
                    episode = np.load(f)
                    episode = {k: episode[k] for k in episode.keys()}
            except Exception as e:
                print(f'Could not load episode: {e}')
                continue
            images = process_images(episode['image'])
            obses = images[:-1]
            actions = episode['action'][:-1]
            rewards = episode['sparse_reward'][:-1]
            next_obses = images[1:]
            dones = np.zeros(len(actions))
            dones_no_max = dones
            for transition in zip(obses, actions, rewards, next_obses, dones,
                                  dones_no_max):
                self.replay_buffer.add(*transition)
            num_loaded_episodes += 1
        print(f'Loaded {num_loaded_episodes} episodes.')

    def evaluate(self):
        average_episode_reward = 0
        average_episode_success = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            episode_step = 0
            while not done:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, info = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward
                episode_step += 1

            average_episode_reward += episode_reward
            average_episode_success += float(episode_reward > 0)
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        average_episode_success /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.log('eval/episode_success', average_episode_success,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, episode_step, done = 0, 0, 0, True
        start_time = time.time()
        while self.step < self.cfg.num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(
                        self.step, save=(self.step > self.cfg.num_seed_steps))

                # evaluate agent periodically
                if self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()

                self.logger.log('train/episode_reward', episode_reward,
                                self.step)
                self.logger.log('train/episode_success',
                                float(episode_reward > 0), self.step)

                obs = self.env.reset()
                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1

                self.logger.log('train/episode', episode, self.step)

            # sample action for data collection
            if self.step < self.cfg.num_seed_steps:
                action = self.env.action_space.sample()
            else:
                with utils.eval_mode(self.agent):
                    action = self.agent.act(obs, sample=True)

            # run training update
            if self.step >= self.cfg.num_seed_steps:
                for _ in range(self.cfg.num_train_iters):
                    self.agent.update(self.replay_buffer, self.logger,
                                      self.step)

            next_obs, reward, done, info = self.env.step(action)

            # allow infinite bootstrap
            done = float(done)
            done_no_max = 0 if episode_step + 1 == self.env._max_episode_steps else done
            episode_reward += reward

            self.replay_buffer.add(obs, action, reward, next_obs, done,
                                   done_no_max)

            obs = next_obs
            episode_step += 1
            self.step += 1
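load_episodes in the example above expects .npz files containing per-step arrays under the keys 'image', 'action' and 'sparse_reward', and pairs consecutive images into (obs, next_obs) transitions. Below is a hedged sketch of a writer that produces this layout; save_episode_npz is an illustrative name, not a function from the original code.

import numpy as np


def save_episode_npz(path, images, actions, sparse_rewards):
    # one array per key; load_episodes slices these so that images[t]
    # becomes the observation and images[t + 1] the next observation
    np.savez_compressed(path,
                        image=np.asarray(images),
                        action=np.asarray(actions),
                        sparse_reward=np.asarray(sparse_rewards))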
Example #14
0
class Workspace(object):
    def __init__(self, cfg):
        self.work_dir = os.getcwd()
        print(f'workspace: {self.work_dir}')
        self.cfg = cfg

        self.logger = Logger(self.work_dir,
                             save_tb=cfg.log_save_tb,
                             log_frequency=cfg.log_frequency,
                             agent=cfg.agent.name)

        setSeedEverywhere(cfg.seed)
        self.device = torch.device(cfg.device)
        # self.env = utils.makeEnv(cfg)
        self.env = hydra.utils.call(cfg.env)

        cfg.agent.obs_dim = self.env.observation_space.shape[0]
        cfg.agent.action_dim = self.env.action_space.shape[0]
        cfg.agent.action_range = [
            float(self.env.action_space.low.min()),
            float(self.env.action_space.high.max())
        ]
        cfg.agent.n_step = cfg.replay_buffer.n_step # n-step experience replay
        self.agent = hydra.utils.instantiate(cfg.agent, _recursive_=False)

        self.replay_buffer = ReplayBuffer(
            capacity=cfg.replay_buffer.capacity,
            obs_shape=self.env.observation_space.shape,
            action_shape=self.env.action_space.shape,
            obs_dtype=self.env.observation_space.dtype,
            action_dtype=self.env.action_space.dtype,
            n_step=cfg.replay_buffer.n_step,  # n-step experience replay
            discount=cfg.agent.discount,  # per-step discount
            device=self.device)

        self.video_recorder = VideoRecorder(
            self.work_dir if cfg.save_video else None)
        self.step = 0

    def evaluate(self):
        average_episode_reward = 0
        for episode in range(self.cfg.num_eval_episodes):
            obs = self.env.reset()
            self.agent.reset()
            self.video_recorder.init(enabled=(episode == 0))
            done = False
            episode_reward = 0
            while not done:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=False)
                obs, reward, done, _ = self.env.step(action)
                self.video_recorder.record(self.env)
                episode_reward += reward

            average_episode_reward += episode_reward
            self.video_recorder.save(f'{self.step}.mp4')
        average_episode_reward /= self.cfg.num_eval_episodes
        self.logger.log('eval/episode_reward', average_episode_reward,
                        self.step)
        self.logger.dump(self.step)

    def run(self):
        episode, episode_reward, done = 0, 0, True
        start_time = time.time()
        num_train_steps = self.cfg.num_train_steps # total training steps
        num_seed_steps = self.cfg.num_seed_steps # steps prior to training
        env = self.env
        while self.step < num_train_steps:
            if done:
                if self.step > 0:
                    self.logger.log('train/duration',
                                    time.time() - start_time, self.step)
                    start_time = time.time()
                    self.logger.dump(self.step, save=(self.step > num_seed_steps))
                # evaluate agent periodically
                if self.step > 0 and self.step % self.cfg.eval_frequency == 0:
                    self.logger.log('eval/episode', episode, self.step)
                    self.evaluate()
                self.logger.log('train/episode_reward', episode_reward,self.step)
                self.logger.log('train/episode', episode, self.step)

                done = False
                episode_reward = 0
                episode_step = 0
                episode += 1
                
                self.agent.reset()
                obs = env.reset()
                self.replay_buffer.onEpisodeEnd()

            # sample action for data collection
            if self.step < num_seed_steps:
                action = env.action_space.sample()
            else:
                with evalMode(self.agent):
                    action = self.agent.act(obs, sample=True)
            # run training update
            if self.step >= num_seed_steps:
                self.agent.update(self.replay_buffer, self.logger, self.step) 

            next_obs, reward, done, _ = env.step(action)

            max_episode_step_reached = (episode_step + 1 == env._max_episode_steps)
            not_done = True if max_episode_step_reached else (not done) # allow infinite bootstrap
            done = done or max_episode_step_reached # signals episode ended
            self.replay_buffer.add(obs, action, reward, next_obs, not_done)
            
            obs = next_obs
            episode_step += 1
            self.step += 1
            episode_reward += reward
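The last example configures its replay buffer with n_step and a per-step discount, so sampled transitions carry multi-step returns rather than single-step rewards. A minimal sketch of the accumulation such a buffer typically performs once an n-step window is complete (the real buffer class is not shown in this listing):

def n_step_return(rewards, discount):
    # R = r_0 + gamma * r_1 + ... + gamma^(n-1) * r_(n-1); the buffer would
    # pair this with a gamma**n bootstrap from the n-th next observation
    ret, gamma_k = 0.0, 1.0
    for r in rewards:
        ret += gamma_k * r
        gamma_k *= discount
    return ret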