Example 1
def eval_alg(policy, env, max_path_length, num_eval_rollouts, env_seed, eval_deterministic=False):
    if eval_deterministic:
        policy = MakeDeterministic(policy)
    
    env.seed(env_seed)

    eval_sampler = InPlacePathSampler(
        env=env,
        policy=policy,
        max_samples=max_path_length * (num_eval_rollouts + 1),
        max_path_length=max_path_length, policy_uses_pixels=False,
        policy_uses_task_params=False,
        concat_task_params_to_policy_obs=False
    )
    test_paths = eval_sampler.obtain_samples()
    path_trajs = [np.array([d['xy_pos'] for d in path["env_infos"]]) for path in test_paths]
    return {'path_trajs': path_trajs}
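
A possible way to consume the returned 'path_trajs' is to plot the xy trajectories. The sketch below is hypothetical and uses synthetic random-walk arrays as stand-ins for the (T, 2) 'xy_pos' arrays that eval_alg returns; only the plotting pattern is illustrative.

import numpy as np
import matplotlib.pyplot as plt

# synthetic stand-ins for eval_alg(...)['path_trajs']
path_trajs = [np.cumsum(np.random.randn(100, 2), axis=0) for _ in range(5)]

for traj in path_trajs:  # each entry is a (T, 2) array of xy positions
    plt.plot(traj[:, 0], traj[:, 1])
plt.xlabel('x')
plt.ylabel('y')
plt.title('evaluation rollouts (synthetic)')
plt.show()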
Example 2
    def _try_to_eval(self, epoch):
        logger.save_extra_data(self.get_extra_data_to_save(epoch))
        if self._can_evaluate():
            if self.environment_farming:
                # Create a new eval_sampler each evaluation in order to avoid the released-environment problem
                env_for_eval_sampler = self.farmer.force_acq_env()
                print(env_for_eval_sampler)
                self.eval_sampler = InPlacePathSampler(
                    env=env_for_eval_sampler,
                    policy=self.eval_policy,
                    max_samples=self.num_steps_per_eval + self.max_path_length,
                    max_path_length=self.max_path_length,
                )

            self.evaluate(epoch)

            if self.environment_farming:
                # Return the evaluation env to the farmer's free list
                self.farmer.add_free_env(env_for_eval_sampler)

            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            table_keys = logger.get_table_key_set()
            if self._old_table_keys is not None:
                assert table_keys == self._old_table_keys, (
                    "Table keys cannot change from iteration to iteration.")
            self._old_table_keys = table_keys

            logger.record_tabular(
                "Number of train steps total",
                self._n_train_steps_total,
            )
            logger.record_tabular(
                "Number of env steps total",
                self._n_env_steps_total,
            )
            logger.record_tabular(
                "Number of rollouts total",
                self._n_rollouts_total,
            )

            times_itrs = gt.get_times().stamps.itrs
            train_time = times_itrs['train'][-1]
            sample_time = times_itrs['sample'][-1]
            eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
            epoch_time = train_time + sample_time + eval_time
            total_time = gt.get_times().total

            logger.record_tabular('Train Time (s)', train_time)
            logger.record_tabular('(Previous) Eval Time (s)', eval_time)
            logger.record_tabular('Sample Time (s)', sample_time)
            logger.record_tabular('Epoch Time (s)', epoch_time)
            logger.record_tabular('Total Train Time (s)', total_time)

            logger.record_tabular("Epoch", epoch)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
        else:
            logger.log("Skipping eval for now.")
Example 3
def eval_alg(policy,
             env,
             num_eval_rollouts,
             eval_deterministic=False,
             max_path_length=1000):
    if eval_deterministic:
        policy = MakeDeterministic(policy)

    eval_sampler = InPlacePathSampler(env=env,
                                      policy=policy,
                                      max_samples=max_path_length *
                                      (num_eval_rollouts + 1),
                                      max_path_length=max_path_length,
                                      policy_uses_pixels=False,
                                      policy_uses_task_params=False,
                                      concat_task_params_to_policy_obs=False)
    test_paths = eval_sampler.obtain_samples()
    average_returns = get_average_returns(test_paths)
    return average_returns
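
Example 3 delegates the scoring to get_average_returns. A minimal sketch of what that helper is assumed to compute, the mean of the per-path undiscounted return (sum of rewards along each path), is shown below with a synthetic check; the real helper may differ in detail.

import numpy as np

def get_average_returns_sketch(paths):
    # assumed behaviour: sum the rewards along each path, then average over paths
    returns = [np.sum(path['rewards']) for path in paths]
    return np.mean(returns)

fake_paths = [{'rewards': np.array([1.0, 1.0])},
              {'rewards': np.array([0.5])}]
print(get_average_returns_sketch(fake_paths))  # 1.25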
Example 4
    def __init__(
            self,
            env,
            qf1,
            qf2,
            policy,
            replay_buffer1,
            replay_buffer2,
            num_epochs=1000,
            num_steps_per_epoch=1000,
            policy_learning_rate=1e-4,
            batch_size=128,
            num_steps_per_eval=3000,
            max_path_length=300,
            discount=0.99,
    ):
        super().__init__()
        self.env = env
        self.qf1 = qf1
        self.qf2 = qf2
        self.policy = policy
        self.replay_buffer1 = replay_buffer1
        self.replay_buffer2 = replay_buffer2
        self.num_steps_per_epoch = num_steps_per_epoch
        self.num_epochs = num_epochs
        self.policy_learning_rate = policy_learning_rate
        self.batch_size = batch_size
        self.discount = discount

        self.eval_sampler = InPlacePathSampler(
            env=env,
            policy=self.policy,
            max_samples=num_steps_per_eval,
            max_path_length=max_path_length,
        )

        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=self.policy_learning_rate)
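
The constructor above ends by wrapping the policy's parameters in an Adam optimizer with the configured learning rate. A standalone sketch of that pattern, with a toy network standing in for the real policy, is:

import torch
import torch.nn as nn
import torch.optim as optim

policy = nn.Sequential(nn.Linear(4, 64), nn.ReLU(), nn.Linear(64, 2))  # toy stand-in
policy_optimizer = optim.Adam(policy.parameters(), lr=1e-4)

# one illustrative gradient step on a dummy objective
loss = policy(torch.randn(8, 4)).pow(2).mean()
policy_optimizer.zero_grad()
loss.backward()
policy_optimizer.step()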
Example 5
class MetaRLAlgorithm(metaclass=abc.ABCMeta):
    def __init__(
            self,
            env,
            agent,
            train_tasks,
            eval_tasks,
            meta_batch=64,
            num_iterations=100,
            num_train_steps_per_itr=1000,
            num_initial_steps=100,
            num_tasks_sample=100,
            num_steps_prior=100,
            num_steps_posterior=100,
            num_extra_rl_steps_posterior=100,
            num_evals=10,
            num_steps_per_eval=1000,
            batch_size=1024,
            embedding_batch_size=1024,
            embedding_mini_batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            num_exp_traj_eval=1,
            update_post_train=1,
            eval_deterministic=True,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            render_eval_paths=False,
            dump_eval_paths=False,
            plotter=None,
    ):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.exploration_agent = agent # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter
        # self.alpha = alpha

        self.sampler = InPlacePathSampler(
            env=env,
            policy=agent,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        self.replay_buffer = MultiTaskReplayBuffer(
                self.replay_buffer_size,
                env,
                self.train_tasks,
            )

        self.enc_replay_buffer = MultiTaskReplayBuffer(
                self.replay_buffer_size,
                env,
                self.train_tasks,
        )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []

    def make_exploration_policy(self, policy):
        return policy

    def make_eval_policy(self, policy):
        return policy

    def sample_task(self, is_eval=False):
        '''
        sample task randomly
        '''
        if is_eval:
            idx = np.random.randint(len(self.eval_tasks))
        else:
            idx = np.random.randint(len(self.train_tasks))
        return idx

    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            print("\nIteration:{}".format(it_+1))
            if it_ == 0:  # first step of the algorithm: initialize every task's buffer
                print('\nCollecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:  # before training starts, collect 2000 transitions for each task
                    self.task_idx = idx  # switch to the current task idx
                    self.env.reset_task(idx)  # reset the task
                    self.collect_data(self.num_initial_steps, 1, np.inf)  # collect num_initial_steps transitions of context c and update self.z via q(z|c)
                    # print("task id:", self.task_idx, " env:", self.replay_buffer.env)
                    # print("buffer ", self.task_idx, ":", self.replay_buffer.task_buffers[self.task_idx].__dict__.items())
            # Sample data from train tasks.
            print("\nFinishing collecting initial pool of data")
            print("\nSampling data from train tasks for Meta-training")
            for i in range(self.num_tasks_sample):  # randomly pick tasks from train_tasks and collect num_steps_prior + num_extra_rl_steps_posterior transitions into each task's buffer
                print("\nSample data , round {}".format(i+1))  # collect num_steps_prior transitions into each task's enc_buffer
                idx = np.random.randint(len(self.train_tasks))  # pick a random task from train_tasks
                self.task_idx = idx
                self.env.reset_task(idx)  # reset the task
                self.enc_replay_buffer.task_buffers[idx].clear()  # clear the corresponding enc_buffer

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    print("\ncollect some trajectories with z ~ prior")
                    self.collect_data(self.num_steps_prior, 1, np.inf)  # collect num_steps_prior transitions with z sampled from the prior
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    print("\ncollect some trajectories with z ~ posterior")
                    self.collect_data(self.num_steps_posterior, 1, self.update_post_train)  # collect trajectories with z sampled from the posterior
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    print("\ncollect some trajectories for policy update only")
                    self.collect_data(self.num_extra_rl_steps_posterior, 1, self.update_post_train, add_to_enc_buffer=False)  # collect num_extra_rl_steps_posterior transitions with posterior z, used only for the policy update
            print("\nFinishing sample data from train tasks")
            # Sample train tasks and compute gradient updates on parameters.
            print("\nStrating Meta-training , Episode {}".format(it_))
            for train_step in range(self.num_train_steps_per_itr):#每轮迭代计算num_train_steps_per_itr次梯度              500x2000=1000000
                indices = np.random.choice(self.train_tasks, self.meta_batch)#train_tasks中随机取meta_batch个task , sample RL batch b~B
                if ((train_step + 1) % 500 == 0):
                    print("\nTraining step {}".format(train_step + 1))
                    print("Indices: {}".format(indices))
                    print("alpha:{}".format(self.alpha))
                self._do_training(indices)#梯度下降
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()

    def pretrain(self):
        """
        Do anything before the main training phase.
        """
        pass

    def collect_data(self, num_samples, resample_z_rate, update_posterior_rate, add_to_enc_buffer=True):  # sample num_samples transitions from the current env with the current self.agent policy
        '''
        get trajectories from current env in batch mode with given policy
        collect complete trajectories until the number of collected transitions >= num_samples

        :param agent: policy to rollout
        :param num_samples: total number of transitions to sample
        :param resample_z_rate: how often to resample latent context z (in units of trajectories), i.e. after how many trajectories z is re-sampled with a forward pass through q(z|c)
        :param update_posterior_rate: how often to update q(z | c), from which z is sampled (in units of trajectories)
        :param add_to_enc_buffer: whether to add collected data to encoder replay buffer
        '''
        # start from the prior
        self.agent.clear_z()

        num_transitions = 0
        while num_transitions < num_samples:  # obtain_samples returns the collected paths and the total number of steps
            paths, n_samples = self.sampler.obtain_samples(max_samples=num_samples - num_transitions,  # max total number of steps
                                                                max_trajs=update_posterior_rate,  # max number of trajectories
                                                                accum_context=False,
                                                                resample=resample_z_rate)  # resample_z_rate: how often z is re-sampled from c
            num_transitions += n_samples  # accumulate the number of sampled steps
            self.replay_buffer.add_paths(self.task_idx, paths)  # add the trajectories collected for this task to the replay buffer
            print("\n    buffer",self.task_idx, "size:", self.replay_buffer.task_buffers[self.task_idx].size())
            # time.sleep(1)
            # print("task id:", self.task_idx)
            # print("buffer ", self.task_idx, ":", self.replay_buffer.task_buffers[self.task_idx])
            # print("buffer ", self.task_idx, ":", self.replay_buffer.task_buffers[self.task_idx].__dict__.items())
            # print("buffer ", self.task_idx, ":", self.replay_buffer.task_buffers[self.task_idx])
            if add_to_enc_buffer:#是否加入encoder的buffer
                self.enc_replay_buffer.add_paths(self.task_idx, paths)
                # print("enc_buffer ", self.task_idx, ":", self.enc_replay_buffer.task_buffers[self.task_idx].__dict__.items())
                # print("enc_buffer ", self.task_idx, ":",self.enc_replay_buffer.task_buffers[self.task_idx])
                print("enc_buffer",self.task_idx, "size:", self.enc_replay_buffer.task_buffers[self.task_idx].size())
                # time.sleep(1)
            if update_posterior_rate != np.inf:  # update the posterior over z using the collected context
                context = self.prepare_context(self.task_idx)
                self.agent.infer_posterior(context)
        self._n_env_steps_total += num_transitions
        gt.stamp('sample')
        # print("buffer ", self.task_idx, ":", self.replay_buffer.task_buffers[self.task_idx])
        # print("enc_buffer ", self.task_idx, ":", self.enc_replay_buffer.task_buffers[self.task_idx])

    def _try_to_eval(self, epoch):
        logger.save_extra_data(self.get_extra_data_to_save(epoch))
        if self._can_evaluate():
            self.evaluate(epoch)

            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            table_keys = logger.get_table_key_set()
            if self._old_table_keys is not None:
                assert table_keys == self._old_table_keys, (
                    "Table keys cannot change from iteration to iteration."
                )
            self._old_table_keys = table_keys

            logger.record_tabular(
                "Number of train steps total",
                self._n_train_steps_total,
            )
            logger.record_tabular(
                "Number of env steps total",
                self._n_env_steps_total,
            )
            logger.record_tabular(
                "Number of rollouts total",
                self._n_rollouts_total,
            )

            times_itrs = gt.get_times().stamps.itrs
            train_time = times_itrs['train'][-1]
            sample_time = times_itrs['sample'][-1]
            eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
            epoch_time = train_time + sample_time + eval_time
            total_time = gt.get_times().total

            logger.record_tabular('Train Time (s)', train_time)
            logger.record_tabular('(Previous) Eval Time (s)', eval_time)
            logger.record_tabular('Sample Time (s)', sample_time)
            logger.record_tabular('Epoch Time (s)', epoch_time)
            logger.record_tabular('Total Train Time (s)', total_time)

            logger.record_tabular("Epoch", epoch)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
        else:
            logger.log("Skipping eval for now.")

    def _can_evaluate(self):
        """
        One annoying thing about the logger table is that the keys at each
        iteration need to be the exact same. So unless you can compute
        everything, skip evaluation.

        A common example for why you might want to skip evaluation is that at
        the beginning of training, you may not have enough data for a
        validation and training set.

        :return:
        """
        # eval collects its own context, so can eval any time
        return True

    def _can_train(self):
        return all([self.replay_buffer.num_steps_can_sample(idx) >= self.batch_size for idx in self.train_tasks])

    def _get_action_and_info(self, agent, observation):
        """
        Get an action to take in the environment.
        :param observation:
        :return:
        """
        agent.set_num_steps_total(self._n_env_steps_total)
        return agent.get_action(observation,)

    def _start_epoch(self, epoch):
        self._epoch_start_time = time.time()
        self._exploration_paths = []
        self._do_train_time = 0
        logger.push_prefix('Iteration #%d | ' % epoch)

    def _end_epoch(self):
        logger.log("Epoch Duration: {0}".format(
            time.time() - self._epoch_start_time
        ))
        logger.log("Started Training: {0}".format(self._can_train()))
        logger.pop_prefix()

    ##### Snapshotting utils #####
    def get_epoch_snapshot(self, epoch):
        data_to_save = dict(
            epoch=epoch,
            exploration_policy=self.exploration_policy,
        )
        if self.save_environment:
            data_to_save['env'] = self.training_env
        return data_to_save

    def get_extra_data_to_save(self, epoch):
        """
        Save things that shouldn't be saved every snapshot but rather
        overwritten every time.
        :param epoch:
        :return:
        """
        if self.render:
            self.training_env.render(close=True)
        data_to_save = dict(
            epoch=epoch,
        )
        if self.save_environment:
            data_to_save['env'] = self.training_env
        if self.save_replay_buffer:
            data_to_save['replay_buffer'] = self.replay_buffer
        if self.save_algorithm:
            data_to_save['algorithm'] = self
        return data_to_save

    def collect_paths(self, idx, epoch, run):
        self.task_idx = idx
        self.env.reset_task(idx)

        self.agent.clear_z()
        paths = []
        num_transitions = 0
        num_trajs = 0
        while num_transitions < self.num_steps_per_eval:
            path, num = self.sampler.obtain_samples(deterministic=self.eval_deterministic, max_samples=self.num_steps_per_eval - num_transitions, max_trajs=1, accum_context=True)
            paths += path
            num_transitions += num
            num_trajs += 1
            if num_trajs >= self.num_exp_traj_eval:
                self.agent.infer_posterior(self.agent.context)  # update self.z using the collected context

        if self.sparse_rewards:
            for p in paths:
                sparse_rewards = np.stack([e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards

        goal = self.env._goal
        for path in paths:
            path['goal'] = goal # goal

        # save the paths for visualization, only useful for point mass
        if self.dump_eval_paths:
            logger.save_extra_data(paths, path='eval_trajectories/task{}-epoch{}-run{}'.format(idx, epoch, run))

        return paths

    def _do_eval(self, indices, epoch):
        final_returns = []
        online_returns = []
        for idx in indices:
            all_rets = []
            for r in range(self.num_evals):
                paths = self.collect_paths(idx, epoch, r)
                all_rets.append([eval_util.get_average_returns([p]) for p in paths])
            final_returns.append(np.mean([a[-1] for a in all_rets]))
            # record online returns for the first n trajectories
            n = min([len(a) for a in all_rets])
            all_rets = [a[:n] for a in all_rets]
            all_rets = np.mean(np.stack(all_rets), axis=0) # avg return per nth rollout
            online_returns.append(all_rets)
        n = min([len(t) for t in online_returns])
        online_returns = [t[:n] for t in online_returns]
        return final_returns, online_returns

    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(deterministic=self.eval_deterministic, max_samples=self.max_path_length * 20,
                                                        accum_context=False,
                                                        resample=1)
            logger.save_extra_data(prior_paths, path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed
        indices = np.random.choice(self.train_tasks, len(self.eval_tasks))  # indices is a set of task ids: sample len(eval_tasks) tasks from train_tasks
        eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))
        print('\nevaluating on {} train tasks'.format(len(indices)))
        ### eval train tasks with posterior sampled from the training replay buffer
        train_returns = []
        for idx in indices:  # for every task to be evaluated
            self.task_idx = idx
            self.env.reset_task(idx)
            paths = []
            for _ in range(self.num_steps_per_eval // self.max_path_length):  # how many paths are used for evaluation (e.g. 600 / 200)
                context = self.prepare_context(idx)  # c ~ Sc(B)
                # print("context:", context)
                self.agent.infer_posterior(context)  # z ~ q(z|c)
                p, _ = self.sampler.obtain_samples(deterministic=self.eval_deterministic, max_samples=self.max_path_length,
                                                        accum_context=False,
                                                        max_trajs=1,
                                                        resample=np.inf)
                paths += p  # collect the evaluation paths (200 in this configuration)

            if self.sparse_rewards:
                for p in paths:
                    sparse_rewards = np.stack([e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            train_returns.append(eval_util.get_average_returns(paths))  # average return over this task's evaluation paths
        # print(" train_returns:{}".format(train_returns))
        train_returns = np.mean(train_returns)  # average the per-task returns
        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_online_returns = self._do_eval(indices, epoch)
        print("train_final_returns:{}".format(train_final_returns))
        # print("train_online_returns:{}".format(train_online_returns))
        eval_util.dprint('train online returns')
        eval_util.dprint(train_online_returns)

        ### test tasks
        eval_util.dprint('evaluating on {} test tasks'.format(len(self.eval_tasks)))
        print('\nevaluating on {} test tasks'.format(len(self.eval_tasks)))
        test_final_returns, test_online_returns = self._do_eval(self.eval_tasks, epoch)
        print("test_final_returns:{}".format(test_final_returns))
        # print("test_online_returns:{}".format(test_online_returns))
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)

        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        if hasattr(self.env, "log_diagnostics"):
            #self.env.log_diagnostics(paths, prefix=None)
            self.env.log_diagnostics(paths)

        avg_train_return = np.mean(train_final_returns)
        avg_test_return = np.mean(test_final_returns)
        # print("\ntrain_returns:{}".format(train_returns))
        print("\navg_train_return:{}".format(avg_train_return))
        print("avg_test_return:{}".format(avg_test_return))
        time.sleep(5)
        avg_train_online_return = np.mean(np.stack(train_online_returns), axis=0)
        avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
        self.eval_statistics['AverageTrainReturn_all_train_tasks'] = train_returns
        self.eval_statistics['AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
        logger.save_extra_data(avg_train_online_return, path='online-train-epoch{}'.format(epoch))
        logger.save_extra_data(avg_test_online_return, path='online-test-epoch{}'.format(epoch))

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(paths)

        if self.plotter:
            self.plotter.draw()

    @abc.abstractmethod
    def training_mode(self, mode):
        """
        Set training mode to `mode`.
        :param mode: If True, training will happen (e.g. set the dropout
        probabilities to not all ones).
        """
        pass

    @abc.abstractmethod
    def _do_training(self):
        """
        Perform some update, e.g. perform one gradient step.
        :return:
        """
        pass
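
The return bookkeeping in _do_eval above (averaging the last-rollout returns, then truncating every run to a common number of rollouts before averaging per rollout) can be checked in isolation. The numbers below are synthetic; only the slicing and stacking logic mirrors the method.

import numpy as np

all_rets = [
    [1.0, 2.0, 3.0],  # per-rollout returns from eval run 0
    [1.5, 2.5],       # run 1 finished with fewer rollouts
]
final_return = np.mean([a[-1] for a in all_rets])  # mean of last-rollout returns -> 2.75
n = min(len(a) for a in all_rets)                  # truncate to the common length
online_returns = np.mean(np.stack([a[:n] for a in all_rets]), axis=0)
print(final_return, online_returns)                # 2.75 [1.25 2.25]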
Example 6
    def __init__(
            self,
            env,
            exploration_policy: ExplorationPolicy,
            expert_replay_buffer,
            training_env=None,
            num_epochs=100,
            num_steps_per_epoch=10000,
            num_steps_per_eval=1000,
            num_steps_between_updates=1000,
            min_steps_before_training=1000,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=10000,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            save_best=False,
            save_best_starting_from_epoch=0,
            eval_sampler=None,
            eval_policy=None,
            replay_buffer=None,
            policy_uses_pixels=False,
            wrap_absorbing=False,
            freq_saving=1,
            # some environments, like halfcheetah_v2, have a time limit that defines the terminal;
            # this is used as a minor hack to turn off time limits
            no_terminal=False,
            policy_uses_task_params=False,
            concat_task_params_to_policy_obs=False
        ):
        """
        Base class for RL Algorithms
        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param replay_buffer:
        """
        self.training_env = training_env or pickle.loads(pickle.dumps(env))
        # self.training_env = training_env or deepcopy(env)
        self.exploration_policy = exploration_policy
        self.expert_replay_buffer = expert_replay_buffer
        self.num_epochs = num_epochs
        self.num_env_steps_per_epoch = num_steps_per_epoch
        self.num_steps_per_eval = num_steps_per_eval
        self.num_steps_between_updates = num_steps_between_updates
        self.min_steps_before_training = min_steps_before_training
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        self.save_best = save_best
        self.save_best_starting_from_epoch = save_best_starting_from_epoch
        self.policy_uses_pixels = policy_uses_pixels
        self.policy_uses_task_params = policy_uses_task_params
        self.concat_task_params_to_policy_obs = concat_task_params_to_policy_obs
        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length, policy_uses_pixels=policy_uses_pixels,
                policy_uses_task_params=policy_uses_task_params,
                concat_task_params_to_policy_obs=concat_task_params_to_policy_obs
            )
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler

        self.action_space = env.action_space
        self.obs_space = env.observation_space
        self.env = env
        if replay_buffer is None:
            replay_buffer = EnvReplayBuffer(
                self.replay_buffer_size,
                self.env,
                policy_uses_pixels=self.policy_uses_pixels,
                policy_uses_task_params=self.policy_uses_task_params,
                concat_task_params_to_policy_obs=self.concat_task_params_to_policy_obs
            )
        self.replay_buffer = replay_buffer

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
        self.wrap_absorbing = wrap_absorbing
        self.freq_saving = freq_saving
        self.no_terminal = no_terminal
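
Example 6 clones the evaluation env for training with a pickle round-trip, pickle.loads(pickle.dumps(env)). A minimal illustration of that deep-copy trick on a picklable toy object:

import pickle

class Counter:
    def __init__(self):
        self.steps = 0

original = Counter()
clone = pickle.loads(pickle.dumps(original))  # independent copy, like training_env above
clone.steps = 10
print(original.steps, clone.steps)  # 0 10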
Example 7
    def __init__(
        self,
        env,
        exploration_policy: ExplorationPolicy,
        training_env=None,
        num_epochs=100,
        num_steps_per_epoch=10000,
        num_steps_per_eval=1000,
        num_updates_per_env_step=1,
        batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        reward_scale=1,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=False,
        eval_sampler=None,
        eval_policy=None,
        replay_buffer=None,
        demo_path=None,
        action_skip=1,
        experiment_name="default",
        mix_demo=False,
    ):
        """
        Base class for RL Algorithms
        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param reward_scale:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param replay_buffer:
        """

        ### TODO: look at NormalizedBoxEnv, do we need it? ###

        # self.training_env = training_env or gym.make("HalfCheetah-v2")
        self.training_env = training_env or MujocoManipEnv(
            env.env.__class__.__name__)
        self.exploration_policy = exploration_policy
        self.num_epochs = num_epochs
        self.num_env_steps_per_epoch = num_steps_per_epoch
        self.num_steps_per_eval = num_steps_per_eval
        self.num_updates_per_train_call = num_updates_per_env_step
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
            )
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler

        self.action_space = env.action_space
        self.obs_space = env.observation_space
        self.env = env
        if replay_buffer is None:
            replay_buffer = EnvReplayBuffer(
                self.replay_buffer_size,
                self.env,
            )
        self.replay_buffer = replay_buffer

        self.demo_sampler = None
        self.mix_demo = mix_demo
        if demo_path is not None:
            self.demo_sampler = DemoSampler(
                demo_path=demo_path,
                observation_dim=self.obs_space.shape[0],
                action_dim=self.action_space.shape[0],
                preload=True)
        self.action_skip = action_skip
        self.action_skip_count = 0

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []

        t_now = time.time()
        time_str = datetime.datetime.fromtimestamp(t_now).strftime(
            '%Y%m%d%H%M%S')
        os.makedirs(os.path.join(LOCAL_EXP_PATH, experiment_name, time_str))
        self._writer = SummaryWriter(
            os.path.join(LOCAL_EXP_PATH, experiment_name, time_str))
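
Example 7 logs to a timestamped TensorBoard directory through SummaryWriter. The self-contained sketch below assumes torch.utils.tensorboard (the original may import SummaryWriter from tensorboardX instead); LOCAL_EXP_PATH and experiment_name are placeholders.

import os
import datetime
from torch.utils.tensorboard import SummaryWriter

LOCAL_EXP_PATH = './experiments'  # placeholder for the real constant
experiment_name = 'default'

time_str = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
log_dir = os.path.join(LOCAL_EXP_PATH, experiment_name, time_str)
os.makedirs(log_dir, exist_ok=True)

writer = SummaryWriter(log_dir)
writer.add_scalar('eval/average_return', 0.0, global_step=0)
writer.close()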
Example 8
    def __init__(
        self,
        env,
        exploration_policy: ExplorationPolicy,
        training_env=None,
        num_epochs=100,
        num_steps_per_epoch=10000,
        num_steps_per_eval=1000,
        num_updates_per_env_step=1,
        num_updates_per_epoch=None,
        batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        reward_scale=1,
        min_num_steps_before_training=None,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=True,
        eval_sampler=None,
        eval_policy=None,
        replay_buffer=None,
        collection_mode='online',
    ):
        """
        Base class for RL Algorithms

        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made for training, so that training and
        evaluation are completely independent.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param reward_scale:
        :param min_num_steps_before_training:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param replay_buffer:
        :param collection_mode: String determining how training happens
         - 'online': Train after every step taken in the environment.
         - 'batch': Train after every epoch.
        """
        assert collection_mode in ['online', 'batch']
        if collection_mode == 'batch':
            assert num_updates_per_epoch is not None

        self.training_env = training_env  #or pickle.loads(pickle.dumps(env))
        self.exploration_policy = exploration_policy
        self.num_epochs = num_epochs
        self.num_env_steps_per_epoch = num_steps_per_epoch
        self.num_steps_per_eval = num_steps_per_eval
        if collection_mode == 'online':
            self.num_updates_per_train_call = num_updates_per_env_step
        else:
            self.num_updates_per_train_call = num_updates_per_epoch
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.render = render
        self.collection_mode = collection_mode
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        if min_num_steps_before_training is None:
            min_num_steps_before_training = self.num_env_steps_per_epoch
        self.min_num_steps_before_training = min_num_steps_before_training
        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
            )
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler
        self.eval_statistics = OrderedDict()
        self.need_to_update_eval_statistics = True

        self.action_space = env.action_space
        self.obs_space = env.observation_space
        self.env = env
        if replay_buffer is None:
            replay_buffer = EnvReplayBuffer(
                self.replay_buffer_size,
                self.env,
            )
        self.replay_buffer = replay_buffer

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
        self.post_epoch_funcs = []
class MetaRLAlgorithm(metaclass=abc.ABCMeta):
    def __init__(
        self,
        env,
        agent,
        train_tasks,
        eval_tasks,
        meta_batch=64,
        num_iterations=100,
        num_train_steps_per_itr=1000,
        num_initial_steps=100,
        num_tasks_sample=100,
        num_steps_prior=100,
        num_steps_posterior=100,
        num_extra_rl_steps_posterior=100,
        num_evals=10,
        num_steps_per_eval=1000,
        batch_size=1024,
        embedding_batch_size=1024,
        embedding_mini_batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        reward_scale=1,
        num_exp_traj_eval=1,
        update_post_train=1,
        eval_deterministic=True,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=False,
        render_eval_paths=False,
        dump_eval_paths=False,
        plotter=None,
        dyna=False,
        dyna_num_train_itr=50,
        dyna_num_train_steps_per_itr=50,
        dyna_tandem_train=True,
        dyna_n_layers=3,
        dyna_hidden_layer_size=64,
        dyna_learning_rate=1e-3,
    ):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.exploration_agent = agent  # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter

        self.dyna = dyna
        self.dyna_num_train_itr = dyna_num_train_itr
        self.dyna_num_train_steps_per_itr = dyna_num_train_steps_per_itr
        self.dyna_tandem_train = dyna_tandem_train
        self.dyna_n_layers = dyna_n_layers
        self.dyna_hidden_layer_size = dyna_hidden_layer_size
        self.dyna_learning_rate = dyna_learning_rate

        if dyna:
            self.sampler = DynamicsSampler(
                env=env,
                policy=agent,
                max_path_length=self.max_path_length,
                num_train_itr=dyna_num_train_itr,
                num_train_steps_per_itr=dyna_num_train_steps_per_itr,
                tandem_train=dyna_tandem_train,
                n_layers=dyna_n_layers,
                hidden_layer_size=dyna_hidden_layer_size,
                learning_rate=dyna_learning_rate,
            )
        else:
            self.sampler = InPlacePathSampler(
                env=env,
                policy=agent,
                max_path_length=self.max_path_length,
            )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        self.replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self.enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []

    def make_exploration_policy(self, policy):
        return policy

    def make_eval_policy(self, policy):
        return policy

    def sample_task(self, is_eval=False):
        '''
        sample task randomly
        '''
        if is_eval:
            idx = np.random.randint(len(self.eval_tasks))
        else:
            idx = np.random.randint(len(self.train_tasks))
        return idx

    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    self.collect_data(self.num_initial_steps, 1, np.inf)
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
                self.enc_replay_buffer.task_buffers[idx].clear()

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()

    def pretrain(self):
        """
        Do anything before the main training phase.
        """
        pass

    def collect_data(self,
                     num_samples,
                     resample_z_rate,
                     update_posterior_rate,
                     add_to_enc_buffer=True):
        '''
        get trajectories from current env in batch mode with given policy
        collect complete trajectories until the number of collected transitions >= num_samples

        :param agent: policy to rollout
        :param num_samples: total number of transitions to sample
        :param resample_z_rate: how often to resample latent context z (in units of trajectories)
        :param update_posterior_rate: how often to update q(z | c) from which z is sampled (in units of trajectories)
        :param add_to_enc_buffer: whether to add collected data to encoder replay buffer
        '''
        # start from the prior
        self.agent.clear_z()

        num_transitions = 0
        while num_transitions < num_samples:
            paths, n_samples = self.sampler.obtain_samples(
                max_samples=num_samples - num_transitions,
                max_trajs=update_posterior_rate,
                accum_context=False,
                resample=resample_z_rate,
                testing=False)
            num_transitions += n_samples
            self.replay_buffer.add_paths(self.task_idx, paths)
            if add_to_enc_buffer:
                self.enc_replay_buffer.add_paths(self.task_idx, paths)
            if update_posterior_rate != np.inf:
                context = self.sample_context(self.task_idx)
                self.agent.infer_posterior(context)
        self._n_env_steps_total += num_transitions
        gt.stamp('sample')

    def _try_to_eval(self, epoch):
        logger.save_extra_data(self.get_extra_data_to_save(epoch))
        if self._can_evaluate():
            self.evaluate(epoch)

            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            table_keys = logger.get_table_key_set()
            if self._old_table_keys is not None:
                assert table_keys == self._old_table_keys, (
                    "Table keys cannot change from iteration to iteration.")
            self._old_table_keys = table_keys

            logger.record_tabular(
                "Number of train steps total",
                self._n_train_steps_total,
            )
            logger.record_tabular(
                "Number of env steps total",
                self._n_env_steps_total,
            )
            logger.record_tabular(
                "Number of rollouts total",
                self._n_rollouts_total,
            )

            times_itrs = gt.get_times().stamps.itrs
            train_time = times_itrs['train'][-1]
            sample_time = times_itrs['sample'][-1]
            eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
            epoch_time = train_time + sample_time + eval_time
            total_time = gt.get_times().total

            logger.record_tabular('Train Time (s)', train_time)
            logger.record_tabular('(Previous) Eval Time (s)', eval_time)
            logger.record_tabular('Sample Time (s)', sample_time)
            logger.record_tabular('Epoch Time (s)', epoch_time)
            logger.record_tabular('Total Train Time (s)', total_time)

            logger.record_tabular("Epoch", epoch)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
        else:
            logger.log("Skipping eval for now.")

    def _can_evaluate(self):
        """
        One annoying thing about the logger table is that the keys at each
        iteration need to be the exact same. So unless you can compute
        everything, skip evaluation.

        A common example for why you might want to skip evaluation is that at
        the beginning of training, you may not have enough data for a
        validation and training set.

        :return:
        """
        # eval collects its own context, so can eval any time
        return True

    def _can_train(self):
        return all([
            self.replay_buffer.num_steps_can_sample(idx) >= self.batch_size
            for idx in self.train_tasks
        ])

    def _get_action_and_info(self, agent, observation):
        """
        Get an action to take in the environment.
        :param observation:
        :return:
        """
        agent.set_num_steps_total(self._n_env_steps_total)
        return agent.get_action(observation, )

    def _start_epoch(self, epoch):
        self._epoch_start_time = time.time()
        self._exploration_paths = []
        self._do_train_time = 0
        logger.push_prefix('Iteration #%d | ' % epoch)

    def _end_epoch(self):
        logger.log("Epoch Duration: {0}".format(time.time() -
                                                self._epoch_start_time))
        logger.log("Started Training: {0}".format(self._can_train()))
        logger.pop_prefix()

    ##### Snapshotting utils #####
    def get_epoch_snapshot(self, epoch):
        data_to_save = dict(
            epoch=epoch,
            exploration_policy=self.exploration_policy,
        )
        if self.save_environment:
            data_to_save['env'] = self.training_env
        return data_to_save

    def get_extra_data_to_save(self, epoch):
        """
        Save things that shouldn't be saved every snapshot but rather
        overwritten every time.
        :param epoch:
        :return:
        """
        if self.render:
            self.training_env.render(close=True)
        data_to_save = dict(epoch=epoch, )
        if self.save_environment:
            data_to_save['env'] = self.training_env
        if self.save_replay_buffer:
            data_to_save['replay_buffer'] = self.replay_buffer
        if self.save_algorithm:
            data_to_save['algorithm'] = self
        return data_to_save

    def collect_paths(self, idx, epoch, run):
        self.task_idx = idx
        self.env.reset_task(idx)

        self.agent.clear_z()
        paths = []
        num_transitions = 0
        num_trajs = 0
        while num_transitions < self.num_steps_per_eval:
            path, num = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.num_steps_per_eval - num_transitions,
                max_trajs=1,
                accum_context=True,
                testing=True)
            paths += path
            num_transitions += num
            num_trajs += 1
            if num_trajs >= self.num_exp_traj_eval:
                self.agent.infer_posterior(self.agent.context)

        if self.sparse_rewards:
            for p in paths:
                sparse_rewards = np.stack(
                    [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards

        goal = self.env._goal
        for path in paths:
            path['goal'] = goal  # goal

        # save the paths for visualization, only useful for point mass
        if self.dump_eval_paths:
            logger.save_extra_data(
                paths,
                path='eval_trajectories/task{}-epoch{}-run{}'.format(
                    idx, epoch, run))

        return paths

    def _do_eval(self, indices, epoch):
        final_returns = []
        online_returns = []
        for idx in indices:
            all_rets = []
            for r in range(self.num_evals):
                paths = self.collect_paths(idx, epoch, r)
                all_rets.append(
                    [eval_util.get_average_returns([p]) for p in paths])
            final_returns.append(np.mean([a[-1] for a in all_rets]))
            # record online returns for the first n trajectories
            n = min([len(a) for a in all_rets])
            all_rets = [a[:n] for a in all_rets]
            all_rets = np.mean(np.stack(all_rets),
                               axis=0)  # avg return per nth rollout
            online_returns.append(all_rets)
        n = min([len(t) for t in online_returns])
        online_returns = [t[:n] for t in online_returns]
        return final_returns, online_returns

    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample=1,
                testing=True)
            logger.save_extra_data(
                prior_paths,
                path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed
        indices = np.random.choice(self.train_tasks, len(self.eval_tasks))
        eval_util.dprint('evaluating on {} train tasks'.format(len(indices)))
        ### eval train tasks with posterior sampled from the training replay buffer
        train_returns = []
        for idx in indices:
            self.task_idx = idx
            self.env.reset_task(idx)
            paths = []
            for _ in range(self.num_steps_per_eval // self.max_path_length):
                context = self.sample_context(idx)
                self.agent.infer_posterior(context)
                p, _ = self.sampler.obtain_samples(
                    deterministic=self.eval_deterministic,
                    max_samples=self.max_path_length,
                    accum_context=False,
                    max_trajs=1,
                    resample=np.inf,
                    testing=True)
                paths += p

            if self.sparse_rewards:
                for p in paths:
                    sparse_rewards = np.stack(
                        [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                    p['rewards'] = sparse_rewards

            train_returns.append(eval_util.get_average_returns(paths))
        train_returns = np.mean(train_returns)
        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_online_returns = self._do_eval(
            indices, epoch)
        eval_util.dprint('train online returns')
        eval_util.dprint(train_online_returns)

        ### test tasks
        eval_util.dprint('evaluating on {} test tasks'.format(
            len(self.eval_tasks)))
        test_final_returns, test_online_returns = self._do_eval(
            self.eval_tasks, epoch)
        eval_util.dprint('test online returns')
        eval_util.dprint(test_online_returns)

        # save the final posterior
        self.agent.log_diagnostics(self.eval_statistics)

        if hasattr(self.env, "log_diagnostics"):
            self.env.log_diagnostics(paths, prefix=None)

        avg_train_return = np.mean(train_final_returns)
        avg_test_return = np.mean(test_final_returns)
        avg_train_online_return = np.mean(np.stack(train_online_returns),
                                          axis=0)
        avg_test_online_return = np.mean(np.stack(test_online_returns), axis=0)
        self.eval_statistics[
            'AverageTrainReturn_all_train_tasks'] = train_returns
        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_test_tasks'] = avg_test_return
        logger.save_extra_data(avg_train_online_return,
                               path='online-train-epoch{}'.format(epoch))
        logger.save_extra_data(avg_test_online_return,
                               path='online-test-epoch{}'.format(epoch))

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.render_eval_paths:
            self.env.render_paths(paths)

        if self.plotter:
            self.plotter.draw()

    @abc.abstractmethod
    def training_mode(self, mode):
        """
        Set training mode to `mode`.
        :param mode: If True, training will happen (e.g. set the dropout
        probabilities to not all ones).
        """
        pass

    @abc.abstractmethod
    def _do_training(self):
        """
        Perform some update, e.g. perform one gradient step.
        :return:
        """
        pass
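The _do_eval method above computes two kinds of statistics per task: a "final" return (the last rollout of each independent run, averaged over runs) and "online" returns (per-rollout averages, truncated to the shortest run). A minimal, self-contained sketch of that aggregation on toy data; the helper name and the per-rollout return values below are illustrative and not taken from the repository:

import numpy as np

def aggregate_eval_returns(per_run_returns):
    """per_run_returns: one list of per-rollout returns for each independent run."""
    # final return: average the last rollout's return across runs
    final_return = np.mean([run[-1] for run in per_run_returns])
    # online returns: truncate every run to the shortest one, then average per rollout index
    n = min(len(run) for run in per_run_returns)
    online_returns = np.mean(np.stack([run[:n] for run in per_run_returns]), axis=0)
    return final_return, online_returns

# toy usage: three runs with different numbers of rollouts
runs = [[1.0, 2.0, 3.5], [0.5, 2.5, 3.0, 3.2], [1.5, 2.0]]
final, online = aggregate_eval_returns(runs)
print(final)   # mean of the last returns [3.5, 3.2, 2.0]
print(online)  # average return of the 1st and 2nd rollout across runs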
Example n. 10
0
    def __init__(
            self,
            env,
            policy,
            train_tasks,
            eval_tasks,
            meta_batch=64,
            num_iterations=100,
            num_train_steps_per_itr=1000,
            num_tasks_sample=100,
            num_steps_per_task=100,
            num_evals=10,
            num_steps_per_eval=1000,
            batch_size=1024,
            embedding_batch_size=1024,
            embedding_mini_batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            train_embedding_source='posterior_only',
            eval_embedding_source='initial_pool',
            eval_deterministic=True,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            obs_emb_dim=0):
        """
        Base class for Meta RL Algorithms
        :param env: training env
        :param policy: policy that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval
        :param meta_batch: number of tasks used for meta-update
        :param num_iterations: number of meta-updates taken
        :param num_train_steps_per_itr: number of meta-updates performed per iteration
        :param num_tasks_sample: number of train tasks to sample to collect data for
        :param num_steps_per_task: number of transitions to collect per task
        :param num_evals: number of independent evaluation runs, with separate task encodings
        :param num_steps_per_eval: number of transitions to sample for evaluation
        :param batch_size: size of batches used to compute RL update
        :param embedding_batch_size: size of batches used to compute embedding
        :param embedding_mini_batch_size: size of batch used for encoder update
        :param max_path_length: max episode length
        :param discount:
        :param replay_buffer_size: max replay buffer size
        :param reward_scale:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        """
        self.env = env
        self.policy = policy
        self.exploration_policy = policy  # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_per_task = num_steps_per_task
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = min(
            int(replay_buffer_size / (len(train_tasks))), 1000)
        self.reward_scale = reward_scale
        self.train_embedding_source = train_embedding_source
        self.eval_embedding_source = eval_embedding_source  # TODO: add options for computing embeddings on train tasks too
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_sampler = InPlacePathSampler(
            env=env,
            policy=policy,
            max_samples=self.num_steps_per_eval,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        # - testing encoder
        self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                   env,
                                                   self.train_tasks,
                                                   state_dim=obs_emb_dim)

        self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                       env,
                                                       self.train_tasks,
                                                       state_dim=obs_emb_dim)
        self.eval_enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.eval_tasks,
            state_dim=obs_emb_dim)

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
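In the constructor above, the total replay capacity is divided across the training tasks (and additionally clamped to 1000 transitions per task by the min(...)), so each task effectively owns a fixed-size slice of the buffer. A toy illustration of that per-task split using a plain dict of bounded deques instead of the repository's MultiTaskReplayBuffer; all names here are made up for the sketch:

from collections import deque

class ToyMultiTaskBuffer:
    """One bounded FIFO buffer per task; capacity split evenly across tasks."""

    def __init__(self, total_size, task_indices, per_task_cap=1000):
        per_task = min(total_size // max(len(task_indices), 1), per_task_cap)
        self.task_buffers = {idx: deque(maxlen=per_task) for idx in task_indices}

    def add_sample(self, task_idx, transition):
        self.task_buffers[task_idx].append(transition)

    def num_steps_can_sample(self, task_idx):
        return len(self.task_buffers[task_idx])

buf = ToyMultiTaskBuffer(total_size=1000000, task_indices=list(range(50)))
buf.add_sample(3, {'obs': 0.0, 'action': 1.0, 'reward': 0.5})
print(buf.num_steps_can_sample(3))  # 1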
Example n. 11
0
    def __init__(
            self,
            env_sampler,
            exploration_policy: ExplorationPolicy,
            neural_process,
            train_neural_process=False,
            latent_repr_mode='concat_params',  # OR concat_samples
            num_latent_samples=5,
            num_epochs=100,
            num_steps_per_epoch=10000,
            num_steps_per_eval=1000,
            num_updates_per_env_step=1,
            batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            eval_sampler=None,
            eval_policy=None,
            replay_buffer=None,
            epoch_to_start_training=0):
        """
        Base class for RL Algorithms
        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param reward_scale:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param replay_buffer:
        """
        assert not train_neural_process, 'Have not implemented it yet! Remember to set it to train mode when training'
        self.neural_process = neural_process
        self.neural_process.set_mode('eval')
        self.latent_repr_mode = latent_repr_mode
        self.num_latent_samples = num_latent_samples
        self.env_sampler = env_sampler
        env, env_specs = env_sampler()
        self.training_env, _ = env_sampler(env_specs)
        # self.training_env = training_env or pickle.loads(pickle.dumps(env))
        # self.training_env = training_env or deepcopy(env)
        self.exploration_policy = exploration_policy
        self.num_epochs = num_epochs
        self.num_env_steps_per_epoch = num_steps_per_epoch
        self.num_steps_per_eval = num_steps_per_eval
        self.num_updates_per_train_call = num_updates_per_env_step
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        self.epoch_to_start_training = epoch_to_start_training

        if self.latent_repr_mode == 'concat_params':

            def get_latent_repr(posterior_state):
                z_mean, z_cov = self.neural_process.get_posterior_params(
                    posterior_state)
                return np.concatenate([z_mean, z_cov])

            self.extra_obs_dim = 2 * self.neural_process.z_dim
        else:

            def get_latent_repr(posterior_state):
                z_mean, z_cov = self.neural_process.get_posterior_params(
                    posterior_state)
                samples = np.random.multivariate_normal(
                    z_mean, np.diag(z_cov), self.num_latent_samples)
                samples = samples.flatten()
                return samples

            self.extra_obs_dim = self.num_latent_samples * self.neural_process.z_dim
        self.get_latent_repr = get_latent_repr

        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
                neural_process=neural_process,
                latent_repr_fn=get_latent_repr,
                reward_scale=reward_scale)
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler

        self.action_space = env.action_space
        self.obs_space = env.observation_space

        self.env = env
        obs_space_dim = gym_get_dim(self.obs_space)
        act_space_dim = gym_get_dim(self.action_space)
        if replay_buffer is None:
            replay_buffer = SimpleReplayBuffer(
                self.replay_buffer_size,
                obs_space_dim + self.extra_obs_dim,
                act_space_dim,
                discrete_action_dim=isinstance(self.action_space, Discrete))
        self.replay_buffer = replay_buffer

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
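The two latent_repr_mode branches above differ only in how the Gaussian posterior over z is flattened into extra observation dimensions for the policy: 'concat_params' concatenates the posterior mean with its diagonal covariance (2 * z_dim extra dims), while the sampling mode draws a fixed number of samples and flattens them (num_latent_samples * z_dim extra dims). A numpy-only sketch with a made-up posterior standing in for the neural process:

import numpy as np

z_mean = np.array([0.1, -0.3, 0.7])    # assumed posterior mean (z_dim = 3)
z_cov = np.array([0.05, 0.02, 0.10])   # assumed diagonal covariance

# 'concat_params': the policy sees mean ++ diagonal covariance -> 2 * z_dim extra dims
concat_params = np.concatenate([z_mean, z_cov])            # shape (6,)

# sampling mode: the policy sees k flattened samples -> k * z_dim extra dims
k = 5
samples = np.random.multivariate_normal(z_mean, np.diag(z_cov), k)
concat_samples = samples.flatten()                          # shape (15,)

print(concat_params.shape, concat_samples.shape)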
class MetaRLAlgorithm(metaclass=abc.ABCMeta):
    def __init__(
        self,
        env,
        agent,
        train_goals,
        wd_goals,
        ood_goals,
        replay_buffers,
        meta_batch_size=64,
        num_iterations=100,
        num_train_steps_per_itr=1000,
        num_tasks=100,
        num_steps_prior=100,
        num_steps_posterior=100,
        num_extra_rl_steps_posterior=100,
        num_evals=10,
        num_steps_per_eval=1000,
        max_path_length=1000,
        discount=0.99,
        reward_scale=1,
        num_exp_traj_eval=1,
        eval_deterministic=True,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=False,
        render_eval_paths=False,
        dump_eval_paths=False,
        plotter=None,
        use_same_context=True,
        recurrent=False,
    ):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.train_goals = train_goals
        self.wd_goals = wd_goals
        self.ood_goals = ood_goals
        self.replay_buffers = replay_buffers
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.meta_batch_size = meta_batch_size
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.max_path_length = max_path_length
        self.discount = discount
        self.reward_scale = reward_scale
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        self.use_same_context = use_same_context
        self.recurrent = recurrent

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter

        self.sampler = InPlacePathSampler(
            env=env,
            policy=agent,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []

    def make_exploration_policy(self, policy):
        return policy

    def make_eval_policy(self, policy):
        return policy

    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)

            # Sample train tasks and compute gradient updates on parameters.
            batch_idxes = np.random.randint(0,
                                            len(self.train_goals),
                                            size=self.meta_batch_size)
            train_batch_obj_id = self.replay_buffers.sample_training_data(
                batch_idxes, self.use_same_context)
            for _ in range(self.num_train_steps_per_itr):
                train_raw_batch = ray.get(train_batch_obj_id)
                gt.stamp('sample_training_data', unique=False)

                batch_idxes = np.random.randint(0,
                                                len(self.train_goals),
                                                size=self.meta_batch_size)
                # In this way, we can start the data sampling job for the
                # next training step while training on the current batch
                # (see the prefetch sketch after this example).
                train_batch_obj_id = self.replay_buffers.sample_training_data(
                    batch_idxes, self.use_same_context)
                gt.stamp('set_up_sampling', unique=False)

                train_data = self.construct_training_batch(train_raw_batch)
                gt.stamp('construct_training_batch', unique=False)

                self._do_training(train_data)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
            if it_ == self.num_iterations - 1:
                # save the final snapshot once the last iteration finishes
                logger.save_itr_params(it_, self.agent.get_snapshot())

    def construct_training_batch(self, raw_batch):
        ''' Construct training batch from raw batch'''
        state = np.concatenate([rb[0] for rb in raw_batch], axis=0)
        next_state = np.concatenate([rb[1] for rb in raw_batch], axis=0)
        actions = np.concatenate([rb[2] for rb in raw_batch], axis=0)
        rewards = np.concatenate([rb[3] for rb in raw_batch], axis=0)
        dones = np.concatenate([rb[4] for rb in raw_batch], axis=0)
        contexts = np.concatenate([rb[5] for rb in raw_batch], axis=0)

        return [state, next_state, actions, rewards, dones, contexts]

    def pretrain(self):
        """
        Do anything before the main training phase.
        """
        pass

    def _try_to_eval(self, epoch):
        logger.save_extra_data(self.get_extra_data_to_save(epoch))
        if self._can_evaluate():
            self.evaluate(epoch)

            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)

            table_keys = logger.get_table_key_set()
            if self._old_table_keys is not None:
                assert table_keys == self._old_table_keys, (
                    "Table keys cannot change from iteration to iteration.")
            self._old_table_keys = table_keys

            logger.record_tabular(
                "Number of train steps total",
                self._n_train_steps_total,
            )
            logger.record_tabular(
                "Number of env steps total",
                self._n_env_steps_total,
            )
            logger.record_tabular(
                "Number of rollouts total",
                self._n_rollouts_total,
            )

            times_itrs = gt.get_times().stamps.itrs
            train_time = times_itrs['train'][-1]
            sample_time = times_itrs['sample_training_data'][-1]
            eval_time = times_itrs['eval'][-1] if epoch > 0 else 0
            epoch_time = train_time + sample_time + eval_time
            total_time = gt.get_times().total

            logger.record_tabular('Train Time (s)', train_time)
            logger.record_tabular('(Previous) Eval Time (s)', eval_time)
            logger.record_tabular('Sample Time (s)', sample_time)
            logger.record_tabular('Epoch Time (s)', epoch_time)
            logger.record_tabular('Total Train Time (s)', total_time)

            logger.record_tabular("Epoch", epoch)
            logger.dump_tabular(with_prefix=False, with_timestamp=False)
        else:
            logger.log("Skipping eval for now.")

    def _can_evaluate(self):
        """
        One annoying thing about the logger table is that the keys at each
        iteration need to be the exact same. So unless you can compute
        everything, skip evaluation.

        A common example for why you might want to skip evaluation is that at
        the beginning of training, you may not have enough data for a
        validation and training set.

        :return:
        """
        # eval collects its own context, so can eval any time
        return True

    def _can_train(self):
        return True

    def _get_action_and_info(self, agent, observation):
        """
        Get an action to take in the environment.
        :param observation:
        :return:
        """
        agent.set_num_steps_total(self._n_env_steps_total)
        return agent.get_action(observation, )

    def _start_epoch(self, epoch):
        self._epoch_start_time = time.time()
        self._exploration_paths = []
        self._do_train_time = 0
        logger.push_prefix('Iteration #%d | ' % epoch)

    def _end_epoch(self):
        logger.log("Epoch Duration: {0}".format(time.time() -
                                                self._epoch_start_time))
        logger.log("Started Training: {0}".format(self._can_train()))
        logger.pop_prefix()

    ##### Snapshotting utils #####

    def get_epoch_snapshot(self, epoch):
        data_to_save = dict(
            epoch=epoch,
            exploration_policy=self.agent,
        )
        if self.save_environment:
            data_to_save['env'] = self.env
        return data_to_save

    def get_extra_data_to_save(self, epoch):
        """
        Save things that shouldn't be saved every snapshot but rather
        overwritten every time.
        :param epoch:
        :return:
        """
        if self.render:
            self.env.render(close=True)
        data_to_save = dict(epoch=epoch, )
        if self.save_algorithm:
            data_to_save['algorithm'] = self
        if epoch == self.num_iterations - 1:
            data_to_save['algorithm'] = self
        return data_to_save

    def collect_paths(self, goal, epoch, run):
        self.env.set_goal(goal)

        self.agent.clear_z()
        paths = []
        num_transitions = 0
        num_trajs = 0
        while num_transitions < self.num_steps_per_eval:
            path, num = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.num_steps_per_eval - num_transitions,
                max_trajs=1,
                accum_context=True)
            paths += path
            num_transitions += num
            num_trajs += 1
            if num_trajs >= self.num_exp_traj_eval:
                self.agent.infer_posterior(self.agent.context)

        if self.sparse_rewards:
            for p in paths:
                sparse_rewards = np.stack(
                    [e['sparse_reward'] for e in p['env_infos']]).reshape(-1, 1)
                p['rewards'] = sparse_rewards

        goal = self.env._goal
        for path in paths:
            path['goal'] = goal  # goal

        # save the paths for visualization, only useful for point mass
        if self.dump_eval_paths:
            logger.save_extra_data(
                paths,
                path='eval_trajectories/eval_goal{}-epoch{}-run{}'.format(
                    goal, epoch, run))

        return paths

    def _do_eval(self, goal_set, epoch):

        final_returns = []
        final_achieved = []
        for goal in goal_set:
            all_rets = []
            all_achieved = []
            for r in range(self.num_evals):
                paths = self.collect_paths(goal, epoch, r)
                all_rets.append(
                    [eval_util.get_average_returns([p]) for p in paths])
                all_achieved.append(
                    [eval_util.get_average_achieved([p]) for p in paths])
            final_returns.append(np.mean([a[-1] for a in all_rets]))
            final_achieved.append(np.mean([a[-1] for a in all_achieved]))

        return final_returns, final_achieved

    def evaluate(self, epoch):
        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()

        ### sample trajectories from prior for debugging / visualization
        if self.dump_eval_paths:
            # 100 arbitrarily chosen for visualizations of point_robot trajectories
            # just want stochasticity of z, not the policy
            self.agent.clear_z()
            prior_paths, _ = self.sampler.obtain_samples(
                deterministic=self.eval_deterministic,
                max_samples=self.max_path_length * 20,
                accum_context=False,
                resample=1)
            logger.save_extra_data(
                prior_paths,
                path='eval_trajectories/prior-epoch{}'.format(epoch))

        ### train tasks
        # eval on a subset of train tasks for speed
        eval_util.dprint('evaluating on {} train tasks'.format(
            len(self.train_goals)))

        ### eval train tasks with on-policy data to match eval of test tasks
        train_final_returns, train_final_achieved = self._do_eval(
            self.train_goals, epoch)

        # Comment this line for walker-param
        # train_final_achieved_pair = [(train_final_achieved[i], goal) for i, goal in enumerate(self.train_goals)]
        train_final_achieved_pair = [(train_final_achieved[i], -1)
                                     for i, goal in enumerate(self.train_goals)
                                     ]

        eval_util.dprint('train final achieved')
        eval_util.dprint(train_final_achieved_pair)

        ### WD tasks

        eval_util.dprint('evaluating on {} wd tasks'.format(len(
            self.wd_goals)))
        wd_final_returns, wd_final_achieved = self._do_eval(
            self.wd_goals, epoch)

        # Comment this line for walker-param
        # wd_final_achieved_pair = [(wd_final_achieved[i], goal) for i, goal in enumerate(self.wd_goals)]
        wd_final_achieved_pair = [(wd_final_achieved[i], -1)
                                  for i, goal in enumerate(self.wd_goals)]

        eval_util.dprint('WD test final achieved')
        eval_util.dprint(wd_final_achieved_pair)

        # ### OOD tasks

        # eval_util.dprint('evaluating on {} wd tasks'.format(len(self.ood_goals)))
        # ood_final_returns, ood_final_achieved = self._do_eval(self.ood_goals, epoch)

        # # Comment this line for walker-param
        # # ood_final_achieved_pair = [(ood_final_achieved[i], goal) for i, goal in enumerate(self.ood_goals)]
        # ood_final_achieved_pair = [(ood_final_achieved[i], -1) for i, goal in enumerate(self.ood_goals)]

        # eval_util.dprint('OOD test final achieved')
        # eval_util.dprint(ood_final_achieved_pair)

        # # save the final posterior
        # self.agent.log_diagnostics(self.eval_statistics)

        avg_train_return = np.mean(train_final_returns)
        avg_wd_return = np.mean(wd_final_returns)
        # avg_ood_return = np.mean(ood_final_returns)

        self.eval_statistics[
            'AverageReturn_all_train_tasks'] = avg_train_return
        self.eval_statistics['AverageReturn_all_wd_tasks'] = avg_wd_return
        # self.eval_statistics['AverageReturn_all_ood_tasks'] = avg_ood_return

        self.eval_statistics['Return_all_train_tasks'] = train_final_returns
        self.eval_statistics['Return_all_wd_tasks'] = wd_final_returns
        # self.eval_statistics['Return_all_ood_tasks'] = ood_final_returns

        self.eval_statistics[
            'Achieved_all_train_tasks'] = train_final_achieved_pair
        self.eval_statistics['Achieved_all_wd_tasks'] = wd_final_achieved_pair
        # self.eval_statistics['Achieved_all_ood_tasks'] = ood_final_achieved_pair

        for key, value in self.eval_statistics.items():
            logger.record_tabular(key, value)
        self.eval_statistics = None

        if self.plotter:
            self.plotter.draw()

    @abc.abstractmethod
    def training_mode(self, mode):
        """
        Set training mode to `mode`.
        :param mode: If True, training will happen (e.g. set the dropout
        probabilities to not all ones).
        """
        pass

    @abc.abstractmethod
    def _do_training(self, train_data):
        """
        Perform some update, e.g. perform one gradient step.
        :return:
        """
        pass
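The comment inside train() above describes a simple prefetch pattern: the object reference for the next training batch is requested before ray.get blocks on the current one, so sampling in the replay-buffer actor overlaps with the gradient updates. A stripped-down sketch of the same pattern with a dummy remote sampler; the function and variable names are illustrative and not the repository's API:

import time
import numpy as np
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
def sample_batch(seed):
    # stand-in for replay_buffers.sample_training_data: pretend sampling is slow
    time.sleep(0.1)
    return np.random.default_rng(seed).normal(size=(64, 8))

next_ref = sample_batch.remote(0)          # kick off the first request
for step in range(1, 4):
    batch = ray.get(next_ref)              # block only on the batch needed now
    next_ref = sample_batch.remote(step)   # immediately queue the next batch
    # ... the gradient update on `batch` would run here, overlapping with sampling ...
    print(step, batch.shape)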
Example n. 14
0
    def __init__(
            self,
            env,
            agent,
            train_tasks,
            eval_tasks,
            meta_batch=64,
            num_iterations=100,
            num_train_steps_per_itr=1000,
            num_initial_steps=100,
            num_tasks_sample=100,
            num_steps_prior=100,
            num_steps_posterior=100,
            num_extra_rl_steps_posterior=100,
            num_evals=10,
            num_steps_per_eval=1000,
            batch_size=1024,
            low_batch_size=2048,  #TODO: Tune this batch size
            embedding_batch_size=1024,
            embedding_mini_batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            num_exp_traj_eval=1,
            update_post_train=1,
            eval_deterministic=True,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            render_eval_paths=False,
            dump_eval_paths=False,
            plotter=None,
            use_goals=False):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.use_goals = use_goals
        assert (agent.use_goals == self.use_goals)
        self.exploration_agent = agent  # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.low_batch_size = low_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter

        obs_dim = int(np.prod(env.observation_space.shape))
        action_dim = int(np.prod(env.action_space.shape))

        self.sampler = InPlacePathSampler(
            env=env,
            policy=agent,
            max_path_length=self.max_path_length,
        )

        # separate replay buffers for
        # - training RL update
        # - training encoder update

        self.enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )
        if self.use_goals:
            self.high_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                     env, self.train_tasks)
            #Hacky method for changing the obs and action dimensions for the internal
            #buffers since they're not the same as the original environment
            internal_buffers = dict([
                (idx,
                 SimpleReplayBuffer(
                     max_replay_buffer_size=self.replay_buffer_size,
                     observation_dim=obs_dim,
                     action_dim=obs_dim,
                 )) for idx in self.train_tasks
            ])
            self.high_buffer.task_buffers = internal_buffers

            self.low_buffer = SimpleReplayBuffer(
                max_replay_buffer_size=replay_buffer_size,
                observation_dim=2 * obs_dim,
                action_dim=action_dim,
            )
        else:
            self.replay_buffer = MultiTaskReplayBuffer(
                self.replay_buffer_size,
                env,
                self.train_tasks,
            )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
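The "hacky" buffer setup above encodes a goal-conditioned hierarchy: the high-level buffer's "actions" are goals living in observation space (hence action_dim=obs_dim), and the low-level buffer's observations are the state concatenated with the current goal (hence observation_dim=2 * obs_dim). A toy illustration of those shape choices in plain numpy, without the repository's buffer classes:

import numpy as np

obs_dim, action_dim = 4, 2

# high level: state -> goal, so its "action" lives in observation space
high_transition = {
    'obs': np.zeros(obs_dim),
    'action': 0.5 * np.ones(obs_dim),          # a goal, same shape as an observation
}

# low level: (state, goal) -> primitive action, so its observation is 2 * obs_dim wide
state, goal = np.zeros(obs_dim), 0.5 * np.ones(obs_dim)
low_transition = {
    'obs': np.concatenate([state, goal]),      # shape (2 * obs_dim,) = (8,)
    'action': np.zeros(action_dim),            # shape (action_dim,) = (2,)
}

print(high_transition['action'].shape, low_transition['obs'].shape)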
Example n. 15
0
    def __init__(self,
                 env,
                 agent,
                 train_tasks,
                 eval_tasks,
                 goal_radius,
                 eval_deterministic=True,
                 render=False,
                 render_eval_paths=False,
                 plotter=None,
                 **kwargs):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval
        :param goal_radius: reward threshold for defining sparse rewards

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.goal_radius = goal_radius

        self.meta_batch = kwargs['meta_batch']
        self.batch_size = kwargs['batch_size']
        self.num_iterations = kwargs['num_iterations']
        self.num_train_steps_per_itr = kwargs['num_train_steps_per_itr']
        self.num_initial_steps = kwargs['num_initial_steps']
        self.num_tasks_sample = kwargs['num_tasks_sample']
        self.num_steps_prior = kwargs['num_steps_prior']
        self.num_steps_posterior = kwargs['num_steps_posterior']
        self.num_extra_rl_steps_posterior = kwargs[
            'num_extra_rl_steps_posterior']
        self.num_evals = kwargs['num_evals']
        self.num_steps_per_eval = kwargs['num_steps_per_eval']
        self.embedding_batch_size = kwargs['embedding_batch_size']
        self.embedding_mini_batch_size = kwargs['embedding_mini_batch_size']
        self.max_path_length = kwargs['max_path_length']
        self.discount = kwargs['discount']
        self.replay_buffer_size = kwargs['replay_buffer_size']
        self.reward_scale = kwargs['reward_scale']
        self.update_post_train = kwargs['update_post_train']
        self.num_exp_traj_eval = kwargs['num_exp_traj_eval']
        self.save_replay_buffer = kwargs['save_replay_buffer']
        self.save_algorithm = kwargs['save_algorithm']
        self.save_environment = kwargs['save_environment']
        self.dump_eval_paths = kwargs['dump_eval_paths']
        self.data_dir = kwargs['data_dir']
        self.train_epoch = kwargs['train_epoch']
        self.eval_epoch = kwargs['eval_epoch']
        self.sample = kwargs['sample']
        self.n_trj = kwargs['n_trj']
        self.allow_eval = kwargs['allow_eval']
        self.mb_replace = kwargs['mb_replace']

        self.eval_deterministic = eval_deterministic
        self.render = render
        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.plotter = plotter

        self.train_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env,
                                                  self.train_tasks,
                                                  self.goal_radius)
        self.eval_buffer = MultiTaskReplayBuffer(self.replay_buffer_size, env,
                                                 self.eval_tasks,
                                                 self.goal_radius)
        self.replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                   env, self.train_tasks,
                                                   self.goal_radius)
        self.enc_replay_buffer = MultiTaskReplayBuffer(self.replay_buffer_size,
                                                       env, self.train_tasks,
                                                       self.goal_radius)
        # offline sampler which samples from the train/eval buffer
        self.offline_sampler = OfflineInPlacePathSampler(
            env=env, policy=agent, max_path_length=self.max_path_length)
        # online sampler for evaluation (if collect on-policy context, for offline context, use self.offline_sampler)
        self.sampler = InPlacePathSampler(env=env,
                                          policy=agent,
                                          max_path_length=self.max_path_length)

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
        self.init_buffer()

    # set up the policy
    # policy = joblib.load(POLICY_SAVE_PATH)['exploration_policy']
    policy = joblib.load(POLICY_SAVE_PATH)

    # set up the env
    # if env_specs['train_test_env']:
    #     _, training_env = get_env(env_specs)
    # else:
    #     training_env, _ = get_env(env_specs)

    # training_env = DebugFetchReachAndLiftEnv()
    training_env = WrappedRotatedFetchReachAnywhereEnv()

    # build an eval sampler that also renders
    eval_sampler = InPlacePathSampler(
        env=training_env,
        policy=policy,
        max_samples=max_samples,
        max_path_length=max_path_length,
        policy_uses_pixels=policy_specs['policy_uses_pixels'],
        policy_uses_task_params=policy_specs['policy_uses_task_params'],
        concat_task_params_to_policy_obs=policy_specs['concat_task_params_to_policy_obs'],
        animated=True
    )
    eval_sampler.obtain_samples()

    training_env.close()
    eval_sampler = None
Example n. 17
0
    def __init__(
            self,
            env,
            exploration_policy: ExplorationPolicy,
            training_env=None,
            num_epochs=100,
            num_steps_per_epoch=10000,
            num_steps_per_eval=1000,
            num_updates_per_env_step=1,
            max_num_episodes=None,
            batch_size=1024,
            max_path_length=1000,
            discount=0.99,
            replay_buffer_size=1000000,
            reward_scale=1,
            render=False,
            save_replay_buffer=False,
            save_algorithm=False,
            save_environment=False,
            save_best=False,
            save_best_starting_from_epoch=0,
            eval_sampler=None,
            eval_policy=None,
            replay_buffer=None,
            # for compatibility with deepmind control suite
            # Right now the semantics is that if observations is not a dictionary
            # then it means the policy just uses that. If it's a dictionary, it
            # checks whether policy_uses_pixels to see if it's true or false and
            # based on that it decides whether the policy takes 'pixels' or 'obs'
            # from the dictionary
            policy_uses_pixels=False,
            freq_saving=1,
            # for meta-learning
            policy_uses_task_params=False, # whether the policy uses the task parameters
            concat_task_params_to_policy_obs=False, # how the policy sees the task parameters
            # this is useful when you want to generate trajectories from the expert using the
            # exploration policy
            do_not_train=False,
            # some environment like halfcheetah_v2 have a timelimit that defines the terminal
            # this is used as a minor hack to turn off time limits
            no_terminal=False,
            **kwargs
    ):
        """
        Base class for RL Algorithms
        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param reward_scale:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param replay_buffer:
        """
        self.training_env = training_env or pickle.loads(pickle.dumps(env))
        # self.training_env = training_env or deepcopy(env)
        self.exploration_policy = exploration_policy
        self.num_epochs = num_epochs
        self.num_env_steps_per_epoch = num_steps_per_epoch
        self.num_steps_per_eval = num_steps_per_eval
        self.num_updates_per_train_call = num_updates_per_env_step
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        self.save_best = save_best
        self.save_best_starting_from_epoch = save_best_starting_from_epoch
        self.policy_uses_pixels = policy_uses_pixels
        self.policy_uses_task_params = policy_uses_task_params
        self.concat_task_params_to_policy_obs = concat_task_params_to_policy_obs
        self.freq_saving = freq_saving
        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length, policy_uses_pixels=policy_uses_pixels,
                policy_uses_task_params=policy_uses_task_params,
                concat_task_params_to_policy_obs=concat_task_params_to_policy_obs
            )
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler

        self.action_space = env.action_space
        self.obs_space = env.observation_space
        self.env = env
        if replay_buffer is None:
            replay_buffer = EnvReplayBuffer(
                self.replay_buffer_size,
                self.env,
                policy_uses_pixels=self.policy_uses_pixels,
                policy_uses_task_params=self.policy_uses_task_params,
                concat_task_params_to_policy_obs=self.concat_task_params_to_policy_obs
            )
        self.replay_buffer = replay_buffer

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
        self.do_not_train = do_not_train
        self.num_episodes = 0
        self.max_num_episodes = max_num_episodes if max_num_episodes is not None else float('inf')
        self.no_terminal = no_terminal
def experiment(log_dir, variant_overwrite, cpu=False):
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    # assert all([a == b for a, b in zip(env.sampled_goal, variant['env_kwargs']['goal_prior'])])

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    #algorithm.train()
    samples = algorithm.get_eval_paths()
    #for path in samples:
    #    print(path['observations'])

    #plt.figure()
    #plt.plot(samples[0]['observations'][:, 0], samples[0]['observations'][:, 1])
    #plt.plot(3, 2)
    #plt.show()
    print(env.reset())
    print(samples[0]['observations'])
    i = 0
    for path in samples:

        np.save('./outtem/out%i.npy' % i, path['observations'])
        i = i + 1
    #print(algorithm.policy.get_action(np.array([0,0])))
    from rlkit.samplers.util import rollout
    from rlkit.samplers.in_place import InPlacePathSampler
    #path=rollout(env,algorithm.eval_policy,50)
    eval_sampler = InPlacePathSampler(
        env=env,
        policy=algorithm.eval_policy,
        max_samples=100,
        max_path_length=50,
    )
    path = eval_sampler.obtain_samples()
    print(path[0]['observations'])
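The commented-out matplotlib lines above suggest what the saved arrays are for. A small, hypothetical follow-up script that loads the ./outtem/out*.npy files written by the loop and plots each trajectory, assuming the first two observation dimensions are x/y positions as in the point environments:

import glob
import numpy as np
import matplotlib.pyplot as plt

for fname in sorted(glob.glob('./outtem/out*.npy')):
    obs = np.load(fname)                 # shape (path_length, obs_dim)
    plt.plot(obs[:, 0], obs[:, 1], alpha=0.7)

plt.xlabel('x')
plt.ylabel('y')
plt.title('Evaluation trajectories')
plt.show()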
Example n. 19
0
    def __init__(
        self,
        env,
        exploration_policy: ExplorationPolicy,
        training_env=None,
        num_epochs=100,
        num_steps_per_epoch=10000,
        num_steps_per_eval=1000,
        num_updates_per_env_step=1,
        batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        reward_scale=1,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=True,
        eval_sampler=None,
        eval_policy=None,
        replay_buffer=None,
    ):
        """
        Base class for RL Algorithms
        :param env: Environment used to evaluate.
        :param exploration_policy: Policy used to explore
        :param training_env: Environment used by the algorithm. By default, a
        copy of `env` will be made.
        :param num_epochs:
        :param num_steps_per_epoch:
        :param num_steps_per_eval:
        :param num_updates_per_env_step: Used by online training mode.
        :param num_updates_per_epoch: Used by batch training mode.
        :param batch_size:
        :param max_path_length:
        :param discount:
        :param replay_buffer_size:
        :param reward_scale:
        :param render:
        :param save_replay_buffer:
        :param save_algorithm:
        :param save_environment:
        :param eval_sampler:
        :param eval_policy: Policy to evaluate with.
        :param replay_buffer:
        """
        self.training_env = training_env or pickle.loads(pickle.dumps(env))
        self.exploration_policy = exploration_policy
        self.num_epochs = num_epochs
        self.num_env_steps_per_epoch = num_steps_per_epoch
        self.num_steps_per_eval = num_steps_per_eval
        self.num_updates_per_train_call = num_updates_per_env_step
        self.batch_size = batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment
        if eval_sampler is None:
            if eval_policy is None:
                eval_policy = exploration_policy
            eval_sampler = InPlacePathSampler(
                env=env,
                policy=eval_policy,
                max_samples=self.num_steps_per_eval + self.max_path_length,
                max_path_length=self.max_path_length,
            )
        self.eval_policy = eval_policy
        self.eval_sampler = eval_sampler

        self.action_space = env.action_space
        self.obs_space = env.observation_space
        self.env = env
        if replay_buffer is None:
            replay_buffer = EnvReplayBuffer(
                self.replay_buffer_size,
                self.env,
            )
        self.replay_buffer = replay_buffer

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
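A side note on the default branch above: when no eval_sampler is supplied, the constructor asks InPlacePathSampler for num_steps_per_eval + max_path_length samples. My reading (not stated in the snippet) is that the extra path length gives the sampler headroom to finish its final whole trajectory while still covering the evaluation budget. The arithmetic, spelled out with illustrative numbers:

# Illustrative numbers only; the constructor's defaults are 1000 and 1000.
num_steps_per_eval = 1000
max_path_length = 100

# Budget handed to InPlacePathSampler in the default branch: room for the
# target number of evaluation steps plus one full-length trajectory.
max_samples = num_steps_per_eval + max_path_length
print(max_samples)  # 1100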
Esempio n. 20
0
    def __init__(
        self,
        env,
        agent,
        train_tasks,
        eval_tasks,
        meta_batch=64,
        num_iterations=100,
        num_train_steps_per_itr=1000,
        num_initial_steps=100,
        num_tasks_sample=100,
        num_steps_prior=100,
        num_steps_posterior=100,
        num_extra_rl_steps_posterior=100,
        num_evals=10,
        num_steps_per_eval=1000,
        batch_size=1024,
        embedding_batch_size=1024,
        embedding_mini_batch_size=1024,
        max_path_length=1000,
        discount=0.99,
        replay_buffer_size=1000000,
        reward_scale=1,
        num_exp_traj_eval=1,
        update_post_train=1,
        eval_deterministic=True,
        render=False,
        save_replay_buffer=False,
        save_algorithm=False,
        save_environment=False,
        render_eval_paths=False,
        dump_eval_paths=False,
        plotter=None,
        dyna=False,
        dyna_num_train_itr=50,
        dyna_num_train_steps_per_itr=50,
        dyna_tandem_train=True,
        dyna_n_layers=3,
        dyna_hidden_layer_size=64,
        dyna_learning_rate=1e-3,
    ):
        """
        :param env: training env
        :param agent: agent that is conditioned on a latent variable z that rl_algorithm is responsible for feeding in
        :param train_tasks: list of tasks used for training
        :param eval_tasks: list of tasks used for eval

        see default experiment config file for descriptions of the rest of the arguments
        """
        self.env = env
        self.agent = agent
        self.exploration_agent = agent  # Can potentially use a different policy purely for exploration rather than also solving tasks, currently not being used
        self.train_tasks = train_tasks
        self.eval_tasks = eval_tasks
        self.meta_batch = meta_batch
        self.num_iterations = num_iterations
        self.num_train_steps_per_itr = num_train_steps_per_itr
        self.num_initial_steps = num_initial_steps
        self.num_tasks_sample = num_tasks_sample
        self.num_steps_prior = num_steps_prior
        self.num_steps_posterior = num_steps_posterior
        self.num_extra_rl_steps_posterior = num_extra_rl_steps_posterior
        self.num_evals = num_evals
        self.num_steps_per_eval = num_steps_per_eval
        self.batch_size = batch_size
        self.embedding_batch_size = embedding_batch_size
        self.embedding_mini_batch_size = embedding_mini_batch_size
        self.max_path_length = max_path_length
        self.discount = discount
        self.replay_buffer_size = replay_buffer_size
        self.reward_scale = reward_scale
        self.update_post_train = update_post_train
        self.num_exp_traj_eval = num_exp_traj_eval
        self.eval_deterministic = eval_deterministic
        self.render = render
        self.save_replay_buffer = save_replay_buffer
        self.save_algorithm = save_algorithm
        self.save_environment = save_environment

        self.eval_statistics = None
        self.render_eval_paths = render_eval_paths
        self.dump_eval_paths = dump_eval_paths
        self.plotter = plotter

        self.dyna = dyna
        self.dyna_num_train_itr = dyna_num_train_itr
        self.dyna_num_train_steps_per_itr = dyna_num_train_steps_per_itr
        self.dyna_tandem_train = dyna_tandem_train
        self.dyna_n_layers = dyna_n_layers
        self.dyna_hidden_layer_size = dyna_hidden_layer_size
        self.dyna_learning_rate = dyna_learning_rate

        if dyna:
            self.sampler = DynamicsSampler(
                env=env,
                policy=agent,
                max_path_length=self.max_path_length,
                num_train_itr=dyna_num_train_itr,
                num_train_steps_per_itr=dyna_num_train_steps_per_itr,
                tandem_train=dyna_tandem_train,
                n_layers=dyna_n_layers,
                hidden_layer_size=dyna_hidden_layer_size,
                learning_rate=dyna_learning_rate,
            )
        else:
            self.sampler = InPlacePathSampler(
                env=env,
                policy=agent,
                max_path_length=self.max_path_length,
            )

        # separate replay buffers for
        # - training RL update
        # - training encoder update
        self.replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self.enc_replay_buffer = MultiTaskReplayBuffer(
            self.replay_buffer_size,
            env,
            self.train_tasks,
        )

        self._n_env_steps_total = 0
        self._n_train_steps_total = 0
        self._n_rollouts_total = 0
        self._do_train_time = 0
        self._epoch_start_time = None
        self._algo_start_time = None
        self._old_table_keys = None
        self._current_path_builder = PathBuilder()
        self._exploration_paths = []
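The constructor above picks its sampler with a small if/else: a model-based DynamicsSampler when dyna=True, otherwise a plain InPlacePathSampler. A sketch of the same choice pulled out into a helper (the function make_sampler is mine, and the DynamicsSampler keyword names are copied from the snippet; its import path is project-specific, so treat this as illustrative):

from rlkit.samplers.in_place import InPlacePathSampler
# DynamicsSampler is project-specific; import it from wherever this codebase defines it.

def make_sampler(env, agent, max_path_length, dyna=False, **dyna_kwargs):
    # Mirror the branch in the constructor: model-based sampling when dyna is
    # enabled, in-place environment rollouts otherwise.
    if dyna:
        return DynamicsSampler(
            env=env,
            policy=agent,
            max_path_length=max_path_length,
            **dyna_kwargs,
        )
    return InPlacePathSampler(
        env=env,
        policy=agent,
        max_path_length=max_path_length,
    )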
Esempio n. 21
0
class DdpgQfCombiner(object):
    def __init__(
            self,
            env,
            qf1,
            qf2,
            policy,
            replay_buffer1,
            replay_buffer2,
            num_epochs=1000,
            num_steps_per_epoch=1000,
            policy_learning_rate=1e-4,
            batch_size=128,
            num_steps_per_eval=3000,
            max_path_length=300,
            discount=0.99,
    ):
        super().__init__()
        self.env = env
        self.qf1 = qf1
        self.qf2 = qf2
        self.policy = policy
        self.replay_buffer1 = replay_buffer1
        self.replay_buffer2 = replay_buffer2
        self.num_steps_per_epoch = num_steps_per_epoch
        self.num_epochs = num_epochs
        self.policy_learning_rate = policy_learning_rate
        self.batch_size = batch_size
        self.discount = discount

        self.eval_sampler = InPlacePathSampler(
            env=env,
            policy=self.policy,
            max_samples=num_steps_per_eval,
            max_path_length=max_path_length,
        )

        self.policy_optimizer = optim.Adam(self.policy.parameters(),
                                           lr=self.policy_learning_rate)

    def train(self):
        for epoch in range(self.num_epochs):
            logger.push_prefix('Iteration #%d | ' % epoch)

            start_time = time.time()
            for _ in range(self.num_steps_per_epoch):
                batch = self.get_batch()
                train_dict = self.get_train_dict(batch)

                self.policy_optimizer.zero_grad()
                policy_loss = train_dict['Policy Loss']
                policy_loss.backward()
                self.policy_optimizer.step()
            logger.log("Train time: {}".format(time.time() - start_time))

            start_time = time.time()
            self.evaluate(epoch)
            logger.log("Eval time: {}".format(time.time() - start_time))

            params = self.get_epoch_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            logger.pop_prefix()

    def to(self, device=ptu.device):
        self.policy.to(device)
        self.qf1.to(device)
        self.qf2.to(device)

    def get_batch(self):
        sample_size = self.batch_size // 2
        batch1 = self.replay_buffer1.random_batch(sample_size)
        batch2 = self.replay_buffer2.random_batch(sample_size)
        new_batch = {}
        for k, v in batch1.items():
            new_batch[k] = np.concatenate(
                (
                    v,
                    batch2[k]
                ),
                axis=0,
            )
        return np_to_pytorch_batch(new_batch)

    def get_train_dict(self, batch):
        obs = batch['observations']

        policy_actions = self.policy(obs)
        q_output = self.qf1(obs, policy_actions) + self.qf2(obs, policy_actions)
        policy_loss = - q_output.mean()

        return OrderedDict([
            ('Policy Actions', policy_actions),
            ('Policy Loss', policy_loss),
            ('QF Outputs', q_output),
        ])

    def evaluate(self, epoch):
        """
        Perform evaluation for this algorithm.

        :param epoch: The epoch number.
        """
        statistics = OrderedDict()

        train_batch = self.get_batch()
        statistics.update(self._statistics_from_batch(train_batch, "Train"))

        logger.log("Collecting samples for evaluation")
        test_paths = self._sample_eval_paths()
        statistics.update(get_generic_path_information(
            test_paths, stat_prefix="Test",
        ))
        statistics.update(self._statistics_from_paths(test_paths, "Test"))
        average_returns = get_average_returns(test_paths)
        statistics['AverageReturn'] = average_returns

        statistics['Epoch'] = epoch

        for key, value in statistics.items():
            logger.record_tabular(key, value)

        self.env.log_diagnostics(test_paths)
        logger.dump_tabular(with_prefix=False, with_timestamp=False)

    def _statistics_from_paths(self, paths, stat_prefix):
        rewards, terminals, obs, actions, next_obs = split_paths(paths)
        np_batch = dict(
            rewards=rewards,
            terminals=terminals,
            observations=obs,
            actions=actions,
            next_observations=next_obs,
        )
        batch = np_to_pytorch_batch(np_batch)
        statistics = self._statistics_from_batch(batch, stat_prefix)
        statistics.update(create_stats_ordered_dict(
            'Num Paths', len(paths), stat_prefix=stat_prefix
        ))
        return statistics

    def _statistics_from_batch(self, batch, stat_prefix):
        statistics = OrderedDict()

        train_dict = self.get_train_dict(batch)
        for name in [
            'Policy Loss',
        ]:
            tensor = train_dict[name]
            statistics_name = "{} {} Mean".format(stat_prefix, name)
            statistics[statistics_name] = np.mean(ptu.get_numpy(tensor))

        for name in [
            'QF Outputs',
            'Policy Actions',
        ]:
            tensor = train_dict[name]
            statistics.update(create_stats_ordered_dict(
                '{} {}'.format(stat_prefix, name),
                ptu.get_numpy(tensor)
            ))

        statistics.update(create_stats_ordered_dict(
            "{} Env Actions".format(stat_prefix),
            ptu.get_numpy(batch['actions'])
        ))

        return statistics

    def _sample_eval_paths(self):
        return self.eval_sampler.obtain_samples()

    def get_epoch_snapshot(self, epoch):
        return dict(
            epoch=epoch,
            policy=self.policy,
            env=self.env,
            algo=self,
        )
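get_batch is the distinctive part of this class: it draws half a batch from each replay buffer and concatenates them key by key before converting to torch tensors. A small, self-contained sketch of just that mixing step with plain numpy arrays (the names mix_batches, batch1, and batch2 are mine):

import numpy as np

def mix_batches(batch1, batch2):
    # Concatenate two batches key-by-key, as get_batch does before
    # handing the result to np_to_pytorch_batch.
    return {k: np.concatenate((batch1[k], batch2[k]), axis=0) for k in batch1}

batch1 = {'observations': np.zeros((64, 4)), 'actions': np.zeros((64, 2))}
batch2 = {'observations': np.ones((64, 4)), 'actions': np.ones((64, 2))}
mixed = mix_batches(batch1, batch2)
print({k: v.shape for k, v in mixed.items()})  # each key becomes a length-128 array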