Example #1
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with tf_utils.get_default_session().as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()
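
All of these examples time their loops with gtimer (`gt`). As a reference for the calls used above, here is a minimal, self-contained sketch, assuming gtimer is imported as `import gtimer as gt` and with placeholder work inside the loop:

import gtimer as gt

gt.reset()
gt.set_def_unique(False)

for epoch in gt.timed_for(range(3), save_itrs=True):
    # ... sampling would happen here ...
    gt.stamp('sample')
    # ... gradient updates would happen here ...
    gt.stamp('train')

# per-iteration and total timings, read back the same way as in the examples
times_itrs = gt.get_times().stamps.itrs
print('last train time:', times_itrs['train'][-1])
print('total time:', gt.get_times().total)
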
Example #2
    def train(self):
        '''
        meta-training loop
        '''
        start_time = time.time()
        print("starting to pretrain")
        self.pretrain()
        print("done pretraining after time:", time.time() - start_time)
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0 and self.num_initial_steps > 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for task_idx in self.train_task_indices:
                    if self.expl_data_collector:
                        init_expl_paths = self.expl_data_collector.collect_new_paths(
                            max_path_length=self.max_path_length,
                            num_steps=self.num_initial_steps,
                            discard_incomplete_paths=False,
                            task_idx=task_idx,
                        )
                        self.replay_buffer.add_paths(task_idx, init_expl_paths)
                        if not self.use_rl_buffer_for_enc_buffer:
                            self.enc_replay_buffer.add_paths(
                                task_idx, init_expl_paths)
                        self.expl_data_collector.end_epoch(-1)
                    else:
                        self.collect_exploration_data(self.num_initial_steps,
                                                      1, np.inf, task_idx)
            self.in_unsupervised_phase = (
                it_ >= self.num_iterations_with_reward_supervision)
            if it_ == self.num_iterations_with_reward_supervision:
                self._transition_to_unsupervised()
            update_encoder_buffer = not (
                self.in_unsupervised_phase
                and self.freeze_encoder_buffer_in_unsupervised_phase
            ) and not self.use_rl_buffer_for_enc_buffer
            clear_encoder_buffer = (
                update_encoder_buffer
                and self.clear_encoder_buffer_before_every_update
            ) and not self.use_rl_buffer_for_enc_buffer
            # TODO: propagate unsupervised mode elegantly
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                if len(self.exploration_task_indices) == 0:
                    # no exploration tasks available, so skip data collection
                    break
                task_idx = np.random.choice(self.exploration_task_indices)
                if clear_encoder_buffer:
                    self.enc_replay_buffer.task_buffers[task_idx].clear()
                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    if self.expl_data_collector:
                        # TODO: implement
                        new_expl_paths = self.expl_data_collector.collect_new_paths(
                            task_idx=task_idx,
                            max_path_length=self.max_path_length,
                            resample_latent_period=self.exploration_resample_latent_period,
                            update_posterior_period=np.inf,
                            num_steps=self.num_steps_prior,
                            use_predicted_reward=self.in_unsupervised_phase,
                            discard_incomplete_paths=False,
                        )
                        self.replay_buffer.add_paths(task_idx, new_expl_paths)
                        self._n_env_steps_total += sum(
                            len(p['actions']) for p in new_expl_paths)
                        self._n_rollouts_total += len(new_expl_paths)
                        if update_encoder_buffer:
                            self.enc_replay_buffer.add_paths(
                                task_idx, new_expl_paths)
                    else:
                        self.collect_exploration_data(
                            num_samples=self.num_steps_prior,
                            resample_latent_period=self.exploration_resample_latent_period,
                            update_posterior_period=np.inf,
                            add_to_enc_buffer=update_encoder_buffer,
                            use_predicted_reward=self.in_unsupervised_phase,
                            task_idx=task_idx,
                            # TODO: figure out if I want to replace this?
                            # it's only used when `clear_encoder_buffer_before_every_update` is True
                            # and when `freeze_encoder_buffer_in_unsupervised_phase` is False
                            # and when we're in unsupervised phase
                        )
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    if self.expl_data_collector:
                        # TODO: implement
                        new_expl_paths = self.expl_data_collector.collect_new_paths(
                            task_idx=task_idx,
                            max_path_length=self.max_path_length,
                            resample_latent_period=self.exploration_resample_latent_period,
                            update_posterior_period=self.update_post_train,
                            num_steps=self.num_steps_posterior,
                            use_predicted_reward=self.in_unsupervised_phase,
                            discard_incomplete_paths=False,
                        )
                        self.replay_buffer.add_paths(task_idx, new_expl_paths)
                        self._n_env_steps_total += sum(
                            len(p['actions']) for p in new_expl_paths)
                        self._n_rollouts_total += len(new_expl_paths)
                        if update_encoder_buffer and not self.use_rl_buffer_for_enc_buffer:
                            self.enc_replay_buffer.add_paths(
                                task_idx, new_expl_paths)
                    else:
                        self.collect_exploration_data(
                            num_samples=self.num_steps_posterior,
                            resample_latent_period=self.exploration_resample_latent_period,
                            update_posterior_period=self.update_post_train,
                            add_to_enc_buffer=update_encoder_buffer,
                            use_predicted_reward=self.in_unsupervised_phase,
                            task_idx=task_idx,
                        )
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    # TODO: implement
                    if self.expl_data_collector:
                        new_expl_paths = self.expl_data_collector.collect_new_paths(
                            task_idx=task_idx,
                            max_path_length=self.max_path_length,
                            resample_latent_period=self.exploration_resample_latent_period,
                            update_posterior_period=self.update_post_train,
                            num_steps=self.num_extra_rl_steps_posterior,
                            use_predicted_reward=self.in_unsupervised_phase,
                            discard_incomplete_paths=False,
                        )
                        self.replay_buffer.add_paths(task_idx, new_expl_paths)
                        self._n_env_steps_total += sum(
                            len(p['actions']) for p in new_expl_paths)
                        self._n_rollouts_total += len(new_expl_paths)
                        if not self.use_rl_buffer_for_enc_buffer:
                            self.enc_replay_buffer.add_paths(
                                task_idx, new_expl_paths)
                    else:
                        add_to_enc_buffer = (
                            self.debug_enc_buffer_matches_rl_buffer
                            and not self.use_rl_buffer_for_enc_buffer)
                        self.collect_exploration_data(
                            num_samples=self.num_extra_rl_steps_posterior,
                            resample_latent_period=self.exploration_resample_latent_period,
                            update_posterior_period=self.update_post_train,
                            add_to_enc_buffer=add_to_enc_buffer,
                            use_predicted_reward=self.in_unsupervised_phase,
                            task_idx=task_idx,
                        )
            gt.stamp('sample')

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                if self.use_meta_learning_buffer:
                    batch = self.meta_replay_buffer.sample_meta_batch(
                        rl_batch_size=self.batch_size,
                        meta_batch_size=self.meta_batch,
                        embedding_batch_size=self.embedding_batch_size,
                    )
                    self.trainer.train(batch)
                else:
                    indices = np.random.choice(self.train_task_indices,
                                               self.meta_batch)

                    mb_size = self.embedding_mini_batch_size
                    num_updates = self.embedding_batch_size // mb_size

                    # sample context batch
                    # context_batch = self.sample_context(indices)
                    context_batch = self.enc_replay_buffer.sample_context(
                        indices, self.embedding_batch_size)

                    # zero out context and hidden encoder state
                    # self.agent.clear_z(num_tasks=len(indices))

                    # do this in a loop so we can truncate backprop in the recurrent encoder
                    for i in range(num_updates):
                        if self._debug_use_ground_truth_context:
                            context = context_batch
                        else:
                            context = context_batch[:, i * mb_size:(i + 1) * mb_size, :]
                        # batch = self.sample_batch(indices)
                        batch = self.replay_buffer.sample_batch(
                            indices, self.batch_size)
                        batch['context'] = context
                        batch['task_indices'] = indices
                        self.trainer.train(batch)
                        self._n_train_steps_total += 1

                    # stop backprop
                    # self.agent.detach_z()
                # train_data = self.replay_buffer.random_batch(self.batch_size)
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)

            self._end_epoch(it_)
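
In the training step above, the sampled context is sliced along its second axis into mini-batches of `embedding_mini_batch_size` transitions so that backprop through a recurrent encoder can be truncated. A small illustrative sketch of that slicing with made-up shapes (names mirror the example, the numbers are arbitrary):

import numpy as np

meta_batch, embedding_batch_size, context_dim = 4, 8, 5
context_batch = np.zeros((meta_batch, embedding_batch_size, context_dim))

mb_size = 2  # stands in for embedding_mini_batch_size
num_updates = embedding_batch_size // mb_size

for i in range(num_updates):
    # same slice as in the example: mini-batch i along the context axis
    context = context_batch[:, i * mb_size:(i + 1) * mb_size, :]
    assert context.shape == (meta_batch, mb_size, context_dim)
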
Example #3
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    print('idx:', idx)
                    print('num initial steps:', self.num_initial_steps)
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    self.collect_data(self.num_initial_steps, 1, np.inf)
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
                self.enc_replay_buffer.task_buffers[idx].clear()
                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                    #NOTE: the second argument to self.collect_data is how often you reset the context
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)
            # So self.task_idx is always a single task index and is used for data collection,
            # but during the actual training steps multiple indices can be used at once.

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(
                    self.num_train_steps_per_itr
            ):  ## DEFAULT: num_train_steps_per_itr=2000
                indices = np.random.choice(
                    self.train_tasks,
                    self.meta_batch)  ##DEFAULT:self.meta_batch=16
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
Example #4
    def _train(self, env, policy, initial_exploration_policy, pool):
        """When training our policy expects an augmented observation."""
        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()
            log_p_z_episode = []  # Store log_p_z for this episode
            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0

            if self._learn_p_z:
                log_p_z_list = [
                    deque(maxlen=self._max_path_length)
                    for _ in range(self._num_skills)
                ]

            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                path_length_list = []
                z = self._sample_z()
                aug_obs = utils.concat_obs_z(observation, z, self._num_skills)

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(aug_obs)

                    if self._learn_p_z:
                        (obs, _) = utils.split_aug_obs(aug_obs,
                                                       self._num_skills)
                        feed_dict = {
                            self._discriminator._obs_pl: obs[None],
                            self._discriminator._action_pl: action[None]
                        }
                        logits = tf_utils.get_default_session().run(
                            self._discriminator._output_t, feed_dict)[0]
                        log_p_z = np.log(utils._softmax(logits)[z])
                        if self._learn_p_z:
                            log_p_z_list[z].append(log_p_z)

                    next_ob, reward, terminal, info = env.step(action)
                    aug_next_ob = utils.concat_obs_z(next_ob, z,
                                                     self._num_skills)
                    path_length += 1
                    path_return += reward

                    self._pool.add_sample(
                        aug_obs,
                        action,
                        reward,
                        terminal,
                        aug_next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        path_length_list.append(path_length)
                        observation = env.reset()
                        policy.reset()
                        log_p_z_episode = []
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        aug_obs = aug_next_ob
                    gt.stamp('sample')

                    if self._pool.size >= self.sampler._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self._pool.random_batch(
                                self.sampler._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                if self._learn_p_z:
                    print('learning p(z)')
                    for z in range(self._num_skills):
                        if log_p_z_list[z]:
                            print(
                                '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d'
                                % (z, np.min(log_p_z_list[z]),
                                   np.max(log_p_z_list[z]),
                                   np.mean(log_p_z_list[z]),
                                   len(log_p_z_list[z])))
                    log_p_z = [
                        np.mean(log_p_z)
                        if log_p_z else np.log(1.0 / self._num_skills)
                        for log_p_z in log_p_z_list
                    ]
                    print('log_p_z: %s' % log_p_z)
                    self._p_z = utils._softmax(log_p_z)

                # self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self._pool.size)
                logger.record_tabular('path-length', np.mean(path_length_list))

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()
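
This example feeds the policy an observation augmented with the sampled skill z. The body of `utils.concat_obs_z` is not shown here; a plausible, purely hypothetical stand-in that appends a one-hot skill encoding might look like:

import numpy as np

def concat_obs_z(obs, z, num_skills):
    # Hypothetical sketch: append a one-hot encoding of skill index z to a 1-D observation.
    one_hot = np.zeros(num_skills)
    one_hot[z] = 1.0
    return np.concatenate([np.asarray(obs, dtype=float), one_hot])
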
Example #5
    def _train(self):
        """Return a generator that performs RL training.

        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """
        training_environment = self._training_environment
        evaluation_environment = self._evaluation_environment
        policy = self._policy
        pool = self._pool
        model_metrics = {}

        if not self._training_started:
            self._init_training()

        self.sampler.initialize(training_environment, policy, pool)

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):

            if self._epoch % 200 == 0:
                #### model training
                print('[ MOPO ] log_dir: {} | ratio: {}'.format(
                    self._log_dir, self._real_ratio))
                print(
                    '[ MOPO ] Training model at epoch {} | freq {} | timestep {} (total: {})'
                    .format(self._epoch, self._model_train_freq,
                            self._timestep, self._total_timestep))
                max_epochs = 1 if self._model.model_loaded else None
                model_train_metrics = self._train_model(
                    batch_size=256,
                    max_epochs=max_epochs,
                    holdout_ratio=0.2,
                    max_t=self._max_model_t)
                model_metrics.update(model_train_metrics)
                self._log_model()
                gt.stamp('epoch_train_model')
                ####

            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            self._training_progress = Progress(self._epoch_length *
                                               self._n_train_repeat)
            start_samples = self.sampler._total_samples
            for timestep in count():
                self._timestep = timestep

                if (timestep >= self._epoch_length and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                ## model rollouts
                if timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                    self._training_progress.pause()
                    self._set_rollout_length()
                    self._reallocate_model_pool()
                    model_rollout_metrics = self._rollout_model(
                        rollout_batch_size=self._rollout_batch_size,
                        deterministic=self._deterministic)
                    model_metrics.update(model_rollout_metrics)

                    gt.stamp('epoch_rollout_model')
                    self._training_progress.resume()

                ## train actor and critic
                if self.ready_to_train:
                    self._do_training_repeats(timestep=timestep)
                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))

            evaluation_paths = self._evaluation_paths(policy,
                                                      evaluation_environment)
            gt.stamp('evaluation_paths')

            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths, evaluation_environment)
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = gt.get_times().stamps.itrs

            diagnostics.update(
                OrderedDict((
                    *((f'evaluation/{key}', evaluation_metrics[key])
                      for key in sorted(evaluation_metrics.keys())),
                    *((f'times/{key}', time_diagnostics[key][-1])
                      for key in sorted(time_diagnostics.keys())),
                    *((f'sampler/{key}', sampler_diagnostics[key])
                      for key in sorted(sampler_diagnostics.keys())),
                    *((f'model/{key}', model_metrics[key])
                      for key in sorted(model_metrics.keys())),
                    ('epoch', self._epoch),
                    ('timestep', self._timestep),
                    ('timesteps_total', self._total_timestep),
                    ('train-steps', self._num_train_steps),
                )))

            if self._eval_render_mode is not None and hasattr(
                    evaluation_environment, 'render_rollouts'):
                training_environment.render_rollouts(evaluation_paths)

            ## ensure we did not collect any more data
            assert self._pool.size == self._init_pool_size

            yield diagnostics

        epi_ret = self._rollout_model_for_eval(
            self._training_environment.reset)
        np.savetxt("EEepi_ret__fin.csv", epi_ret, delimiter=',')

        self.sampler.terminate()

        self._training_after_hook()

        self._training_progress.close()

        yield {'done': True, **diagnostics}
Example #6
    def _train(self, env, policy, pool, initial_exploration_policy=None):
        """Return a generator that performs RL training.

        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """

        if not self._training_started:
            self._init_training()

            self._initial_exploration_hook(env, initial_exploration_policy,
                                           pool)

        self.sampler.initialize(env, policy, pool)
        evaluation_env = env.copy() if self._eval_n_episodes else None

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            start_samples = self.sampler._total_samples
            for i in count():
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                if (samples_now >= start_samples + self._epoch_length
                        and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                self._do_sampling(timestep=self._total_timestep)
                gt.stamp('sample')

                if self.ready_to_train:
                    self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(policy, evaluation_env)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(training_paths, env)
            gt.stamp('training_metrics')
            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths, evaluation_env)
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = gt.get_times().stamps.itrs

            diagnostics.update(
                OrderedDict((
                    *((f'evaluation/{key}', evaluation_metrics[key])
                      for key in sorted(evaluation_metrics.keys())),
                    *((f'training/{key}', training_metrics[key])
                      for key in sorted(training_metrics.keys())),
                    *((f'times/{key}', time_diagnostics[key][-1])
                      for key in sorted(time_diagnostics.keys())),
                    *((f'sampler/{key}', sampler_diagnostics[key])
                      for key in sorted(sampler_diagnostics.keys())),
                    ('epoch', self._epoch),
                    ('timestep', self._timestep),
                    ('timesteps_total', self._total_timestep),
                    ('train-steps', self._num_train_steps),
                    #('orthogonal-constraints',self.get_penalty()),
                )))

            try:
                diagnostics.update(
                    OrderedDict(
                        (('orthogonal-constraints', self.get_penalty()), )))
            except:
                pass

            if self._eval_render_mode is not None and hasattr(
                    evaluation_env, 'render_rollouts'):
                # TODO(hartikainen): Make this consistent such that there's no
                # need for the hasattr check.
                env.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()

        yield {'done': True, **diagnostics}
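
Examples #5 and #6 are written as generators: each epoch yields a diagnostics dict, and a final `{'done': True, ...}` entry ends training. A minimal sketch of how such a generator might be driven (the function name is illustrative, not from the source; the dict keys match the ones yielded above):

def run_training(train_generator):
    # Drain the generator, reading each epoch's diagnostics until 'done' appears.
    for diagnostics in train_generator:
        if diagnostics.get('done'):
            print('training finished')
            break
        print('epoch', diagnostics.get('epoch'),
              'total timesteps', diagnostics.get('timesteps_total'))
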
Example #7
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()

            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(
                    range(self._n_epochs + 1), save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                if self.iter_callback is not None:
                    self.iter_callback(locals(), globals())

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(observation)
                    next_ob, reward, terminal, info = env.step(action)
                    path_length += 1
                    path_return += reward

                    self.pool.add_sample(
                        observation,
                        action,
                        reward,
                        terminal,
                        next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        observation = env.reset()
                        policy.reset()
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        observation = next_ob
                    gt.stamp('sample')

                    if self.pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self.pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self.pool.size)

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()
Example #8
    def _train(self, env, policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)

        with self._sess.as_default():
            observation = env.reset()
            policy.reset()

            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(observation)
                    next_ob, reward, terminal, info = env.step(action)
                    path_length += 1
                    path_return += reward

                    self.pool.add_sample(
                        observation,
                        action,
                        reward,
                        terminal,
                        next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        observation = env.reset()
                        policy.reset()
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                    else:
                        observation = next_ob
                    gt.stamp('sample')

                    if self.pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self.pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self.pool.size)

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()
Example #9
    def train(self, start_epoch=0):
        self.pretrain()
        self.training_mode(False)
        gt.reset()
        gt.set_def_unique(False)
        self.start_training(start_epoch=start_epoch)
Example #10
    def test(self,
             task,
             seed,
             iteration_num,
             render,
             load_iter=None,
             debug=False):
        set_seed(seed)
        iteration_num = int(iteration_num)

        gt.reset()
        gt.set_def_unique(False)
        start_iter = 0

        params = self.logger.load_params(load_iter)
        start_iter = self._set_params(params)
        self.theta.train()
        extra_data = self.logger.load_extra_data()
        self._set_extra_data(extra_data)

        rollout = []
        state = task.reset()
        self.controller.set_task(task)

        for i in gt.timed_for(range(start_iter, iteration_num),
                              save_itrs=True):

            t = 0
            done = False
            reward_sum = 0
            state = task.reset()
            while not done:
                past_traj = [r[-self.M:] for r in rollout]

                if past_traj != []:
                    for _ in range(self.adaptation_update_num):
                        loss = self._compute_adaptation_loss(
                            self.theta, past_traj)
                        zero_grad(self.theta.parameters())
                        self._meta_update(loss)

                action = self.controller.plan(self.theta, state, None, debug)
                next_state, reward, done, _ = task.step(action)
                reward_sum += reward

                if render:
                    task.render()

                if action.shape == ():
                    action = [action]

                rollout = _aggregate_rollout(rollout, state, action,
                                             next_state)
                state = next_state
                t += 1

                if done:
                    rollout = []
                    state = task.reset()

            print('Iteration:', i, 'Reward:', reward_sum, 'Traj len:', t)
Example #11
    def _train(self, env, policy, initial_exploration_policy,
               sub_level_policies_paths, pool, g):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """
        '''self._init_training(env, policy, pool)
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy,sub_level_policies, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy,sub_level_policies, pool)
            initial_exploration_done = False'''

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            #loading-low-level-policies
            sub_level_policies = []
            for p in range(0, len(sub_level_policies_paths)):
                path = sub_level_policies_paths[p]
                if path[:2] == 'ik':
                    with tf.variable_scope(str(p), reuse=False):
                        policy_snapshot = IK_Policy(path[2])
                    sub_level_policies.append(policy_snapshot)
                else:
                    with tf.variable_scope(str(p), reuse=False):
                        policy_snapshot = joblib.load(
                            sub_level_policies_paths[p])
                    sub_level_policies.append(policy_snapshot["policy"])

            self._init_training(env, policy, pool)

            if initial_exploration_policy is None:
                self.sampler.initialize(env, policy, sub_level_policies, pool,
                                        g, self.use_demos)
                initial_exploration_done = True
            else:
                self.sampler.initialize(env, initial_exploration_policy,
                                        sub_level_policies, pool, g,
                                        self.use_demos)
                initial_exploration_done = False

            if self.use_demos:
                print("Using demonstrations: {} steps".format(
                    self.sampler.pool.demo_buffer._size))
            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True,
                                      quick_print=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in trange(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    if not initial_exploration_done:
                        if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                            self.sampler.set_policy(policy)
                            initial_exploration_done = True
                    self.sampler.sample(
                        initial_exploration_done,
                        g,
                    )
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch, initial_exploration_done,
                               sub_level_policies, g)
                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
Example #12
    def _train(self, env, policy, pool):
        """When training our policy expects an augmented observation."""
        self._init_training(env, policy, pool)

        with self._sess.as_default():
            env._wrapped_env.env.initialize(seed_task=SEED_TASK)
            observation = env.reset()
            policy.reset()
            log_p_z_episode = []  # Store log_p_z for this episode
            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            self.prev_n_episodes = 0

            if self._learn_p_z:
                log_p_z_list = [
                    deque(maxlen=self._max_path_length)
                    for _ in range(self._num_skills)
                ]

            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):

                path_length_list = []
                z = self._sample_z()
                aug_obs = utils.concat_obs_z(observation, z, self._num_skills)

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    action, _ = policy.get_action(aug_obs)

                    if self._learn_p_z:
                        (obs, _) = utils.split_aug_obs(aug_obs,
                                                       self._num_skills)
                        feed_dict = {
                            self._discriminator._obs_pl: obs[None],
                            self._discriminator._action_pl: action[None]
                        }
                        logits = tf_utils.get_default_session().run(
                            self._discriminator._output_t, feed_dict)[0]
                        log_p_z = np.log(utils._softmax(logits)[z])
                        if self._learn_p_z:
                            log_p_z_list[z].append(log_p_z)

                    next_ob, reward, terminal, info = env.step(action)
                    aug_next_ob = utils.concat_obs_z(next_ob, z,
                                                     self._num_skills)
                    path_length += 1
                    path_return += reward

                    self._pool.add_sample(
                        aug_obs,
                        action,
                        reward,
                        terminal,
                        aug_next_ob,
                    )

                    if terminal or path_length >= self._max_path_length:
                        path_length_list.append(path_length)

                        # print("\n===RESET", epoch, n_episodes, "===", self._epoch_length, path_length, "===",
                        #     # env._wrapped_env.env.nstep_internal,
                        #     datetime.datetime.now())

                        env._wrapped_env.env.initialize(seed_task=SEED_TASK)
                        observation = env.reset()
                        policy.reset()
                        log_p_z_episode = []
                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return

                        path_return = 0
                        n_episodes += 1

                        # EPOCH IS DONE
                        if not epoch % 10:
                            logger.log("Epoch: {:4} | Episodes: {}".format(
                                epoch, n_episodes),
                                       with_prefix=False)

                        if not n_episodes % self.eval_freq or \
                               n_episodes >= EPISODE_LIMIT or \
                               epoch >= self._n_epochs:
                            # is_final = epoch >= self._n_epochs \
                            #            or n_episodes >= EPISODE_LIMIT
                            self.sample_skills_to_bd(n_epoch=epoch,
                                                     n_episodes=n_episodes)
                            # Make snapshot
                            params = self.get_snapshot(epoch)
                            logger.save_itr_params(epoch, params)

                            gt.stamp('behaviours')

                    else:
                        aug_obs = aug_next_ob

                    gt.stamp('sample')

                    if self._pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self._pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                    # Terminate once n_episodes reaches EPISODE_LIMIT
                    if n_episodes >= EPISODE_LIMIT:
                        break

                else:
                    continue
                break

                if self._learn_p_z:
                    print('learning p(z)')
                    for z in range(self._num_skills):
                        if log_p_z_list[z]:
                            print(
                                '\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, len=%d'
                                % (z, np.min(log_p_z_list[z]),
                                   np.max(log_p_z_list[z]),
                                   np.mean(log_p_z_list[z]),
                                   len(log_p_z_list[z])))
                    log_p_z = [
                        np.mean(log_p_z)
                        if log_p_z else np.log(1.0 / self._num_skills)
                        for log_p_z in log_p_z_list
                    ]
                    print('log_p_z: %s' % log_p_z)
                    self._p_z = utils._softmax(log_p_z)

            logger.push_prefix('Epoch #%d | ' % epoch)
            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self._pool.size)
            logger.record_tabular('path-length', np.mean(path_length_list))

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

            env.terminate()
Example #13
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating  #3-9
                for idx in self.train_tasks:  # for each training task
                    self.task_idx = idx  # switch the current task id
                    self.env.reset_task(idx)  # reset the environment to this task
                    self.collect_data(
                        self.num_initial_steps, 1,
                        np.inf)  # collect num_initial_steps transitions before training; update z after each one
            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):  # randomly sample num_tasks_sample tasks  #3-10
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
                self.enc_replay_buffer.task_buffers[idx].clear()

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:  # how many context transitions c to collect with z sampled from the prior
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:  # how many context transitions c to collect with z sampled from the posterior
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:  # extra transitions collected for RL training only, not for the encoder
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):  # number of gradient steps per iteration  (11)
                indices = np.random.choice(
                    self.train_tasks,
                    self.meta_batch)  # randomly choose meta_batch tasks from train_tasks  (13)
                self._do_training(indices)  # (14-21)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
Example #14
    def _train(self):
        """Return a generator that performs RL training.

        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """

        #### pool is e.g. simple_replay_pool
        training_environment = self._training_environment
        evaluation_environment = self._evaluation_environment
        policy = self._policy
        pool = self._pool
        model_metrics = {}

        #### init Qs for SAC
        if not self._training_started:
            self._init_training()

            #### perform some initial steps (gather samples) using initial policy
            ######  fills pool with _n_initial_exploration_steps samples
            self._initial_exploration_hook(training_environment,
                                           self._initial_exploration_policy,
                                           pool)

        #### set up sampler with train env and actual policy (may be different from initial exploration policy)
        ######## note: sampler is set up with the pool that may be already filled from initial exploration hook
        self.sampler.initialize(training_environment, policy, pool)

        #### reset gtimer so the epoch loop starts with fresh timestamps
        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        #### not implemented, could train policy before hook
        self._training_before_hook()

        #### iterate over epochs, gt.timed_for to create loop with gt timestamps
        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
            #### do something at beginning of epoch (in this case reset self._train_steps_this_epoch=0)
            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            #### util class Progress, e.g. for plotting a progress bar
            #######   note: sampler may already contain samples in its pool from initial_exploration_hook or previous epochs
            self._training_progress = Progress(self._epoch_length *
                                               self._n_train_repeat /
                                               self._train_every_n_steps)
            start_samples = self.sampler._total_samples

            ### train for epoch_length ###
            for i in count():

                #### _timestep is within an epoch
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                #### check if you're at the end of an epoch to train
                if (samples_now >= start_samples + self._epoch_length
                        and self.ready_to_train):
                    break

                #### not implemented atm
                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                #### start model rollout
                if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
                    self._training_progress.pause()
                    print('[ MBPO ] log_dir: {} | ratio: {}'.format(
                        self._log_dir, self._real_ratio))
                    print(
                        '[ MBPO ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})'
                        .format(self._epoch, self._model_train_freq,
                                self._timestep, self._total_timestep,
                                self._train_steps_this_epoch,
                                self._num_train_steps))

                    #### train the model with input:(obs, act), outputs: (rew, delta_obs), inputs are divided into sets with holdout_ratio
                    #@anyboby debug
                    samples = self._pool.return_all_samples()
                    self.fake_env.reset_model()
                    model_train_metrics = self.fake_env.train(
                        samples,
                        batch_size=512,
                        max_epochs=None,
                        holdout_ratio=0.2,
                        max_t=self._max_model_t)
                    model_metrics.update(model_train_metrics)
                    gt.stamp('epoch_train_model')

                    #### rollout model env ####
                    self._set_rollout_length()
                    self._reallocate_model_pool(
                        use_mjc_model_pool=self.use_mjc_state_model)
                    model_rollout_metrics = self._rollout_model(
                        rollout_batch_size=self._rollout_batch_size,
                        deterministic=self._deterministic)
                    model_metrics.update(model_rollout_metrics)
                    ###########################

                    gt.stamp('epoch_rollout_model')
                    # self._visualize_model(self._evaluation_environment, self._total_timestep)
                    self._training_progress.resume()

                ##### sampling from the real world #####
                ##### _total_timestep % train_every_n_steps is checked inside _do_sampling
                self._do_sampling(timestep=self._total_timestep)
                gt.stamp('sample')

                ### n_train_repeat from config ###
                if self.ready_to_train:
                    self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(policy,
                                                      evaluation_environment)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(training_paths,
                                                       training_environment)
            gt.stamp('training_metrics')
            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths, evaluation_environment)
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = gt.get_times().stamps.itrs

            diagnostics.update(
                OrderedDict((
                    *((f'evaluation/{key}', evaluation_metrics[key])
                      for key in sorted(evaluation_metrics.keys())),
                    *((f'training/{key}', training_metrics[key])
                      for key in sorted(training_metrics.keys())),
                    *((f'times/{key}', time_diagnostics[key][-1])
                      for key in sorted(time_diagnostics.keys())),
                    *((f'sampler/{key}', sampler_diagnostics[key])
                      for key in sorted(sampler_diagnostics.keys())),
                    *((f'model/{key}', model_metrics[key])
                      for key in sorted(model_metrics.keys())),
                    ('epoch', self._epoch),
                    ('timestep', self._timestep),
                    ('timesteps_total', self._total_timestep),
                    ('train-steps', self._num_train_steps),
                )))

            if self._eval_render_mode is not None and hasattr(
                    evaluation_environment, 'render_rollouts'):
                training_environment.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()

        self._training_progress.close()

        ### this is where we yield the episode diagnostics to tune trial runner ###
        yield {'done': True, **diagnostics}
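
Since `_train` here is a generator, whoever owns the algorithm drives it by pulling one diagnostics dict per epoch until the final `{'done': True, ...}` payload arrives. A minimal consumer sketch (the `algorithm` object is assumed; the keys come from the loop above):

# drive the epoch generator; 'done' marks the final payload
for diagnostics in algorithm._train():
    if diagnostics.get('done'):
        break
    print(diagnostics['epoch'], diagnostics['timesteps_total'])
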
Example #15
0
    def _train(self, env, policy, pool, initial_exploration_policy=None):
        """Return a generator that performs RL training.

        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """

        if not self._training_started:
            self._init_training()

            self._initial_exploration_hook(env, initial_exploration_policy,
                                           pool)

        self.sampler.initialize(env, policy, pool)
        evaluation_env = env.copy() if self._eval_n_episodes else None

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            start_samples = self.sampler._total_samples
            for i in count():
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                if (samples_now >= start_samples + self._epoch_length
                        and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                self._do_sampling(timestep=self._total_timestep)
                gt.stamp('sample')

                if self.ready_to_train:
                    self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(policy, evaluation_env)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(training_paths, env)
            gt.stamp('training_metrics')

            should_save_path = (self._path_save_frequency > 0 and
                                self._epoch % self._path_save_frequency == 0)
            if should_save_path:
                import pickle
                for i, path in enumerate(training_paths):
                    #path.pop('images')
                    path_file_name = f'training_path_{self._epoch}_{i}.pkl'
                    path_file_path = os.path.join(os.getcwd(), 'paths',
                                                  path_file_name)
                    if not os.path.exists(os.path.dirname(path_file_path)):
                        os.makedirs(os.path.dirname(path_file_path))
                    with open(path_file_path, 'wb') as f:
                        pickle.dump(path, f)

            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths, evaluation_env)
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = gt.get_times().stamps.itrs

            diagnostics.update(
                OrderedDict((
                    *((f'evaluation/{key}', evaluation_metrics[key])
                      for key in sorted(evaluation_metrics.keys())),
                    *((f'training/{key}', training_metrics[key])
                      for key in sorted(training_metrics.keys())),
                    *((f'times/{key}', time_diagnostics[key][-1])
                      for key in sorted(time_diagnostics.keys())),
                    *((f'sampler/{key}', sampler_diagnostics[key])
                      for key in sorted(sampler_diagnostics.keys())),
                    ('epoch', self._epoch),
                    ('timestep', self._timestep),
                    ('timesteps_total', self._total_timestep),
                    ('train-steps', self._num_train_steps),
                )))

            if self._eval_render_mode is not None and hasattr(
                    evaluation_env, 'render_rollouts'):
                # TODO(hartikainen): Make this consistent such that there's no
                # need for the hasattr check.
                env.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()
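
The `diagnostics.update(OrderedDict((...)))` call above builds its namespaced keys (`evaluation/...`, `training/...`, `times/...`) by unpacking several generator expressions of (key, value) pairs into one constructor. The same pattern in isolation:

from collections import OrderedDict

evaluation_metrics = {'return-average': 1.5}
training_metrics = {'return-average': 0.7}

# each *(...) expands into prefixed (key, value) pairs
diagnostics = OrderedDict((
    *((f'evaluation/{key}', evaluation_metrics[key])
      for key in sorted(evaluation_metrics.keys())),
    *((f'training/{key}', training_metrics[key])
      for key in sorted(training_metrics.keys())),
    ('epoch', 0),
))
# OrderedDict([('evaluation/return-average', 1.5),
#              ('training/return-average', 0.7), ('epoch', 0)])
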
Example #16
0
    def _train(self):
        """Return a generator that performs RL training.

        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """
        training_environment = self._training_environment
        evaluation_environment = self._evaluation_environment
        policy = self._policy
        pool = self._pool

        if not self._training_started:
            self._init_training()

            self._initial_exploration_hook(training_environment,
                                           self._initial_exploration_policy,
                                           pool)

        self.sampler.initialize(training_environment, policy, pool)

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        import time
        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')
            start_samples = self.sampler._total_samples

            sample_times = []
            for i in count():
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                if (samples_now >= start_samples + self._epoch_length
                        and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')
                t0 = time.time()
                self._do_sampling(timestep=self._total_timestep)
                gt.stamp('sample')
                sample_times.append(time.time() - t0)
                if self.ready_to_train:
                    self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            print("Average Sample Time: ", np.mean(np.array(sample_times)))

            training_paths = self._training_paths()
            # self.sampler.get_last_n_paths(
            #     math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(policy,
                                                      evaluation_environment)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(training_paths,
                                                       training_environment)
            gt.stamp('training_metrics')

            #should_save_path = (
            #    self._path_save_frequency > 0
            #    and self._epoch % self._path_save_frequency == 0)
            #if should_save_path:
            #    import pickle
            #    for i, path in enumerate(training_paths):
            #        #path.pop('images')
            #        path_file_name = f'training_path_{self._epoch}_{i}.pkl'
            #        path_file_path = os.path.join(
            #            os.getcwd(), 'paths', path_file_name)
            #        if not os.path.exists(os.path.dirname(path_file_path)):
            #            os.makedirs(os.path.dirname(path_file_path))
            #        with open(path_file_path, 'wb' ) as f:
            #            pickle.dump(path, f)

            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths, evaluation_environment)
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = gt.get_times().stamps.itrs

            diagnostics.update(
                OrderedDict((
                    *((f'evaluation/{key}', evaluation_metrics[key])
                      for key in sorted(evaluation_metrics.keys())),
                    *((f'training/{key}', training_metrics[key])
                      for key in sorted(training_metrics.keys())),
                    *((f'times/{key}', time_diagnostics[key][-1])
                      for key in sorted(time_diagnostics.keys())),
                    *((f'sampler/{key}', sampler_diagnostics[key])
                      for key in sorted(sampler_diagnostics.keys())),
                    ('epoch', self._epoch),
                    ('timestep', self._timestep),
                    ('timesteps_total', self._total_timestep),
                    ('train-steps', self._num_train_steps),
                )))

            if self._eval_render_kwargs and hasattr(evaluation_environment,
                                                    'render_rollouts'):
                # TODO: Make this consistent such that there's no
                # need for the hasattr check.
                training_environment.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()

        del evaluation_paths

        yield {'done': True, **diagnostics}
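
Every loop in these examples shares the same gtimer idiom: reset and name the root timer, stamp named intervals inside the loop, then read per-iteration durations back out. Stripped down to only the calls that actually appear in these examples:

import time
import gtimer as gt

gt.reset_root()
gt.rename_root('RLAlgorithm')
gt.set_def_unique(False)

for epoch in gt.timed_for(range(3), save_itrs=True):
    time.sleep(0.01)
    gt.stamp('sample')  # time elapsed since the previous stamp
    time.sleep(0.02)
    gt.stamp('train')

times_itrs = gt.get_times().stamps.itrs  # e.g. {'sample': [...], 'train': [...]}
print(times_itrs['train'][-1])           # duration of the most recent 'train'
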
Example #17
0
    def _train(self, env, policy, pool):
        """When training our policy expects an augmented observation."""
        self._init_training(env, policy, pool)

        with self._sess.as_default():
            # reset with goal
            goal = env.sample_goal()
            observation = env.reset(goal=goal)
            policy.reset()

            # sample z ~ p(z|g)
            z = self._embedding.get_z(goal=goal)

            path_length = 0
            path_return = 0
            last_path_return = 0
            max_path_return = -np.inf
            n_episodes = 0
            trajectory = []
            z_indx = 0

            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)
                path_length_list = []

                for t in range(self._epoch_length):
                    iteration = t + epoch * self._epoch_length

                    # flatten observation with given latent z
                    aug_obs = np.concatenate((observation['observation'], z))
                    action, _ = policy.get_action(aug_obs)
                    next_ob, reward, terminal, info = env.step(action)

                    # assert all(next_ob['desired_goal'] == goal)
                    assert reward == env.compute_reward(
                        next_ob['achieved_goal'], next_ob['desired_goal'],
                        info)

                    path_length += 1
                    path_return += reward

                    trajectory.append(
                        (observation, action, reward, next_ob, terminal))

                    if terminal or path_length >= self._max_path_length:
                        path_length_list.append(path_length)

                        # add hindsight samples
                        self._pool.add_hindsight_episode(
                            episode=trajectory,
                            embedding=self._embedding,
                            latent=z,
                            goal=goal,
                        )

                        z_indx += 1
                        if z_indx >= self._n_latents:
                            goal = env.sample_goal()
                            z_indx = 0
                        z = self._embedding.get_z(goal=goal)

                        observation = env.reset(goal=goal)
                        policy.reset()

                        path_length = 0
                        max_path_return = max(max_path_return, path_return)
                        last_path_return = path_return
                        path_return = 0
                        n_episodes += 1
                        trajectory = []

                    else:
                        observation = next_ob

                    gt.stamp('sample')

                    if self._pool.size >= self._min_pool_size:
                        for i in range(self._n_train_repeat):
                            batch = self._pool.random_batch(self._batch_size)
                            self._do_training(iteration, batch)

                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('episodes', n_episodes)
                logger.record_tabular('steps',
                                      iteration)  # also record total steps
                logger.record_tabular('max-path-return', max_path_return)
                logger.record_tabular('last-path-return', last_path_return)
                logger.record_tabular('pool-size', self._pool.size)
                logger.record_tabular('path-length', np.mean(path_length_list))

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            env.terminate()
Example #18
0
def main(arglist):
    game_name = arglist.game_name
    # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = [model_names_setting[0]] + [model_names_setting[1]] * (agent_num - 1)
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix  = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)

    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)

    print(suffix)

    logger.add_tabular_output('./log/{}.csv'.format(suffix))
    snapshot_dir = './snapshot/{}'.format(suffix)
    policy_dir = './policy/{}'.format(suffix)
    os.makedirs(snapshot_dir, exist_ok=True)
    os.makedirs(policy_dir, exist_ok=True)
    logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size
    sampler = MASampler(agent_num=agent_num, joint=True, max_path_length=30, min_pool_size=100, batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs, k=k, g=g, mu=mu, game_name=game_name, aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM' or model_name == 'DDPG-ToM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i, env, M, u_range, base_kwargs, game_name=game_name)

            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()
        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        initial_exploration_done = False
        # noise = .1
        noise = 1.
        alpha = .5


        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass

        for epoch in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            logger.push_prefix('Epoch #%d | ' % epoch)
            if epoch % 1 == 0:
                print(suffix)
            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    if epoch >= 1000:
                        initial_exploration_done = True
                sampler.sample()
                # print('Sampling')
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                # print('Sample Done')
                if epoch == base_kwargs['n_epochs']:
                    noise = 0.1

                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                # anneal exploration noise at increasing epoch thresholds;
                # checks run in increasing order so the latest stage wins
                # (n_epochs/10 -> 0.1, n_epochs/6 -> 0.05, n_epochs/5 -> 0.01)
                if epoch > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if epoch > base_kwargs['n_epochs'] / 6:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if epoch > base_kwargs['n_epochs'] / 5:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass

                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    recent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            recent_indices = list(range(agent.pool._top - batch_size, agent.pool._top))

                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(agent.pool.random_batch_by_indices(recent_indices))

                    # print(len(batch_n))
                    target_next_actions_n = []
                    try:
                        for agent, batch in zip(agents, batch_n):
                            target_next_actions_n.append(agent._target_policy.get_actions(batch['next_observations']))
                    except:
                        pass


                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])


                    current_actions = [agents[i]._policy.get_actions(batch_n[i]['next_observations'])[0][0] for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent._policy.get_all_actions(batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    if len(all_actions_k) > 0:
                        with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                            f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                        f.write(','.join(list(map(str, current_actions)))+'\n')
                    # print('============')
                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(np.delete(deepcopy(opponent_actions_n), i, 0), (-1, agent._opponent_action_dim))
                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(np.delete(deepcopy(recent_opponent_actions_n), i, 0), (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(np.delete(deepcopy(target_next_actions_n), i, 0), (-1, agent._opponent_action_dim))
                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL):
                            agent._do_training(iteration=t + epoch * agent._epoch_length, batch=batch_n[i], annealing=alpha)
                        else:
                            agent._do_training(iteration=t + epoch * agent._epoch_length, batch=batch_n[i])
                gt.stamp('train')

            # self._evaluate(epoch)

            # for agent in agents:
            #     params = agent.get_snapshot(epoch)
            #     logger.save_itr_params(epoch, params)
            # times_itrs = gt.get_times().stamps.itrs
            #
            # eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            # total_time = gt.get_times().total
            # logger.record_tabular('time-train', times_itrs['train'][-1])
            # logger.record_tabular('time-eval', eval_time)
            # logger.record_tabular('time-sample', times_itrs['sample'][-1])
            # logger.record_tabular('time-total', total_time)
            # logger.record_tabular('epoch', epoch)

            # sampler.log_diagnostics()

            # logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()
            sampler.terminate()
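
The exploration-noise annealing in the loop above is easier to audit as one piecewise function; checking the thresholds in increasing order is what lets each later stage override the earlier ones. An equivalent sketch (the helper name is illustrative):

def noise_for_epoch(epoch, n_epochs):
    """Piecewise-constant exploration-noise schedule, as annealed above."""
    noise = 1.0
    if epoch > n_epochs / 10:
        noise = 0.1
    if epoch > n_epochs / 6:
        noise = 0.05
    if epoch > n_epochs / 5:
        noise = 0.01
    return noise
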
Example #19
0
def objective(arglist):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)

    game_name = arglist.game_name
    # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = model_names_setting
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix  = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)

    elif 'diff' in game_name:
        diff_game_name = game_name.split('-')[-1]
        agent_num = 3
        s2 = arglist.s2
        x2 = arglist.x2
        y2 = arglist.y2
        con = arglist.con
        env = DifferentialGame(diff_game_name, agent_num, x2, y2, s2, con)

    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)

    print(suffix)

    # logger.add_tabular_output('./log/{}.csv'.format(suffix))
    # snapshot_dir = './snapshot/{}'.format(suffix)
    # policy_dir = './policy/{}'.format(suffix)
    # os.makedirs(snapshot_dir, exist_ok=True)
    # os.makedirs(policy_dir, exist_ok=True)
    # logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size

    sampler = MASampler(agent_num=agent_num, joint=True, global_reward=arglist.global_reward, max_path_length=25, min_pool_size=100, batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    _alpha = arglist.alpha
    lr = arglist.lr
    n_pars = arglist.n_pars
    result = 0.

    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs,  lr=lr, n_pars=n_pars, k=k, g=g, mu=mu, game_name=game_name, aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs,  lr=lr, n_pars=n_pars, game_name=game_name)
            elif model_name == 'ROMMEO':
                agent = rom_agent(model_name, i, env, M, u_range, base_kwargs, game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i, env, M, u_range, base_kwargs,lr=lr, game_name=game_name)

            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()
        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        initial_exploration_done = False
        # noise = .1
        noise = .5


        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass
        # alpha = .5
        for steps in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            # import pdb; pdb.set_trace()
            # alpha = _alpha + np.exp(-0.1 * max(steps-10, 0)) * 500.
            if steps < base_kwargs['n_epochs']//3:
                # alpha = _alpha
                alpha = _alpha + np.exp(-0.1 * max(steps-10, 0)) * 500.
            elif  steps < base_kwargs['n_epochs']//2:
                alpha = _alpha/10
            else:
                alpha = .3
            tflog('alpha', alpha)
            print('alpha', alpha)
            # if steps > 100 and steps<150:
            #     alpha = .1 - 0.099 * steps/(150)
            # elif steps >= 150:
            #     alpha = 1e-3
            # logger.push_prefix('Epoch #%d | ' % steps)
            if steps % (25*1000) == 0:
                print(suffix)
            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    # if steps >= 1000:
                    if steps >= 10:
                        initial_exploration_done = True
                sampler.sample()
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                print('Sample Done')
                if steps == 10000:
                    noise = 0.1

                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = 10.
                # if steps == 2000:
                # anneal exploration noise at increasing step thresholds;
                # checks run in increasing order so the latest stage wins
                # (n_epochs/10 -> 0.1, n_epochs/6 -> 0.05, n_epochs/5 -> 0.01)
                if steps > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if steps > base_kwargs['n_epochs'] / 6:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps > base_kwargs['n_epochs'] / 5:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps % arglist.training_interval != 0:
                    continue
                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    recent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            recent_indices = list(range(agent.pool._top - batch_size, agent.pool._top))

                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(agent.pool.random_batch_by_indices(recent_indices))

                    # print(len(batch_n))
                    target_next_actions_n = []
                    # try:
                    all_obs = np.array(np.concatenate([batch['observations'] for batch in batch_n], axis=-1))
                    all_next_obs = np.array(np.concatenate([batch['next_observations'] for batch in batch_n], axis=-1))
                    # print(all_obs[0])
                    for batch in batch_n:
                        # print('making all obs')
                        batch['all_observations'] = deepcopy(all_obs)
                        batch['all_next_observations'] = deepcopy(all_next_obs)
                    opponent_current_actions_n = []
                    for agent, batch in zip(agents, batch_n):
                        target_next_actions_n.append(agent.target_policy.get_actions(batch['next_observations']))
                        opponent_current_actions_n.append(agent.policy.get_actions(batch['observations']))

                    for i, agent in enumerate(agents):
                        batch_n[i]['opponent_current_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_current_actions_n), i, 0), (-1, agent._opponent_action_dim))
                    # except:
                    #     pass


                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])


                    current_actions = [agents[i].policy.get_actions(batch_n[i]['next_observations'])[0][0] for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent.policy.get_all_actions(batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    # if len(all_actions_k) > 0:
                    #     with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                    #         f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    # with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                    #     f.write(','.join(list(map(str, current_actions)))+'\n')
                    # print('============')
                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(np.delete(deepcopy(opponent_actions_n), i, 0), (-1, agent._opponent_action_dim))
                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(np.delete(deepcopy(recent_opponent_actions_n), i, 0), (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(np.delete(deepcopy(target_next_actions_n), i, 0), (-1, agent._opponent_action_dim))

                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL) or isinstance(agent, ROMMEO):
                            agent._do_training(iteration=t + steps * agent._epoch_length, batch=batch_n[i], annealing=alpha)
                        else:
                            agent._do_training(iteration=t + steps * agent._epoch_length, batch=batch_n[i])
                gt.stamp('train')
            result = sampler.terminate()
    clear_session()
    return result
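
The GPU setup at the top of `objective` and the `clear_session()` at the end are standard TF1/Keras housekeeping for running many trials in one process: grow GPU memory on demand, bind the session to Keras, and drop the graph between trials. As a standalone sketch (TF1-style APIs, as used in the example):

import tensorflow as tf
from keras.backend import set_session, clear_session

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # dynamically grow GPU memory
sess = tf.Session(config=config)
set_session(sess)                       # make Keras use this session

# ... build agents and run one trial ...

clear_session()                         # free the graph before the next trial
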
Example #20
0
    def train(self, init_episode=0):

        if init_episode == 0:
            # Eval and log
            self.eval()
            self.log(write_table_header=True)

        gt.reset()
        gt.set_def_unique(False)

        expected_accum_rewards = np.zeros(self.total_episodes)

        episodes_iter = range(init_episode, self.total_episodes)
        if not logger.get_log_stdout():
            # Fancy iterable bar
            episodes_iter = tqdm.tqdm(episodes_iter)

        for it in gt.timed_for(episodes_iter, save_itrs=True):
            # Put models in training mode
            for model in self.trainable_models:
                model.train()

            obs = self.env.reset()
            rollout_steps = 0
            for step in range(self.train_steps):
                if self.render:
                    self.env.render()
                interaction_info = interaction(
                    self.env,
                    self.policy,
                    obs,
                    device=self.torch_device,
                    deterministic=False,
                )
                self.num_train_interactions += 1
                rollout_steps += 1
                gt.stamp('sample')

                # Add data to replay_buffer
                self.replay_buffer.add_sample(**interaction_info)

                # Only train when there are enough samples from buffer
                if self.replay_buffer.available_samples() > self.batch_size:
                    for ii in range(self.optimization_steps):
                        self.learn()
                gt.stamp('train')

                # Reset environment if it is done
                if interaction_info['termination'] \
                        or rollout_steps > self.max_horizon:
                    obs = self.env.reset()
                    rollout_steps = 0
                else:
                    obs = interaction_info['next_obs']

            # Evaluate current policy to check performance
            expected_accum_rewards[it] = self.eval()

            self.log()

            self.num_episodes += 1

        return expected_accum_rewards
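
One detail worth keeping from the loop above: the episode range is wrapped in a tqdm bar only when the logger is not already printing to stdout, so progress output and log lines don't interleave. In isolation (the flag is a stand-in for `logger.get_log_stdout()`):

import tqdm

log_to_stdout = False  # stand-in for logger.get_log_stdout()
episodes_iter = range(0, 100)
if not log_to_stdout:
    episodes_iter = tqdm.tqdm(episodes_iter)  # fancy iterable bar

for it in episodes_iter:
    pass  # one training episode per iteration
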
Example #21
0
    def _train(self, env, policy, pool, load=None):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, pool)

        # evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with tf_utils.get_default_session().as_default() as sess:
            if load is not None:
                saver = tf.train.Saver()
                saver.restore(sess, load)
                print('pre-trained model restored ...')
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                if epoch % 1 == 0 or epoch >= ENV_PARAMS['n_epochs'] - 20:
                    self._evaluate(policy, env)
                    print('@ epoch %d : ' % epoch)
                    # gt.stamp('eval')
                    #
                    # params = self.get_snapshot(epoch)
                    # logger.save_itr_params(epoch, params)
                    #
                    # time_itrs = gt.get_times().stamps.itrs
                    # time_eval = time_itrs['eval'][-1]
                    # time_total = gt.get_times().total
                    # time_train = time_itrs.get('train', [0])[-1]
                    # time_sample = time_itrs.get('sample', [0])[-1]
                    #
                    # logger.record_tabular('time-train', time_train)
                    # logger.record_tabular('time-eval', time_eval)
                    # logger.record_tabular('time-sample', time_sample)
                    # logger.record_tabular('time-total', time_total)
                    # logger.record_tabular('epoch', epoch)

                    self.sampler.log_diagnostics()

                    logger.dump_tabular(with_prefix=False)
                    logger.pop_prefix()

                    # env.reset()

                if (epoch > ENV_PARAMS['n_epochs'] * 0 and epoch % 5
                        == 0) or epoch >= ENV_PARAMS['n_epochs'] - 100:
                    saver = tf.train.Saver()
                    saver.save(sess,
                               save_path=save_path + '/model-' + str(epoch) +
                               '.ckpt')
                    print('Model saved ...')

            self.sampler.terminate()
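
The `load` / periodic-save hooks above follow the stock TF1 checkpoint pattern: a `tf.train.Saver` restores a pre-trained model before the loop, and the same mechanism writes `.ckpt` files every few epochs. Reduced to its essentials (the variable and paths are illustrative):

import tensorflow as tf

w = tf.Variable(0.0, name='w')  # stand-in for the model's parameters
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # saver.restore(sess, load)  # as above, when resuming from a checkpoint
    for epoch in range(10):
        if epoch % 5 == 0:
            saver.save(sess, save_path='./model-' + str(epoch) + '.ckpt')
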
Example #22
0
    def _train(self, env, policy, replay_buffer, sess):
        """Perform RL training.

        Parameters
        ----------
        env : gym.Env
            Environment used for training
        policy : Policy
            Policy used for training
        replay_buffer : ReplayBuffer
            Replay buffer to add samples to
        """
        self._init_training()
        self.sampler.initialize(env, policy, replay_buffer)
        evaluation_env = deep_clone(env) if self._eval_n_episodes else None

        with sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(policy, evaluation_env)
                gt.stamp('eval')

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)

                time_itrs = gt.get_times().stamps.itrs
                time_eval = time_itrs['eval'][-1]
                time_total = gt.get_times().total
                time_train = time_itrs.get('train', [0])[-1]
                time_sample = time_itrs.get('sample', [0])[-1]

                logger.record_tabular('time-train', time_train)
                logger.record_tabular('time-eval', time_eval)
                logger.record_tabular('time-sample', time_sample)
                logger.record_tabular('time-total', time_total)
                logger.record_tabular('epoch', epoch)

                logger.record_time(time_total, time_sample)

                self.sampler.log_diagnostics()

                logger.save_stats()
                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

            self.sampler.terminate()
Example #23
0
    def _train(self,
               env,
               policy,
               pool,
               qf=None,
               vf=None,
               saver=None,
               _ec=None,
               dynamic_ec=False):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        self.sampler.initialize(env, policy, pool)
        if dynamic_ec:
            decrease_rate = _ec / self._n_epochs

        logger2 = mylogger2.get_logger()
        os.makedirs(os.path.join(logger2.log_dir, 'model'),
                    exist_ok=logger2.exist_ok)
        optuna_break = False

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)
            for epoch in gt.timed_for(range(self._n_epochs), save_itrs=True):
                if optuna_break:
                    continue
                # logger.push_prefix('Epoch #%d | ' % epoch)
                epoch_states = []
                kurtosis = []
                signed_variance = []
                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
                    done, _n_episodes, obs, next_obs, info = self.sampler.sample()
                    epoch_states.append(obs)

                    state_importances = self.policy.calc_knack([obs])
                    kurtosis.append(state_importances["kurtosis"][0])
                    signed_variance.append(
                        state_importances["signed_variance"][0])  # careful: batch_ready < epoch_length
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(iteration=t +
                                          epoch * self._epoch_length,
                                          batch=self.sampler.random_batch())
                    gt.stamp('train')

                # evaluation
                if epoch % self._eval_n_frequency == 0:
                    eval_average_return = self._evaluate(epoch)
                    logger.record_tabular('eval_average_return',
                                          eval_average_return)
                    if hasattr(self.policy, "optuna_trial"):
                        if self.policy.optuna_trial is not None:
                            self.policy.optuna_trial.report(
                                eval_average_return,
                                epoch)  # report intermediate_value
                            if self.policy.optuna_trial.should_prune():
                                optuna_break = True
                                continue
                                # raise optuna.structs.TrialPruned()
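                                # Setting the flag and continuing (rather than raising
                                # TrialPruned here) presumably lets the gt.timed_for loop
                                # and the final bookkeeping below finish cleanly.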
                else:
                    logger.record_tabular('eval_average_return', np.nan)
                gt.stamp('eval')

                # logging about time and step
                times_itrs = gt.get_times().stamps.itrs
                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)
                logger.record_tabular('total_step',
                                      self.sampler._total_samples)
                logger.record_tabular('total_episode',
                                      self.sampler._n_episodes)

                # logging about array
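                # Snapshot the current knack threshold before refreshing the knack
                # statistics with this epoch's visited states.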
                if hasattr(self.policy, "current_knack_thresh"):
                    current_knack_thresh = self.policy.current_knack_thresh
                    _ = self.policy.calc_and_update_knack(epoch_states)
                if logger2.save_array_flag:
                    kwargs1 = {
                        'epoch': epoch,
                        'states': np.array(epoch_states),
                        'knack_kurtosis': np.array(kurtosis),
                        'signed_variance': np.array(signed_variance)
                    }
                    if hasattr(self.policy, "current_knack_thresh"):
                        kwargs1.update(
                            {'current_knack_thresh': current_knack_thresh})
                        kwargs1.update(self.policy.get_q_params())
                    logger2.add_array_data(kwargs1)

                    if epoch % 10 == 0:
                        #  TODO save only parameters
                        saver.save(self._sess,
                                   os.path.join(logger2.log_dir, 'model'))
                        gt.stamp("tf save")
                gt.stamp("calc knacks")

                if dynamic_ec:
                    self._sess.run(tf.assign(_ec, _ec - decrease_rate))

                logger.dump_tabular()
                logger2.write()
                # print(gt.report())

            # finalize processing
            if optuna_break:
                return None
            if logger2.save_array_flag:
                saver.save(self._sess, os.path.join(logger2.log_dir, 'model'))
            self.sampler.terminate()
            return eval_average_return
Beispiel #24
0
    def _train(self):
        """Return a generator that runs the standard RL loop."""
        training_environment = self._training_environment
        evaluation_environment = self._evaluation_environment
        policy = self._policy

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):
            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            update_diagnostics = []

            start_samples = self.sampler._total_samples
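            # Keep sampling until this epoch's timestep budget is met and at least
            # one training batch is ready (checked at the top of the loop body).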
            for i in count():
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                if (samples_now >= start_samples + self._epoch_length
                        and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                self._do_sampling(timestep=self._total_timestep)
                gt.stamp('sample')

                if self.ready_to_train:
                    repeat_diagnostics = self._do_training_repeats(
                        timestep=self._total_timestep)
                    if repeat_diagnostics is not None:
                        update_diagnostics.append(repeat_diagnostics)

                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

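            # Average each per-update diagnostic collected during the epoch.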
            update_diagnostics = tree.map_structure(lambda *d: np.mean(d),
                                                    *update_diagnostics)

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(policy,
                                                      evaluation_environment)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(training_paths,
                                                       training_environment,
                                                       self._total_timestep,
                                                       evaluation_type='train')
            gt.stamp('training_metrics')
            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths,
                    evaluation_environment,
                    self._total_timestep,
                    evaluation_type='evaluation')
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = {
                key: times[-1]
                for key, times in gt.get_times().stamps.itrs.items()
            }

            # TODO(hartikainen/tf2): Fix the naming of training/update
            # diagnostics/metric
            diagnostics.update((
                ('evaluation', evaluation_metrics),
                ('training', training_metrics),
                ('update', update_diagnostics),
                ('times', time_diagnostics),
                ('sampler', sampler_diagnostics),
                ('epoch', self._epoch),
                ('timestep', self._timestep),
                ('total_timestep', self._total_timestep),
                ('num_train_steps', self._num_train_steps),
            ))

            if self._eval_render_kwargs and hasattr(evaluation_environment,
                                                    'render_rollouts'):
                # TODO(hartikainen): Make this consistent such that there's no
                # need for the hasattr check.
                training_environment.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()

        yield {'done': True, **diagnostics}
Beispiel #25
0
    def _train(self, env, policy, initial_exploration_policy, pool):
        """Perform RL training.

        Args:
            env (`rllab.Env`): Environment used for training
            policy (`Policy`): Policy used for training
            initial_exploration_policy (`Policy`): Policy used for exploration.
                If None, then all exploration is done using `policy`.
            pool (`PoolBase`): Sample pool to add samples to
        """

        self._init_training(env, policy, pool)
        if initial_exploration_policy is None:
            self.sampler.initialize(env, policy, pool)
            initial_exploration_done = True
        else:
            self.sampler.initialize(env, initial_exploration_policy, pool)
            initial_exploration_done = False

        with self._sess.as_default():
            gt.rename_root('RLAlgorithm')
            gt.reset()
            gt.set_def_unique(False)

            for epoch in gt.timed_for(range(self._n_epochs + 1),
                                      save_itrs=True):
                logger.push_prefix('Epoch #%d | ' % epoch)

                for t in range(self._epoch_length):
                    # TODO.codeconsolidation: Add control interval to sampler
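                    # Switch from the initial exploration policy to the learned policy
                    # once enough exploration steps have been collected.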
                    if not initial_exploration_done:
                        if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                            self.sampler.set_policy(policy)
                            initial_exploration_done = True
                    self.sampler.sample()
                    if not self.sampler.batch_ready():
                        continue
                    gt.stamp('sample')

                    for i in range(self._n_train_repeat):
                        self._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=self.sampler.random_batch())
                    gt.stamp('train')

                self._evaluate(epoch)

                params = self.get_snapshot(epoch)
                logger.save_itr_params(epoch, params)
                times_itrs = gt.get_times().stamps.itrs

                eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
                total_time = gt.get_times().total
                logger.record_tabular('time-train', times_itrs['train'][-1])
                logger.record_tabular('time-eval', eval_time)
                logger.record_tabular('time-sample', times_itrs['sample'][-1])
                logger.record_tabular('time-total', total_time)
                logger.record_tabular('epoch', epoch)

                self.sampler.log_diagnostics()

                logger.dump_tabular(with_prefix=False)
                logger.pop_prefix()

                gt.stamp('eval')

            self.sampler.terminate()
Beispiel #26
0
    def _train(self):
        """Return a generator that performs RL training.

        Args:
            env (`SoftlearningEnv`): Environment used for training.
            policy (`Policy`): Policy used for training
            initial_exploration_policy ('Policy'): Policy used for exploration
                If None, then all exploration is done using policy
            pool (`PoolBase`): Sample pool to add samples to
        """
        training_environment = self._training_environment
        evaluation_environment = self._evaluation_environment
        policy = self._policy
        pool = self._pool
        model_metrics = {}

        if not self._training_started:
            self._init_training()

            self._initial_exploration_hook(training_environment,
                                           self._initial_exploration_policy,
                                           pool)

        self.sampler.initialize(training_environment, policy, pool)

        gt.reset_root()
        gt.rename_root('RLAlgorithm')
        gt.set_def_unique(False)

        self._training_before_hook()

        for self._epoch in gt.timed_for(range(self._epoch, self._n_epochs)):

            self._epoch_before_hook()
            gt.stamp('epoch_before_hook')

            self._training_progress = Progress(self._epoch_length *
                                               self._n_train_repeat)
            start_samples = self.sampler._total_samples
            for i in count():
                samples_now = self.sampler._total_samples
                self._timestep = samples_now - start_samples

                if (samples_now >= start_samples + self._epoch_length
                        and self.ready_to_train):
                    break

                self._timestep_before_hook()
                gt.stamp('timestep_before_hook')

                if self._timestep % self._model_train_freq == 0 and self._real_ratio < 1.0:
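                    # Retrain the dynamics model and regenerate model rollouts every
                    # _model_train_freq timesteps; this branch is active only when
                    # model-generated data is actually used (_real_ratio < 1.0).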
                    self._training_progress.pause()
                    print('[ MEEE ] log_dir: {} | ratio: {}'.format(
                        self._log_dir, self._real_ratio))
                    print(
                        '[ MEEE ] Training model at epoch {} | freq {} | timestep {} (total: {}) | epoch train steps: {} (total: {})'
                        .format(self._epoch, self._model_train_freq,
                                self._timestep, self._total_timestep,
                                self._train_steps_this_epoch,
                                self._num_train_steps))

                    model_train_metrics = self._train_model(
                        batch_size=256,
                        max_epochs=None,
                        holdout_ratio=0.2,
                        max_t=self._max_model_t)
                    model_metrics.update(model_train_metrics)
                    gt.stamp('epoch_train_model')

                    self._set_rollout_length()
                    self._reallocate_model_pool()
                    model_rollout_metrics = self._rollout_model(
                        rollout_batch_size=self._rollout_batch_size,
                        deterministic=self._deterministic)
                    model_metrics.update(model_rollout_metrics)

                    gt.stamp('epoch_rollout_model')
                    # self._visualize_model(self._evaluation_environment, self._total_timestep)
                    self._training_progress.resume()

                # No UCB exploration
                #self._do_sampling(timestep=self._total_timestep)

                self._do_sampling(timestep=self._total_timestep,
                                  disturb=True,
                                  fake_env=self.fake_env,
                                  Qs=self._Qs)
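                # The call above passes the learned model (fake_env) and the Q-ensemble,
                # presumably to enable the disturbance-based exploration variant; the
                # plain _do_sampling call is kept commented out above for reference.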
                #print("**exploration**")
                gt.stamp('sample')

                if self.ready_to_train:
                    self._do_training_repeats(timestep=self._total_timestep)
                gt.stamp('train')

                self._timestep_after_hook()
                gt.stamp('timestep_after_hook')

            training_paths = self.sampler.get_last_n_paths(
                math.ceil(self._epoch_length / self.sampler._max_path_length))
            gt.stamp('training_paths')
            evaluation_paths = self._evaluation_paths(policy,
                                                      evaluation_environment)
            gt.stamp('evaluation_paths')

            training_metrics = self._evaluate_rollouts(training_paths,
                                                       training_environment)
            gt.stamp('training_metrics')
            if evaluation_paths:
                evaluation_metrics = self._evaluate_rollouts(
                    evaluation_paths, evaluation_environment)
                gt.stamp('evaluation_metrics')
            else:
                evaluation_metrics = {}

            self._epoch_after_hook(training_paths)
            gt.stamp('epoch_after_hook')

            sampler_diagnostics = self.sampler.get_diagnostics()

            diagnostics = self.get_diagnostics(
                iteration=self._total_timestep,
                batch=self._evaluation_batch(),
                training_paths=training_paths,
                evaluation_paths=evaluation_paths)

            time_diagnostics = gt.get_times().stamps.itrs

            diagnostics.update(
                OrderedDict((
                    *((f'evaluation/{key}', evaluation_metrics[key])
                      for key in sorted(evaluation_metrics.keys())),
                    *((f'training/{key}', training_metrics[key])
                      for key in sorted(training_metrics.keys())),
                    *((f'times/{key}', time_diagnostics[key][-1])
                      for key in sorted(time_diagnostics.keys())),
                    *((f'sampler/{key}', sampler_diagnostics[key])
                      for key in sorted(sampler_diagnostics.keys())),
                    *((f'model/{key}', model_metrics[key])
                      for key in sorted(model_metrics.keys())),
                    ('epoch', self._epoch),
                    ('timestep', self._timestep),
                    ('timesteps_total', self._total_timestep),
                    ('train-steps', self._num_train_steps),
                )))

            if self._eval_render_mode is not None and hasattr(
                    evaluation_environment, 'render_rollouts'):
                training_environment.render_rollouts(evaluation_paths)

            yield diagnostics

        self.sampler.terminate()

        self._training_after_hook()

        self._training_progress.close()

        yield {'done': True, **diagnostics}
Beispiel #27
0
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)

            # In offline PEARL, we do not need to collect data
            # (self.replay_buffer = self.enc_replay_buffer from the initialization above)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:  # [0, 1]
                    self.task_idx = idx
                    self.env.reset_task(idx)  # 0 for {'distributions': -1}, which is cheetah-mixed
                    self.collect_data(self.num_initial_steps, 1, np.inf)

            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)
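                # Clear this task's encoder buffer so the task embedding is computed
                # only from the freshly collected data below.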
                self.enc_replay_buffer.task_buffers[idx].clear()

                # collect some trajectories with z ~ prior
                if self.num_steps_prior > 0:
                    self.collect_data(self.num_steps_prior, 1, np.inf)
                # collect some trajectories with z ~ posterior
                if self.num_steps_posterior > 0:
                    self.collect_data(self.num_steps_posterior, 1,
                                      self.update_post_train)
                # even if encoder is trained only on samples from the prior, the policy needs to learn to handle z ~ posterior
                if self.num_extra_rl_steps_posterior > 0:
                    self.collect_data(self.num_extra_rl_steps_posterior,
                                      1,
                                      self.update_post_train,
                                      add_to_enc_buffer=False)

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()
Beispiel #28
0
    def train(self):
        '''
        meta-training loop
        '''
        self.pretrain()
        params = self.get_epoch_snapshot(-1)
        logger.save_itr_params(-1, params)
        gt.reset()
        gt.set_def_unique(False)
        self._current_path_builder = PathBuilder()
        self.train_obs = self._start_new_rollout()

        # at each iteration, we first collect data from tasks, perform meta-updates, then try to evaluate
        for it_ in gt.timed_for(
                range(self.num_iterations),
                save_itrs=True,
        ):
            self._start_epoch(it_)
            self.training_mode(True)
            if it_ == 0:
                print('collecting initial pool of data for train and eval')
                # temp for evaluating
                for idx in self.train_tasks:
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    self.collect_data_sampling_from_prior(
                        num_samples=self.max_path_length * 10,
                        resample_z_every_n=self.max_path_length,
                        eval_task=False)
                """
                for idx in self.eval_tasks:
                    self.task_idx = idx
                    self.env.reset_task(idx)
                    # TODO: make number of initial trajectories a parameter
                    self.collect_data_sampling_from_prior(num_samples=self.max_path_length * 20,
                                                          resample_z_every_n=self.max_path_length,
                                                          eval_task=True)
                """

            # Sample data from train tasks.
            for i in range(self.num_tasks_sample):
                idx = np.random.randint(len(self.train_tasks))
                self.task_idx = idx
                self.env.reset_task(idx)

                # TODO: there may be more permutations of sampling/adding to encoding buffer we may wish to try
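                # Four supported sources for the encoder's training data are handled
                # below: initial_pool, posterior_only, online_exploration_trajectories,
                # and online_on_policy_trajectories.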
                if self.train_embedding_source == 'initial_pool':
                    # embeddings are computed using only the initial pool of data
                    # sample data from posterior to train RL algorithm
                    self.collect_data_from_task_posterior(
                        idx=idx,
                        num_samples=self.num_steps_per_task,
                        add_to_enc_buffer=False)
                elif self.train_embedding_source == 'posterior_only':
                    self.collect_data_from_task_posterior(
                        idx=idx,
                        num_samples=self.num_steps_per_task,
                        eval_task=False,
                        add_to_enc_buffer=True)
                elif self.train_embedding_source == 'online_exploration_trajectories':
                    # embeddings are computed using only data collected using the prior
                    # sample data from posterior to train RL algorithm
                    self.enc_replay_buffer.task_buffers[idx].clear()
                    # resamples using current policy, conditioned on prior
                    self.collect_data_sampling_from_prior(
                        num_samples=self.num_steps_per_task,
                        resample_z_every_n=self.max_path_length,
                        add_to_enc_buffer=True)

                    self.collect_data_from_task_posterior(
                        idx=idx,
                        num_samples=self.num_steps_per_task,
                        add_to_enc_buffer=False)
                elif self.train_embedding_source == 'online_on_policy_trajectories':
                    # sample from prior, then sample more from the posterior
                    # embeddings computed from both prior and posterior data
                    self.enc_replay_buffer.task_buffers[idx].clear()
                    self.collect_data_online(
                        idx=idx,
                        num_samples=self.num_steps_per_task,
                        add_to_enc_buffer=True)
                else:
                    raise Exception(
                        "Invalid option for computing train embedding {}".format(
                            self.train_embedding_source))

            # Sample train tasks and compute gradient updates on parameters.
            for train_step in range(self.num_train_steps_per_itr):
                indices = np.random.choice(self.train_tasks, self.meta_batch)
                self._do_training(indices)
                self._n_train_steps_total += 1
            gt.stamp('train')

            #self.training_mode(False)

            # eval
            self._try_to_eval(it_)
            gt.stamp('eval')

            self._end_epoch()