Example #1
    def setup(self,
              algo,
              env,
              sampler_cls=None,
              sampler_args=None,
              n_workers=psutil.cpu_count(logical=False),
              worker_class=None,
              worker_args=None):
        """Set up trainer and sessions for algorithm and environment.

        This method saves the algorithm and environment on the trainer,
        creates a sampler, and initializes all uninitialized variables in
        the session.

        Note:
            After setup() is called, all variables in the session should have
            been initialized. setup() respects existing values in the session,
            so policy weights can be loaded before setup().

        Args:
            algo (RLAlgorithm): An algorithm instance.
            env (Environment): An environment instance.
            sampler_cls (type): A class which implements :class:`Sampler`.
            sampler_args (dict): Arguments to be passed to sampler constructor.
            n_workers (int): The number of workers the sampler should use.
            worker_class (type): Type of worker the sampler should use.
            worker_args (dict or None): Additional arguments that should be
                passed to the worker.

        """
        self.initialize_tf_vars()
        logger.log(self.sess.graph)
        super().setup(algo, env, sampler_cls, sampler_args, n_workers,
                      worker_class, worker_args)
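Because setup() respects values already present in the session, pretrained policy weights can be restored before it runs. A minimal ordering sketch, assuming the garage LocalTFRunner API (newer releases call it TFTrainer); MyEnv, make_policy, MyAlgo and saved_params are hypothetical placeholders, so this is not runnable as-is:

# Sketch only: environment, policy, algorithm and saved_params are
# hypothetical placeholders; only the call ordering matters here.
with LocalTFRunner(snapshot_config=snapshot_config) as runner:
    env = MyEnv()
    policy = make_policy(env.spec)
    policy.set_param_values(saved_params)   # load weights before setup()
    algo = MyAlgo(env_spec=env.spec, policy=policy)
    runner.setup(algo, env)                 # keeps the loaded values
    runner.train(n_epochs=100, batch_size=4000)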
Example #2
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (metarl.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = TrajectoryBatch.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                TrajectoryBatch.concatenate(*adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                task_names=self._test_task_names)
        self._eval_itr += 1
Example #3
    def save(self, epoch, paths=None):
        """Save snapshot of current batch.

        Args:
            epoch (int): Index of the current epoch.
            paths (dict): Batch of samples after preprocessing. If None,
                no paths will be logged to the snapshot.

        """
        if not self.has_setup:
            raise Exception('Use setup() to setup runner before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self.train_args

        # Save states
        params['env'] = self.env
        # TODO: add this back
        # params['algo'] = self.algo
        if paths:
            params['paths'] = paths
        params['last_epoch'] = epoch
        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')
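A snapshot written this way can be read back later, returning the same params dict ('env', 'last_epoch', and so on). A short sketch, assuming garage's Snapshotter API; the directory path is a hypothetical placeholder:

# Sketch: reading a saved snapshot back (assumes garage's Snapshotter API;
# the log directory below is a hypothetical placeholder).
from garage.experiment import Snapshotter

snapshotter = Snapshotter()
saved = snapshotter.load('data/local/experiment/my_run', itr='last')
env = saved['env']
last_epoch = saved['last_epoch']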
Example #4
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in last epoch cycle.

        """
        paths = self.process_samples(itr, paths)

        epoch = itr // self.n_samples
        i_sample = itr - epoch * self.n_samples
        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)
        # -- Stage: Process path
        rtn = paths['average_return']
        self.all_returns.append(paths['average_return'])

        # -- Stage: Update policy distribution.
        if (itr + 1) % self.n_samples == 0:
            avg_rtns = np.array(self.all_returns)
            best_inds = np.argsort(-avg_rtns)[:self.n_best]
            best_params = np.array(self.all_params)[best_inds]

            # MLE of normal distribution
            self.cur_mean = best_params.mean(axis=0)
            self.cur_std = best_params.std(axis=0)
            self.policy.set_param_values(self.cur_mean)

            # Clear for next epoch
            rtn = max(self.all_returns)
            self.all_returns.clear()
            self.all_params.clear()

        # -- Stage: Generate a new policy for next path sampling
        self.cur_params = self.sample_params(itr)
        self.all_params.append(self.cur_params.copy())
        self.policy.set_param_values(self.cur_params)

        logger.log(tabular)
        return rtn
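The update above is the standard cross-entropy-method step: keep the n_best parameter vectors by average return and refit a diagonal Gaussian to them. A self-contained numpy sketch of just that step (the names are illustrative, not the class attributes above):

import numpy as np

def cem_update(all_params, all_returns, n_best):
    """Refit a diagonal Gaussian to the elite (highest-return) parameters."""
    params = np.asarray(all_params)
    returns = np.asarray(all_returns)
    elite = params[np.argsort(-returns)[:n_best]]   # top-n_best by return
    return elite.mean(axis=0), elite.std(axis=0)    # new sampling mean and std

# Example: 8 candidate parameter vectors of dimension 3, keep the best 2.
mean, std = cem_update(np.random.randn(8, 3), np.random.rand(8), n_best=2)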
Example #5
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        """
        paths = self.process_samples(itr, paths)

        epoch = itr // self.n_samples
        i_sample = itr - epoch * self.n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = paths['average_return']
        self.all_returns.append(paths['average_return'])

        if (itr + 1) % self.n_samples == 0:
            avg_rtns = np.array(self.all_returns)
            self.es.tell(self.all_params, -avg_rtns)
            self.policy.set_param_values(self.es.best.get()[0])

            # Clear for next epoch
            rtn = max(self.all_returns)
            self.all_returns.clear()
            self.all_params = self._sample_params()

        self.cur_params = self.all_params[(i_sample + 1) % self.n_samples]
        self.policy.set_param_values(self.cur_params)

        logger.log(tabular)
        return rtn
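Here the per-epoch update is delegated to the `cma` package's ask/tell interface: candidates come from `es.ask()` (via `_sample_params`), returns are negated because CMA-ES minimizes, and the best vector seen so far is read from `es.best.get()[0]`, exactly as above. A toy, self-contained ask/tell loop on a quadratic objective, assuming only that `cma` is installed:

import cma
import numpy as np

es = cma.CMAEvolutionStrategy(np.zeros(5), 0.5)   # x0, initial sigma
for _ in range(20):                               # a few generations
    solutions = es.ask()                          # sample candidate vectors
    costs = [float(np.sum(x ** 2)) for x in solutions]   # toy objective
    es.tell(solutions, costs)                     # CMA-ES minimizes costs
best_params = es.best.get()[0]                    # best vector seen so far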
Example #6
    def _train_once(self, itr):
        """Perform one iteration of training.

        Args:
            itr (int): Iteration number.

        """
        for grad_step_timer in range(self._grad_steps_per_env_step):
            if (self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                # Sample from buffer
                samples = self._replay_buffer.sample_transitions(
                    self._buffer_batch_size)
                samples = as_torch_dict(samples)

                # Optimize
                qf_loss, y, q, policy_loss = torch_to_np(
                    self._optimize_policy(samples, grad_step_timer))

                self._episode_policy_losses.append(policy_loss)
                self._episode_qf_losses.append(qf_loss)
                self._epoch_ys.append(y)
                self._epoch_qs.append(q)

        if itr % self._steps_per_epoch == 0:
            logger.log('Training finished')
            epoch = itr // self._steps_per_epoch

            if (self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                tabular.record('Epoch', epoch)
                self._log_statistics()
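Each gradient step above only runs once the replay buffer holds at least `_min_buffer_size` transitions, and the sampled numpy batch is converted to torch tensors before the update. A generic sketch of that numpy-to-torch conversion (not garage's `as_torch_dict` itself, just the same idea):

import numpy as np
import torch

def to_torch_batch(np_batch, device='cpu'):
    """Convert a dict of numpy arrays (a transition batch) to torch tensors."""
    return {k: torch.as_tensor(v, dtype=torch.float32, device=device)
            for k, v in np_batch.items()}

batch = {'observation': np.zeros((64, 4)), 'action': np.zeros((64, 2)),
         'reward': np.zeros((64, 1))}
torch_batch = to_torch_batch(batch)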
Example #7
    def train_once(self, itr, paths):
        """Perform one iteration of training."""
        paths = self.process_samples(itr, paths)

        epoch = itr // self.n_epoch_cycles

        self.episode_rewards.extend([
            path for path, complete in zip(paths['undiscounted_returns'],
                                           paths['complete']) if complete
        ])
        self.success_history.extend([
            path for path, complete in zip(paths['success_history'],
                                           paths['complete']) if complete
        ])

        last_average_return = np.mean(self.episode_rewards)
        for train_itr in range(self.n_train_steps):
            if self.replay_buffer.n_transitions_stored >= self.min_buffer_size:  # noqa: E501
                self.evaluate = True
                samples = self.replay_buffer.sample(self.buffer_batch_size)
                qf_loss, y, q, policy_loss = torch_to_np(
                    self.optimize_policy(itr, samples))  # pylint: disable=all

                self.episode_policy_losses.append(policy_loss)
                self.episode_qf_losses.append(qf_loss)
                self.epoch_ys.append(y)
                self.epoch_qs.append(q)

        if itr % self.n_epoch_cycles == 0:
            logger.log('Training finished')

            if self.evaluate:
                tabular.record('Epoch', epoch)
                tabular.record('AverageReturn', np.mean(self.episode_rewards))
                tabular.record('StdReturn', np.std(self.episode_rewards))
                tabular.record('Policy/AveragePolicyLoss',
                               np.mean(self.episode_policy_losses))
                tabular.record('QFunction/AverageQFunctionLoss',
                               np.mean(self.episode_qf_losses))
                tabular.record('QFunction/AverageQ', np.mean(self.epoch_qs))
                tabular.record('QFunction/MaxQ', np.max(self.epoch_qs))
                tabular.record('QFunction/AverageAbsQ',
                               np.mean(np.abs(self.epoch_qs)))
                tabular.record('QFunction/AverageY', np.mean(self.epoch_ys))
                tabular.record('QFunction/MaxY', np.max(self.epoch_ys))
                tabular.record('QFunction/AverageAbsY',
                               np.mean(np.abs(self.epoch_ys)))
                tabular.record('AverageSuccessRate',
                               np.mean(self.success_history))

            if not self.smooth_return:
                self.episode_rewards = []
                self.episode_policy_losses = []
                self.episode_qf_losses = []
                self.epoch_ys = []
                self.epoch_qs = []

            self.success_history.clear()

        return last_average_return
Example #8
    def optimize_policy(self, itr, samples_data):
        """Optimize policy.

        Args:
            itr (int): Iteration number.
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        Returns:
            dict: Snapshot of the current iteration, from get_itr_snapshot().

        """
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log('Computing loss before')
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log('Computing KL before')
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log('Optimizing')
        self.optimizer.optimize(policy_opt_input_values)
        logger.log('Computing KL after')
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log('Computing loss after')
        loss_after = self.optimizer.loss(policy_opt_input_values)
        tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
        tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
        tabular.record('{}/dLoss'.format(self.policy.name),
                       loss_before - loss_after)
        tabular.record('{}/KLBefore'.format(self.policy.name),
                       policy_kl_before)
        tabular.record('{}/KL'.format(self.policy.name), policy_kl)

        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

        num_traj = self.batch_size // self.max_path_length
        actions = samples_data['actions'][:num_traj, ...]

        histogram = Histogram(actions)
        tabular.record('{}/Actions'.format(self.policy.name), histogram)

        self._fit_baseline(samples_data)

        return self.get_itr_snapshot(itr, samples_data)
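Every example on this page funnels its metrics through dowel: scalars are accumulated with `tabular.record`, `logger.log(tabular)` queues the table, and a dump writes it to each attached output. A self-contained sketch of that pattern, assuming only that dowel is installed:

from dowel import CsvOutput, StdOutput, logger, tabular

logger.add_output(StdOutput())
logger.add_output(CsvOutput('progress.csv'))

for itr in range(3):
    tabular.record('Iteration', itr)
    tabular.record('AverageReturn', 100.0 + itr)
    logger.log(tabular)       # queue the tabular data for all outputs
    logger.dump_all(itr)      # flush outputs (CsvOutput writes one row)

logger.remove_all()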
Example #9
    def optimize_policy(self, itr, samples_data):
        """Optimize policy.

        Args:
            itr (int): Iteration number.
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        """
        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        # Train policy network
        logger.log('Computing loss before')
        loss_before = self._optimizer.loss(policy_opt_input_values)
        logger.log('Computing KL before')
        policy_kl_before = self._f_policy_kl(*policy_opt_input_values)
        logger.log('Optimizing')
        self._optimizer.optimize(policy_opt_input_values)
        logger.log('Computing KL after')
        policy_kl = self._f_policy_kl(*policy_opt_input_values)
        logger.log('Computing loss after')
        loss_after = self._optimizer.loss(policy_opt_input_values)
        tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
        tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
        tabular.record('{}/dLoss'.format(self.policy.name),
                       loss_before - loss_after)
        tabular.record('{}/KLBefore'.format(self.policy.name),
                       policy_kl_before)
        tabular.record('{}/KL'.format(self.policy.name), policy_kl)
        pol_ent = self._f_policy_entropy(*policy_opt_input_values)
        tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

        self._fit_baseline(samples_data)
Example #10
    def _fit_baseline_with_data(self, samples_data):
        """Update baselines from samples.

        Args:
            samples_data (dict): Processed sample data.
                See garage.tf.paths_to_tensors() for details.

        """
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Augment reward from baselines
        rewards_tensor = self._f_rewards(*policy_opt_input_values)
        returns_tensor = self._f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        paths = samples_data['paths']
        valids = samples_data['valids']

        # Recompute parts of samples_data
        aug_rewards = []
        aug_returns = []
        for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                       paths):
            path['rewards'] = rew[val.astype(bool)]
            path['returns'] = ret[val.astype(bool)]
            aug_rewards.append(path['rewards'])
            aug_returns.append(path['returns'])
        samples_data['rewards'] = np_tensor_utils.pad_tensor_n(
            aug_rewards, self.max_path_length)
        samples_data['returns'] = np_tensor_utils.pad_tensor_n(
            aug_returns, self.max_path_length)

        # Fit baseline
        logger.log('Fitting baseline...')
        self._baseline.fit(paths)
Example #11
    def test_one_folder(self, meta_train_dir, itrs):
        snapshot_config = SnapshotConfig(snapshot_dir=meta_train_dir,
                                         snapshot_mode='all',
                                         snapshot_gap=1)

        runner = LocalRunner(snapshot_config=snapshot_config)
        meta_sampler = AllSetTaskSampler(self.meta_task_cls)
        runner.restore(meta_train_dir)

        meta_evaluator = MetaEvaluator(
            runner,
            test_task_sampler=meta_sampler,
            max_path_length=self.max_path_length,
            n_test_tasks=meta_sampler.n_tasks,
            n_exploration_traj=self.adapt_rollout_per_task,
            prefix='')

        for itr in itrs:
            log_filename = os.path.join(meta_train_dir,
                                        'meta-test-itr_{}.csv'.format(itr))
            logger.add_output(CsvOutput(log_filename))
            logger.log("Writing into {}".format(log_filename))

            runner.restore(meta_train_dir, from_epoch=itr)
            meta_evaluator.evaluate(runner._algo, self.test_rollout_per_task)

            tabular.record('Iteration', runner._stats.total_epoch)
            tabular.record('TotalEnvSteps', runner._stats.total_env_steps)
            logger.log(tabular)
            logger.dump_output_type(CsvOutput)
            logger.remove_output_type(CsvOutput)
Example #12
    def save(self, epoch, paths=None):
        """Save snapshot of current batch.

        Args:
            epoch (int): Index of the current epoch.
            paths (dict): Batch of samples after preprocessing. If None,
                no paths will be logged to the snapshot.

        """
        assert self.has_setup

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self.setup_args
        params['train_args'] = self.train_args

        # Save states
        params['env'] = self.env
        params['algo'] = self.algo
        if paths:
            params['paths'] = paths
        params['last_epoch'] = epoch
        snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')
Example #13
    def _train_once(self, itr, episodes):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            episodes (EpisodeBatch): Batch of episodes.

        Returns:
            numpy.float64: Average return.

        """
        # -- Stage: Calculate and pad baselines
        obs = [
            self._baseline.predict({'observations': obs})
            for obs in episodes.observations_list
        ]
        baselines = episodes.pad_to_last(np.concatenate(obs))

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(itr,
                                               episodes,
                                               discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))

        logger.log('Optimizing policy...')
        self._optimize_policy(episodes, baselines)

        return np.mean(undiscounted_returns)
Example #14
    def save(self, epoch):
        """Save snapshot of current batch.

        Args:
            epoch (int): Epoch.

        Raises:
            NotSetupError: if save() is called before the runner is set up.

        """
        if not self._has_setup:
            raise NotSetupError('Use setup() to setup runner before saving.')

        logger.log('Saving snapshot...')

        params = dict()
        # Save arguments
        params['setup_args'] = self._setup_args
        params['train_args'] = self._train_args
        params['stats'] = self._stats

        # Save states
        params['env'] = self._env
        params['algo'] = self._algo
        params['n_workers'] = self._n_workers
        params['worker_class'] = self._worker_class
        params['worker_args'] = self._worker_args

        self._snapshotter.save_itr_params(epoch, params)

        logger.log('Saved')
Example #15
def populate_task(env, policy, scope=None):
    """Set each worker's env and policy.

    Parameters
    ----------
    env : :py:class:`ast_toolbox.envs.ASTEnv`
        The environment.
    policy : :py:class:`garage.tf.policies.Policy`
        The policy.
    scope : str
        Scope for identifying the algorithm.
        Must be specified if running multiple algorithms
        simultaneously, each using different environments
        and policies.
    """
    logger.log('Populating workers...')
    if singleton_pool.n_parallel > 1:
        singleton_pool.run_each(
            _worker_populate_task,
            [(pickle.dumps(env), pickle.dumps(policy), scope)] *
            singleton_pool.n_parallel)
    else:
        # avoid unnecessary copying
        g = _get_scoped_g(singleton_pool.G, scope)
        g.env = env
        g.policy = policy
    logger.log('Populated')
Example #16
    def _optimize_policy(self, itr, episodes, baselines, embed_eps,
                         embed_ep_infos):
        """Optimize policy.

        Args:
            itr (int): Iteration number.
            episodes (EpisodeBatch): Batch of episodes.
            baselines (np.ndarray): Baseline predictions.
            embed_eps (np.ndarray): Embedding episodes.
            embed_ep_infos (dict): Embedding distribution information.

        """
        del itr

        policy_opt_input_values = self._policy_opt_input_values(
            episodes, baselines, embed_eps)
        inference_opt_input_values = self._inference_opt_input_values(
            episodes, embed_eps, embed_ep_infos)

        self._train_policy_and_encoder_networks(policy_opt_input_values)
        self._train_inference_network(inference_opt_input_values)

        # paths = samples_data['paths']
        fit_paths = self._evaluate(policy_opt_input_values, episodes,
                                   baselines, embed_ep_infos)
        self._visualize_distribution()

        logger.log('Fitting baseline...')
        self._baseline.fit(fit_paths)

        self._old_policy.parameters = self.policy.parameters
        self._old_policy.encoder.model.parameters = (
            self.policy.encoder.model.parameters)
        self._old_inference.model.parameters = self._inference.model.parameters
Example #17
def test_meta_evaluator():
    set_seed(100)
    tasks = SetTaskSampler(PointEnv, wrapper=set_length)
    max_episode_length = 200
    with tempfile.TemporaryDirectory() as log_dir_name:
        trainer = Trainer(
            SnapshotConfig(snapshot_dir=log_dir_name,
                           snapshot_mode='last',
                           snapshot_gap=1))
        env = PointEnv(max_episode_length=max_episode_length)
        algo = OptimalActionInference(env=env,
                                      max_episode_length=max_episode_length)
        trainer.setup(algo, env)
        meta_eval = MetaEvaluator(test_task_sampler=tasks, n_test_tasks=10)
        log_file = tempfile.NamedTemporaryFile()
        csv_output = CsvOutput(log_file.name)
        logger.add_output(csv_output)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        meta_eval.evaluate(algo)
        logger.log(tabular)
        logger.dump_output_type(CsvOutput)
        logger.remove_output_type(CsvOutput)
        with open(log_file.name, 'r') as file:
            rows = list(csv.DictReader(file))
        assert len(rows) == 2
        assert float(
            rows[0]['MetaTest/__unnamed_task__/TerminationRate']) < 1.0
        assert float(rows[0]['MetaTest/__unnamed_task__/Iteration']) == 0
        assert (float(rows[0]['MetaTest/__unnamed_task__/MaxReturn']) >= float(
            rows[0]['MetaTest/__unnamed_task__/AverageReturn']))
        assert (float(rows[0]['MetaTest/__unnamed_task__/AverageReturn']) >=
                float(rows[0]['MetaTest/__unnamed_task__/MinReturn']))
        assert float(rows[1]['MetaTest/__unnamed_task__/Iteration']) == 1
Example #18
    def optimize_policy(self, itr, samples_data):
        """Optimize policy.

        Args:
            itr (int): Iteration number.
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        """
        del itr

        policy_opt_input_values = self._policy_opt_input_values(samples_data)
        inference_opt_input_values = self._inference_opt_input_values(
            samples_data)

        self._train_policy_and_encoder_networks(policy_opt_input_values)
        self._train_inference_network(inference_opt_input_values)

        paths = samples_data['paths']
        self.evaluate(policy_opt_input_values, samples_data)
        self.visualize_distribution()

        logger.log('Fitting baseline...')
        self._baseline.fit(paths)

        self._old_policy.parameters = self.policy.parameters
        self._old_policy.encoder.model.parameters = (
            self.policy.encoder.model.parameters)
        self._old_inference.model.parameters = self._inference.model.parameters
Example #19
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            numpy.float64: Average return.

        """
        paths = self.process_samples(itr, paths)

        self.log_diagnostics(paths)
        logger.log('Optimizing policy...')
        self.optimize_policy(itr, paths)
        return paths['average_return']
Example #20
    def optimize_policy(self, itr, samples_data):
        """Optimize policy.

        Args:
            itr (int): Iteration number.
            samples_data (dict): Processed sample data.
                See process_samples() for details.

        """
        policy_opt_input_values = self._policy_opt_input_values(samples_data)

        # Train policy network
        logger.log('Computing loss before')
        loss_before = self.optimizer.loss(policy_opt_input_values)
        logger.log('Computing KL before')
        policy_kl_before = self.f_policy_kl(*policy_opt_input_values)
        logger.log('Optimizing')
        self.optimizer.optimize(policy_opt_input_values)
        logger.log('Computing KL after')
        policy_kl = self.f_policy_kl(*policy_opt_input_values)
        logger.log('Computing loss after')
        loss_after = self.optimizer.loss(policy_opt_input_values)
        tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
        tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
        tabular.record('{}/dLoss'.format(self.policy.name),
                       loss_before - loss_after)
        tabular.record('{}/KLBefore'.format(self.policy.name),
                       policy_kl_before)
        tabular.record('{}/KL'.format(self.policy.name), policy_kl)
        pol_ent = self.f_policy_entropy(*policy_opt_input_values)
        tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

        self._fit_baseline(samples_data)
Example #21
    def _fit_baseline_with_data(self, episodes, baselines):
        """Update baselines from samples.

        Args:
            episodes (EpisodeBatch): Batch of episodes.
            baselines (np.ndarray): Baseline predictions.

        Returns:
            np.ndarray: Augment returns.

        """
        policy_opt_input_values = self._policy_opt_input_values(
            episodes, baselines)

        returns_tensor = self._f_returns(*policy_opt_input_values)
        returns_tensor = np.squeeze(returns_tensor, -1)

        paths = []
        valids = episodes.valids
        observations = episodes.padded_observations

        # Compute returns
        for ret, val, ob in zip(returns_tensor, valids, observations):
            returns = ret[val.astype(bool)]
            obs = ob[val.astype(bool)]
            paths.append(dict(observations=obs, returns=returns))

        # Fit baseline
        logger.log('Fitting baseline...')
        self._baseline.fit(paths)
        return returns_tensor
Example #22
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (dict): Processed sample data with an 'average_return' key.

        Returns:
            float: The average return in last epoch cycle.

        """
        epoch = itr // self.n_samples
        i_sample = itr - epoch * self.n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = paths['average_return']
        self.all_returns.append(paths['average_return'])

        if (itr + 1) % self.n_samples == 0:
            avg_rtns = np.array(self.all_returns)
            self.es.tell(self.all_params, -avg_rtns)
            self.policy.set_param_values(self.es.result()[0])

            # Clear for next epoch
            rtn = max(self.all_returns)
            self.all_returns.clear()
            self.all_params = self.sample_params()

        self.cur_params = self.all_params[(i_sample + 1) % self.n_samples]
        self.policy.set_param_values(self.cur_params)

        logger.log(tabular)
        return rtn
Example #23
def worker_init_envs(g, alloc, scope, env):
    """Initialize the allocated environment copies on a parallel worker."""
    logger.log('initializing environment on worker %d' % g.worker_id)
    if not hasattr(g, 'parallel_vec_envs'):
        g.parallel_vec_envs = dict()
        g.parallel_vec_env_template = dict()
    g.parallel_vec_envs[scope] = [(idx, pickle.loads(pickle.dumps(env)))
                                  for idx in alloc]
    g.parallel_vec_env_template[scope] = env
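The `pickle.loads(pickle.dumps(env))` idiom above is a deep copy through serialization, which also guarantees each per-index environment copy is picklable for the parallel workers. The same idiom in isolation:

import pickle

def clone_via_pickle(obj):
    """Deep-copy an object by round-tripping it through pickle."""
    return pickle.loads(pickle.dumps(obj))

# Toy stand-in for an environment: one independent copy per allocated index.
envs = [(idx, clone_via_pickle({'name': 'env'})) for idx in range(4)]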
Example #24
    def evaluate(self, algo, test_rollouts_per_task=None):
        """Evaluate the Meta-RL algorithm on the test tasks.

        Args:
            algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
            test_rollouts_per_task (int or None): Number of rollouts per task.

        """
        if test_rollouts_per_task is None:
            test_rollouts_per_task = self._n_test_rollouts
        adapted_trajectories = []
        logger.log('Sampling for adaptation and meta-testing...')
        if self._test_sampler is None:
            self._test_sampler = self._sampler_class.from_worker_factory(
                WorkerFactory(seed=get_seed(),
                              max_path_length=self._max_path_length,
                              n_workers=1,
                              worker_class=self._worker_class,
                              worker_args=self._worker_args),
                agents=algo.get_exploration_policy(),
                envs=self._test_task_sampler.sample(1))
        for env_up in self._test_task_sampler.sample(self._n_test_tasks):
            policy = algo.get_exploration_policy()
            traj = self._trajectory_batch_class.concatenate(*[
                self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                                  env_up)
                for _ in range(self._n_exploration_traj)
            ])
            adapted_policy = algo.adapt_policy(policy, traj)
            adapted_traj = self._test_sampler.obtain_samples(
                self._eval_itr, test_rollouts_per_task * self._max_path_length,
                adapted_policy)
            adapted_trajectories.append(adapted_traj)
        logger.log('Finished meta-testing...')

        if self._test_task_names is not None:
            name_map = dict(enumerate(self._test_task_names))
        else:
            name_map = None

        with tabular.prefix(self._prefix + '/' if self._prefix else ''):
            log_multitask_performance(
                self._eval_itr,
                self._trajectory_batch_class.concatenate(
                    *adapted_trajectories),
                getattr(algo, 'discount', 1.0),
                trajectory_class=self._trajectory_batch_class,
                name_map=name_map)
        self._eval_itr += 1

        if self._trajectory_batch_class == TrajectoryBatch:
            rewards = self._trajectory_batch_class.concatenate(
                *adapted_trajectories).rewards
        else:
            rewards = self._trajectory_batch_class.concatenate(
                *adapted_trajectories).env_rewards

        return sum(rewards) / len(rewards)
Example #25
    def optimize(self, inputs, extra_inputs=None, callback=None):

        if not inputs:
            # Assumes that we should always sample mini-batches
            raise NotImplementedError

        f_loss = self._opt_fun['f_loss']

        if extra_inputs is None:
            extra_inputs = tuple()

        last_loss = f_loss(*(tuple(inputs) + extra_inputs))

        start_time = time.time()

        dataset = BatchDataset(inputs,
                               self._batch_size,
                               extra_inputs=extra_inputs)

        sess = tf.get_default_session()

        for epoch in range(self._max_epochs):
            if self._verbose:
                logger.log('Epoch {}'.format(epoch))
                progbar = pyprind.ProgBar(len(inputs[0]))

            for batch in dataset.iterate(update=True):
                sess.run(self._train_op,
                         dict(list(zip(self._input_vars, batch))))
                if self._verbose:
                    progbar.update(len(batch[0]))

            if self._verbose:
                if progbar.active:
                    progbar.stop()

            new_loss = f_loss(*(tuple(inputs) + extra_inputs))

            if self._verbose:
                logger.log('Epoch: {} | Loss: {}'.format(epoch, new_loss))
            if self._callback or callback:
                elapsed = time.time() - start_time
                callback_args = dict(
                    loss=new_loss,
                    params=self._target.get_param_values(
                        trainable=True) if self._target else None,
                    itr=epoch,
                    elapsed=elapsed,
                )
                if self._callback:
                    self._callback(callback_args)
                if callback:
                    callback(**callback_args)

            if abs(last_loss - new_loss) < self._tolerance:
                break
            last_loss = new_loss
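The optional `callback` above is invoked once per epoch with the keyword arguments assembled in `callback_args` (loss, params, itr, elapsed). A hypothetical callback that consumes exactly those keys:

def log_progress(loss, params, itr, elapsed, **kwargs):
    """Hypothetical per-epoch callback for the optimizer shown above."""
    print('epoch {}: loss={:.4f} after {:.1f}s'.format(itr, loss, elapsed))

# Usage (assuming the optimizer class in this example):
# optimizer.optimize(inputs, callback=log_progress)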
Example #26
    def train_once(self, itr, paths):
        """Perform one step of policy optimization given one batch of samples.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            float: The average return in last epoch cycle.

        """
        # -- Stage: Calculate baseline
        if hasattr(self._baseline, 'predict_n'):
            baseline_predictions = self._baseline.predict_n(paths)
        else:
            baseline_predictions = [
                self._baseline.predict(path) for path in paths
            ]

        # -- Stage: Pre-process samples based on collected paths
        samples_data = paths_to_tensors(paths, self.max_path_length,
                                        baseline_predictions, self._discount)

        # -- Stage: Run and calculate performance of the algorithm
        undiscounted_returns = log_performance(
            itr,
            TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
            discount=self._discount)
        self._episode_reward_mean.extend(undiscounted_returns)
        tabular.record('Extras/EpisodeRewardMean',
                       np.mean(self._episode_reward_mean))
        samples_data['average_return'] = np.mean(undiscounted_returns)

        epoch = itr // self._n_samples
        i_sample = itr - epoch * self._n_samples

        tabular.record('Epoch', epoch)
        tabular.record('# Sample', i_sample)

        rtn = samples_data['average_return']
        self._all_returns.append(samples_data['average_return'])

        if (itr + 1) % self._n_samples == 0:
            avg_rtns = np.array(self._all_returns)
            self._es.tell(self._all_params, -avg_rtns)
            self.policy.set_param_values(self._es.best.get()[0])

            # Clear for next epoch
            rtn = max(self._all_returns)
            self._all_returns.clear()
            self._all_params = self._sample_params()

        self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
        self.policy.set_param_values(self._cur_params)

        logger.log(tabular)
        return rtn
Example #27
    def _make_context(self, *args, **kwargs):
        """Make a context from the template information and variant args.

        Currently, all arguments should be keyword arguments.

        Args:
            args (list): Should be empty.
            kwargs (dict): Keyword arguments for the wrapped function. Will be
                logged to `variant.json`

        Returns:
            ExperimentContext: The created experiment context.

        Raises:
            ValueError: If args is not empty.

        """
        if args:
            raise ValueError('garage.experiment currently only supports '
                             'keyword arguments')
        name = self.name
        if name is None:
            name = self.function.__name__
        if self.name_parameters:
            name = self._augment_name(name, kwargs)
        log_dir = self.log_dir
        if log_dir is None:
            log_dir = ('{data}/local/{prefix}/{name}'.format(
                data=os.path.join(os.getcwd(), 'data'),
                prefix=self.prefix,
                name=name))
        log_dir = _make_sequential_log_dir(log_dir)

        tabular_log_file = os.path.join(log_dir, 'progress.csv')
        text_log_file = os.path.join(log_dir, 'debug.log')
        variant_log_file = os.path.join(log_dir, 'variant.json')
        metadata_log_file = os.path.join(log_dir, 'metadata.json')

        dump_json(variant_log_file, kwargs)
        git_root_path, metadata = get_metadata()
        dump_json(metadata_log_file, metadata)
        if git_root_path and self.archive_launch_repo:
            make_launcher_archive(git_root_path=git_root_path, log_dir=log_dir)

        logger.add_output(dowel.TextOutput(text_log_file))
        logger.add_output(dowel.CsvOutput(tabular_log_file))
        logger.add_output(
            dowel.TensorBoardOutput(log_dir, x_axis='TotalEnvSteps'))
        logger.add_output(dowel.StdOutput())

        logger.push_prefix('[{}] '.format(name))
        logger.log('Logging to {}'.format(log_dir))

        return ExperimentContext(snapshot_dir=log_dir,
                                 snapshot_mode=self.snapshot_mode,
                                 snapshot_gap=self.snapshot_gap)
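This helper backs garage's `wrap_experiment` decorator, which builds the log directory, dumps the keyword arguments to `variant.json`, and wires up the dowel outputs before handing the function an `ExperimentContext`. A minimal usage sketch, assuming the garage API:

from garage import wrap_experiment

@wrap_experiment(snapshot_mode='last')
def my_experiment(ctxt=None, seed=1):
    """`ctxt` is the ExperimentContext produced by _make_context above."""
    print(ctxt.snapshot_dir)

# Calling it creates a data/local/<prefix>/my_experiment directory, logs
# there, and records seed=1 in variant.json:
# my_experiment(seed=1)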
Example #28
    def log_diagnostics(self, paths):
        """Log diagnostic information.

        Args:
            paths (list[dict]): A list of collected paths.

        """
        logger.log('Logging diagnostics...')
        self.policy.log_diagnostics(paths)
        self._baseline.log_diagnostics(paths)
Example #29
    def _tasks_adapt_train_once(self):
        """Run one epoch of task-adaptation updates over sampled train tasks."""
        for idx in range(self._num_steps_per_epoch):
            indices = np.random.choice(range(self._num_train_tasks),
                                       self._meta_batch_size)
            kl_loss, value_loss, policy_loss = (
                self._tasks_adapt_optimize_policy(indices))
            if idx % BATCH_PRINT == 0:
                logger.log('task adapt at batch {} with kl_loss {}, '
                           'value loss {} and policy loss {}'.format(
                               idx, kl_loss, value_loss, policy_loss))
Example #30
def evaluate_arduino(datum, send_type=int, return_type=float):
    """Dispatch `datum` to the evaluator matching the given send/return types."""
    if send_type is int and return_type is float:
        return_val = evaluate_int_float(datum)
    elif send_type is bool and return_type is bool:
        return_val = evaluate_bool_bool(datum)
    else:
        raise NotImplementedError
    if send_type is int and not isinstance(return_val, return_type):
        # Log the unexpected value and drop into the debugger for inspection.
        logger.log(return_val)
        import pdb
        pdb.set_trace()
    return return_val