def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        EpisodeBatch: Processed batch of episodes for feeding the inner
            algorithm.
        numpy.float64: The average return.

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = discount_cumsum(path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling.')

    # All paths in paths_by_task[i] are sampled from task[i].
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        self._inner_algo._discount,
        name_map=name_map)

    average_return = np.mean(undiscounted_returns)

    episodes = EpisodeBatch.from_list(self._env_spec, concatenated_paths)

    return episodes, average_return

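# A minimal reference sketch (not the library implementation) of the
# discounted return computed by discount_cumsum above:
#     returns[t] = sum_{k >= t} discount**(k - t) * rewards[k]
# The backward pass below is an illustrative, readable equivalent.
import numpy as np


def _discount_cumsum_reference(rewards, discount):
    """Readable stand-in for discount_cumsum, for illustration only."""
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns


# Example: rewards [1, 1, 1] with discount 0.9 -> [2.71, 1.9, 1.0].
assert np.allclose(_discount_cumsum_reference([1., 1., 1.], 0.9),
                   [2.71, 1.9, 1.0])
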
def obtain_evaluation_episodes(policy,
                               env,
                               max_episode_length=1000,
                               num_eps=100,
                               deterministic=True):
    """Sample the policy for num_eps episodes and return average values.

    Args:
        policy (Policy): Policy to use as the actor when gathering samples.
        env (Environment): The environment used to obtain episodes.
        max_episode_length (int): Maximum episode length. The episode will
            be truncated when its length reaches max_episode_length.
        num_eps (int): Number of episodes.
        deterministic (bool): Whether a deterministic approach is used in
            the rollout.

    Returns:
        EpisodeBatch: Evaluation episodes, representing the best current
            performance of the algorithm.

    """
    episodes = []
    # Use a finite length rollout for evaluation.
    with click.progressbar(range(num_eps), label='Evaluating') as pbar:
        for _ in pbar:
            eps = rollout(env,
                          policy,
                          max_episode_length=max_episode_length,
                          deterministic=deterministic)
            episodes.append(eps)
    return EpisodeBatch.from_list(env.spec, episodes)

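# Hypothetical usage sketch, not part of the library: evaluate a policy and
# log its undiscounted return. `policy` and `eval_env` are assumed to be an
# already-constructed garage Policy and Environment; only the signature of
# obtain_evaluation_episodes defined above and log_performance are relied on.
def _log_evaluation(epoch, policy, eval_env):
    eval_eps = obtain_evaluation_episodes(policy,
                                          eval_env,
                                          max_episode_length=500,
                                          num_eps=10,
                                          deterministic=True)
    # discount=1.0 so the logged return equals the raw episode reward sum.
    return log_performance(epoch, eval_eps, discount=1.0, prefix='Evaluation')
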
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in the last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)
    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn

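# Illustrative only: how the bookkeeping above maps the flat iteration
# counter onto epochs of `n_samples` candidate parameter vectors. The test
# (itr + 1) % n_samples == 0 fires exactly on the last sample of each epoch,
# which is when the collected returns are fed back via self._es.tell().
n_samples = 3
for itr in range(6):
    epoch, i_sample = divmod(itr, n_samples)
    last_in_epoch = (itr + 1) % n_samples == 0
    assert last_in_epoch == (i_sample == n_samples - 1)
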
def _train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate baseline
    paths = [
        dict(
            observations=path['observations'],
            actions=(
                self._env_spec.action_space.flatten_n(  # noqa: E126
                    path['actions'])),
            rewards=path['rewards'],
            env_infos=path['env_infos'],
            agent_infos=path['agent_infos'],
            dones=np.array([
                step_type == StepType.TERMINAL
                for step_type in path['step_types']
            ])) for path in paths
    ]

    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount,
                                    self._gae_lambda)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))

    samples_data['average_return'] = np.mean(undiscounted_returns)

    logger.log('Optimizing policy...')
    self._optimize_policy(samples_data)

    return samples_data['average_return']

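# Illustrative only: how the `dones` flags above are derived from step types.
# Only StepType.TERMINAL counts as done; StepType.TIMEOUT (truncation at the
# max episode length) does not, so truncated episodes are not mistaken for
# true environment terminations. Assumes StepType is importable from garage.
import numpy as np
from garage import StepType

step_types = [StepType.FIRST, StepType.MID, StepType.TIMEOUT]
dones = np.array([step_type == StepType.TERMINAL for step_type in step_types])
assert not dones.any()
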
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        EpisodeBatch.from_list(self.env_spec, samples),
                        self._discount)
        self._train_once(samples)

def train(self, runner):
    """Get samples and train the policy.

    Args:
        runner (LocalRunner): LocalRunner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        EpisodeBatch.from_list(self.env_spec, samples),
                        self._discount)
        self._train_once(epoch, samples)

def _log_performance(self, itr, all_samples, loss_before, loss_after,
                     kl_before, kl, policy_entropy):
    """Evaluate performance of this batch.

    Args:
        itr (int): Iteration number.
        all_samples (list[list[_MAMLEpisodeBatch]]): Two dimensional list
            of _MAMLEpisodeBatch of size
            [meta_batch_size * (num_grad_updates + 1)].
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in the last epoch cycle.

    """
    tabular.record('Iteration', itr)

    name_map = None
    if hasattr(self._env, 'all_task_names'):
        names = self._env.all_task_names
        name_map = dict(zip(names, names))

    rtns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(
            env_spec=self._env.spec,
            paths=[
                path for task_paths in all_samples
                for path in task_paths[self._num_grad_updates].paths
            ]),
        discount=self._inner_algo.discount,
        name_map=name_map)

    with tabular.prefix(self._policy.name + '/'):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KLAfter', kl)
        tabular.record('Entropy', policy_entropy)

    return np.mean(rtns)

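# Illustrative only: the nested comprehension above selects the
# post-adaptation episodes. all_samples[task][round] holds one
# _MAMLEpisodeBatch per sampling round, and only the last round (index
# num_grad_updates, i.e. after the inner-loop gradient steps) is logged.
# The toy stand-in below uses plain lists in place of the .paths attribute.
num_grad_updates = 1
all_samples = [[['task0-pre'], ['task0-post']],
               [['task1-pre'], ['task1-post']]]  # 2 tasks x 2 rounds
post_adaptation = [
    path for task_paths in all_samples
    for path in task_paths[num_grad_updates]
]
assert post_adaptation == ['task0-post', 'task1-post']
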
def _obtain_samples(self, trainer, epoch):
    """Obtain samples from self._source.

    Args:
        trainer (Trainer): Experiment trainer, which may be used to
            obtain samples.
        epoch (int): The current epoch.

    Returns:
        TimeStepBatch: Batch of samples.

    """
    if isinstance(self._source, Policy):
        batch = EpisodeBatch.from_list(self._env_spec,
                                       trainer.obtain_samples(epoch))
        log_performance(epoch, batch, 1.0, prefix='Expert')
        return batch
    else:
        batches = []
        while (sum(len(batch.actions) for batch in batches)
               < self._batch_size):
            batches.append(next(self._source))
        return TimeStepBatch.concatenate(*batches)

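# Illustrative only (plain lists instead of TimeStepBatch): the else-branch
# above keeps drawing pre-collected batches from an iterator until the total
# number of actions reaches self._batch_size, then concatenates them. The
# same accumulation pattern:
def _accumulate(source, batch_size):
    batches = []
    while sum(len(batch) for batch in batches) < batch_size:
        batches.append(next(source))
    return [step for batch in batches for step in batch]


# An iterator of 4-step batches with batch_size=10 collects 12 steps.
assert len(_accumulate(iter([[0] * 4] * 5), 10)) == 12
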
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self._discount)
    samples_data = self.paths_to_tensors(paths)
    samples_data['average_return'] = np.mean(undiscounted_returns)
    logger.log('Optimizing policy...')
    self.optimize_policy(itr, samples_data)
    return samples_data['average_return']

def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self.discount)
    return np.mean(undiscounted_returns)

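# Minimal reference sketch (not the library implementation) of the
# filter_valids helper used above: keep only the first `valid` timesteps of
# each padded per-episode tensor, so padding is dropped before the episodes
# are concatenated into flat tensors with torch.cat.
import torch


def _filter_valids_reference(padded, valids):
    """Return per-episode tensors truncated to their valid lengths."""
    return [episode[:int(valid)] for episode, valid in zip(padded, valids)]


# Example: two episodes padded to length 4 with true lengths 2 and 3.
padded = torch.zeros(2, 4)
flat = torch.cat(_filter_valids_reference(padded, [2, 3]))
assert flat.shape[0] == 5
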
def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = discount_cumsum(path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling.')

    # All paths in paths_by_task[i] are sampled from task[i].
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    # Stack and pad to the max path length of the concatenated paths,
    # which will be fed to the inner algo,
    # i.e. max_episode_length * episode_per_task.
    concatenated_paths_stacked = (stack_and_pad_tensor_dict_list(
        concatenated_paths, self._inner_algo.max_episode_length))

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        self._inner_algo._discount,
        name_map=name_map)

    concatenated_paths_stacked['paths'] = concatenated_paths
    concatenated_paths_stacked['average_return'] = np.mean(
        undiscounted_returns)

    return concatenated_paths_stacked

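# Illustrative only: what "stack and pad to the max episode length" means for
# a single key of the concatenated paths, using plain NumPy in place of
# stack_and_pad_tensor_dict_list. Two concatenated paths of lengths 3 and 5
# are zero-padded to length 5 and stacked into a (2, 5) array.
import numpy as np

rewards_a = np.ones(3)
rewards_b = np.ones(5)
max_len = 5
stacked = np.stack([
    np.pad(rewards_a, (0, max_len - len(rewards_a))),
    np.pad(rewards_b, (0, max_len - len(rewards_b))),
])
assert stacked.shape == (2, 5)
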