def train_once(self, itr, paths):
    """Consume one batch of paths and advance the parameter search.

    Each call evaluates the parameter set that was active while `paths`
    were collected, records statistics, and then installs the next
    candidate from `self._all_params` into the policy. Once every
    `self._n_samples` calls (the end of an epoch) the collected returns
    are reported to `self._es` and a fresh set of candidates is drawn.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: Mean of the returns reported by `log_performance` for this
            batch. On the final iteration of an epoch, the largest such
            mean recorded during that epoch instead.

    """
    # -- Stage: Calculate baseline
    # Prefer the batched predictor when the baseline exposes one.
    baseline_predictions = (
        self._baseline.predict_n(paths)
        if hasattr(self._baseline, 'predict_n')
        else [self._baseline.predict(path) for path in paths]
    )

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(
        paths,
        self.max_episode_length,
        baseline_predictions,
        self._discount,
    )

    # -- Stage: Run and calculate performance of the algorithm
    episodes = EpisodeBatch.from_list(self._env_spec, paths)
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    # Position of this iteration: which epoch, and which candidate in it.
    epoch, i_sample = divmod(itr, self._n_samples)
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    # -- Stage: Remember how the current candidate performed
    batch_return = samples_data['average_return']
    self._all_returns.append(batch_return)
    rtn = batch_return

    # -- Stage: End of epoch, report results and draw new candidates
    epoch_finished = (itr + 1) % self._n_samples == 0
    if epoch_finished:
        # Returns are negated before being handed to the search object
        # (presumably because it minimizes its objective -- confirm
        # against the ES implementation in use).
        negated_returns = -np.array(self._all_returns)
        self._es.tell(self._all_params, negated_returns)
        best_so_far = self._es.best.get()[0]
        self.policy.set_param_values(best_so_far)

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    # -- Stage: Install the candidate to be evaluated by the next batch
    # After a resample the modulo wraps this back to index 0.
    next_index = (i_sample + 1) % self._n_samples
    self._cur_params = self._all_params[next_index]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def train_once(self, itr, paths):
    """Consume one batch of paths and advance the sampling distribution.

    Each call records how the currently installed parameters performed
    on `paths`, then draws and installs a new parameter vector for the
    next batch. Once every `self._n_samples` calls (the end of an epoch)
    the `self._n_best` highest-return parameter vectors of the epoch are
    used to refit `self._cur_mean` and `self._cur_std`.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: Mean of the returns reported by `log_performance` for this
            batch. On the final iteration of an epoch, the largest such
            mean recorded during that epoch instead.

    """
    # -- Stage: Calculate baseline
    # Prefer the batched predictor when the baseline exposes one.
    baseline_predictions = (
        self._baseline.predict_n(paths)
        if hasattr(self._baseline, 'predict_n')
        else [self._baseline.predict(path) for path in paths]
    )

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(
        paths,
        self.max_path_length,
        baseline_predictions,
        self._discount,
    )

    # -- Stage: Run and calculate performance of the algorithm
    trajectories = TrajectoryBatch.from_trajectory_list(
        self._env_spec, paths)
    undiscounted_returns = log_performance(itr,
                                           trajectories,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    # Position of this iteration: which epoch, and which sample in it.
    epoch, i_sample = divmod(itr, self._n_samples)
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    # -- Stage: Process samples_data
    batch_return = samples_data['average_return']
    self._all_returns.append(batch_return)
    rtn = batch_return

    # -- Stage: Update policy distribution.
    epoch_finished = (itr + 1) % self._n_samples == 0
    if epoch_finished:
        epoch_returns = np.array(self._all_returns)
        # Sorting the negated returns ascending ranks candidates from
        # highest to lowest return; keep the first `_n_best` of them.
        elite_idx = np.argsort(-epoch_returns)[:self._n_best]
        elite_params = np.array(self._all_params)[elite_idx]

        # MLE of normal distribution
        self._cur_mean = elite_params.mean(axis=0)
        self._cur_std = elite_params.std(axis=0)
        self.policy.set_param_values(self._cur_mean)

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params.clear()

    # -- Stage: Generate a new policy for next path sampling
    self._cur_params = self._sample_params(itr)
    # Store a copy of the sampled vector alongside the return it will earn.
    self._all_params.append(self._cur_params.copy())
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn