# NOTE: these excerpts assume the usual module-level imports from the
# surrounding codebase (numpy as np, dowel's `tabular`, and helpers such
# as log_performance, log_multitask_performance,
# obtain_evaluation_samples and TrajectoryBatch).
def train(self, runner):
    """Obtain samples and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the
            algorithm access to runner.step_epochs(), which provides
            services such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None
    runner.enable_logging = False

    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_samples(runner.step_itr)
            # Scale rewards before they reach the replay buffer.
            for path in runner.step_path:
                path['rewards'] *= self._reward_scale
            last_return = self.train_once(runner.step_itr,
                                          runner.step_path)
            # Evaluate once per epoch, but only after the replay buffer
            # holds enough transitions to train on.
            if (cycle == 0 and self.replay_buffer.n_transitions_stored
                    >= self._min_buffer_size):
                runner.enable_logging = True
                log_performance(runner.step_itr,
                                obtain_evaluation_samples(
                                    self.policy, runner.get_env_copy()),
                                discount=self._discount)
            runner.step_itr += 1

    return last_return
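# For orientation: train() above leans on the runner contract, where
# step_epochs() is a generator yielding once per epoch (the real runner
# snapshots between yields) and step_itr is the global iteration counter
# the algorithm advances itself. Below is a minimal, runnable sketch of
# that contract; MockRunner and its dummy path are hypothetical
# stand-ins for illustration only, not the real LocalRunner.
class MockRunner:

    def __init__(self, n_epochs):
        self._n_epochs = n_epochs
        self.step_itr = 0           # global iteration counter
        self.enable_logging = True

    def step_epochs(self):
        # Yield once per epoch; the real runner saves a snapshot and
        # manages sampler state between yields.
        for epoch in range(self._n_epochs):
            yield epoch

    def obtain_samples(self, itr):
        # The real runner would invoke its sampler; return one dummy
        # path in the same list-of-dicts shape used above.
        return [{'rewards': [1.0, 0.5, 0.25]}]


runner = MockRunner(n_epochs=2)
for _ in runner.step_epochs():
    paths = runner.obtain_samples(runner.step_itr)
    runner.step_itr += 1
print(runner.step_itr)  # -> 2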
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate
    are recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across
            self._num_evaluation_trajectories trajectories.

    """
    eval_trajs = []
    for _ in range(self._num_tasks):
        eval_trajs.append(
            obtain_evaluation_samples(
                self.policy,
                self._eval_env,
                num_trajs=self._num_evaluation_trajectories))
    eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
    last_return = log_multitask_performance(epoch, eval_trajs,
                                            self._discount)
    return last_return
def train(self, runner):
    """Obtain samples and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the
            algorithm access to runner.step_epochs(), which provides
            services such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]
    runner.enable_logging = False

    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_trajectories(runner.step_itr)
            self.train_once(runner.step_itr, runner.step_path)
            # Evaluate once per epoch, after the replay buffer has
            # enough transitions to train on.
            if (cycle == 0 and self.replay_buffer.n_transitions_stored
                    >= self._min_buffer_size):
                runner.enable_logging = True
                eval_samples = obtain_evaluation_samples(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_samples,
                                               discount=self._discount)
            runner.step_itr += 1

    return np.mean(last_returns)
def train(self, runner):
    """Obtain samples and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the
            algorithm access to runner.step_epochs(), which provides
            services such as snapshotting and sampler control.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    for epoch in runner.step_epochs():
        # Evaluate the learner with undiscounted returns (discount=1.0)
        # before each round of training.
        if self._eval_env is not None:
            log_performance(epoch,
                            obtain_evaluation_samples(
                                self.learner, self._eval_env),
                            discount=1.0)
        losses = self._train_once(runner, epoch)
        with tabular.prefix(self._name + '/'):
            tabular.record('MeanLoss', np.mean(losses))
            tabular.record('StdLoss', np.std(losses))
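# The MeanLoss/StdLoss logging above goes through the `tabular` object;
# assuming it comes from the dowel logging library, the pattern can be
# exercised standalone as below. Key names and loss values are made up
# for illustration.
import numpy as np
from dowel import StdOutput, logger, tabular

logger.add_output(StdOutput())

losses = [0.9, 0.7, 0.8]
with tabular.prefix('BC/'):
    tabular.record('MeanLoss', np.mean(losses))
    tabular.record('StdLoss', np.std(losses))

logger.log(tabular)  # send BC/MeanLoss and BC/StdLoss to the outputs
logger.dump_all()    # flush any buffered outputs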
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate
    are recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across
            self._num_evaluation_trajectories trajectories.

    """
    eval_trajectories = obtain_evaluation_samples(
        self.policy,
        self._eval_env,
        num_trajs=self._num_evaluation_trajectories)
    last_return = log_performance(epoch,
                                  eval_trajectories,
                                  discount=self._discount)
    return last_return
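# For reference, the per-trajectory statistic that log_performance
# discounts is sum_t discount**t * r_t. A hypothetical helper makes it
# concrete; `discount` plays the role of self._discount above.
def discounted_return(rewards, discount):
    # Discounted return of a single trajectory.
    return sum(r * discount**t for t, r in enumerate(rewards))


print(discounted_return([1.0, 1.0, 1.0], 0.9))  # 1 + 0.9 + 0.81 = 2.71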