def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer, which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()
    last_returns = [float('nan')]
    trainer.enable_logging = False
    for _ in trainer.step_epochs():
        for cycle in range(self._steps_per_epoch):
            trainer.step_path = trainer.obtain_episodes(trainer.step_itr)
            if hasattr(self.exploration_policy, 'update'):
                self.exploration_policy.update(trainer.step_path)
            self._train_once(trainer.step_itr, trainer.step_path)
            if (cycle == 0 and self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                trainer.enable_logging = True
                eval_episodes = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(trainer.step_itr,
                                               eval_episodes,
                                               discount=self._discount)
            trainer.step_itr += 1

    return np.mean(last_returns)
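A minimal, self-contained sketch of the Trainer services this loop relies on
(`step_epochs`, `obtain_episodes`, `step_itr`, `enable_logging`). `MockTrainer`
is hypothetical and only illustrates the contract; the real garage Trainer
additionally handles snapshotting and sampler management.

class MockTrainer:
    """Hypothetical stand-in showing the interface `train` consumes."""

    def __init__(self, n_epochs):
        self._n_epochs = n_epochs
        self.step_itr = 0
        self.step_path = None
        self.enable_logging = True

    def step_epochs(self):
        # Yields once per epoch; the real Trainer also snapshots and
        # flushes the epoch's logs at each yield point.
        for epoch in range(self._n_epochs):
            yield epoch

    def obtain_episodes(self, itr):
        # The real Trainer delegates to its sampler, collecting episodes
        # with the algorithm's (exploration) policy at iteration `itr`.
        return []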
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic sampling.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across self._num_evaluation_episodes
            episodes.

    """
    eval_eps = []
    for eval_env in self._eval_env:
        eval_eps.append(
            obtain_evaluation_episodes(
                self.policy,
                eval_env,
                self._max_episode_length_eval,
                num_eps=self._num_evaluation_episodes,
                deterministic=self._use_deterministic_evaluation))
    eval_eps = EpisodeBatch.concatenate(*eval_eps)
    last_return = log_multitask_performance(epoch, eval_eps, self._discount)
    return last_return
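A hedged sketch of the core statistic this evaluation records: the discounted
return of a single episode, given its reward sequence and the discount factor
passed as `self._discount` above.

import numpy as np

def discounted_return(rewards, discount):
    # Sum of rewards weighted by discount**t, for t = 0, 1, ..., T-1.
    rewards = np.asarray(rewards, dtype=float)
    return float(np.sum(rewards * discount ** np.arange(len(rewards))))

# e.g. discounted_return([1.0, 1.0, 1.0], 0.99) == 1.0 + 0.99 + 0.9801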
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]
    runner.enable_logging = False
    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_episodes(runner.step_itr)
            self.train_once(runner.step_itr, runner.step_path)
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                eval_eps = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_eps,
                                               discount=self._discount)
            runner.step_itr += 1

    return np.mean(last_returns)
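A hedged sketch of the logging gate used in this loop: evaluation (and epoch
logging) stays disabled until the replay buffer holds at least
`min_buffer_size` transitions, and then runs once per epoch, on cycle 0.

def should_evaluate(cycle, n_transitions_stored, min_buffer_size):
    # Mirrors the condition guarding evaluation in the loop above.
    return cycle == 0 and n_transitions_stored >= min_buffer_size

assert not should_evaluate(0, 500, 1000)   # buffer still warming up
assert should_evaluate(0, 1500, 1000)      # first cycle of an epoch
assert not should_evaluate(3, 1500, 1000)  # not the first cycle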
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner, which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]
    runner.enable_logging = False
    qf_losses = []
    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_episodes(runner.step_itr)
            qf_losses.extend(
                self.train_once(runner.step_itr, runner.step_path))
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                eval_episodes = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_episodes,
                                               discount=self._discount)
            runner.step_itr += 1
        tabular.record('DQN/QFLossMean', np.mean(qf_losses))
        tabular.record('DQN/QFLossStd', np.std(qf_losses))

    return np.mean(last_returns)
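A hedged sketch of the dowel logging pattern used above: `tabular.record`
only stages a key/value pair, and a separate `logger.log(tabular)` call
(issued by the training harness once per epoch) actually writes the table to
the configured outputs.

from dowel import logger, tabular

tabular.record('DQN/QFLossMean', 0.42)  # staged in the current table
logger.log(tabular)                     # flushes the staged table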
def _evaluate_policy(self):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Returns:
        EpisodeBatch: Evaluation episodes, representing the best current
            performance of the algorithm.

    """
    return obtain_evaluation_episodes(
        self.exploration_policy,
        self._eval_env,
        self._max_episode_length_eval,
        num_eps=self._num_evaluation_episodes,
        deterministic=self._use_deterministic_evaluation)
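A hedged sketch of recovering per-episode statistics from the returned batch,
assuming the garage-style layout of a flattened `rewards` array plus a
per-episode `lengths` array (those attribute names are an assumption here):

import numpy as np

def per_episode_returns(rewards, lengths):
    # Split the flat reward array at episode boundaries, then sum each
    # chunk to get the undiscounted return of every evaluation episode.
    bounds = np.cumsum(lengths)[:-1]
    return [float(np.sum(chunk)) for chunk in np.split(rewards, bounds)]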
def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer, for services such as
            snapshotting and sampler control.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()
    for epoch in trainer.step_epochs():
        if self._eval_env is not None:
            log_performance(epoch,
                            obtain_evaluation_episodes(
                                self.learner, self._eval_env),
                            discount=1.0)
        losses = self._train_once(trainer, epoch)
        with tabular.prefix(self._name + '/'):
            tabular.record('MeanLoss', np.mean(losses))
            tabular.record('StdLoss', np.std(losses))
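A hedged sketch of the `tabular.prefix` pattern used above: keys recorded
inside the context manager are namespaced under the given prefix, so an
algorithm whose `self._name` is 'BC' would log 'BC/MeanLoss' (the name is an
assumption for illustration).

from dowel import tabular

with tabular.prefix('BC/'):
    tabular.record('MeanLoss', 0.12)  # appears in logs as 'BC/MeanLoss'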