def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such as
            snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None
    runner.enable_logging = False

    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_samples(runner.step_itr)
            for path in runner.step_path:
                path['rewards'] *= self._reward_scale
            last_return = self.train_once(runner.step_itr,
                                          runner.step_path)
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                log_performance(runner.step_itr,
                                obtain_evaluation_samples(
                                    self.policy, runner.get_env_copy()),
                                discount=self._discount)
            runner.step_itr += 1

    return last_return
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such as
            snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None

    for _ in runner.step_epochs():
        for cycle in range(self.steps_per_epoch):
            runner.step_path = runner.obtain_samples(runner.step_itr)
            for path in runner.step_path:
                path['rewards'] *= self.reward_scale
            last_return = self.train_once(runner.step_itr,
                                          runner.step_path)
            if cycle == 0 and self._buffer_prefilled:
                log_performance(runner.step_itr,
                                self._obtain_evaluation_samples(
                                    runner.get_env_copy()),
                                discount=self.discount)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            runner.step_itr += 1

    return last_return
def train(self, runner):
    """Get samples and train the policy.

    Args:
        runner (LocalRunner): Experiment runner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        EpisodeBatch.from_list(self.env_spec, samples),
                        self._discount)
        self._train_once(epoch, samples)
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        EpisodeBatch.from_list(self.env_spec, samples),
                        self._discount)
        self._train_once(samples)
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate and pad baselines
    baseline_predictions = [
        self._baseline.predict({'observations': obs})
        for obs in episodes.observations_list
    ]
    baselines = pad_batch_array(np.concatenate(baseline_predictions),
                                episodes.lengths, self.max_episode_length)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))

    logger.log('Optimizing policy...')
    self._optimize_policy(episodes, baselines)

    return np.mean(undiscounted_returns)
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]
    runner.enable_logging = False

    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_episodes(runner.step_itr)
            self.train_once(runner.step_itr, runner.step_path)
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                eval_eps = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_eps,
                                               discount=self._discount)
            runner.step_itr += 1

    return np.mean(last_returns)
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner, which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]
    runner.enable_logging = False
    qf_losses = []

    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_episodes(runner.step_itr)
            qf_losses.extend(
                self.train_once(runner.step_itr, runner.step_path))
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                eval_episodes = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_episodes,
                                               discount=self._discount)
            runner.step_itr += 1
        tabular.record('DQN/QFLossMean', np.mean(qf_losses))
        tabular.record('DQN/QFLossStd', np.std(qf_losses))

    return np.mean(last_returns)
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    average_return = np.mean(undiscounted_returns)

    logger.log('Optimizing policy...')
    self._optimize_policy(episodes)

    return average_return
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such as
            snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    last_returns = [float('nan')]
    runner.enable_logging = False

    for _ in runner.step_epochs():
        for cycle in range(self._steps_per_epoch):
            runner.step_path = runner.obtain_trajectories(runner.step_itr)
            self.train_once(runner.step_itr, runner.step_path)
            if (cycle == 0 and self.replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                runner.enable_logging = True
                eval_samples = obtain_evaluation_samples(
                    self.policy, self._eval_env)
                last_returns = log_performance(runner.step_itr,
                                               eval_samples,
                                               discount=self._discount)
            runner.step_itr += 1

    return np.mean(last_returns)
def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer, which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()
    last_returns = [float('nan')]
    trainer.enable_logging = False

    for _ in trainer.step_epochs():
        for cycle in range(self._steps_per_epoch):
            trainer.step_path = trainer.obtain_episodes(trainer.step_itr)
            if hasattr(self.exploration_policy, 'update'):
                self.exploration_policy.update(trainer.step_path)
            self._train_once(trainer.step_itr, trainer.step_path)
            if (cycle == 0 and self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                trainer.enable_logging = True
                eval_episodes = obtain_evaluation_episodes(
                    self.policy, self._eval_env)
                last_returns = log_performance(trainer.step_itr,
                                               eval_episodes,
                                               discount=self._discount)
            trainer.step_itr += 1

    return np.mean(last_returns)
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = EpisodeBatch(
        EnvSpec(
            akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
            akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array(
            [StepType.FIRST] + [StepType.MID] * (lengths[0] - 2) +
            [StepType.TERMINAL] + [StepType.FIRST] +
            [StepType.MID] * (lengths[1] - 2) + [StepType.TERMINAL] +
            [StepType.FIRST] + [StepType.FIRST],
            dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)

    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumEpisodes'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/TerminationRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
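# The helper below is an illustrative, pure-numpy approximation (not the
# library implementation) of the per-episode aggregates asserted in
# test_log_performance above: AverageReturn is the mean of undiscounted
# episode returns, AverageDiscountedReturn discounts each episode's rewards
# by discount**t before summing, SuccessRate is the fraction of episodes
# with at least one successful step, and TerminationRate is the fraction of
# episodes ending in StepType.TERMINAL. All names in this sketch are
# hypothetical.
import numpy as np


def summarize_episodes(rewards_by_episode, successes, terminals, discount):
    """Compute the aggregates checked by the assertions above."""
    undiscounted = [r.sum() for r in rewards_by_episode]
    discounted = [
        np.sum(r * discount**np.arange(len(r))) for r in rewards_by_episode
    ]
    return {
        'AverageReturn': np.mean(undiscounted),
        'StdReturn': np.std(undiscounted),
        'AverageDiscountedReturn': np.mean(discounted),
        'SuccessRate': np.mean(np.asarray(successes, dtype=float)),
        'TerminationRate': np.mean(np.asarray(terminals, dtype=float)),
    }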
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer, which provides services
            such as snapshotting and sampler control.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()
    trainer.enable_logging = False

    for _ in trainer.step_epochs():
        for cycle in range(self._steps_per_epoch):
            # Obtain a transition batch and store it in the replay buffer.
            # Within the warm-up steps, get actions from a uniform random
            # policy; afterwards, get actions from the exploration policy.
            if self._uniform_random_policy and \
                    trainer.step_itr < self._start_steps:
                trainer.step_path = trainer.obtain_episodes(
                    trainer.step_itr,
                    agent_update=self._uniform_random_policy)
            else:
                trainer.step_path = trainer.obtain_episodes(
                    trainer.step_itr,
                    agent_update=self.exploration_policy)
            self._replay_buffer.add_episode_batch(trainer.step_path)

            # Update after warm-up steps.
            if trainer.total_env_steps >= self._update_after:
                self._train_once(trainer.step_itr)

            # Evaluate and log the results: training episodes under the
            # 'Training' prefix, evaluation episodes under 'Evaluation'.
            if (cycle == 0 and self._replay_buffer.n_transitions_stored >=
                    self._min_buffer_size):
                trainer.enable_logging = True
                eval_eps = self._evaluate_policy()
                log_performance(trainer.step_itr,
                                trainer.step_path,
                                discount=self._discount,
                                prefix='Training')
                log_performance(trainer.step_itr,
                                eval_eps,
                                discount=self._discount,
                                prefix='Evaluation')
            trainer.step_itr += 1
def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer, for services such as
            snapshotting and sampler control.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()

    for epoch in trainer.step_epochs():
        if self._eval_env is not None:
            log_performance(epoch,
                            obtain_evaluation_episodes(
                                self.learner, self._eval_env),
                            discount=1.0)
        losses = self._train_once(trainer, epoch)
        with tabular.prefix(self._name + '/'):
            tabular.record('MeanLoss', np.mean(losses))
            tabular.record('StdLoss', np.std(losses))
def _train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate baseline
    paths = [
        dict(
            observations=path['observations'],
            actions=(
                self._env_spec.action_space.flatten_n(  # noqa: E126
                    path['actions'])),
            rewards=path['rewards'],
            env_infos=path['env_infos'],
            agent_infos=path['agent_infos'],
            dones=np.array([
                step_type == StepType.TERMINAL
                for step_type in path['step_types']
            ])) for path in paths
    ]

    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount,
                                    self._gae_lambda)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    logger.log('Optimizing policy...')
    self._optimize_policy(samples_data)

    return samples_data['average_return']
def train(self, trainer):
    """Obtain samplers and start actual training for each epoch.

    Args:
        trainer (Trainer): Experiment trainer.

    Returns:
        float: The average return in last epoch cycle.

    """
    if not self._eval_env:
        self._eval_env = trainer.get_env_copy()
    last_returns = [float('nan')]

    if self._min_buffer_size > self.replay_buffer.n_transitions_stored:
        num_warmup_steps = (self._min_buffer_size -
                            self.replay_buffer.n_transitions_stored)
        self.replay_buffer.add_episode_batch(
            trainer.obtain_episodes(0, num_warmup_steps))

    trainer.enable_logging = True

    for _ in trainer.step_epochs():
        if (self.replay_buffer.n_transitions_stored >=
                self._min_buffer_size):
            logger.log('Evaluating policy')

            params_before = self.exploration_policy.get_param_values()
            eval_eps = obtain_evaluation_episodes(
                (self.exploration_policy
                 if not self._deterministic_eval else self.policy),
                self._eval_env,
                num_eps=self._num_eval_episodes,
                max_episode_length=self._max_episode_length_eval)
            self.exploration_policy.set_param_values(params_before)

            last_returns = log_performance(trainer.step_itr,
                                           eval_eps,
                                           discount=self._discount)
            self._episode_reward_mean.extend(last_returns)
            tabular.record('Evaluation/100EpRewardMean',
                           np.mean(self._episode_reward_mean))

        for _ in range(self._steps_per_epoch):
            trainer.step_episode = trainer.obtain_episodes(
                trainer.step_itr)
            if hasattr(self.exploration_policy, 'update'):
                self.exploration_policy.update(trainer.step_episode)
            self._train_once(trainer.step_itr, trainer.step_episode)

        trainer.step_itr += 1

    return np.mean(last_returns)
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such as
            snapshotting and sampler control.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()

    for epoch in runner.step_epochs():
        if self._eval_env is not None:
            log_performance(epoch,
                            obtain_evaluation_samples(
                                self.learner, self._eval_env),
                            discount=1.0)
        losses = self._train_once(runner, epoch)
        with tabular.prefix(self._name + '/'):
            tabular.record('MeanLoss', np.mean(losses))
            tabular.record('StdLoss', np.std(losses))
def _obtain_samples(self, trainer, epoch):
    """Obtain samples from self._source.

    Args:
        trainer (Trainer): Experiment trainer, which may be used to
            obtain samples.
        epoch (int): The current epoch.

    Returns:
        TimeStepBatch: Batch of samples.

    """
    if isinstance(self._source, Policy):
        batch = trainer.obtain_episodes(epoch)
        log_performance(epoch, batch, 1.0, prefix='Expert')
        return batch
    else:
        batches = []
        while (sum(len(batch.actions) for batch in batches) <
               self._batch_size):
            batches.append(next(self._source))
        return TimeStepBatch.concatenate(*batches)
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (garage.EpisodeBatch): Episodes collected using the
            current policy.

    Returns:
        float: The average return of epoch cycle.

    """
    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    average_return = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = average_return
    self._all_returns.append(average_return)

    # -- Stage: Update policy distribution.
    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        best_inds = np.argsort(-avg_rtns)[:self._n_best]
        best_params = np.array(self._all_params)[best_inds]

        # MLE of normal distribution
        self._cur_mean = best_params.mean(axis=0)
        self._cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self._cur_mean)

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params.clear()

    # -- Stage: Generate a new policy for next path sampling
    self._cur_params = self._sample_params(itr)
    self._all_params.append(self._cur_params.copy())
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
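# A plausible sketch of the _sample_params step used above, whose body is
# not shown in this excerpt: the cross-entropy method draws each new
# candidate parameter vector from the normal distribution fitted to the
# best candidates of the previous epoch. The extra_std/extra_decay_time
# names are illustrative assumptions for the common trick of adding
# decaying exploration noise; the real method may differ.
import numpy as np


def sample_params(itr, cur_mean, cur_std, extra_std=1.0,
                  extra_decay_time=100):
    """Draw one candidate parameter vector for iteration itr."""
    # Extra noise keeps early epochs exploratory and decays to zero.
    extra_var_mult = max(1.0 - itr / extra_decay_time, 0.0)
    sample_std = np.sqrt(
        np.square(cur_std) + np.square(extra_std) * extra_var_mult)
    return np.random.normal(cur_mean, sample_std)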
def _obtain_samples(self, runner, epoch):
    """Obtain samples from self._source.

    Args:
        runner (LocalRunner): LocalRunner, which may be used to obtain
            samples.
        epoch (int): The current epoch.

    Returns:
        TimeStepBatch: Batch of samples.

    """
    if isinstance(self._source, Policy):
        batch = TrajectoryBatch.from_trajectory_list(
            self.env_spec, runner.obtain_samples(epoch))
        log_performance(epoch, batch, 1.0, prefix='Expert')
        return batch
    else:
        batches = []
        while (sum(len(batch.actions) for batch in batches) <
               self._batch_size):
            batches.append(next(self._source))
        return TimeStepBatch.concatenate(*batches)
def evaluate(self, algo):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.

    """
    adapted_trajectories = []
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = TrajectoryBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr, 1, adapted_policy)
        adapted_trajectories.append(adapted_traj)
    log_performance(self._eval_itr,
                    TrajectoryBatch.concatenate(*adapted_trajectories),
                    getattr(algo, 'discount', 1.0),
                    prefix=self._prefix)
    self._eval_itr += 1
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    obs, actions, rewards, valids, baselines = self.process_samples(
        itr, paths)

    loss = self._compute_loss(itr, obs, actions, rewards, valids,
                              baselines)

    self._old_policy.load_state_dict(self.policy.state_dict())

    self._optimizer.zero_grad()
    loss.backward()

    kl_before = self._compute_kl_constraint(obs).detach()
    self._optimize(itr, obs, actions, rewards, valids, baselines)

    with torch.no_grad():
        loss_after = self._compute_loss(itr, obs, actions, rewards,
                                        valids, baselines)
        kl = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    average_returns = log_performance(itr,
                                      TrajectoryBatch.from_trajectory_list(
                                          self.env_spec, paths),
                                      discount=self.discount)

    with tabular.prefix(self.policy.name):
        tabular.record('LossBefore', loss.item())
        tabular.record('LossAfter', loss_after.item())
        tabular.record('dLoss', loss.item() - loss_after.item())
        tabular.record('KLBefore', kl_before.item())
        tabular.record('KL', kl.item())
        tabular.record('Entropy', policy_entropy.mean().item())

    self.baseline.fit(paths)
    return np.mean(average_returns)
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (garage.EpisodeBatch): Episodes collected using the
            current policy.

    Returns:
        float: The average return of epoch cycle.

    """
    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    average_return = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)

    rtn = average_return
    self._all_returns.append(average_return)

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give the algorithm
            access to runner.step_epochs(), which provides services such as
            snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None

    for _ in runner.step_epochs():
        for _ in range(self.steps_per_epoch):
            if not self._buffer_prefilled:
                batch_size = int(self.min_buffer_size)
            else:
                batch_size = None
            runner.step_path = runner.obtain_samples(
                runner.step_itr, batch_size)
            path_returns = []
            for path in runner.step_path:
                self.replay_buffer.add_transitions(
                    observation=path['observations'],
                    action=path['actions'],
                    reward=path['rewards'],
                    next_observation=path['next_observations'],
                    terminal=path['dones'])
                path_returns.append(sum(path['rewards']))
            assert len(path_returns) == len(runner.step_path)
            self.episode_rewards.append(np.mean(path_returns))
            for _ in range(self._gradient_steps):
                policy_loss, qf1_loss, qf2_loss = self.train_once()
        last_return = log_performance(runner.step_itr,
                                      self._obtain_evaluation_samples(
                                          runner.get_env_copy(),
                                          num_trajs=10),
                                      discount=self.discount)
        self._log_statistics(policy_loss, qf1_loss, qf2_loss)
        tabular.record('TotalEnvSteps', runner.total_env_steps)
        runner.step_itr += 1

    return np.mean(last_return)
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across
            self._num_evaluation_trajectories trajectories.

    """
    eval_trajectories = self._obtain_evaluation_samples(
        self._eval_env, num_trajs=self._num_evaluation_trajectories)
    last_return = log_performance(epoch,
                                  eval_trajectories,
                                  discount=self.discount)
    return last_return
def _train_once(self, itr, episodes):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Batch of episodes.

    Returns:
        numpy.float64: Average return.

    """
    undiscounted_returns = log_performance(itr,
                                           episodes,
                                           discount=self._discount)

    # Calculate baseline predictions
    baselines = []
    start = 0
    for length in episodes.lengths:
        stop = start + length
        baseline = self._baseline.predict(
            dict(observations=episodes.observations[start:stop],
                 tasks=episodes.env_infos['task_onehot'][start:stop],
                 latents=episodes.agent_infos['latent'][start:stop]))
        baselines.append(baseline)
        start = stop
    baselines = pad_batch_array(np.concatenate(baselines),
                                episodes.lengths, self.max_episode_length)

    # Process trajectories
    embed_eps, embed_ep_infos = self._process_episodes(episodes)

    average_return = np.mean(undiscounted_returns)

    logger.log('Optimizing policy...')
    self._optimize_policy(itr, episodes, baselines, embed_eps,
                          embed_ep_infos)

    return average_return
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)

    samples_data = self.paths_to_tensors(paths)
    samples_data['average_return'] = np.mean(undiscounted_returns)

    logger.log('Optimizing policy...')
    self.optimize_policy(itr, samples_data)

    return samples_data['average_return']
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic sampling.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across self._num_evaluation_episodes
            episodes.

    """
    eval_episodes = obtain_evaluation_episodes(
        self.policy,
        self._eval_env,
        self._max_episode_length_eval,
        num_eps=self._num_evaluation_episodes,
        deterministic=self._use_deterministic_evaluation)
    last_return = log_performance(epoch,
                                  eval_episodes,
                                  discount=self._discount)
    return last_return
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self.discount)
    return np.mean(undiscounted_returns)
def process_samples(self, itr, paths):  # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []
    total_steps = 0

    max_path_length = self.max_path_length

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    lengths = np.asarray([v.sum() for v in valids])

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    return samples_data
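# A minimal sketch (assumed semantics, not garage's actual helper) of the
# discounted cumulative sum that process_samples above relies on, both for
# the GAE advantages (discounting `deltas` by discount * gae_lambda) and
# for the per-step returns (discounting `rewards` by discount):
# out[t] = sum_k gamma**k * x[t + k].
import numpy as np


def discount_cumsum(x, gamma):
    """Return the discounted cumulative sum of a 1-D array."""
    out = np.zeros(len(x))
    running = 0.0
    # Walk backwards so each entry accumulates the discounted tail of
    # everything that comes after it.
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out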