def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give algorithm
            the access to runner.step_epochs(), which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None

    for _ in runner.step_epochs():
        for cycle in range(self.steps_per_epoch):
            runner.step_path = runner.obtain_samples(runner.step_itr)
            for path in runner.step_path:
                path['rewards'] *= self.reward_scale
            last_return = self.train_once(runner.step_itr,
                                          runner.step_path)
            if cycle == 0 and self.evaluate:
                log_performance(runner.step_itr,
                                self._obtain_evaluation_samples(
                                    runner.get_env_copy()),
                                discount=self.discount)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            runner.step_itr += 1

    return last_return
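# A self-contained sketch of how a step_epochs()-style generator can wrap
# per-epoch services (snapshotting, timing) around the caller's loop body,
# which is what train() above relies on. 'MiniRunner' is a hypothetical
# stand-in for illustration, not garage's LocalRunner implementation.
class MiniRunner:
    def __init__(self, n_epochs):
        self.n_epochs = n_epochs
        self.step_itr = 0

    def step_epochs(self):
        for epoch in range(self.n_epochs):
            yield epoch                 # caller runs its cycles here
            self._save_snapshot(epoch)  # service applied after each epoch

    def _save_snapshot(self, epoch):
        print('snapshot at epoch {}, itr {}'.format(epoch, self.step_itr))


runner = MiniRunner(n_epochs=2)
for _ in runner.step_epochs():
    runner.step_itr += 1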
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_path_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)
    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
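# The (itr + 1) % self._n_samples == 0 branch above is one generation of
# an ask/tell evolution strategy: n_samples candidate parameter vectors
# are each evaluated for one iteration, then all fitnesses are reported
# at once. A minimal self-contained sketch with the pycma package, whose
# ask/tell/best API matches how self._es is used above (a sketch, not
# this class's implementation); tell() minimizes, hence the negation.
import cma
import numpy as np

es = cma.CMAEvolutionStrategy(np.zeros(4), 0.5, {'popsize': 6})
for generation in range(3):
    params = es.ask()                                  # candidate vectors
    returns = [-np.sum(np.square(p)) for p in params]  # toy objective
    es.tell(params, [-r for r in returns])             # negate: maximize
best_params = es.best.get()[0]  # analogous to policy.set_param_values(...)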
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(
            akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
            akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
            dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumTrajs'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/CompletionRate'], 0.5)
    assert math.isclose(
        res['test_log_performance/AverageDiscountedReturn'],
        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
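# A self-contained check of where the asserted rates above come from,
# assuming log_performance aggregates per trajectory (a trajectory counts
# once if any of its steps sets the flag). With lengths [10, 5, 1, 1],
# three of four trajectories contain a success and two end in a terminal.
import numpy as np

lengths = np.array([10, 5, 1, 1])
terminals = np.array(
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1], dtype=bool)
success = np.array(
    [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1], dtype=bool)

starts = np.concatenate(([0], np.cumsum(lengths)[:-1]))
spans = list(zip(starts, np.cumsum(lengths)))
print(np.mean([success[s:e].any() for s, e in spans]))    # 0.75
print(np.mean([terminals[s:e].any() for s, e in spans]))  # 0.5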
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate baseline
    paths = [
        dict(
            observations=self._env_spec.observation_space.flatten_n(
                path['observations'])
            if self._flatten_input else path['observations'],
            actions=(
                self._env_spec.action_space.flatten_n(  # noqa: E126
                    path['actions'])),
            rewards=path['rewards'],
            env_infos=path['env_infos'],
            agent_infos=path['agent_infos'],
            dones=path['dones']) for path in paths
    ]

    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_path_length,
                                    baseline_predictions, self._discount,
                                    self._gae_lambda)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    self.log_diagnostics(samples_data)
    logger.log('Optimizing policy...')
    self.optimize_policy(samples_data)
    return samples_data['average_return']
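# flatten_n above maps a batch of space samples to a 2-D (n, flat_dim)
# array so downstream tensors have a uniform shape. A short sketch of the
# semantics, assuming akro's gym-style Box constructor and its documented
# flatten_n behavior (illustrative values, not this algorithm's spaces):
import akro
import numpy as np

space = akro.Box(low=-1.0, high=1.0, shape=(2, 3))
batch = np.stack([space.sample() for _ in range(4)])  # shape (4, 2, 3)
print(space.flatten_n(batch).shape)                   # (4, 6)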
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give algorithm
            the access to runner.step_epochs(), which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None

    for _ in runner.step_epochs():
        if self.replay_buffer.n_transitions_stored < self.min_buffer_size:
            batch_size = self.min_buffer_size
        else:
            batch_size = None
        runner.step_path = runner.obtain_samples(runner.step_itr,
                                                 batch_size)
        for sample in runner.step_path:
            self.replay_buffer.store(obs=sample.observation,
                                     act=sample.action,
                                     rew=sample.reward,
                                     next_obs=sample.next_observation,
                                     done=sample.terminal)
        self.episode_rewards.append(
            sum([sample.reward for sample in runner.step_path]))
        for _ in range(self.gradient_steps):
            last_return, policy_loss, qf1_loss, qf2_loss = self.train_once(
                runner.step_itr, runner.step_path)
        log_performance(runner.step_itr,
                        self._obtain_evaluation_samples(
                            runner.get_env_copy(), num_trajs=10),
                        discount=self.discount)
        self.log_statistics(policy_loss, qf1_loss, qf2_loss)
        tabular.record('TotalEnvSteps', runner.total_env_steps)
        runner.step_itr += 1

    return last_return
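# A minimal self-contained sketch of the store()/n_transitions_stored
# interface the loop above relies on: a fixed-capacity ring buffer that
# overwrites the oldest transitions. 'SimpleReplayBuffer' is illustrative,
# not garage's replay buffer implementation.
import numpy as np


class SimpleReplayBuffer:
    def __init__(self, capacity):
        self._storage = []
        self._capacity = capacity
        self._next = 0

    @property
    def n_transitions_stored(self):
        return len(self._storage)

    def store(self, obs, act, rew, next_obs, done):
        transition = dict(obs=obs, act=act, rew=rew,
                          next_obs=next_obs, done=done)
        if len(self._storage) < self._capacity:
            self._storage.append(transition)
        else:
            self._storage[self._next] = transition  # overwrite oldest
        self._next = (self._next + 1) % self._capacity

    def sample(self, batch_size):
        idx = np.random.randint(len(self._storage), size=batch_size)
        return [self._storage[i] for i in idx]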
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across
            self._num_evaluation_trajectories trajectories.

    """
    eval_trajectories = self._obtain_evaluation_samples(
        self._eval_env, num_trajs=self._num_evaluation_trajectories)
    last_return = log_performance(epoch,
                                  eval_trajectories,
                                  discount=self.discount)
    return last_return
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    batch_size = (self._training_batch_size
                  if self._training_batch_size else len(paths))
    samples = self.process_samples(itr, paths)

    for _ in range(self._training_epochs):
        minibatch_ids_list = torch.randperm(len(paths)).split(batch_size)
        for minibatch_ids in minibatch_ids_list:
            obs, actions, rewards, valids, baselines = self._get_minibatch(
                samples, minibatch_ids)
            loss = self._compute_loss(itr, obs, actions, rewards, valids,
                                      baselines)
            self._old_policy.load_state_dict(self.policy.state_dict())
            self._optimizer.zero_grad()
            loss.backward()
            self._optimize(itr, obs, actions, rewards, valids, baselines)

    self.baseline.fit(paths)

    average_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    return np.mean(average_returns)
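# What torch.randperm(...).split(batch_size) produces in the inner loop
# above: a shuffled index tensor cut into minibatch-sized chunks (the
# last chunk may be smaller). A self-contained sketch:
import torch

data = torch.arange(10, dtype=torch.float32).unsqueeze(-1)  # 10 'paths'
for epoch in range(2):
    for minibatch_ids in torch.randperm(len(data)).split(4):
        minibatch = data[minibatch_ids]  # gather rows by shuffled index
        # ...compute loss on this minibatch and step the optimizer...
        print(epoch, minibatch_ids.tolist())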
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give algorithm
            the access to runner.step_epochs(), which provides services
            such as snapshotting and sampler control.

    Returns:
        float: The average return in last epoch cycle.

    """
    last_return = None

    for _ in runner.step_epochs():
        for cycle in range(self.epoch_cycles):
            if (self.replay_buffer.n_transitions_stored <
                    self.min_buffer_size):
                batch_size = self.min_buffer_size
            else:
                batch_size = None
            runner.step_path = runner.obtain_samples(
                runner.step_itr, batch_size)
            for sample in runner.step_path:
                self.replay_buffer.store(obs=sample.observation,
                                         act=sample.action,
                                         rew=sample.reward,
                                         next_obs=sample.next_observation,
                                         done=sample.terminal)
            for _ in range(self.gradient_steps):
                last_return, policy_loss, qf1_loss, qf2_loss = \
                    self.train_once(runner.step_itr, runner.step_path)
            if cycle == self.epoch_cycles - 1:
                self.episode_rewards.append(
                    sum([sample.reward for sample in runner.step_path]))
                # evaluation
                epoch_local_success_rate = []
                for task_number, name in enumerate(
                        self.env.task_names_ordered):
                    eval_env = self.eval_env_dict[name]
                    _, avg_success_rate = log_performance(
                        runner.step_itr,
                        self._obtain_evaluation_samples(
                            MTEnvEvalWrapper(eval_env, task_number,
                                             self._num_tasks,
                                             self.env._max_plain_dim),
                            num_trajs=self.num_eval_paths),
                        discount=self.discount,
                        prefix=name)
                    epoch_local_success_rate.append(avg_success_rate)
                self.epoch_mean_success_rate.append(
                    np.mean(epoch_local_success_rate))
                self.epoch_median_success_rate.append(
                    np.median(epoch_local_success_rate))
                tabular.record('local/Mean_SuccessRate',
                               self.epoch_mean_success_rate[-1])
                tabular.record('local/Median_SuccessRate',
                               self.epoch_median_success_rate[-1])
                tabular.record('local/Max_Median_SuccessRate',
                               np.max(self.epoch_median_success_rate))
                tabular.record('local/Max_Mean_SuccessRate',
                               np.max(self.epoch_mean_success_rate))
            self.log_statistics(policy_loss, qf1_loss, qf2_loss)
            tabular.record('TotalEnvSteps', runner.total_env_steps)
            runner.step_itr += 1

    return last_return
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        discount=self.discount)
    return np.mean(undiscounted_returns)
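# filter_valids above drops the padded tail of each (max_length, ...) row
# before torch.cat flattens the batch. A plausible implementation
# consistent with that usage (a sketch, not necessarily garage's exact
# code):
import torch


def filter_valids(tensor, valids):
    # tensor: (n_paths, max_length, ...); valids: real length per path
    return [tensor[i, :int(v)] for i, v in enumerate(valids)]


obs = torch.arange(12.).reshape(2, 6, 1)  # two paths padded to length 6
valids = torch.tensor([6, 3])             # real lengths
obs_flat = torch.cat(filter_valids(obs, valids))
print(obs_flat.shape)                     # torch.Size([9, 1])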
def log_performance(self, indices, test, epoch):
    """Get average returns for specific tasks.

    Args:
        indices (list): List of tasks.
        test (bool): If True, evaluate on test tasks; otherwise on
            train tasks.
        epoch (int): The current training epoch.

    """
    discounted_returns = []
    undiscounted_returns = []
    completion = []
    success = []
    traj = []
    for idx in indices:
        eval_paths = []
        for _ in range(self._num_evals):
            paths = self.collect_paths(idx, test)
            paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
            paths[-1]['dones'] = paths[-1]['terminals']
            # HalfCheetahVel env
            if 'task' in paths[-1]['env_infos'].keys():
                paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                    'task']['velocity']
            eval_paths.append(paths[-1])
            discounted_returns.append(
                discount_cumsum(paths[-1]['rewards'], self._discount))
            undiscounted_returns.append(sum(paths[-1]['rewards']))
            completion.append(float(paths[-1]['terminals'].any()))
            # calculate success rate for metaworld tasks
            if 'success' in paths[-1]['env_infos']:
                success.append(paths[-1]['env_infos']['success'].any())

        if test:
            env = self.test_env[idx]()
        else:
            env = self.env[idx]()
        temp_traj = TrajectoryBatch.from_trajectory_list(env, eval_paths)
        traj.append(temp_traj)

    if test:
        with tabular.prefix('Test/'):
            if self._test_task_names:
                log_multitask_performance(
                    epoch,
                    TrajectoryBatch.concatenate(*traj),
                    self._discount,
                    task_names=self._test_task_names)
            log_performance(epoch,
                            TrajectoryBatch.concatenate(*traj),
                            self._discount,
                            prefix='Average')
    else:
        with tabular.prefix('Train/'):
            if self._train_task_names:
                log_multitask_performance(
                    epoch,
                    TrajectoryBatch.concatenate(*traj),
                    self._discount,
                    task_names=self._train_task_names)
            log_performance(epoch,
                            TrajectoryBatch.concatenate(*traj),
                            self._discount,
                            prefix='Average')
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []
    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(average_return=np.mean(undiscounted_returns))

    return samples_data
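# tensor_utils.discount_cumsum above computes per-step discounted
# returns, y[t] = r[t] + discount * y[t + 1]. The rllab-lineage
# implementations express this as an IIR filter over the reversed
# rewards; a self-contained sketch of that technique (not necessarily
# garage's exact code):
import numpy as np
import scipy.signal


def discount_cumsum(x, discount):
    # Filter with transfer function 1 / (1 - discount * z^-1) applied to
    # the reversed sequence, then reversed back.
    return scipy.signal.lfilter([1], [1, float(-discount)],
                                x[::-1])[::-1]


print(discount_cumsum(np.array([1., 1., 1.]), 0.5))  # [1.75, 1.5, 1.]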