def test_act_box_env_spec_mismatch_eps(eps_data):
    with pytest.raises(ValueError, match='actions should have'):
        eps_data['env_spec'].action_space = akro.Box(low=1,
                                                     high=np.inf,
                                                     shape=(4, 3, 2),
                                                     dtype=np.float32)
        t = EpisodeBatch(**eps_data)
        del t
def test_agent_infos_batch_mismatch_eps(eps_data):
    with pytest.raises(
            ValueError,
            match="Entry 'hidden' in agent_infos has batch size 141"):
        eps_data['agent_infos']['hidden'] = eps_data['agent_infos'][
            'hidden'][:-1]
        t = EpisodeBatch(**eps_data)
        del t
def test_to_epsbatch_list(eps_data):
    t = EpisodeBatch(**eps_data)
    t_list = t.to_list()
    assert len(t_list) == len(eps_data['lengths'])
    start = 0
    for length, last_obs, s in zip(eps_data['lengths'],
                                   eps_data['last_observations'], t_list):
        stop = start + length
        assert (
            s['observations'] == eps_data['observations'][start:stop]).all()
        assert (s['next_observations'] == np.concatenate(
            (eps_data['observations'][start + 1:stop], [last_obs]))).all()
        assert (s['actions'] == eps_data['actions'][start:stop]).all()
        assert (s['rewards'] == eps_data['rewards'][start:stop]).all()
        assert (s['step_types'] == eps_data['step_types'][start:stop]).all()
        start = stop
    assert start == len(eps_data['rewards'])
def test_last_obs_env_spec_mismatch_eps(eps_data):
    with pytest.raises(ValueError,
                       match=('last_observations must have the '
                              'same number of entries')):
        eps_data['last_observations'] = \
            eps_data['last_observations'][:, :, :, :1]
        t = EpisodeBatch(**eps_data)
        del t
def test_time_step_batch_from_episode_batch(eps_data):
    eps = EpisodeBatch(**eps_data)
    timestep_batch = TimeStepBatch.from_episode_batch(eps)
    assert (timestep_batch.observations == eps.observations).all()
    assert (timestep_batch.next_observations[:eps.lengths[0] - 1] ==
            eps.observations[1:eps.lengths[0]]).all()
    # The last next_observation of the first episode lives at index
    # lengths[0] - 1 (index lengths[0] already belongs to the next episode).
    assert (timestep_batch.next_observations[eps.lengths[0] - 1] ==
            eps.last_observations[0]).all()
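# Illustrative sketch (not from the source): the next-observation layout the
# assertions above depend on. Within one episode, next_observations is
# observations shifted left by one step, with last_observations[i] closing
# episode i. Plain numpy; all values here are made up.
import numpy as np

lengths = np.array([3, 2])
observations = np.arange(5)              # flat across both episodes
last_observations = np.array([99, 98])   # one terminal obs per episode
next_obs, start = [], 0
for i, length in enumerate(lengths):
    stop = start + length
    next_obs.append(np.concatenate(
        (observations[start + 1:stop], [last_observations[i]])))
    start = stop
next_obs = np.concatenate(next_obs)      # -> array([ 1,  2, 99,  4, 98])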
def _process_samples(self, itr, episodes):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Original collected episode batch for each
            task. For each episode, episode.agent_infos['batch_idx']
            indicates which task this episode belongs to. In RL^2, there
            are n environments/tasks and paths in each of them will be
            concatenated at some point and fed to the policy.

    Returns:
        EpisodeBatch: Processed batch of episodes for feeding the inner
            algorithm.
        numpy.float64: The average return.

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for episode in episodes.split():
        if hasattr(episode, 'batch_idx'):
            paths_by_task[episode.batch_idx[0]].append(episode)
        elif 'batch_idx' in episode.agent_infos:
            paths_by_task[episode.agent_infos['batch_idx'][0]].append(
                episode)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling.')

    # All paths in paths_by_task[i] are sampled from task[i].
    for episode_list in paths_by_task.values():
        concatenated_path = self._concatenate_episodes(episode_list)
        concatenated_paths.append(concatenated_path)

    concatenated_episodes = EpisodeBatch.concatenate(*concatenated_paths)

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr, episodes, self._inner_algo._discount, name_map=name_map)

    average_return = np.mean(undiscounted_returns)

    return concatenated_episodes, average_return
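# Minimal sketch of the grouping step above (illustrative dicts, not real
# EpisodeBatch objects): episodes carrying the same agent_infos['batch_idx']
# are bucketed together, so each bucket holds all paths sampled from one task.
import collections

fake_episodes = [{'batch_idx': 0}, {'batch_idx': 1}, {'batch_idx': 0}]
paths_by_task = collections.defaultdict(list)
for ep in fake_episodes:
    paths_by_task[ep['batch_idx']].append(ep)
# paths_by_task -> {0: [two episodes], 1: [one episode]}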
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return in last epoch cycle.

    """
    # -- Stage: Calculate baseline
    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    epoch = itr // self._n_samples
    i_sample = itr - epoch * self._n_samples
    tabular.record('Epoch', epoch)
    tabular.record('# Sample', i_sample)
    rtn = samples_data['average_return']
    self._all_returns.append(samples_data['average_return'])

    if (itr + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_returns)
        self._es.tell(self._all_params, -avg_rtns)
        self.policy.set_param_values(self._es.best.get()[0])

        # Clear for next epoch
        rtn = max(self._all_returns)
        self._all_returns.clear()
        self._all_params = self._sample_params()

    self._cur_params = self._all_params[(i_sample + 1) % self._n_samples]
    self.policy.set_param_values(self._cur_params)

    logger.log(tabular)
    return rtn
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of episodes from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (EpisodeBatch): Batch of episodes. The episodes should have
            either the "task_name" or "task_id" `env_infos`. If the
            "task_name" is not present, then `name_map` is required, and
            should map from task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be
            logged, even if there are no episodes present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    eps_by_name = defaultdict(list)
    for eps in batch.split():
        task_name = '__unnamed_task__'
        if 'task_name' in eps.env_infos:
            task_name = eps.env_infos['task_name'][0]
        elif 'task_id' in eps.env_infos:
            name_map = {} if name_map is None else name_map
            task_id = eps.env_infos['task_id'][0]
            task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
        eps_by_name[task_name].append(eps)
    if name_map is None:
        task_names = eps_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in eps_by_name:
            episodes = eps_by_name[task_name]
            log_performance(itr,
                            EpisodeBatch.concatenate(*episodes),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumEpisodes', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('TerminationRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
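# Sketch of the task-name resolution rule used above, pulled out as a plain
# function for clarity (the helper name is ours, not garage's): the
# "task_name" env_info wins, then name_map[task_id], then a generated label.
def resolve_task_name(env_infos, name_map=None):
    if 'task_name' in env_infos:
        return env_infos['task_name'][0]
    if 'task_id' in env_infos:
        name_map = {} if name_map is None else name_map
        task_id = env_infos['task_id'][0]
        return name_map.get(task_id, 'Task #{}'.format(task_id))
    return '__unnamed_task__'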
def _train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    # -- Stage: Calculate baseline
    paths = [
        dict(
            observations=path['observations'],
            actions=(
                self._env_spec.action_space.flatten_n(  # noqa: E126
                    path['actions'])),
            rewards=path['rewards'],
            env_infos=path['env_infos'],
            agent_infos=path['agent_infos'],
            dones=np.array([
                step_type == StepType.TERMINAL
                for step_type in path['step_types']
            ])) for path in paths
    ]

    if hasattr(self._baseline, 'predict_n'):
        baseline_predictions = self._baseline.predict_n(paths)
    else:
        baseline_predictions = [
            self._baseline.predict(path) for path in paths
        ]

    # -- Stage: Pre-process samples based on collected paths
    samples_data = paths_to_tensors(paths, self.max_episode_length,
                                    baseline_predictions, self._discount,
                                    self._gae_lambda)

    # -- Stage: Run and calculate performance of the algorithm
    undiscounted_returns = log_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        discount=self._discount)
    self._episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    samples_data['average_return'] = np.mean(undiscounted_returns)

    logger.log('Optimizing policy...')
    self._optimize_policy(samples_data)

    return samples_data['average_return']
def test_episodes_to_acts_obs_list(eps_data):
    t = EpisodeBatch(**eps_data)
    acts_list = t.actions_list
    obs_list = t.observations_list
    start = 0
    assert len(acts_list) == len(t.lengths)
    assert len(obs_list) == len(t.lengths)
    for i, length in enumerate(t.lengths):
        stop = start + length
        assert (acts_list[i] == t.actions[start:stop]).all()
        assert (obs_list[i] == t.observations[start:stop]).all()
        start = stop
def collect_episode(self):
    """Collect the current episode, clearing the internal buffer.

    Returns:
        EpisodeBatch: A batch of the episodes completed since the last call
            to collect_episode().

    """
    observations = self._observations
    self._observations = []
    last_observations = self._last_observations
    self._last_observations = []

    actions = []
    rewards = []
    env_infos = defaultdict(list)
    step_types = []

    for es in self._env_steps:
        actions.append(es.action)
        rewards.append(es.reward)
        step_types.append(es.step_type)
        for k, v in es.env_info.items():
            env_infos[k].append(v)
    self._env_steps = []

    agent_infos = self._agent_infos
    self._agent_infos = defaultdict(list)
    for k, v in agent_infos.items():
        agent_infos[k] = np.asarray(v)
    for k, v in env_infos.items():
        env_infos[k] = np.asarray(v)

    episode_infos = self._episode_infos
    self._episode_infos = defaultdict(list)
    for k, v in episode_infos.items():
        episode_infos[k] = np.asarray(v)

    lengths = self._lengths
    self._lengths = []

    return EpisodeBatch(env_spec=self.env.spec,
                        episode_infos=episode_infos,
                        observations=np.asarray(observations),
                        last_observations=np.asarray(last_observations),
                        actions=np.asarray(actions),
                        rewards=np.asarray(rewards),
                        step_types=np.asarray(step_types, dtype=StepType),
                        env_infos=dict(env_infos),
                        agent_infos=dict(agent_infos),
                        lengths=np.asarray(lengths, dtype='i'))
def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = EpisodeBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array([StepType.MID] * sum(lengths), dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumEpisodes'] == 2
    assert res['env2/NumEpisodes'] == 1
    assert res['env3/NumEpisodes'] == 1
    assert res['env4/NumEpisodes'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])
def train(self, runner):
    """Get samples and train the policy.

    Args:
        runner (LocalRunner): Experiment runner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        EpisodeBatch.from_list(self.env_spec, samples),
                        self._discount)
        self._train_once(epoch, samples)
def train(self, runner):
    """Obtain samples and start actual training for each epoch.

    Args:
        runner (LocalRunner): Experiment runner.

    """
    for epoch in runner.step_epochs():
        samples = runner.obtain_samples(epoch)
        log_performance(epoch,
                        EpisodeBatch.from_list(self.env_spec, samples),
                        self._discount)
        self._train_once(samples)
def collect_episode(self):
    """Collect all completed episodes.

    Returns:
        EpisodeBatch: A batch of the episodes completed since the last call
            to collect_episode().

    """
    if len(self._completed_episodes) == 1:
        result = self._completed_episodes[0]
    else:
        result = EpisodeBatch.concatenate(*self._completed_episodes)
    self._completed_episodes = []
    return result
def test_new_eps(eps_data):
    t = EpisodeBatch(**eps_data)
    assert t.env_spec is eps_data['env_spec']
    assert t.observations is eps_data['observations']
    assert t.last_observations is eps_data['last_observations']
    assert t.actions is eps_data['actions']
    assert t.rewards is eps_data['rewards']
    assert t.env_infos is eps_data['env_infos']
    assert t.agent_infos is eps_data['agent_infos']
    assert t.step_types is eps_data['step_types']
    assert t.lengths is eps_data['lengths']
    assert t.episode_infos_by_episode is eps_data['episode_infos']
    assert (t.episode_infos['task_one_hot'][0].shape ==
            eps_data['episode_infos']['task_one_hot'][0].shape)
def test_log_performance():
    lengths = np.array([10, 5, 1, 1])
    batch = EpisodeBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        step_types=np.array(
            [StepType.FIRST] + [StepType.MID] * (lengths[0] - 2) +
            [StepType.TERMINAL] + [StepType.FIRST] + [StepType.MID] *
            (lengths[1] - 2) + [StepType.TERMINAL] + [StepType.FIRST] +
            [StepType.FIRST],
            dtype=StepType),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool)
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_performance(7, batch, 0.8, prefix='test_log_performance')
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['test_log_performance/Iteration'] == 7
    assert res['test_log_performance/NumEpisodes'] == 4
    assert math.isclose(res['test_log_performance/SuccessRate'], 0.75)
    assert math.isclose(res['test_log_performance/TerminationRate'], 0.5)
    assert math.isclose(res['test_log_performance/AverageDiscountedReturn'],
                        1.1131040640673113)
    assert math.isclose(res['test_log_performance/AverageReturn'],
                        2.1659965525)
    assert math.isclose(res['test_log_performance/StdReturn'],
                        2.354067152038576)
def _concatenate_episodes(self, episode_list):
    """Concatenate episodes.

    The input list contains samples from different episodes of the same
    task/environment. In RL^2, paths within each meta batch are all
    concatenated into a single path and fed to the policy.

    Args:
        episode_list (list[EpisodeBatch]): Input paths. All paths are from
            different episodes, but the same task/environment.

    Returns:
        EpisodeBatch: Concatenated episode from the same task/environment.
            Shape of values:
            :math:`[max_episode_length * episode_per_task, S^*]`

    """
    env_infos = {
        k: np.concatenate([b.env_infos[k] for b in episode_list])
        for k in episode_list[0].env_infos.keys()
    }
    agent_infos = {
        k: np.concatenate([b.agent_infos[k] for b in episode_list])
        for k in episode_list[0].agent_infos.keys()
    }
    episode_infos = {
        k: np.concatenate([b.episode_infos[k] for b in episode_list])
        for k in episode_list[0].episode_infos.keys()
    }
    actions = np.concatenate([
        self._env_spec.action_space.flatten_n(ep.actions)
        for ep in episode_list
    ])

    return EpisodeBatch(
        env_spec=episode_list[0].env_spec,
        episode_infos=episode_infos,
        observations=np.concatenate(
            [ep.observations for ep in episode_list]),
        last_observations=episode_list[-1].last_observations,
        actions=actions,
        rewards=np.concatenate([ep.rewards for ep in episode_list]),
        env_infos=env_infos,
        agent_infos=agent_infos,
        step_types=np.concatenate([ep.step_types for ep in episode_list]),
        lengths=np.asarray([sum([ep.lengths[0] for ep in episode_list])]))
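# What the single-entry `lengths` above encodes (numpy only, values made up):
# episodes of lengths 5, 5, and 3 from one task become one concatenated path
# of length 13, so downstream code sees a single long RL^2 trajectory.
import numpy as np

per_episode_lengths = [np.asarray([5]), np.asarray([5]), np.asarray([3])]
lengths = np.asarray([sum(length[0] for length in per_episode_lengths)])
# lengths -> array([13])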
def _log_performance(self, itr, all_samples, loss_before, loss_after,
                     kl_before, kl, policy_entropy):
    """Evaluate performance of this batch.

    Args:
        itr (int): Iteration number.
        all_samples (list[list[_MAMLEpisodeBatch]]): Two dimensional list
            of _MAMLEpisodeBatch of size
            [meta_batch_size * (num_grad_updates + 1)].
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in last epoch cycle.

    """
    tabular.record('Iteration', itr)

    name_map = None
    if hasattr(self._env, 'all_task_names'):
        names = self._env.all_task_names
        name_map = dict(zip(names, names))

    rtns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(
            env_spec=self._env.spec,
            paths=[
                path for task_paths in all_samples
                for path in task_paths[self._num_grad_updates].paths
            ]),
        discount=self._inner_algo.discount,
        name_map=name_map)

    with tabular.prefix(self._policy.name + '/'):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KLAfter', kl)
        tabular.record('Entropy', policy_entropy)

    return np.mean(rtns)
def collect_episode(self):
    """Gather fragments from all in-progress episodes.

    Returns:
        EpisodeBatch: A batch of the episode fragments.

    """
    for i, frag in enumerate(self._fragments):
        assert frag.env is self._envs[i]
        if len(frag.rewards) > 0:
            complete_frag = frag.to_batch()
            self._complete_fragments.append(complete_frag)
            self._fragments[i] = InProgressEpisode(frag.env,
                                                   frag.last_obs)
    assert len(self._complete_fragments) > 0
    result = EpisodeBatch.concatenate(*self._complete_fragments)
    self._complete_fragments = []
    return result
def train_once(self, itr, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average return.

    """
    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self._discount)
    samples_data = self.paths_to_tensors(paths)
    samples_data['average_return'] = np.mean(undiscounted_returns)
    logger.log('Optimizing policy...')
    self.optimize_policy(itr, samples_data)
    return samples_data['average_return']
def _obtain_samples(self, trainer, epoch):
    """Obtain samples from self._source.

    Args:
        trainer (Trainer): Experiment trainer, which may be used to obtain
            samples.
        epoch (int): The current epoch.

    Returns:
        TimeStepBatch: Batch of samples.

    """
    if isinstance(self._source, Policy):
        batch = EpisodeBatch.from_list(self._env_spec,
                                       trainer.obtain_samples(epoch))
        log_performance(epoch, batch, 1.0, prefix='Expert')
        return batch
    else:
        batches = []
        while (sum(len(batch.actions) for batch in batches)
               < self._batch_size):
            batches.append(next(self._source))
        return TimeStepBatch.concatenate(*batches)
def slice_episodes(episodes, slice_size):
    """Split each episode in a batch into slices of at most slice_size steps."""
    sliced = []
    for eps in episodes.split():
        splits = math.ceil(eps.lengths[0] / slice_size)
        split_indices = np.array_split(np.arange(eps.lengths[0]), splits)
        next_obs = eps.next_observations
        for indices in split_indices:
            # The slice's final next_observation becomes its last_observation.
            last_obs = np.asarray([next_obs[indices[-1]]])
            t = EpisodeBatch(
                env_spec=eps.env_spec,
                observations=eps.observations[indices],
                last_observations=last_obs,
                actions=eps.actions[indices],
                rewards=eps.rewards[indices],
                step_types=eps.step_types[indices],
                env_infos={k: v[indices]
                           for (k, v) in eps.env_infos.items()},
                agent_infos={
                    k: v[indices]
                    for (k, v) in eps.agent_infos.items()
                },
                lengths=np.asarray([len(indices)], dtype='l'))
            sliced.append(t)
    return sliced
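# How the index math above chunks one episode (numpy only, toy values): an
# episode of length 10 with slice_size 4 yields ceil(10 / 4) = 3 pieces,
# kept as even as possible by np.array_split, each no longer than slice_size.
import math
import numpy as np

length, slice_size = 10, 4
splits = math.ceil(length / slice_size)
chunks = np.array_split(np.arange(length), splits)
# -> [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]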
def _gather_episode(self, episode_number, last_observation):
    """Finalize one in-progress episode and reset its buffers."""
    assert 0 < self._episode_lengths[
        episode_number] <= self._max_episode_length
    env_infos = self._env_infos[episode_number]
    agent_infos = self._agent_infos[episode_number]
    episode_infos = self._episode_infos[episode_number]
    for k, v in env_infos.items():
        env_infos[k] = np.asarray(v)
    for k, v in agent_infos.items():
        agent_infos[k] = np.asarray(v)
    for k, v in episode_infos.items():
        episode_infos[k] = np.asarray(v)
    eps = EpisodeBatch(
        env_spec=self._envs[episode_number].spec,
        episode_infos=dict(episode_infos),
        observations=np.asarray(self._observations[episode_number]),
        last_observations=np.asarray([last_observation]),
        actions=np.asarray(self._actions[episode_number]),
        rewards=np.asarray(self._rewards[episode_number]),
        step_types=np.asarray(self._step_types[episode_number],
                              dtype=StepType),
        env_infos=dict(env_infos),
        agent_infos=dict(agent_infos),
        lengths=np.asarray([self._episode_lengths[episode_number]],
                           dtype='l'))
    self._completed_episodes.append(eps)
    self._observations[episode_number] = []
    self._actions[episode_number] = []
    self._rewards[episode_number] = []
    self._step_types[episode_number] = []
    self._episode_lengths[episode_number] = 0
    self._prev_obs[episode_number] = self._envs[episode_number].reset()[0]
    self._env_infos[episode_number] = collections.defaultdict(list)
    self._agent_infos[episode_number] = collections.defaultdict(list)
    self._episode_infos[episode_number] = collections.defaultdict(list)
def test_episodes_padding_tensors(eps_data):
    t = EpisodeBatch(**eps_data)
    N = len(t.lengths)
    max_ep_l = t.env_spec.max_episode_length

    observations = t.padded_observations
    actions = t.padded_actions
    rewards = t.padded_rewards
    valids = t.valids
    agent_infos = t.padded_agent_infos

    assert observations.shape == (N, max_ep_l, *t.observations[0].shape)
    assert actions.shape == (N, max_ep_l, *t.actions[0].shape)
    assert rewards.shape == (N, max_ep_l)
    assert valids.shape == (N, max_ep_l)
    assert agent_infos.keys() == t.agent_infos.keys()
    for key in agent_infos.keys():
        assert agent_infos[key].shape == (N, max_ep_l,
                                          *t.agent_infos[key][0].shape)

    start = 0
    for i, length in enumerate(t.lengths):
        stop = start + length
        assert (observations[i][:length] == t.observations[start:stop]).all()
        assert np.count_nonzero(observations[i][length:]) == 0
        assert (actions[i][:length] == t.actions[start:stop]).all()
        assert np.count_nonzero(actions[i][length:]) == 0
        assert (rewards[i][:length] == t.rewards[start:stop]).all()
        assert np.count_nonzero(rewards[i][length:]) == 0
        assert (valids[i][:length] == np.ones((length, ))).all()
        assert np.count_nonzero(valids[i][length:]) == 0
        for key in agent_infos.keys():
            assert (agent_infos[key][i][:length] == t.agent_infos[key]
                    [start:stop]).all()
            assert np.count_nonzero(agent_infos[key][i][length:]) == 0
        start = stop
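# Padding semantics the assertions above check (numpy only, toy values):
# episode i's slice of the flat array fills row i up to its length, the rest
# stays zero, and `valids` marks the real steps.
import numpy as np

lengths = np.array([3, 1])
rewards = np.array([1., 2., 3., 4.])
max_len = 4
padded = np.zeros((len(lengths), max_len))
valids = np.zeros((len(lengths), max_len))
start = 0
for i, length in enumerate(lengths):
    stop = start + length
    padded[i, :length] = rewards[start:stop]
    valids[i, :length] = 1.
    start = stop
# padded -> [[1., 2., 3., 0.], [4., 0., 0., 0.]]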
def obtain_samples(self, num_samples, agent_update, env_updates=None):
    """Sample the policy for new episodes.

    Args:
        num_samples (int): Number of steps the sampler should collect.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before sampling episodes. If a list is
            passed in, it must have length exactly `factory.n_workers`,
            and will be spread across the workers.
        env_updates (object): Value which will be passed into the
            `env_update_fn` before sampling episodes. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.

    Returns:
        EpisodeBatch: Batch of gathered episodes.

    """
    self.update_workers(agent_update, env_updates)
    completed_samples = 0
    batches = []
    # TODO: can we replace the while loop, so all rollouts are scheduled
    # ahead of time?
    while completed_samples < num_samples:
        pids = [w.rollout.remote() for w in self.workers]
        results = [ray.get(pid) for pid in pids]
        for episode_batch in results:
            num_returned_samples = episode_batch.lengths.sum()
            completed_samples += num_returned_samples
            batches.append(episode_batch)
    # Note: EpisodeBatch takes care of the concatenation -- is this a
    # performance issue?
    samples = EpisodeBatch.concatenate(*batches)
    self.total_env_steps += sum(samples.lengths)
    return samples
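# Shape of the sampling loop above, minus ray (illustrative numbers): keep
# collecting whole worker batches until the step budget is met; the final
# round may overshoot num_samples, which is expected.
num_samples = 10
rounds = iter([[3, 4], [5, 2]])   # pretend per-worker episode lengths
completed, collected = 0, []
while completed < num_samples:
    for batch_len in next(rounds):
        completed += batch_len
        collected.append(batch_len)
# collected -> [3, 4, 5, 2]; completed -> 14 (> num_samples)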
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           EpisodeBatch.from_list(
                                               self._env_spec, paths),
                                           discount=self.discount)
    return np.mean(undiscounted_returns)
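# The flatten-with-valids pattern above (torch only, toy values): keep the
# first `valid` entries of each padded row, then concatenate, mirroring
# torch.cat(filter_valids(...)).
import torch

padded = torch.tensor([[1., 2., 0.], [3., 0., 0.]])
valids = [2, 1]
flat = torch.cat([row[:valid] for row, valid in zip(padded, valids)])
# flat -> tensor([1., 2., 3.])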
def test_episode_batch_to_timestep_batch(self, eps_data):
    t = EpisodeBatch(**eps_data)
    replay_buffer = PathBuffer(capacity_in_transitions=100)
    replay_buffer.add_episode_batch(t)
    timesteps = replay_buffer.sample_timesteps(10)
    assert len(timesteps.rewards) == 10