def _train_once(self, epoch, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        epoch (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return of epoch cycle.

    """
    returns = []
    for path in paths:
        returns.append(discount_cumsum(path['rewards'], self._discount))
    avg_return = np.mean(np.concatenate(returns))
    self._all_avg_returns.append(avg_return)

    if (epoch + 1) % self._n_samples == 0:
        avg_rtns = np.array(self._all_avg_returns)
        best_inds = np.argsort(-avg_rtns)[:self._n_best]
        best_params = np.array(self._all_params)[best_inds]

        self._cur_mean = best_params.mean(axis=0)
        self._cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self._cur_mean)

        avg_return = max(self._all_avg_returns)
        self._all_avg_returns.clear()
        self._all_params.clear()

    self._cur_params = self._sample_params(epoch)
    self._all_params.append(self._cur_params.copy())
    self.policy.set_param_values(self._cur_params)

    return avg_return
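# Every snippet below relies on a `discount_cumsum(x, discount)` helper.
# The sketch here is an assumption about its behavior (consistent with the
# rllab/garage utility of the same name), not the library's own source:
# entry t of the result is sum_{k>=t} discount**(k-t) * x[k], so index 0
# holds the full discounted return of the path.
import numpy as np


def discount_cumsum(x, discount):
    """Reverse discounted cumulative sum, computed from the last step back."""
    out = np.zeros(len(x), dtype=float)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


# Example: rewards [1, 1, 1] with discount 0.9 -> [2.71, 1.9, 1.0].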
def _process_samples(self, episodes):
    """Process sample data based on the collected paths.

    Args:
        episodes (EpisodeBatch): Collected batch of episodes.

    Returns:
        _MAMLEpisodeBatch: Processed samples data.

    """
    paths = episodes.to_list()
    for path in paths:
        path['returns'] = discount_cumsum(
            path['rewards'], self._inner_algo.discount).copy()
    self._train_value_function(paths)

    obs = torch.Tensor(episodes.padded_observations)
    actions = torch.Tensor(episodes.padded_actions)
    rewards = torch.Tensor(episodes.padded_rewards)
    valids = torch.Tensor(episodes.lengths).int()
    with torch.no_grad():
        # pylint: disable=protected-access
        baselines = self._inner_algo._value_function(obs)

    return _MAMLEpisodeBatch(paths, obs, actions, rewards, valids, baselines)
def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        EpisodeBatch: Processed batch of episodes for feeding the inner
            algorithm.
        numpy.float64: The average return.

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = discount_cumsum(path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found, '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling')

    # all paths in paths_by_task[i] are sampled from task[i]
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        self._inner_algo._discount,
        name_map=name_map)

    average_return = np.mean(undiscounted_returns)

    episodes = EpisodeBatch.from_list(self._env_spec, concatenated_paths)

    return episodes, average_return
def _compute_meta_loss(self, all_samples, all_params, set_grad=True):
    """Compute loss to meta-optimize.

    Args:
        all_samples (list[list[_MAMLEpisodeBatch]]): A two dimensional
            list of _MAMLEpisodeBatch of size
            [meta_batch_size * (num_grad_updates + 1)]
        all_params (list[dict]): A list of named parameter dictionaries.
            Each dictionary contains key value pair of names (str) and
            parameters (torch.Tensor).
        set_grad (bool): Whether to enable gradient calculation or not.

    Returns:
        torch.Tensor: Calculated mean value of loss.

    """
    theta = dict(self._policy.named_parameters())
    old_theta = dict(self._old_policy.named_parameters())

    losses = []
    for task_samples, task_params in zip(all_samples, all_params):
        with torch.set_grad_enabled(set_grad):
            # SG-MRL specific
            # pylint: disable=protected-access
            initial_samples = task_samples[0]
            init_log_probs = self._inner_algo._compute_log_probs(
                *initial_samples[1:])

        for i in range(self._num_grad_updates):
            require_grad = i < self._num_grad_updates - 1 or set_grad
            self._adapt(task_samples[i], set_grad=require_grad)

        update_module_params(self._old_policy, task_params)
        with torch.set_grad_enabled(set_grad):
            # pylint: disable=protected-access
            last_update = task_samples[-1]
            loss = self._inner_algo._compute_loss(*last_update[1:])

            # SG-MRL specific
            with torch.set_grad_enabled(False):
                adapted_reward = last_update.rewards.detach().clone().numpy()
                # note that we treat it as a constant
                j_tilde = np.mean([
                    discount_cumsum(path, self._inner_algo.discount)[0]
                    for path in adapted_reward
                ])

            # SG-MRL specific
            loss += j_tilde * init_log_probs

        losses.append(loss)

    update_module_params(self._policy, theta)
    update_module_params(self._old_policy, old_theta)

    return torch.stack(losses).mean()
def _process_samples(self, paths):
    """Process sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        _MAMLEpisodeBatch: Processed samples data.

    """
    for path in paths:
        path['returns'] = discount_cumsum(
            path['rewards'], self._inner_algo.discount).copy()
    self._train_value_function(paths)
    obs, actions, rewards, _, valids, baselines = self._inner_algo._process_samples(  # pylint: disable=protected-access  # noqa: E501
        paths)
    return _MAMLEpisodeBatch(paths, obs, actions, rewards, valids, baselines)
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for eps in batch.split():
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if 'success' in eps.env_infos:
            success.append(float(eps.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumEpisodes', len(returns))
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
def _process_samples(self, paths):
    r"""Process sample data based on the collected paths.

    Notes: P is the maximum episode length (self.max_episode_length)

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        torch.Tensor: The observations of the environment
            with shape :math:`(N, P, O*)`.
        torch.Tensor: The actions fed to the environment
            with shape :math:`(N, P, A*)`.
        torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
        torch.Tensor: The per-step discounted returns with shape
            :math:`(N, P)`.
        list[int]: Numbers of valid steps in each path.
        torch.Tensor: Value function estimation at each step
            with shape :math:`(N, P)`.

    """
    valids = torch.Tensor([len(path['actions']) for path in paths]).int()
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_episode_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_episode_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_episode_length)
        for path in paths
    ])
    returns = torch.stack([
        pad_to_last(discount_cumsum(path['rewards'], self.discount).copy(),
                    total_length=self.max_episode_length) for path in paths
    ])
    with torch.no_grad():
        baselines = self._value_function(obs)

    return obs, actions, rewards, returns, valids, baselines
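# `pad_to_last` above is assumed to zero-pad a per-path array up to
# `total_length` along the given axis and return a torch.Tensor, so paths of
# different lengths can be stacked into a single (N, P, ...) batch. A minimal
# sketch of that assumed behavior (not the library's own implementation):
import numpy as np
import torch


def pad_to_last(nums, total_length, axis=0, val=0):
    """Pad `nums` with `val` along `axis` until it reaches `total_length`."""
    arr = np.asarray(nums)
    pad_width = [(0, 0)] * arr.ndim
    pad_width[axis] = (0, total_length - arr.shape[axis])
    return torch.Tensor(np.pad(arr, pad_width, constant_values=val))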
def _train_once(self, samples):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        samples (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average loss over the batch of paths.

    """
    losses = []
    self._policy_opt.zero_grad()
    for path in samples:
        returns_numpy = discount_cumsum(path['rewards'], self._discount)
        returns = torch.Tensor(returns_numpy.copy())
        obs = torch.Tensor(path['observations'])
        actions = torch.Tensor(path['actions'])
        dist = self.policy(obs)[0]
        log_likelihoods = dist.log_prob(actions)
        loss = (-log_likelihoods * returns).mean()
        loss.backward()
        losses.append(loss.item())
    self._policy_opt.step()
    return np.mean(losses)
def _train_once(self, samples):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        samples (list[dict]): A list of collected samples.

    Returns:
        numpy.float64: Average return.

    """
    obs = np.concatenate([path['observations'] for path in samples])
    actions = np.concatenate([path['actions'] for path in samples])
    returns = []
    for path in samples:
        returns.append(discount_cumsum(path['rewards'], self._discount))
    returns = np.concatenate(returns)
    sess = tf.compat.v1.get_default_session()
    sess.run(self._train_op,
             feed_dict={
                 self._observation: obs,
                 self._action: actions,
                 self._returns: returns,
             })
    return np.mean(returns)
def log_performance(itr, batch, discount, prefix="Evaluation", use_wandb=True):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.
        use_wandb (bool): Whether to also build a dict of metrics suitable
            for wandb logging.

    Returns:
        numpy.ndarray: Undiscounted returns.
        dict or None: Metrics keyed by logging name if use_wandb is True,
            otherwise None.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    rewards = []
    grasp_success = []
    near_object = []
    episode_mean_grasp_reward = []
    episode_max_grasp_reward = []
    episode_min_grasp_reward = []
    episode_mean_in_place_reward = []
    episode_max_in_place_reward = []
    episode_min_in_place_reward = []

    for eps in batch.split():
        rewards.append(eps.rewards)
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if "success" in eps.env_infos:
            success.append(float(eps.env_infos["success"].any()))
        if "grasp_success" in eps.env_infos:
            grasp_success.append(float(eps.env_infos["grasp_success"].any()))
        if "near_object" in eps.env_infos:
            near_object.append(float(eps.env_infos["near_object"].any()))
        if "grasp_reward" in eps.env_infos:
            episode_mean_grasp_reward.append(
                np.mean(eps.env_infos["grasp_reward"]))
            episode_max_grasp_reward.append(max(eps.env_infos["grasp_reward"]))
            episode_min_grasp_reward.append(min(eps.env_infos["grasp_reward"]))
        if "in_place_reward" in eps.env_infos:
            episode_mean_in_place_reward.append(
                np.mean(eps.env_infos["in_place_reward"]))
            episode_max_in_place_reward.append(
                max(eps.env_infos["in_place_reward"]))
            episode_min_in_place_reward.append(
                min(eps.env_infos["in_place_reward"]))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + "/"):
        tabular.record("Iteration", itr)
        tabular.record("NumEpisodes", len(returns))
        tabular.record("MinReward", np.min(rewards))
        tabular.record("MaxReward", np.max(rewards))
        tabular.record("AverageDiscountedReturn", average_discounted_return)
        tabular.record("AverageReturn", np.mean(undiscounted_returns))
        tabular.record("StdReturn", np.std(undiscounted_returns))
        tabular.record("MaxReturn", np.max(undiscounted_returns))
        tabular.record("MinReturn", np.min(undiscounted_returns))
        tabular.record("TerminationRate", np.mean(termination))
        if success:
            tabular.record("SuccessRate", np.mean(success))
        if grasp_success:
            tabular.record("GraspSuccessRate", np.mean(grasp_success))
        if near_object:
            tabular.record("NearObject", np.mean(near_object))
        if episode_mean_grasp_reward:
            tabular.record("EpisodeMeanGraspReward",
                           np.mean(episode_mean_grasp_reward))
            tabular.record("EpisodeMeanMaxGraspReward",
                           np.mean(episode_max_grasp_reward))
            tabular.record("EpisodeMeanMinGraspReward",
                           np.mean(episode_min_grasp_reward))
        if episode_mean_in_place_reward:
            tabular.record("EpisodeMeanInPlaceReward",
                           np.mean(episode_mean_in_place_reward))
            tabular.record("EpisodeMeanMaxInPlaceReward",
                           np.mean(episode_max_in_place_reward))
            tabular.record("EpisodeMeanMinInPlaceReward",
                           np.mean(episode_min_in_place_reward))

    log_dict = None
    if use_wandb:
        log_dict = {}
        log_dict[prefix + "/Iteration"] = itr
        log_dict[prefix + "/NumEpisodes"] = len(returns)
        log_dict[prefix + "/MinReward"] = np.min(rewards)
        log_dict[prefix + "/MaxReward"] = np.max(rewards)
        log_dict[prefix + "/AverageDiscountedReturn"] = average_discounted_return
        log_dict[prefix + "/AverageReturn"] = np.mean(undiscounted_returns)
        log_dict[prefix + "/StdReturn"] = np.std(undiscounted_returns)
        log_dict[prefix + "/MaxReturn"] = np.max(undiscounted_returns)
        log_dict[prefix + "/MinReturn"] = np.min(undiscounted_returns)
        log_dict[prefix + "/TerminationRate"] = np.mean(termination)
        if success:
            log_dict[prefix + "/SuccessRate"] = np.mean(success)
        if grasp_success:
            log_dict[prefix + "Misc/GraspSuccessRate"] = np.mean(grasp_success)
        if near_object:
            log_dict[prefix + "Misc/NearObject"] = np.mean(near_object)
        if episode_mean_grasp_reward:
            log_dict[prefix + "Misc/EpisodeMeanGraspReward"] = np.mean(
                episode_mean_grasp_reward)
            log_dict[prefix + "Misc/EpisodeMeanMaxGraspReward"] = np.mean(
                episode_max_grasp_reward)
            log_dict[prefix + "Misc/EpisodeMeanMinGraspReward"] = np.mean(
                episode_min_grasp_reward)
        if episode_mean_in_place_reward:
            log_dict[prefix + "Misc/EpisodeMeanInPlaceReward"] = np.mean(
                episode_mean_in_place_reward)
            log_dict[prefix + "Misc/EpisodeMeanMaxInPlaceReward"] = np.mean(
                episode_max_in_place_reward)
            log_dict[prefix + "Misc/EpisodeMeanMinInPlaceReward"] = np.mean(
                episode_min_in_place_reward)

    return undiscounted_returns, log_dict
def paths_to_tensors(paths, max_episode_length, baseline_predictions, discount,
                     gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_episode_length (int): Maximum length of a single episode.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) Baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with key
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = discount_cumsum(deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])

        # returns
        path['returns'] = discount_cumsum(path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_episode_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_episode_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_episode_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_episode_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_episode_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length) for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_episode_length)

    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )

    return samples_data
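# In `paths_to_tensors` the GAE(lambda) advantages come out of the same
# discounted cumulative sum applied to the TD residuals
# delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), i.e.
# A_t = sum_l (gamma * lambda)**l * delta_{t+l}.
# A small self-contained check of that identity (values are illustrative,
# and `_discount_cumsum` repeats the sketch given near the top):
import numpy as np


def _discount_cumsum(x, discount):
    out, running = np.zeros(len(x)), 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


gamma, lam = 0.99, 0.95
rewards = np.array([1.0, 0.5, 2.0])
baselines = np.array([0.8, 0.6, 1.5])
path_baselines = np.append(baselines, 0)  # bootstrap with 0 after the last step
deltas = rewards + gamma * path_baselines[1:] - path_baselines[:-1]
advantages = _discount_cumsum(deltas, gamma * lam)
assert np.isclose(
    advantages[0],
    deltas[0] + (gamma * lam) * deltas[1] + (gamma * lam)**2 * deltas[2])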
def _evaluate(self, policy_opt_input_values, episodes, baselines,
              embed_ep_infos):
    """Evaluate rewards and everything else.

    Args:
        policy_opt_input_values (list[np.ndarray]): Flattened
            policy optimization input values.
        episodes (EpisodeBatch): Batch of episodes.
        baselines (np.ndarray): Baseline predictions.
        embed_ep_infos (dict): Embedding distribution information.

    Returns:
        dict: Paths for fitting the baseline.

    """
    # pylint: disable=too-many-statements
    fit_paths = []
    valids = episodes.valids
    observations = episodes.padded_observations
    tasks = pad_batch_array(episodes.env_infos['task_onehot'],
                            episodes.lengths, self.max_episode_length)
    latents = pad_batch_array(episodes.agent_infos['latent'],
                              episodes.lengths, self.max_episode_length)
    baselines_list = []
    for baseline, valid in zip(baselines, valids):
        baselines_list.append(baseline[valid.astype(np.bool)])

    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    env_rewards = episodes.rewards
    env_returns = [
        discount_cumsum(rwd, self._discount)
        for rwd in episodes.padded_rewards
    ]
    env_average_discounted_return = np.mean(
        [ret[0] for ret in env_returns])

    # Recompute returns and prepare paths for fitting the baseline
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, task, latent, obs in zip(rewards_tensor,
                                                returns_tensor, valids,
                                                tasks, latents,
                                                observations):
        returns = ret[val.astype(np.bool)]
        task = task[val.astype(np.bool)]
        latent = latent[val.astype(np.bool)]
        obs = obs[val.astype(np.bool)]

        aug_rewards.append(rew[val.astype(np.bool)])
        aug_returns.append(returns)

        fit_paths.append(
            dict(observations=obs,
                 tasks=task,
                 latents=latent,
                 returns=returns))

    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)

    # Calculate effect of the entropy terms
    d_rewards = np.mean(aug_rewards - env_rewards)
    tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

    aug_average_discounted_return = (np.mean(
        [ret[0] for ret in returns_tensor]))
    d_returns = np.mean(aug_average_discounted_return -
                        env_average_discounted_return)
    tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

    # Calculate explained variance
    ev = explained_variance_1d(np.concatenate(baselines_list), aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    inference_rmse = (embed_ep_infos['mean'] - latents)**2.
    inference_rmse = np.sqrt(inference_rmse.mean())
    tabular.record('Inference/RMSE', inference_rmse)

    inference_rrse = rrse(latents, embed_ep_infos['mean'])
    tabular.record('Inference/RRSE', inference_rrse)

    embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
    tabular.record('{}/Encoder/Entropy'.format(self.policy.name), embed_ent)

    infer_ce = self._f_inference_ce(*policy_opt_input_values)
    tabular.record('Inference/CrossEntropy', infer_ce)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    pol_ent = np.sum(pol_ent) / np.sum(episodes.lengths)
    tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

    task_ents = self._f_task_entropies(*policy_opt_input_values)
    tasks = tasks[:, 0, :]
    _, task_indices = np.nonzero(tasks)
    path_lengths = np.sum(valids, axis=1)
    for t in range(self.policy.task_space.flat_dim):
        lengths = path_lengths[task_indices == t]
        completed = lengths < self.max_episode_length
        pct_completed = np.mean(completed)
        tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                       np.mean(lengths))
        tabular.record('Tasks/TerminationRate/t={}'.format(t),
                       pct_completed)
        tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

    return fit_paths
def _train_once(self, itr, eps):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        eps (EpisodeBatch): A batch of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs = torch.Tensor(eps.padded_observations)
    rewards = torch.Tensor(eps.padded_rewards)
    returns = torch.Tensor(
        np.stack([
            discount_cumsum(reward, self.discount)
            for reward in eps.padded_rewards
        ]))
    valids = eps.lengths
    with torch.no_grad():
        baselines = self._value_function(obs)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.Tensor(eps.observations)
    actions_flat = torch.Tensor(eps.actions)
    rewards_flat = torch.Tensor(eps.rewards)
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           eps,
                                           discount=self._discount)
    return np.mean(undiscounted_returns)
def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        dict: Processed sample data, with key
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = discount_cumsum(path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found, '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling')

    # all paths in paths_by_task[i] are sampled from task[i]
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    # stack and pad to max path length of the concatenated
    # path, which will be fed to inner algo
    # i.e. max_episode_length * episode_per_task
    concatenated_paths_stacked = (stack_and_pad_tensor_dict_list(
        concatenated_paths, self._inner_algo.max_episode_length))

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        self._inner_algo._discount,
        name_map=name_map)

    concatenated_paths_stacked['paths'] = concatenated_paths
    concatenated_paths_stacked['average_return'] = np.mean(
        undiscounted_returns)

    return concatenated_paths_stacked