def evaluate_performance(self, itr, all_samples, loss_before, loss_after,
                         kl_before, kl, policy_entropy):
    """Evaluate performance of this batch.

    Args:
        itr (int): Iteration number.
        all_samples (list[list[MAMLTrajectoryBatch]]): Two dimensional list
            of MAMLTrajectoryBatch of size
            [meta_batch_size * (num_grad_updates + 1)].
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in last epoch cycle.

    """
    tabular.record('Iteration', itr)

    for i in range(self._num_grad_updates + 1):
        all_rewards = [
            path_rewards for task_samples in all_samples
            for path_rewards in task_samples[i].rewards.numpy()
        ]

        discounted_returns = [
            tensor_utils.discount_cumsum(path_rewards,
                                         self._inner_algo.discount)[0]
            for path_rewards in all_rewards
        ]
        undiscounted_returns = np.sum(all_rewards, axis=-1)
        average_return = np.mean(undiscounted_returns)

        with tabular.prefix('Update_{0}/'.format(i)):
            tabular.record('AverageDiscountedReturn',
                           np.mean(discounted_returns))
            tabular.record('AverageReturn', average_return)
            tabular.record('StdReturn', np.std(undiscounted_returns))
            tabular.record('MaxReturn', np.max(undiscounted_returns))
            tabular.record('MinReturn', np.min(undiscounted_returns))
            tabular.record('NumTrajs', len(all_rewards))

    with tabular.prefix(self._policy.name + '/'):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KLAfter', kl)
        tabular.record('Entropy', policy_entropy)

    return average_return
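# A minimal usage sketch of the dowel tabular API used throughout this file,
# showing how the 'Update_<i>/' and policy-name prefixes above compose into
# the final logged keys. The keys, values, and the `_demo_tabular_prefix`
# helper are illustrative assumptions, not part of the library.
def _demo_tabular_prefix():
    from dowel import StdOutput, logger, tabular

    logger.add_output(StdOutput())
    with tabular.prefix('Update_0/'):
        tabular.record('AverageReturn', 123.4)  # logged as 'Update_0/AverageReturn'
    with tabular.prefix('MyPolicy/'):
        tabular.record('Entropy', 0.7)          # logged as 'MyPolicy/Entropy'
    logger.log(tabular)  # flush the recorded tabular data to the outputs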
def _train_once(self):
    """Perform one iteration of training."""
    policy_loss_list = []
    qf_loss_list = []
    contrastive_loss_list = []
    alpha_loss_list = []
    alpha_list = []
    for _ in range(self._num_steps_per_epoch):
        indices = np.random.choice(range(self._num_train_tasks),
                                   self._meta_batch_size)
        policy_loss, qf_loss, contrastive_loss, alpha_loss, alpha = \
            self._optimize_policy(indices)
        policy_loss_list.append(policy_loss)
        qf_loss_list.append(qf_loss)
        contrastive_loss_list.append(contrastive_loss)
        alpha_loss_list.append(alpha_loss)
        alpha_list.append(alpha)

    with tabular.prefix('MetaTrain/Average/'):
        tabular.record('PolicyLoss', np.average(np.array(policy_loss_list)))
        tabular.record('QfLoss', np.average(np.array(qf_loss_list)))
        tabular.record('ContrastiveLoss',
                       np.average(np.array(contrastive_loss_list)))
        tabular.record('AlphaLoss', np.average(np.array(alpha_loss_list)))
        tabular.record('Alpha', np.average(np.array(alpha_list)))
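# A small, self-contained sketch of the task sampling in _train_once above:
# np.random.choice over range(num_train_tasks) draws meta_batch_size task
# indices *with replacement*, so a task may appear more than once in a
# meta-batch. The numbers below are illustrative only.
import numpy as np

demo_num_train_tasks = 10
demo_meta_batch_size = 4
demo_indices = np.random.choice(range(demo_num_train_tasks),
                                demo_meta_batch_size)
# demo_indices is an array of 4 task ids drawn from 0..9, possibly repeated.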
def evaluate(self, algo, test_rollouts_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (metarl.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
        test_rollouts_per_task (int or None): Number of rollouts per task.

    """
    if test_rollouts_per_task is None:
        test_rollouts_per_task = self._n_test_rollouts
    adapted_trajectories = []
    logger.log('Sampling for adaptation and meta-testing...')
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = TrajectoryBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr, test_rollouts_per_task * self._max_path_length,
            adapted_policy)
        adapted_trajectories.append(adapted_traj)
    logger.log('Finished meta-testing...')
    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            TrajectoryBatch.concatenate(*adapted_trajectories),
            getattr(algo, 'discount', 1.0),
            task_names=self._test_task_names)
    self._eval_itr += 1
def evaluate(self, algo, test_rollouts_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
        test_rollouts_per_task (int or None): Number of rollouts per task.

    Returns:
        float: The average per-step reward across all adapted rollouts.

    """
    if test_rollouts_per_task is None:
        test_rollouts_per_task = self._n_test_rollouts
    adapted_trajectories = []
    logger.log('Sampling for adaptation and meta-testing...')
    if self._test_sampler is None:
        self._test_sampler = self._sampler_class.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_path_length=self._max_path_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=self._test_task_sampler.sample(1))
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = self._trajectory_batch_class.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr, test_rollouts_per_task * self._max_path_length,
            adapted_policy)
        adapted_trajectories.append(adapted_traj)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            self._trajectory_batch_class.concatenate(*adapted_trajectories),
            getattr(algo, 'discount', 1.0),
            trajectory_class=self._trajectory_batch_class,
            name_map=name_map)
    self._eval_itr += 1

    if self._trajectory_batch_class == TrajectoryBatch:
        rewards = self._trajectory_batch_class.concatenate(
            *adapted_trajectories).rewards
    else:
        rewards = self._trajectory_batch_class.concatenate(
            *adapted_trajectories).env_rewards

    return sum(rewards) / len(rewards)
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of trajectories from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (garage.TrajectoryBatch): Batch of trajectories. The
            trajectories should have either the "task_name" or "task_id"
            `env_infos`. If the "task_name" is not present, then `name_map`
            is required, and should map from task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be
            logged, even if there are no trajectories present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    traj_by_name = defaultdict(list)
    for trajectory in batch.split():
        try:
            task_name = trajectory.env_infos['task_name'][0]
        except KeyError:
            try:
                task_id = trajectory.env_infos['task_id'][0]
                task_name = name_map[task_id]
            except KeyError:
                task_name = 'Task #{}'.format(task_id)
        traj_by_name[task_name].append(trajectory)
    if name_map is None:
        task_names = traj_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in traj_by_name:
            trajectories = traj_by_name[task_name]
            log_performance(itr,
                            garage.TrajectoryBatch.concatenate(*trajectories),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumTrajs', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('CompletionRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
def log_multitask_performance(itr, batch, discount, name_map=None):
    r"""Log performance of episodes from multiple tasks.

    Args:
        itr (int): Iteration number to be logged.
        batch (EpisodeBatch): Batch of episodes. The episodes should have
            either the "task_name" or "task_id" `env_infos`. If the
            "task_name" is not present, then `name_map` is required, and
            should map from task id's to task names.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task id's to task
            names. Optional if the "task_name" environment info is present.
            Note that if provided, all tasks listed in this map will be
            logged, even if there are no episodes present for them.

    Returns:
        numpy.ndarray: Undiscounted returns averaged across all tasks. Has
            shape :math:`(N \bullet [T])`.

    """
    eps_by_name = defaultdict(list)
    for eps in batch.split():
        task_name = '__unnamed_task__'
        if 'task_name' in eps.env_infos:
            task_name = eps.env_infos['task_name'][0]
        elif 'task_id' in eps.env_infos:
            name_map = {} if name_map is None else name_map
            task_id = eps.env_infos['task_id'][0]
            task_name = name_map.get(task_id, 'Task #{}'.format(task_id))
        eps_by_name[task_name].append(eps)
    if name_map is None:
        task_names = eps_by_name.keys()
    else:
        task_names = name_map.values()
    for task_name in task_names:
        if task_name in eps_by_name:
            episodes = eps_by_name[task_name]
            log_performance(itr,
                            EpisodeBatch.concatenate(*episodes),
                            discount,
                            prefix=task_name)
        else:
            with tabular.prefix(task_name + '/'):
                tabular.record('Iteration', itr)
                tabular.record('NumEpisodes', 0)
                tabular.record('AverageDiscountedReturn', np.nan)
                tabular.record('AverageReturn', np.nan)
                tabular.record('StdReturn', np.nan)
                tabular.record('MaxReturn', np.nan)
                tabular.record('MinReturn', np.nan)
                tabular.record('TerminationRate', np.nan)
                tabular.record('SuccessRate', np.nan)

    return log_performance(itr, batch, discount=discount, prefix='Average')
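# A minimal, self-contained sketch of the task-name resolution used above.
# `resolve_task_name` is a hypothetical helper, not part of the library:
# prefer the 'task_name' env_info, otherwise map 'task_id' through name_map,
# otherwise fall back to a generic 'Task #<id>' label.
def resolve_task_name(env_infos, name_map=None):
    if 'task_name' in env_infos:
        return env_infos['task_name'][0]
    name_map = {} if name_map is None else name_map
    task_id = env_infos['task_id'][0]
    return name_map.get(task_id, 'Task #{}'.format(task_id))


# resolve_task_name({'task_id': [3]}, {3: 'push-v1'})  -> 'push-v1'
# resolve_task_name({'task_id': [7]})                  -> 'Task #7'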
def evaluate(self, algo, test_episodes_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (MetaRLAlgorithm): The algorithm to evaluate.
        test_episodes_per_task (int or None): Number of episodes per task.

    """
    if test_episodes_per_task is None:
        test_episodes_per_task = self._n_test_episodes
    adapted_episodes = []
    logger.log('Sampling for adaptation and meta-testing...')
    env_updates = self._test_task_sampler.sample(self._n_test_tasks)
    if self._test_sampler is None:
        env = env_updates[0]()
        self._max_episode_length = env.spec.max_episode_length
        self._test_sampler = LocalSampler.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_episode_length=self._max_episode_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=env)
    for env_up in env_updates:
        policy = algo.get_exploration_policy()
        eps = EpisodeBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_eps)
        ])
        adapted_policy = algo.adapt_policy(policy, eps)
        adapted_eps = self._test_sampler.obtain_samples(
            self._eval_itr,
            test_episodes_per_task * self._max_episode_length,
            adapted_policy)
        adapted_episodes.append(adapted_eps)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            EpisodeBatch.concatenate(*adapted_episodes),
            getattr(algo, 'discount', 1.0),
            name_map=name_map)
    self._eval_itr += 1
def evaluate(self, algo, test_episodes_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (MetaRLAlgorithm): The algorithm to evaluate.
        test_episodes_per_task (int or None): Number of episodes per task.

    Returns:
        list[EpisodeBatch]: The episodes collected with the adapted
            policies.

    """
    if test_episodes_per_task is None:
        test_episodes_per_task = self._n_test_episodes
    adapted_episodes = []
    logger.log('Sampling for adaptation and meta-testing...')
    env_updates = self._test_task_sampler.sample(self._n_test_tasks)
    for env_up in env_updates:
        policy = algo.get_exploration_policy()
        eps = EpisodeBatch.concatenate(*[
            algo._sampler.obtain_samples(self._eval_itr, 1, policy, env_up)
            for _ in range(self._n_exploration_eps)
        ])
        adapted_policy = algo.get_adapted_test_policy(policy, eps)
        adapted_eps = algo._sampler.obtain_samples(
            self._eval_itr,
            test_episodes_per_task * env_up().spec.max_episode_length,
            adapted_policy)
        adapted_episodes.append(adapted_eps)

    if self._verbose:
        for ep in adapted_episodes:
            print(ep.env_infos['task'][0])
            print(f'last observations: {ep.last_observations}')
            print('------------------------------------')

    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            EpisodeBatch.concatenate(*adapted_episodes),
            getattr(algo, 'discount', 1.0),
            name_map=name_map)
    self._eval_itr += 1
    return adapted_episodes
def _log_performance(self, itr, batch, discount, prefix='Evaluation'):
    """Log pseudo-reward and environment-reward statistics for a batch."""
    self_returns = []
    env_returns = []
    undiscounted_self_returns = []
    undiscounted_env_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        self_returns.append(
            discount_cumsum(trajectory.self_rewards, discount))
        env_returns.append(
            discount_cumsum(trajectory.env_rewards, discount))
        undiscounted_self_returns.append(sum(trajectory.self_rewards))
        undiscounted_env_returns.append(sum(trajectory.env_rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_self_return = np.mean(
        [rtn[0] for rtn in self_returns])
    average_discounted_env_return = np.mean(
        [rtn[0] for rtn in env_returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(self_returns))
        # pseudo reward
        tabular.record('AverageDiscountedSelfReturn',
                       average_discounted_self_return)
        tabular.record('AverageSelfReturn',
                       np.mean(undiscounted_self_returns))
        tabular.record('StdSelfReturn', np.std(undiscounted_self_returns))
        tabular.record('MaxSelfReturn', np.max(undiscounted_self_returns))
        tabular.record('MinSelfReturn', np.min(undiscounted_self_returns))
        # env reward
        tabular.record('AverageDiscountedEnvReturn',
                       average_discounted_env_return)
        tabular.record('AverageEnvReturn',
                       np.mean(undiscounted_env_returns))
        tabular.record('StdEnvReturn', np.std(undiscounted_env_returns))
        tabular.record('MaxEnvReturn', np.max(undiscounted_env_returns))
        tabular.record('MinEnvReturn', np.min(undiscounted_env_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_self_returns, undiscounted_env_returns
def log_performance(itr,
                    batch,
                    discount,
                    trajectory_class=TrajectoryBatch,
                    prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        trajectory_class (type): Batch class of `batch`. If it is
            TrajectoryBatch, per-step `rewards` are used; otherwise
            `env_rewards` are used.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        if trajectory_class == TrajectoryBatch:
            returns.append(discount_cumsum(trajectory.rewards, discount))
            undiscounted_returns.append(sum(trajectory.rewards))
        else:
            returns.append(
                discount_cumsum(trajectory.env_rewards, discount))
            undiscounted_returns.append(sum(trajectory.env_rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average undiscounted return over the collected paths.

    """
    obs, actions, rewards, valids, baselines = self.process_samples(
        itr, paths)

    loss = self._compute_loss(itr, obs, actions, rewards, valids, baselines)

    self._old_policy.load_state_dict(self.policy.state_dict())

    self._optimizer.zero_grad()
    loss.backward()

    kl_before = self._compute_kl_constraint(obs).detach()
    self._optimize(itr, obs, actions, rewards, valids, baselines)

    with torch.no_grad():
        loss_after = self._compute_loss(itr, obs, actions, rewards, valids,
                                        baselines)
        kl = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    average_returns = log_performance(itr,
                                      TrajectoryBatch.from_trajectory_list(
                                          self.env_spec, paths),
                                      discount=self.discount)

    with tabular.prefix(self.policy.name):
        tabular.record('LossBefore', loss.item())
        tabular.record('LossAfter', loss_after.item())
        tabular.record('dLoss', loss.item() - loss_after.item())
        tabular.record('KLBefore', kl_before.item())
        tabular.record('KL', kl.item())
        tabular.record('Entropy', policy_entropy.mean().item())

    self.baseline.fit(paths)
    return np.mean(average_returns)
def log_multitask_performance(itr, batch, discount, name_map=None,
                              task_names=None):
    """Log per-task and average performance for a multi-task batch.

    Args:
        itr (int): Iteration number to be logged.
        batch (metarl.TrajectoryBatch): Batch of trajectories, with either a
            "task_name" or "task_id" entry in `env_infos`.
        discount (float): Discount used in computing returns.
        name_map (dict[int, str] or None): Mapping from task ids to task
            names.
        task_names (list[str] or None): If given, every listed task is
            logged, with placeholder values for tasks that have no
            trajectories.

    Returns:
        numpy.ndarray: Undiscounted returns of the whole batch, logged under
            the "Average" prefix.

    """
    name_map = {} if name_map is None else name_map
    traj_by_name = defaultdict(list)
    for trajectory in batch.split():
        try:
            task_name = trajectory.env_infos['task_name'][0]
        except KeyError:
            try:
                task_id = trajectory.env_infos['task_id'][0]
                task_name = name_map[task_id]
            except KeyError:
                task_name = 'Task #{}'.format(task_id)
        traj_by_name[task_name].append(trajectory)

    if task_names is None:
        for (task_name, trajectories) in traj_by_name.items():
            log_performance(itr,
                            metarl.TrajectoryBatch.concatenate(*trajectories),
                            discount,
                            prefix=task_name)
    else:
        for task_name in sorted(task_names):
            if task_name in traj_by_name:
                trajectories = traj_by_name[task_name]
                log_performance(
                    itr,
                    metarl.TrajectoryBatch.concatenate(*trajectories),
                    discount,
                    prefix=task_name)
            else:
                with tabular.prefix(task_name + '/'):
                    tabular.record('Iteration', -1)
                    tabular.record('NumTrajs', -1)
                    tabular.record('AverageDiscountedReturn', -1.)
                    tabular.record('AverageReturn', -1)
                    tabular.record('StdReturn', -1)
                    tabular.record('MaxReturn', -1)
                    tabular.record('MinReturn', -1)
                    tabular.record('CompletionRate', -1)
                    tabular.record('SuccessRate', -1)

    return log_performance(itr, batch, discount=discount, prefix='Average')
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for eps in batch.split():
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if 'success' in eps.env_infos:
            success.append(float(eps.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumEpisodes', len(returns))
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
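# A minimal, self-contained sketch of the discounted cumulative sum that the
# logging code above relies on, assuming the standard definition
# G_t = sum_{k >= t} discount**(k - t) * r_k. `discount_cumsum_demo` is a
# hypothetical stand-in, not the library implementation.
import numpy as np


def discount_cumsum_demo(rewards, discount):
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns


# discount_cumsum_demo([1.0, 1.0, 1.0], 0.9) -> [2.71, 1.9, 1.0]; element 0 is
# the discounted return of the whole episode, which is why `rtn[0]` is logged
# as AverageDiscountedReturn above.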
def _log(self, itr, paths, loss_before, loss_after, kl_before, kl,
         policy_entropy):
    """Log information per iteration based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in last epoch cycle.

    """
    average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    average_return = np.mean(undiscounted_returns)

    self._episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', average_return)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))
    with tabular.prefix(self.policy.name):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KL', kl)
        tabular.record('Entropy', policy_entropy)

    return average_return
def train(self, runner):
    """Obtain samplers and start actual training for each epoch.

    Args:
        runner (LocalRunner): LocalRunner is passed to give algorithm the
            access to runner.step_epochs(), which provides services such as
            snapshotting and sampler control.

    """
    if not self._eval_env:
        self._eval_env = runner.get_env_copy()
    for epoch in runner.step_epochs():
        if self._eval_env is not None:
            log_performance(epoch,
                            obtain_evaluation_samples(
                                self.learner, self._eval_env),
                            discount=1.0)
        losses = self._train_once(runner, epoch)
        with tabular.prefix(self._name + '/'):
            tabular.record('MeanLoss', np.mean(losses))
            tabular.record('StdLoss', np.std(losses))
def _train_once(self, itr, eps):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        eps (EpisodeBatch): A batch of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs = torch.Tensor(eps.padded_observations)
    rewards = torch.Tensor(eps.padded_rewards)
    returns = torch.Tensor(
        np.stack([
            discount_cumsum(reward, self.discount)
            for reward in eps.padded_rewards
        ]))
    valids = eps.lengths
    with torch.no_grad():
        baselines = self._value_function(obs)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.Tensor(eps.observations)
    actions_flat = torch.Tensor(eps.actions)
    rewards_flat = torch.Tensor(eps.rewards)
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(itr,
                                           eps,
                                           discount=self._discount)
    return np.mean(undiscounted_returns)
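# A minimal sketch of the maximum-entropy reward shaping in _train_once above,
# under the assumption that it follows the usual form
# r'_t = r_t + c * H(pi(.|s_t)). The coefficient and arrays are illustrative.
import numpy as np

demo_ent_coeff = 0.01
demo_rewards = np.array([[1.0, 0.5, 0.0]])       # (n_episodes, max_len)
demo_entropies = np.array([[1.2, 1.1, 0.9]])     # same shape as rewards
demo_shaped = demo_rewards + demo_ent_coeff * demo_entropies
# demo_shaped -> [[1.012, 0.511, 0.009]]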
def log_performance(self, indices, test, epoch):
    """Get average returns for specific tasks.

    Args:
        indices (list): List of tasks.
        test (bool): If True, evaluate on the test environments; otherwise
            evaluate on the training environments.
        epoch (int): Epoch number, used as the logged iteration.

    """
    discounted_returns = []
    undiscounted_returns = []
    completion = []
    success = []
    traj = []
    for idx in indices:
        eval_paths = []
        for _ in range(self._num_evals):
            paths = self.collect_paths(idx, test)
            paths[-1]['terminals'] = paths[-1]['terminals'].squeeze()
            paths[-1]['dones'] = paths[-1]['terminals']
            # HalfCheetahVel env
            if 'task' in paths[-1]['env_infos'].keys():
                paths[-1]['env_infos']['task'] = paths[-1]['env_infos'][
                    'task']['velocity']
            eval_paths.append(paths[-1])
            discounted_returns.append(
                discount_cumsum(paths[-1]['rewards'], self._discount))
            undiscounted_returns.append(sum(paths[-1]['rewards']))
            completion.append(float(paths[-1]['terminals'].any()))
            # calculate success rate for metaworld tasks
            if 'success' in paths[-1]['env_infos']:
                success.append(paths[-1]['env_infos']['success'].any())

        if test:
            env = self.test_env[idx]()
            temp_traj = TrajectoryBatch.from_trajectory_list(env, eval_paths)
        else:
            env = self.env[idx]()
            temp_traj = TrajectoryBatch.from_trajectory_list(env, eval_paths)
        traj.append(temp_traj)

    if test:
        with tabular.prefix('Test/'):
            if self._test_task_names:
                log_multitask_performance(
                    epoch,
                    TrajectoryBatch.concatenate(*traj),
                    self._discount,
                    task_names=self._test_task_names)
            log_performance(epoch,
                            TrajectoryBatch.concatenate(*traj),
                            self._discount,
                            prefix='Average')
    else:
        with tabular.prefix('Train/'):
            if self._train_task_names:
                log_multitask_performance(
                    epoch,
                    TrajectoryBatch.concatenate(*traj),
                    self._discount,
                    task_names=self._train_task_names)
            log_performance(epoch,
                            TrajectoryBatch.concatenate(*traj),
                            self._discount,
                            prefix='Average')