def evaluate(self, algo, test_rollouts_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (garage.np.algos.MetaRLAlgorithm): The algorithm to evaluate.
        test_rollouts_per_task (int or None): Number of rollouts per task.

    """
    if test_rollouts_per_task is None:
        test_rollouts_per_task = self._n_test_rollouts
    adapted_trajectories = []
    logger.log('Sampling for adaptation and meta-testing...')
    if self._test_sampler is None:
        self._test_sampler = self._sampler_class.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_path_length=self._max_path_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=self._test_task_sampler.sample(1))
    for env_up in self._test_task_sampler.sample(self._n_test_tasks):
        policy = algo.get_exploration_policy()
        traj = self._trajectory_batch_class.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_traj)
        ])
        adapted_policy = algo.adapt_policy(policy, traj)
        adapted_traj = self._test_sampler.obtain_samples(
            self._eval_itr,
            test_rollouts_per_task * self._max_path_length,
            adapted_policy)
        adapted_trajectories.append(adapted_traj)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            self._trajectory_batch_class.concatenate(*adapted_trajectories),
            getattr(algo, 'discount', 1.0),
            trajectory_class=self._trajectory_batch_class,
            name_map=name_map)
    self._eval_itr += 1

    if self._trajectory_batch_class == TrajectoryBatch:
        rewards = self._trajectory_batch_class.concatenate(
            *adapted_trajectories).rewards
    else:
        rewards = self._trajectory_batch_class.concatenate(
            *adapted_trajectories).env_rewards

    return sum(rewards) / len(rewards)
def test_log_multitask_performance_task_id():
    lengths = np.array([10, 5, 1, 1])
    batch = TrajectoryBatch(
        EnvSpec(akro.Box(np.array([0., 0., 0.]), np.array([1., 1., 1.])),
                akro.Box(np.array([-1., -1.]), np.array([0., 0.]))),
        observations=np.ones((sum(lengths), 3), dtype=np.float32),
        last_observations=np.ones((len(lengths), 3), dtype=np.float32),
        actions=np.zeros((sum(lengths), 2), dtype=np.float32),
        rewards=np.array([
            0.34026529, 0.58263177, 0.84307509, 0.97651095, 0.81723901,
            0.22631398, 0.03421301, 0.97515046, 0.64311832, 0.65068933,
            0.17657714, 0.04783857, 0.73904013, 0.41364329, 0.52235551,
            0.24203526, 0.43328910
        ]),
        terminals=np.array(
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1], dtype=bool),
        env_infos={
            'success':
            np.array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1],
                     dtype=bool),
            'task_id':
            np.array([1] * 10 + [3] * 5 + [1] + [4])
        },
        agent_infos={},
        lengths=lengths)

    log_file = tempfile.NamedTemporaryFile()
    csv_output = dowel.CsvOutput(log_file.name)
    logger.add_output(csv_output)
    log_multitask_performance(7, batch, 0.8, {
        1: 'env1',
        3: 'env2',
        4: 'env3',
        5: 'env4'
    })
    logger.log(tabular)
    logger.dump_output_type(dowel.CsvOutput)
    with open(log_file.name, 'r') as file:
        rows = list(csv.DictReader(file))
    res = {k: float(r) for (k, r) in rows[0].items()}
    assert res['env1/Iteration'] == 7
    assert res['env2/Iteration'] == 7
    assert res['env3/Iteration'] == 7
    assert res['env4/Iteration'] == 7
    assert res['env1/NumTrajs'] == 2
    assert res['env2/NumTrajs'] == 1
    assert res['env3/NumTrajs'] == 1
    assert res['env4/NumTrajs'] == 0
    assert math.isclose(res['env1/SuccessRate'], 0.5)
    assert math.isclose(res['env2/SuccessRate'], 1.0)
    assert math.isclose(res['env3/SuccessRate'], 1.0)
    assert math.isnan(res['env4/SuccessRate'])
    assert math.isnan(res['env4/AverageReturn'])
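# A note on what the test above exercises: log_multitask_performance groups the
# batch by env_infos['task_id'], resolves display names through the
# {task_id: name} map passed as the fourth argument, and records per-task keys
# such as '<name>/NumTrajs', '<name>/SuccessRate' and '<name>/AverageReturn'.
# A task that appears in the map but is absent from the batch (env4 above) is
# logged with zero trajectories and NaN statistics. The evaluators in this
# section build the same kind of map from an ordered list of task names, e.g.
# name_map = dict(enumerate(test_task_names)), where test_task_names stands in
# for whatever list of test-task labels the caller provides.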
def evaluate(self, algo, test_episodes_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (MetaRLAlgorithm): The algorithm to evaluate.
        test_episodes_per_task (int or None): Number of episodes per task.

    """
    if test_episodes_per_task is None:
        test_episodes_per_task = self._n_test_episodes
    adapted_episodes = []
    logger.log('Sampling for adaptation and meta-testing...')
    env_updates = self._test_task_sampler.sample(self._n_test_tasks)
    if self._test_sampler is None:
        env = env_updates[0]()
        self._max_episode_length = env.spec.max_episode_length
        self._test_sampler = LocalSampler.from_worker_factory(
            WorkerFactory(seed=get_seed(),
                          max_episode_length=self._max_episode_length,
                          n_workers=1,
                          worker_class=self._worker_class,
                          worker_args=self._worker_args),
            agents=algo.get_exploration_policy(),
            envs=env)
    for env_up in env_updates:
        policy = algo.get_exploration_policy()
        eps = EpisodeBatch.concatenate(*[
            self._test_sampler.obtain_samples(self._eval_itr, 1, policy,
                                              env_up)
            for _ in range(self._n_exploration_eps)
        ])
        adapted_policy = algo.adapt_policy(policy, eps)
        adapted_eps = self._test_sampler.obtain_samples(
            self._eval_itr,
            test_episodes_per_task * self._max_episode_length,
            adapted_policy)
        adapted_episodes.append(adapted_eps)
    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            EpisodeBatch.concatenate(*adapted_episodes),
            getattr(algo, 'discount', 1.0),
            name_map=name_map)
    self._eval_itr += 1
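# A minimal usage sketch of the evaluator above, under assumptions: the
# MetaEvaluator constructor arguments shown here (test_task_sampler,
# n_test_tasks, n_exploration_eps, prefix) and the driver function itself are
# hypothetical and may differ from the actual signature in a given garage
# version; only evaluate() is taken from the method above.
def _example_meta_test(algo, test_task_sampler):
    """Hypothetical driver: meta-test a trained algorithm on held-out tasks."""
    meta_evaluator = MetaEvaluator(test_task_sampler=test_task_sampler,
                                   n_test_tasks=10,
                                   n_exploration_eps=2,
                                   prefix='MetaTest')
    # evaluate() samples exploration episodes on each test task, adapts the
    # exploration policy, gathers episodes with the adapted policy, and logs
    # the aggregated results under the 'MetaTest/' tabular prefix.
    meta_evaluator.evaluate(algo, test_episodes_per_task=3)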
def evaluate(self, algo, test_episodes_per_task=None):
    """Evaluate the Meta-RL algorithm on the test tasks.

    Args:
        algo (MetaRLAlgorithm): The algorithm to evaluate.
        test_episodes_per_task (int or None): Number of episodes per task.

    Returns:
        list[EpisodeBatch]: The adapted episodes gathered on each test task.

    """
    if test_episodes_per_task is None:
        test_episodes_per_task = self._n_test_episodes
    adapted_episodes = []
    logger.log('Sampling for adaptation and meta-testing...')
    env_updates = self._test_task_sampler.sample(self._n_test_tasks)
    for env_up in env_updates:
        policy = algo.get_exploration_policy()
        eps = EpisodeBatch.concatenate(*[
            algo._sampler.obtain_samples(self._eval_itr, 1, policy, env_up)
            for _ in range(self._n_exploration_eps)
        ])
        adapted_policy = algo.get_adapted_test_policy(policy, eps)
        adapted_eps = algo._sampler.obtain_samples(
            self._eval_itr,
            test_episodes_per_task * env_up().spec.max_episode_length,
            adapted_policy)
        adapted_episodes.append(adapted_eps)

    if self._verbose:
        for ep in adapted_episodes:
            print(ep.env_infos['task'][0])
            print(f'last observations: {ep.last_observations}')
            print('------------------------------------')

    logger.log('Finished meta-testing...')

    if self._test_task_names is not None:
        name_map = dict(enumerate(self._test_task_names))
    else:
        name_map = None

    with tabular.prefix(self._prefix + '/' if self._prefix else ''):
        log_multitask_performance(
            self._eval_itr,
            EpisodeBatch.concatenate(*adapted_episodes),
            getattr(algo, 'discount', 1.0),
            name_map=name_map)
    self._eval_itr += 1

    return adapted_episodes
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic sampling.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across self._num_evaluation_episodes
            episodes.

    """
    eval_eps = []
    for eval_env in self._eval_env:
        eval_eps.append(
            obtain_evaluation_episodes(
                self.policy,
                eval_env,
                self._max_episode_length_eval,
                num_eps=self._num_evaluation_episodes,
                deterministic=self._use_deterministic_evaluation))
    eval_eps = EpisodeBatch.concatenate(*eval_eps)
    last_return = log_multitask_performance(epoch, eval_eps, self._discount)
    return last_return
def _evaluate_policy(self, epoch):
    """Evaluate the performance of the policy via deterministic rollouts.

    Statistics such as (average) discounted return and success rate are
    recorded.

    Args:
        epoch (int): The current training epoch.

    Returns:
        float: The average return across self._num_evaluation_trajectories
            trajectories.

    """
    eval_trajs = []
    for _ in range(self._num_tasks):
        eval_trajs.append(
            obtain_evaluation_samples(
                self.policy,
                self._eval_env,
                num_trajs=self._num_evaluation_trajectories))
    eval_trajs = TrajectoryBatch.concatenate(*eval_trajs)
    last_return = log_multitask_performance(epoch, eval_trajs,
                                            self._discount)
    return last_return
def _process_samples(self, itr, paths):  # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        EpisodeBatch: Processed batch of episodes for feeding the inner
            algorithm.
        numpy.float64: The average return.

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = discount_cumsum(path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling')

    # All paths in paths_by_task[i] are sampled from task[i].
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        EpisodeBatch.from_list(self._env_spec, paths),
        self._inner_algo._discount,
        name_map=name_map)

    average_return = np.mean(undiscounted_returns)

    episodes = EpisodeBatch.from_list(self._env_spec, concatenated_paths)

    return episodes, average_return
def _process_samples(self, itr, episodes):  # pylint: disable=too-many-statements
    """Return processed sample data based on the collected episodes.

    Args:
        itr (int): Iteration number.
        episodes (EpisodeBatch): Original collected episode batch for each
            task. For each episode, episode.agent_infos['batch_idx']
            indicates which task this episode belongs to. In RL^2, there
            are n environments/tasks and paths in each of them will be
            concatenated at some point and fed to the policy.

    Returns:
        EpisodeBatch: Processed batch of episodes for feeding the inner
            algorithm.
        numpy.float64: The average return.

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for episode in episodes.split():
        if hasattr(episode, 'batch_idx'):
            paths_by_task[episode.batch_idx[0]].append(episode)
        elif 'batch_idx' in episode.agent_infos:
            paths_by_task[episode.agent_infos['batch_idx'][0]].append(
                episode)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling')

    # All episodes in paths_by_task[i] are sampled from task[i].
    for episode_list in paths_by_task.values():
        concatenated_path = self._concatenate_episodes(episode_list)
        concatenated_paths.append(concatenated_path)

    concatenated_episodes = EpisodeBatch.concatenate(*concatenated_paths)

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0]._env, 'all_task_names'):
        names = [
            env._env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        episodes,
        self._inner_algo._discount,
        name_map=name_map)

    average_return = np.mean(undiscounted_returns)

    return concatenated_episodes, average_return
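# Illustration only, reduced to plain Python: the batch_idx routing that the
# _process_samples variants above rely on. Each episode carries the index of
# the task/environment it was sampled from (written by RL2Worker into
# agent_infos), and episodes sharing an index are grouped so they can be
# concatenated and fed to the inner algorithm as one long trajectory. The
# helper name below is hypothetical.
def _group_by_batch_idx(episodes):
    """Group split episodes by their RL^2 task index."""
    groups = collections.defaultdict(list)
    for episode in episodes.split():
        # 'batch_idx' is repeated for every step of the episode; the first
        # entry identifies the task the whole episode came from.
        groups[episode.agent_infos['batch_idx'][0]].append(episode)
    return groups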
def log_performance(self, itr, all_samples, loss_before, loss_after,
                    kl_before, kl, policy_entropy):
    """Evaluate performance of this batch.

    Args:
        itr (int): Iteration number.
        all_samples (list[list[MAMLTrajectoryBatch]]): Two
            dimensional list of MAMLTrajectoryBatch of size
            [meta_batch_size * (num_grad_updates + 1)]
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in last epoch cycle.

    """
    tabular.record('Iteration', itr)

    name_map = None
    if hasattr(self._env, 'all_task_names'):
        names = self._env.all_task_names
        name_map = dict(zip(names, names))

    rtns = log_multitask_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(
            env_spec=self._env.spec,
            paths=[
                path for task_paths in all_samples
                for path in task_paths[self._num_grad_updates].paths
            ]),
        discount=self._inner_algo.discount,
        name_map=name_map)

    with tabular.prefix(self._policy.name + '/'):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KLAfter', kl)
        tabular.record('Entropy', policy_entropy)

    return np.mean(rtns)
def _process_samples(self, itr, paths):  # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and paths in
            each of them will be concatenated at some point and fed to
            the policy.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling')

    # All paths in paths_by_task[i] are sampled from task[i].
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    # Stack and pad to the max path length of the concatenated path, which
    # will be fed to the inner algo, i.e. max_path_length * episode_per_task.
    concatenated_paths_stacked = (
        np_tensor_utils.stack_and_pad_tensor_dict_list(
            concatenated_paths, self._inner_algo.max_path_length))

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0].env, 'all_task_names'):
        names = [
            env.env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        self._inner_algo.discount,
        name_map=name_map)

    concatenated_paths_stacked['paths'] = concatenated_paths
    concatenated_paths_stacked['average_return'] = np.mean(
        undiscounted_returns)

    return concatenated_paths_stacked