def process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        tuple:
            * obs (torch.Tensor): The observations of the environment.
            * avail_actions (torch.Tensor): The available-action masks.
            * actions (torch.Tensor): The actions fed to the environment.
            * rewards (torch.Tensor): The acquired rewards.
            * valids (torch.Tensor): Numbers of valid steps in each path.
            * baselines (torch.Tensor): Value function estimation at each
                step.
            * returns (torch.Tensor): The discounted returns at each step.

    """
    for path in paths:
        if 'returns' not in path:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
    returns = torch.stack([
        pad_to_last(tensor_utils.discount_cumsum(path['rewards'],
                                                 self.discount).copy(),
                    total_length=self.max_path_length) for path in paths
    ])
    valids = torch.Tensor([len(path['actions']) for path in paths]).int()
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    # Cannot pad with all zeros, since the action probabilities must not
    # sum to zero.
    avail_actions = torch.stack([
        pad_one_to_last(path['avail_actions'],
                        total_length=self.max_path_length,
                        axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    if isinstance(self.baseline, LinearFeatureBaseline):
        baselines = torch.stack([
            pad_to_last(self._get_baselines(path),
                        total_length=self.max_path_length)
            for path in paths
        ])
    else:
        with torch.no_grad():
            baselines = self.baseline.forward(obs)
    return obs, avail_actions, actions, rewards, valids, baselines, returns
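# The processing above leans on `discount_cumsum`. As a point of reference,
# here is a minimal, self-contained sketch of its semantics (garage's real
# implementation uses scipy.signal.lfilter); `_discount_cumsum_ref` is a
# hypothetical name used only for illustration.
import numpy as np


def _discount_cumsum_ref(rewards, discount):
    """Return y with y[t] = sum_k discount**k * rewards[t + k]."""
    out = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    # Walk the trajectory backwards, accumulating the discounted tail sum.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        out[t] = running
    return out


# Example: rewards [1, 1, 1] with discount 0.9 -> [2.71, 1.9, 1.0].
assert np.allclose(_discount_cumsum_ref(np.ones(3), 0.9), [2.71, 1.9, 1.0])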
def _log_performance(self, itr, batch, discount, prefix='Evaluation'):
    """Log performance for both the pseudo (self) and environment rewards.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        tuple: Undiscounted self returns and undiscounted environment
            returns.

    """
    self_returns = []
    env_returns = []
    undiscounted_self_returns = []
    undiscounted_env_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        self_returns.append(
            discount_cumsum(trajectory.self_rewards, discount))
        env_returns.append(
            discount_cumsum(trajectory.env_rewards, discount))
        undiscounted_self_returns.append(sum(trajectory.self_rewards))
        undiscounted_env_returns.append(sum(trajectory.env_rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_self_return = np.mean(
        [rtn[0] for rtn in self_returns])
    average_discounted_env_return = np.mean(
        [rtn[0] for rtn in env_returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(self_returns))
        # Pseudo (self) reward statistics.
        tabular.record('AverageDiscountedSelfReturn',
                       average_discounted_self_return)
        tabular.record('AverageSelfReturn',
                       np.mean(undiscounted_self_returns))
        tabular.record('StdSelfReturn', np.std(undiscounted_self_returns))
        tabular.record('MaxSelfReturn', np.max(undiscounted_self_returns))
        tabular.record('MinSelfReturn', np.min(undiscounted_self_returns))
        # Environment reward statistics.
        tabular.record('AverageDiscountedEnvReturn',
                       average_discounted_env_return)
        tabular.record('AverageEnvReturn',
                       np.mean(undiscounted_env_returns))
        tabular.record('StdEnvReturn', np.std(undiscounted_env_returns))
        tabular.record('MaxEnvReturn', np.max(undiscounted_env_returns))
        tabular.record('MinEnvReturn', np.min(undiscounted_env_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_self_returns, undiscounted_env_returns
def log_performance(itr,
                    batch,
                    discount,
                    trajectory_class=TrajectoryBatch,
                    prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of trajectories.

    Args:
        itr (int): Iteration number.
        batch (TrajectoryBatch): The trajectories to evaluate with.
        discount (float): Discount value, from algorithm's property.
        trajectory_class (type): The class of the trajectory batch. If it
            is not TrajectoryBatch, environment rewards are used instead
            of the batch rewards.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    completion = []
    success = []
    for trajectory in batch.split():
        if trajectory_class == TrajectoryBatch:
            returns.append(discount_cumsum(trajectory.rewards, discount))
            undiscounted_returns.append(sum(trajectory.rewards))
        else:
            returns.append(
                discount_cumsum(trajectory.env_rewards, discount))
            undiscounted_returns.append(sum(trajectory.env_rewards))
        completion.append(float(trajectory.terminals.any()))
        if 'success' in trajectory.env_infos:
            success.append(float(trajectory.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumTrajs', len(returns))
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('CompletionRate', np.mean(completion))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
def _train_once(self, epoch, paths):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        epoch (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        float: The average return of the epoch cycle.

    """
    returns = []
    for path in paths:
        returns.append(
            tensor_utils.discount_cumsum(path['rewards'], self._discount))
    avg_return = np.mean(np.concatenate(returns))
    self._all_avg_returns.append(avg_return)

    if (epoch + 1) % self._n_samples == 0:
        # One full epoch cycle: rank the sampled parameter vectors by
        # average return and refit the search distribution on the elites.
        avg_rtns = np.array(self._all_avg_returns)
        best_inds = np.argsort(-avg_rtns)[:self._n_best]
        best_params = np.array(self._all_params)[best_inds]
        self._cur_mean = best_params.mean(axis=0)
        self._cur_std = best_params.std(axis=0)
        self.policy.set_param_values(self._cur_mean)
        avg_return = max(self._all_avg_returns)
        self._all_avg_returns.clear()
        self._all_params.clear()

    self._cur_params = self._sample_params(epoch)
    self._all_params.append(self._cur_params.copy())
    self.policy.set_param_values(self._cur_params)

    return avg_return
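# The elite-selection step above is the core of the cross-entropy method
# (CEM). A minimal, self-contained sketch of one distribution update on toy
# data, assuming a diagonal Gaussian search distribution; `_cem_update` is a
# hypothetical name used only for illustration.
import numpy as np


def _cem_update(params, returns, n_best):
    """Refit mean/std of the search distribution on the top-n_best params."""
    elite_idx = np.argsort(-np.asarray(returns))[:n_best]
    elites = np.asarray(params)[elite_idx]
    return elites.mean(axis=0), elites.std(axis=0)


# Four sampled parameter vectors with their average returns; keep the top 2.
mean, std = _cem_update(params=[[0., 0.], [1., 1.], [2., 2.], [3., 3.]],
                        returns=[0.1, 0.7, 0.9, 0.2],
                        n_best=2)
# The refit mean lies between the two elite vectors [1, 1] and [2, 2].
assert np.allclose(mean, [1.5, 1.5])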
def _train_once(self, samples):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        samples (list[dict]): A list of collected samples.

    Returns:
        numpy.float64: Average return.

    """
    obs = np.concatenate([path['observations'] for path in samples])
    actions = np.concatenate([path['actions'] for path in samples])
    returns = []
    for path in samples:
        returns.append(
            tensor_utils.discount_cumsum(path['rewards'], self._discount))
    returns = np.concatenate(returns)
    sess = tf.compat.v1.get_default_session()
    sess.run(self._train_op,
             feed_dict={
                 self._observation: obs,
                 self._action: actions,
                 self._returns: returns,
             })
    return np.mean(returns)
def process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        tuple:
            * valids (list[int]): Numbers of valid steps in each path.
            * obs (torch.Tensor): The observations of the environment.
            * actions (torch.Tensor): The actions fed to the environment.
            * rewards (torch.Tensor): The acquired rewards.

    """
    for path in paths:
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
    valids = [len(path['actions']) for path in paths]
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    return valids, obs, actions, rewards
def _train_once(self, samples):
    """Perform one step of policy optimization given one batch of samples.

    Args:
        samples (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Average loss.

    """
    losses = []
    self._policy_opt.zero_grad()
    for path in samples:
        returns_numpy = tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        returns = torch.Tensor(returns_numpy.copy())
        obs = torch.Tensor(path['observations'])
        actions = torch.Tensor(path['actions'])
        dist = self.policy(obs)[0]
        log_likelihoods = dist.log_prob(actions)
        # REINFORCE loss: maximize the return-weighted log-likelihood.
        loss = (-log_likelihoods * returns).mean()
        loss.backward()
        losses.append(loss.item())
    self._policy_opt.step()
    return np.mean(losses)
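# A minimal sketch of the same REINFORCE update on toy data, to make the
# return-weighted log-likelihood loss concrete. Everything here (the toy
# policy network, data shapes, hyperparameters) is a hypothetical stand-in,
# not the class above.
import torch
from torch import nn
from torch.distributions import Categorical

policy = nn.Linear(4, 2)              # toy policy: 4-dim obs, 2 actions
opt = torch.optim.SGD(policy.parameters(), lr=1e-2)

obs = torch.randn(5, 4)               # one path of 5 steps
actions = torch.randint(0, 2, (5,))
returns = torch.tensor([2.71, 1.9, 1.0, 0.5, 0.1])  # discounted returns

dist = Categorical(logits=policy(obs))
loss = (-dist.log_prob(actions) * returns).mean()

opt.zero_grad()
loss.backward()
opt.step()                            # one gradient step on the toy path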
def evaluate_performance(self, itr, all_samples, loss_before, loss_after,
                         kl_before, kl, policy_entropy):
    """Evaluate performance of this batch.

    Args:
        itr (int): Iteration number.
        all_samples (list[list[MAMLTrajectoryBatch]]): Two dimensional
            list of MAMLTrajectoryBatch of size
            [meta_batch_size * (num_grad_updates + 1)].
        loss_before (float): Loss before optimization step.
        loss_after (float): Loss after optimization step.
        kl_before (float): KL divergence before optimization step.
        kl (float): KL divergence after optimization step.
        policy_entropy (float): Policy entropy.

    Returns:
        float: The average return in the last epoch cycle.

    """
    tabular.record('Iteration', itr)

    for i in range(self._num_grad_updates + 1):
        all_rewards = [
            path_rewards for task_samples in all_samples
            for path_rewards in task_samples[i].rewards.numpy()
        ]
        discounted_returns = [
            tensor_utils.discount_cumsum(path_rewards,
                                         self._inner_algo.discount)[0]
            for path_rewards in all_rewards
        ]
        undiscounted_returns = np.sum(all_rewards, axis=-1)
        average_return = np.mean(undiscounted_returns)

        with tabular.prefix('Update_{0}/'.format(i)):
            tabular.record('AverageDiscountedReturn',
                           np.mean(discounted_returns))
            tabular.record('AverageReturn', average_return)
            tabular.record('StdReturn', np.std(undiscounted_returns))
            tabular.record('MaxReturn', np.max(undiscounted_returns))
            tabular.record('MinReturn', np.min(undiscounted_returns))
            tabular.record('NumTrajs', len(all_rewards))

    with tabular.prefix(self._policy.name + '/'):
        tabular.record('LossBefore', loss_before)
        tabular.record('LossAfter', loss_after)
        tabular.record('dLoss', loss_before - loss_after)
        tabular.record('KLBefore', kl_before)
        tabular.record('KLAfter', kl)
        tabular.record('Entropy', policy_entropy)

    return average_return
def evaluate_performance(self, itr, batch):
    # pylint: disable=no-self-use
    r"""Evaluate the performance of the algorithm.

    Args:
        itr (int): Iteration number.
        batch (dict): Evaluation trajectories, representing the best
            current performance of the algorithm, with keys:
                * env_spec (garage.envs.EnvSpec): Specification for the
                    environment from which this data was sampled.
                * observations (numpy.ndarray): A numpy array containing
                    the observations for all time steps in this batch.
                * actions (numpy.ndarray): A numpy array containing the
                    actions for all time steps in this batch.
                * rewards (numpy.ndarray): A numpy array containing the
                    rewards for all time steps in this batch.
                * terminals (numpy.ndarray): A boolean numpy array
                    containing the termination signals for all time steps
                    in this batch.
                * env_infos (dict): A dict of numpy arrays of arbitrary
                    environment state information.
                * agent_infos (dict): A dict of numpy arrays of arbitrary
                    agent state information.
                * lengths (numpy.ndarray): An integer numpy array
                    containing the length of each trajectory in this
                    batch.
                * discount (float): Discount value, from algorithm's
                    property.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    for reward in batch['rewards']:
        rtn = np_tensor_utils.discount_cumsum(reward, batch['discount'])
        returns.append(rtn)

    average_discounted_return = np.mean([rtn[0] for rtn in returns])
    undiscounted_returns = [sum(reward) for reward in batch['rewards']]

    tabular.record('Iteration', itr)
    tabular.record('Evaluation/NumTrajs', len(returns))
    tabular.record('Evaluation/AverageDiscountedReturn',
                   average_discounted_return)
    tabular.record('Evaluation/AverageReturn',
                   np.mean(undiscounted_returns))
    tabular.record('Evaluation/StdReturn', np.std(undiscounted_returns))
    tabular.record('Evaluation/MaxReturn', np.max(undiscounted_returns))
    tabular.record('Evaluation/MinReturn', np.min(undiscounted_returns))

    return undiscounted_returns
def process_samples(self, itr, paths):
    r"""Process sample data based on the collected paths.

    Notes: P is the maximum path length (self.max_path_length).

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        torch.Tensor: The observations of the environment
            with shape :math:`(N, P, O*)`.
        torch.Tensor: The actions fed to the environment
            with shape :math:`(N, P, A*)`.
        torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
        torch.Tensor: Numbers of valid steps in each path.
        torch.Tensor: Value function estimation at each step
            with shape :math:`(N, P)`.

    """
    for path in paths:
        if 'returns' not in path:
            path['returns'] = tu.discount_cumsum(path['rewards'],
                                                 self.discount)
    valids = torch.Tensor([len(path['actions']) for path in paths]).int()
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    baselines = torch.stack([
        pad_to_last(self._get_baselines(path),
                    total_length=self.max_path_length) for path in paths
    ])
    return obs, actions, rewards, valids, baselines
def process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        tuple:
            * obs (torch.Tensor): The observations of the environment.
            * actions (torch.Tensor): The actions fed to the environment.
            * rewards (torch.Tensor): The acquired rewards.
            * valids (list[int]): Numbers of valid steps in each path.
            * baselines (torch.Tensor): Value function estimation at each
                step.

    """
    for path in paths:
        if 'returns' not in path:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)
    valids = [len(path['actions']) for path in paths]
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    baselines = torch.stack([
        pad_to_last(self._get_baselines(path),
                    total_length=self.max_path_length) for path in paths
    ])
    return obs, actions, rewards, valids, baselines
def _process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        MAMLTrajectoryBatch: Processed samples data.

    """
    for path in paths:
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self._inner_algo.discount)
    self._baseline.fit(paths)
    obs, actions, rewards, valids, baselines = \
        self._inner_algo.process_samples(itr, paths)
    return MAMLTrajectoryBatch(obs, actions, rewards, valids, baselines)
def log_performance(itr, batch, discount, prefix='Evaluation'):
    """Evaluate the performance of an algorithm on a batch of episodes.

    Args:
        itr (int): Iteration number.
        batch (EpisodeBatch): The episodes to evaluate with.
        discount (float): Discount value, from algorithm's property.
        prefix (str): Prefix to add to all logged keys.

    Returns:
        numpy.ndarray: Undiscounted returns.

    """
    returns = []
    undiscounted_returns = []
    termination = []
    success = []
    for eps in batch.split():
        returns.append(discount_cumsum(eps.rewards, discount))
        undiscounted_returns.append(sum(eps.rewards))
        termination.append(
            float(
                any(step_type == StepType.TERMINAL
                    for step_type in eps.step_types)))
        if 'success' in eps.env_infos:
            success.append(float(eps.env_infos['success'].any()))

    average_discounted_return = np.mean([rtn[0] for rtn in returns])

    with tabular.prefix(prefix + '/'):
        tabular.record('Iteration', itr)
        tabular.record('NumEpisodes', len(returns))
        tabular.record('AverageDiscountedReturn', average_discounted_return)
        tabular.record('AverageReturn', np.mean(undiscounted_returns))
        tabular.record('StdReturn', np.std(undiscounted_returns))
        tabular.record('MaxReturn', np.max(undiscounted_returns))
        tabular.record('MinReturn', np.min(undiscounted_returns))
        tabular.record('TerminationRate', np.mean(termination))
        if success:
            tabular.record('SuccessRate', np.mean(success))

    return undiscounted_returns
def process_samples(self, paths):
    r"""Process sample data based on the collected paths.

    Notes: P is the maximum episode length (self.max_episode_length).

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        torch.Tensor: The observations of the environment
            with shape :math:`(N, P, O*)`.
        torch.Tensor: The actions fed to the environment
            with shape :math:`(N, P, A*)`.
        torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
        torch.Tensor: The discounted returns with shape :math:`(N, P)`.
        torch.Tensor: Numbers of valid steps in each path.
        torch.Tensor: Value function estimation at each step
            with shape :math:`(N, P)`.

    """
    valids = torch.Tensor([len(path['actions']) for path in paths]).int()
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_episode_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_episode_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_episode_length)
        for path in paths
    ])
    returns = torch.stack([
        pad_to_last(tu.discount_cumsum(path['rewards'],
                                       self.discount).copy(),
                    total_length=self.max_episode_length)
        for path in paths
    ])
    with torch.no_grad():
        baselines = self._value_function(obs)
    return obs, actions, rewards, returns, valids, baselines
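# The padding helpers above turn variable-length paths into a fixed
# [N, P, ...] batch. A minimal sketch of that behavior in plain torch;
# `_pad_to_last_ref` is a hypothetical stand-in for garage's pad_to_last.
import torch
import torch.nn.functional as F


def _pad_to_last_ref(x, total_length, axis=0):
    """Zero-pad tensor x along `axis` up to `total_length`."""
    x = torch.as_tensor(x, dtype=torch.float32)
    pad = total_length - x.shape[axis]
    # F.pad takes (left, right) pairs starting from the last dimension.
    spec = [0, 0] * x.dim()
    spec[2 * (x.dim() - 1 - axis) + 1] = pad
    return F.pad(x, spec)


# Two paths of lengths 2 and 3, padded and stacked into shape (2, 4).
batch = torch.stack([
    _pad_to_last_ref(torch.tensor([1., 2.]), total_length=4),
    _pad_to_last_ref(torch.tensor([1., 2., 3.]), total_length=4),
])
assert batch.shape == (2, 4)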
def _process_samples(self, paths):
    """Process sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.

    Returns:
        MAMLTrajectoryBatch: Processed samples data.

    """
    for path in paths:
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self._inner_algo.discount).copy()
    self._train_value_function(paths)
    obs, actions, rewards, _, valids, baselines = \
        self._inner_algo.process_samples(paths)
    return MAMLTrajectoryBatch(paths, obs, actions, rewards, valids,
                               baselines)
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []
    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    samples_data = dict(average_return=np.mean(undiscounted_returns))

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data
def traj_list_to_tensors(paths, max_path_length, baseline_predictions,
                         discount):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with keys
            * observations (numpy.ndarray): Padded array of the
                observations of the environment.
            * actions (numpy.ndarray): Padded array of the actions fed
                to the environment.
            * rewards (numpy.ndarray): Padded array of the acquired
                rewards.
            * agent_infos (dict): A dictionary of {stacked tensors or
                dictionary of stacked tensors}.
            * env_infos (dict): A dictionary of {stacked tensors or
                dictionary of stacked tensors}.
            * valids (numpy.ndarray): Padded array of the validity
                information.

    """
    baselines = []
    returns = []

    for idx, path in enumerate(paths):
        # baselines
        path["baselines"] = baseline_predictions[idx]
        baselines.append(path["baselines"])
        # returns
        path["returns"] = tensor_utils.discount_cumsum(
            path["rewards"], discount)
        returns.append(path["returns"])

    obs = [path["observations"] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path["actions"] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path["rewards"] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path["agent_infos"] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path["env_infos"] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path["returns"]) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        agent_infos=agent_infos,
        env_infos=env_infos,
        valids=valids,
    )
    return samples_data
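# `pad_tensor_n` above stacks a list of variable-length arrays into one
# zero-padded [N, max_len, ...] array. A minimal numpy sketch of that
# behavior; `_pad_tensor_n_ref` is a hypothetical stand-in.
import numpy as np


def _pad_tensor_n_ref(xs, max_len):
    """Stack arrays of different lengths, zero-padding each along axis 0."""
    out = np.zeros((len(xs), max_len) + np.asarray(xs[0]).shape[1:],
                   dtype=np.asarray(xs[0]).dtype)
    for i, x in enumerate(xs):
        out[i, :len(x)] = x
    return out


padded = _pad_tensor_n_ref([np.array([1., 2.]), np.array([3.])], max_len=3)
# -> [[1., 2., 0.],
#     [3., 0., 0.]]
assert padded.shape == (2, 3)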
def process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * lengths: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []
    total_steps = 0
    max_path_length = self.max_path_length

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos'],
                dones=path['dones']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)
    lengths = np.asarray([v.sum() for v in valids])

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )
    return samples_data
def _process_samples(self, itr, paths):
    # pylint: disable=too-many-statements
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (OrderedDict[dict]): A list of collected paths for each
            task. In RL^2, there are n environments/tasks and the paths
            in each of them will be concatenated at some point and fed
            to the policy.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    Raises:
        ValueError: If 'batch_idx' is not found.

    """
    concatenated_paths = []

    paths_by_task = collections.defaultdict(list)
    for path in paths:
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        path['lengths'] = [len(path['rewards'])]
        if 'batch_idx' in path:
            paths_by_task[path['batch_idx']].append(path)
        elif 'batch_idx' in path['agent_infos']:
            paths_by_task[path['agent_infos']['batch_idx'][0]].append(path)
        else:
            raise ValueError(
                'Batch idx is required for RL2 but not found. '
                'Make sure to use garage.tf.algos.rl2.RL2Worker '
                'for sampling.')

    # All paths in paths_by_task[i] are sampled from task[i].
    for _paths in paths_by_task.values():
        concatenated_path = self._concatenate_paths(_paths)
        concatenated_paths.append(concatenated_path)

    # Stack and pad to the max path length of the concatenated path,
    # which will be fed to the inner algo,
    # i.e. max_path_length * episode_per_task.
    concatenated_paths_stacked = (
        np_tensor_utils.stack_and_pad_tensor_dict_list(
            concatenated_paths, self._inner_algo.max_path_length))

    name_map = None
    if hasattr(self._task_sampler, '_envs') and hasattr(
            self._task_sampler._envs[0].env, 'all_task_names'):
        names = [
            env.env.all_task_names[0] for env in self._task_sampler._envs
        ]
        name_map = dict(enumerate(names))

    undiscounted_returns = log_multitask_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self._env_spec, paths),
        self._inner_algo.discount,
        name_map=name_map)

    concatenated_paths_stacked['paths'] = concatenated_paths
    concatenated_paths_stacked['average_return'] = np.mean(
        undiscounted_returns)

    return concatenated_paths_stacked
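# The grouping step above collects every path that came from the same task
# before concatenation. A minimal standalone sketch of that pattern, on toy
# data:
import collections

toy_paths = [
    {'batch_idx': 0, 'rewards': [1.0]},
    {'batch_idx': 1, 'rewards': [2.0]},
    {'batch_idx': 0, 'rewards': [3.0]},
]

toy_paths_by_task = collections.defaultdict(list)
for toy_path in toy_paths:
    toy_paths_by_task[toy_path['batch_idx']].append(toy_path)

# Task 0 contributed two paths, task 1 one path.
assert [len(v) for v in toy_paths_by_task.values()] == [2, 1]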
def paths_to_tensors(paths, max_episode_length, baseline_predictions,
                     discount, gae_lambda):
    """Return processed sample data based on the collected paths.

    Args:
        paths (list[dict]): A list of collected paths.
        max_episode_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) baseline.
        discount (float): Environment reward discount.
        gae_lambda (float): Lambda used for generalized advantage
            estimation.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * lengths: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])

    """
    baselines = []
    returns = []
    total_steps = 0

    for idx, path in enumerate(paths):
        total_steps += len(path['rewards'])
        path_baselines = np.append(baseline_predictions[idx], 0)
        deltas = (path['rewards'] + discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, discount * gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = baseline_predictions[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_episode_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_episode_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_episode_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_episode_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_episode_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_episode_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_episode_length)
    lengths = np.asarray([v.sum() for v in valids])

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        lengths=lengths,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
    )
    return samples_data
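# The delta/advantage computation above is generalized advantage estimation
# (GAE). A minimal numpy sketch on a toy path, reusing the reference
# discounted-cumsum from earlier (all names here are illustrative):
import numpy as np


def _discount_cumsum(x, discount):
    out, running = np.zeros_like(x, dtype=np.float64), 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out


rewards = np.array([1.0, 1.0, 1.0])
baselines = np.array([0.5, 0.4, 0.3])
discount, gae_lambda = 0.99, 0.95

# Bootstrap with V = 0 past the end of the path, then compute the TD
# residuals and their (discount * lambda)-discounted cumulative sum.
v = np.append(baselines, 0)
deltas = rewards + discount * v[1:] - v[:-1]
advantages = _discount_cumsum(deltas, discount * gae_lambda)
assert advantages.shape == rewards.shape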
def process_samples(self, itr, paths):
    r"""Process sample data based on the collected paths.

    When self._kwargs is set, rewards are first penalized by the feature-
    and image-space distance to translated demonstration videos, before
    the usual padding and stacking.

    Notes: P is the maximum path length (self.max_path_length).

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        torch.Tensor: The observations of the environment
            with shape :math:`(N, P, O*)`.
        torch.Tensor: The actions fed to the environment
            with shape :math:`(N, P, A*)`.
        torch.Tensor: The acquired rewards with shape :math:`(N, P)`.
        torch.Tensor: Numbers of valid steps in each path.
        torch.Tensor: Value function estimation at each step
            with shape :math:`(N, P)`.

    """
    for path in paths:
        if 'returns' not in path:
            path['returns'] = tu.discount_cumsum(path['rewards'],
                                                 self.discount)

    if self._kwargs is not None:
        if not self.initialized:
            self.initialize()
        # `sess`, `model`, `image` and `image_trans` are assumed to be set
        # up at module level by initialize().
        for path in paths:
            if not self.mode.startswith('ours'):
                continue
            imgs = [
                img for img in path['env_infos']['imgs'] if img is not None
            ]
            if not hasattr(self, 'means'):
                # Translate the demonstration videos into the current
                # context once, caching the mean features and images.
                self.means = []
                self.imgs = []
                validdata = np.load(self._kwargs['modeldata'])
                for vp in range(self.nvp):
                    context = np.array(imgs[0])
                    timgs = []
                    tfeats = []
                    nvideos = validdata.shape[1]
                    for i in range(nvideos):
                        if i % 10 == 0:
                            print('feats %d' % i)
                        skip = 1
                        input_img = ((validdata[::skip, i] + 1) *
                                     127.5).astype(np.uint8)
                        tfeat, timg = sess.run(
                            [model.translated_z, model.out], {
                                image: [
                                    input_img,
                                    [context] * self.batch_size,
                                    np.array(imgs)
                                ]
                            })
                        timgs.append(timg)
                        tfeats.append(tfeat)
                    self.means.append(np.mean(tfeats, axis=0))
                    meanimgs = np.mean(timgs, axis=0)
                    self.imgs.append(meanimgs)
            costs = 0
            for vp in range(self.nvp):
                curimgs = np.array(imgs)
                feats, img_trans = sess.run(
                    [model.input_z, image_trans], {
                        image: [
                            curimgs, [curimgs[0]] * self.batch_size,
                            curimgs
                        ]
                    })
                costs += np.sum((self.means[vp] - feats)**2, axis=1) + \
                    self._kwargs['scale'] * \
                    np.sum((self.imgs[vp] - img_trans[0])**2,
                           axis=(1, 2, 3))
            # Penalize every other step; 24 matches the fixed rollout
            # segmentation used by this experiment.
            for j in range(24):
                path['rewards'][j * 2 + 1] -= costs[j]  # * (j**2)

    valids = torch.Tensor([len(path['actions']) for path in paths]).int()
    obs = torch.stack([
        pad_to_last(path['observations'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    actions = torch.stack([
        pad_to_last(path['actions'],
                    total_length=self.max_path_length,
                    axis=0) for path in paths
    ])
    rewards = torch.stack([
        pad_to_last(path['rewards'], total_length=self.max_path_length)
        for path in paths
    ])
    baselines = torch.stack([
        pad_to_last(self._get_baselines(path),
                    total_length=self.max_path_length) for path in paths
    ])
    return obs, actions, rewards, valids, baselines
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Parameters
    ----------
    itr : int
        The iteration number.
    paths : list[dict]
        The collected paths from the sampler.

    Returns
    -------
    samples_data : dict
        Processed sample data, with all trajectories padded to the same
        length (padded with 0).

    """
    baselines = []
    returns = []
    max_path_length = self.max_path_length

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    # average_discounted_return = (np.mean(
    #     [path['returns'][0] for path in paths]))
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
    #              valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )
    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []
    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    self.episode_reward_mean.extend(undiscounted_returns)
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(average_return=np.mean(undiscounted_returns))
    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    baselines = []
    returns = []
    max_path_length = self.max_path_length

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    terminals = [path['dones'] for path in paths]

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)
    lengths = np.asarray([v.sum() for v in valids])

    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    undiscounted_returns = self.evaluate_performance(
        itr,
        dict(env_spec=None,
             observations=obs,
             actions=actions,
             rewards=rewards,
             terminals=terminals,
             env_infos=env_infos,
             agent_infos=agent_infos,
             lengths=lengths,
             discount=self.discount))
    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    samples_data = dict(average_return=np.mean(undiscounted_returns))
    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    (Same as in batch_polopt, without entropy and tabular recording.)

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data, with keys
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * baselines: (numpy.ndarray)
            * returns: (numpy.ndarray)
            * valids: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)
            * paths: (list[dict])
            * average_return: (numpy.float64)

    """
    baselines = []
    returns = []
    max_path_length = self.max_path_length

    if self.flatten_input:
        paths = [
            dict(
                observations=(self.env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self.env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]

    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path) for path in paths]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self.discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path['deltas'] = deltas

    for idx, path in enumerate(paths):
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self.discount)
        returns.append(path['returns'])

    # make all paths the same length
    obs = [path['observations'] for path in paths]
    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

    actions = [path['actions'] for path in paths]
    actions = tensor_utils.pad_tensor_n(actions, max_path_length)

    rewards = [path['rewards'] for path in paths]
    rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

    returns = [path['returns'] for path in paths]
    returns = tensor_utils.pad_tensor_n(returns, max_path_length)

    baselines = tensor_utils.pad_tensor_n(baselines, max_path_length)

    agent_infos = [path['agent_infos'] for path in paths]
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in agent_infos
    ])

    env_infos = [path['env_infos'] for path in paths]
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(p, max_path_length)
        for p in env_infos
    ])

    valids = [np.ones_like(path['returns']) for path in paths]
    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

    # average_discounted_return = (np.mean(
    #     [path['returns'][0] for path in paths]))
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self.episode_reward_mean.extend(undiscounted_returns)

    # ent = np.sum(self.policy.distribution.entropy(agent_infos) *
    #              valids) / np.sum(valids)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )
    return samples_data
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        dict: Processed sample data.

    Note:
        The returned samples are a dictionary with keys
            - observations: (numpy.ndarray), shape [B * T, *obs_dims]
            - actions: (numpy.ndarray), shape [B * T, *act_dims]
            - rewards: (numpy.ndarray), shape [B * T, ]
            - baselines: (numpy.ndarray), shape [B * T, ]
            - returns: (numpy.ndarray), shape [B * T, ]
            - lengths: (numpy.ndarray), shape [P, ], i-th entry
              represents the length of the i-th path.
            - valids: (numpy.ndarray), shape [P, T], [i, j] entry is 1
              if the j-th sample in the i-th path is valid, otherwise 0.
            - agent_infos: (dict), see
              OnPolicyVectorizedSampler.obtain_samples()
            - env_infos: (dict), see
              OnPolicyVectorizedSampler.obtain_samples()
            - paths: (list[dict]), the original paths with observations
              or actions flattened
            - average_return: (numpy.float64)
        where B is the batch size, T is the (variable) length of each
        trajectory, and P is the number of paths. Note that B * T equals
        the total number of environment steps in all trajectories.

    """
    baselines = []
    returns = []

    if self._flatten_input:
        paths = [
            dict(
                observations=(self._env_spec.observation_space.flatten_n(
                    path['observations'])),
                actions=(
                    self._env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]
    else:
        paths = [
            dict(
                observations=path['observations'],
                actions=(
                    self._env_spec.action_space.flatten_n(  # noqa: E126
                        path['actions'])),
                rewards=path['rewards'],
                env_infos=path['env_infos'],
                agent_infos=path['agent_infos']) for path in paths
        ]

    if hasattr(self._baseline, 'predict_n'):
        all_path_baselines = self._baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self._baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = (path['rewards'] + self._discount * path_baselines[1:] -
                  path_baselines[:-1])
        path['advantages'] = np_tensor_utils.discount_cumsum(
            deltas, self._discount * self._gae_lambda)
        path['deltas'] = deltas
        # baselines
        path['baselines'] = all_path_baselines[idx]
        baselines.append(path['baselines'])
        # returns
        path['returns'] = np_tensor_utils.discount_cumsum(
            path['rewards'], self._discount)
        returns.append(path['returns'])

    obs = np.concatenate([path['observations'] for path in paths])
    actions = np.concatenate([path['actions'] for path in paths])
    rewards = np.concatenate([path['rewards'] for path in paths])
    returns = np.concatenate(returns)
    baselines = np.concatenate(baselines)

    agent_infos_path = [path['agent_infos'] for path in paths]
    agent_infos = dict()
    for key in self._policy.state_info_keys:
        agent_infos[key] = np.concatenate(
            [infos[key] for infos in agent_infos_path])

    env_infos_path = [path['env_infos'] for path in paths]
    env_infos = dict()
    for key in paths[0]['env_infos'].keys():
        env_infos[key] = np.concatenate(
            [infos[key] for infos in env_infos_path])

    valids = np.asarray([np.ones_like(path['returns']) for path in paths])
    lengths = np.asarray([v.sum() for v in valids])

    average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    undiscounted_returns = [sum(path['rewards']) for path in paths]
    self._episode_reward_mean.extend(undiscounted_returns)

    samples_data = dict(
        observations=obs,
        actions=actions,
        rewards=rewards,
        baselines=baselines,
        returns=returns,
        lengths=lengths,
        valids=valids,
        agent_infos=agent_infos,
        env_infos=env_infos,
        paths=paths,
        average_return=np.mean(undiscounted_returns),
    )

    tabular.record('Iteration', itr)
    tabular.record('AverageDiscountedReturn', average_discounted_return)
    tabular.record('AverageReturn', np.mean(undiscounted_returns))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self._episode_reward_mean))
    tabular.record('NumTrajs', len(paths))
    tabular.record('StdReturn', np.std(undiscounted_returns))
    tabular.record('MaxReturn', np.max(undiscounted_returns))
    tabular.record('MinReturn', np.min(undiscounted_returns))

    return samples_data