def test_stack_tensor_dict_list(self):
    """Check the shapes produced by stacking both fixture lists.

    `self.data` and `self.data2` are each passed through
    `stack_tensor_dict_list`; the top-level entries and the nested `info`
    entries must come back with the expected shapes. The two fixtures are
    expected to differ only in the shape of `info['baba']`.
    """

    def check_shapes(stacked, baba_shape):
        # Shared expectations for both fixtures, plus the one that differs.
        assert stacked['obs'].shape == (2, 3)
        assert stacked['act'].shape == (2, 3)
        assert stacked['info']['lala'].shape == (2, 2)
        assert stacked['info']['baba'].shape == baba_shape

    check_shapes(stack_tensor_dict_list(self.data), (2, 2))
    check_shapes(stack_tensor_dict_list(self.data2), (2, ))
def step(self, action_n):
    """Step all environments using the provided actions.

    Inserts an environment info 'vec_env_executor.complete' containing the
    episode end signal (time limit reached or done signal from
    environment).

    Environments whose episode is complete are reset, their entry in
    `self.ts` is set back to 0, and the observation returned for them is
    the one produced by the reset.

    Args:
        action_n (np.ndarray): Array of actions.

    Returns:
        tuple: Tuple containing:
            * observations (np.ndarray)
            * rewards (np.ndarray)
            * dones (np.ndarray): The done signal from the environment.
            * env_infos (dict[str, np.ndarray])

    """
    all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
    obs, rewards, dones, env_infos = list(
        map(list, list(zip(*all_results))))
    dones = np.asarray(dones)
    rewards = np.asarray(rewards)
    self.ts += 1
    # `np.asarray` hands back the very same object when given an ndarray, so
    # an explicit copy is needed. Without it the time-limit mask below would
    # also overwrite `dones`, which must stay the raw environment signal.
    completes = dones.copy()
    if self.max_path_length is not None:
        completes[self.ts >= self.max_path_length] = True
    for (i, complete) in enumerate(completes):
        if complete:
            obs[i] = self.envs[i].reset()
            self.ts[i] = 0
        # NOTE(review): every environment's info receives the whole
        # `completes` vector rather than `completes[i]` — confirm that the
        # consumers of 'vec_env_executor.complete' expect this layout.
        env_infos[i]['vec_env_executor.complete'] = completes
    return (obs, rewards, dones,
            tensor_utils.stack_tensor_dict_list(env_infos))
def step(self, action_n):
    """Advance every environment by one step.

    Each action in `action_n` is paired with the environment at the same
    position in `self.envs`. The per-environment step counters in `self.ts`
    are incremented, and any environment that reports done or has reached
    `self.max_path_length` (when that limit is not None) is marked done,
    reset, and has its counter set back to 0. For such an environment the
    returned observation is the one produced by the reset.

    Args:
        action_n: One action per environment.

    Returns:
        tuple: `(obs, rewards, dones, env_infos)` where `obs` is a list,
            `rewards` and `dones` are numpy arrays and `env_infos` is the
            result of `tensor_utils.stack_tensor_dict_list`.

    """
    step_outputs = [
        env.step(action) for action, env in zip(action_n, self.envs)
    ]
    # Transpose the per-environment 4-tuples into four per-field lists.
    obs, rewards, dones, env_infos = [
        list(field) for field in zip(*step_outputs)
    ]
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)

    self.ts += 1
    if self.max_path_length is not None:
        timed_out = self.ts >= self.max_path_length
        dones[timed_out] = True

    for env_idx, finished in enumerate(dones):
        if not finished:
            continue
        obs[env_idx] = self.envs[env_idx].reset()
        self.ts[env_idx] = 0

    stacked_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return obs, rewards, dones, stacked_infos
def worker_run_step(g, action_n, scope):
    """Step the environments held by this worker for the given scope.

    Args:
        g: Worker-global object; must expose `parallel_vec_envs` and
            `parallel_vec_env_template`, both indexed by `scope`.
        action_n: Actions indexed by environment id.
        scope: Key selecting which group of environments to step.

    Returns:
        tuple or None: `None` when the worker holds no environment for
            `scope`; otherwise `(ids, obs, rewards, dones, env_infos)`
            with observations flattened through the template environment's
            observation space and env infos stacked.

    """
    assert hasattr(g, 'parallel_vec_envs')
    assert scope in g.parallel_vec_envs
    env_template = g.parallel_vec_env_template[scope]

    ids, step_results = [], []
    for env_id, env in g.parallel_vec_envs[scope]:
        outcome = env.step(action_n[env_id])
        ids.append(env_id)
        step_results.append(tuple(outcome))

    if len(step_results) == 0:
        return None

    # Transpose the per-environment 4-tuples into four per-field lists.
    obs, rewards, dones, env_infos = [
        list(field) for field in zip(*step_results)
    ]
    flat_obs = env_template.observation_space.flatten_n(obs)
    reward_arr = np.asarray(rewards)
    done_arr = np.asarray(dones)
    stacked_infos = tensor_utils.stack_tensor_dict_list(env_infos)
    return ids, flat_obs, reward_arr, done_arr, stacked_infos
def step(self, action_n):
    """Step all environments using the provided actions.

    `worker_run_step` is run on the pool once per entry of
    `self._alloc_env_ids`; workers that return `None` are ignored. The
    remaining results are concatenated and re-ordered by environment id.
    The step counters in `self.ts` are then incremented, environments that
    hit `self.max_path_length` (when it is not None) are marked done, and
    done environments get the observation supplied by `self._run_reset`
    and a counter of 0.

    Args:
        action_n: Actions, indexed by environment id.

    Returns:
        tuple: `(obs, rewards, dones, env_infos)` where `obs` is a list,
            `rewards` and `dones` are numpy arrays and `env_infos` is the
            result of `tensor_utils.stack_tensor_dict_list`.

    """
    results = singleton_pool.run_each(
        worker_run_step,
        [(action_n, self.scope) for _ in self._alloc_env_ids],
    )
    results = [x for x in results if x is not None]
    ids, obs, rewards, dones, env_infos = list(zip(*results))
    ids = np.concatenate(ids)
    obs = self.observation_space.unflatten_n(np.concatenate(obs))
    rewards = np.concatenate(rewards)
    dones = np.concatenate(dones)
    env_infos = tensor_utils.split_tensor_dict_list(
        tensor_utils.concat_tensor_dict_list(env_infos))
    if env_infos is None:
        env_infos = [dict() for _ in range(self.num_envs)]

    # Workers report in arbitrary order; restore the environment-id order.
    items = sorted(zip(ids, obs, rewards, dones, env_infos),
                   key=lambda x: x[0])
    ids, obs, rewards, dones, env_infos = list(zip(*items))
    obs = list(obs)
    rewards = np.asarray(rewards)
    dones = np.asarray(dones)

    self.ts += 1
    # Guard against an unset time limit, as the other executors do;
    # comparing the counters with None would raise a TypeError.
    if self.max_path_length is not None:
        dones[self.ts >= self.max_path_length] = True

    reset_obs = self._run_reset(dones)
    for (i, done) in enumerate(dones):
        if done:
            obs[i] = reset_obs[i]
            self.ts[i] = 0
    return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(
        list(env_infos))
def paths_to_tensors(paths, max_path_length, baseline_predictions, discount):
    """Return processed sample data based on the collected paths.

    As a side effect every path in `paths` is modified in place: it gains a
    'baselines' entry (taken from `baseline_predictions`) and a 'returns'
    entry (discounted cumulative sum of its rewards).

    Args:
        paths (list[dict]): A list of collected paths.
        max_path_length (int): Maximum length of a single rollout.
        baseline_predictions (numpy.ndarray): Predicted value of GAE
            (Generalized Advantage Estimation) Baseline, indexed like
            `paths`.
        discount (float): Environment reward discount.

    Returns:
        dict: Processed sample data, with key
            * observations (numpy.ndarray): Padded array of the observations
                of the environment
            * actions (numpy.ndarray): Padded array of the actions fed to
                the environment
            * rewards (numpy.ndarray): Padded array of the acquired rewards
            * agent_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * env_infos (dict): a dictionary of {stacked tensors or
                dictionary of stacked tensors}
            * valids (numpy.ndarray): Padded array of the validity
                information

    """
    for idx, path in enumerate(paths):
        path['baselines'] = baseline_predictions[idx]
        path['returns'] = tensor_utils.discount_cumsum(path['rewards'],
                                                       discount)

    obs = tensor_utils.pad_tensor_n(
        [path['observations'] for path in paths], max_path_length)
    actions = tensor_utils.pad_tensor_n(
        [path['actions'] for path in paths], max_path_length)
    rewards = tensor_utils.pad_tensor_n(
        [path['rewards'] for path in paths], max_path_length)

    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(path['agent_infos'], max_path_length)
        for path in paths
    ])
    env_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(path['env_infos'], max_path_length)
        for path in paths
    ])

    # One entry per real time step; whatever padding adds marks the rest.
    valids = tensor_utils.pad_tensor_n(
        [np.ones_like(path['returns']) for path in paths], max_path_length)

    return dict(observations=obs,
                actions=actions,
                rewards=rewards,
                agent_infos=agent_infos,
                env_infos=env_infos,
                valids=valids)
def rollout_pearl(env,
                  agent,
                  *,
                  max_path_length=np.inf,
                  animated=False,
                  speedup=1,
                  deterministic=False,
                  accum_context=True):
    """Sample a single rollout of the agent in the environment.

    Args:
        env (gym.Env): Environment to perform actions in.
        agent (Policy): Agent used to select actions.
        max_path_length (int): If the rollout reaches this many timesteps,
            it is terminated.
        animated (bool): If true, render the environment after each step.
        speedup (float): Factor by which to decrease the wait time between
            rendered steps. Only relevant, if animated == true.
        deterministic (bool): If true, use the 'mean' entry of the agent
            info returned by the agent (when present) as the action instead
            of the sampled action.
        accum_context (bool): If true, pass every transition to
            `agent.update_context` right after the environment step.

    Returns:
        dict[str, np.ndarray or dict]: Dictionary, with keys:
            * observations (np.ndarray): The observation the agent acted on
                at every step, with one row per step.
            * actions (np.ndarray): The action taken at every step, with
                one row per step.
            * rewards (np.ndarray): The reward received at every step.
            * next_observations (np.ndarray): `observations` shifted by one
                step, with the final observation returned by the
                environment appended.
            * terminals (np.ndarray): The done flag of every step, reshaped
                into a single column.
            * agent_infos (dict): Stacked `agent_info` dictionaries.
            * env_infos (dict): Stacked `env_info` dictionaries.

    """
    observations = []
    actions = []
    rewards = []
    terminals = []
    agent_infos = []
    env_infos = []
    o = env.reset()
    next_o = None
    agent.reset()
    path_length = 0
    if animated:
        env.render()
    while path_length < max_path_length:
        a, agent_info = agent.get_action(o)
        # Look the mean up in this step's info dict. The accumulated
        # `agent_infos` list never contains the string 'mean', so testing
        # it would silently disable deterministic rollouts.
        if deterministic and 'mean' in agent_info:
            a = agent_info['mean']
        next_o, r, d, env_info = env.step(a)
        if accum_context:
            agent.update_context([o, a, r, next_o, d, env_info])
        observations.append(o)
        rewards.append(r)
        terminals.append(d)
        actions.append(a)
        agent_infos.append(agent_info)
        env_infos.append(env_info)
        path_length += 1
        if d:
            break
        o = next_o
        if animated:
            env.render()
            timestep = 0.05
            time.sleep(timestep / speedup)

    actions = np.array(actions)
    if len(actions.shape) == 1:
        actions = np.expand_dims(actions, 1)
    observations = np.array(observations)
    if len(observations.shape) == 1:
        observations = np.expand_dims(observations, 1)
        # Scalar observations: give the last observation the same extra
        # axis so that it can be stacked below the column built above.
        next_o = np.array([next_o])
    next_observations = np.vstack(
        (observations[1:, :], np.expand_dims(next_o, 0)))
    return dict(
        observations=np.array(observations),
        actions=np.array(actions),
        rewards=np.array(rewards),
        next_observations=next_observations,
        terminals=np.array(terminals).reshape(-1, 1),
        agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos),
        env_infos=tensor_utils.stack_tensor_dict_list(env_infos),
    )
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    If batch size is not specified, episode per task by default is 1 so
    batch size will be meta_batch_size * max_path_length.

    When number of workers are less than meta batch size, sampling will
    be performed for each of self._vec_envs_indices in series. The
    i-th value of self._vec_envs_indices represents the indices of the
    environments/tasks to be sampled for the i-th iteration.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it will be default
            [algo.max_path_length * meta_batch_size].
        whole_paths (bool): Whether return all the paths or not. True
            by default. It's possible for the paths to have total actual
            sample size larger than batch_size; when this flag is false
            the paths are passed through `truncate_paths` before being
            returned.

    Returns:
        list[dict]: Sample paths. Every path carries a `batch_idx` entry
            holding the index of the environment (within the vectorized
            environment) it was sampled from.

    Note:
        Each path is a dictionary, with keys and values as following:
            * observations: numpy.ndarray with shape :math:`[N, S^*]`
            * actions: numpy.ndarray with shape :math:`[N, S^*]`
            * rewards: numpy.ndarray with shape :math:`[N, S^*]`
            * dones: numpy.ndarray with shape :math:`[N, S^*]`
            * env_infos: A dictionary with each key representing one
              environment info, value being a numpy.ndarray with shape
              :math:`[N, S^*]`. One example is "ale.lives" for atari
              environments.
            * agent_infos: A dictionary with each key representing one
              agent info, value being a numpy.ndarray with shape
              :math:`[N, S^*]`. One example is "prev_action", which is
              used for recurrent policy as previous action input, merged
              with the observation input as the state input.
            * batch_idx: index of the environment that produced the path.

    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if batch_size is None:
        batch_size = self.algo.max_path_length * self._meta_batch_size

    paths = []

    tasks = self.env.sample_tasks(self._meta_batch_size)

    # Start main loop
    # The requested batch is split evenly over the groups of environments.
    batch_size_per_loop = batch_size // len(self._vec_envs_indices)
    for vec_envs_indices in self._vec_envs_indices:
        self._setup_worker(vec_envs_indices, tasks)

        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        # One in-progress path per environment; None means "no open path".
        running_paths = [None] * self._vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        # NOTE(review): the timers are re-initialised for every group, so
        # they are not accumulated across groups — confirm this is intended.
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        # Only reset policies at the beginning of a meta batch
        policy.reset(dones)

        while n_samples < batch_size_per_loop:
            t = time.time()

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self._vec_env.step(
                actions)
            env_time += time.time() - t
            t = time.time()

            # Turn the batched info dicts into one dict per environment.
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self._vec_env.num_envs)
                ]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        dones=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    # The episode ended: freeze the path into arrays and
                    # count its steps towards the batch.
                    obs = np.asarray(running_paths[idx]['observations'])
                    # This rebinds the outer `actions` name; the zip above
                    # already holds the original batch, so the remaining
                    # iterations are unaffected.
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(
                                 running_paths[idx]['rewards']),
                             dones=np.asarray(running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.
                             stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos']),
                             batch_idx=idx))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses

        pbar.stop()

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    Only paths whose episode has finished are returned; steps of episodes
    that are still running when the batch is full are discarded.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None (or
            otherwise falsy), it will be default
            [algo.max_path_length * n_envs].
        whole_paths (bool): Whether return all the paths or not. True
            by default. It's possible for the paths to have total actual
            sample size larger than batch_size; when this flag is false
            the paths are passed through `truncate_paths` before being
            returned.

    Returns:
        list[dict]: Sample paths.

    Note:
        Each path is a dictionary, with keys and values as following:
            * observations: numpy.ndarray with shape [Batch, *obs_dims]
            * actions: numpy.ndarray with shape [Batch, *act_dims]
            * rewards: numpy.ndarray with shape [Batch, ]
            * env_infos: A dictionary with each key representing one
              environment info, value being a numpy.ndarray with shape
              [Batch, ?]. One example is "ale.lives" for atari
              environments.
            * agent_infos: A dictionary with each key representing one
              agent info, value being a numpy.ndarray with shape
              [Batch, ?]. One example is "prev_action", which is used
              for recurrent policy as previous action input, merged with
              the observation input as the state input.
            * dones: numpy.ndarray with shape [Batch, ]

    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if not batch_size:
        batch_size = self.algo.max_path_length * self._n_envs

    paths = []
    n_samples = 0
    obses = self._vec_env.reset()
    # Start with every environment flagged as done so that the first
    # `policy.reset(dones)` call below covers all of them.
    dones = np.asarray([True] * self._vec_env.num_envs)
    # One in-progress path per environment; None means "no open path".
    running_paths = [None] * self._vec_env.num_envs

    policy_time = 0
    env_time = 0
    process_time = 0

    policy = self.algo.policy

    with click.progressbar(length=batch_size, label='Sampling') as pbar:
        while n_samples < batch_size:
            t = time.time()
            policy.reset(dones)

            actions, agent_infos = policy.get_actions(obses)

            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = \
                self._vec_env.step(actions)
            env_time += time.time() - t
            t = time.time()

            # Turn the batched info dicts into one dict per environment.
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self._vec_env.num_envs)
                ]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(observations=[],
                                              actions=[],
                                              rewards=[],
                                              env_infos=[],
                                              agent_infos=[],
                                              dones=[])
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                running_paths[idx]['dones'].append(done)
                if done:
                    # The episode ended: freeze the path into arrays and
                    # count its steps towards the batch.
                    obs = np.asarray(running_paths[idx]['observations'])
                    # This rebinds the outer `actions` name; the zip above
                    # already holds the original batch, so the remaining
                    # iterations are unaffected.
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(
                                 running_paths[idx]['rewards']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.
                             stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos']),
                             dones=np.asarray(
                                 running_paths[idx]['dones'])))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None

            process_time += time.time() - t
            pbar.update(len(obses))
            obses = next_obses

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Besides returning summary paths, every finished or cut-off path is
    added to `self.algo.replay_buffer` via `add_path`, with observations
    and actions flattened through the corresponding spaces.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.
            Must not be None.
        whole_paths(bool): Not effective. Only keep here to comply
            with base class.

    Raises:
        ValueError: If the algorithm's exploration_policy is None.

    Returns:
        list: A list of paths. Each path is a dict with the keys
            `rewards`, `dones`, `env_infos`, `running_length`,
            `undiscounted_return` and `success_count`.

    """
    assert batch_size is not None

    paths = []
    # Continue from the last seen observations only when resets are
    # disabled and a previous call has stored some.
    if not self._no_reset or self._last_obses is None:
        obses = self._vec_env.reset()
    else:
        obses = self._last_obses
    completes = np.asarray([True] * self._vec_env.num_envs)
    # One in-progress path per environment; None means "no open path".
    running_paths = [None] * self._vec_env.num_envs
    n_samples = 0

    policy = self.algo.exploration_policy
    if policy is None:
        raise ValueError('OffPolicyVectoriizedSampler should only be used '
                         'with an exploration_policy.')
    while n_samples < batch_size:
        policy.reset(completes)

        obs_space = self.algo.env_spec.observation_space
        input_obses = obs_space.flatten_n(obses)

        actions, agent_infos = policy.get_actions(input_obses)

        next_obses, rewards, dones, env_infos = \
            self._vec_env.step(actions)
        # Read the completion signal from the stacked env infos, before
        # they are split into one dict per environment below.
        # NOTE(review): the layout of this entry depends on what the
        # executor stores under 'vec_env_executor.complete' — confirm it
        # is what `policy.reset` expects.
        completes = env_infos['vec_env_executor.complete']
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        # Every environment contributes one interaction per step.
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]

        for idx, reward, env_info, done, obs, next_obs, action in zip(
                itertools.count(), rewards, env_infos, dones, obses,
                next_obses, actions):
            if running_paths[idx] is None:
                # Seed the statistics with the values carried over from a
                # previous batch for this environment.
                running_paths[idx] = dict(
                    rewards=[],
                    observations=[],
                    next_observations=[],
                    actions=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now
                    # Note that running_length is not len(rewards)
                    # Because a path may not be complete in one batch
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['observations'].append(obs)
            running_paths[idx]['next_observations'].append(next_obs)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            # A missing or falsy 'is_success' counts as 0.
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            # Mirror the statistics on the sampler so that they survive
            # the end of this call.
            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            # Close the path when the episode ended or the batch is full.
            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))

                act_space = self._env_spec.action_space
                path_dict = {}

                path_dict['observations'] = obs_space.flatten_n(
                    running_paths[idx]['observations'])
                path_dict['next_observations'] = obs_space.flatten_n(
                    running_paths[idx]['next_observations'])
                # Rewards and terminals are stored as single columns.
                path_dict['rewards'] = np.asarray(
                    running_paths[idx]['rewards']).reshape(-1, 1)
                path_dict['terminals'] = np.asarray(
                    running_paths[idx]['dones']).reshape(-1, 1)
                path_dict['actions'] = act_space.flatten_n(
                    running_paths[idx]['actions'])

                self.algo.replay_buffer.add_path(path_dict)
                running_paths[idx] = None

                # Only a finished episode clears the carried-over
                # statistics; a path cut off by the batch size keeps them.
                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

        obses = next_obses
    return paths
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Besides returning summary paths, every step's transitions are added to
    `self.algo.replay_buffer` via `add_transitions`.

    Args:
        itr(int): Iteration number.
        batch_size(int): Number of environment interactions in one batch.
            Must not be None.
        whole_paths(bool): Not effective. Only keep here to comply
            with base class.

    Returns:
        list: A list of paths. Each path is a dict with the keys
            `rewards`, `dones`, `env_infos`, `running_length`,
            `undiscounted_return` and `success_count`.

    """
    assert batch_size is not None

    paths = []
    # Continue from the last seen observations only when resets are
    # disabled and a previous call has stored some.
    if not self._no_reset or self._last_obses is None:
        obses = self._vec_env.reset()
    else:
        obses = self._last_obses
    dones = np.asarray([True] * self._vec_env.num_envs)
    # One in-progress path per environment; None means "no open path".
    running_paths = [None] * self._vec_env.num_envs
    n_samples = 0

    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(dones)
        if self.algo.input_include_goal:
            # Observations are dicts here; the policy input is the
            # observation concatenated with the desired goal.
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        obs_normalized = tensor_utils.normalize_pixel_batch(
            self._env_spec, input_obses)
        # Act through the exploration strategy when one is configured.
        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, obs_normalized, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                obs_normalized)

        next_obses, rewards, dones, env_infos = self._vec_env.step(actions)
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        # Every environment contributes one interaction per step.
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]

        if self.algo.input_include_goal:
            # No reward is passed in the goal-conditioned case.
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transitions(
                observation=obses,
                action=actions,
                reward=rewards,
                terminal=dones,
                next_observation=next_obses,
            )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                # Seed the statistics with the values carried over from a
                # previous batch for this environment.
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now
                    # Note that running_length is not len(rewards)
                    # Because a path may not be complete in one batch
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            # A missing or falsy 'is_success' counts as 0.
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            # Mirror the statistics on the sampler so that they survive
            # the end of this call.
            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            # Close the path when the episode ended or the batch is full.
            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                # Only a finished episode clears the carried-over
                # statistics; a path cut off by the batch size keeps them.
                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                    if self.algo.es:
                        self.algo.es.reset()
        obses = next_obses
    return paths
def process_samples(self, itr, paths):
    """Return processed sample data based on the collected paths.

    As a side effect every path in `paths` is modified in place: it gains a
    'baselines' entry (the baseline prediction for that path) and a
    'returns' entry (discounted cumulative sum of its rewards). The policy
    entropy, its perplexity and the running mean episode reward are
    recorded to `tabular`, and `self.episode_reward_mean` is extended with
    the undiscounted returns of this batch.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths

    Returns:
        dict: Processed sample data, with key
            * average_return: (float)

    """
    max_path_length = self.max_path_length

    # Prefer the batched prediction when the baseline offers one.
    if hasattr(self.baseline, 'predict_n'):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path['baselines'] = all_path_baselines[idx]
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)

    # Only the agent infos and the validity mask feed the entropy below;
    # the padded observations, actions, rewards and env infos that used to
    # be built here were never read and have been dropped.
    agent_infos = tensor_utils.stack_tensor_dict_list([
        tensor_utils.pad_tensor_dict(path['agent_infos'], max_path_length)
        for path in paths
    ])
    valids = tensor_utils.pad_tensor_n(
        [np.ones_like(path['returns']) for path in paths], max_path_length)

    # Average the entropy over the steps selected by the validity mask.
    ent = np.sum(self.policy.distribution.entropy(agent_infos) *
                 valids) / np.sum(valids)

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)

    self.episode_reward_mean.extend(undiscounted_returns)

    tabular.record('Entropy', ent)
    tabular.record('Perplexity', np.exp(ent))
    tabular.record('Extras/EpisodeRewardMean',
                   np.mean(self.episode_reward_mean))

    return dict(average_return=np.mean(undiscounted_returns))