def _evaluate(self, policy_opt_input_values, samples_data):
    """Evaluate rewards and everything else.

    Args:
        policy_opt_input_values (list[np.ndarray]): Flattened
            policy optimization input values.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        dict: Processed sample data.

    """
    # pylint: disable=too-many-statements
    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]
    env_rewards = [path['rewards'] for path in paths]
    env_rewards = concat_tensor_list(env_rewards.copy())
    env_returns = [path['returns'] for path in paths]
    env_returns = concat_tensor_list(env_returns.copy())
    env_average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate effect of the entropy terms
    d_rewards = np.mean(aug_rewards - env_rewards)
    tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

    aug_average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    d_returns = np.mean(aug_average_discounted_return -
                        env_average_discounted_return)
    tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

    # Calculate explained variance
    ev = explained_variance_1d(np.concatenate(baselines), aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    inference_rmse = (samples_data['trajectory_infos']['mean'] -
                      samples_data['latents'])**2.
    inference_rmse = np.sqrt(inference_rmse.mean())
    tabular.record('Inference/RMSE', inference_rmse)

    inference_rrse = rrse(samples_data['latents'],
                          samples_data['trajectory_infos']['mean'])
    tabular.record('Inference/RRSE', inference_rrse)

    embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
    tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                   embed_ent)

    infer_ce = self._f_inference_ce(*policy_opt_input_values)
    tabular.record('Inference/CrossEntropy', infer_ce)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    pol_ent = np.sum(pol_ent) / np.sum(samples_data['valids'])
    tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

    task_ents = self._f_task_entropies(*policy_opt_input_values)
    tasks = samples_data['tasks'][:, 0, :]
    _, task_indices = np.nonzero(tasks)
    path_lengths = np.sum(samples_data['valids'], axis=1)
    for t in range(self.policy.task_space.flat_dim):
        lengths = path_lengths[task_indices == t]
        completed = lengths < self.max_episode_length
        pct_completed = np.mean(completed)
        tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                       np.mean(lengths))
        tabular.record('Tasks/TerminationRate/t={}'.format(t),
                       pct_completed)
        tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

    return samples_data
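
# The explained-variance diagnostic recorded above compares the baseline's
# predictions against the entropy-augmented returns. The helper
# explained_variance_1d is not defined in this excerpt; the sketch below is a
# minimal stand-in that assumes it follows the usual definition
# 1 - Var(y - y_pred) / Var(y), with a special case for a constant target.
# It is an illustration under that assumption, not the library's
# implementation.
import numpy as np


def _explained_variance_1d_sketch(ypred, y):
    """Fraction of the variance in ``y`` captured by ``ypred`` (illustrative)."""
    ypred = np.asarray(ypred)
    y = np.asarray(y)
    vary = np.var(y)
    if np.isclose(vary, 0):
        # A constant target is fully "explained" only by a constant prediction.
        return 1.0 if np.isclose(np.var(ypred), 0) else 0.0
    return 1.0 - np.var(y - ypred) / vary
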
def _evaluate(self, policy_opt_input_values, episodes, baselines,
              embed_ep_infos):
    """Evaluate rewards and everything else.

    Args:
        policy_opt_input_values (list[np.ndarray]): Flattened
            policy optimization input values.
        episodes (EpisodeBatch): Batch of episodes.
        baselines (np.ndarray): Baseline predictions.
        embed_ep_infos (dict): Embedding distribution information.

    Returns:
        list[dict]: Paths for fitting the baseline.

    """
    # pylint: disable=too-many-statements
    fit_paths = []
    valids = episodes.valids
    observations = episodes.padded_observations
    tasks = pad_batch_array(episodes.env_infos['task_onehot'],
                            episodes.lengths, self.max_episode_length)
    latents = pad_batch_array(episodes.agent_infos['latent'],
                              episodes.lengths, self.max_episode_length)
    baselines_list = []
    for baseline, valid in zip(baselines, valids):
        baselines_list.append(baseline[valid.astype(bool)])

    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    env_rewards = episodes.rewards
    env_returns = [
        discount_cumsum(rwd, self._discount)
        for rwd in episodes.padded_rewards
    ]
    env_average_discounted_return = np.mean(
        [ret[0] for ret in env_returns])

    # Recompute returns and prepare paths for fitting the baseline
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, task, latent, obs in zip(rewards_tensor,
                                                returns_tensor, valids,
                                                tasks, latents,
                                                observations):
        returns = ret[val.astype(bool)]
        task = task[val.astype(bool)]
        latent = latent[val.astype(bool)]
        obs = obs[val.astype(bool)]

        aug_rewards.append(rew[val.astype(bool)])
        aug_returns.append(returns)
        fit_paths.append(
            dict(observations=obs,
                 tasks=task,
                 latents=latent,
                 returns=returns))
    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)

    # Calculate effect of the entropy terms
    d_rewards = np.mean(aug_rewards - env_rewards)
    tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

    aug_average_discounted_return = np.mean(
        [ret[0] for ret in returns_tensor])
    d_returns = np.mean(aug_average_discounted_return -
                        env_average_discounted_return)
    tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

    # Calculate explained variance
    ev = explained_variance_1d(np.concatenate(baselines_list), aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    inference_rmse = (embed_ep_infos['mean'] - latents)**2.
    inference_rmse = np.sqrt(inference_rmse.mean())
    tabular.record('Inference/RMSE', inference_rmse)

    inference_rrse = rrse(latents, embed_ep_infos['mean'])
    tabular.record('Inference/RRSE', inference_rrse)

    embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
    tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                   embed_ent)

    infer_ce = self._f_inference_ce(*policy_opt_input_values)
    tabular.record('Inference/CrossEntropy', infer_ce)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    pol_ent = np.sum(pol_ent) / np.sum(episodes.lengths)
    tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

    task_ents = self._f_task_entropies(*policy_opt_input_values)
    tasks = tasks[:, 0, :]
    _, task_indices = np.nonzero(tasks)
    path_lengths = np.sum(valids, axis=1)
    for t in range(self.policy.task_space.flat_dim):
        lengths = path_lengths[task_indices == t]
        completed = lengths < self.max_episode_length
        pct_completed = np.mean(completed)
        tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                       np.mean(lengths))
        tabular.record('Tasks/TerminationRate/t={}'.format(t),
                       pct_completed)
        tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

    return fit_paths
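
# Two helpers used above, rrse and discount_cumsum, are defined elsewhere in
# the library. The sketches below show the conventional computations they are
# assumed to perform: rrse as the relative root squared error of the inference
# network's latent reconstruction, and discount_cumsum as the discounted
# return, returns[t] = sum_k discount**k * rewards[t + k]. They are
# illustrations under those assumptions, not the library's implementations.
import numpy as np


def _rrse_sketch(actual, predicted):
    """Relative root squared error between two arrays (illustrative)."""
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)
    return np.sqrt(
        np.sum((actual - predicted)**2) /
        np.sum((actual - np.mean(actual))**2))


def _discount_cumsum_sketch(rewards, discount):
    """Discounted cumulative sum over one reward sequence (illustrative)."""
    returns = np.zeros(len(rewards))
    running = 0.0
    # Accumulate from the last step backwards so each entry already contains
    # the discounted sum of everything that follows it.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + discount * running
        returns[t] = running
    return returns
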