def test_explained_variance_1d(self):
    y = np.array([1, 2, 3, 4, 5, 0, 0, 0, 0, 0])
    y_hat = np.array([2, 3, 4, 5, 6, 0, 0, 0, 0, 0])
    valids = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
    result = explained_variance_1d(y, y_hat, valids)
    assert result == 1.0
    result = explained_variance_1d(y, y_hat)
    np.testing.assert_almost_equal(result, 0.95)
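
# A minimal sketch of what explained_variance_1d could compute, assuming
# EV = 1 - Var(y - ypred) / Var(y) with optional masking by valids. The
# helper name and edge-case handling here are illustrative, not the
# library's exact implementation.
def _explained_variance_1d_sketch(ypred, y, valids=None):
    if valids is not None:
        mask = valids.astype(bool)
        ypred, y = ypred[mask], y[mask]
    vary = np.var(y)
    if np.isclose(vary, 0):
        # A constant target is fully explained only by a constant prediction.
        return 1.0 if np.isclose(np.var(ypred), 0) else 0.0
    return 1 - np.var(y - ypred) / vary

# This reproduces the values asserted above: with valids, the masked arrays
# differ by a constant, so Var(y - ypred) == 0 and EV == 1.0; without
# valids, Var(y - y_hat) == 0.25 and Var(y_hat) == 5.0, so EV == 0.95.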
def _fit_baseline(self, samples_data):
    """Update baselines from samples.

    Args:
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate explained variance
    ev = np_tensor_utils.explained_variance_1d(np.concatenate(baselines),
                                               aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self.baseline.name), ev)

    # Fit baseline
    logger.log('Fitting baseline...')
    if hasattr(self.baseline, 'fit_with_samples'):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)
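
# concat_tensor_list is assumed here to flatten a list of per-path arrays
# into a single batch along the time axis; a one-line sketch:
def _concat_tensor_list_sketch(tensor_list):
    return np.concatenate(tensor_list, axis=0)

# Each path contributes only its valid (unpadded) steps, e.g.:
ret = np.array([1.0, 0.5, 0.25, 0.0, 0.0])  # padded to max_path_length
val = np.array([1, 1, 1, 0, 0])
ret[val.astype(bool)]  # -> array([1.0, 0.5, 0.25])
# so aug_returns lines up element-wise with np.concatenate(baselines).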
def optimize_policy(self, itr, samples_data):
    """Optimize policy.

    Args:
        itr (int): Iteration number.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    """
    policy_opt_input_values = self._policy_opt_input_values(samples_data)
    logger.log('Computing loss before')
    loss_before = self._optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self._optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self._optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    ent = np.sum(pol_ent) / np.sum(samples_data['valids'])
    tabular.record('{}/Entropy'.format(self.policy.name), ent)
    tabular.record('{}/Perplexity'.format(self.policy.name), np.exp(ent))

    self._fit_baseline_with_data(samples_data)

    ev = np_tensor_utils.explained_variance_1d(samples_data['baselines'],
                                               samples_data['returns'],
                                               samples_data['valids'])
    tabular.record('{}/ExplainedVariance'.format(self.baseline.name), ev)

    self._old_policy.model.parameters = self.policy.model.parameters
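
# The masked average in optimize_policy above counts entropy only over real
# time steps, not padding; a toy example with hypothetical shapes
# (n_paths, max_path_length):
pol_ent = np.array([[0.5, 0.7, 0.0],
                    [0.6, 0.0, 0.0]])
valids = np.array([[1, 1, 0],
                   [1, 0, 0]])
ent = np.sum(pol_ent) / np.sum(valids)  # (0.5 + 0.7 + 0.6) / 3 = 0.6
perplexity = np.exp(ent)  # ~1.82, the effective branching factor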
def optimize_policy(self, samples_data):
    """Optimize policy.

    Args:
        samples_data (dict): Processed sample data.
            See garage.tf.paths_to_tensors() for details.

    """
    self._fit_baseline_with_data(samples_data)
    samples_data['baselines'] = self._get_baseline_prediction(samples_data)
    policy_opt_input_values = self._policy_opt_input_values(samples_data)

    # Train policy network
    logger.log('Computing loss before')
    loss_before = self._optimizer.loss(policy_opt_input_values)
    logger.log('Computing KL before')
    policy_kl_before = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Optimizing')
    self._optimizer.optimize(policy_opt_input_values)
    logger.log('Computing KL after')
    policy_kl = self._f_policy_kl(*policy_opt_input_values)
    logger.log('Computing loss after')
    loss_after = self._optimizer.loss(policy_opt_input_values)
    tabular.record('{}/LossBefore'.format(self.policy.name), loss_before)
    tabular.record('{}/LossAfter'.format(self.policy.name), loss_after)
    tabular.record('{}/dLoss'.format(self.policy.name),
                   loss_before - loss_after)
    tabular.record('{}/KLBefore'.format(self.policy.name),
                   policy_kl_before)
    tabular.record('{}/KL'.format(self.policy.name), policy_kl)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    tabular.record('{}/Entropy'.format(self.policy.name), np.mean(pol_ent))

    ev = np_tensor_utils.explained_variance_1d(samples_data['baselines'],
                                               samples_data['returns'],
                                               samples_data['valids'])
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    self._old_policy.parameters = self.policy.parameters
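
# _get_baseline_prediction is assumed to predict a value for every path and
# re-pad to max_path_length so the result aligns with 'returns' and
# 'valids'. A plausible sketch; pad_tensor_n and self._baseline.predict are
# assumptions about the surrounding API, not confirmed helpers:
def _get_baseline_prediction_sketch(self, samples_data):
    baselines = [self._baseline.predict(path)
                 for path in samples_data['paths']]
    return pad_tensor_n(baselines, self.max_path_length)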
def evaluate(self, policy_opt_input_values, samples_data):
    """Evaluate rewards and everything else.

    Args:
        policy_opt_input_values (list[np.ndarray]): Flattened policy
            optimization input values.
        samples_data (dict): Processed sample data.
            See process_samples() for details.

    Returns:
        dict: Processed sample data.

    """
    # pylint: disable=too-many-statements
    # Augment reward from baselines
    rewards_tensor = self._f_rewards(*policy_opt_input_values)
    returns_tensor = self._f_returns(*policy_opt_input_values)
    returns_tensor = np.squeeze(returns_tensor, -1)

    paths = samples_data['paths']
    valids = samples_data['valids']
    baselines = [path['baselines'] for path in paths]
    env_rewards = [path['rewards'] for path in paths]
    env_rewards = concat_tensor_list(env_rewards.copy())
    env_returns = [path['returns'] for path in paths]
    env_returns = concat_tensor_list(env_returns.copy())
    env_average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])

    # Recompute parts of samples_data
    aug_rewards = []
    aug_returns = []
    for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids,
                                   paths):
        path['rewards'] = rew[val.astype(bool)]
        path['returns'] = ret[val.astype(bool)]
        aug_rewards.append(path['rewards'])
        aug_returns.append(path['returns'])
    aug_rewards = concat_tensor_list(aug_rewards)
    aug_returns = concat_tensor_list(aug_returns)
    samples_data['rewards'] = aug_rewards
    samples_data['returns'] = aug_returns

    # Calculate effect of the entropy terms
    d_rewards = np.mean(aug_rewards - env_rewards)
    tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards)

    aug_average_discounted_return = np.mean(
        [path['returns'][0] for path in paths])
    d_returns = np.mean(aug_average_discounted_return -
                        env_average_discounted_return)
    tabular.record('{}/EntReturns'.format(self.policy.name), d_returns)

    # Calculate explained variance
    ev = np_tensor_utils.explained_variance_1d(np.concatenate(baselines),
                                               aug_returns)
    tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev)

    inference_rmse = (samples_data['trajectory_infos']['mean'] -
                      samples_data['latents'])**2.
    inference_rmse = np.sqrt(inference_rmse.mean())
    tabular.record('Inference/RMSE', inference_rmse)

    inference_rrse = np_tensor_utils.rrse(
        samples_data['latents'], samples_data['trajectory_infos']['mean'])
    tabular.record('Inference/RRSE', inference_rrse)

    embed_ent = self._f_encoder_entropy(*policy_opt_input_values)
    tabular.record('{}/Encoder/Entropy'.format(self.policy.name),
                   embed_ent)

    infer_ce = self._f_inference_ce(*policy_opt_input_values)
    tabular.record('Inference/CrossEntropy', infer_ce)

    pol_ent = self._f_policy_entropy(*policy_opt_input_values)
    pol_ent = np.sum(pol_ent) / np.sum(samples_data['valids'])
    tabular.record('{}/Entropy'.format(self.policy.name), pol_ent)

    task_ents = self._f_task_entropies(*policy_opt_input_values)
    tasks = samples_data['tasks'][:, 0, :]
    _, task_indices = np.nonzero(tasks)
    path_lengths = np.sum(samples_data['valids'], axis=1)
    for t in range(self.policy.task_space.flat_dim):
        lengths = path_lengths[task_indices == t]
        completed = lengths < self.max_path_length
        pct_completed = np.mean(completed)
        tabular.record('Tasks/EpisodeLength/t={}'.format(t),
                       np.mean(lengths))
        tabular.record('Tasks/CompletionRate/t={}'.format(t),
                       pct_completed)
        tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t])

    return samples_data
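
# rrse (root relative squared error) is assumed here to normalize the
# inference error by the spread of the actual latents; a minimal sketch:
def _rrse_sketch(actual, predicted):
    return np.sqrt(np.sum((actual - predicted)**2) /
                   np.sum((actual - np.mean(actual))**2))

# 0 means the inference network recovers the latents exactly; 1 means it
# does no better than always predicting the mean latent.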