def log_diagnostics(self, paths, **kwargs):
        list_of_rewards, terminals, obs, actions, next_obs = split_paths(paths)

        returns = []
        for rewards in list_of_rewards:
            returns.append(np.sum(rewards))
        statistics = OrderedDict()
        statistics.update(
            create_stats_ordered_dict(
                'Undiscounted Returns',
                returns,
            ))
        statistics.update(
            create_stats_ordered_dict(
                'Rewards',
                list_of_rewards,
            ))
        statistics.update(create_stats_ordered_dict(
            'Actions',
            actions,
        ))

        fraction_of_time_on_platform = [o[1] for o in obs]
        statistics['Fraction of time on platform'] = np.mean(
            fraction_of_time_on_platform)

        for key, value in statistics.items():
            logger.record_tabular(key, value)
        return returns
Example #2
 def _statistics_from_paths(self, paths, stat_prefix):
     eval_replay_buffer = UpdatableSubtrajReplayBuffer(
         len(paths) * (self.max_path_length + 1),
         self.env,
         self.subtraj_length,
         self.memory_dim,
     )
     for path in paths:
         eval_replay_buffer.add_trajectory(path)
     raw_subtraj_batch = eval_replay_buffer.get_all_valid_subtrajectories()
     assert raw_subtraj_batch is not None
     subtraj_batch = create_torch_subtraj_batch(raw_subtraj_batch)
     if self.save_memory_gradients:
         subtraj_batch['memories'].requires_grad = True
     statistics = self._statistics_from_subtraj_batch(
         subtraj_batch, stat_prefix=stat_prefix
     )
     statistics.update(eval_util.get_generic_path_information(
         paths, stat_prefix="Test",
     ))
     # Split each action into its environment-action and memory-write parts.
     env_actions = np.vstack([path["actions"][:, :self.action_dim]
                              for path in paths])
     writes = np.vstack([path["actions"][:, self.action_dim:]
                         for path in paths])
     statistics.update(create_stats_ordered_dict(
         'Env Actions', env_actions, stat_prefix=stat_prefix
     ))
     statistics.update(create_stats_ordered_dict(
         'Writes', writes, stat_prefix=stat_prefix
     ))
     return statistics
Example #3
    def debug_statistics(self):
        """
        Given an image $$x$$, sample a batch of latents $$z_i$$ from the prior
        and decode them into $$\hat x_i$$.
        Compare these to $$\hat x$$, the reconstruction of $$x$$.
        Ideally:
         - All of the $$\hat x_i$$ do worse than $$\hat x$$ (checks that the
           VAE isn't ignoring the latent).
         - Some $$\hat x_i$$ do better than other $$\hat x_i$$ (checks for
           coverage).
        """
        debug_batch_size = 64
        data = self.get_batch(train=False)
        reconstructions, _, _ = self.model(data)
        img = data[0]
        recon_mse = ((reconstructions[0] - img)**2).mean().view(-1)
        img_repeated = img.expand((debug_batch_size, img.shape[0]))

        samples = ptu.randn(debug_batch_size, self.representation_size)
        random_imgs, _ = self.model.decode(samples)
        random_mses = (random_imgs - img_repeated)**2
        mse_improvement = ptu.get_numpy(random_mses.mean(dim=1) - recon_mse)
        stats = create_stats_ordered_dict(
            'debug/MSE improvement over random',
            mse_improvement,
        )
        stats.update(
            create_stats_ordered_dict(
                'debug/MSE of random decoding',
                ptu.get_numpy(random_mses),
            ))
        stats['debug/MSE of reconstruction'] = ptu.get_numpy(recon_mse)[0]
        return stats
 def log_diagnostics(self, paths, logger=default_logger):
     statistics = OrderedDict()
     for name_in_env_infos, name_to_log in [
         ('distance_to_target', 'Distance to Target'),
         ('speed', 'Speed'),
         ('distance_reward', 'Distance Reward'),
         ('action_reward', 'Action Reward'),
     ]:
         stats = get_stat_in_paths(paths, 'env_infos', name_in_env_infos)
         statistics.update(create_stats_ordered_dict(
             name_to_log,
             stats,
         ))
         final_stats = [s[-1] for s in stats]
         statistics.update(
             create_stats_ordered_dict(
                 "Final " + name_to_log,
                 final_stats,
                 always_show_all_stats=True,
             ))
     statistics.update(
         create_stats_ordered_dict(
             "Path Lengths",
             get_path_lengths(paths),
         ))
     for key, value in statistics.items():
         logger.record_tabular(key, value)
Example #5
    def log_diagnostics(self, paths):
        final_values = []
        final_unclipped_rewards = []
        final_rewards = []
        for path in paths:
            final_value = path["actions"][-1][0]
            final_values.append(final_value)
            score = path["observations"][0][0] * final_value
            final_unclipped_rewards.append(score)
            final_rewards.append(clip_magnitude(score, 1))

        last_statistics = OrderedDict()
        last_statistics.update(
            create_stats_ordered_dict(
                'Final Value',
                final_values,
            ))
        last_statistics.update(
            create_stats_ordered_dict(
                'Unclipped Final Rewards',
                final_unclipped_rewards,
            ))
        last_statistics.update(
            create_stats_ordered_dict(
                'Final Rewards',
                final_rewards,
            ))

        for key, value in last_statistics.items():
            logger.record_tabular(key, value)

        return final_unclipped_rewards
Example #6
    def log_diagnostics(self, paths, **kwargs):
        list_of_rewards, terminals, obs, actions, next_obs = split_paths(paths)

        returns = []
        for rewards in list_of_rewards:
            returns.append(np.sum(rewards))
        last_statistics = OrderedDict()
        last_statistics.update(
            create_stats_ordered_dict(
                'UndiscountedReturns',
                returns,
            ))
        last_statistics.update(
            create_stats_ordered_dict(
                'Rewards',
                list_of_rewards,
            ))
        last_statistics.update(create_stats_ordered_dict(
            'Actions',
            actions,
        ))

        for key, value in last_statistics.items():
            logger.record_tabular(key, value)
        return returns
Example #7
    def _statistics_from_batch(self, batch, stat_prefix):
        statistics = OrderedDict()

        train_dict = self.get_train_dict(batch)
        for name in [
                'Policy Loss',
        ]:
            tensor = train_dict[name]
            statistics_name = "{} {} Mean".format(stat_prefix, name)
            statistics[statistics_name] = np.mean(ptu.get_numpy(tensor))

        for name in [
                'QF Outputs',
                'Policy Actions',
        ]:
            tensor = train_dict[name]
            statistics.update(
                create_stats_ordered_dict('{} {}'.format(stat_prefix, name),
                                          ptu.get_numpy(tensor)))

        statistics.update(
            create_stats_ordered_dict("{} Env Actions".format(stat_prefix),
                                      ptu.get_numpy(batch['actions'])))

        return statistics
Example #8
    def _do_training(self):
        tmp_batch = self.get_batch()
        random_state = tmp_batch['observations']
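        # Observations from this independently sampled batch serve as the
        # "random" reference states for the distance objective below.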

        losses = []
        batch = self.get_batch()
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        ob_deltas_pred = self.model(obs, actions)

        next_obs_pred = obs + ob_deltas_pred
        if self.vectorized:
            distance_to_random_state_pred = (
                    (next_obs_pred - random_state)**2
            )
            distance_to_random_state = (
                    (next_obs - random_state)**2
            )

            squared_errors = (
                 distance_to_random_state_pred - distance_to_random_state
            )**2
            loss = squared_errors.mean()
        else:
            distance_to_random_state_pred = (
                    (next_obs_pred - random_state)**2
            ).sum(1, keepdim=True)
            distance_to_random_state = (
                    (next_obs - random_state)**2
            ).sum(1, keepdim=True)

            squared_errors = (
                distance_to_random_state_pred - distance_to_random_state
            )**2
            loss = squared_errors.mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        losses.append(ptu.get_numpy(loss))

        if self.eval_statistics is None:
            self.eval_statistics = OrderedDict()
            self.eval_statistics.update(create_stats_ordered_dict(
                'Model Loss',
                losses,
                always_show_all_stats=True,
                exclude_max_min=True,
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Distance To Random State',
                ptu.get_numpy(distance_to_random_state),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Distance To Random State Predicted',
                ptu.get_numpy(distance_to_random_state_pred),
            ))
Example #9
    def _do_training(self):
        batch = self.get_batch()
        """
        Optimize Critic/Actor.
        """
        rewards = batch['rewards']
        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']

        _, _, v_pred = self.target_policy(next_obs, None)
        y_target = self.reward_scale * rewards + (
            1. - terminals) * self.discount * v_pred
        y_target = y_target.detach()
        mu, y_pred, v = self.policy(obs, actions)
        policy_loss = self.policy_criterion(y_pred, y_target)

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
        """
        Update Target Networks
        """
        if self.use_soft_update:
            ptu.soft_update_from_to(self.policy, self.target_policy, self.tau)
        else:
            if self._n_train_steps_total % self.target_hard_update_period == 0:
                ptu.copy_model_params_from_to(self.policy, self.target_policy)

        if self.need_to_update_eval_statistics:
            self.need_to_update_eval_statistics = False
            self.eval_statistics['Policy Loss'] = np.mean(
                ptu.get_numpy(policy_loss))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Policy v',
                    ptu.get_numpy(v),
                ))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Policy mu',
                    ptu.get_numpy(mu),
                ))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Y targets',
                    ptu.get_numpy(y_target),
                ))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Y predictions',
                    ptu.get_numpy(y_pred),
                ))
Example #10
 def log_diagnostics(self, paths, logger=default_logger):
     lms = get_stat_in_paths(paths, 'agent_infos', 'lagrange_multiplier')
     for key, value in create_stats_ordered_dict(
             "TDM LBFGS Lagrange Multiplier",
             lms,
     ).items():
         logger.record_tabular(key, value)
    def _do_training(self):
        batch = self.get_batch()
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        """
        Policy operations.
        """
        inputs = torch.cat((obs, self.env.convert_obs_to_goals(next_obs)),
                           dim=1)
        policy_actions = self.policy(inputs)
        policy_loss = self.policy_criterion(policy_actions, actions)
        """
        Update Networks
        """
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        if self.need_to_update_eval_statistics:
            self.need_to_update_eval_statistics = False
            """
            This way, these statistics are only computed for one batch.
            """
            self.eval_statistics = OrderedDict()
            self.eval_statistics['Policy Loss'] = np.mean(
                ptu.get_numpy(policy_loss))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Policy Action',
                    ptu.get_numpy(policy_actions),
                ))
    def _statistics_from_batch(self, batch, stat_prefix):
        statistics = OrderedDict()

        train_dict = self.get_train_dict(batch)
        for name in [
            'QF Loss',
            'Policy Loss',
        ]:
            tensor = train_dict[name]
            statistics_name = "{} {} Mean".format(stat_prefix, name)
            statistics[statistics_name] = np.mean(ptu.get_numpy(tensor))

        for name in [
            'Bellman Errors',
            'Target Value',
            'Target Advantage',
            'Predicted Value',
            'Predicted Advantage',
            'Policy Action Value',
            'Policy Action Advantage',
        ]:
            tensor = train_dict[name]
            statistics.update(create_stats_ordered_dict(
                '{} {}'.format(stat_prefix, name),
                ptu.get_numpy(tensor)
            ))

        return statistics
Example #13
    def _statistics_from_subtraj_batch(self, subtraj_batch, stat_prefix=''):
        statistics = OrderedDict()

        critic_dict = self.get_critic_output_dict(subtraj_batch)
        for name, tensor in critic_dict.items():
            statistics.update(create_stats_ordered_dict(
                '{} QF {}'.format(stat_prefix, name),
                ptu.get_numpy(tensor)
            ))

        policy_dict = self.get_policy_output_dict(subtraj_batch)
        for name, tensor in policy_dict.items():
            statistics.update(create_stats_ordered_dict(
                '{} Policy {}'.format(stat_prefix, name),
                ptu.get_numpy(tensor)
            ))
        return statistics
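 # Presumably registered as a gradient hook on the memory tensor; `self` and
 # `key` come from the enclosing method's scope.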
 def save_gradient_norm(gradient):
     if self.need_to_update_eval_statistics:
         self.extra_eval_statistics.update(
             create_stats_ordered_dict(
                 key,
                 ptu.get_numpy(gradient.data.norm(p=2, dim=1)),
                 always_show_all_stats=True,
             ))
 def log_diagnostics(self, paths, logger=default_logger):
     statistics = OrderedDict()
     for name_in_env_infos, name_to_log in [
         ('distance_to_target', 'Distance to Target'),
         ('reward_ctrl', 'Action Reward'),
     ]:
         stat = get_stat_in_paths(paths, 'env_infos', name_in_env_infos)
         statistics.update(create_stats_ordered_dict(
             name_to_log,
             stat,
         ))
     distances = get_stat_in_paths(paths, 'env_infos', 'distance_to_target')
     statistics.update(create_stats_ordered_dict(
         "Final Distance to Target",
         [ds[-1] for ds in distances],
     ))
     for key, value in statistics.items():
         logger.record_tabular(key, value)
Example #16
 def log_diagnostics(self, paths, logger=default_logger):
     statistics = OrderedDict()
     for name_in_env_infos, name_to_log in [
         ('posafter', 'Position'),
         ('height', 'Height'),
         ('angle', 'Angle'),
     ]:
         stats = get_stat_in_paths(paths, 'env_infos', name_in_env_infos)
         statistics.update(create_stats_ordered_dict(
             name_to_log,
             stats,
         ))
         statistics.update(
             create_stats_ordered_dict(
                 "Final " + name_to_log,
                 [s[-1] for s in stats],
             ))
     for key, value in statistics.items():
         logger.record_tabular(key, value)
 def get_diagnostics(self, paths):
     statistics = OrderedDict()
     for stat_name_in_paths, stat_name_to_print in [
         ('arm_object_distance', 'Distance hand to object'),
         ('arm_goal_distance', 'Distance hand to goal'),
     ]:
         stats = get_stat_in_paths(paths, 'env_infos', stat_name_in_paths)
         statistics.update(create_stats_ordered_dict(
             stat_name_to_print,
             stats,
             always_show_all_stats=True,
         ))
         final_stats = [s[-1] for s in stats]
         statistics.update(create_stats_ordered_dict(
             "Final " + stat_name_to_print,
             final_stats,
             always_show_all_stats=True,
         ))
     return statistics
 def get_diagnostics(self):
     path_lens = [len(path['actions']) for path in self._epoch_paths]
     stats = OrderedDict([
         ('num steps total', self._num_steps_total),
         ('num paths total', self._num_paths_total),
     ])
     stats.update(
         create_stats_ordered_dict(
             "path length",
             path_lens,
             always_show_all_stats=True,
         ))
     return stats
Example #19
    def log_diagnostics(self, paths):
        statistics = OrderedDict()

        for stat_name in [
                'arm to object distance',
                'object to goal distance',
                'arm to goal distance',
        ]:
            stat = get_stat_in_paths(paths, 'env_infos', stat_name)
            statistics.update(create_stats_ordered_dict(stat_name, stat))

        for key, value in statistics.items():
            logger.record_tabular(key, value)
    def log_diagnostics(self, paths):
        target_onehots = []
        for path in paths:
            first_observation = path["observations"][0][:self.n + 1]
            target_onehots.append(first_observation)

        final_predictions = []  # each element has shape (dim)
        nonfinal_predictions = []  # each element has shape (seq_length-1, dim)
        for path in paths:
            actions = path["actions"]
            if self._softmax_action:
                actions = softmax(actions, axis=-1)
            final_predictions.append(actions[-1])
            nonfinal_predictions.append(actions[:-1])
        nonfinal_predictions_sequence_dimension_flattened = np.vstack(
            nonfinal_predictions)  # shape = N X dim
        nonfinal_prob_zero = [
            probs[0]
            for probs in nonfinal_predictions_sequence_dimension_flattened
        ]
        final_probs_correct = []
        for final_prediction, target_onehot in zip(final_predictions,
                                                   target_onehots):
            correct_pred_idx = np.argmax(target_onehot)
            final_probs_correct.append(final_prediction[correct_pred_idx])
        final_prob_zero = [probs[0] for probs in final_predictions]

        last_statistics = OrderedDict()
        last_statistics.update(
            create_stats_ordered_dict('Final P(correct)', final_probs_correct))
        last_statistics.update(
            create_stats_ordered_dict('Non-final P(zero)', nonfinal_prob_zero))
        last_statistics.update(
            create_stats_ordered_dict('Final P(zero)', final_prob_zero))

        for key, value in last_statistics.items():
            logger.record_tabular(key, value)

        return final_probs_correct
 def get_diagnostics(self):
     if self._vae_sample_probs is None or self._vae_sample_priorities is None:
         stats = create_stats_ordered_dict(
             'VAE Sample Weights',
             np.zeros(self._size),
         )
         stats.update(create_stats_ordered_dict(
             'VAE Sample Probs',
             np.zeros(self._size),
         ))
     else:
         vae_sample_priorities = self._vae_sample_priorities[:self._size]
         vae_sample_probs = self._vae_sample_probs[:self._size]
         stats = create_stats_ordered_dict(
             'VAE Sample Weights',
             vae_sample_priorities,
         )
         stats.update(create_stats_ordered_dict(
             'VAE Sample Probs',
             vae_sample_probs,
         ))
     return stats
 def log_diagnostics(self, paths, logger=default_logger):
     statistics = OrderedDict()
     for stat_name_in_paths, stat_name_to_print in [
         ('hand_to_object_distance', 'Distance hand to object'),
         ('object_to_goal_distance', 'Distance object to goal'),
         ('hand_to_hand_goal_distance', 'Distance hand to hand goal'),
         ('success', 'Success (within 0.1)'),
     ]:
         stats = get_stat_in_paths(paths, 'env_infos', stat_name_in_paths)
         statistics.update(
             create_stats_ordered_dict(
                 stat_name_to_print,
                 stats,
                 always_show_all_stats=True,
             ))
         final_stats = [s[-1] for s in stats]
         statistics.update(
             create_stats_ordered_dict(
                 "Final " + stat_name_to_print,
                 final_stats,
                 always_show_all_stats=True,
             ))
     for key, value in statistics.items():
         logger.record_tabular(key, value)
Example #23
 def _statistics_from_paths(self, paths, stat_prefix):
     rewards, terminals, obs, actions, next_obs = split_paths(paths)
     np_batch = dict(
         rewards=rewards,
         terminals=terminals,
         observations=obs,
         actions=actions,
         next_observations=next_obs,
     )
     batch = np_to_pytorch_batch(np_batch)
     statistics = self._statistics_from_batch(batch, stat_prefix)
     statistics.update(
         create_stats_ordered_dict('Num Paths',
                                   len(paths),
                                   stat_prefix=stat_prefix))
     return statistics
    def _do_training(self):
        if not self.vectorized:
            return DQN._do_training(self)
        batch = self.get_batch()
        rewards = batch['rewards']
        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        goals = batch['goals']
        num_steps_left = batch['num_steps_left']
        """
        Compute loss
        """

        target_q_values = self.target_qf(
            next_obs,
            goals,
            num_steps_left - 1,
        ).detach().max(1, keepdim=False)[0]
        y_target = self.reward_scale * rewards + (
            1. - terminals) * self.discount * target_q_values
        y_target = y_target.detach()
        # actions are one-hot vectors
        y_pred = torch.sum(self.qf(obs, goals, num_steps_left) *
                           actions.unsqueeze(2),
                           dim=1,
                           keepdim=False)
        qf_loss = self.qf_criterion(y_pred, y_target)
        """
        Update networks
        """
        self.qf_optimizer.zero_grad()
        qf_loss.backward()
        self.qf_optimizer.step()
        self._update_target_network()

        if self.need_to_update_eval_statistics:
            self.need_to_update_eval_statistics = False
            self.eval_statistics['QF Loss'] = np.mean(ptu.get_numpy(qf_loss))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Y Predictions',
                    ptu.get_numpy(y_pred),
                ))
Example #25
    def _statistics_from_paths(self, paths, stat_prefix):
        eval_replay_buffer = SubtrajReplayBuffer(
            len(paths) * (self.max_path_length + 1),
            self.env,
            self.subtraj_length,
        )
        for path in paths:
            eval_replay_buffer.add_trajectory(path)
        raw_subtraj_batch = eval_replay_buffer.get_all_valid_subtrajectories()
        assert raw_subtraj_batch is not None
        subtraj_batch = create_torch_subtraj_batch(raw_subtraj_batch)

        statistics = self._statistics_from_batch(
            subtraj_batch, stat_prefix=stat_prefix
        )
        statistics.update(create_stats_ordered_dict(
            'Num Paths', len(paths), stat_prefix=stat_prefix
        ))
        return statistics
    def train_from_torch(self, batch):
        rewards = batch['rewards']
        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        """
        Compute loss
        """

        best_action_idxs = self.qf(next_obs).max(1, keepdim=True)[1]
        target_q_values = self.target_qf(next_obs).gather(
            1, best_action_idxs).detach()
        y_target = rewards + (1. - terminals) * self.discount * target_q_values
        y_target = y_target.detach()
        # actions are one-hot vectors
        y_pred = torch.sum(self.qf(obs) * actions, dim=1, keepdim=True)
        qf_loss = self.qf_criterion(y_pred, y_target)
        """
        Update networks
        """
        self.qf_optimizer.zero_grad()
        qf_loss.backward()
        self.qf_optimizer.step()
        """
        Soft target network updates
        """
        if self._n_train_steps_total % self.target_update_period == 0:
            ptu.soft_update_from_to(self.qf, self.target_qf,
                                    self.soft_target_tau)
        """
        Save some statistics for eval using just one batch.
        """
        if self._need_to_update_eval_statistics:
            self._need_to_update_eval_statistics = False
            self.eval_statistics['QF Loss'] = np.mean(ptu.get_numpy(qf_loss))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Y Predictions',
                    ptu.get_numpy(y_pred),
                ))
Example #27
 def _do_training(self):
     batch = self.get_batch()
     rewards = batch['rewards']
     terminals = batch['terminals']
     obs = batch['observations']
     actions = batch['actions']
     next_obs = batch['next_observations']
     """
     Compute loss
     """
     for t in range(self.max_horizon):
         if t == self.max_horizon - 1:
             q_target = self.reward_scale * rewards
         else:
             target_q_values = self.qfs[t + 1](next_obs).detach().max(
                 1, keepdim=True)[0]
             q_target = (self.reward_scale * rewards +
                         (1. - terminals) * self.discount * target_q_values)
         # actions are one-hot vectors
         q_pred = torch.sum(self.qfs[t](obs) * actions, dim=1, keepdim=True)
         qf_loss = self.qf_criterion(q_pred, q_target.detach())
         """
         Update networks
         """
         self.qf_optimizers[t].zero_grad()
         qf_loss.backward()
         self.qf_optimizers[t].step()
         """
         Save some statistics for eval
         """
         if self.need_to_update_eval_statistics:
             self.eval_statistics['QF {} Loss'.format(t)] = np.mean(
                 ptu.get_numpy(qf_loss))
             self.eval_statistics.update(
                 create_stats_ordered_dict(
                     'Q {} Predictions'.format(t),
                     ptu.get_numpy(q_pred),
                 ))
     if self.need_to_update_eval_statistics:
         self.need_to_update_eval_statistics = False
    def _do_training(self):
        batch = self.get_batch()
        obs = batch['observations']
        actions = batch['actions']
        num_steps_left = batch['num_steps_left']
        next_obs = batch['next_observations']
        """
        Policy operations.
        """
        policy_actions = self.policy(
            obs,
            self.env.convert_obs_to_goals(next_obs),
            num_steps_left,
            return_preactivations=False,
        )
        policy_loss = self.policy_criterion(policy_actions, actions)
        """
        Update Networks
        """
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        if self.eval_statistics is None:
            """
            This way, these statistics are only computed for one batch.
            """
            self.eval_statistics = OrderedDict()
            self.eval_statistics['Policy Loss'] = np.mean(
                ptu.get_numpy(policy_loss))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Policy Action',
                    ptu.get_numpy(policy_actions),
                ))
Example #29
    def _do_training(self):
        batch = self.get_batch(training=True)
        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        goal_differences = batch['goal_differences']
        goals = batch['goals']
        """
        Policy operations.
        """
        policy_actions = self.policy(obs)
        # future_goals_predicted = (
        # self.env.convert_obs_to_goals(obs) + self.gcm(obs, policy_actions)
        # )
        # policy_loss = ((future_goals_predicted-goals)**2).sum(dim=1).mean()
        policy_loss = self.gcm(obs, policy_actions).sum(dim=1).mean()
        """
        GCM operations.
        """
        next_actions = self.target_policy(next_obs)
        # speed up computation by not backpropping these gradients
        next_actions = next_actions.detach()
        target_difference = self.target_gcm(
            next_obs,
            next_actions,
        )
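        # Bellman-style target: observed goal difference plus the bootstrapped
        # future difference on non-terminal transitions.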
        gcm_target = goal_differences + (1. - terminals) * target_difference
        gcm_target = gcm_target.detach()
        gcm_pred = self.gcm(obs, actions)
        bellman_errors = (gcm_pred - gcm_target)**2
        gcm_loss = self.gcm_criterion(gcm_pred, gcm_target)
        """
        Update Networks
        """

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        self.gcm_optimizer.zero_grad()
        gcm_loss.backward()
        self.gcm_optimizer.step()

        self._update_target_networks()

        if self.eval_statistics is None:
            """
            Eval should set this to None.
            This way, these statistics are only computed for one batch.
            """
            self.eval_statistics = OrderedDict()
            self.eval_statistics['GCM Loss'] = np.mean(ptu.get_numpy(gcm_loss))
            self.eval_statistics['Policy Loss'] = np.mean(
                ptu.get_numpy(policy_loss))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Bellman Errors',
                    ptu.get_numpy(bellman_errors),
                ))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'Policy Action',
                    ptu.get_numpy(policy_actions),
                ))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'GCM Predictions',
                    ptu.get_numpy(gcm_pred),
                ))
            self.eval_statistics.update(
                create_stats_ordered_dict(
                    'GCM Targets',
                    ptu.get_numpy(gcm_target),
                ))
Example #30
    def _do_training(self):
        batch = self.get_batch()
        rewards = batch['rewards']
        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        goals = batch['goals']
        num_steps_left = batch['num_steps_left']

        q1_pred = self.qf1(
            observations=obs,
            actions=actions,
            goals=goals,
            num_steps_left=num_steps_left,
        )
        q2_pred = self.qf2(
            observations=obs,
            actions=actions,
            goals=goals,
            num_steps_left=num_steps_left,
        )
        # Make sure policy accounts for squashing functions like tanh correctly!
        policy_outputs = self.policy(obs,
                                     goals,
                                     num_steps_left,
                                     reparameterize=self.train_policy_with_reparameterization,
                                     return_log_prob=True)
        new_actions, policy_mean, policy_log_std, log_pi = policy_outputs[:4]
        if not self.dense_rewards and not self.dense_log_pi:
            log_pi = log_pi * terminals

        """
        QF Loss
        """
        target_v_values = self.target_vf(
            observations=next_obs,
            goals=goals,
            num_steps_left=num_steps_left-1,
        )
        q_target = self.reward_scale * rewards + (1. - terminals) * self.discount * target_v_values
        q_target = q_target.detach()
        bellman_errors_1 = (q1_pred - q_target) ** 2
        bellman_errors_2 = (q2_pred - q_target) ** 2
        qf1_loss = bellman_errors_1.mean()
        qf2_loss = bellman_errors_2.mean()

        if self.use_automatic_entropy_tuning:
            """
            Alpha Loss
            """
            alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()
            alpha = self.log_alpha.exp()
        else:
            alpha = 1

        """
        VF Loss
        """
        q1_new_actions = self.qf1(
            observations=obs,
            actions=new_actions,
            goals=goals,
            num_steps_left=num_steps_left,
        )
        q2_new_actions = self.qf2(
            observations=obs,
            actions=new_actions,
            goals=goals,
            num_steps_left=num_steps_left,
        )
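        # Clipped double-Q: take the minimum of the two critics to reduce
        # overestimation.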
        q_new_actions = torch.min(q1_new_actions, q2_new_actions)
        v_target = q_new_actions - alpha * log_pi
        v_pred = self.vf(
            observations=obs,
            goals=goals,
            num_steps_left=num_steps_left,
        )
        v_target = v_target.detach()
        bellman_errors = (v_pred - v_target) ** 2
        vf_loss = bellman_errors.mean()

        """
        Update networks
        """
        self.qf1_optimizer.zero_grad()
        qf1_loss.backward()
        self.qf1_optimizer.step()

        self.qf2_optimizer.zero_grad()
        qf2_loss.backward()
        self.qf2_optimizer.step()

        self.vf_optimizer.zero_grad()
        vf_loss.backward()
        self.vf_optimizer.step()

        """
        Policy Loss
        """
        # paper says to do + but apparently that's a typo. Do Q - V.
        if self.train_policy_with_reparameterization:
            policy_loss = (alpha * log_pi - q_new_actions).mean()
        else:
            log_policy_target = q_new_actions - v_pred
            policy_loss = (
                log_pi * (alpha * log_pi - log_policy_target).detach()
            ).mean()
        mean_reg_loss = self.policy_mean_reg_weight * (policy_mean ** 2).mean()
        std_reg_loss = self.policy_std_reg_weight * (policy_log_std ** 2).mean()
        pre_tanh_value = policy_outputs[-1]
        pre_activation_reg_loss = self.policy_pre_activation_weight * (
            (pre_tanh_value ** 2).sum(dim=1).mean()
        )
        policy_reg_loss = mean_reg_loss + std_reg_loss + pre_activation_reg_loss
        policy_loss = policy_loss + policy_reg_loss

        if self._n_train_steps_total % self.policy_update_period == 0:
            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

        if self._n_train_steps_total % self.target_update_period == 0:
            ptu.soft_update_from_to(
                self.vf, self.target_vf, self.soft_target_tau
            )

        """
        Save some statistics for eval
        """
        if self.need_to_update_eval_statistics:
            self.need_to_update_eval_statistics = False
            """
            Eval should set this to None.
            This way, these statistics are only computed for one batch.
            """
            self.eval_statistics['QF1 Loss'] = np.mean(ptu.get_numpy(qf1_loss))
            self.eval_statistics['QF2 Loss'] = np.mean(ptu.get_numpy(qf2_loss))
            self.eval_statistics['VF Loss'] = np.mean(ptu.get_numpy(vf_loss))
            self.eval_statistics['Policy Loss'] = np.mean(ptu.get_numpy(
                policy_loss
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Q1 Predictions',
                ptu.get_numpy(q1_pred),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Q2 Predictions',
                ptu.get_numpy(q2_pred),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'V Predictions',
                ptu.get_numpy(v_pred),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Log Pis',
                ptu.get_numpy(log_pi),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Policy mu',
                ptu.get_numpy(policy_mean),
            ))
            self.eval_statistics.update(create_stats_ordered_dict(
                'Policy log std',
                ptu.get_numpy(policy_log_std),
            ))
            if self.use_automatic_entropy_tuning:
                self.eval_statistics['Alpha'] = ptu.get_numpy(alpha)[0]
                self.eval_statistics['Alpha Loss'] = ptu.get_numpy(alpha_loss)[0]