Example #1
    def log_diagnostics(self):
        tabular.record("max-path-return_agent", self._max_path_return[0])
        tabular.record("mean-path-return_agent", self._mean_path_return[0])
        tabular.record("last-path-return_agent", self._last_path_return[0])
        tabular.record("episodes", self._n_episodes)
        tabular.record("episode_reward", self._n_episodes)
        tabular.record("terminal_position_x", self._terminal_position[0])
        tabular.record("terminal_position_y", self._terminal_position[1])
        tabular.record("total-samples", self._total_samples)
Example #2
    def log_diagnostics(self):
        for i in range(self.agent_num):
            tabular.record("max-path-return_agent_{}".format(i),
                           self._max_path_return[i])
            tabular.record("mean-path-return_agent_{}".format(i),
                           self._mean_path_return[i])
            tabular.record("last-path-return_agent_{}".format(i),
                           self._last_path_return[i])
        tabular.record("episodes", self._n_episodes)
        tabular.record("episode_reward", self._n_episodes)
        tabular.record("total-samples", self._total_samples)
Example #3
    def _train(self, batch, weights=None):
        prior_loss = self._prior_train(batch)
        opponent_policy_loss = self._opponent_train(batch)
        critic_loss = self._critic_train(batch, weights)
        actor_loss = self._actor_train(batch, weights)

        self._train_step += 1
        if self._train_step % self._target_update_period == 0:
            self._update_target()

        losses = {
            'pg_loss': actor_loss.numpy(),
            'critic_loss': critic_loss.numpy(),
            'opponent_policy_loss': opponent_policy_loss.numpy(),
            'prior_loss': prior_loss.numpy()
        }

        if self._train_step % 1 == 0:  # modulo 1 is always true, so diagnostics are logged every step
            tabular.record('q loss', critic_loss.numpy().item())
            tabular.record('opponent_policy_loss loss',
                           opponent_policy_loss.numpy().item())
            tabular.record('actor_loss loss', actor_loss.numpy().item())
            tabular.record('bi', batch['annealing'].numpy())
            tabular.record('bj', 1.)
            if prior_loss is not None:
                tabular.record('prior loss', prior_loss.numpy())
            logger.log(tabular)

        if self._train_step % 100 == 0:  # sample actions for the (currently disabled) diagnostic prints below
            # print('training statistics')
            # print(self._opponent_policy.get_diagnostics(batch['observations']))
            # print(self._prior.get_diagnostics(batch['observations']))
            opponent_actions = self._opponent_policy.get_actions_np(
                batch['observations'])
            # print(self._policy.get_diagnostics([batch['observations'], opponent_actions]))
            actions = self._policy.get_actions_np(
                [batch['observations'], opponent_actions])
            # print(self._qf.get_diagnostics([batch['observations'], actions, opponent_actions]))

        return losses
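The `_train` step above calls `self._update_target()` every `_target_update_period` steps, but that method is not part of the snippet. For orientation, a hard target-network update in TensorFlow 2 commonly looks like the sketch below; the helper name and the attribute names in the usage comment are hypothetical, not taken from this code.

import tensorflow as tf

def hard_update_target(target_variables, source_variables):
    # Copy every source weight into the matching target weight
    # (a "hard" update, as opposed to a Polyak-averaged soft update).
    for target_var, source_var in zip(target_variables, source_variables):
        target_var.assign(source_var)

# Hypothetical usage inside the agent (attribute names are assumptions):
# hard_update_target(self._target_qf.trainable_variables,
#                    self._qf.trainable_variables)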
Example #4
    def log_diagnostics(self):
        tabular.record('max-path-return_agent', self._max_path_return[0])
        tabular.record('mean-path-return_agent', self._mean_path_return[0])
        tabular.record('last-path-return_agent', self._last_path_return[0])
        tabular.record('episodes', self._n_episodes)
        tabular.record('episode_reward', self._n_episodes)
        tabular.record('terminal_position_x', self._terminal_position[0])
        tabular.record('terminal_position_y', self._terminal_position[1])
        tabular.record('total-samples', self._total_samples)
Example #5
    def log_diagnostics(self):
        # each printed log entry corresponds to one path of 25 steps
        # mean path return is the mean reward over that path
        # max path return is the max over all path means
        # last path return is the return for the last time step in that path

        for i in range(self.agent_num):
            tabular.record('mean-path-return_agent_{}'.format(i), self._mean_path_return[i])
            tabular.record('max-path-return_agent_{}'.format(i), self._max_path_return[i])
            tabular.record('num-hit_agent_{}'.format(i), self.env.agents[i].numHit)  # this agent is the gym agent, not the algorithm agent
            tabular.record('num-was-hit_agent_{}'.format(i), self.env.agents[i].numWasHit)
            # tabular.record('last-path-return_agent_{}'.format(i), self._last_path_return[i])
        tabular.record('episodes', self._n_episodes)
        # tabular.record('episode_reward', self._n_episodes)
        tabular.record('total-samples', self._total_samples)
Example #6
    def _train(self, batch, weights=None):
        prior_loss = self._prior_train(batch)
        opponent_policy_loss = self._opponent_train(batch)
        critic_loss = self._critic_train(batch, weights)
        actor_loss = self._actor_train(batch, weights)

        self._train_step += 1
        if self._train_step % self._target_update_period == 0:
            self._update_target()

        losses = {
            "pg_loss": actor_loss.numpy(),
            "critic_loss": critic_loss.numpy(),
            "opponent_policy_loss": opponent_policy_loss.numpy(),
            "prior_loss": prior_loss.numpy(),
        }

        if self._train_step % 1 == 0:  # modulo 1 is always true, so diagnostics are logged every step
            tabular.record("q loss", critic_loss.numpy().item())
            tabular.record(
                "opponent_policy_loss loss", opponent_policy_loss.numpy().item()
            )
            tabular.record("actor_loss loss", actor_loss.numpy().item())
            tabular.record("bi", batch["annealing"].numpy())
            tabular.record("bj", 1.0)
            if prior_loss is not None:
                tabular.record("prior loss", prior_loss.numpy())
            logger.log(tabular)

        if self._train_step % 100 == 0:  # sample actions for the (currently disabled) diagnostic prints below
            # print('training statistics')
            # print(self._opponent_policy.get_diagnostics(batch['observations']))
            # print(self._prior.get_diagnostics(batch['observations']))
            opponent_actions = self._opponent_policy.get_actions_np(
                batch["observations"]
            )
            # print(self._policy.get_diagnostics([batch['observations'], opponent_actions]))
            actions = self._policy.get_actions_np(
                [batch["observations"], opponent_actions]
            )
            # print(self._qf.get_diagnostics([batch['observations'], actions, opponent_actions]))

        return losses
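The `_train` methods above return a per-step `losses` dict in addition to recording values directly. A caller can aggregate those dicts over an epoch and log the means through the same `tabular`/`logger` pair; the helper below is a sketch of that pattern, not code from the listing, and it assumes the dowel-style API from the earlier sketch, with `agent` and `batches` supplied by the caller.

import numpy as np
from collections import defaultdict
from dowel import logger, tabular

def log_epoch_losses(agent, batches):
    # Hypothetical helper: run one training step per batch, collect the
    # returned loss dicts, and record the mean of each loss for the epoch.
    epoch_losses = defaultdict(list)
    for batch in batches:
        for name, value in agent._train(batch).items():
            epoch_losses[name].append(value)
    for name, values in epoch_losses.items():
        tabular.record('mean_' + name, float(np.mean(values)))
    logger.log(tabular)
    logger.dump_all()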