Example 1
    def sample(self, explore=False):
        self.step += 1
        if self._current_observation is None:
            self._current_observation = self.env.reset()
        self._current_observation = np.squeeze(
            self._current_observation).flatten()

        if explore:
            action = self.env.action_space.sample()
        else:
            action = self.agent.act(
                np.squeeze(self._current_observation).flatten())

        action = np.asarray(action)
        next_observation, reward, done, info = self.env.step(action)
        next_observation = np.squeeze(next_observation).flatten()
        reward = np.squeeze(reward).flatten()
        action = np.squeeze(action).flatten()
        done = np.squeeze(done)
        done = done.astype(np.int8)

        self._path_length += 1
        self._path_return += np.mean(reward)
        self._total_samples += 1
        self.agent.replay_buffer.add_sample(
            observation=self._current_observation,
            action=action,
            reward=reward,
            terminal=done,
            next_observation=next_observation,
        )

        self._current_observation = next_observation

        if np.all(done) or self._path_length >= self._max_path_length:
            self._max_path_return = np.maximum(self._max_path_return,
                                               self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self._terminal_position = self._current_observation

            self._current_observation = self.env.reset()
            self._path_length = 0
            self._path_return = np.zeros(1)
            self._n_episodes += 1

            # FIXME: temporary bookkeeping; delete afterwards.
            if not explore:
                self.episode_rewards.append(self._last_path_return.item())
                self.episode_positions.append([
                    self._terminal_position[0].item(),
                    self._terminal_position[1].item(),
                ])

            self.log_diagnostics()
            logger.log(tabular)
            logger.dump_all()

        else:
            self._current_observation = next_observation
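
For context, a driver loop around a sampler like the one above typically fills the replay buffer with random actions for a warm-up period and then switches to the learned policy. The sketch below is a minimal illustration under that assumption; the function name and the n_warmup/n_steps parameters are hypothetical and not part of the original code.

    def run_sampling(sampler, n_warmup=1000, n_steps=100_000):
        # Hypothetical driver: `sampler` is assumed to expose the
        # sample(explore=...) method shown above.
        for step in range(n_steps):
            # Random exploration during warm-up, policy actions afterwards.
            sampler.sample(explore=(step < n_warmup))
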
Example 2
    def sample(self, explore=False):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []
        if explore:
            action_n = self.env.action_spaces.sample()
        else:
            for agent, current_observation in zip(self.agents,
                                                  self._current_observation_n):
                action = agent.act(current_observation.astype(np.float32))
                action_n.append(np.array(action))

        action_n = np.asarray(action_n)

        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        # Disabled override: if the environment reports corrected actions via
        # info["new_act"], they could replace action_n here.
        # action_n = info["new_act"]
        if self._global_reward:
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)

        self._path_length += 1
        self._path_return += np.array(reward_n, dtype=np.float32)
        self._total_samples += 1
        for i, agent in enumerate(self.agents):
            opponent_action = action_n[[
                j for j in range(len(action_n)) if j != i
            ]].flatten()
            agent.replay_buffer.add_sample(
                observation=self._current_observation_n[i].astype(np.float32),
                action=action_n[i].astype(np.float32),
                reward=reward_n[i].astype(np.float32),
                terminal=done_n[i],
                next_observation=next_observation_n[i].astype(np.float32),
                opponent_action=opponent_action.astype(np.float32),
            )

        self._current_observation_n = next_observation_n

        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._current_observation_n = self.env.reset()
            self._max_path_return = np.maximum(self._max_path_return,
                                               self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            self._last_path_return = self._path_return
            self.container["path_rw"].append(self._path_return)
            self.container["mean_rw"].append(self._mean_path_return)
            self._path_length = 0
            self._path_return = np.zeros(self.agent_num)
            self._n_episodes += 1
            self.log_diagnostics()
            logger.log(tabular)
            logger.dump_all()
        else:
            self._current_observation_n = next_observation_n
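
The opponent_action passed to each agent's replay buffer is built by selecting every row of action_n except agent i with fancy indexing and flattening the result. Below is a standalone check of that indexing pattern, with made-up shapes (three agents, two-dimensional actions):

    import numpy as np

    action_n = np.array([[0.1, 0.2],
                         [0.3, 0.4],
                         [0.5, 0.6]])
    i = 1
    opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
    print(opponent_action)  # [0.1 0.2 0.5 0.6] -- agents 0 and 2 concatenated
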
Example 3
    def _train(self, batch, weights=None):
        prior_loss = self._prior_train(batch)
        opponent_policy_loss = self._opponent_train(batch)
        critic_loss = self._critic_train(batch, weights)
        actor_loss = self._actor_train(batch, weights)

        self._train_step += 1
        if self._train_step % self._target_update_period == 0:
            self._update_target()

        losses = {
            "pg_loss": actor_loss.numpy(),
            "critic_loss": critic_loss.numpy(),
            "opponent_policy_loss": opponent_policy_loss.numpy(),
            "prior_loss": prior_loss.numpy(),
        }

        if self._train_step % 1 == 0:  # i.e. log every training step
            tabular.record("q loss", critic_loss.numpy().item())
            tabular.record(
                "opponent_policy_loss", opponent_policy_loss.numpy().item()
            )
            tabular.record("actor_loss", actor_loss.numpy().item())
            tabular.record("bi", batch["annealing"].numpy())
            tabular.record("bj", 1.0)
            if prior_loss is not None:
                tabular.record("prior loss", prior_loss.numpy())
            logger.log(tabular)

        if self._train_step % 100 == 0:
            # Periodically query the policies; the results are only consumed by
            # the diagnostics prints below, which are currently commented out.
            # print('training statistics')
            # print(self._opponent_policy.get_diagnostics(batch['observations']))
            # print(self._prior.get_diagnostics(batch['observations']))
            opponent_actions = self._opponent_policy.get_actions_np(
                batch["observations"]
            )
            # print(self._policy.get_diagnostics([batch['observations'], opponent_actions]))
            actions = self._policy.get_actions_np(
                [batch["observations"], opponent_actions]
            )
            # print(self._qf.get_diagnostics([batch['observations'], actions, opponent_actions]))

        return losses
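
_update_target() is invoked every _target_update_period training steps, but its body is not shown here. A common way to implement such an update is a Polyak (soft) copy of the online critic's weights into the target critic; the sketch below assumes TensorFlow 2 Keras models named qf and target_qf and a mixing coefficient tau, none of which appear in the snippet above.

    import tensorflow as tf

    def soft_update(target_qf: tf.keras.Model, qf: tf.keras.Model, tau: float = 0.01):
        # target <- tau * online + (1 - tau) * target, variable by variable.
        for target_var, online_var in zip(target_qf.variables, qf.variables):
            target_var.assign(tau * online_var + (1.0 - tau) * target_var)

With tau = 1.0 this degenerates into the periodic hard copy that a fixed _target_update_period usually implies.
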
Example 4
    def _train(self, batch, weights=None):
        prior_loss = self._prior_train(batch)
        opponent_policy_loss = self._opponent_train(batch)
        critic_loss = self._critic_train(batch, weights)
        actor_loss = self._actor_train(batch, weights)

        self._train_step += 1
        if self._train_step % self._target_update_period == 0:
            self._update_target()

        losses = {
            'pg_loss': actor_loss.numpy(),
            'critic_loss': critic_loss.numpy(),
            'opponent_policy_loss': opponent_policy_loss.numpy(),
            'prior_loss': prior_loss.numpy() if prior_loss is not None else None
        }

        if self._train_step % 1 == 0:  # i.e. log every training step
            tabular.record('q loss', critic_loss.numpy().item())
            tabular.record('opponent_policy_loss',
                           opponent_policy_loss.numpy().item())
            tabular.record('actor_loss', actor_loss.numpy().item())
            tabular.record('bi', batch['annealing'].numpy())
            tabular.record('bj', 1.)
            if prior_loss is not None:
                tabular.record('prior loss', prior_loss.numpy())
            logger.log(tabular)

        if self._train_step % 100 == 0:
            # Periodically query the policies; the results are only consumed by
            # the diagnostics prints below, which are currently commented out.
            # print('training statistics')
            # print(self._opponent_policy.get_diagnostics(batch['observations']))
            # print(self._prior.get_diagnostics(batch['observations']))
            opponent_actions = self._opponent_policy.get_actions_np(
                batch['observations'])
            # print(self._policy.get_diagnostics([batch['observations'], opponent_actions]))
            actions = self._policy.get_actions_np(
                [batch['observations'], opponent_actions])
            # print(self._qf.get_diagnostics([batch['observations'], actions, opponent_actions]))

        return losses
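
_train() returns a dictionary of scalar losses for a single gradient step; a training loop typically collects these and reports per-epoch averages. Below is a minimal, hypothetical aggregation helper (the losses_per_step list and the helper's name are assumed, not taken from the original code):

    import numpy as np

    def average_losses(losses_per_step):
        # losses_per_step: list of dicts as returned by _train(), one per step.
        averaged = {}
        for name in losses_per_step[0]:
            values = [float(step[name]) for step in losses_per_step
                      if step[name] is not None]
            averaged[name] = float(np.mean(values)) if values else None
        return averaged
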
Example 5
    def sample(self, explore=False):
        self.step += 1
        if self._current_observation_n is None:
            self._current_observation_n = self.env.reset()
        action_n = []
        # print(self._current_observation_n)
        # print(self._current_observation_n.shape)
        if explore:
            action_n = self.env.action_spaces.sample()
        else:
            for agent, current_observation in zip(self.agents, self._current_observation_n):
                action = agent.act(current_observation.astype(np.float32))
                action_n.append(np.array(action))

        action_n = np.asarray(action_n)

        # Step the environment (here: the fortAttackGlobalenv).
        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        
        if self._global_reward:
            reward_n = np.array([np.sum(reward_n)] * self.agent_num)
            
        self._path_length += 1
        self._last_path_return = np.array(reward_n, dtype=np.float32)
        self._path_return += self._last_path_return

        self._total_samples += 1
        for i, agent in enumerate(self.agents):
            opponent_action = action_n[[j for j in range(len(action_n)) if j != i]].flatten()
            agent.replay_buffer.add_sample(
                observation=self._current_observation_n[i].astype(np.float32),
                action=action_n[i].astype(np.float32),
                reward=np.array(reward_n[i], dtype=np.float32),
                terminal=done_n[i],
                next_observation=next_observation_n[i].astype(np.float32),
                opponent_action=opponent_action.astype(np.float32)
            )


        if self.render_after is not None:
            if self._n_episodes % self.render_after == 0:
                # render(self.env,
                #        "/tmp/episode_%08d" % self._path_length,
                #        self._path_length,)
                # Trigger a render; the returned frame is not used further.
                self.env.render(mode="rgb_array")[0]
                # time.sleep(0.03)
        self._current_observation_n = next_observation_n

        if np.all(done_n) or self._path_length >= self._max_path_length:
            self._max_path_return = np.maximum(self._max_path_return, self._path_return)
            self._mean_path_return = self._path_return / self._path_length
            # self._last_path_return = self._path_return
            # print('last path return', self._path_return)
            #if self._n_episodes % 100 == 0:
            #    render(self.env,
            #           "/tmp/episode_%08d" % self._path_length,
            #           self._path_length,
            #           True)
            self._path_length = 0
            self._path_return = np.zeros(self.agent_num)
            self._n_episodes += 1
            self.log_diagnostics()  # one of these lines is printing to screen
            logger.log(tabular)
            logger.dump_all()
            
            self._current_observation_n = self.env.reset()
        else:
            self._current_observation_n = next_observation_n
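
When self._global_reward is set, the per-agent rewards returned by the environment are replaced by their sum, broadcast to every agent, so all agents optimize the same team signal. A small standalone illustration of that broadcast (the reward values and agent_num are made up):

    import numpy as np

    agent_num = 3
    reward_n = [1.0, -0.5, 2.0]                          # per-agent rewards from env.step()
    reward_n = np.array([np.sum(reward_n)] * agent_num)  # -> [2.5 2.5 2.5]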