Example #1
    def gen_action(self, agent_list, observation, free_map=None):
        """Action generation method.

        This is a required method that generates a list of actions
        corresponding to the list of units.

        Args:
            agent_list (list): list of all friendly units.
            observation (np.array): 2d partially observable map of the environment.
            free_map (np.array): 2d map of the static environment (optional).

        Returns:
            action_out (list): list of integers as actions selected for the team.

        Note:
            The graph is not updated in this call.
            It only returns actions for the given input.
        """
        if not self._reset_done:
            self.reset_network_weight()

        obs = one_hot_encoder(observation,
                              agent_list,
                              self.input_shape,
                              reverse=not self.is_blue)
        action_prob = self.sess.run(self.action,
                                    feed_dict={self.state: obs})  # action probabilities

        action_out = [
            np.random.choice(5, p=action_prob[x] / sum(action_prob[x]))
            for x in range(len(agent_list))
        ]

        return action_out
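The sampling line above draws one of the five discrete actions per agent from the network output, renormalizing the row in case the probabilities do not sum exactly to one. A self-contained sketch of that step with a dummy probability row (the 5-action space comes from the example; the numbers are made up):

import numpy as np

action_prob = np.array([[0.1, 0.3, 0.2, 0.15, 0.25]])  # one agent, 5 actions (dummy values)
row = action_prob[0]
action = np.random.choice(5, p=row / row.sum())         # renormalize, then sample an action index
print(action)                                           # e.g. 1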
Example #2
    def gen_action(self, agent_list, observation, free_map=None):
        """Action generation method.
        
        This is a required method that generates a list of actions
        corresponding to the list of units.

        Args:
            agent_list (list): list of all friendly units.
            observation (np.array): 2d partially observable map of the environment.
            free_map (np.array): 2d map of the static environment (optional).

        Returns:
            action_out (list): list of integers as actions selected for the team.

        Note:
            The graph is not updated in this call.
            It only returns actions for the given input.
        """

        obs = one_hot_encoder(observation, agent_list,
                              self.input_shape, reverse=not self.is_blue)
        action_prob = self.sess.run(self.action,
                                    feed_dict={self.state: obs})  # action probabilities

        # If the policy is deterministic, return the argmax action.
        # The behavior can be toggled with set_deterministic(bool).
        if self.deterministic:
            action_out = np.argmax(action_prob, axis=1).tolist()
        else:
            action_out = [
                np.random.choice(5, p=action_prob[x] / sum(action_prob[x]))
                for x in range(len(agent_list))
            ]

        return action_out
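The comment above refers to set_deterministic(bool) for switching between argmax and sampled actions. A minimal sketch of how such a setter might look on the same policy class (only the name appears in the example; the body is an assumption), followed by hypothetical usage:

    def set_deterministic(self, value):
        """Toggle between argmax (deterministic) and sampled actions."""
        self.deterministic = bool(value)

Hypothetical usage on a policy instance:

policy.set_deterministic(True)   # always take the argmax action
policy.set_deterministic(False)  # sample from the action probabilities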
Example #3
    def get_action(self, raw_observation, agent_list, process_ids):
        state = one_hot_encoder(raw_observation, agent_list, VISION_RANGE)
        state_wide = one_hot_encoder_v2(raw_observation, agent_list, 19)
        p = self.sub_policy
        choices = [p[0].get_action(state), p[1].get_action(state_wide)]

        # Pick each agent's action from the sub-policy assigned by its process id.
        action_out = [choices[pid][aid] for aid, pid in enumerate(process_ids)]
        return action_out
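The selection at the end reads, for each agent, the action proposed by the sub-policy that process_ids assigns to it. A self-contained illustration with made-up values:

choices = [[0, 1, 2, 3],     # actions proposed by sub-policy 0 for agents 0..3
           [4, 4, 4, 4]]     # actions proposed by sub-policy 1 for agents 0..3
process_ids = [0, 1, 1, 0]   # sub-policy id chosen for each agent

action_out = [choices[pid][aid] for aid, pid in enumerate(process_ids)]
print(action_out)            # [0, 4, 4, 3]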
Example #4
    def gen_action(self, agent_list, observation, free_map=None):
        state = one_hot_encoder(observation, agent_list, self.vision_range, reverse=not self.is_blue)
        state_wide = one_hot_encoder_v2(observation, agent_list, 19, reverse=not self.is_blue)
        p = self.policy
        choices = [p[0].get_action(state), p[1].get_action(state_wide)]
        # choices = [p.get_action(state) for p in self.policy]

        # Assign each contiguous block of agents to its fixed policy.
        action_out = []
        si, ei = 0, 0
        for pid, n in enumerate(self.fix_policy):
            ei += n
            action_out.extend(choices[pid][si:ei])
            si = ei

        return action_out
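Here fix_policy holds how many agents each policy controls, and the running si/ei indices slice that contiguous block out of the matching choices list. A self-contained illustration with made-up values:

choices = [[10, 11, 12, 13],  # actions from policy 0 for agents 0..3
           [20, 21, 22, 23]]  # actions from policy 1 for agents 0..3
fix_policy = [3, 1]           # first three agents follow policy 0, the last follows policy 1

action_out = []
si, ei = 0, 0
for pid, n in enumerate(fix_policy):
    ei += n
    action_out.extend(choices[pid][si:ei])
    si = ei
print(action_out)             # [10, 11, 12, 23]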
Example #5
    def work(self, saver, writer):
        global global_rewards, global_ep_rewards, global_episodes, global_length, global_succeed
        total_step = 1
        local_ep = 0
        buffer = Experience_buffer(experience_shape=6,
                                   buffer_size=replay_capacity)
        epsilon = 1.0
        epsilon_gamma = 0.9999
        epsilon_final = 0.1
        with self.sess.as_default(), self.sess.graph.as_default():
            while global_episodes < total_episodes:
                local_ep += 1
                raw_obs = self.env.reset()
                if partial_visible:
                    s1 = one_hot_encoder(raw_obs, self.env.get_team_blue,
                                         VISION_RANGE)
                else:
                    s1 = one_hot_encoder(self.env._env, self.env.get_team_blue,
                                         VISION_RANGE)

                # parameters
                ep_r = 0
                prev_r = 0
                is_alive = [True] * num_blue

                episode_buffer = []

                for step in range(max_ep + 1):
                    # Set sub-policy
                    if step % 15 == 0:
                        pids = self.Network.run_network(
                            np.expand_dims(s1, axis=0))[0]

                    if random.random() < epsilon:
                        # Random Exploration
                        a = random.choices(range(action_size), k=num_blue)
                        epsilon = max(epsilon_final, epsilon * epsilon_gamma)
                    else:
                        a = self.get_action(raw_obs, self.env.get_team_blue,
                                            pids)

                    s0 = s1
                    raw_obs, rc, d, info = self.env.step(a)
                    if partial_visible:
                        s1 = one_hot_encoder(raw_obs, self.env.get_team_blue,
                                             VISION_RANGE)
                    else:
                        s1 = one_hot_encoder(self.env._env,
                                             self.env.get_team_blue,
                                             VISION_RANGE)
                    is_alive = info['blue_alive'][-1]

                    r = (rc - prev_r - 0.01)
                    if step == max_ep and not d:
                        r = -100
                        rc = -100
                        d = True

                    r /= 100.0
                    ep_r += r

                    # Push transitions to the buffer for agents that were alive
                    # at the previous step.
                    if step > 0:
                        was_alive = info['blue_alive'][-2]
                    else:
                        was_alive = [True] * num_blue
                    for idx in range(num_blue):
                        if was_alive[idx]:
                            episode_buffer.append(
                                [s0, a, r, s1, d, is_alive * 1])

                    # Iteration
                    prev_r = rc
                    total_step += 1

                    if d:
                        buffer.add(episode_buffer)
                        if local_ep % update_frequency == 0 and local_ep > 0:
                            batch = buffer.pop(size=batch_size, shuffle=True)
                            aloss = self.train(batch)
                            # buffer.flush()
                        break

                global_ep_rewards.append(ep_r)
                global_rewards.append(rc)
                global_length.append(step)
                global_succeed.append(self.env.blue_win)
                global_episodes += 1
                self.sess.run(global_step_next)
                progbar.update(global_episodes)
                if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                    summary = tf.Summary()
                    summary.value.add(tag='Records/mean_reward',
                                      simple_value=global_rewards())
                    summary.value.add(tag='Records/mean_length',
                                      simple_value=global_length())
                    summary.value.add(tag='Records/mean_succeed',
                                      simple_value=global_succeed())
                    summary.value.add(tag='Records/mean_episode_reward',
                                      simple_value=global_ep_rewards())
                    summary.value.add(tag='summary/loss', simple_value=aloss)
                    writer.add_summary(summary, global_episodes)
                    writer.flush()
                if global_episodes % save_network_frequency == 0:
                    saver.save(self.sess,
                               MODEL_PATH + '/ctf_policy.ckpt',
                               global_step=global_episodes)
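The summary block above calls global_rewards(), global_length(), global_succeed(), and global_ep_rewards() as functions while also appending to them, which suggests they are moving-average trackers rather than plain lists. A minimal sketch of such a tracker, assuming only the append/call interface implied by the loop (the class name and default window are illustrative):

from collections import deque

class MovingAverage:
    """Keep the last `size` values and return their mean when called."""

    def __init__(self, size=100):
        self.buffer = deque(maxlen=size)

    def append(self, value):
        self.buffer.append(value)

    def __call__(self):
        return sum(self.buffer) / len(self.buffer) if self.buffer else 0.0

# Hypothetical setup matching the worker loop:
# global_rewards = MovingAverage()
# global_rewards.append(rc)
# summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())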