def get_whattodo_view(self):
        """
        Give the view the dict that tell the possible action on this turn and the task
        that the view should do
        :return: dict
        """
        params_view_action = {}

        self.state.print_board()
        if AIElements.is_over(self.state):
            params_view_action['task'] = 'END_GAME'
            return params_view_action
        if self.two_players:
            params_view_action['task'] = 'CHANGE_PLAYER'
            params_view_action['state'] = AIElements.get_state_dict(self.state)
            possible_action = AIElements.get_possible_action(self.state)
            params_view_action['possible_action'] = possible_action
            self.possible_action_keys = possible_action.keys()
        if self.player_vs_ai_white:
            self.possible_action_keys = AIElements.get_possible_action(
                self.state).keys()
            params_view_action['task'] = 'AI_MOVE'
            ai_key_action, ai_action_params = self.ai_agent.choose_action(
                self.state)
            previous_state = deepcopy(self.state)
            self.receive_input_action_play(ai_key_action, ai_action_params)
            if AIElements.is_over(self.state):
                params_view_action['end'] = True
                params_view_action['task'] = 'END_GAME'
                return params_view_action
            print("Reward Function is %.2f" % (AIElements.reward_function(
                self.old_state_reward, self.state, 1)))  #Black
            self.old_state_reward = deepcopy(self.state)
            state_dict = AIElements.get_state_dict(self.state)
            previous_state_dict = AIElements.get_state_dict(previous_state)
            possible_action = AIElements.get_possible_action(self.state)
            previous_mana = AIElements.get_players_mana(previous_state)

            params_view_action['state'] = state_dict
            params_view_action["prev_state"] = previous_state_dict
            params_view_action["ai_action"] = ai_action_params
            params_view_action["prev_mana"] = previous_mana
            params_view_action["possible_action"] = possible_action
            self.possible_action_keys = possible_action.keys()
        return params_view_action
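
# A minimal sketch (not part of the original code) of how a caller might dispatch on the
# dict returned by get_whattodo_view. The handle_turn function and the render_* methods on
# the view are hypothetical; only the 'task' values ('END_GAME', 'CHANGE_PLAYER', 'AI_MOVE')
# and the dict keys come from the method above.
def handle_turn(controller, view):
    # Ask the controller which task the view must perform this turn.
    params = controller.get_whattodo_view()
    task = params.get('task')
    if task == 'END_GAME':
        view.render_game_over()
    elif task == 'CHANGE_PLAYER':
        # Two-player mode: draw the new board and the legal moves of the next player.
        view.render_board(params['state'])
        view.render_possible_actions(params['possible_action'])
    elif task == 'AI_MOVE':
        # Player-vs-AI mode: animate the AI move from the previous state to the new one.
        view.render_ai_move(params['prev_state'], params['state'], params['ai_action'])
        view.render_possible_actions(params['possible_action'])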
Example #2
    def expand_node(self, model_deep_net, player_color, label_encoder,
                    epsilon=AlphaZeroConfig.MCTS_EPSILON,
                    alpha_diri=AlphaZeroConfig.MCTS_ALPHA_DIRICHLET,
                    cpuct=AlphaZeroConfig.MCTS_PUCT,
                    greed_attack=False):
        """
        This function contains 2 steps on the MCTS.
        Select and Expand & Evaluate
        :param model_deep_net: The neural network model
        :param player_color: why did I include this???
        :param label_encoder: The encoder used to encode the action key
        :param epsilon: hyperparameter for using the dirichlet random proba
        :param alpha_diri: hyperparameter of dirichlet
        :param cpuct: hyperparameter of the MCTS in alpha zero
        :param greed_attack: HACK! the agent will prioritize attacking and promoting
        :return:
        """
        terminal = AIElements.is_over(self.stacked_state.head)
        self.is_terminal = terminal
        if not terminal:
            possible_action = AIElements.get_possible_action(self.stacked_state.head)
            possible_action_keys = list(possible_action.keys())

            if self.p_state is None:
                """
                    Expand and Evaluate goes here!
                """
                if self.stacked_state.head.get_player_turn() == self.maximizer:
                    state_stack_representation = np.array([self.stacked_state.get_deep_representation_stack()])
                else:
                    state_stack_representation = mirror_stacked_state(self.stacked_state)
                    state_stack_representation = np.array([state_stack_representation.get_deep_representation_stack()])

                self.p_state, self.v = model_deep_net.predict(state_stack_representation)
                self.v_ = self.v[0][0]  # unwrap the batch dimension (raw value kept in v_)
                self.v = self.v[0][0]   # scalar value estimate of this state
                self.p_state = self.p_state[0]  # policy vector over every encoded action

                if self.stacked_state.head.get_player_turn() != self.maximizer:
                    self.p_state = label_encoder.array_mirrored(self.p_state)
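                # Mask the prior so probability mass stays only on legal actions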
                possible_action_ohe = label_encoder.transform(possible_action_keys).sum(axis=0)
                self.p_state *= possible_action_ohe
                sum_policy_state = np.sum(self.p_state)
                if sum_policy_state > 0:
                    ## normalize to sum 1
                    self.p_state /= sum_policy_state
                else:
                    print("All valid moves were masked, do workaround.")
                    self.p_state += possible_action_ohe
                    self.p_state /= np.sum(self.p_state)

                # Initialize num and q
                for action in possible_action_keys:
                    self.num_state_action[action] = 0
                    self.q_state_action[action] = 0
                    next_state = AIElements.result_function(self.stacked_state.head, possible_action[action])
                    new_stacked_state = deepcopy(self.stacked_state)
                    new_stacked_state.append(next_state)
                    if action not in self.edge_action:
                        self.edge_action[action] = NodeMCTS(new_stacked_state, parent=self, root=False)

            else:
                """
                    Select goes here
                """
                best_action = ""
                best_upper_confidence = -float('inf')

                dirichlet_prob = np.random.dirichlet([alpha_diri] * len(possible_action_keys))
                counter_loop = 0

                # Randomize possible_action_keys
                random.shuffle(possible_action_keys)
                for action in possible_action_keys:
                    # Get the index of the action
                    index_action = label_encoder.le.transform([action])[0]
                    q_state_action_val = 0
                    num_state_action_val = 0
                    if action in self.q_state_action and action in self.num_state_action:
                        q_state_action_val = self.q_state_action[action]
                        num_state_action_val = self.num_state_action[action]
                    if self.root:
                        # At the root, mix Dirichlet noise into the prior (AlphaZero exploration)
                        prior_prob = (1 - epsilon) * self.p_state[index_action] + epsilon * dirichlet_prob[counter_loop]
                    else:
                        prior_prob = self.p_state[index_action]
                    upper_confidence = q_state_action_val + \
                                       cpuct * prior_prob * \
                                       np.sqrt(self.num_state) / (1 + num_state_action_val)
                    if greed_attack and possible_action[action]['action'] == 'attack':
                        upper_confidence += AlphaZeroConfig.Q_ATTACK_GREEDY # Higher Chance to Attack
                    if greed_attack and possible_action[action]['action'] == 'promote':
                        upper_confidence += AlphaZeroConfig.Q_PROMOTE_GREEDY # Higher Chance to promote
                    counter_loop += 1
                    if best_upper_confidence < upper_confidence:
                        best_upper_confidence = upper_confidence
                        best_action = action
                        # Expand: create the child node for this action if it does not exist yet
                        next_state = AIElements.result_function(self.stacked_state.head, possible_action[action])
                        new_stacked_state = deepcopy(self.stacked_state)
                        new_stacked_state.append(next_state)
                        if action not in self.edge_action:
                            self.edge_action[action] = NodeMCTS(new_stacked_state, parent=self, root=False)

                self.selected_action = best_action

        else:
            # Terminal state: use the game's sparse evaluation from the current player's perspective
            self.v = self.stacked_state.head.sparse_eval(self.stacked_state.head.get_player_turn())
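
# For reference, a minimal standalone sketch (not part of the original code) of the PUCT
# score computed in the Select step above. puct_score is a hypothetical helper that only
# restates the formula used in expand_node:
#   Q(s,a) + cpuct * P(s,a) * sqrt(N(s)) / (1 + N(s,a)),
# with Dirichlet noise mixed into the prior at the root.
import numpy as np

def puct_score(q_sa, p_sa, n_s, n_sa, cpuct, epsilon=0.0, noise=0.0):
    # Prior of the edge; at the root it is blended with Dirichlet noise.
    prior = (1 - epsilon) * p_sa + epsilon * noise
    # Exploitation term (Q) plus an exploration term that grows with the parent
    # visit count and shrinks as the edge itself gets visited.
    return q_sa + cpuct * prior * np.sqrt(n_s) / (1 + n_sa)

# Example: an unvisited edge (N(s,a) = 0) under a parent with 16 visits and prior 0.25:
# puct_score(0.0, 0.25, 16, 0, cpuct=1.5) -> 1.5 * 0.25 * 4 / 1 = 1.5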