def get_whattodo_view(self):
    """
    Give the view a dict that tells the possible actions on this turn
    and the task that the view should do.
    :return: dict
    """
    params_view_action = {}
    self.state.print_board()

    if AIElements.is_over(self.state):
        params_view_action['task'] = 'END_GAME'
        return params_view_action

    if self.two_players:
        # Two-player mode: hand the turn to the other human player.
        params_view_action['task'] = 'CHANGE_PLAYER'
        params_view_action['state'] = AIElements.get_state_dict(self.state)
        possible_action = AIElements.get_possible_action(self.state)
        params_view_action['possible_action'] = possible_action
        self.possible_action_keys = possible_action.keys()

    if self.player_vs_ai_white:
        # Player-vs-AI mode: let the AI agent choose and play its move.
        self.possible_action_keys = AIElements.get_possible_action(self.state).keys()
        params_view_action['task'] = 'AI_MOVE'
        ai_key_action, ai_action_params = self.ai_agent.choose_action(self.state)
        previous_state = deepcopy(self.state)
        self.receive_input_action_play(ai_key_action, ai_action_params)

        if AIElements.is_over(self.state):
            params_view_action['end'] = True
            params_view_action['task'] = 'END_GAME'
            return params_view_action

        print("Reward Function is %.2f" %
              (AIElements.reward_function(self.old_state_reward, self.state, 1)))  # Black
        self.old_state_reward = deepcopy(self.state)

        state_dict = AIElements.get_state_dict(self.state)
        previous_state_dict = AIElements.get_state_dict(previous_state)
        possible_action = AIElements.get_possible_action(self.state)
        previous_mana = AIElements.get_players_mana(previous_state)

        params_view_action['state'] = state_dict
        params_view_action["prev_state"] = previous_state_dict
        params_view_action["ai_action"] = ai_action_params
        params_view_action["prev_mana"] = previous_mana
        params_view_action["possible_action"] = possible_action
        self.possible_action_keys = possible_action.keys()

    return params_view_action
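# A minimal sketch (not part of the original module) of how a view layer might
# consume the dict returned by get_whattodo_view(), dispatching on its 'task'
# key. The handler names (render_end_game, render_change_player, render_ai_move)
# are hypothetical placeholders, not part of this repository's API.
def dispatch_view_action(params_view_action,
                         render_end_game=print,
                         render_change_player=print,
                         render_ai_move=print):
    """Route the controller's dict to the matching view handler."""
    task = params_view_action.get('task')
    if task == 'END_GAME':
        render_end_game(params_view_action)
    elif task == 'CHANGE_PLAYER':
        # Two-player mode: show the next player's possible actions.
        render_change_player(params_view_action)
    elif task == 'AI_MOVE':
        # Player-vs-AI mode: animate the AI move from prev_state to state,
        # then show the human player's possible actions.
        render_ai_move(params_view_action)
    return task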
def expand_node(self, model_deep_net, player_color, label_encoder,
                epsilon=AlphaZeroConfig.MCTS_EPSILON,
                alpha_diri=AlphaZeroConfig.MCTS_ALPHA_DIRICHLET,
                cpuct=AlphaZeroConfig.MCTS_PUCT,
                greed_attack=False):
    """
    This function contains 2 steps of the MCTS: Select, and Expand & Evaluate.

    :param model_deep_net: the neural network model
    :param player_color: unused in this method
    :param label_encoder: the encoder used to encode the action key
    :param epsilon: hyperparameter for mixing in the Dirichlet noise
    :param alpha_diri: hyperparameter of the Dirichlet distribution
    :param cpuct: exploration hyperparameter of AlphaZero's MCTS (PUCT)
    :param greed_attack: HACK! the agent will prioritize attacking and promoting
    :return:
    """
    terminal = AIElements.is_over(self.stacked_state.head)
    self.is_terminal = terminal
    if not terminal:
        possible_action = AIElements.get_possible_action(self.stacked_state.head)
        possible_action_keys = list(possible_action.keys())
        if self.p_state is None:
            # Expand and Evaluate goes here!
            if self.stacked_state.head.get_player_turn() == self.maximizer:
                state_stack_representation = np.array(
                    [self.stacked_state.get_deep_representation_stack()])
            else:
                # Mirror the board so the network always sees the maximizer's view.
                mirrored = mirror_stacked_state(self.stacked_state)
                state_stack_representation = np.array(
                    [mirrored.get_deep_representation_stack()])
            self.p_state, self.v = model_deep_net.predict(state_stack_representation)
            self.v_ = self.v[0][0]
            self.v = self.v[0][0]
            self.p_state = self.p_state[0]
            if self.stacked_state.head.get_player_turn() != self.maximizer:
                # Mirror the policy back to the original orientation.
                self.p_state = label_encoder.array_mirrored(self.p_state)

            # Mask out illegal moves, then renormalize the policy.
            possible_action_ohe = label_encoder.transform(possible_action_keys).sum(axis=0)
            self.p_state *= possible_action_ohe
            sum_policy_state = np.sum(self.p_state)
            if sum_policy_state > 0:
                self.p_state /= sum_policy_state  # normalize to sum 1
            else:
                # Every legal move got zero probability; fall back to uniform.
                print("All valid moves were masked, doing workaround.")
                self.p_state += possible_action_ohe
                self.p_state /= np.sum(self.p_state)

            # Initialize N(s,a) and Q(s,a), and create the child nodes.
            for action in possible_action_keys:
                self.num_state_action[action] = 0
                self.q_state_action[action] = 0
                next_state = AIElements.result_function(self.stacked_state.head,
                                                        possible_action[action])
                new_stacked_state = deepcopy(self.stacked_state)
                new_stacked_state.append(next_state)
                if action not in self.edge_action:
                    self.edge_action[action] = NodeMCTS(new_stacked_state,
                                                        parent=self, root=False)
        else:
            # Select goes here
            best_action = ""
            best_upper_confidence = -float('inf')
            dirichlet_prob = np.random.dirichlet([alpha_diri] * len(possible_action_keys))
            counter_loop = 0
            # Randomize the iteration order so ties are broken randomly.
            random.shuffle(possible_action_keys)
            for action in possible_action_keys:
                # Get the index of the action in the policy vector.
                index_action = label_encoder.le.transform([action])[0]
                q_state_action_val = 0
                num_state_action_val = 0
                if action in self.q_state_action and action in self.num_state_action:
                    q_state_action_val = self.q_state_action[action]
                    num_state_action_val = self.num_state_action[action]
                if self.root:
                    # At the root, mix the prior with Dirichlet noise (PUCT).
                    upper_confidence = q_state_action_val + \
                        cpuct * ((1 - epsilon) * self.p_state[index_action] +
                                 epsilon * dirichlet_prob[counter_loop]) * \
                        np.sqrt(self.num_state) / (1 + num_state_action_val)
                else:
                    upper_confidence = q_state_action_val + \
                        cpuct * self.p_state[index_action] * \
                        np.sqrt(self.num_state) / (1 + num_state_action_val)
                if greed_attack and possible_action[action]['action'] == 'attack':
                    upper_confidence += AlphaZeroConfig.Q_ATTACK_GREEDY  # higher chance to attack
                if greed_attack and possible_action[action]['action'] == 'promote':
                    upper_confidence += AlphaZeroConfig.Q_PROMOTE_GREEDY  # higher chance to promote
                counter_loop += 1
                if best_upper_confidence < upper_confidence:
                    best_upper_confidence = upper_confidence
                    best_action = action

            # Expand the selected node if its child does not exist yet.
            next_state = AIElements.result_function(self.stacked_state.head,
                                                    possible_action[best_action])
            new_stacked_state = deepcopy(self.stacked_state)
            new_stacked_state.append(next_state)
            if best_action not in self.edge_action:
                self.edge_action[best_action] = NodeMCTS(new_stacked_state,
                                                         parent=self, root=False)
            self.selected_action = best_action
    else:
        # Terminal node: use the game's own evaluation as the value.
        self.v = self.stacked_state.head.sparse_eval(
            self.stacked_state.head.get_player_turn())
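# A self-contained sketch of the PUCT score computed in the select step above,
# isolated for clarity. This helper is not part of the original class; the
# default hyperparameter values below are illustrative assumptions, not the
# actual values stored in AlphaZeroConfig.
import math

def puct_score(q, prior, parent_visits, action_visits,
               cpuct=4.0, noise=None, epsilon=0.25):
    """Compute U(s,a) = Q(s,a) + cpuct * P'(s,a) * sqrt(N(s)) / (1 + N(s,a)).

    At the root node, P'(s,a) = (1 - epsilon) * P(s,a) + epsilon * (Dirichlet
    noise for this action), exactly as in expand_node; elsewhere P'(s,a) = P(s,a).
    """
    p = prior if noise is None else (1 - epsilon) * prior + epsilon * noise
    return q + cpuct * p * math.sqrt(parent_visits) / (1 + action_visits)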