def play_with_ai_white(self, ai_agent=ControllerConfig.AI_AGENT):
    """
    Returns
    -------
    dict
        Dict of possible action and state
    """
    self.state = State()
    self.state.initial_state()
    self.player_vs_ai_white = True
    state_dict = AIElements.get_state_dict(self.state)
    possible_action = AIElements.get_possible_action(self.state)
    self.possible_action_keys = possible_action.keys()
    if ai_agent == 'random':
        self.ai_agent = RandomAgent()
    elif ai_agent == 'minimaxab':
        self.ai_agent = MinimaxABAgent(player_color=0)
    elif ai_agent == 'azero':
        self.ai_agent = AlphaZeroAgent()
    self.old_state_reward = deepcopy(self.state)
    return {
        "state": state_dict,
        "possible_action": possible_action,
        "task": "CHANGE_PLAYER"
    }
def _minimax(self, current_depth, state, is_max_turn):
    """
    Minimax helper

    :param current_depth: the current tree depth in the recursion
    :param state: the current state in the tree node
    :param is_max_turn: bool, is the current node the maximizer's turn?
    :return: float, str
        The value of the best action and the key of that action
    """
    if current_depth == self.max_depth or state.is_terminal():
        return AIElements.evaluation_function(state, self.player_color), ""

    self.node_expanded += 1

    possible_action = AIElements.get_possible_action(state)
    key_of_actions = list(possible_action.keys())
    shuffle(key_of_actions)  # add randomness to break ties

    best_value = float('-inf') if is_max_turn else float('inf')
    action_target = ""
    for action_key in key_of_actions:
        new_state = AIElements.result_function(state, possible_action[action_key])
        eval_child, action_child = self._minimax(current_depth + 1, new_state, not is_max_turn)
        if is_max_turn and best_value < eval_child:
            best_value = eval_child
            action_target = action_key
        elif (not is_max_turn) and best_value > eval_child:
            best_value = eval_child
            action_target = action_key
    return best_value, action_target
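# A minimal standalone sketch of the same recursion on a toy game tree,
# independent of AIElements/State (the tree below is made up for illustration).
# Leaves hold utilities for the maximizer; internal nodes are dicts of
# action -> child subtree.
def toy_minimax(node, is_max_turn):
    # Terminal case: a leaf is just a number (the utility).
    if not isinstance(node, dict):
        return node, ""
    best_value = float('-inf') if is_max_turn else float('inf')
    best_action = ""
    for action, child in node.items():
        child_value, _ = toy_minimax(child, not is_max_turn)
        if (is_max_turn and child_value > best_value) or \
           (not is_max_turn and child_value < best_value):
            best_value, best_action = child_value, action
    return best_value, best_action

# The maximizer picks 'left': min(3, 5) = 3 beats min(2, 9) = 2.
assert toy_minimax({'left': {'a': 3, 'b': 5}, 'right': {'c': 2, 'd': 9}}, True) == (3, 'left')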
def choose_action(self, state):
    """
    Predict the move using the AlphaZero algorithm

    Parameters
    ----------
    state : State
        Unused. This parameter only exists to match the interface of the other agents.

    Returns
    -------
    str, dict:
        The selected action key and its action parameters
    """
    from ai_modules.ai_elements import AIElements
    import numpy as np
    self.mcts.self_play()
    action_proba = np.array(self.mcts.get_action_proba(temperature=0))
    action = np.random.choice(len(action_proba), p=action_proba)
    action_key = self.ae.inverse_transform([action])[0]
    possible_action = AIElements.get_possible_action(self.stacked_state.head)
    new_state = AIElements.result_function(self.stacked_state.head,
                                           possible_action[action_key])
    self.stacked_state.append(new_state)
    self.mcts.update_root(action_key)
    return (action_key, possible_action[action_key])
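# A minimal sketch of how visit counts are typically turned into action
# probabilities with a temperature, assuming get_action_proba follows the
# standard AlphaZero convention (pi(a) proportional to N(a)^(1/tau), and
# tau == 0 meaning "pick the most visited action"). The actual MCTS class
# may implement this differently.
import numpy as np

def visit_counts_to_proba(counts, temperature):
    counts = np.asarray(counts, dtype=np.float64)
    if temperature == 0:
        proba = np.zeros_like(counts)
        proba[np.argmax(counts)] = 1.0  # deterministic: most visited action
        return proba
    scaled = counts ** (1.0 / temperature)
    return scaled / scaled.sum()

print(visit_counts_to_proba([10, 30, 60], temperature=1))  # [0.1 0.3 0.6]
print(visit_counts_to_proba([10, 30, 60], temperature=0))  # [0. 0. 1.]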
def fight_agent(best_model: str,
                current_model: str,
                ae,
                round_fight=AlphaZeroConfig.ROUND_ARENA,
                max_turn=AlphaZeroConfig.MAX_TURN_ARENA,
                max_simulation=AlphaZeroConfig.MAX_SIMULATION_ARENA):
    """
    Pit 2 agents against each other. We will check which one is the best here.

    :param best_model: the file path of the current best model
    :param current_model: the file path of the newly trained model
    :param ae: the Action Encoder
    :param round_fight: number of rounds used to determine the winner
    :param max_turn: the maximum number of turns in a game. If the current turn
        exceeds it, the game is cut off and counted as a draw.
    :param max_simulation: the maximum number of MCTS simulations
    :return: dict, the dictionary of the score
    """
    from ai_modules.reinforcement_algorithm import AlphaZeroAgent
    loss_win = {0: 0, 1: 0}
    for round in range(round_fight):
        print("ROUND {}".format(round + 1))
        terminal = False
        count_turn = 1
        state = State()
        state.initial_state()
        best_model_agent = AlphaZeroAgent(state, max_simulation, best_model)  # plays as player 1
        current_model_agent = None  # plays as player 0
        while not terminal and count_turn <= max_turn:
            print("=======TURN {} ========".format(count_turn))
            state.print_board()
            current_player_turn = state.get_player_turn()
            if current_player_turn == 1:
                key, dict_key = best_model_agent.choose_action(state)
                state = AIElements.result_function(state, dict_key)
                if current_model_agent is not None:
                    current_model_agent.enemy_turn_action(key, state)
            else:
                if current_model_agent is None:
                    current_model_agent = AlphaZeroAgent(state, max_simulation, current_model)
                key, dict_key = current_model_agent.choose_action(state)
                state = AIElements.result_function(state, dict_key)
                best_model_agent.enemy_turn_action(key, state)
            print("Player %d choose action %s" % (current_player_turn, key))
            game_ended = state.is_terminal()
            if game_ended:
                print("Player {} Win".format(current_player_turn))
                loss_win[current_player_turn % 2] += 1
                terminal = True
            count_turn += 1
        if not terminal and count_turn > max_turn:
            print("ROUND {} DRAW".format(round + 1))
    return loss_win
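# A hedged usage sketch: how the returned score dict could drive model
# promotion, assuming index 1 counts wins of the best model and index 0 wins
# of the candidate (as set up in fight_agent above). The 0.55 win-rate
# threshold is an assumption borrowed from the usual AlphaZero arena setup,
# not a value defined anywhere in this code base.
def promote_if_better(best_model_path, current_model_path, ae, threshold=0.55):
    score = fight_agent(best_model_path, current_model_path, ae)
    total_decided = score[0] + score[1]
    if total_decided > 0 and score[0] / total_decided >= threshold:
        print("Candidate model won %.0f%% of decided games, promoting it."
              % (100.0 * score[0] / total_decided))
        return current_model_path
    return best_model_path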
def choose_action(self, state):
    """
    Predict the move using the minimax alpha-beta pruning algorithm

    Parameters
    ----------
    state : State

    Returns
    -------
    str, dict:
        The selected action key and its action parameters
    """
    self.node_expanded = 0
    start_time = time.time()
    print("MINIMAX AB : Wait, AI is choosing")
    list_action = AIElements.get_possible_action(state)
    eval_score, selected_key_action = self._minimax(0, state, True,
                                                    float('-inf'), float('inf'))
    print("MINIMAX : Done, eval = %.2f, expanded %d nodes" % (eval_score, self.node_expanded))
    print("--- %s seconds ---" % (time.time() - start_time))
    return (selected_key_action, list_action[selected_key_action])
def get_whattodo_view(self):
    """
    Give the view the dict that tells the possible actions on this turn
    and the task that the view should do.

    :return: dict
    """
    params_view_action = {}
    self.state.print_board()
    if AIElements.is_over(self.state):
        params_view_action['task'] = 'END_GAME'
        return params_view_action
    if self.two_players:
        params_view_action['task'] = 'CHANGE_PLAYER'
        params_view_action['state'] = AIElements.get_state_dict(self.state)
        possible_action = AIElements.get_possible_action(self.state)
        params_view_action['possible_action'] = possible_action
        self.possible_action_keys = possible_action.keys()
    if self.player_vs_ai_white:
        self.possible_action_keys = AIElements.get_possible_action(self.state).keys()
        params_view_action['task'] = 'AI_MOVE'
        ai_key_action, ai_action_params = self.ai_agent.choose_action(self.state)
        previous_state = deepcopy(self.state)
        self.receive_input_action_play(ai_key_action, ai_action_params)
        if AIElements.is_over(self.state):
            params_view_action['end'] = True
            params_view_action['task'] = 'END_GAME'
            return params_view_action
        print("Reward Function is %.2f" % (AIElements.reward_function(
            self.old_state_reward, self.state, 1)))  # Black
        self.old_state_reward = deepcopy(self.state)
        state_dict = AIElements.get_state_dict(self.state)
        previous_state_dict = AIElements.get_state_dict(previous_state)
        possible_action = AIElements.get_possible_action(self.state)
        previous_mana = AIElements.get_players_mana(previous_state)
        params_view_action['state'] = state_dict
        params_view_action["prev_state"] = previous_state_dict
        params_view_action["ai_action"] = ai_action_params
        params_view_action["prev_mana"] = previous_mana
        params_view_action["possible_action"] = possible_action
        self.possible_action_keys = possible_action.keys()
    return params_view_action
def _minimax(self, current_depth, state, is_max_turn, alpha, beta):
    """
    Helper function of minimax with alpha-beta pruning

    :param current_depth: the current tree depth in the recursion
    :param state: the state of the current node in the recursion
    :param is_max_turn: whether the current node is the maximizer's turn
    :param beta: alpha-beta pruning parameter, the current best value for the minimizer
    :param alpha: alpha-beta pruning parameter, the current best value for the maximizer
    :return: float, str
        The value of the best action and the key of that action
    """
    if current_depth == self.max_depth or state.is_terminal():
        return AIElements.evaluation_function(state, self.player_color), ""

    self.node_expanded += 1

    possible_action = AIElements.get_possible_action(state)
    key_of_actions = list(possible_action.keys())
    shuffle(key_of_actions)  # add randomness to break ties

    best_value = float('-inf') if is_max_turn else float('inf')
    action_target = ""
    for action_key in key_of_actions:
        new_state = AIElements.result_function(state, possible_action[action_key])
        eval_child, action_child = self._minimax(current_depth + 1, new_state,
                                                 not is_max_turn, alpha, beta)
        if is_max_turn and best_value < eval_child:
            best_value = eval_child
            action_target = action_key
            alpha = max(alpha, best_value)
            if beta <= alpha:
                break
        elif (not is_max_turn) and best_value > eval_child:
            best_value = eval_child
            action_target = action_key
            beta = min(beta, best_value)
            if beta <= alpha:
                break
    return best_value, action_target
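# A standalone toy comparison (made-up tree, not the game's State) showing why
# the alpha/beta bounds above can cut work: once beta <= alpha, the remaining
# siblings cannot change the decision at the parent, so they are skipped.
def toy_alphabeta(node, is_max_turn, alpha=float('-inf'), beta=float('inf')):
    if not isinstance(node, dict):           # leaf: utility for the maximizer
        return node
    best = float('-inf') if is_max_turn else float('inf')
    for child in node.values():
        value = toy_alphabeta(child, not is_max_turn, alpha, beta)
        if is_max_turn:
            best = max(best, value)
            alpha = max(alpha, best)
        else:
            best = min(best, value)
            beta = min(beta, best)
        if beta <= alpha:                     # prune the remaining children
            break
    return best

# The right subtree is pruned after seeing 2: min(2, ...) can never beat 3.
tree = {'left': {'a': 3, 'b': 5}, 'right': {'c': 2, 'd': 9}}
assert toy_alphabeta(tree, True) == 3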
def play_with_two_players_start(self):
    """
    Return the initial state

    Returns
    -------
    dict
        Dict of possible action and state
    """
    self.state = State()
    self.state.initial_state()
    self.two_players = True
    state_dict = AIElements.get_state_dict(self.state)
    possible_action = AIElements.get_possible_action(self.state)
    self.possible_action_keys = possible_action.keys()
    return {
        "state": state_dict,
        "possible_action": possible_action,
        "task": "CHANGE_PLAYER"
    }
def receive_input_action_play(self, input_key, input_params):
    """
    Process the input from the user in the view

    :param input_key: str, the key of the action
    :param input_params: dict, the parameters of the action
    :return: bool, whether the action is present in the possible actions
    """
    if input_key in self.possible_action_keys:
        self.state = AIElements.result_function(self.state, input_params)
        # Useful for the AlphaZero agent only
        self.ai_agent.enemy_turn_action(input_key, input_params)
        index_player = (AIElements.get_player(self.state) + 1) % 2
        print("TURN %d" % (self.state.turn))
        print("The evaluation of Player %d is %.2f" % (index_player,
              AIElements.evaluation_function(self.state, index_player)))
        return True
    else:
        return False
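# A hedged sketch of how a view/driver loop might tie the controller methods
# above together. GameController is an assumed name for the class these
# methods belong to, and read_action_from_ui() is a hypothetical helper that
# returns the (action_key, action_params) pair chosen by the human player.
def run_console_game():
    controller = GameController()                       # assumed class name
    view_params = controller.play_with_ai_white('minimaxab')
    while view_params.get('task') != 'END_GAME':
        action_key, action_params = read_action_from_ui(view_params['possible_action'])
        if not controller.receive_input_action_play(action_key, action_params):
            print("Invalid action, try again")
            continue
        view_params = controller.get_whattodo_view()    # AI reply / new view state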
def choose_action(self, state):
    """
    Predict the move by picking a possible action uniformly at random.

    Parameters
    ----------
    state : State

    Returns
    -------
    str, dict:
        The selected action key and its action parameters
    """
    import random
    list_action = AIElements.get_possible_action(state)
    key_list_action = list(list_action.keys())
    rand_int = random.randint(0, len(key_list_action) - 1)
    selected_key_action = key_list_action[rand_int]
    return (selected_key_action, list_action[selected_key_action])
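# Design note: the same uniform selection can be written more compactly with
# random.choice, which avoids the manual index bookkeeping. A minimal sketch
# of the equivalent logic outside the agent class:
import random

def choose_random_action(state):
    list_action = AIElements.get_possible_action(state)
    selected_key_action = random.choice(list(list_action.keys()))
    return (selected_key_action, list_action[selected_key_action])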
def get_deep_representation_stack(self):
    """
    The representation of the state stack used as input to the deep learning model.
    For more details, see my Medium post (Part 3).

    :return: np.ndarray, the state representation
    """
    input_network = np.zeros((9, 9, self.planes_total))
    counter_iter = self.max_len - len(self.deque_collection)
    for state in reversed(self.deque_collection):
        planes_index = counter_iter * self.max_features
        counter_iter += 1
        all_pawn_list = state.white_pawn_list + state.black_pawn_list + \
            [state.white_king, state.black_king]
        possible_action = AIElements.get_possible_action(state)
        all_rune_list = state.rune_list

        input_network[0, 4, planes_index + 22] = state.turn % 5
        input_network[4, 4, planes_index + 23] = state.turn % 5
        input_network[8, 4, planes_index + 24] = state.turn % 5

        for rune in all_rune_list:
            input_network[rune.x, rune.y, planes_index + 25] = 1

        for i in possible_action:
            if i != 'skip':
                possible_action_dict = possible_action[i]
                # player_index = possible_action_dict['player_index']
                x = possible_action_dict['pawn_x']
                y = possible_action_dict['pawn_y']
                if 'player' in possible_action_dict:
                    index_color = possible_action_dict['player']
                    input_network[x, y, planes_index + 26 + index_color] += 1
                if 'player_index' in possible_action_dict:
                    index_color = possible_action_dict['player_index']
                    input_network[x, y, planes_index + 26 + index_color] += 1

        for pawn in all_pawn_list:
            x, y = pawn.x, pawn.y
            player_mana = pawn.player.mana
            if not pawn.dead:
                index_color = 0 if pawn.player.color == 0 else 1
                pawn_hp = pawn.hp
                input_network[x, y, planes_index + index_color + 0] = pawn_hp
                pawn_atk = pawn.atk
                input_network[x, y, planes_index + index_color + 2] = pawn_atk
                pawn_step = pawn.step
                input_network[x, y, planes_index + index_color + 4] = pawn_step
                input_network[x, y, planes_index + index_color + 6] = 1  # pawn location plane
                if isinstance(pawn, KnightPawn):
                    input_network[x, y, planes_index + index_color + 8] = 1
                elif isinstance(pawn, RookPawn):
                    input_network[x, y, planes_index + index_color + 10] = 1
                elif isinstance(pawn, BishopPawn):
                    input_network[x, y, planes_index + index_color + 12] = 1
                elif isinstance(pawn, SoldierPawn):
                    input_network[x, y, planes_index + index_color + 14] = 1
                elif isinstance(pawn, QueenPawn):
                    input_network[x, y, planes_index + index_color + 16] = 1
                elif isinstance(pawn, King):
                    input_network[x, y, planes_index + index_color + 18] = 1
                input_network[x, y, planes_index + index_color + 20] = player_mana
    return input_network
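# A quick sanity check on the plane layout above, under the assumption that
# self.max_features is the number of planes per stacked state (at least 28
# here: offsets 0-21 for per-pawn features, 22-24 turn counters, 25 runes,
# 26-27 possible-action counts per color) and that self.planes_total equals
# self.max_len * self.max_features. The concrete numbers are illustrative.
max_len = 4            # how many past states are stacked (assumed value)
max_features = 28      # planes per state, derived from the offsets used above
planes_total = max_len * max_features
print((9, 9, planes_total))   # expected input_network shape, e.g. (9, 9, 112)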
def do_self_play_episode(stacked_state,
                         mcts,
                         ae: ActionEncoder,
                         greed=False,
                         pov=0,
                         temperature_turn_end=AlphaZeroConfig.TEMPERATURE_END_STEP,
                         greedy_turn=AlphaZeroConfig.GREEDY_TURN_MIN,
                         max_turn_episode=AlphaZeroConfig.MAX_TURN_SIMULATION):
    """
    Self-play one episode. This function runs an MCTS simulation every turn.

    :param stacked_state: the stacked state
    :param mcts: the MCTS that will be simulated every turn
    :param ae: the Action Encoder
    :param greed: hack that makes the agent prioritize attacking and promoting
    :param pov: the chosen point of view of the player
    :param temperature_turn_end: set the temperature to 0 from this turn onward
    :param greedy_turn: turn from which the greedy hack is activated (if greed is True)
    :param max_turn_episode: maximum number of turns before the episode is cut off as a draw
    :return: training data that will be appended to the global list of training data
    """
    terminal = False
    counter_step = 0
    list_training = []
    while not terminal:
        counter_step += 1
        print("Step : %d" % (counter_step))
        print("Turn : %d" % (stacked_state.head.turn))
        print("Player in state : %d" % (stacked_state.head.get_player_turn()))
        print("Player in tree : %d" % (mcts.root.stacked_state.head.get_player_turn()))
        temperature = 1
        if counter_step > temperature_turn_end:
            temperature = 0
        # Hack: if counter_step > greedy_turn, greedily attack enemies
        if counter_step > greedy_turn and greed:
            print("Greedy Mode Activated")
            mcts.self_play(greed)
        else:
            mcts.self_play()
        action_proba = np.array(mcts.get_action_proba(temperature))
        action = np.random.choice(len(action_proba), p=action_proba)
        action_key = ae.inverse_transform([action])[0]
        possible_action = AIElements.get_possible_action(stacked_state.head)

        # Append to the training list
        if stacked_state.head.get_player_turn() != pov:
            training_data_tobe = HelperTrainingExample(
                mirror_stacked_state(deepcopy(stacked_state)),
                (stacked_state.head.get_player_turn()),
                ae.array_mirrored(action_proba))
        else:
            training_data_tobe = HelperTrainingExample(
                deepcopy(stacked_state),
                stacked_state.head.get_player_turn(),
                action_proba)
        print("Action_proba shape {}".format(action_proba.shape))
        list_training.append(training_data_tobe)
        # TODO : add v_threshold to cut

        stacked_state.head.print_board()
        print('Mirrored')
        mirror_stacked_state(deepcopy(stacked_state)).head.print_board()
        print("Player %d choose action %s" % (stacked_state.head.get_player_turn(), action_key))
        print("Next mean action value : %.4f" % (mcts.root.q_state_action[action_key]))

        new_state = AIElements.result_function(stacked_state.head,
                                               possible_action[action_key])
        stacked_state.append(new_state)
        mcts.update_root(action_key)
        terminal = stacked_state.head.is_terminal()
        if terminal:
            # Update the reward in list_training
            mcts.self_play()  # to fill the v variable
            reward = mcts.root.v  # reward should be -1
            loser_player = stacked_state.head.get_player_turn()
            for i in list_training:
                if i.current_player == loser_player:
                    i.reward = reward
                else:
                    i.reward = -reward
        if counter_step > max_turn_episode:
            # Terminate the episode and set the reward to 0 (draw)
            reward = 0
            for i in list_training:
                i.reward = reward
            terminal = True
        print("-----")
    return list_training
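# A hedged sketch of the surrounding training-data loop: run several self-play
# episodes and collect the HelperTrainingExample objects for network training.
# The StackedState and MCTS names and constructor signatures are assumptions
# made to match how these objects are used elsewhere in this code; the real
# entry point may construct them differently.
def generate_self_play_data(model, ae, episodes=10):
    all_training_examples = []
    for _ in range(episodes):
        state = State()
        state.initial_state()
        stacked_state = StackedState(state)       # assumed constructor
        mcts = MCTS(stacked_state, model, ae)     # assumed constructor
        all_training_examples.extend(
            do_self_play_episode(stacked_state, mcts, ae))
    return all_training_examples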
def expand_node(self, model_deep_net, player_color, label_encoder,
                epsilon=AlphaZeroConfig.MCTS_EPSILON,
                alpha_diri=AlphaZeroConfig.MCTS_ALPHA_DIRICHLET,
                cpuct=AlphaZeroConfig.MCTS_PUCT,
                greed_attack=False):
    """
    This function covers 2 steps of the MCTS: Select, and Expand & Evaluate.

    :param model_deep_net: the neural network model
    :param player_color: the player color (currently unused)
    :param label_encoder: the encoder used to encode the action keys
    :param epsilon: hyperparameter for mixing in the Dirichlet noise
    :param alpha_diri: hyperparameter of the Dirichlet distribution
    :param cpuct: exploration hyperparameter of the AlphaZero MCTS
    :param greed_attack: HACK! the agent will prioritize attacking and promoting
    :return: None; the node attributes (p_state, v, edge_action, selected_action)
        are updated in place
    """
    terminal = AIElements.is_over(self.stacked_state.head)
    self.is_terminal = terminal
    if not terminal:
        possible_action = AIElements.get_possible_action(self.stacked_state.head)
        possible_action_keys = list(possible_action.keys())
        if self.p_state is None:
            """
            Expand and Evaluate goes here!
            """
            if self.stacked_state.head.get_player_turn() == self.maximizer:
                state_stack_representation = np.array(
                    [self.stacked_state.get_deep_representation_stack()])
            else:
                state_stack_representation = mirror_stacked_state(self.stacked_state)
                state_stack_representation = np.array(
                    [state_stack_representation.get_deep_representation_stack()])
            self.p_state, self.v = model_deep_net.predict(state_stack_representation)
            self.v_ = self.v[0][0]
            self.v = self.v[0][0]
            self.p_state = self.p_state[0]
            if self.stacked_state.head.get_player_turn() != self.maximizer:
                self.p_state = label_encoder.array_mirrored(self.p_state)
            possible_action_ohe = label_encoder.transform(possible_action_keys).sum(axis=0)
            self.p_state *= possible_action_ohe

            sum_policy_state = np.sum(self.p_state)
            if sum_policy_state > 0:
                # normalize the policy so it sums to 1
                self.p_state /= sum_policy_state
            else:
                print("All valid moves were masked, do workaround.")
                self.p_state += possible_action_ohe
                self.p_state /= np.sum(self.p_state)

            # Initialize num and q for every action and create the child nodes
            for action in possible_action_keys:
                self.num_state_action[action] = 0
                self.q_state_action[action] = 0
                next_state = AIElements.result_function(self.stacked_state.head,
                                                        possible_action[action])
                new_stacked_state = deepcopy(self.stacked_state)
                new_stacked_state.append(next_state)
                if action not in self.edge_action:
                    self.edge_action[action] = NodeMCTS(new_stacked_state,
                                                        parent=self, root=False)
        else:
            """
            Select goes here
            """
            best_action = ""
            best_upper_confidence = -float('inf')
            dirchlet_prob = np.random.dirichlet([alpha_diri] * len(possible_action_keys))
            counter_loop = 0
            # Randomize possible_action_keys
            random.shuffle(possible_action_keys)
            for action in possible_action_keys:
                # Get the index of the action
                index_action = label_encoder.le.transform([action])[0]
                q_state_action_val = 0
                num_state_action_val = 0
                if action in self.q_state_action and action in self.num_state_action:
                    q_state_action_val = self.q_state_action[action]
                    num_state_action_val = self.num_state_action[action]
                if self.root:
                    upper_confidence = q_state_action_val + \
                        cpuct * ((1 - epsilon) * self.p_state[index_action] +
                                 epsilon * dirchlet_prob[counter_loop]) * \
                        np.sqrt(self.num_state) / (1 + num_state_action_val)
                else:
                    upper_confidence = q_state_action_val + \
                        cpuct * self.p_state[index_action] * \
                        np.sqrt(self.num_state) / (1 + num_state_action_val)
                if greed_attack and possible_action[action]['action'] == 'attack':
                    upper_confidence += AlphaZeroConfig.Q_ATTACK_GREEDY  # higher chance to attack
                if greed_attack and possible_action[action]['action'] == 'promote':
                    upper_confidence += AlphaZeroConfig.Q_PROMOTE_GREEDY  # higher chance to promote
                counter_loop += 1
                if best_upper_confidence < upper_confidence:
                    best_upper_confidence = upper_confidence
                    best_action = action
                # Expand the child node and check if it is terminal
                next_state = AIElements.result_function(self.stacked_state.head,
                                                        possible_action[action])
                new_stacked_state = deepcopy(self.stacked_state)
                new_stacked_state.append(next_state)
                if action not in self.edge_action:
                    self.edge_action[action] = NodeMCTS(new_stacked_state,
                                                        parent=self, root=False)
            self.selected_action = best_action
    else:
        self.v = self.stacked_state.head.sparse_eval(
            self.stacked_state.head.get_player_turn())
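# The selection rule above is the standard AlphaZero PUCT score. A minimal
# standalone version of the same formula (without the Dirichlet noise at the
# root and without the greedy-attack hack), for reference:
import math

def puct_score(q, prior, parent_visits, child_visits, cpuct):
    # Q(s, a) + c_puct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))
    return q + cpuct * prior * math.sqrt(parent_visits) / (1 + child_visits)

# Unvisited actions with a decent prior get a large exploration bonus:
print(puct_score(q=0.0, prior=0.3, parent_visits=100, child_visits=0, cpuct=1.5))   # 4.5
print(puct_score(q=0.2, prior=0.3, parent_visits=100, child_visits=20, cpuct=1.5))  # ~0.414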