def _MCTS(self, root: tree.Node, env: gym_connect_four.ConnectFourEnv, print_simulation_info: bool = False) -> tree.Node:
    start_time = time.time()
    original_move_count = env.get_move_count()
    while (time.time() - start_time) < self.max_play_time_sec:
        # Selection
        selected_node, actions_to_reach_selected_node = _select_best_node(root=root, exploration_constant=self.exploration_constant)

        # Follow the steps to reach the desired state
        for move in actions_to_reach_selected_node:
            env.step(move)

        # Expansion
        if not env.is_final_state():
            # assert not selected_node.children
            selected_node = _expand_node(selected_node, env.get_available_moves())
            env.step(selected_node.data.action)

        # Simulation
        simulation_reward = self._simulation_function(selected_node, env, self.player_code)

        # Back propagation
        _back_propagate(selected_node, simulation_reward)

        # Go back to the original state
        env.reset(move_to_return_to=original_move_count)

        # Show the current value estimate for each move
        if print_simulation_info:
            number_of_columns = env.get_board().shape[1]
            action_values = [None for _ in range(number_of_columns)]
            for c in root.children:
                action_values[c.data.action] = c.data.value
            print('\r', end='')
            for move in action_values:
                print('{}.{:<2} '.format('-' if move < 0 else ' ', int(np.trunc(np.abs(move * 100) - 1))) if move else '     ', end='')
            confidence = 50 * (root.data.value + 1)
            print(' Confidence: {:.1f}%{} '.format(confidence, '!' if confidence > 95 else ' '), end='')
    return root
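# --- Hedged sketch: _select_best_node, _expand_node, and _back_propagate are
# used by _MCTS above but are not defined in this excerpt. The versions below
# show the standard UCT behaviour they appear to implement, using only the
# fields _MCTS itself touches (node.children, node.parent, data.action,
# data.value, data.simulation_count). They are illustrative assumptions, not
# the project's actual implementations.

def _select_best_node(root: tree.Node, exploration_constant: float):
    """Descend via UCT; return the chosen leaf and the moves that reach it."""
    node, actions = root, []
    while node.children:
        parent_count = max(node.data.simulation_count, 1)
        # UCT: exploit high-value children, explore rarely-visited ones.
        node = max(node.children,
                   key=lambda c: c.data.value + exploration_constant
                   * np.sqrt(np.log(parent_count) / max(c.data.simulation_count, 1)))
        actions.append(node.data.action)
    return node, actions


def _expand_node(node: tree.Node, available_moves) -> tree.Node:
    """Create one child per legal move and return one of them for the rollout."""
    for move in available_moves:
        child = tree.Node(parent=node, data=_data())
        child.data.action = move
        node.children.append(child)
    return node.children[np.random.randint(len(node.children))]


def _back_propagate(node: tree.Node, reward: float) -> None:
    """Fold the rollout reward into every node on the path back to the root."""
    while node is not None:
        node.data.simulation_count += 1
        # Incremental running average of the simulation rewards.
        node.data.value += (reward - node.data.value) / node.data.simulation_count
        node = node.parent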
def get_movement(self, env: gym_connect_four.ConnectFourEnv, print_simulation_info: bool = True) -> np.int8:
    assert not env.is_final_state()
    assert self.player_code == env.get_current_player_code(), \
        "I'm player {} and the environment expects a play from {}".format(self.player_code, env.get_current_player_code())
    if self._use_multiprocessing:
        return self._get_movement_parallel(env)
    return self._get_movement_classic(env, print_simulation_info)
def _get_movement_classic(self, env: gym_connect_four.ConnectFourEnv, print_simulation_info: bool = True) -> np.int8:
    # Find the current state in the tree
    root = self.tree_root
    original_play_history = env.get_play_history()
    env.reset()
    for move in original_play_history:
        if not root.children:
            _expand_node(root, env.get_available_moves())
        for c in root.children:
            if c.data.action == move:
                root = c
                env.step(move)
                break

    original_simulation_count = root.data.simulation_count
    self._MCTS(root, env, print_simulation_info)

    # Greedy move choice
    chosen_child = max(root.children, key=lambda c: c.data.value)

    # Print confidence in the chosen move
    print('\nExpected reward from chosen move ({}): {:+.2f}'.format(chosen_child.data.action, chosen_child.data.value))
    print('Decision made based on', root.data.simulation_count, 'simulations. Of these, {} ({:.2f}%) were performed now.'.format(
        root.data.simulation_count - original_simulation_count,
        100 * (root.data.simulation_count - original_simulation_count) / root.data.simulation_count))
    return chosen_child.data.action
def _random_play(node: tree.Node, env: gym_connect_four.ConnectFourEnv, original_player: int) -> float:
    if env.is_draw_state():
        node.data.is_final = True
        return 0
    if env.is_win_state():
        node.data.is_final = True
        return original_player * env.get_reward()

    done: bool = False
    reward: float
    while not done:
        _, reward, done, _ = env.step(_RANDOM_PLAYER.get_movement(env))
    return original_player * reward
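# --- Hedged sketch: _RANDOM_PLAYER is used by _random_play above but is not
# defined in this excerpt. A minimal stand-in only needs a get_movement() that
# samples a legal column uniformly; the project's real random player may differ.
class _RandomPlayer:
    @staticmethod
    def get_movement(env: gym_connect_four.ConnectFourEnv) -> np.int8:
        # Pick uniformly among the currently legal columns.
        return np.random.choice(list(env.get_available_moves()))

_RANDOM_PLAYER = _RandomPlayer()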
def _get_movement_parallel(self, env: gym_connect_four.ConnectFourEnv) -> np.int8:
    roots = []
    env_copies = []
    for move in env.get_available_moves():
        roots.append(tree.Node(parent=None, data=_data()))
        roots[-1].children = [tree.Node(parent=roots[-1], data=_data())]
        roots[-1].children[0].data.action = move
        # One environment for each process
        env_copies.append(copy.deepcopy(env))

    with multiprocessing.Pool() as processing_pool:
        roots = processing_pool.starmap(self._MCTS, zip(roots, env_copies))
    children = [r.children[0] for r in roots]

    # Greedy move choice
    chosen_child = max(children, key=lambda c: c.data.value)

    # Print confidence in the chosen move
    simulation_count = sum([r.data.simulation_count for r in roots])
    expected_reward = np.average([r.data.value for r in roots], weights=[r.data.simulation_count for r in roots])
    confidence = 50 * (expected_reward + 1)
    print('Confidence: {:.1f}%{} '.format(confidence, '!' if confidence > 95 else ' '))
    print('Expected reward from chosen move ({}): {:+.2f}'.format(chosen_child.data.action, chosen_child.data.value))
    print('Decision made based on', simulation_count, 'simulations, all of them performed now.')
    return chosen_child.data.action
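# Design note: _get_movement_parallel is "root parallelisation" MCTS. Each
# legal first move gets its own one-child root plus a deep copy of the
# environment, and an independent search runs per process via Pool.starmap.
# The per-process trees are discarded afterwards, so, unlike
# _get_movement_classic, no knowledge accumulates in self.tree_root between
# calls; only the returned child values and simulation counts are compared.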
def _short_sighted(node: tree.Node, env: gym_connect_four.ConnectFourEnv, original_player: int) -> float:
    if env.is_draw_state():
        return 0
    if env.is_win_state():
        return original_player * env.get_reward()

    done: bool = False
    while not done:
        _ONE_MOVE_GREEDY_PLAYER.player_code = env.get_current_player_code()
        _, _, done, _ = env.step(_ONE_MOVE_GREEDY_PLAYER.get_movement(env))
    return original_player * env.get_reward()
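# --- Hedged sketch: _ONE_MOVE_GREEDY_PLAYER (presumably an instance of
# one_move_greedy_player.Player) is used by _short_sighted above but is not
# defined in this excerpt. A plausible minimal version takes any immediately
# winning move and otherwise plays at random; the real module may also block
# the opponent's immediate wins.
class _OneMoveGreedyPlayerSketch:
    def __init__(self, player_code: int):
        self.player_code = player_code

    def get_movement(self, env: gym_connect_four.ConnectFourEnv) -> np.int8:
        moves = list(env.get_available_moves())
        for move in moves:
            trial = copy.deepcopy(env)  # probe each move on a copy
            trial.step(move)
            if trial.is_win_state():
                return move             # immediate win found
        return np.random.choice(moves)  # otherwise play randomly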
def get_movement(self, env: gym_connect_four.ConnectFourEnv):
    while True:
        try:
            play = input('Your play: ')
            if play == 'z':
                # Undo the last pair of moves (ours and the opponent's)
                env.undo_moves(2)
                env.render()
                print('Game history:', env.get_play_history())
                continue
            if env.is_valid_action(int(play)):
                return int(play)
            raise Exception()           # out-of-range column
        except Exception:               # bad int() parse or invalid column
            print('Invalid input!')
def train_AIs(env: gym_connect_four.ConnectFourEnv, white_AI_name: str, black_AI_name: str,
              how_many_seconds_for_each_play: float, how_many_games: int, exploration_constant: float,
              print_simulation_info: bool, save_tree_after_each_game: bool = True) -> None:
    assert white_AI_name
    assert black_AI_name
    assert how_many_seconds_for_each_play > 0
    assert how_many_games > 0
    assert exploration_constant >= 0

    main_whites = Player(
        player_code=env.get_player_code_for_whites(),
        max_time_sec=how_many_seconds_for_each_play,
        rollout_policy='random_greedy',
        exploration_constant=exploration_constant,
        use_multiprocessing=False,
        load_tree_from_file=white_AI_name + '.tree'
    )
    main_blacks = Player(
        player_code=env.get_player_code_for_blacks(),
        max_time_sec=how_many_seconds_for_each_play,
        rollout_policy='random_greedy',
        exploration_constant=exploration_constant,
        use_multiprocessing=False,
        load_tree_from_file=black_AI_name + '.tree'
    )
    fast_whites = Player(
        player_code=env.get_player_code_for_whites(),
        max_time_sec=0.1,
        rollout_policy='random_greedy',
        exploration_constant=1,
        use_multiprocessing=False
        # load_tree_from_file=black_AI_name + '.tree'
    )
    strong_whites = Player(
        player_code=env.get_player_code_for_whites(),
        max_time_sec=1,
        rollout_policy='random_greedy',
        exploration_constant=1.414213,
        use_multiprocessing=True
        # load_tree_from_file=black_AI_name + '.tree'
    )
    short_sighted_whites = one_move_greedy_player.Player(player_code=env.get_player_code_for_whites())

    # player_whites = short_sighted_whites
    player_whites = fast_whites
    # player_whites = strong_whites
    player_blacks = main_blacks

    wins_whites_count: int = 0
    wins_blacks_count: int = 0
    draws_count: int = 0

    for i in range(how_many_games):
        env.reset()
        move: int
        while not env.is_final_state():
            print()
            print('Game #: {}/{}'.format(i + 1, how_many_games))
            print('⬤ wins:', wins_whites_count, '({:.1f}%)'.format((100 * wins_whites_count / i) if (i > 0) else 0.0), '({})'.format(white_AI_name))
            print('◯ wins:', wins_blacks_count, '({:.1f}%)'.format((100 * wins_blacks_count / i) if (i > 0) else 0.0), '({})'.format(black_AI_name))
            print('Draws:', draws_count, '({:.1f}%)'.format((100 * draws_count / i) if (i > 0) else 0.0))
            print('Time for move: {:.2f}s'.format(how_many_seconds_for_each_play))
            print('\nGame history:', env.get_play_history())
            env.render()

            if env.get_current_player_code() == player_whites.player_code:
                move = player_whites.get_movement(env, print_simulation_info=print_simulation_info)
            elif env.get_current_player_code() == player_blacks.player_code:
                move = player_blacks.get_movement(env, print_simulation_info=print_simulation_info)
            else:
                raise Exception('No known player matches player code {}. Is there a player for whites AND for blacks?'.format(env.get_current_player_code()))
            env.step(move)

        # Show the final state
        env.render()

        # Update win counts
        if env.is_draw_state():
            draws_count += 1
        elif int(env.get_reward()) == player_whites.player_code:
            wins_whites_count += 1
        elif int(env.get_reward()) == player_blacks.player_code:
            wins_blacks_count += 1
        else:
            print('WARNING: Error while counting wins!')

        if save_tree_after_each_game:
            if type(player_whites) == Player:
                player_whites.save_tree(player_whites.loaded_file)
            if type(player_blacks) == Player:
                player_blacks.save_tree(player_blacks.loaded_file)

    if not save_tree_after_each_game:
        if type(player_whites) == Player:
            player_whites.save_tree(player_whites.loaded_file)
        if type(player_blacks) == Player:
            player_blacks.save_tree(player_blacks.loaded_file)
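# --- Usage sketch: how train_AIs might be invoked. The environment
# construction and the AI names ('whites', 'blacks') are assumptions; only the
# keyword arguments are taken from the signature above.
if __name__ == '__main__':
    env = gym_connect_four.ConnectFourEnv()      # assumed default constructor
    train_AIs(
        env=env,
        white_AI_name='whites',                  # hypothetical tree-file prefix
        black_AI_name='blacks',                  # hypothetical tree-file prefix
        how_many_seconds_for_each_play=1.0,
        how_many_games=100,
        exploration_constant=1.414213,           # ~sqrt(2), as used for strong_whites
        print_simulation_info=True,
    )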
def train_one_episode(env: ConnectFourEnv, params: Dict, players: Dict, total_step: int = 0) -> Tuple[float, int]:
    """
    Perform 1 training episode in the ConnectFour environment.

    :param env: Gym environment (ConnectFourEnv).
    :param params: Hyper-parameter dictionary.
    :param players: A dictionary containing the instance of Player 1, Player 2,
        and the ID of the player to be trained (trainee).
    :param total_step: Total number of steps performed in env.
    :return: A tuple of the final reward and the updated total_step.
    """
    # noinspection PyRedeclaration
    state = env.reset()

    # Set player.
    player_id = 1
    player = players[player_id]
    trainee_id = players.get("trainee_id", 1)

    # Extract PARAMS.
    epochs = params.get("EPOCHS_PER_LEARNING", 1)

    # Do one step ahead of the while loop:
    # initialize the action history and perform the first step.
    epsilon = player.get_epsilon(total_step=total_step)
    action = player.get_next_action(state, epsilon=epsilon)
    action_hist = deque([action], maxlen=2)
    next_state, reward, done, _ = env.step(action)

    # Initialize the state history and save the state and the next state.
    state_hist = deque([state], maxlen=4)
    next_state *= -1  # Multiply by -1 to change player perspective of the game board.
    state_hist.append(next_state)
    state = next_state

    # Change player and enter the while loop.
    player_id = env.change_player()
    player = players[player_id]

    while not done:
        # Get the current player's action.
        epsilon = player.get_epsilon(total_step=total_step)
        action_hist.append(player.get_next_action(state, epsilon=epsilon))

        # Take the latest action in the deque. In the endgame, winner here.
        next_state, reward, done, _ = env.step(action_hist[-1])

        # Store the resulting state to history.
        # If the next player is player 2,
        # multiply next_state by -1 to change player perspective.
        if player_id == 1:
            next_state *= -1
        state_hist.append(next_state)

        # Change player here.
        player_id = env.change_player()
        player = players[player_id]

        # Update DQN weights. In the endgame, loser here.
        reward *= -1
        player.learn(
            state_hist[-3], action_hist[-2],  # state and action
            state_hist[-1], reward, done,     # next state, reward, done
            n_step=total_step, epochs=epochs)

        # Update training counters at the end, for the next step.
        total_step += 1
        state = next_state

        # Render game board (NOT recommended with large N_EPISODES)
        # env.render()

    # Change player at the end of the episode.
    player_id = env.change_player()
    player = players[player_id]

    # Both players have learnt all steps at the end. In the endgame, winner here.
    reward *= -1
    player.learn(
        state_hist[-2], action_hist[-1],
        state_hist[-1] * -1, reward, done,  # Multiply by -1 to change the owner of each move.
        n_step=total_step, epochs=epochs)

    # Adjust reward for the trainee:
    # if the winner is the opponent, give the opposite reward to the trainee.
    if player_id != trainee_id:
        reward *= -1
    total_step += 1

    return reward, total_step
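# --- Usage sketch: a minimal driver around train_one_episode. Only the
# function's own signature is relied upon; the episode count, print cadence,
# and the contents of params/players are assumptions.
def train(env: ConnectFourEnv, params: Dict, players: Dict, n_episodes: int = 1000) -> None:
    total_step = 0
    for episode in range(n_episodes):
        reward, total_step = train_one_episode(env, params, players, total_step)
        if (episode + 1) % 100 == 0:
            print('Episode {}: trainee reward {:+.1f}, total steps {}'.format(
                episode + 1, reward, total_step))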