Example #1
    def _MCTS(self, root : tree.Node, env : gym_connect_four.ConnectFourEnv, print_simulation_info : bool = False) -> tree.Node:

        start_time = time.time()

        original_move_count = env.get_move_count()


        while ((time.time() - start_time) < (self.max_play_time_sec)):

            # Selection
            selected_node, actions_to_reach_selected_node = _select_best_node(root=root, exploration_constant=self.exploration_constant)

            # Follow the steps to reach the desired state
            for move in actions_to_reach_selected_node:

                env.step(move)
            
            # Expansion
            if (not env.is_final_state()):

                # assert not selected_node.children

                selected_node = _expand_node(selected_node, env.get_available_moves())
                env.step(selected_node.data.action)


            # Simulation
            simulation_reward = self._simulation_function(selected_node, env, self.player_code)
            

            # Back propagation
            _back_propagate(selected_node, simulation_reward)


            # Go back to original state
            env.reset(move_to_return_to=original_move_count)


            # Print the current value estimate for each move
            if (print_simulation_info):

                number_of_columns = env.get_board().shape[1]
                action_values = [None for _ in range(number_of_columns)]

                for c in root.children:
                
                    action_values[c.data.action] = c.data.value
                
                # Overwrite the previous status line in place
                print('\r', end='')
                for move in action_values:
                    
                    # Blank for unexplored columns; otherwise a sign and two digits of |value|
                    print('{}.{:<2} '.format('-' if move < 0 else ' ', int(np.trunc(np.abs(move*100) - 1))) if move else '     ', end='')

                confidence = 50 * (root.data.value + 1)
                print(' Confidence: {:.1f}%{} '.format(confidence, '!' if confidence > 95 else ' '), end='')


        return root
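
The helper functions `_select_best_node`, `_expand_node`, and `_back_propagate` are referenced above but not shown. Below is a minimal sketch of what UCT selection and backpropagation typically look like for this node layout (nodes with `parent`, `children`, and a `data` payload carrying `action`, `value`, and `simulation_count`); it is an assumption about the missing helpers, not the repository's actual code:

import math

# Minimal sketch, not the repository's implementation: UCT selection and
# backpropagation over nodes exposing .parent, .children and a .data payload
# with value (running mean reward), simulation_count and action.

def _select_best_node_sketch(root, exploration_constant):

    node, path = root, []

    while node.children:

        parent_visits = node.data.simulation_count

        # UCT: exploit the running mean value, explore rarely visited children
        node = max(
            node.children,
            key=lambda c: c.data.value + exploration_constant * math.sqrt(
                math.log(parent_visits + 1) / (c.data.simulation_count + 1)
            )
        )
        path.append(node.data.action)

    return node, path


def _back_propagate_sketch(node, reward):

    # Update running means up to the root; many implementations flip the
    # reward sign at each level because the players alternate (the real
    # _back_propagate may handle the sign differently).
    while node is not None:

        node.data.simulation_count += 1
        node.data.value += (reward - node.data.value) / node.data.simulation_count
        reward = -reward
        node = node.parent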
Example #2
    def get_movement(self, env : gym_connect_four.ConnectFourEnv, print_simulation_info : bool = True) -> np.int8:

        assert not env.is_final_state()
        assert self.player_code == env.get_current_player_code(), "I'm player {} and the environment expects a play from {}".format(self.player_code, env.get_current_player_code())
        
        
        if (self._use_multiprocessing):

            return self._get_movement_parallel(env)
        
        return self._get_movement_classic(env, print_simulation_info)
Example #3
    def _get_movement_classic(self, env : gym_connect_four.ConnectFourEnv, print_simulation_info : bool = True) -> np.int8:

        # Find current state in tree
        root = self.tree_root

        original_play_history = env.get_play_history()

        env.reset()
        for move in original_play_history:

            if (not root.children):

                _expand_node(root, env.get_available_moves())
            
            for c in root.children:

                if c.data.action == move:

                    root = c
                    env.step(move)
                    break


        original_simulation_count = root.data.simulation_count
        self._MCTS(root, env, print_simulation_info)

        # Greedy move choice
        chosen_child = max(root.children, key=lambda c: c.data.value)

        # Print confidence in chosen move
        print('\nExpected reward from chosen move ({}): {:+.2f}'.format(chosen_child.data.action, chosen_child.data.value))
        performed_now = root.data.simulation_count - original_simulation_count
        print('Decision made based on {} simulations. Of these, {} ({:.2f}%) were performed now.'.format(root.data.simulation_count, performed_now, 100 * performed_now / root.data.simulation_count))
        
        return chosen_child.data.action
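
The tree walk above relies on each `tree.Node` exposing a `parent`, a `children` list, and a `data` payload with at least `action`, `value`, `simulation_count`, and `is_final`. A hypothetical payload matching that usage (names inferred from the calls in these examples, not taken from the actual `tree` module):

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class _data_sketch:

    # Fields inferred from how the examples read node.data; hypothetical names
    action: Optional[int] = None       # column played to reach this node
    value: float = 0.0                 # running mean of simulation rewards
    simulation_count: int = 0          # number of rollouts through this node
    is_final: bool = False             # terminal game state reached


@dataclass
class _node_sketch:

    parent: Optional["_node_sketch"] = None
    data: _data_sketch = field(default_factory=_data_sketch)
    children: List["_node_sketch"] = field(default_factory=list)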
Example #4
def _random_play(node : tree.Node, env : gym_connect_four.ConnectFourEnv, original_player : int) -> float:

    if (env.is_draw_state()):

        node.data.is_final = True
        return 0

    if (env.is_win_state()):

        node.data.is_final = True
        return original_player * env.get_reward()


    done : bool = False
    reward : float

    while (not done):

        _, reward, done, _ = env.step(_RANDOM_PLAYER.get_movement(env))
    

    return original_player * reward
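
`_RANDOM_PLAYER` is not shown; from its use here it only needs a `get_movement` that returns a uniformly random legal column. A hypothetical stand-in:

import random


class _RandomPlayerSketch:

    """Hypothetical stand-in for _RANDOM_PLAYER: uniform choice over legal columns."""

    def get_movement(self, env):

        return random.choice(list(env.get_available_moves()))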
Example #5
    def _get_movement_parallel(self, env : gym_connect_four.ConnectFourEnv) -> np.int8:
        
        roots = []
        env_copies = []

        for move in env.get_available_moves():

            roots.append(tree.Node(parent=None, data=_data()))
            roots[-1].children = [tree.Node(parent=roots[-1], data=_data())]
            roots[-1].children[0].data.action = move

            # One environment for each process
            env_copies.append(copy.deepcopy(env))



        with multiprocessing.Pool() as processing_pool:
            
            roots = processing_pool.starmap(self._MCTS, list(zip(roots, env_copies)))


        children = [r.children[0] for r in roots]

        # Greedy move choice
        chosen_child = max(children, key=lambda c: c.data.value)

        # Print confidence in chosen move
        simulation_count = sum([r.data.simulation_count for r in roots])
        expected_reward = np.average([r.data.value for r in roots], weights=[r.data.simulation_count for r in roots])

        confidence = 50 * (expected_reward + 1)
        print('Confidence: {:.1f}%{} '.format(confidence, '!' if confidence > 95 else ' '))
        print('Expected reward from chosen move ({}): {:+.2f}'.format(chosen_child.data.action, chosen_child.data.value))
        print('Decision made based on', simulation_count, 'simulations. All of them performed now.')

        return chosen_child.data.action
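
Each process receives its own deep copy of the environment because processes do not share memory, and `Pool.starmap` fans one `(root, env)` pair out per available move (root parallelization). The same fan-out pattern in a self-contained toy, with a trivial worker standing in for `self._MCTS`:

import multiprocessing


def _toy_worker(label, value):

    # Stands in for self._MCTS(root, env): gets its own copy of the per-move
    # arguments and returns an independent result.
    return label, value * value


if __name__ == '__main__':

    args = list(zip(['a', 'b', 'c'], [1, 2, 3]))

    with multiprocessing.Pool() as pool:

        results = pool.starmap(_toy_worker, args)

    print(results)  # [('a', 1), ('b', 4), ('c', 9)]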
Example #6
def _short_sighted(node : tree.Node, env : gym_connect_four.ConnectFourEnv, original_player : int) -> float:

    if (env.is_draw_state()):

        return 0

    if (env.is_win_state()):

        return original_player * env.get_reward()


    done : bool = False

    while (not done):

        _ONE_MOVE_GREEDY_PLAYER.player_code = env.get_current_player_code()
        _, _, done, _ = env.step(_ONE_MOVE_GREEDY_PLAYER.get_movement(env))
    

    return original_player * env.get_reward()
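
`_ONE_MOVE_GREEDY_PLAYER` is assumed to be a one-ply lookahead player: take an immediately winning column if one exists, otherwise play randomly (a stronger variant would also block the opponent's immediate win). A hedged sketch of such a player, assuming the environment can be deep-copied to test a move:

import copy
import random


class _OneMoveGreedySketch:

    """Hypothetical stand-in for _ONE_MOVE_GREEDY_PLAYER (one-ply lookahead)."""

    def __init__(self, player_code=1):

        self.player_code = player_code

    def get_movement(self, env):

        moves = list(env.get_available_moves())

        # Take an immediately winning column if one exists
        for move in moves:

            trial = copy.deepcopy(env)
            trial.step(move)
            if trial.is_win_state():

                return move

        # Otherwise fall back to a random legal column
        return random.choice(moves)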
Example #7
    def get_movement(self, env : gym_connect_four.ConnectFourEnv):

        while (True):

            try:

                play = input('Your play: ')

                if (play == 'z'):

                    env.undo_moves(2)
                    env.render()
                    print('Game history:', env.get_play_history())
                    continue

                if (env.is_valid_action(int(play))):

                    return int(play)
                
                # Anything else (non-integer input or a full/invalid column) is rejected
                raise ValueError()
            
            except Exception:

                print('Invalid input!')
Example #8
def train_AIs(env : gym_connect_four.ConnectFourEnv, white_AI_name : str, black_AI_name : str, how_many_seconds_for_each_play : float, how_many_games : int, exploration_constant : float, print_simulation_info : bool, save_tree_after_each_game : bool = True) -> None:

    assert white_AI_name
    assert black_AI_name
    assert how_many_seconds_for_each_play > 0
    assert how_many_games > 0
    assert exploration_constant >= 0


    main_whites = Player(
        player_code=env.get_player_code_for_whites(),
        max_time_sec=how_many_seconds_for_each_play,
        rollout_policy='random_greedy',
        exploration_constant=exploration_constant,
        use_multiprocessing=False,
        load_tree_from_file=white_AI_name + '.tree'
    )
    main_blacks = Player(
        player_code=env.get_player_code_for_blacks(),
        max_time_sec=how_many_seconds_for_each_play,
        rollout_policy='random_greedy',
        exploration_constant=exploration_constant,
        use_multiprocessing=False,
        load_tree_from_file=black_AI_name + '.tree'
    )

    fast_whites = Player(
        player_code=env.get_player_code_for_whites(),
        max_time_sec=0.1,
        rollout_policy='random_greedy',
        exploration_constant=1,
        use_multiprocessing=False
        # load_tree_from_file=black_AI_name + '.tree'
    )

    strong_whites = Player(
        player_code=env.get_player_code_for_whites(),
        max_time_sec=1,
        rollout_policy='random_greedy',
        exploration_constant=1.414213,
        use_multiprocessing=True
        # load_tree_from_file=black_AI_name + '.tree'
    )

    short_sighted_whites = one_move_greedy_player.Player(player_code=env.get_player_code_for_whites())


    # player_whites = short_sighted_whites
    player_whites : Player = fast_whites
    # player_whites = strong_whites
    
    player_blacks : Player = main_blacks


    wins_whites_count : int = 0
    wins_blacks_count : int = 0
    draws_count : int = 0

    for i in range(how_many_games):


        env.reset()

        move : int

        while (not env.is_final_state()):

            print()
            print('Game #: {}/{}'.format((i+1), how_many_games))
            print('⬤  wins:', wins_whites_count, '({:.1f}%)'.format((100 * wins_whites_count/i) if (i > 0) else 0.0), '({})'.format(white_AI_name))
            print('◯  wins:', wins_blacks_count, '({:.1f}%)'.format((100 * wins_blacks_count/i) if (i > 0) else 0.0), '({})'.format(black_AI_name))
            print('Draws:', draws_count, '({:.1f}%)'.format((100 * draws_count / i) if (i > 0) else 0.0))
            print('Time for move: {:.2f}s'.format(how_many_seconds_for_each_play))
            print('\nGame history:', env.get_play_history())
            env.render()
            

            if (env.get_current_player_code() == player_whites.player_code):

                move = player_whites.get_movement(env, print_simulation_info=print_simulation_info)

            elif (env.get_current_player_code() == player_blacks.player_code):

                move = player_blacks.get_movement(env, print_simulation_info=print_simulation_info)
            
            else:

                raise Exception('No known player matches player code {}. Is there a player for whites AND for blacks?'.format(env.get_current_player_code()))


            env.step(move)
        

        # Show final stage
        env.render()
        
        # Update win counts
        if (env.is_draw_state()):

            draws_count += 1
        
        else:

            if (int(env.get_reward()) == player_whites.player_code):

                wins_whites_count += 1
                
            elif (int(env.get_reward()) == player_blacks.player_code):

                wins_blacks_count += 1
            
            else:

                print('WARNING: Error while counting wins!')
        

        if (save_tree_after_each_game):
            
            if (type(player_whites) == Player):
                
                player_whites.save_tree(player_whites.loaded_file)
            
            if (type(player_blacks) == Player):
            
                player_blacks.save_tree(player_blacks.loaded_file)


    if (not save_tree_after_each_game):
        
        if (type(player_whites) == Player):
            
            player_whites.save_tree(player_whites.loaded_file)
        
        if (type(player_blacks) == Player):
        
            player_blacks.save_tree(player_blacks.loaded_file)
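
A typical call might look like the following; the `ConnectFourEnv()` constructor and the concrete argument values are assumptions for illustration, not taken from the examples above:

# Hypothetical driver; the ConnectFourEnv() constructor may differ in the real package.
import gym_connect_four

env = gym_connect_four.ConnectFourEnv()

train_AIs(
    env=env,
    white_AI_name='whites_v1',                 # tree persisted as 'whites_v1.tree'
    black_AI_name='blacks_v1',
    how_many_seconds_for_each_play=1.0,
    how_many_games=100,
    exploration_constant=1.414213,             # ~sqrt(2), the classic UCT constant
    print_simulation_info=True,
    save_tree_after_each_game=True
)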
Example #9
def train_one_episode(env: ConnectFourEnv,
                      params: Dict,
                      players: Dict,
                      total_step: int = 0) -> Tuple[float, int]:
    """
    Perform 1 training episode in ConnectFour Environment.

    :param env: Gym environment (ConnectFourEnv).
    :param params: Hyper-parameter dictionary.
    :param players: A dictionary containing the instance of
        Player 1, Player 2, and the ID of player to be trained (trainee).
    :param total_step: Total number of steps performed in env.
    :return: A tuple of final reward and updated total_step.
    """
    # noinspection PyRedeclaration
    state = env.reset()

    # Set player.
    player_id = 1
    player = players[player_id]
    trainee_id = players.get("trainee_id", 1)

    # Extract PARAMS.
    epochs = params.get("EPOCHS_PER_LEARNING", 1)

    # Do one step ahead of the while loop
    # Initialize action history and perform first step.
    epsilon = player.get_epsilon(total_step=total_step)
    action = player.get_next_action(state, epsilon=epsilon)
    action_hist = deque([action], maxlen=2)
    next_state, reward, done, _ = env.step(action)

    # Initialize the state history and save the state and the next state
    state_hist = deque([state], maxlen=4)
    next_state *= -1  # Multiply -1 to change player perspective of game board.
    state_hist.append(next_state)
    state = next_state

    # Change player and enter while loop
    player_id = env.change_player()
    player = players[player_id]

    while not done:
        # Get current player's action.
        epsilon = player.get_epsilon(total_step=total_step)
        action_hist.append(player.get_next_action(state, epsilon=epsilon))

        # Take the latest action in the deque. In endgame, winner here.
        next_state, reward, done, _ = env.step(action_hist[-1])

        # Store the resulting state to history.
        # If the next player is player 2,
        #   Multiply next_state by -1 to change player perspective.
        if player_id == 1:
            next_state *= -1
        state_hist.append(next_state)

        # Change player here
        player_id = env.change_player()
        player = players[player_id]

        # Update DQN weights. In endgame, loser here.
        reward *= -1
        player.learn(
            state_hist[-3],
            action_hist[-2],  # state and action
            state_hist[-1],
            reward,
            done,  # next state, reward, done
            n_step=total_step,
            epochs=epochs)

        # Update training result at the end for the next step
        total_step += 1
        state = next_state

        # Render game board (NOT recommended with large N_EPISODES)
        # env.render()

    # Change player at the end of episode.
    player_id = env.change_player()
    player = players[player_id]

    # Both players have learned all steps by the end. In endgame, winner here.
    reward *= -1
    player.learn(
        state_hist[-2],
        action_hist[-1],
        state_hist[-1] * -1,
        reward,
        done,
        # Multiply -1 to change owner of each move.
        n_step=total_step,
        epochs=epochs)

    # Adjust reward for trainee.
    # If winner is opponent, we give opposite reward to trainee.
    if player_id != trainee_id:
        reward *= -1  # adjust reward.

    total_step += 1

    return reward, total_step
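
`train_one_episode` is meant to be driven by an outer loop that threads `total_step` through successive episodes. A minimal sketch of such a driver, with the `params` keys and the contents of `players` assumed rather than taken from the source:

# Minimal sketch of an outer training loop; the PARAMS keys, the Player classes
# and the reward convention noted below are assumptions.
def train(env, params, players):

    total_step = 0
    rewards = []

    for episode in range(params.get("N_EPISODES", 1000)):

        reward, total_step = train_one_episode(env, params, players, total_step)
        rewards.append(reward)   # +1 trainee win, -1 loss, 0 draw (assumed convention)

        if (episode + 1) % 100 == 0:

            mean_reward = sum(rewards[-100:]) / 100
            print('Episode {}: mean reward over last 100 = {:+.2f}'.format(episode + 1, mean_reward))

    return rewards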