Example #1
import unittest

# Project-level imports (MCTS, StateManager, MockActorNet, TestConstants) are
# assumed to come from the surrounding repository.


class TestMCTS(unittest.TestCase):
    def setUp(self) -> None:
        self.state_manager = StateManager(TestConstants.K,
                                          TestConstants.STARTING_PLAYER)
        self.a_net = MockActorNet()

        self.mcts = MCTS(self.state_manager, self.a_net)
        init_state = self.state_manager.get_state()
        self.changed_index = 3
        self.visited_state = f"{init_state[:self.changed_index]}{TestConstants.STARTING_PLAYER}" \
                             f"{init_state[self.changed_index+1:-1]}" \
                             f"{StateManager.get_opposite_player(TestConstants.STARTING_PLAYER)}"
        self.root_child_states = self.state_manager.generate_child_states(
            init_state)
        # Building first layer of tree
        for child in self.root_child_states:
            self.mcts.tree.add_state_node(child)
            self.mcts.tree.add_edge(init_state, child)

    def test_get_distribution(self):
        self.mcts.tree.increment_state_number_of_visits(self.visited_state)
        distribution = self.mcts.get_distribution(
            self.state_manager.get_state())
        # If there is only one visited state it should have the whole distribution
        self.assertEqual(distribution[self.changed_index], 1)
        # Adding more runs will change the distribution
        second_changed_index = 8
        for i in range(3):
            self.mcts.tree.increment_state_number_of_visits(
                self.root_child_states[second_changed_index])
        distribution = self.mcts.get_distribution(
            self.state_manager.get_state())
        self.assertEqual(distribution[self.changed_index], 0.25)
        self.assertEqual(distribution[second_changed_index], 0.75)
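
A minimal entry point for running this test case directly (a sketch; it assumes the class above lives in its own module):

if __name__ == "__main__":
    unittest.main()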
Example #2
import math
import random
from tkinter import Button, Canvas, Entry, Label, Tk, mainloop

import numpy as np

# Project-level imports (ANET, Cell, StateManager and the colour/font
# constants) are assumed to come from the surrounding repository.


class GameVisualizer:
    def __init__(
        self,
        board_size,
        player1=None,
        player2=None,
        starting_player=1,
        random_play=False,
        frame_rate=1000,
        initial_state=None,
        cartesian_cords=True,
    ):
        # Game logic
        self.board_size = board_size
        self.initial_state = initial_state
        self.state_manager = StateManager(board_size, starting_player)
        if initial_state:
            self.state_manager.set_state_manager(initial_state)
        self.random_play = random_play

        # Setting players
        self.player1 = player1
        self.player2 = player2

        # WINDOW
        self.master = Tk()
        self.master.title("HexGameVisualizer")
        self.master.protocol("WM_DELETE_WINDOW", self.quit_application)

        self.action_input = Entry(self.master)
        self.action_input.bind("<Return>", lambda event: self.button_clicked())
        self.action_input.pack()

        self.perform_action_button = Button(
            self.master, text="perform move", command=self.button_clicked
        )
        self.perform_action_button.pack()

        # TODO: Add label to describe the players currently playing
        self.label = Label(self.master)
        self.label.pack()

        self.start_pos = (60, 30)
        self.canvas = Canvas(
            self.master,
            width=self.start_pos[0] + self.board_size * 55 + self.start_pos[0],
            height=self.start_pos[1] + self.board_size * 33 + self.start_pos[1],
        )
        self.canvas.pack()

        # CONSTANTS
        self.frame_rate = frame_rate
        self.border_size = 10
        self.counter = 0
        self.size = 20
        self.cartesian_cords = cartesian_cords

        # LISTS CONTROLLING GAME AND DRAWING OF BOARD
        self.board = []
        self.board_border = []
        self.actions = []
        self.player_pieces = []

    def quit_application(self):
        import sys

        self.master.quit()
        sys.exit()

    def add_action(self, action: str):
        self.actions.append(action)

    def preprocess_actions(self):
        new_actions = []
        for action in self.actions:
            new_actions.append(GameVisualizer.preprocess_action(action))
        return new_actions

    @staticmethod
    def preprocess_action(action: str):
        positions, player = action.split(":")
        x_pos, y_pos = positions.split(",")
        return int(x_pos), int(y_pos), int(player)

    def run(self):
        self.actions = self.preprocess_actions()
        self.build_and_draw_board()
        if self.initial_state:
            self.state_manager.set_state_manager(self.initial_state)
            self.draw_initial_state()
        if len(self.actions):
            self.master.after(self.frame_rate, self.draw)
        mainloop()

    def model_perform_action(self, model: ANET):
        print(self.state_manager.get_state())
        distribution = model.predict(self.state_manager.get_state())
        print(distribution)
        argmax_distribution_index = int(
            np.argmax(distribution)
        )  # Greedy best from distribution
        action = self.state_manager.get_action_from_flattened_board_index(
            argmax_distribution_index, self.state_manager.get_state()
        )
        self.perform_action(GameVisualizer.preprocess_action(action))

    def button_clicked(self):
        if self.state_manager.is_end_state():
            return
        try:
            current_player = self.player1 if self.state_manager.current_player() == 1 else self.player2
            if current_player:
                self.model_perform_action(current_player)
            else:
                input_action = (
                    f"{self.action_input.get()}:{self.state_manager.current_player()}"
                )
                if self.random_play:
                    input_action = random.choice(
                        self.state_manager.generate_possible_actions(
                            self.state_manager.get_state()
                        )
                    )
                self.perform_action(GameVisualizer.preprocess_action(input_action))
            self.action_input.delete(0, "end")
        except ValueError:
            self.label["text"] = "Something went wrong"
        if self.state_manager.is_end_state():
            self.label["text"] = "Game over"

    def draw_initial_state(self):
        initial_board = self.state_manager.build_board(self.initial_state)
        for row_index, row in enumerate(initial_board):
            for col_index, player in enumerate(row):
                if player:
                    self.player_pieces.append(
                        Cell(
                            self.canvas,
                            self.board[row_index][col_index].top,
                            player=player,
                        )
                    )

    def get_canvas_position(self, position: (int, int)) -> (int, int):
        x, y = self.start_pos
        x += self.size * 2 * position[1] + self.size * position[0]
        y += (self.size + self.size / 1.7) * position[0]
        return x, y

    def build_and_draw_board(self):
        for i in range(self.board_size):
            row = []
            for j in range(self.board_size):
                row.append(
                    Cell(
                        self.canvas,
                        self.get_canvas_position((i, j)),
                        draw_on_init=False,
                    )
                )
            self.board.append(row)
        self.draw_board_border()
        for row_index, row in enumerate(self.board):
            for col_index, cell in enumerate(row):
                self.board[row_index][col_index].draw()

    def get_column(self, target_col_index: int):
        column = []
        for row_index, row in enumerate(self.board):
            for col_index, cell in enumerate(row):
                if target_col_index == col_index:
                    column.append(cell)
        return column

    def draw_board_border(self):
        borders = []
        first_row = self.board[0]
        borders.append(
            self.canvas.create_polygon(
                first_row[0].left1[0],
                first_row[0].left1[1],
                first_row[0].left1[0] - self.border_size,
                first_row[0].left1[1] - self.border_size,
                first_row[0].left1[0],
                first_row[0].left1[1] - 2 * self.border_size,
                first_row[-1].top[0] + 2 * self.border_size,
                first_row[-1].right1[1] - 2 * self.border_size,
                first_row[-1].top[0],
                (first_row[-1].right1[1] + first_row[-1].right2[1]) / 2,
                fill=PLAYER_ONE_COLOR,
            )
        )
        for i, cell in enumerate(first_row):
            borders.append(
                self.canvas.create_text(
                    cell.top[0] - 1.5 * self.border_size,
                    cell.top[1],
                    text=str(i) if self.cartesian_cords else chr(65 + i),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
        first_column = self.get_column(0)
        borders.append(
            self.canvas.create_polygon(
                first_column[0].left1[0],
                first_column[0].left1[1],
                first_column[0].left1[0] - self.border_size,
                first_column[0].left1[1] - self.border_size,
                first_column[0].left1[0] - 2 * self.border_size,
                first_column[0].left1[1],
                first_column[-1].bottom[0] - 2 * self.border_size,
                first_column[-1].left2[1] + 2 * self.border_size,
                first_column[-1].bottom[0],
                (first_column[-1].left2[1] + first_column[-1].left1[1]) / 2,
                fill=PLAYER_TWO_COLOR,
            )
        )
        for i, cell in enumerate(first_column):
            borders.append(
                self.canvas.create_text(
                    cell.left1[0] - self.border_size / 1.5,
                    (cell.left1[1] + cell.left2[1]) / 2,
                    text=str(i) if self.cartesian_cords else str(i + 1),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
        last_row = self.board[-1]
        borders.append(
            self.canvas.create_polygon(
                last_row[0].bottom[0],
                (last_row[0].right1[1] + last_row[0].right2[1]) / 2,
                last_row[0].bottom[0] - 2 * self.border_size,
                last_row[0].right2[1] + 2 * self.border_size,
                last_row[-1].right2[0],
                last_row[-1].right2[1] + 2 * self.border_size,
                last_row[-1].right2[0] + self.border_size,
                last_row[-1].right2[1] + self.border_size,
                last_row[-1].right2[0],
                last_row[-1].right2[1],
                fill=PLAYER_ONE_COLOR,
            )
        )
        for i, cell in enumerate(last_row):
            borders.append(
                self.canvas.create_text(
                    cell.bottom[0] + 1.2 * self.border_size,
                    cell.bottom[1],
                    text=str(i) if self.cartesian_cords else chr(65 + i),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
        last_column = self.get_column(self.board_size - 1)
        borders.append(
            # Bottom up
            self.canvas.create_polygon(
                last_column[-1].right2[0],
                last_column[-1].right2[1],
                last_column[-1].right2[0] + self.border_size,
                last_column[-1].right2[1] + self.border_size,
                last_column[-1].right2[0] + 2 * self.border_size,
                last_column[-1].right2[1],
                last_column[0].top[0] + 2 * self.border_size,
                last_column[0].right1[1] - 2 * self.border_size,
                last_column[0].top[0],
                (last_column[0].right1[1] + last_column[0].right2[1]) / 2,
                fill=PLAYER_TWO_COLOR,
            )
        )
        for i, cell in enumerate(last_column):
            borders.append(
                self.canvas.create_text(
                    cell.right1[0] + self.border_size / 1.5,
                    (cell.left1[1] + cell.left2[1]) / 2,
                    text=str(i) if self.cartesian_cords else str(i + 1),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )

        return borders

    def get_board_pos(self, pos: (int, int)):
        return self.board_size * pos[0] + pos[1]

    def get_cords(self, board_pos: int):
        return math.floor(board_pos / self.board_size), board_pos % self.board_size

    def draw(self):
        self.perform_action(self.actions.pop(0))
        if len(self.actions) > 0:
            self.master.after(self.frame_rate, self.draw)

    def perform_action(self, action: (int, int, int)):
        print(action)
        x_pos, y_pos, player = action
        self.player_pieces.append(
            Cell(self.canvas, self.board[x_pos][y_pos].top, player=player,)
        )
        self.state_manager.perform_action(f"{x_pos},{y_pos}:{player}")
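
A minimal usage sketch, assuming a 4x4 Hex board and a short scripted opening; the moves are purely illustrative and follow the "x,y:player" format parsed by preprocess_action:

visualizer = GameVisualizer(board_size=4, starting_player=1, frame_rate=500)
for move in ["0,0:1", "1,1:2", "0,1:1"]:
    visualizer.add_action(move)
# Replays the queued moves, then keeps accepting moves typed into the entry field
visualizer.run()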
Example #3
import matplotlib.pyplot as plt
import numpy as np
from prettytable import PrettyTable

# Project-level imports (ANET, StateManager) are assumed to come from the
# surrounding repository.


class TOPP:
    def __init__(self, path: str, verbose=False):

        self.models = ANET.load_models(path)
        self.state_manager = None
        self.board_size = ANET.infer_board_size_from_model(self.models[0].model)
        self.verbose = verbose

    def play(self, num_games_per_match):
        """
        Plays out the tournament where all models are played against each other
        :param num_games_per_match: number of games to be played internally for each match
        """
        # Each row counts how many wins model_{row_index} has against each model_{col_index};
        # hence each column counts how many losses model_{col_index} has against each model_{row_index}.
        score_matrix = np.zeros((len(self.models), len(self.models)), dtype=int)
        for index1, player1 in enumerate(self.models):
            for index2, player2 in enumerate(self.models[index1 + 1 :]):
                if self.verbose:
                    print(player1.episode_number)
                    print(player2.episode_number)
                wins_p1, wins_p2 = self.play_match(
                    num_games_per_match, player1, player2
                )
                score_matrix[index1, index2 + index1 + 1] += wins_p1
                score_matrix[index2 + index1 + 1, index1] += wins_p2
        self.display_result(score_matrix)

    def play_match(self, num_games_per_match, player1, player2):
        """
        Runs num_games_per_match games between player1 and player2 where the greedy action is chosen.
        Players start every other game.
        :param num_games_per_match: number of games to be played between two models
        :param player1: Keras NN trained on x number of episodes
        :param player2: Keras NN trained on y number of episodes
        :return: the number of wins for each player
        """
        wins_p1 = 0
        wins_p2 = 0
        starting_player = 1
        for i in range(0, num_games_per_match):
            self.state_manager = StateManager(
                board_size=self.board_size, starting_player=starting_player
            )
            while not self.state_manager.is_end_state():
                current_player = self.state_manager.current_player()
                model = player1 if current_player == 1 else player2
                state = self.state_manager.get_state()
                if self.verbose:
                    print(self.state_manager.pretty_state_string())
                distribution = model.predict(state)
                if self.verbose:
                    for k in range(0, self.board_size):
                        print(
                            [
                                distribution[j]
                                for j in range(
                                    self.board_size * k,
                                    self.board_size * k + self.board_size,
                                )
                            ]
                        )
                argmax_distribution_index = int(
                    np.argmax(distribution)
                )  # Greedy best from distribution
                action = self.state_manager.get_action_from_flattened_board_index(
                    argmax_distribution_index, state
                )
                self.state_manager.perform_action(action)
            if current_player == 1:
                wins_p1 += 1
            else:
                wins_p2 += 1
            starting_player = 1 if starting_player == 2 else 2

        return wins_p1, wins_p2

    def display_result(self, score_matrix):
        """
        Displays the score_matrix as a table
        :param score_matrix: np.array
        """
        header = ["wins \\ losses"]
        for model in self.models:
            header.append(model.episode_number)
        header.append("sum")
        t = PrettyTable(header)
        x_axis = []
        y_axis = []
        for index, row in enumerate(score_matrix):
            line = [self.models[index].episode_number]
            x_axis.append(self.models[index].episode_number)
            for cell in row:
                line.append(cell)
            line.append(sum(line[1:]))
            y_axis.append(sum(line[1:-1]))
            t.add_row(line)
        print(t)
        plt.clf()
        plt.plot(x_axis, y_axis)
        plt.title('TOPP')
        plt.ylabel('Number of games won')
        plt.xlabel('Episode saved')
        plt.show()
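
A minimal usage sketch; the model directory is hypothetical and must contain ANET models saved during training:

tournament = TOPP("trained_models/", verbose=False)  # hypothetical path
tournament.play(num_games_per_match=10)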
Example #4
import unittest

import networkx as nx

# The project-level StateManager import is assumed to come from the
# surrounding repository.


class TestBigStateManager(unittest.TestCase):
    def setUp(self) -> None:
        self.state_manager = StateManager(8, 1)
        self.valid_action = "0,0:1"

    def test_build_board(self):
        # Check board shape
        self.assertSequenceEqual(
            self.state_manager.board.shape,
            (self.state_manager.board_size, self.state_manager.board_size),
        )

    def test_check_and_extract_action_string(self):
        self.assertEqual(
            self.state_manager.check_and_extract_action_string(
                self.valid_action),
            (0, 0, 1),
        )

    def test_perform_action(self):
        # Test update of board and state
        # Expected state: "1" in the first cell, 63 zeros, then ":2" (next player)
        correct_board_string = "1" + (
            ":" + str(2)).zfill(self.state_manager.board_size**2 + 1)
        self.state_manager.perform_action(self.valid_action)
        self.assertEqual(self.state_manager.board[0, 0], 1)
        self.assertEqual(self.state_manager.get_state(), correct_board_string)
        # Do some moves
        self.state_manager.perform_action("3,3:2")
        self.state_manager.perform_action("2,3:1")
        self.state_manager.perform_action("3,2:2")
        # Check that the moves are in the graphs
        self.assertSequenceEqual(list(self.state_manager.P2graph.nodes),
                                 ["3,3:2", "3,2:2"])
        self.assertSequenceEqual(list(self.state_manager.P1graph.nodes),
                                 ["0,0:1", "2,3:1"])
        # Check that there is an edge between the two adjacent pieces
        self.assertTrue(self.state_manager.P2graph.has_edge("3,3:2", "3,2:2"))
        # There should not be an edge between the nodes of the other player
        self.assertFalse(self.state_manager.P1graph.has_edge("0,0:1", "2,3:1"))

    def test_generate_possible_actions(self):
        # Create a list of tuples of a state and its corresponding number of possible actions
        state_list = [
            (
                self.state_manager.get_state(),
                self.state_manager.board_size**2,
            ),
            ("1" * 60 + "0" * 4 + ":1", 4),
        ]
        for state_tuple in state_list:
            self.assertEqual(
                len(
                    self.state_manager.generate_possible_actions(
                        state_tuple[0])),
                state_tuple[1],
            )
        # checking if the actions are correct
        self.assertEqual(
            self.state_manager.generate_possible_actions(
                ("10" + "1" * 62 + ":2")),
            ["0,1:2"],
        )
        self.assertSequenceEqual(
            self.state_manager.generate_possible_actions(("1" * 61 + "001:1")),
            ["7,5:1", "7,6:1"],
        )

    def test_generate_child_states(self):
        initial_state = self.state_manager.get_state()
        # checking number of child states
        self.assertEqual(
            len(self.state_manager.generate_child_states(initial_state)),
            self.state_manager.board_size**2,
        )
        self.assertEqual(
            len(self.state_manager.generate_child_states("2" * 63 + "0:1")), 1)
        # Check that the player switches
        self.assertEqual(
            self.state_manager.generate_child_states("2" * 63 + "0:1")[0][-1],
            "2")
        self.assertEqual(
            self.state_manager.generate_child_states("2" * 63 + "0:2")[0][-1],
            "1")
        # Checking that output is correct
        self.assertSequenceEqual(
            self.state_manager.generate_child_states("2" * 63 + "0:1"),
            ["2" * 63 + "1:2"],
        )
        self.assertSequenceEqual(
            self.state_manager.generate_child_states("2" * 30 + "00" +
                                                     "1" * 32 + ":2"),
            [
                "2" * 30 + "20" + "1" * 32 + ":1",
                "2" * 30 + "02" + "1" * 32 + ":1"
            ],
        )

    def test_set_state_manager(self):
        initial_state = (
            "2112102122011010211221002101022212201222022111011220021110221001:1"
        )
        self.state_manager.set_state_manager(initial_state)
        path2 = [
            "0,0:2",
            "1,0:2",
            "2,0:2",
            "3,0:2",
        ]
        path1 = [
            "0,4:1",
            "1,3:1",
            "2,2:1",
            "3,1:1",
            "4,0:1",
        ]

        # Check some of the paths in the state
        def check_path(path, correct_graph, invalid_graph):
            for i in range(len(path) - 1):
                self.assertTrue(correct_graph.has_edge(path[i], path[i + 1]))
                self.assertFalse(invalid_graph.has_edge(path[i], path[i + 1]))
            self.assertTrue(nx.has_path(correct_graph, path[0], path[-1]))

        check_path(path2, self.state_manager.P2graph,
                   self.state_manager.P1graph)
        check_path(path1, self.state_manager.P1graph,
                   self.state_manager.P2graph)

    def test_is_end_state(self):
        end_state_p1 = (
            "2112102122011010211221002101022212201222122111011220021110221001:2"
        )
        self.state_manager.set_state_manager(end_state_p1)
        self.assertTrue(self.state_manager.is_end_state())

    def test_convert_flattened_index_to_cords(self):
        # index 8 should be second row first col
        self.assertSequenceEqual(
            self.state_manager.convert_flattened_index_to_cords(8), (1, 0))
        self.assertSequenceEqual(
            self.state_manager.convert_flattened_index_to_cords(18), (2, 2))
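
The coordinate tests above assume row-major flattening of the 8x8 board; a standalone sketch of that mapping (the helper name is illustrative, not part of StateManager):

BOARD_SIZE = 8

def flattened_index_to_cords(index: int) -> tuple:
    # Row-major: consecutive indices walk along a row before wrapping to the next one
    return index // BOARD_SIZE, index % BOARD_SIZE

assert flattened_index_to_cords(8) == (1, 0)   # second row, first column
assert flattened_index_to_cords(18) == (2, 2)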
Example #5
import math
import random

import numpy as np

# Project-level imports (StateManager, StateTree) are assumed to come from the
# surrounding repository.


class MCTS:
    def __init__(
        self,
        state_manager: StateManager,
        actor_net,
        max_tree_height=5,
        c=1,
        number_of_simulations=10,
        verbose=False,
        random_simulation_rate=0.2,
    ):
        self.state_manager = StateManager(state_manager.board_size,
                                          state_manager.current_player())
        self.tree = StateTree(self.state_manager.get_state())
        self.tree.add_state_node(self.tree.root_state,
                                 self.state_manager.is_end_state())
        self.c = c
        self.max_tree_height = max_tree_height
        self.actor_net = actor_net
        self.number_of_simulations = number_of_simulations
        self.verbose = verbose
        self.random_simulation_rate = random_simulation_rate

    def run(self, root_state: str, progress: float):
        """
        Main method: runs the Monte Carlo tree search algorithm (tree traversal -> rollout -> backprop) number_of_simulations times,
        then chooses a move from the resulting visit distribution of the root state.
        :param root_state: state to run the algorithm from -> root node
        :param progress: training progress in [0, 1]; higher values make the greedy choice more likely
        :return: the chosen action from the root node of the current tree
        """
        self.tree.cut_tree_with_new_root_node(root_state)
        self.state_manager.set_state_manager(self.tree.root_state)
        for i in range(self.number_of_simulations):
            rollout_state = self.traverse_tree(self.tree.root_state, depth=0)
            simulation_reward = self.simulate(rollout_state)
            self.backpropagate(rollout_state, simulation_reward)
            self.state_manager.set_state_manager(self.tree.root_state)

        distribution = self.get_distribution(self.tree.root_state)
        self.actor_net.add_case(self.tree.root_state, distribution.copy())
        if random.random() > math.tanh(progress):
            chosen_action = self.choose_action_stochastically(
                np.array(distribution), self.tree.root_state)
        else:
            chosen_action = self.epsilon_greedy_action_from_distribution(
                np.array(distribution), self.tree.root_state, epsilon=0.0)
        if self.verbose:
            print("distribution", distribution)
            print("chosen_action", chosen_action)
        return chosen_action

    # MAIN ALGORITHM METHODS
    def traverse_tree(self, state: str, depth: int) -> str:
        """
        Traverses the tree, expanding nodes by using the tree policy (tree_policy)
        :param state: current state
        :param depth: current depth of the tree
        :return: the chosen state to simulate from
        """
        if depth == self.max_tree_height or self.tree.is_end_state(state):
            return state
        # If the current state has not explored its children yet: add all to the graph and choose one to simulate from
        elif not self.tree.get_outgoing_edges(state):
            children = self.expand(state)
            return self.choose_random_child(state, children)
        else:
            child = self.tree_policy(state)
            self.state_manager.check_difference_and_perform_action(child)
            if self.tree.get_state_number_of_visits(child) == 0:
                self.tree.set_end_state(child,
                                        self.state_manager.is_end_state())
            return self.traverse_tree(child, depth + 1)

    def expand(self, state) -> [str]:
        """
        Expanding all child nodes from the input state and adding them to the graph.
        :param state: state to find all children from
        :return: list of all child states
        """
        children = StateManager.generate_child_states(state)
        for child in children:
            if child not in self.tree.get_nodes():
                self.tree.add_state_node(child)
            self.tree.add_edge(state, child)
        return children

    def simulate(self, state: str):
        """
        Performs one roll-out using the actor net as policy
        :return: 1 if the simulation ends with player 1 winning, -1 otherwise
        """
        if self.state_manager.get_state() != state:
            raise ValueError(
                "The state manager is not set to the start of the simulation")
        while not self.state_manager.is_end_state():
            if random.random() < self.random_simulation_rate:
                distribution = self.actor_net.predict(
                    self.state_manager.get_state())
                chosen_action = self.epsilon_greedy_action_from_distribution(
                    distribution, self.state_manager.get_state(), epsilon=0.0)
            else:
                chosen_action = random.choice(
                    self.state_manager.generate_possible_actions(
                        self.state_manager.get_state()))
            self.state_manager.perform_action(chosen_action)
        return MCTS.get_end_state_reward(self.state_manager.current_player())

    def backpropagate(self, state: str, simulation_reward: int):
        """
        Starts at the rollout start state and walks up the tree, updating each node's SAP value and number of visits
        :param state: rollout start state
        :param simulation_reward: reward from simulation
        """
        if state == self.tree.root_state:
            self.tree.increment_state_number_of_visits(state)
            return
        parent_state = self.tree.get_parent(state)

        self.tree.increment_state_number_of_visits(state)
        self.tree.increment_edge_number_of_visits(parent_state, state)
        edge_times_enc = self.tree.get_edge_number_of_visits(
            parent_state, state)
        edge_sap_value = self.tree.get_sap_value(parent_state, state)
        new_sap_value = (self.tree.get_sap_value(parent_state, state) +
                         (simulation_reward - edge_sap_value) / edge_times_enc)
        self.tree.set_sap_value(parent_state, state, new_sap_value)
        self.tree.set_active_edge(parent_state, state, False)

        self.backpropagate(parent_state, simulation_reward)

    # HELPER METHODS

    def tree_policy(self, state: str) -> str:
        """
        Using the UCT score to determine the child state of an input state
        :param state: input state
        :return: child state
        """
        state_number_of_visits = self.tree.get_state_number_of_visits(state)
        if self.state_manager.get_player(state) == 1:
            best_edge = self.tree.get_outgoing_edges(
                state,
                sort_by_function=lambda edge: self.compute_uct(
                    self.tree.get_sap_value(*edge),
                    state_number_of_visits,
                    self.tree.get_edge_number_of_visits(*edge),
                    True,
                ),
            )[0]
        else:
            best_edge = self.tree.get_outgoing_edges(
                state,
                sort_by_function=lambda edge: self.compute_uct(
                    self.tree.get_sap_value(*edge),
                    state_number_of_visits,
                    self.tree.get_edge_number_of_visits(*edge),
                    False,
                ),
            )[-1]
        parent, best_child = best_edge
        self.tree.set_active_edge(parent, best_child, True)
        return best_child

    def compute_uct(
        self,
        sap_value: float,
        number_of_visits_node: int,
        number_of_visits_edge: int,
        maximizing_player: bool,
    ) -> float:
        """
        Computes the UCT value for the tree policy
        :param sap_value: sap value for the edge
        :param number_of_visits_node: number of visits for the parent state
        :param number_of_visits_edge: number of visits for the edge between the two nodes
        :param maximizing_player: if the current player is the maximizing player
        :return: uct value
        """
        uct = sap_value
        usa_term = self.c * math.sqrt(
            math.log(number_of_visits_node) / (1 + number_of_visits_edge))
        if maximizing_player:
            uct += usa_term
        else:
            uct -= usa_term
        return uct

    def greedy_best_action(self, state: str) -> str:
        sorted_list = self.tree.get_outgoing_edges(
            state,
            sort_by_function=lambda edge: self.tree.get_edge_number_of_visits(
                *edge),
        )
        return self.state_manager.get_action(*sorted_list[0])

    def choose_random_child(self, parent_state: str, child_list: [str]) -> str:
        """
        Helper method choosing a random state from the child list, updating state manager
        and adding edge and node parameters
        :param parent_state: parent state for the child list (to set edge parameters)
        :param child_list: list of children from parent state
        :return: chosen child
        """
        child = random.choice(child_list)
        self.state_manager.check_difference_and_perform_action(child)
        self.tree.set_end_state(child, self.state_manager.is_end_state())
        self.tree.set_active_edge(parent_state, child, True)
        return child

    def epsilon_greedy_action_from_distribution(self,
                                                distribution: np.ndarray,
                                                state: str,
                                                epsilon=0.2):
        """
        Chooses an epsilon-greedy index from the distribution and converts that index to an action
        :param distribution: distribution from number of simulations per node
        :param state: current state to calculate action
        :param epsilon: the epsilon value to be used
        :return: action string
        """
        if random.random() > epsilon:
            chosen_index = int(np.argmax(distribution))
        else:
            # Choose random state from those with positive probability
            # prob == 0 might be occupied cells on the board
            if not [
                    i[0]
                    for i, prob in np.ndenumerate(distribution) if prob > 0
            ]:
                chosen_index = int(np.argmax(distribution))
            else:
                chosen_index = random.choice([
                    i[0] for i, prob in np.ndenumerate(distribution)
                    if prob > 0
                ])
        return self.state_manager.get_action_from_flattened_board_index(
            chosen_index, state)

    @staticmethod
    def get_end_state_reward(current_player: int) -> int:
        """
        We have chosen player 1 to be "us", giving a positive reward if player 1 wins.
        :param current_player: current player for the state manager
        :return: reward for end state
        """
        return -1 if current_player == 1 else 1

    def get_distribution(self, state: str):
        """
        Returns the distribution of total visits for child nodes of input state
        :param state: state to get distribution from
        :return: a normalized list of length equal to the total number of positions on the board
        """

        parent_board, parent_player = StateManager.extract_state(state)
        child_states = self.tree.get_child_states(state)
        change_indices_dict = {}
        total_visits = 0
        for child in child_states:
            child_board, child_player = StateManager.extract_state(child)
            for i in range(len(child_board)):
                if parent_board[i] != child_board[i]:
                    child_number_of_visits = self.tree.get_edge_number_of_visits(
                        state, child)
                    change_indices_dict[i] = child_number_of_visits
                    total_visits += child_number_of_visits
                    break

        return [
            change_indices_dict[index] /
            total_visits if index in change_indices_dict else 0
            for index in range(self.state_manager.board_size**2)
        ]

    def set_random_simulation_rate(self, new_rate: float):
        self.random_simulation_rate = new_rate

    def choose_action_stochastically(self, distribution, state):
        chosen_index = np.random.choice([i for i in range(len(distribution))],
                                        p=distribution)
        return self.state_manager.get_action_from_flattened_board_index(
            chosen_index, state)
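
A minimal sketch of selecting and playing one move with the class above, assuming the repository's StateManager and ANET; the board size, simulation count and progress value are illustrative:

actor_net = ANET(4)  # ANET(k) as constructed in GameSimulator (Example #6)
state_manager = StateManager(board_size=4, starting_player=1)
mcts = MCTS(state_manager, actor_net, number_of_simulations=50)
best_action = mcts.run(state_manager.get_state(), progress=0.5)
state_manager.perform_action(best_action)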
Example #6
import math
import os

import matplotlib.pyplot as plt
import numpy as np
from prettytable import PrettyTable

# Project-level imports (ANET, MCTS, StateManager, StartingPlayerOptions,
# Timer, print_loader) are assumed to come from the surrounding repository.


class GameSimulator:
    def __init__(
        self,
        g,
        p,
        verbose,
        k,
        print_parameters=False,
        save_interval=10,
        actor_net_parameters=None,
        mcts_parameters=None,
    ):
        self.number_of_episodes_to_play = g
        self.starting_player_option = p
        self.k = k
        self.verbose = verbose
        self.state_manager = None
        self.current_state = None
        self.winner_stats = np.zeros((2, 2))
        self.mcts_parameters = mcts_parameters if mcts_parameters else {}
        if actor_net_parameters:
            self.actor_net_parameters = actor_net_parameters
            self.actor_network = ANET(k, **actor_net_parameters)
        else:
            self.actor_network = ANET(k)
        self.save_interval = save_interval
        if print_parameters:
            self.print_all_parameters()

    def print_all_parameters(self):
        print("===================================")
        print("            PARAMETERS             ")
        print("===================================")
        print("number of games in a batch:", self.number_of_episodes_to_play)
        print("starting-player option:", self.starting_player_option)
        print("Verbose:", self.verbose)
        print("k:", self.k)
        print("save interval:", self.save_interval)
        print("===================================")
        self.print_parameters(self.actor_net_parameters,
                              "          ANET-PARAMETERS          ")
        self.print_parameters(self.mcts_parameters,
                              "          MCTS-PARAMETERS          ")

    @staticmethod
    def print_parameters(parameters, header):
        if parameters:
            print(header)
            print("===================================")
            print("".join(
                [f"{key}: {parameters[key]} \n" for key in parameters.keys()]))
            print("===================================")

    def print_start_state(self, i, timer):
        if self.verbose:
            print(f"--- Starting game {i} ---")
            print(f"Start state: {self.state_manager.pretty_state_string()}")
        else:
            print_loader(
                i,
                self.number_of_episodes_to_play,
                10,
                timer,
                self.number_of_episodes_to_play,
            )

    def print_action(self, action: str):
        if self.verbose:
            x_pos, y_pos, player = self.state_manager.check_and_extract_action_string(
                action, check_player_turn=False)
            print(f"Player {player} placed a piece at ({x_pos}, {y_pos})"
                  f" : {self.state_manager.pretty_state_string()}")

    def print_winner_of_batch_game(self):
        if self.verbose:
            print(
                f"Player {2 if self.state_manager.current_player() == 1 else 1} wins the game"
            )

    def print_run_summary(self):
        print("\n------------- SUMMARY -------------")
        header = ["winning player \\ starting player", "1", "2"]
        t = PrettyTable(header)
        for index, row in enumerate(self.winner_stats):
            line = [str(index + 1)]
            for cell in row:
                line.append(cell)
            t.add_row(line)
        print(t)

    def save_loss_graph(self, loss, val_loss, id):
        loss = np.array(loss)
        val_loss = np.array(val_loss)
        plt.clf()
        fig, ax1 = plt.subplots()
        ax1.set_xlabel("Games")
        ax1.set_ylabel("Loss")
        ax1.plot(loss, label="Train")
        ax1.plot(val_loss, label="Test")
        ax1.legend(loc="upper right")
        ax1.set_title("Model loss")

        ax2 = ax1.twinx()
        color = "tab:green"
        ax2.set_ylabel("Delta train test", color=color, alpha=0.5)
        ax2.plot(np.abs(loss - val_loss), color=color, alpha=0.5)
        fig.tight_layout()
        if not os.path.exists("loss_graphs"):
            os.mkdir("loss_graphs")
        plt.savefig(f"loss_graphs/{id}.png")

    def update_winner_stats(self, starting_player: int) -> None:
        second_index = starting_player - 1
        winning_player = 1 if self.state_manager.current_player() == 2 else 2
        first_index = winning_player - 1
        self.winner_stats[first_index][second_index] += 1

    def run(self):
        starting_player = StartingPlayerOptions.get_starting_player(
            self.starting_player_option)
        self.actor_network.save_model(episode_number=0)
        loss = []
        val_loss = []
        timer = Timer()
        for i in range(1, self.number_of_episodes_to_play + 1):
            self.state_manager = StateManager(self.k, starting_player)
            self.print_start_state(i, timer)
            timer.start()
            mcts = MCTS(
                self.state_manager,
                self.actor_network,
                random_simulation_rate=math.tanh(
                    i / self.number_of_episodes_to_play) * 1.2,
                **self.mcts_parameters,
            )
            while not self.state_manager.is_end_state():
                action = mcts.run(self.state_manager.get_state(),
                                  i / self.number_of_episodes_to_play)
                self.state_manager.perform_action(action)
                self.print_action(action)
            self.update_winner_stats(starting_player)
            self.print_winner_of_batch_game()
            history = self.actor_network.train()
            loss.append(np.average(history.history["loss"]))
            val_loss.append(np.average(history.history["val_loss"]))
            if self.starting_player_option == StartingPlayerOptions.ALTERNATING:
                starting_player = StateManager.get_opposite_player(
                    starting_player)
            if i % self.save_interval == 0:
                self.save_loss_graph(loss, val_loss, i)
                self.actor_network.save_model(episode_number=i)
            timer.stop()
            if i % 50 == 0:
                self.actor_network.save_buffer_to_file(
                    i, self.k, self.mcts_parameters["number_of_simulations"])
        self.print_run_summary()
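
A minimal entry-point sketch for a training run; the numbers are illustrative, and StartingPlayerOptions.ALTERNATING is the only option value referenced in run() above:

simulator = GameSimulator(
    g=200,                                   # number of episodes
    p=StartingPlayerOptions.ALTERNATING,
    verbose=False,
    k=4,                                     # Hex board size
    print_parameters=True,
    mcts_parameters={"number_of_simulations": 100},
)
simulator.run()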