class TestMCTS(unittest.TestCase):
    def setUp(self) -> None:
        self.state_manager = StateManager(TestConstants.K,
                                          TestConstants.STARTING_PLAYER)
        self.a_net = MockActorNet()
        self.mcts = MCTS(self.state_manager, self.a_net)
        init_state = self.state_manager.get_state()
        self.changed_index = 3
        self.visited_state = f"{init_state[:self.changed_index]}{TestConstants.STARTING_PLAYER}" \
                             f"{init_state[self.changed_index+1:-1]}" \
                             f"{StateManager.get_opposite_player(TestConstants.STARTING_PLAYER)}"
        self.root_child_states = self.state_manager.generate_child_states(
            init_state)
        # Building first layer of tree
        for child in self.root_child_states:
            self.mcts.tree.add_state_node(child)
            self.mcts.tree.add_edge(init_state, child)

    def test_get_distribution(self):
        self.mcts.tree.increment_state_number_of_visits(self.visited_state)
        distribution = self.mcts.get_distribution(
            self.state_manager.get_state())
        # If there is only one visited state it should have the whole distribution
        self.assertEqual(distribution[self.changed_index], 1)
        # Adding more runs will change the distribution
        second_changed_index = 8
        for i in range(3):
            self.mcts.tree.increment_state_number_of_visits(
                self.root_child_states[second_changed_index])
        distribution = self.mcts.get_distribution(
            self.state_manager.get_state())
        self.assertEqual(distribution[self.changed_index], 0.25)
        self.assertEqual(distribution[second_changed_index], 0.75)
class TestSmallStateManager(unittest.TestCase):
    def setUp(self) -> None:
        self.state_manager = StateManager(3, 1)
        self.valid_action = "0,0:1"

    def test_is_end_state(self):
        end_state_p1 = "121000221:2"
        self.state_manager.set_state_manager(end_state_p1)
        self.assertFalse(self.state_manager.is_end_state())
def print_graph(self, state_manager):
    """
    Print the DiGraph object representing the current tree
    """
    pos = nx.shell_layout(self.graph)
    blue_player_nodes = []
    red_player_nodes = []
    labels = {}
    state_manager = StateManager(state_manager.board_size,
                                 state_manager.current_player())
    for state in self.graph.nodes:
        state_manager.set_state_manager(state)
        labels[state] = state_manager.pretty_state_string()
        if StateManager.get_player(state) == 1:
            blue_player_nodes.append(state)
        else:
            red_player_nodes.append(state)
    nx.draw_networkx_nodes(
        self.graph,
        pos,
        nodelist=blue_player_nodes,
        node_color=TreeConstants.PLAYER1_COLOR,
        alpha=0.5,
    )
    nx.draw_networkx_nodes(
        self.graph,
        pos,
        nodelist=red_player_nodes,
        node_color=TreeConstants.PLAYER2_COLOR,
        alpha=0.5,
    )
    nx.draw_networkx_edges(self.graph, pos)
    nx.draw_networkx_labels(self.graph, pos, labels, font_size=10)
    plt.show()
def handle_get_action(self, state):
    """
    Uses the neural net trained with MCTS to select a move for our actor on the
    current board, seen from the player number given in the state.
    :param state: the current board in the form (1 or 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
                  where 1 or 2 indicates the number of the current player. If you are
                  player 2 in the current series, for example, then you will see a 2
                  here throughout the entire series, whereas player 1 will see a 1.
    :return: our actor's selected action as a tuple (row, column)
    """
    board = ''.join([str(cell) for cell in state[1:]])
    local_state_rep = f"{board}:{state[0]}"
    state_manager = StateManager(int(math.sqrt(len(board))), state[0])
    distribution = self.model.predict(local_state_rep)
    chosen_index = int(np.argmax(distribution))
    action_string = state_manager.get_action_from_flattened_board_index(
        chosen_index, local_state_rep)
    return state_manager.check_and_extract_action_string(
        action_string, check_player_turn=False)[:-1]
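# Illustrative sketch (not part of the original code): how the server's board tuple is
# mapped to the local state-string representation used by StateManager in
# handle_get_action above. The sample tuple is hypothetical (a 4x4 board, player 1 to move).
def _example_state_conversion():
    sample_server_state = (1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0)
    # Board cells come first, the player to move goes after the colon
    board = ''.join(str(cell) for cell in sample_server_state[1:])
    local_state_rep = f"{board}:{sample_server_state[0]}"
    return local_state_rep  # -> "0000200010000000:1"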
class TOPP:
    def __init__(self, path: str, verbose=False):
        self.models = ANET.load_models(path)
        self.state_manager = None
        self.board_size = ANET.infer_board_size_from_model(self.models[0].model)
        self.verbose = verbose

    def play(self, num_games_per_match):
        """
        Plays out the tournament where all models are played against each other
        :param num_games_per_match: number of games to be played internally for each match
        """
        # Each row holds how many wins model_{row_index} has against each model_{col_index}.
        # Hence each column holds how many losses model_{col_index} has against each model_{row_index}
        score_matrix = np.zeros((len(self.models), len(self.models)), dtype=int)
        for index1, player1 in enumerate(self.models):
            for index2, player2 in enumerate(self.models[index1 + 1:]):
                if self.verbose:
                    print(player1.episode_number)
                    print(player2.episode_number)
                wins_p1, wins_p2 = self.play_match(
                    num_games_per_match, player1, player2
                )
                score_matrix[index1, index2 + index1 + 1] += wins_p1
                score_matrix[index2 + index1 + 1, index1] += wins_p2
        self.display_result(score_matrix)

    def play_match(self, num_games_per_match, player1, player2):
        """
        Runs num_games_per_match games between player1 and player2 where the greedy
        action is chosen. Players start every other game.
        :param num_games_per_match: number of games to be played between two models
        :param player1: Keras NN trained on x number of episodes
        :param player2: Keras NN trained on y number of episodes
        :return: the number of wins for each player
        """
        wins_p1 = 0
        wins_p2 = 0
        starting_player = 1
        for i in range(0, num_games_per_match):
            self.state_manager = StateManager(
                board_size=self.board_size, starting_player=starting_player
            )
            while not self.state_manager.is_end_state():
                current_player = self.state_manager.current_player()
                model = player1 if current_player == 1 else player2
                state = self.state_manager.get_state()
                if self.verbose:
                    print(self.state_manager.pretty_state_string())
                distribution = model.predict(state)
                if self.verbose:
                    for k in range(0, self.board_size):
                        print(
                            [
                                distribution[j]
                                for j in range(
                                    self.board_size * k,
                                    self.board_size * k + self.board_size,
                                )
                            ]
                        )
                argmax_distribution_index = int(
                    np.argmax(distribution)
                )  # Greedy best from distribution
                action = self.state_manager.get_action_from_flattened_board_index(
                    argmax_distribution_index, state
                )
                self.state_manager.perform_action(action)
            if current_player == 1:
                wins_p1 += 1
            else:
                wins_p2 += 1
            starting_player = 1 if starting_player == 2 else 2
        return wins_p1, wins_p2

    def display_result(self, score_matrix):
        """
        Displays the score_matrix as a table
        :param score_matrix: np.array
        """
        header = ["wins \\ losses"]
        for model in self.models:
            header.append(model.episode_number)
        header.append("sum")
        t = PrettyTable(header)
        x_axis = []
        y_axis = []
        for index, row in enumerate(score_matrix):
            line = [self.models[index].episode_number]
            x_axis.append(self.models[index].episode_number)
            for cell in row:
                line.append(cell)
            line.append(sum(line[1:]))
            y_axis.append(sum(line[1:-1]))
            t.add_row(line)
        print(t)
        plt.clf()
        plt.plot(x_axis, y_axis)
        plt.title('TOPP')
        plt.ylabel('Number of games won')
        plt.xlabel('Episode saved')
        plt.show()
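# Minimal usage sketch for the tournament class above. The model directory
# "trained_models/" is a hypothetical path; ANET.load_models must be able to read
# the saved models from it, as assumed in __init__.
if __name__ == "__main__":
    topp = TOPP("trained_models/", verbose=False)
    # Prints the score table and plots number of games won per saved episode
    topp.play(num_games_per_match=20)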
class TestBigStateManager(unittest.TestCase):
    def setUp(self) -> None:
        self.state_manager = StateManager(8, 1)
        self.valid_action = "0,0:1"

    def test_build_board(self):
        # Check board shape
        self.assertSequenceEqual(
            self.state_manager.board.shape,
            (self.state_manager.board_size, self.state_manager.board_size),
        )

    def test_check_and_extract_action_string(self):
        self.assertEqual(
            self.state_manager.check_and_extract_action_string(
                self.valid_action),
            (0, 0, 1),
        )

    def test_perform_action(self):
        # Test update of board and state
        correct_board_string = "1" + (":" + str(2)).zfill(
            self.state_manager.board_size**2 + 1)
        self.state_manager.perform_action(self.valid_action)
        self.assertEqual(self.state_manager.board[0, 0], 1)
        self.assertEqual(self.state_manager.get_state(), correct_board_string)
        # Do some moves
        self.state_manager.perform_action("3,3:2")
        self.state_manager.perform_action("2,3:1")
        self.state_manager.perform_action("3,2:2")
        # Check that the moves are in the graphs
        self.assertSequenceEqual(list(self.state_manager.P2graph.nodes),
                                 ["3,3:2", "3,2:2"])
        self.assertSequenceEqual(list(self.state_manager.P1graph.nodes),
                                 ["0,0:1", "2,3:1"])
        # Check that there is an edge between the two adjacent pieces
        self.assertTrue(self.state_manager.P2graph.has_edge("3,3:2", "3,2:2"))
        # There should not be an edge between the nodes of the other player
        self.assertFalse(self.state_manager.P1graph.has_edge("0,0:1", "2,3:1"))

    def test_generate_possible_actions(self):
        # Creating a list of tuples with a state and its corresponding number of possible actions
        state_list = [
            (
                self.state_manager.get_state(),
                self.state_manager.board_size**2,
            ),
            ("1" * 60 + "0" * 4 + ":1", 4),
        ]
        for state_tuple in state_list:
            self.assertEqual(
                len(
                    self.state_manager.generate_possible_actions(
                        state_tuple[0])),
                state_tuple[1],
            )
        # Checking if the actions are correct
        self.assertEqual(
            self.state_manager.generate_possible_actions(
                "10" + "1" * 62 + ":2"),
            ["0,1:2"],
        )
        self.assertSequenceEqual(
            self.state_manager.generate_possible_actions("1" * 61 + "001:1"),
            ["7,5:1", "7,6:1"],
        )

    def test_generate_child_states(self):
        initial_state = self.state_manager.get_state()
        # Checking number of child states
        self.assertEqual(
            len(self.state_manager.generate_child_states(initial_state)),
            self.state_manager.board_size**2,
        )
        self.assertEqual(
            len(self.state_manager.generate_child_states("2" * 63 + "0:1")), 1)
        # Check that the player switches
        self.assertEqual(
            self.state_manager.generate_child_states("2" * 63 + "0:1")[0][-1],
            "2")
        self.assertEqual(
            self.state_manager.generate_child_states("2" * 63 + "0:2")[0][-1],
            "1")
        # Checking that output is correct
        self.assertSequenceEqual(
            self.state_manager.generate_child_states("2" * 63 + "0:1"),
            ["2" * 63 + "1:2"],
        )
        self.assertSequenceEqual(
            self.state_manager.generate_child_states("2" * 30 + "00" +
                                                     "1" * 32 + ":2"),
            [
                "2" * 30 + "20" + "1" * 32 + ":1",
                "2" * 30 + "02" + "1" * 32 + ":1"
            ],
        )

    def test_set_state_manager(self):
        initial_state = (
            "2112102122011010211221002101022212201222022111011220021110221001:1"
        )
        self.state_manager.set_state_manager(initial_state)
        path2 = [
            "0,0:2",
            "1,0:2",
            "2,0:2",
            "3,0:2",
        ]
        path1 = [
            "0,4:1",
            "1,3:1",
            "2,2:1",
            "3,1:1",
            "4,0:1",
        ]

        # Check some of the paths in the state
        def check_path(path, correct_graph, invalid_graph):
            for i in range(len(path) - 1):
                self.assertTrue(correct_graph.has_edge(path[i], path[i + 1]))
                self.assertFalse(invalid_graph.has_edge(path[i], path[i + 1]))
            self.assertTrue(nx.has_path(correct_graph, path[0], path[-1]))

        check_path(path2, self.state_manager.P2graph,
                   self.state_manager.P1graph)
        check_path(path1, self.state_manager.P1graph,
                   self.state_manager.P2graph)

    def test_is_end_state(self):
        end_state_p1 = (
            "2112102122011010211221002101022212201222122111011220021110221001:2"
        )
        self.state_manager.set_state_manager(end_state_p1)
        self.assertTrue(self.state_manager.is_end_state())

    def test_convert_flattened_index_to_cords(self):
        # Index 8 should be second row, first col
        self.assertSequenceEqual(
            self.state_manager.convert_flattened_index_to_cords(8), (1, 0))
        self.assertSequenceEqual(
            self.state_manager.convert_flattened_index_to_cords(18), (2, 2))
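# Added entry point (sketch) so the state-manager test cases above can be run directly,
# e.g. with `python -m unittest` or by executing this file.
if __name__ == "__main__":
    unittest.main()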
class GameSimulator:
    def __init__(
        self,
        g,
        p,
        verbose,
        k,
        print_parameters=False,
        save_interval=10,
        actor_net_parameters=None,
        mcts_parameters=None,
    ):
        self.number_of_episodes_to_play = g
        self.starting_player_option = p
        self.k = k
        self.verbose = verbose
        self.state_manager = None
        self.current_state = None
        self.winner_stats = np.zeros((2, 2))
        self.mcts_parameters = mcts_parameters if mcts_parameters else {}
        if actor_net_parameters:
            self.actor_net_parameters = actor_net_parameters
            self.actor_network = ANET(k, **actor_net_parameters)
        else:
            self.actor_network = ANET(k)
        self.save_interval = save_interval
        if print_parameters:
            self.print_all_parameters()

    def print_all_parameters(self):
        print("===================================")
        print(" PARAMETERS ")
        print("===================================")
        print("number of games in a batch:", self.number_of_episodes_to_play)
        print("starting-player option:", self.starting_player_option)
        print("Verbose:", self.verbose)
        print("k:", self.k)
        print("save interval:", self.save_interval)
        print("===================================")
        self.print_parameters(self.actor_net_parameters, " ANET-PARAMETERS ")
        self.print_parameters(self.mcts_parameters, " MCTS-PARAMETERS ")

    @staticmethod
    def print_parameters(parameters, header):
        if parameters:
            print(header)
            print("===================================")
            print("".join(
                [f"{key}: {parameters[key]} \n" for key in parameters.keys()]))
            print("===================================")

    def print_start_state(self, i, timer):
        if self.verbose:
            print(f"--- Starting game {i} ---")
            print(f"Start state: {self.state_manager.pretty_state_string()}")
        else:
            print_loader(
                i,
                self.number_of_episodes_to_play,
                10,
                timer,
                self.number_of_episodes_to_play,
            )

    def print_action(self, action: str):
        if self.verbose:
            x_pos, y_pos, player = self.state_manager.check_and_extract_action_string(
                action, check_player_turn=False)
            print(f"Player {player} placed a piece at ({x_pos}, {y_pos})"
                  f" : {self.state_manager.pretty_state_string()}")

    def print_winner_of_batch_game(self):
        if self.verbose:
            print(
                f"Player {2 if self.state_manager.current_player() == 1 else 1} wins the game"
            )

    def print_run_summary(self):
        print("\n------------- SUMMARY -------------")
        header = ["winning player \\ starting player", "1", "2"]
        t = PrettyTable(header)
        for index, row in enumerate(self.winner_stats):
            line = [str(index + 1)]
            for cell in row:
                line.append(cell)
            t.add_row(line)
        print(t)

    def save_loss_graph(self, loss, val_loss, id):
        loss = np.array(loss)
        val_loss = np.array(val_loss)
        plt.clf()
        fig, ax1 = plt.subplots()
        ax1.set_xlabel("Games")
        ax1.set_ylabel("Loss")
        ax1.plot(loss, label="Train")
        ax1.plot(val_loss, label="Test")
        ax1.legend(loc="upper right")
        ax1.set_title("Model loss")
        ax2 = ax1.twinx()
        color = "tab:green"
        ax2.set_ylabel("Delta train test", color=color, alpha=0.5)
        ax2.plot(np.sqrt(np.power((loss - val_loss), 2)), color=color, alpha=0.5)
        fig.tight_layout()
        if not os.path.exists("loss_graphs"):
            os.mkdir("loss_graphs")
        plt.savefig(f"loss_graphs/{id}.png")

    def update_winner_stats(self, starting_player: int) -> None:
        second_index = starting_player - 1
        winning_player = 1 if self.state_manager.current_player() == 2 else 2
        first_index = winning_player - 1
        self.winner_stats[first_index][second_index] += 1

    def run(self):
        starting_player = StartingPlayerOptions.get_starting_player(
            self.starting_player_option)
        self.actor_network.save_model(episode_number=0)
        loss = []
        val_loss = []
        timer = Timer()
        for i in range(1, self.number_of_episodes_to_play + 1):
            self.state_manager = StateManager(self.k, starting_player)
            self.print_start_state(i, timer)
            timer.start()
            mcts = MCTS(
                self.state_manager,
                self.actor_network,
                random_simulation_rate=math.tanh(
                    i / self.number_of_episodes_to_play) * 1.2,
                **self.mcts_parameters,
            )
            while not self.state_manager.is_end_state():
                action = mcts.run(self.state_manager.get_state(),
                                  i / self.number_of_episodes_to_play)
                self.state_manager.perform_action(action)
                self.print_action(action)
            self.update_winner_stats(starting_player)
            self.print_winner_of_batch_game()
            history = self.actor_network.train()
            loss.append(np.average(history.history["loss"]))
            val_loss.append(np.average(history.history["val_loss"]))
            if self.starting_player_option == StartingPlayerOptions.ALTERNATING:
                starting_player = StateManager.get_opposite_player(
                    starting_player)
            if i % self.save_interval == 0:
                self.save_loss_graph(loss, val_loss, i)
                self.actor_network.save_model(episode_number=i)
            timer.stop()
            if i % 50 == 0:
                self.actor_network.save_buffer_to_file(
                    i, self.k, self.mcts_parameters["number_of_simulations"])
        self.print_run_summary()
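# Minimal usage sketch for a self-play training run. Assumptions: StartingPlayerOptions
# exposes ALTERNATING as referenced in run() above, and the parameter values below are
# illustrative, not the repo's actual defaults.
if __name__ == "__main__":
    simulator = GameSimulator(
        g=200,                    # number of self-play episodes
        p=StartingPlayerOptions.ALTERNATING,
        verbose=False,
        k=4,                      # board size
        print_parameters=True,
        save_interval=50,
        mcts_parameters={"number_of_simulations": 100},
    )
    simulator.run()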
class GameVisualizer:
    def __init__(
        self,
        board_size,
        player1=None,
        player2=None,
        starting_player=1,
        random_play=False,
        frame_rate=1000,
        initial_state=None,
        cartesian_cords=True,
    ):
        # Game logic
        self.board_size = board_size
        self.initial_state = initial_state
        self.state_manager = StateManager(board_size, starting_player)
        if initial_state:
            self.state_manager.set_state_manager(initial_state)
        self.random_play = random_play
        # Setting players
        self.player1 = player1
        self.player2 = player2
        # WINDOW
        self.master = Tk()
        self.master.title("HexGameVisualizer")
        self.master.protocol("WM_DELETE_WINDOW", self.quit_application)
        self.action_input = Entry(self.master)
        self.action_input.bind("<Return>", lambda event: self.button_clicked())
        self.action_input.pack()
        self.perform_action_button = Button(
            self.master, text="perform move", command=self.button_clicked
        )
        self.perform_action_button.pack()
        # TODO: Add label to describe the players currently playing
        self.label = Label(self.master)
        self.label.pack()
        self.start_pos = (60, 30)
        self.canvas = Canvas(
            self.master,
            width=self.start_pos[0] + self.board_size * 55 + self.start_pos[0],
            height=self.start_pos[1] + self.board_size * 33 + self.start_pos[1],
        )
        self.canvas.pack()
        # CONSTANTS
        self.frame_rate = frame_rate
        self.border_size = 10
        self.counter = 0
        self.size = 20
        self.cartesian_cords = cartesian_cords
        # LISTS CONTROLLING GAME AND DRAWING OF BOARD
        self.board = []
        self.board_border = []
        self.actions = []
        self.player_pieces = []

    def quit_application(self):
        import sys

        self.master.quit()
        sys.exit()

    def add_action(self, action: str):
        self.actions.append(action)

    def preprocess_actions(self):
        new_actions = []
        for action in self.actions:
            new_actions.append(GameVisualizer.preprocess_action(action))
        return new_actions

    @staticmethod
    def preprocess_action(action: str):
        positions, player = action.split(":")
        x_pos, y_pos = positions.split(",")
        return int(x_pos), int(y_pos), int(player)

    def run(self):
        self.actions = self.preprocess_actions()
        self.build_and_draw_board()
        if self.initial_state:
            self.state_manager.set_state_manager(self.initial_state)
            self.draw_initial_state()
        if len(self.actions):
            self.master.after(self.frame_rate, self.draw)
        mainloop()

    def model_perform_action(self, model: ANET):
        print(self.state_manager.get_state())
        distribution = model.predict(self.state_manager.get_state())
        print(distribution)
        argmax_distribution_index = int(
            np.argmax(distribution)
        )  # Greedy best from distribution
        action = self.state_manager.get_action_from_flattened_board_index(
            argmax_distribution_index, self.state_manager.get_state()
        )
        self.perform_action(GameVisualizer.preprocess_action(action))

    def button_clicked(self):
        if self.state_manager.is_end_state():
            return
        try:
            current_player = (
                self.player1
                if self.state_manager.current_player() == 1
                else self.player2
            )
            if current_player:
                self.model_perform_action(current_player)
            else:
                input_action = (
                    f"{self.action_input.get()}:{self.state_manager.current_player()}"
                )
                if self.random_play:
                    input_action = random.choice(
                        self.state_manager.generate_possible_actions(
                            self.state_manager.get_state()
                        )
                    )
                self.perform_action(GameVisualizer.preprocess_action(input_action))
                self.action_input.delete(0, "end")
        except ValueError:
            self.label["text"] = "Something went wrong"
        if self.state_manager.is_end_state():
            self.label["text"] = "Game over"

    def draw_initial_state(self):
        initial_board = self.state_manager.build_board(self.initial_state)
        for row_index, row in enumerate(initial_board):
            for col_index, player in enumerate(row):
                if player:
                    self.player_pieces.append(
                        Cell(
                            self.canvas,
                            self.board[row_index][col_index].top,
                            player=player,
                        )
                    )

    def get_canvas_position(self, position: (int, int)) -> (int, int):
        x, y = self.start_pos
        x += self.size * 2 * position[1] + self.size * position[0]
        y += (self.size + self.size / 1.7) * position[0]
        return x, y

    def build_and_draw_board(self):
        for i in range(self.board_size):
            row = []
            for j in range(self.board_size):
                row.append(
                    Cell(
                        self.canvas,
                        self.get_canvas_position((i, j)),
                        draw_on_init=False,
                    )
                )
            self.board.append(row)
        self.draw_board_border()
        for row_index, row in enumerate(self.board):
            for col_index, cell in enumerate(row):
                self.board[row_index][col_index].draw()

    def get_column(self, target_col_index: int):
        column = []
        for row_index, row in enumerate(self.board):
            for col_index, cell in enumerate(row):
                if target_col_index == col_index:
                    column.append(cell)
        return column

    def draw_board_border(self):
        borders = []
        first_row = self.board[0]
        borders.append(
            self.canvas.create_polygon(
                first_row[0].left1[0], first_row[0].left1[1],
                first_row[0].left1[0] - self.border_size,
                first_row[0].left1[1] - self.border_size,
                first_row[0].left1[0],
                first_row[0].left1[1] - 2 * self.border_size,
                first_row[-1].top[0] + 2 * self.border_size,
                first_row[-1].right1[1] - 2 * self.border_size,
                first_row[-1].top[0],
                (first_row[-1].right1[1] + first_row[-1].right2[1]) / 2,
                fill=PLAYER_ONE_COLOR,
            )
        )
        [
            borders.append(
                self.canvas.create_text(
                    cell.top[0] - 1.5 * self.border_size,
                    cell.top[1],
                    text=str(i) if self.cartesian_cords else chr(65 + i),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
            for i, cell in enumerate(first_row)
        ]
        first_column = self.get_column(0)
        borders.append(
            self.canvas.create_polygon(
                first_column[0].left1[0], first_column[0].left1[1],
                first_column[0].left1[0] - self.border_size,
                first_column[0].left1[1] - self.border_size,
                first_column[0].left1[0] - 2 * self.border_size,
                first_column[0].left1[1],
                first_column[-1].bottom[0] - 2 * self.border_size,
                first_column[-1].left2[1] + 2 * self.border_size,
                first_column[-1].bottom[0],
                (first_column[-1].left2[1] + first_column[-1].left1[1]) / 2,
                fill=PLAYER_TWO_COLOR,
            )
        )
        [
            borders.append(
                self.canvas.create_text(
                    cell.left1[0] - self.border_size / 1.5,
                    (cell.left1[1] + cell.left2[1]) / 2,
                    text=str(i) if self.cartesian_cords else str(i + 1),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
            for i, cell in enumerate(first_column)
        ]
        last_row = self.board[-1]
        borders.append(
            self.canvas.create_polygon(
                last_row[0].bottom[0],
                (last_row[0].right1[1] + last_row[0].right2[1]) / 2,
                last_row[0].bottom[0] - 2 * self.border_size,
                last_row[0].right2[1] + 2 * self.border_size,
                last_row[-1].right2[0],
                last_row[-1].right2[1] + 2 * self.border_size,
                last_row[-1].right2[0] + self.border_size,
                last_row[-1].right2[1] + self.border_size,
                last_row[-1].right2[0], last_row[-1].right2[1],
                fill=PLAYER_ONE_COLOR,
            )
        )
        [
            borders.append(
                self.canvas.create_text(
                    cell.bottom[0] + 1.2 * self.border_size,
                    cell.bottom[1],
                    text=str(i) if self.cartesian_cords else chr(65 + i),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
            for i, cell in enumerate(last_row)
        ]
        last_column = self.get_column(self.board_size - 1)
        borders.append(
            # Bottom up
            self.canvas.create_polygon(
                last_column[-1].right2[0], last_column[-1].right2[1],
                last_column[-1].right2[0] + self.border_size,
                last_column[-1].right2[1] + self.border_size,
                last_column[-1].right2[0] + 2 * self.border_size,
                last_column[-1].right2[1],
                last_column[0].top[0] + 2 * self.border_size,
                last_column[0].right1[1] - 2 * self.border_size,
                last_column[0].top[0],
                (last_column[0].right1[1] + last_column[0].right2[1]) / 2,
                fill=PLAYER_TWO_COLOR,
            )
        )
        [
            borders.append(
                self.canvas.create_text(
                    cell.right1[0] + self.border_size / 1.5,
                    (cell.left1[1] + cell.left2[1]) / 2,
                    text=str(i) if self.cartesian_cords else str(i + 1),
                    fill=FONT_COLOR,
                    font=(FONT, FONT_SIZE),
                )
            )
            for i, cell in enumerate(last_column)
        ]
        return borders

    def get_board_pos(self, pos: (int, int)):
        return self.board_size * pos[0] + pos[1]

    def get_cords(self, board_pos: int):
        return math.floor(board_pos / self.board_size), board_pos % self.board_size

    def draw(self):
        self.perform_action(self.actions.pop(0))
        if len(self.actions) > 0:
            self.master.after(self.frame_rate, self.draw)

    def perform_action(self, action: (int, int, int)):
        print(action)
        x_pos, y_pos, player = action
        self.player_pieces.append(
            Cell(self.canvas, self.board[x_pos][y_pos].top, player=player)
        )
        self.state_manager.perform_action(f"{x_pos},{y_pos}:{player}")
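# Minimal usage sketch for the visualizer above. Assumption: actions use the
# "row,col:player" string format expected by preprocess_action; the moves below are
# an arbitrary short opening on a hypothetical 4x4 board.
if __name__ == "__main__":
    visualizer = GameVisualizer(board_size=4, frame_rate=500)
    for move in ["0,0:1", "1,1:2", "0,1:1"]:
        visualizer.add_action(move)
    visualizer.run()  # opens the Tk window and replays the queued moves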
class MCTS:
    def __init__(
        self,
        state_manager: StateManager,
        actor_net,
        max_tree_height=5,
        c=1,
        number_of_simulations=10,
        verbose=False,
        random_simulation_rate=0.2,
    ):
        self.state_manager = StateManager(state_manager.board_size,
                                          state_manager.current_player())
        self.tree = StateTree(self.state_manager.get_state())
        self.tree.add_state_node(self.tree.root_state,
                                 self.state_manager.is_end_state())
        self.c = c
        self.max_tree_height = max_tree_height
        self.actor_net = actor_net
        self.number_of_simulations = number_of_simulations
        self.verbose = verbose
        self.random_simulation_rate = random_simulation_rate

    def run(self, root_state: str, progress: float):
        """
        Main method: runs the Monte Carlo tree search algorithm
        (tree traversal -> rollout -> backpropagation) number_of_simulations times,
        then picks a move from the root state of the current tree.
        :param root_state: state to run the algorithm from -> root node
        :param progress: fraction of the total episodes played so far
        :return: the chosen action from the root node of the current tree
        """
        self.tree.cut_tree_with_new_root_node(root_state)
        self.state_manager.set_state_manager(self.tree.root_state)
        for i in range(self.number_of_simulations):
            rollout_state = self.traverse_tree(self.tree.root_state, depth=0)
            simulation_reward = self.simulate(rollout_state)
            self.backpropagate(rollout_state, simulation_reward)
            self.state_manager.set_state_manager(self.tree.root_state)
        distribution = self.get_distribution(self.tree.root_state)
        self.actor_net.add_case(self.tree.root_state, distribution.copy())
        if random.random() > math.tanh(progress):
            chosen_action = self.choose_action_stochastically(
                np.array(distribution), self.tree.root_state)
        else:
            chosen_action = self.epsilon_greedy_action_from_distribution(
                np.array(distribution), self.tree.root_state, epsilon=0.0)
        if self.verbose:
            print("distribution", distribution)
            print("chosen_action", chosen_action)
        return chosen_action

    # MAIN ALGORITHM METHODS
    def traverse_tree(self, state: str, depth: int) -> str:
        """
        Traverses the tree, expanding nodes by using the tree policy (tree_policy)
        :param state: current state
        :param depth: current depth of the tree
        :return: chosen state to simulate from
        """
        if depth == self.max_tree_height or self.tree.is_end_state(state):
            return state
        # If the current state has not explored its children yet: add them all to the
        # graph and choose one to simulate from
        elif not self.tree.get_outgoing_edges(state):
            children = self.expand(state)
            return self.choose_random_child(state, children)
        else:
            child = self.tree_policy(state)
            self.state_manager.check_difference_and_perform_action(child)
            if self.tree.get_state_number_of_visits(child) == 0:
                self.tree.set_end_state(child, self.state_manager.is_end_state())
            return self.traverse_tree(child, depth + 1)

    def expand(self, state) -> [str]:
        """
        Expands all child nodes from the input state and adds them to the graph.
        :param state: state to find all children from
        :return: list of all child states
        """
        children = StateManager.generate_child_states(state)
        for child in children:
            if child not in self.tree.get_nodes():
                self.tree.add_state_node(child)
            self.tree.add_edge(state, child)
        return children

    def simulate(self, state: str):
        """
        Performs one roll-out using the actor net as policy
        :return: 1 if the simulation ends with player 1 winning, -1 otherwise
        """
        if self.state_manager.get_state() != state:
            raise ValueError(
                "The state manager is not set to the start of the simulation")
        while not self.state_manager.is_end_state():
            if random.random() < self.random_simulation_rate:
                distribution = self.actor_net.predict(
                    self.state_manager.get_state())
                chosen_action = self.epsilon_greedy_action_from_distribution(
                    distribution, self.state_manager.get_state(), epsilon=0.0)
            else:
                chosen_action = random.choice(
                    self.state_manager.generate_possible_actions(
                        self.state_manager.get_state()))
            self.state_manager.perform_action(chosen_action)
        return MCTS.get_end_state_reward(self.state_manager.current_player())

    def backpropagate(self, state: str, simulation_reward: int):
        """
        Starts at the rollout start state and walks up the tree, updating the SAP
        values and number of visits of the nodes and edges on the path
        :param state: rollout start state
        :param simulation_reward: reward from the simulation
        """
        if state == self.tree.root_state:
            self.tree.increment_state_number_of_visits(state)
            return
        parent_state = self.tree.get_parent(state)
        self.tree.increment_state_number_of_visits(state)
        self.tree.increment_edge_number_of_visits(parent_state, state)
        edge_times_enc = self.tree.get_edge_number_of_visits(
            parent_state, state)
        edge_sap_value = self.tree.get_sap_value(parent_state, state)
        new_sap_value = (self.tree.get_sap_value(parent_state, state) +
                         (simulation_reward - edge_sap_value) / edge_times_enc)
        self.tree.set_sap_value(parent_state, state, new_sap_value)
        self.tree.set_active_edge(parent_state, state, False)
        self.backpropagate(parent_state, simulation_reward)

    # HELPER METHODS
    def tree_policy(self, state: str) -> str:
        """
        Uses the UCT score to determine the child state of an input state
        :param state: input state
        :return: child state
        """
        state_number_of_visits = self.tree.get_state_number_of_visits(state)
        if self.state_manager.get_player(state) == 1:
            best_edge = self.tree.get_outgoing_edges(
                state,
                sort_by_function=lambda edge: self.compute_uct(
                    self.tree.get_sap_value(*edge),
                    state_number_of_visits,
                    self.tree.get_edge_number_of_visits(*edge),
                    True,
                ),
            )[0]
        else:
            best_edge = self.tree.get_outgoing_edges(
                state,
                sort_by_function=lambda edge: self.compute_uct(
                    self.tree.get_sap_value(*edge),
                    state_number_of_visits,
                    self.tree.get_edge_number_of_visits(*edge),
                    False,
                ),
            )[-1]
        parent, best_child = best_edge
        self.tree.set_active_edge(parent, best_child, True)
        return best_child

    def compute_uct(
        self,
        sap_value: float,
        number_of_visits_node: int,
        number_of_visits_edge: int,
        maximizing_player: bool,
    ) -> float:
        """
        Computes the UCT value for the tree policy
        :param sap_value: SAP value for the edge
        :param number_of_visits_node: number of visits for the parent state
        :param number_of_visits_edge: number of visits for the edge between the two nodes
        :param maximizing_player: whether the current player is the maximizing player
        :return: UCT value
        """
        uct = sap_value
        usa_term = self.c * math.sqrt(
            math.log(number_of_visits_node) / (1 + number_of_visits_edge))
        if maximizing_player:
            uct += usa_term
        else:
            uct -= usa_term
        return uct

    def greedy_best_action(self, state: str) -> str:
        sorted_list = self.tree.get_outgoing_edges(
            state,
            sort_by_function=lambda edge: self.tree.get_edge_number_of_visits(
                *edge),
        )
        return self.state_manager.get_action(*sorted_list[0])

    def choose_random_child(self, parent_state: str, child_list: [str]) -> str:
        """
        Helper method choosing a random state from the child list, updating the
        state manager and setting edge and node parameters
        :param parent_state: parent state for the child list (to set edge parameters)
        :param child_list: list of children from the parent state
        :return: chosen child
        """
        child = random.choice(child_list)
        self.state_manager.check_difference_and_perform_action(child)
        self.tree.set_end_state(child, self.state_manager.is_end_state())
        self.tree.set_active_edge(parent_state, child, True)
        return child

    def epsilon_greedy_action_from_distribution(self,
                                                distribution: np.ndarray,
                                                state: str,
                                                epsilon=0.2):
        """
        Chooses an epsilon-greedy index from the distribution and converts that
        index to an action
        :param distribution: distribution from number of simulations per node
        :param state: current state to calculate the action for
        :param epsilon: the epsilon value to be used
        :return: action string
        """
        if random.random() > epsilon:
            chosen_index = int(np.argmax(distribution))
        else:
            # Choose a random index from those with positive probability
            # (prob == 0 might be occupied cells on the board)
            positive_indices = [
                i[0] for i, prob in np.ndenumerate(distribution) if prob > 0
            ]
            if not positive_indices:
                chosen_index = int(np.argmax(distribution))
            else:
                chosen_index = random.choice(positive_indices)
        return self.state_manager.get_action_from_flattened_board_index(
            chosen_index, state)

    @staticmethod
    def get_end_state_reward(current_player: int) -> int:
        """
        We have chosen player 1 to be "us", giving a positive reward if player 1 wins.
        At an end state the current player is the one who did not make the winning
        move, so current_player == 2 means player 1 won.
        :param current_player: current player for the state manager
        :return: reward for the end state
        """
        return -1 if current_player == 1 else 1

    def get_distribution(self, state: str):
        """
        Returns the distribution of total visits for the child nodes of the input state
        :param state: state to get the distribution from
        :return: a normalized list of length equal to the total number of positions on the board
        """
        parent_board, parent_player = StateManager.extract_state(state)
        child_states = self.tree.get_child_states(state)
        change_indices_dict = {}
        total_visits = 0
        for child in child_states:
            child_board, child_player = StateManager.extract_state(child)
            for i in range(len(child_board)):
                if parent_board[i] != child_board[i]:
                    child_number_of_visits = self.tree.get_edge_number_of_visits(
                        state, child)
                    change_indices_dict[i] = child_number_of_visits
                    total_visits += child_number_of_visits
                    break
        return [
            change_indices_dict[index] / total_visits
            if index in change_indices_dict else 0
            for index in range(self.state_manager.board_size**2)
        ]

    def set_random_simulation_rate(self, new_rate: float):
        self.random_simulation_rate = new_rate

    def choose_action_stochastically(self, distribution, state):
        chosen_index = np.random.choice([i for i in range(len(distribution))],
                                        p=distribution)
        return self.state_manager.get_action_from_flattened_board_index(
            chosen_index, state)
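# Worked sketch of the UCT computation used by compute_uct above (values are made up):
# with c = 1, an edge with SAP value Q = 0.4, a parent visited N = 100 times and the
# edge visited n = 9 times gives, for the maximizing player,
#   uct = Q + c * sqrt(ln(N) / (1 + n)) = 0.4 + sqrt(ln(100) / 10) ≈ 0.4 + 0.679 ≈ 1.079,
# while the minimizing player subtracts the exploration term instead.
def _example_uct(q=0.4, n_parent=100, n_edge=9, c=1):
    import math
    exploration = c * math.sqrt(math.log(n_parent) / (1 + n_edge))
    return q + exploration, q - exploration  # (maximizing, minimizing)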