def play(self, M, N, G, board_size):
    for anet1 in self.anets:
        for anet2 in [a for a in self.anets if a != anet1]:
            for i in range(G):
                play = grid.Grid(board_size)
                # Play until the board is in a terminal state. Anet1 is always
                # the first player, which ensures that every network gets a
                # chance to play as first player against every other network
                while not play.is_terminal()[0]:
                    # Let the player whose turn it currently is make the move
                    an = anet1 if play.get_player() == 1 else anet2
                    # Get the probability distribution from ANET
                    pd = an.policy(str(play.get_player()) + play.get_state())
                    # Make the best move
                    play.make_move(play.get_coor(pd.index(h.argmax(pd))))
                # Save the results of the game
                key = ("an" + str(self.anets.index(anet1)) +
                       " vs an" + str(self.anets.index(anet2)))
                # Add the results to the dictionary
                if play.is_terminal()[1] == 1:
                    if key in self.results:
                        self.results[key][0] += 1
                    else:
                        self.results[key] = [1, 0]
                else:
                    if key in self.results:
                        self.results[key][1] += 1
                    else:
                        self.results[key] = [0, 1]
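# NOTE (sketch, not part of the project source): the helper module `h` is
# not shown in this section. From the call pattern pd.index(h.argmax(pd))
# above, h.argmax appears to return the maximum *value* (the caller then
# recovers its index via list.index), and h.argmax(items, key) / h.argmin
# appear to return the item that maximises/minimises the key, as used in
# tree_policy further down. A minimal implementation consistent with that
# usage could look like this:

def argmax(values, key=None):
    # Return the element of `values` with the highest key (identity by default)
    return max(values, key=key) if key is not None else max(values)

def argmin(values, key=None):
    # Return the element of `values` with the lowest key (identity by default)
    return min(values, key=key) if key is not None else min(values)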
def handle_get_action(self, state):
    """
    Here you will use the neural net that you trained using MCTS to select
    a move for your actor on the current board. Remember to use the correct
    player_number for YOUR actor! The default action is to select a random
    empty cell on the board. This should be modified.
    :param state: The current board in the form
        (1 or 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
        where 1 or 2 indicates the number of the current player. If you are
        player 2 in the current series, for example, then you will see a 2
        here throughout the entire series, whereas player 1 will see a 1.
    :return: Your actor's selected action as a tuple (row, column)
    """
    # Get the probability distribution from ANET
    pd = self.anet.policy("".join(str(i) for i in state))
    # Pick the best move: flat index -> (row, column) on a 6x6 board
    index = pd.index(h.argmax(pd))
    next_move = (index // 6, index % 6)
    return next_move
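# Worked example (hypothetical values) of the flat index -> (row, column)
# mapping used in handle_get_action, assuming a 6x6 board flattened in
# row-major order; max(pd) stands in for h.argmax(pd) here:
pd = [0.0] * 36
pd[14] = 1.0                         # suppose ANET puts all its mass on cell 14
index = pd.index(max(pd))            # -> 14
next_move = (index // 6, index % 6)  # -> (2, 2): row 2, column 2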
def rollout(self, anode, policy):
    # Build the board from the anode's parent's state
    board = grid.create_board(anode.parent.state)
    # Make the move the selected anode represents
    board.make_move(anode.action)
    # Keep making moves until the game reaches a terminal state
    while not board.is_terminal()[0]:
        move = None
        # Random policy, i.e. moves are selected uniformly at random
        if policy == "r":
            move = random.choice(board.get_available_actions())
        # ANET predicts the next move (epsilon-greedy)
        elif policy == "n":
            if random.random() > self.grate:
                # Pick a move randomly
                move = random.choice(board.get_available_actions())
            else:
                # Ask ANET to select the move for the next state
                pd = self.anet.policy(str(board.get_player()) + board.get_state())
                # Take the index of the highest PD value, then its coordinate
                move = board.get_coor(pd.index(h.argmax(pd)))
        # Raise an exception if an unsupported rollout policy was chosen
        else:
            raise Exception("MCTS does not support policy named " + policy)
        # Make the move selected by the rollout policy
        board.make_move(move)
    # Return the reward of the terminal state
    return board.get_reward()
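# NOTE (sketch, not part of the project source): grid.create_board is not
# shown in this section. From its call site in rollout, it appears to build
# a Grid from a state string of the form str(player) + grid.get_state(),
# the same encoding fed to anet.policy. A minimal version in terms of the
# Grid.set_from_state used in tree_policy might look like this (the
# board_size default is an assumption):
def create_board(state, board_size=6):
    board = Grid(board_size)
    board.set_from_state(state)
    return board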
def tree_policy(self, snode, grate):
    # Upper Confidence Bound term to encourage exploration
    def UCT(state_visits, action_visits):
        return ((log10(state_visits) / (1 + action_visits)) ** 0.5
                if state_visits != 0 else 0)

    # If the snode received is None, something went wrong upstream
    if h.is_empty(snode):
        raise Exception("Tree policy received a None instead of a snode")
    # If the board is already in a terminal state, i.e. no actions are
    # available, return None
    elif h.is_empty(snode.actions):
        return None
    # Explore with probability 1 - grate: pick a move at random
    if random.random() > grate:
        anode = random.choice(snode.actions)
    # Otherwise exploit, based on the action values and whose turn it is
    else:
        # Rebuild the board to read the state info
        board = grid.Grid(self.board_size)
        board.set_from_state(snode.state)
        # The choice of argmax/argmin depends on whose turn it is
        if board.get_player() == 1:
            # Select the action with the highest value
            anode = h.argmax(snode.actions,
                             (lambda a: a.value + self.c * UCT(a.parent.visits, a.visits)))
        else:
            # Select the action with the lowest value
            anode = h.argmin(snode.actions,
                             (lambda a: a.value - self.c * UCT(a.parent.visits, a.visits)))
    # Return the selected action
    return anode
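# For reference, the exploration bonus computed by UCT above is the usual
# UCB1 shape, except that the code uses log base 10 rather than the natural
# log:
#
#     u(s, a) = sqrt(log10(N(s)) / (1 + N(s, a)))
#
# so player 1 maximises a.value + c * u(s, a) and player 2 minimises
# a.value - c * u(s, a). For example, with N(s) = 100 visits to the state
# and N(s, a) = 9 visits to the action, u = sqrt(2 / 10) ~= 0.447.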
def __init__(self, board_size, ai_lvl, path, player):
    # Load the ANET
    anet = an.ANET()
    anet.load(ai_lvl, path)
    # Create the board
    play = grid.Grid(board_size)
    play.print_grid()
    # Play until the board is in a terminal state
    while not play.is_terminal()[0]:
        # Pause for input whenever it is the human player's turn
        # (pass player=1 to go first, player=2 to go second)
        if play.get_player() == player:
            player_input = input("Player turn: ")
            # The first character entered is the column, the second the row
            move = (int(player_input[1]), int(player_input[0]))
        else:
            # Get the probability distribution from ANET
            pd = anet.policy(str(play.get_player()) + play.get_state())
            # Pick the best move
            move = play.get_coor(pd.index(h.argmax(pd)))
        # Make the move and print the grid
        play.make_move(move)
        play.print_grid()
    # The player who just moved won, so the winner is the opposite of the
    # player whose turn it now is
    print("Player " + str(1 if play.get_player() == 2 else 2) + " won!")
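# Hypothetical entry point for the human-vs-ANET loop above (the enclosing
# class name is not shown in this section, so `HumanVsAI` is a placeholder,
# and the ai_lvl/path values are illustrative). Moves are typed as two
# digits, column first: entering "23" plays row 3, column 2.
if __name__ == "__main__":
    HumanVsAI(board_size=6, ai_lvl=200, path="models/", player=1)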