Example #1
File: TOPP.py Project: pirib/RLPR2
    def play(self, M, N, G, board_size):
        for anet1 in self.anets:
            for anet2 in [a for a in self.anets if a != anet1]:
                for i in range(G):
                    play = grid.Grid(board_size)
    
                    # Playing until the board is in a terminal state. anet1 always
                    # plays as player 1, so every network gets a chance to move first
                    while not play.is_terminal()[0]:

                        # Let the network whose turn it currently is make the move
                        an = anet1 if play.get_player() == 1 else anet2

                        # Get the probability distribution from ANET
                        pd = an.policy(str(play.get_player()) + play.get_state())

                        # Make the best move
                        play.make_move(play.get_coor(pd.index(h.argmax(pd))))

                    # Save the results of the game
                    key = "an" + str(self.anets.index(anet1)) + " vs an" + str(self.anets.index(anet2))

                    # Add the results into the dictionary
                    if play.is_terminal()[1] == 1:
                        if key in self.results:
                            self.results[key][0] += 1
                        else:
                            self.results[key] = [1, 0]
                    else:
                        if key in self.results:
                            self.results[key][1] += 1
                        else:
                            self.results[key] = [0, 1]
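
A minimal sketch of how this round-robin might be driven, assuming a TOPP class that stores the competing networks in self.anets and tallies games in self.results (the constructor and the argument values here are assumptions, not from the source):

    # Hypothetical setup: the TOPP constructor and its arguments are assumed
    topp = TOPP(anets=trained_anets)
    topp.play(M=5, N=100, G=25, board_size=6)

    # Each results entry maps "anX vs anY" to [wins for anX, wins for anY]
    for matchup, (wins1, wins2) in topp.results.items():
        print(matchup + ": " + str(wins1) + "-" + str(wins2))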
Example #2
    def handle_get_action(self, state):
        """
        Here you will use the neural net that you trained using MCTS to select a move for your actor on the current board.
        Remember to use the correct player_number for YOUR actor! The default action is to select a random empty cell
        on the board. This should be modified.
        :param state: The current board in the form (1 or 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), where
        1 or 2 indicates the number of the current player.  If you are player 2 in the current series, for example,
        then you will see a 2 here throughout the entire series, whereas player 1 will see a 1.
        :return: Your actor's selected action as a tuple (row, column)
        """

        # Get the probability distribution from ANET
        pd = self.anet.policy("".join(str(i) for i in state))

        # Pick the move with the highest probability and convert its flat
        # index into a (row, column) pair on the 6x6 board
        index = pd.index(h.argmax(pd))
        next_move = (index // 6, index % 6)

        return next_move
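
These snippets all lean on a small helper module h. Judging from the call sites, h.argmax behaves like the built-in max (optionally taking a key function, as in Example #4 below) and returns the maximal element itself, which is why pd.index(h.argmax(pd)) recovers the index. A sketch of what h might contain, inferred from usage rather than taken from the source:

    # Inferred helpers; the real h module in pirib/RLPR2 may differ
    def argmax(values, key=None):
        # Returns the largest element (not its index), like built-in max
        return max(values, key=key)

    def argmin(values, key=None):
        return min(values, key=key)

    def is_empty(x):
        # Used both for None checks and for empty action lists
        return x is None or (hasattr(x, "__len__") and len(x) == 0)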
Example #3
File: MCTS.py Project: pirib/RLPR2
        def rollout(anode, policy):

            # Make the board from the anode's parent's state
            board = grid.create_board(anode.parent.state)

            # Make a move based on the anode selected
            board.make_move(anode.action)

            # Make moves until the game is in terminal state
            while not board.is_terminal()[0]:

                move = None

                # Random policy, i.e. moves are selected uniformly at random
                if policy == "r":
                    move = random.choice(board.get_available_actions())

                # ANET should predict the next move
                elif policy == "n":

                    if random.random() > self.grate:
                        # With probability (1 - grate), pick a move randomly
                        # to keep the rollout exploratory
                        move = random.choice(board.get_available_actions())

                    else:
                        # Ask anet to select the move for the next state
                        pd = self.anet.policy(
                            str(board.get_player()) + board.get_state())
                        # Get the index of the highest PD value, then its coordinate
                        move = board.get_coor(pd.index(h.argmax(pd)))

                # Raise an exception if an unknown rollout policy has been chosen
                else:
                    raise Exception("MCTS does not support policy named " +
                                    policy)

                # Make the move selected by the rollout policy
                board.make_move(move)

            # Return the reward of the terminal state
            return board.get_reward()
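
The "n" branch is an epsilon-greedy rollout: with probability self.grate the ANET's most probable move is played, otherwise a uniformly random move is taken. The same pattern in isolation (the names here are placeholders, not from the source):

    import random

    def epsilon_greedy_move(available_moves, greedy_move, grate):
        # With probability grate play the network's suggestion,
        # otherwise explore with a uniformly random move
        if random.random() > grate:
            return random.choice(available_moves)
        return greedy_move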
Example #4
File: MCTS.py Project: pirib/RLPR2
    def tree_policy(self, snode, grate):

        # Upper Confidence Bound to encourage exploration
        def UCT(state_visits, action_visits):
            return (log10(state_visits) /
                    (1 + action_visits))**0.5 if state_visits != 0 else 0

        # If snode received is None, something went wrong
        if h.is_empty(snode):
            raise Exception("Tree policy received a None instead of a snode")

        # If the board is already in a terminal state, i.e. no actions are available, return None
        elif h.is_empty(snode.actions):
            return None

        # With probability (1 - grate), explore
        if random.random() > self.grate:
            # Pick a move randomly
            anode = random.choice(snode.actions)

        # Otherwise exploit, based on the action values and whose turn it currently is
        else:
            # Get the state info
            board = grid.Grid(self.board_size)
            board.set_from_state(snode.state)

            # Select an action node depending on whose turn it is
            if board.get_player() == 1:
                # Select the action which gives the highest value
                anode = h.argmax(snode.actions,
                                 (lambda a: a.value + self.c * UCT(
                                     a.parent.visits, a.visits)))
            else:
                # Select the action which gives the lowest value
                anode = h.argmin(snode.actions,
                                 (lambda a: a.value - self.c * UCT(
                                     a.parent.visits, a.visits)))

        # Return the selected action
        return anode
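
Written out, the score that player 1 maximizes is a UCT-style bound (note the log base 10 where the classic UCB1 formula uses the natural log):

$$a^* = \operatorname{argmax}_{a} \left( Q(a) + c \sqrt{\frac{\log_{10} N(s)}{1 + N(s,a)}} \right)$$

where Q(a) is the action value, N(s) the visit count of the state node, and N(s,a) that of the action node. Player 2, the minimizing player, takes the argmin of Q(a) minus the same bonus, so the exploration term is optimistic from each player's own perspective.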
Example #5
    def __init__(self, board_size, ai_lvl, path, player):
        
        # Load the ANET
        anet = an.ANET()
        anet.load(ai_lvl, path)

        # Create the board
        play = grid.Grid(board_size)
        play.print_grid()

        # Playing until the board is in a terminal state
        while not play.is_terminal()[0]:

            # Pause for input whenever it is the human player's turn
            if play.get_player() == player:

                # The two typed digits are swapped when building the move,
                # so it is entered as column then row
                player_input = input("Player turn: ")
                move = (int(player_input[1]), int(player_input[0]))

            else:

                # Get the probability distribution from ANET
                pd = anet.policy(str(play.get_player()) + play.get_state())

                # Pick the best move
                move = play.get_coor(pd.index(h.argmax(pd)))

            # Make the move and print the grid
            play.make_move(move)
            play.print_grid()

        # Print the winner: the player who made the last move won,
        # i.e. not the player whose turn it now is
        print("Player " + str(1 if play.get_player() == 2 else 2) + " won!")