Example #1
class SelfPlay(object):
    '''Abstract class for constructing remote self-play actors.
    '''
    def __init__(self):
        '''Instantiates a self-play actor.
        
        Returns
        -------
        None.

        '''
        self.agent = Agent(path='./model_data/alpha_0.pt')

    def run(self,
            replay_buffer,
            update_signal,
            self_play_id,
            search_iters=SELF_PLAY_SEARCH_ITERS,
            markov_exp=SELF_PLAY_MARKOV_EXP,
            temp=TEMP,
            temp_thrshld=TEMP_THRSHLD):
        '''Starts an indefinite self-play loop. The games for self-play are 
        generated via an ongoing Markov chain as described in randomDag.py.
        The self-play processes are synchronized with one another and with 
        the train and evaluation processes via 'replay_buffer' and 
        'update_signal', respectively. 'replay_buffer' stores the self-play 
        data and triggers the start of training, while 'update_signal' 
        triggers model parameter updates.
        
        Parameters
        ----------
        replay_buffer : ReplayBuffer
            remote actor for managing self-play data between self-play processes 
            and the Train process. Also carries the signal to start training.
        update_signal : UpdateSignal
            remote actor for synchronization between self-play processes and 
            evaluation processes. Triggers model parameter updates.
        self_play_id : int (nonnegative)
            unique identifier for the self-play process.
        search_iters : int (positive), optional
            the number of search iterations to perform during MCTS. 
            The default is SELF_PLAY_SEARCH_ITERS.
        markov_exp : float, optional
            the exponent determining the number of steps taken in 
            the Markov chain when generating games for self-play. 
            The default is SELF_PLAY_MARKOV_EXP.
        temp : float (nonnegative), optional
            partially controls exploration. If 0, the policy is deterministic 
            and the move with the highest visit count from MCTS is chosen. 
            The default is TEMP.
        temp_thrshld : int (nonnegative), optional
            the number of moves after which the policy becomes deterministic, 
            i.e., temp is set to 0. (See temp, above.) The default is 
            TEMP_THRSHLD.

        Returns
        -------
        None.

        '''
        # put agent in evaluation mode
        self.agent.model.eval()
        # the action space: node indices 0, ..., MAX_NODES - 1
        actions = np.arange(MAX_NODES)
        # game state generator via an ongoing Markov chain
        state_generator = GameState.state_generator(markov_exp)
        # start indefinite self-play loop
        while True:
            # check for updates
            if ray.get(update_signal.get_update.remote(self_play_id)):
                # get current update_id
                update_id = ray.get(update_signal.get_update_id.remote())
                # load the current alpha parameters
                self.agent.load_parameters(
                    path=f'./model_data/alpha_{update_id}.pt')
                # reset the update signal
                update_signal.clear_update.remote(self_play_id)
            # get a game and play
            initial_state = next(state_generator)
            root = PUCTNode(initial_state)
            states = []
            policies = []
            move_count = 0
            while not root.state.is_terminal_state():
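                # anneal exploration: sample stochastically for the first 
                # temp_thrshld moves, then play greedily (temp = 0)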
                t = temp if move_count < temp_thrshld else 0
                policy = self.agent.MCTS(root, search_iters, t)
                move = np.random.choice(actions, p=policy)
                states.append(root.state.encoded_state)
                policies.append(policy)
                root = root.edges[move]
                root.to_root()
                move_count += 1
            # assign values from the perspective of the player to move: the 
            # player who makes the last move wins, so the final recorded state 
            # gets value +1 and the signs alternate back to the initial state
            if move_count % 2 == 0:
                values = [(-1)**(i + 1) for i in range(move_count)]
            else:
                values = [(-1)**i for i in range(move_count)]
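            # e.g., a 3-move game yields values [1, -1, 1]: the first player 
            # makes the last (winning) move, so their states get value +1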
            # construct training data from self-play
            train_data = [
                (state, policy, value)
                for state, policy, value in zip(states, policies, values)
            ]
            # add training data to replay buffer
            replay_buffer.add.remote(train_data)
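
The run() loop above is intended to execute inside a Ray actor alongside the ReplayBuffer and UpdateSignal actors it synchronizes with. Below is a minimal launch sketch, assuming the classes are exposed as Ray actors via ray.remote; the worker count and the ReplayBuffer/UpdateSignal constructor calls are illustrative placeholders, not the repository's actual configuration (see asyn_training.py for that).

import ray

ray.init()

# wrap the plain class as a Ray actor class (if the repository already
# decorates it with @ray.remote, this wrapping is unnecessary)
SelfPlayActor = ray.remote(SelfPlay)

# placeholder actor handles; the real constructors live elsewhere in the repo
replay_buffer = ray.remote(ReplayBuffer).remote()
update_signal = ray.remote(UpdateSignal).remote()

# launch a few self-play workers; run() loops forever, so its ObjectRefs
# are deliberately never passed to ray.get()
workers = [SelfPlayActor.remote() for _ in range(4)]
for self_play_id, worker in enumerate(workers):
    worker.run.remote(replay_buffer, update_signal, self_play_id)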
Example #2
class Evaluation(object):
    '''Abstract class for constructing remote evaluation actors.
    '''
    def __init__(self, update_id):
        '''Instantiates an Evaluation actor.

        Parameters
        ----------
        update_id : int (nonnegative)
            the current update_id. When an Evaluation actor is instantiated, 
            the actor's alpha agent pulls from the most current alpha 
            parameters.

        Returns
        -------
        None.

        '''
        self.alpha_agent = Agent(path=f'./model_data/alpha_{update_id}.pt')
        self.apprentice_agent = Agent(path='./model_data/apprentice.pt')

    def update_alpha_parameters(self, update_id):
        '''Updates the alpha parameters. Used if an update is triggered 
        after an evaluation.

        Parameters
        ----------
        update_id : int (positive)
            the index of the update to the alpha parameters. 
            (Each time an update is triggered, the update_id is incremented 
            by 1 and then the new alpha parameters are saved indexed by the 
            current update_id.)

        Returns
        -------
        None.

        '''
        self.apprentice_agent.save_parameters(
            path=f'./model_data/alpha_{update_id}.pt')

    def run(self,
            num_plays=PLAYS_PER_EVAL,
            search_iters=EVAL_PLAY_SEARCH_ITERS,
            markov_exp=EVAL_PLAY_MARKOV_EXP):
        '''Starts an evaluation. The evaluation process is synchronized with 
        the self-play processes and the other evaluation processes via 
        instances of UpdateSignal and AsyncSignal, respectively, in the main 
        script: asyn_training.py. The UpdateSignal triggers an update in each 
        of the self-play processes if the ratio of apprentice wins to total 
        evaluation games surpasses the declared win ratio, while the 
        AsyncSignal triggers the evaluation processes.

        Parameters
        ----------
        num_plays : int (positive), optional
            the number of evaluation games to play. The default is 
            PLAYS_PER_EVAL.
        search_iters : int (positive), optional
            the number of search iterations to perform during MCTS. 
            The default is EVAL_PLAY_SEARCH_ITERS.
        markov_exp : float, optional
            the exponent determining the number of steps taken in 
            the Markov chain when generating games for evaluation. 
            The default is EVAL_PLAY_MARKOV_EXP.

        Returns
        -------
        apprentice_wins : int (nonnegative)
            the number of apprentice wins.

        '''
        # put both models in evaluation mode
        self.alpha_agent.model.eval()
        self.apprentice_agent.model.eval()
        # setup gameplay
        alpha = 0
        apprentice = 1
        actions = np.arange(MAX_NODES)
        state_generator = GameState.state_generator(markov_exp)
        apprentice_wins = 0
        # start evaluation game play
        for _ in range(num_plays):
            # uniformly randomly choose which agent plays first
            next_move = np.random.choice([alpha, apprentice])
            # play a randomly generated game of upset-downset
            game_state = next(state_generator)
            while not game_state.is_terminal_state():
                root = PUCTNode(game_state)
                if next_move == alpha:
                    policy = self.alpha_agent.MCTS(root, search_iters, 0)
                else:
                    policy = self.apprentice_agent.MCTS(root, search_iters, 0)
                move = np.random.choice(actions, p=policy)
                game_state = root.edges[move].state
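                # pass the turn to the other agent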
                next_move = 1 - next_move
            # decide the winner: the player who made the last move wins, so 
            # the player left on move (next_move) is the loser
            winner = 1 - next_move
            if winner == apprentice:
                apprentice_wins += 1

        return apprentice_wins
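
For reference, here is a sketch of how an Evaluation actor's result might gate a parameter update, mirroring the win-ratio rule described in the run() docstring. WIN_RATIO and the update_id bookkeeping shown here are hypothetical placeholders; the real orchestration (including signalling the self-play workers) lives in asyn_training.py.

import ray

# wrap the plain class as a Ray actor class (assumption; the repository may
# already decorate Evaluation with @ray.remote)
EvaluationActor = ray.remote(Evaluation)

update_id = 0        # index of the current alpha checkpoint
WIN_RATIO = 0.55     # hypothetical promotion threshold

evaluator = EvaluationActor.remote(update_id)
apprentice_wins = ray.get(evaluator.run.remote())

# promote the apprentice if it beat alpha often enough
if apprentice_wins / PLAYS_PER_EVAL > WIN_RATIO:
    update_id += 1
    # save the apprentice parameters as the next alpha checkpoint
    ray.get(evaluator.update_alpha_parameters.remote(update_id))
    # signalling the self-play workers to reload alpha_{update_id}.pt is
    # handled in asyn_training.py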