import numpy as np
import ray

# NOTE: the local imports below are assumed from context (this module
# references Agent, GameState, PUCTNode and several config constants);
# adjust the module names to match the actual project layout.
from agent import Agent
from gameState import GameState
from puctNode import PUCTNode
from config import (MAX_NODES, TEMP, TEMP_THRSHLD,
                    SELF_PLAY_SEARCH_ITERS, SELF_PLAY_MARKOV_EXP,
                    PLAYS_PER_EVAL, EVAL_PLAY_SEARCH_ITERS,
                    EVAL_PLAY_MARKOV_EXP)


class SelfPlay(object):
    '''Abstract class for construction of remote self-play actors.
    '''
    def __init__(self):
        '''Instantiates a self-play actor.

        Returns
        -------
        None.

        '''
        self.agent = Agent(path='./model_data/alpha_0.pt')

    def run(self,
            replay_buffer,
            update_signal,
            self_play_id,
            search_iters=SELF_PLAY_SEARCH_ITERS,
            markov_exp=SELF_PLAY_MARKOV_EXP,
            temp=TEMP,
            temp_thrshld=TEMP_THRSHLD):
        '''Starts an indefinite self-play loop.

        The games for self-play are generated via an ongoing Markov chain
        as described in randomDag.py. The self-play processes are
        synchronized with one another and with the train and evaluation
        processes via 'replay_buffer' and 'update_signal', respectively.
        'replay_buffer' stores the self-play data and triggers the start of
        training, while 'update_signal' triggers model parameter updates.

        Parameters
        ----------
        replay_buffer : ReplayBuffer
            remote actor for managing self-play data between self-play
            processes and the Train process. Also carries the signal to
            start training.
        update_signal : UpdateSignal
            remote actor for synchronization between self-play processes
            and evaluation processes. Triggers model parameter updates.
        self_play_id : int (nonnegative)
            unique identifier for the self-play process.
        search_iters : int (positive), optional
            the number of search iterations to perform during MCTS. The
            default is SELF_PLAY_SEARCH_ITERS.
        markov_exp : float, optional
            the exponent determining the number of steps taken in the
            Markov chain when generating games for self-play. The default
            is SELF_PLAY_MARKOV_EXP.
        temp : float (nonnegative), optional
            partially controls exploration. If 0, the policy is
            deterministic and the move with the highest visit count from
            MCTS is chosen. The default is TEMP.
        temp_thrshld : int (nonnegative), optional
            the number of moves after which the policy becomes
            deterministic, i.e., temp is set to 0. (See temp, above.) The
            default is TEMP_THRSHLD.

        Returns
        -------
        None.

        '''
        # put agent in evaluation mode
        self.agent.model.eval()
        # the action space
        actions = np.arange(MAX_NODES)
        # game state generator via an ongoing Markov chain
        state_generator = GameState.state_generator(markov_exp)
        # start indefinite self-play loop
        while True:
            # check for updates
            if ray.get(update_signal.get_update.remote(self_play_id)):
                # get current update_id
                update_id = ray.get(update_signal.get_update_id.remote())
                # load current alpha parameters
                self.agent.load_parameters(
                    path=f'./model_data/alpha_{update_id}.pt')
                # reset the update signal
                update_signal.clear_update.remote(self_play_id)
            # get a game and play
            initial_state = next(state_generator)
            root = PUCTNode(initial_state)
            states = []
            policies = []
            move_count = 0
            while not root.state.is_terminal_state():
                t = temp if move_count < temp_thrshld else 0
                policy = self.agent.MCTS(root, search_iters, t)
                move = np.random.choice(actions, p=policy)
                states.append(root.state.encoded_state)
                policies.append(policy)
                # make the chosen child the new root, detaching it from
                # its parent so the next MCTS starts from there
                root = root.edges[move]
                root.to_root()
                move_count += 1
            # update state values as seen from the current player's
            # perspective: the player who made the last move wins, so the
            # final recorded state gets value +1 and signs alternate
            # backwards from there
            if move_count % 2 == 0:
                values = [(-1)**(i + 1) for i in range(move_count)]
            else:
                values = [(-1)**i for i in range(move_count)]
            # construct training data from self-play
            train_data = [
                (state, policy, value)
                for state, policy, value in zip(states, policies, values)
            ]
            # add training data to replay buffer
            replay_buffer.add.remote(train_data)

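# --- Usage sketch (not part of the original module) -----------------------
# How a SelfPlay actor is presumably launched under Ray, per the docstring
# above. The ReplayBuffer/UpdateSignal constructors and the worker count are
# assumptions; the actual orchestration lives in the main script,
# asyn_training.py.
#
#   ray.init()
#   replay_buffer = ReplayBuffer.remote()
#   update_signal = UpdateSignal.remote()
#   workers = [ray.remote(SelfPlay).remote() for _ in range(4)]
#   for i, worker in enumerate(workers):
#       worker.run.remote(replay_buffer, update_signal, self_play_id=i)
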
class Evaluation(object):
    '''Abstract class for construction of remote evaluation actors.
    '''
    def __init__(self, update_id):
        '''Instantiates an Evaluation actor.

        Parameters
        ----------
        update_id : int (nonnegative)
            the current update_id. When an Evaluation actor is
            instantiated, the actor's alpha agent pulls from the most
            current alpha parameters.

        Returns
        -------
        None.

        '''
        self.alpha_agent = Agent(path=f'./model_data/alpha_{update_id}.pt')
        self.apprentice_agent = Agent(path='./model_data/apprentice.pt')

    def update_alpha_parameters(self, update_id):
        '''Updates the alpha parameters by promoting the current apprentice
        parameters. Used if an update is triggered after an evaluation.

        Parameters
        ----------
        update_id : int (positive)
            the index of the update to the alpha parameters. (Each time an
            update is triggered, the update_id is incremented by 1 and the
            new alpha parameters are saved indexed by the current
            update_id.)

        Returns
        -------
        None.

        '''
        self.apprentice_agent.save_parameters(
            path=f'./model_data/alpha_{update_id}.pt')

    def run(self,
            num_plays=PLAYS_PER_EVAL,
            search_iters=EVAL_PLAY_SEARCH_ITERS,
            markov_exp=EVAL_PLAY_MARKOV_EXP):
        '''Starts an evaluation.

        The evaluation process is synchronized with the self-play processes
        via instances of UpdateSignal and AsyncSignal in the main script,
        asyn_training.py. The UpdateSignal triggers an update in each of
        the self-play processes if the ratio of apprentice wins to total
        evaluation games surpasses the declared win ratio, while the
        AsyncSignal triggers the evaluation processes.

        Parameters
        ----------
        num_plays : int (positive), optional
            the number of evaluation games to play. The default is
            PLAYS_PER_EVAL.
        search_iters : int (positive), optional
            the number of search iterations to perform during MCTS. The
            default is EVAL_PLAY_SEARCH_ITERS.
        markov_exp : float, optional
            the exponent determining the number of steps taken in the
            Markov chain when generating games for evaluation. The default
            is EVAL_PLAY_MARKOV_EXP.

        Returns
        -------
        apprentice_wins : int (nonnegative)
            the number of apprentice wins.

        '''
        # put models in evaluation mode
        self.alpha_agent.model.eval()
        self.apprentice_agent.model.eval()
        # setup gameplay
        alpha = 0
        apprentice = 1
        actions = np.arange(MAX_NODES)
        state_generator = GameState.state_generator(markov_exp)
        apprentice_wins = 0
        # start evaluation game play
        for _ in range(num_plays):
            # uniformly randomly choose which agent plays first
            next_move = np.random.choice([alpha, apprentice])
            # play a randomly generated game of upset-downset
            game_state = next(state_generator)
            while not game_state.is_terminal_state():
                root = PUCTNode(game_state)
                policy = self.alpha_agent.MCTS(root, search_iters, 0) \
                    if next_move == alpha \
                    else self.apprentice_agent.MCTS(root, search_iters, 0)
                move = np.random.choice(actions, p=policy)
                game_state = root.edges[move].state
                next_move = 1 - next_move
            # decide winner: the player to move at a terminal state has no
            # moves and loses, so the other player wins
            winner = 1 - next_move
            if winner == apprentice:
                apprentice_wins += 1

        return apprentice_wins
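
# --- Usage sketch (not part of the original module) -----------------------
# One evaluation round as presumably orchestrated by asyn_training.py, per
# the docstrings above: play the evaluation games, and if the apprentice's
# win fraction clears the declared win ratio, increment update_id, promote
# the apprentice parameters, and signal the self-play actors. WIN_RATIO and
# the UpdateSignal method name set_update are assumptions.
#
#   evaluator = ray.remote(Evaluation).remote(update_id)
#   apprentice_wins = ray.get(evaluator.run.remote())
#   if apprentice_wins / PLAYS_PER_EVAL > WIN_RATIO:
#       update_id += 1
#       ray.get(evaluator.update_alpha_parameters.remote(update_id))
#       update_signal.set_update.remote(update_id)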