def self_play(self, first_color):
    """
    Execute one episode of self-play, starting with the player of
    first_color. As the game is played, each turn is added as a training
    example to train_examples. The game is played until it ends; the
    outcome is then used to assign a value to each example.
    """
    train_examples = []

    gomoku = Gomoku(self.n, self.n_in_row, first_color)
    mcts = MCTS("./models/checkpoint.pt", self.thread_pool_size,
                self.c_puct, self.num_mcts_sims, self.c_virtual_loss,
                self.action_size, self.mcts_use_gpu)

    episode_step = 0
    while True:
        episode_step += 1

        # temperature: explore for the first explore_num moves, then play greedily
        temp = self.temp if episode_step <= self.explore_num else 0
        prob = np.array(list(mcts.get_action_probs(gomoku, temp)))

        # generate samples; last_action and cur_player are shared by all
        # board symmetries of this position
        board = tuple_2d_to_numpy_2d(gomoku.get_board())
        last_action = gomoku.get_last_move()
        cur_player = gomoku.get_current_color()

        sym = self.get_symmetries(board, prob)
        for b, p in sym:
            train_examples.append([b, last_action, cur_player, p])

        # Dirichlet noise over the legal moves, as in AlphaZero
        legal_moves = list(gomoku.get_legal_moves())
        noise = 0.25 * np.random.dirichlet(
            self.dirichlet_alpha * np.ones(np.count_nonzero(legal_moves)))

        prob_noise = 0.75 * prob
        j = 0
        for i in range(len(prob_noise)):
            if legal_moves[i] == 1:
                prob_noise[i] += noise[j]
                j += 1

        prob_noise /= np.sum(prob_noise)
        action = np.random.choice(len(prob_noise), p=prob_noise)

        # execute move
        gomoku.execute_move(action)
        mcts.update_with_move(action)

        # is ended
        ended, winner = gomoku.get_game_status()
        if ended == 1:
            # each example becomes (board, last_action, cur_player, p, v),
            # where v = cur_player * winner is +1 for a win by cur_player,
            # -1 for a loss, and 0 for a draw
            return [(x[0], x[1], x[2], x[3], x[2] * winner)
                    for x in train_examples]
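self_play leans on a get_symmetries helper that is not shown in this excerpt. Below is a minimal sketch of what it could look like, written as a free function for readability (in the class it would be a method taking self). The helper name and the shapes are taken from the call site above; everything else, including the row-major layout of prob, is an assumption.

import numpy as np

def get_symmetries(board, prob):
    """Augment one sample with the 8 symmetries of a square board.

    Assumes board is an (n, n) array and prob is a flat policy vector of
    length n * n in row-major order, matching the call site above.
    """
    n = board.shape[0]
    prob_2d = prob.reshape(n, n)

    symmetries = []
    for k in range(4):  # rotations by 0, 90, 180 and 270 degrees
        b = np.rot90(board, k)
        p = np.rot90(prob_2d, k)
        symmetries.append((b, p.ravel()))
        symmetries.append((np.fliplr(b), np.fliplr(p).ravel()))  # mirror of each rotation
    return symmetries

Each position then yields eight training examples instead of one, the standard AlphaZero-style augmentation for board games with square symmetry.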
def play_with_human(self, human_first=True, checkpoint_name="best_checkpoint"):
    # run the GUI event loop in a background thread
    t = threading.Thread(target=self.gomoku_gui.loop)
    t.start()

    # load the best model
    libtorch_best = NeuralNetwork('./models/{}.pt'.format(checkpoint_name),
                                  self.libtorch_use_gpu,
                                  self.num_mcts_threads * 2)
    mcts_best = MCTS(libtorch_best, self.num_mcts_threads * 2, self.c_puct,
                     self.num_mcts_sims * 4, self.c_virtual_loss,
                     self.action_size)

    # create gomoku game
    human_color = self.gomoku_gui.get_human_color()
    gomoku = Gomoku(self.n, self.n_in_row,
                    human_color if human_first else -human_color)

    # colors are +1/-1; player_index + 1 indexes into this list, so the
    # middle slot (color 0) is a placeholder that is never read
    players = ["alpha", None, "human"] if human_color == 1 \
        else ["human", None, "alpha"]
    player_index = human_color if human_first else -human_color

    while True:
        player = players[player_index + 1]

        # select move
        if player == "alpha":
            prob = mcts_best.get_action_probs(gomoku)
            best_move = int(np.argmax(np.array(list(prob))))
            self.gomoku_gui.execute_move(player_index, best_move)
        else:
            self.gomoku_gui.set_is_human(True)
            # wait for the human to move via the GUI
            while self.gomoku_gui.get_is_human():
                time.sleep(0.1)
            best_move = self.gomoku_gui.get_human_move()

        # execute move
        gomoku.execute_move(best_move)

        # check game status
        ended, winner = gomoku.get_game_status()
        if ended == 1:
            break

        # update tree search
        mcts_best.update_with_move(best_move)

        # next player
        player_index = -player_index

    if winner == 0:
        print("DRAW")
    else:
        print("HUMAN WIN" if winner == human_color else "ALPHA ZERO WIN")
    t.join()
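One subtlety worth spelling out is the players list: colors are +1 and -1, so player_index + 1 maps them to the two outer slots of a three-element list, and the middle slot (which would correspond to color 0, an empty cell) is a None placeholder that is never looked up. A tiny self-contained illustration of the convention:

# index = color + 1 maps color -1 to slot 0 and color +1 to slot 2;
# slot 1 (color 0) is a placeholder and is never read
human_color = 1
players = ["alpha", None, "human"] if human_color == 1 \
    else ["human", None, "alpha"]

for color in (-1, +1):
    print(color, "->", players[color + 1])  # -1 -> alpha, +1 -> human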
import time

import numpy as np

if __name__ == "__main__":
    # Gomoku and MCTS come from this project's bindings, imported elsewhere
    # in the module
    gomoku = Gomoku(15, 5, 1)

    # set up a test position: player 1 has four in a row at 40..43 (row 2),
    # player -1 has four in a row at 96..99 (row 6)
    gomoku.execute_move(0 + 40)
    gomoku.execute_move(99)
    gomoku.execute_move(1 + 40)
    gomoku.execute_move(98)
    gomoku.execute_move(2 + 40)
    gomoku.execute_move(97)
    gomoku.execute_move(3 + 40)
    gomoku.execute_move(96)
    gomoku.display()

    mcts = MCTS("./models/checkpoint.pt", 4, 2.5, 1600, 2.5, 225, True)
    print("RUNNING")

    while True:
        time_start = time.time()
        res = mcts.get_action_probs(gomoku, 1)
        time_end = time.time()
        print('get_action_probs', time_end - time_start)

        print(list(res))
        best_action = int(np.argmax(np.array(list(res))))
        print(best_action, res[best_action])

        # -1 discards the search tree so the next call starts fresh
        mcts.update_with_move(-1)
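The hard-coded moves in this test are flat row-major indices on the 15x15 board, consistent with the action_size of 225 passed to MCTS. The hypothetical helpers below (to_rc and to_action are not part of the project) make the encoding explicit:

N = 15  # board size used in the test above

def to_rc(action, n=N):
    """Flat action index -> (row, col), assuming row-major encoding."""
    return divmod(action, n)

def to_action(row, col, n=N):
    """(row, col) -> flat action index."""
    return row * n + col

# player 1's stones sit on row 2, player -1's on row 6:
print([to_rc(a) for a in (40, 41, 42, 43)])  # [(2, 10), (2, 11), (2, 12), (2, 13)]
print([to_rc(a) for a in (99, 98, 97, 96)])  # [(6, 9), (6, 8), (6, 7), (6, 6)]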