def self_play(self, first_color):
    """
    Executes one episode of self-play, starting with the player given by
    first_color. As the game is played, each turn is added as a training
    example to train_examples. The game is played until it ends; the final
    outcome is then used to assign a value to each example in train_examples.
    """
    train_examples = []

    gomoku = Gomoku(self.n, self.n_in_row, first_color)
    mcts = MCTS("./models/checkpoint.pt", self.thread_pool_size,
                self.c_puct, self.num_mcts_sims, self.c_virtual_loss,
                self.action_size, self.mcts_use_gpu)

    episode_step = 0
    while True:
        episode_step += 1

        # prob
        temp = self.temp if episode_step <= self.explore_num else 0
        prob = np.array(list(mcts.get_action_probs(gomoku, temp)))

        # generate sample
        board = tuple_2d_to_numpy_2d(gomoku.get_board())
        last_action = gomoku.get_last_move()
        cur_player = gomoku.get_current_color()

        sym = self.get_symmetries(board, prob)
        for b, p in sym:
            train_examples.append([b, last_action, cur_player, p])

        # dirichlet noise
        legal_moves = list(gomoku.get_legal_moves())
        noise = 0.25 * np.random.dirichlet(
            self.dirichlet_alpha * np.ones(np.count_nonzero(legal_moves)))

        prob_noise = 0.75 * prob
        j = 0
        for i in range(len(prob_noise)):
            if legal_moves[i] == 1:
                prob_noise[i] += noise[j]
                j += 1
        prob_noise /= np.sum(prob_noise)

        action = np.random.choice(len(prob_noise), p=prob_noise)

        # execute move
        gomoku.execute_move(action)
        mcts.update_with_move(action)

        # is ended
        ended, winner = gomoku.get_game_status()
        if ended == 1:
            # b, last_action, cur_player, p, v
            return [(x[0], x[1], x[2], x[3], x[2] * winner)
                    for x in train_examples]
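# NOTE: get_symmetries() is not part of this excerpt. The sketch below only
# illustrates how such a helper could generate the 8 dihedral symmetries of the
# board together with a consistently transformed policy vector (standard
# AlphaZero-style data augmentation). It assumes prob has length n * n and is
# NOT the project's actual implementation.
def get_symmetries(self, board, prob):
    n = board.shape[0]
    prob_board = np.asarray(prob).reshape(n, n)
    symmetries = []
    for k in range(4):
        # rotate board and policy by the same multiple of 90 degrees
        rot_b = np.rot90(board, k)
        rot_p = np.rot90(prob_board, k)
        symmetries.append((rot_b.copy(), rot_p.ravel().copy()))
        # and the mirrored version of each rotation
        symmetries.append((np.fliplr(rot_b).copy(), np.fliplr(rot_p).ravel().copy()))
    return symmetries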
def play_with_human(self, human_first=True, checkpoint_name="best_checkpoint"):
    t = threading.Thread(target=self.gomoku_gui.loop)
    t.start()

    # load best model
    libtorch_best = NeuralNetwork('./models/{}.pt'.format(checkpoint_name),
                                  self.libtorch_use_gpu, self.num_mcts_threads * 2)
    mcts_best = MCTS(libtorch_best, self.num_mcts_threads * 2,
                     self.c_puct, self.num_mcts_sims * 4,
                     self.c_virtual_loss, self.action_size)

    # create gomoku game
    human_color = self.gomoku_gui.get_human_color()
    gomoku = Gomoku(self.n, self.n_in_row,
                    human_color if human_first else -human_color)

    players = ["alpha", None, "human"] if human_color == 1 else ["human", None, "alpha"]
    player_index = human_color if human_first else -human_color

    while True:
        player = players[player_index + 1]

        # select move
        if player == "alpha":
            prob = mcts_best.get_action_probs(gomoku)
            best_move = int(np.argmax(np.array(list(prob))))
            self.gomoku_gui.execute_move(player_index, best_move)
        else:
            self.gomoku_gui.set_is_human(True)
            # wait for human action
            while self.gomoku_gui.get_is_human():
                time.sleep(0.1)
            best_move = self.gomoku_gui.get_human_move()

        # execute move
        gomoku.execute_move(best_move)

        # check game status
        ended, winner = gomoku.get_game_status()
        if ended == 1:
            break

        # update tree search
        mcts_best.update_with_move(best_move)

        # next player
        player_index = -player_index

    if winner == human_color:
        print("HUMAN WIN")
    elif winner == -human_color:
        print("ALPHA ZERO WIN")
    else:
        print("DRAW")
    t.join()
def _contest(self, network1, network2, first_player, show):
    # create MCTS
    player1 = MCTS(network1, self.num_mcts_threads, self.c_puct,
                   self.num_mcts_sims, self.c_virtual_loss, self.action_size)
    player2 = MCTS(network2, self.num_mcts_threads, self.c_puct,
                   self.num_mcts_sims, self.c_virtual_loss, self.action_size)

    # prepare
    players = [player2, None, player1]
    player_index = first_player
    gomoku = Gomoku(self.n, self.n_in_row, first_player)
    if show:
        self.gomoku_gui.reset_status()

    # play
    while True:
        player = players[player_index + 1]

        # select best move
        prob = player.get_action_probs(gomoku)
        best_move = int(np.argmax(np.array(list(prob))))

        # execute move
        gomoku.execute_move(best_move)
        if show:
            self.gomoku_gui.execute_move(player_index, best_move)

        # check game status
        ended, winner = gomoku.get_game_status()
        if ended == 1:
            return winner

        # update search tree
        player1.update_with_move(best_move)
        player2.update_with_move(best_move)

        # next player
        player_index = -player_index
def self_play(self, first_color, libtorch, show):
    """
    Executes one episode of self-play, starting with the player given by
    first_color. As the game is played, each turn is added as a training
    example to train_examples. The game is played until it ends; the final
    outcome is then used to assign a value to each example in train_examples.
    """
    train_examples = []

    player1 = MCTS(libtorch, self.num_mcts_threads, self.c_puct,
                   self.num_mcts_sims, self.c_virtual_loss, self.action_size)
    player2 = MCTS(libtorch, self.num_mcts_threads, self.c_puct,
                   self.num_mcts_sims, self.c_virtual_loss, self.action_size)
    players = [player2, None, player1]
    player_index = 1

    gomoku = Gomoku(self.n, self.n_in_row, first_color)
    if show:
        self.gomoku_gui.reset_status()

    episode_step = 0
    while True:
        episode_step += 1
        player = players[player_index + 1]

        # get action prob
        if episode_step <= self.num_explore:
            prob = np.array(list(player.get_action_probs(gomoku, self.temp)))
        else:
            prob = np.array(list(player.get_action_probs(gomoku, 0)))

        # generate sample
        board = tuple_2d_to_numpy_2d(gomoku.get_board())
        last_action = gomoku.get_last_move()
        cur_player = gomoku.get_current_color()

        sym = self.get_symmetries(board, prob, last_action)
        for b, p, a in sym:
            train_examples.append([b, a, cur_player, p])

        # dirichlet noise
        legal_moves = list(gomoku.get_legal_moves())
        noise = 0.1 * np.random.dirichlet(
            self.dirichlet_alpha * np.ones(np.count_nonzero(legal_moves)))

        prob = 0.9 * prob
        j = 0
        for i in range(len(prob)):
            if legal_moves[i] == 1:
                prob[i] += noise[j]
                j += 1
        prob /= np.sum(prob)

        # execute move
        action = np.random.choice(len(prob), p=prob)
        if show:
            self.gomoku_gui.execute_move(cur_player, action)
        gomoku.execute_move(action)
        player1.update_with_move(action)
        player2.update_with_move(action)

        # next player
        player_index = -player_index

        # is ended
        ended, winner = gomoku.get_game_status()
        if ended == 1:
            # b, last_action, cur_player, p, v
            return [(x[0], x[1], x[2], x[3], x[2] * winner)
                    for x in train_examples]
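# The noise-mixing loop above applies AlphaZero's root exploration noise,
# pi' = (1 - eps) * pi + eps * Dir(alpha), over the legal moves only.
# Below is a vectorized equivalent as a minimal sketch; the helper name and
# signature are illustrative only and are not part of the project.
def add_dirichlet_noise(prob, legal_moves, alpha, eps=0.1):
    prob = np.asarray(prob, dtype=np.float64).copy()
    legal = np.asarray(legal_moves) == 1
    # one Dirichlet component per legal move
    noise = np.random.dirichlet(alpha * np.ones(int(legal.sum())))
    prob *= (1.0 - eps)
    prob[legal] += eps * noise
    # renormalize so the result is a valid distribution for np.random.choice
    return prob / prob.sum()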
import time

if __name__ == "__main__":
    gomoku = Gomoku(15, 5, 1)
    gomoku.execute_move(0 + 40)
    gomoku.execute_move(99)
    gomoku.execute_move(1 + 40)
    gomoku.execute_move(98)
    gomoku.execute_move(2 + 40)
    gomoku.execute_move(97)
    gomoku.execute_move(3 + 40)
    gomoku.execute_move(96)
    gomoku.display()

    mcts = MCTS("./models/checkpoint.pt", 4, 2.5, 1600, 2.5, 225, True)

    print("RUNNING")
    while True:
        time_start = time.time()
        res = mcts.get_action_probs(gomoku, 1)
        time_end = time.time()
        print('get_action_probs', time_end - time_start)

        print(list(res))
        best_action = int(np.argmax(np.array(list(res))))
        print(best_action, res[best_action])

        mcts.update_with_move(-1)
def learn(self):
    # train the model by self-play
    t = threading.Thread(target=self.gomoku_gui.loop)
    t.start()

    if os.path.exists('./models/checkpoint'):
        print("loading checkpoint...")
        self.nnet.load_model('models', "checkpoint")
        self.load_samples("models", "checkpoint")

    # generate .pt for libtorch
    self.nnet.save_model('models', "checkpoint")
    self.nnet.save_model('models', "best_checkpoint")

    for i in range(1, self.num_iters + 1):
        print("ITER ::: " + str(i))

        # self play
        first_color = 1
        for eps in range(1, self.num_eps + 1):
            examples = self.self_play(first_color)
            self.examples_buffer.extend(examples)
            first_color = -first_color

            print("EPS :: " + str(eps) + ", EXAMPLES :: " + str(len(examples)))

        # sample train data
        if len(self.examples_buffer) < self.batch_size:
            continue
        print("sampling...")
        train_data = sample(self.examples_buffer, self.batch_size)

        # train neural network
        self.nnet.train(train_data)
        self.nnet.save_model('models', "checkpoint")

        if i % self.check_freq == 0:
            self.save_samples("models", "checkpoint")

            # compare performance
            mcts = MCTS("./models/checkpoint.pt", self.thread_pool_size,
                        self.c_puct, self.num_mcts_sims, self.c_virtual_loss,
                        self.action_size, self.mcts_use_gpu)
            mcts_best = MCTS("./models/best_checkpoint.pt", self.thread_pool_size,
                             self.c_puct, self.num_mcts_sims, self.c_virtual_loss,
                             self.action_size, self.mcts_use_gpu)
            one_won, two_won, draws = self.contest(mcts, mcts_best, self.contest_num)
            print("NEW/PREV WINS : %d / %d ; DRAWS : %d" % (one_won, two_won, draws))

            if one_won + two_won > 0 and float(one_won) / (
                    one_won + two_won) > self.update_threshold:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_model('models', "best_checkpoint")
            else:
                print('REJECTING NEW MODEL')

            del mcts
            del mcts_best

    t.join()
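# learn() above relies on self.contest(mcts, mcts_best, num) returning the
# tuple (one_won, two_won, draws). That method is not shown in this excerpt;
# the sketch below only illustrates the expected shape of such a wrapper and
# assumes a hypothetical self._play_one_game(first_mcts, second_mcts) that
# plays a single game and returns 1, -1, or 0 (draw) from the perspective of
# its first argument.
def contest(self, mcts1, mcts2, num_contest):
    one_won, two_won, draws = 0, 0, 0
    for i in range(num_contest):
        # alternate which tree searches first to remove first-move bias
        if i % 2 == 0:
            res = self._play_one_game(mcts1, mcts2)    # hypothetical helper
        else:
            res = -self._play_one_game(mcts2, mcts1)   # flip sign back to mcts1's view
        if res == 1:
            one_won += 1
        elif res == -1:
            two_won += 1
        else:
            draws += 1
    return one_won, two_won, draws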
# Fragment of the tournament loop: the enclosing "for" loop and the leading
# "if" branch (which alternates who plays first) are omitted in this excerpt.
            res = self.contest(player1, player2, show)
            if res == 1:
                win1_cnt += 1
            elif res == -1:
                win2_cnt += 1
            else:
                draw_cnt += 1
        else:
            res = self.contest(player2, player1, show)
            if res == 1:
                win2_cnt += 1
            elif res == -1:
                win1_cnt += 1
            else:
                draw_cnt += 1

    print("\nTournament finished.")
    print("Player1 vs. player2: {:d}/{:d}/{:d} (win1/win2/draw)".format(
        win1_cnt, win2_cnt, draw_cnt))


# module-level demo code
network1 = NeuralNetwork("../models/model_8x8_5_2000iters_cpu.pt", False, 4)
network2 = NeuralNetwork("../models/model_8x8_5_2000iters_cpu.pt", False, 4)

game = Game()
human = Human()
mcts1 = MCTS(4, 100000, 5, 3)
mcts2 = MCTS(4, 100000, 5, 3)
alphazero1 = AlphaZero(network1, 4, 800, 5, 3)
alphazero2 = AlphaZero(network2, 4, 800, 5, 3)

game.contest(mcts1, mcts2)
# game.tournament(mcts1, mcts2, round=10, show=True)