def __init__(self, init_model=None, transfer_model=None):
    self.resnet_block = 19  # number of residual block structures in the resnet
    # params of the board and the game
    self.board_width = 11
    self.board_height = 11
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 1e-3
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 500000  # memory size
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1  # play n games for each network training step
    self.check_freq = 50
    self.game_batch_num = 50000000  # total games to train
    self.best_win_ratio = 0.0
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 200

    if (init_model is not None) and os.path.exists(init_model + ".index"):
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(
            self.board_width,
            self.board_height,
            block=self.resnet_block,
            init_model=init_model,
            cuda=True,
        )
    elif (transfer_model is not None) and os.path.exists(transfer_model + ".index"):
        # start training from a pre-trained policy-value net
        self.policy_value_net = PolicyValueNet(
            self.board_width,
            self.board_height,
            block=self.resnet_block,
            transfer_model=transfer_model,
            cuda=True,
        )
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(
            self.board_width, self.board_height, block=self.resnet_block, cuda=True
        )

    self.mcts_player = MCTSPlayer(
        policy_value_function=self.policy_value_net.policy_value_fn_random,
        action_fc=self.policy_value_net.action_fc_test,
        evaluation_fc=self.policy_value_net.evaluation_fc2_test,
        c_puct=self.c_puct,
        n_playout=self.n_playout,
        is_selfplay=True,
    )
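# Side note on the `+ ".index"` checks above: that pattern suggests TensorFlow-style
# checkpoints, where saving a model prefix such as "policy.model" produces
# "policy.model.index" and "policy.model.data-*" files on disk. The helper below is
# only an illustrative sketch of that existence check, not part of the original code.
import os

def checkpoint_exists(prefix):
    """Return True if a checkpoint with this prefix appears to exist on disk."""
    return (prefix is not None) and os.path.exists(prefix + ".index")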
def run(start_player=0, is_shown=1):
    # run a gomoku game with AI
    # you can set human vs AI or AI vs AI
    n = 5
    width, height = 15, 15
    # model_file = 'model_15_15_5/best_policy.model'
    # width, height = 6, 6
    # model_file = 'model/best_policy.model'
    # width, height = 11, 11
    # model_file = 'model/best_policy.model'
    model_file = 'training/model_best/policy.model'
    # model_file = 'training/best_policy.model'

    p = os.getcwd()
    model_file = path.join(p, model_file)

    board = Board(width=width, height=height, n_in_row=n)
    game = Game(board)

    mcts_player = MCTS_pure(5, 8000)

    best_policy = PolicyValueNet(board_width=width,
                                 board_height=height,
                                 block=19,
                                 init_model=model_file,
                                 cuda=True)

    # alpha_zero vs alpha_zero
    # best_policy.save_numpy(best_policy.network_all_params)
    # best_policy.load_numpy(best_policy.network_oppo_all_params)

    alpha_zero_player = MCTSPlayer(
        policy_value_function=best_policy.policy_value_fn_random,
        action_fc=best_policy.action_fc_test,
        evaluation_fc=best_policy.evaluation_fc2_test,
        c_puct=5,
        n_playout=400,
        is_selfplay=False)

    # alpha_zero_player_oppo = MCTSPlayer(policy_value_function=best_policy.policy_value_fn_random,
    #                                     action_fc=best_policy.action_fc_test_oppo,
    #                                     evaluation_fc=best_policy.evaluation_fc2_test_oppo,
    #                                     c_puct=5,
    #                                     n_playout=400,
    #                                     is_selfplay=False)

    # human player, input your move in the format: 2,3
    # set start_player=0 for human first
    # play in terminal without GUI
    # human = Human()
    # win = game.start_play(human, alpha_zero_player, start_player=start_player, is_shown=is_shown, print_prob=True)
    # return win

    # play in GUI
    game.start_play_with_UI(alpha_zero_player)  # play with AlphaZero
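# Minimal sketch of how run() above could be invoked as a script entry point;
# the KeyboardInterrupt handling is an assumption (a common convenience in
# interactive play scripts), not something the function itself requires.
if __name__ == "__main__":
    try:
        run(start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print("\nquit")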
def __init__(self):
    # def run(start_player=0, is_shown=1):
    self.start_player = 0
    self.is_shown = 1

    menu = Gui_menu()

    # run a gomoku game with AI
    # you can set human vs AI or AI vs AI
    self.n = 5  # rule: five in a row

    if menu.rule == 11:
        width, height = 11, 11  # board width and height
        model_file = 'model_11_11_5/best_policy.model'
    elif menu.rule == 15:
        width, height = 15, 15  # board width and height
        model_file = 'model_15_15_5/best_policy.model'

    p = os.getcwd()  # get the current working directory
    model_file = path.join(p, model_file)  # join it with the model file name

    board = Board(width=width, height=height, n_in_row=self.n)  # game board
    game = Game(board)

    mcts_player = MCTS_pure(5, 400)

    best_policy = PolicyValueNet(board_width=width,
                                 board_height=height,
                                 block=19,
                                 init_model=model_file,
                                 cuda=True)

    # alpha_zero vs alpha_zero
    # best_policy.save_numpy(best_policy.network_all_params)
    # best_policy.load_numpy(best_policy.network_oppo_all_params)

    alpha_zero_player = MCTSPlayer(
        policy_value_function=best_policy.policy_value_fn_random,
        action_fc=best_policy.action_fc_test,
        evaluation_fc=best_policy.evaluation_fc2_test,
        c_puct=5,
        n_playout=400,
        is_selfplay=False)

    # alpha_zero_player_oppo = MCTSPlayer(policy_value_function=best_policy.policy_value_fn_random,
    #                                     action_fc=best_policy.action_fc_test_oppo,
    #                                     evaluation_fc=best_policy.evaluation_fc2_test_oppo,
    #                                     c_puct=5,
    #                                     n_playout=400,
    #                                     is_selfplay=False)

    # human player, input your move in the format: 2,3
    # set start_player=0 for human first
    # play in terminal without GUI
    # human = Human()
    # win = game.start_play(human, alpha_zero_player, start_player=start_player, is_shown=is_shown, print_prob=True)
    # return win

    # play in GUI
    game.start_play_with_UI(alpha_zero_player)
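# Hedged usage sketch: the name of the class this __init__ belongs to is not shown
# above, so `HumanPlayGui` below is a hypothetical placeholder. Constructing the
# object runs the whole flow directly from __init__ (menu -> board -> model -> GUI game).
if __name__ == "__main__":
    HumanPlayGui()  # hypothetical class name; __init__ itself starts the GUI game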
def __init__(self, init_model=None, transfer_model=None):
    self.game_count = 0  # count of total games played
    self.resnet_block = 19  # number of residual block structures in the resnet
    # params of the board and the game
    self.board_width = 11
    self.board_height = 11
    self.n_in_row = 5
    self.board = Board(
        width=self.board_width, height=self.board_height, n_in_row=self.n_in_row
    )
    self.game = Game(self.board)
    # training params
    self.learn_rate = 1e-3
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 500000  # memory size, should be larger with a bigger board
    # in the paper the buffer stores 500,000 games; here, with an 11x11 board,
    # it can hold only around 2,000 games
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.game_batch_num = 10000000  # total games to train
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    # only for monitoring the progress of training
    self.pure_mcts_playout_num = 200
    # record the win rate against pure MCTS:
    # once the win ratio reaches 1.0, the pure MCTS playout number is
    # increased by 100 and the win ratio is reset to 0
    self.best_win_ratio = 0.0

    # GPU setting
    # be careful when assigning GPUs; it depends on how much GPU and CPU memory you have
    # `rank` is assumed to be the (MPI) process rank, defined elsewhere in this module
    if rank in {0, 1, 2}:
        cuda = True
    elif rank in range(10, 30):
        cuda = True
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    else:
        cuda = False
    # cuda = True

    if (init_model is not None) and os.path.exists(init_model + ".index"):
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(
            self.board_width,
            self.board_height,
            block=self.resnet_block,
            init_model=init_model,
            cuda=cuda,
        )
    elif (transfer_model is not None) and os.path.exists(transfer_model + ".index"):
        # start training from a pre-trained policy-value net
        self.policy_value_net = PolicyValueNet(
            self.board_width,
            self.board_height,
            block=self.resnet_block,
            transfer_model=transfer_model,
            cuda=cuda,
        )
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(
            self.board_width, self.board_height, block=self.resnet_block, cuda=cuda
        )

    self.mcts_player = MCTSPlayer(
        policy_value_function=self.policy_value_net.policy_value_fn_random,
        action_fc=self.policy_value_net.action_fc_test,
        evaluation_fc=self.policy_value_net.evaluation_fc2_test,
        c_puct=self.c_puct,
        n_playout=self.n_playout,
        is_selfplay=True,
    )
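# Hedged sketch of launching this training pipeline. The class name `TrainPipeline`
# and its `run()` method are assumptions modelled on typical AlphaZero-Gomoku training
# scripts; only __init__ is shown above, so adjust the names to the actual class.
if __name__ == "__main__":
    training_pipeline = TrainPipeline(init_model=None, transfer_model=None)
    training_pipeline.run()  # assumed self-play / training / evaluation loop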