def policy_evaluate(self, n_games=10): """ 通过和纯MCTS进行对弈来评估训练好的策略 """ current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout) pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num) win_cnt = defaultdict(int) for i in range(n_games): winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i % 2, is_shown=0) win_cnt[winner] += 1 win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games print("num_playouts:{}, win: {}, lose: {}, tie:{}".format( self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1])) return win_ratio
def play(self):
    model_file = "current.model"
    best_policy = PolicyValueNet(self.width, self.height, model_file)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=300)
    pure_player = MCTS_Pure(c_puct=5, n_playout=300)
    human1 = Human()
    human2 = Human()
    # self.show()
    win_cnt = defaultdict(int)
    for i in range(10):
        winner = self.start_play(mcts_player, pure_player,
                                 start_player=(i % 2), is_shown=1)
        win_cnt[winner] += 1
    # ties are recorded under winner == -1 elsewhere in this codebase
    print("win", win_cnt[1], "lose", win_cnt[2], "tie", win_cnt[-1])
def run():
    n_row = 5
    width, height = 11, 11
    try:
        board = Board(width=width, height=height, n_in_row=n_row)
        game = Game(board)
        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width, height, n_row)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def policy_evaluate(self, n_games=10): """ Evaluate the trained policy by playing against the pure MCTS player Note: this is only for monitoring the progress of training """ current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout) pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num) win_cnt = defaultdict(int) for i in range(n_games): winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i % 2, is_shown=0) win_cnt[winner] += 1 win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games print("num_playouts:{}, win: {}, lose: {}, tie:{}".format( self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1])) return win_ratio
def MCTSput(board, who, n_playout=400):
    # print("n_playout=", n_playout)
    # input("press any key to continue")
    player = MCTSPlayer(c_puct=5, n_playout=n_playout)
    # set the current player; only needed when using do_move
    # board.set_current_player(who)
    # if this is the opening move, play a random square
    last = board.getLast()
    if last == [-1, -1]:
        row = random.randint(2, 5)
        col = random.randint(2, 5)
        if board[row][col] == 0:
            move = board.location_to_move((row, col))
            if board.do_move(move):
                return True
        return False
    # not the opening move: let MCTS pick
    move = player.get_action(board)
    # print(board.current_player, who)
    # input("press any key to continue")
    return board.do_move(move)
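# Hedged usage sketch for MCTSput above: alternate two players on one
# board until the game ends. board.game_end() returning (ended, winner)
# is an assumption borrowed from the common Board API in this codebase.
def mcts_self_play_demo(board):
    who = 1
    while True:
        if not MCTSput(board, who, n_playout=400):
            return None  # move was rejected
        end, winner = board.game_end()
        if end:
            return winner
        who = 2 if who == 1 else 1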
def run():
    n = 5
    width, height = 8, 8
    model_file = './best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a
        # MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # pure MCTS player (much weaker even with a larger n_playout)
        mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        # human vs. AI
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
        # uncomment for AI self-play
        # game.start_self_play(mcts_player, is_shown=1)
        # uncomment for AI vs. AI
        # game.start_play(mcts_player2, mcts_player, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    self.board_width = 6
    self.board_height = 6
    self.config = GameConfig()
    self.board = Board(self.config)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3  # base learning rate
    # KL divergence measures how close two probability distributions are;
    # the multiplier keeps each update near the kl_targ range
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 1500  # num of simulations for each move
    self.c_puct = 5  # UCT exploration constant
    self.buffer_size = 10000
    self.batch_size = 200  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    # self.check_freq = 25
    # self.game_batch_num = 1500
    self.game_batch_num = 5000
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 5000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 10
    self.board_height = 10
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3  # base learning rate
    self.lr_multiplier = 1.0  # adaptively adjusted based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5  # trade-off between exploitation and exploration
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)  # double-ended queue
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02  # early-stopping target
    self.check_freq = 50  # evaluate the policy-value net every 50 batches
    self.game_batch_num = 500  # number of training batches
    # best win ratio so far, used to decide whether a better model was found
    self.best_win_ratio = 0.0
    # playout budget of the weak (pure MCTS) opponent used for evaluation
    self.pure_mcts_playout_num = 1000
    if init_model:
        # load the policy network from init_model
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # train a new policy network
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    # AI player; is_selfplay=1 because we are generating training data
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 15
    self.board_height = 15
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 800  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    # stores the MCTS self-play data after augmentation
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    # this should probably be 400 or 800
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000  # set to 1000 here
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
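# Hedged sketch of the policy_update() step that the hyper-parameters
# above (learn_rate, lr_multiplier, epochs, kl_targ) drive, following the
# reference AlphaZero_Gomoku implementation: sample a mini-batch, train
# for a few epochs with early stopping when the KL divergence between the
# old and new policies exceeds 4x the target, then adapt the learning-rate
# multiplier. Assumes `import random` and `import numpy as np`.
def policy_update(self):
    mini_batch = random.sample(self.data_buffer, self.batch_size)
    state_batch = [data[0] for data in mini_batch]
    mcts_probs_batch = [data[1] for data in mini_batch]
    winner_batch = [data[2] for data in mini_batch]
    old_probs, old_v = self.policy_value_net.policy_value(state_batch)
    for i in range(self.epochs):
        loss, entropy = self.policy_value_net.train_step(
            state_batch, mcts_probs_batch, winner_batch,
            self.learn_rate * self.lr_multiplier)
        new_probs, new_v = self.policy_value_net.policy_value(state_batch)
        kl = np.mean(np.sum(
            old_probs * (np.log(old_probs + 1e-10)
                         - np.log(new_probs + 1e-10)),
            axis=1))
        if kl > self.kl_targ * 4:  # KL diverging badly: stop early
            break
    # adaptively adjust the learning rate toward the KL target
    if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
        self.lr_multiplier /= 1.5
    elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
        self.lr_multiplier *= 1.5
    return loss, entropy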
def run():
    n = 5
    # width, height = 8, 8
    width, height = 16, 16
    # model_file = 'best_policy_8_8_5.model'
    model_file = './tfData/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the policy
        best_policy = PolicyValueNet(width, height, model_file)
        # build the MCTS player from the policy function
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        winner, play_data = game.start_play(human, mcts_player,
                                            start_player=1, is_shown=1)
        play_data = list(play_data)[:]
        play_data = get_equi_data(play_data, height, width)
        backupSave(play_data, "human")
    except KeyboardInterrupt:
        print('\n\rquit')
def local_thread_func(thread_id, shared_queue, net_lock, data_lock):
    from policy_value_net_tensorflow import PolicyValueNet
    # load the model file under the net lock
    logging.info("selfplay process {} ask net lock".format(thread_id))
    with net_lock:
        logging.info("selfplay process {} get net lock".format(thread_id))
        current_policy = PolicyValueNet(self.board_width,
                                        self.board_height,
                                        model_dir,
                                        model_file=current_model_name)
        logging.info("selfplay process {} release net lock".format(thread_id))
    local_board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
    local_game = Game(local_board)
    local_mcts_player = MCTSPlayer(current_policy.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
    logging.info("selfplay process {} start {}th selfplay".format(
        thread_id, index))
    winner, play_data = local_game.start_self_play(local_mcts_player,
                                                   temp=self.temp)
    logging.info("selfplay process {} finish {}th selfplay".format(
        thread_id, index))
    play_data = list(play_data)
    play_data = self.get_equi_data(play_data)
    # append the game data under the data lock
    logging.info("selfplay process {} ask data lock".format(thread_id))
    with data_lock:
        logging.info("selfplay process {} get data lock".format(thread_id))
        shared_queue.extend(play_data)
        while len(shared_queue) > self.buffer_num:
            shared_queue.pop(0)
        logging.info("selfplay process {} release data lock".format(thread_id))
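# Hedged sketch of the get_equi_data() augmentation called above, after
# the reference implementation: each (state, mcts_prob, winner) sample is
# expanded into the 8 symmetries of the square board (4 rotations, each
# also flipped). Assumes numpy and states shaped (channels, H, W).
def get_equi_data(self, play_data):
    extend_data = []
    for state, mcts_prob, winner in play_data:
        for i in [1, 2, 3, 4]:
            # rotate counterclockwise
            equi_state = np.array([np.rot90(s, i) for s in state])
            equi_mcts_prob = np.rot90(np.flipud(
                mcts_prob.reshape(self.board_height, self.board_width)), i)
            extend_data.append((equi_state,
                                np.flipud(equi_mcts_prob).flatten(),
                                winner))
            # flip horizontally
            equi_state = np.array([np.fliplr(s) for s in equi_state])
            equi_mcts_prob = np.fliplr(equi_mcts_prob)
            extend_data.append((equi_state,
                                np.flipud(equi_mcts_prob).flatten(),
                                winner))
    return extend_data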
def __init__(self, init_model=None):
    # board params
    self.board_width = 8
    self.board_height = 8
    # self.n_in_row = 5
    self.board = chessboard(row=self.board_width, col=self.board_height)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000000
    self.batch_size = 512  # mini-batch size
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 2
    # number of self-play games
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # pure MCTS, used as the evaluation baseline
    self.pure_mcts_playout_num = 400
    if init_model:
        # resume from a pre-trained model
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model,
                                               use_gpu=True)
    else:
        # train from scratch
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               use_gpu=True)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def single_game_play(num, initmode):
    print('Starting worker {}'.format(num))
    board = Board(width=board_width, height=board_height, n_in_row=n_in_row)
    game = Game(board)
    if initmode:
        policy_value_net = PolicyValueNet(board_width, board_height,
                                          model_file=initmode)
    else:
        policy_value_net = PolicyValueNet(board_width, board_height)
    mcts_player = MCTSPlayer(policy_value_net.policy_value_fn,
                             c_puct=c_puct,
                             n_playout=n_playout,
                             is_selfplay=1)
    winner, play_data = game.start_self_play(mcts_player, temp=temp)
    # do not consume play_data here: zip() returns a one-shot iterator,
    # not a static data structure like a list
    # playlen = len(list(play_data))
    # print('Exiting worker {} and len is {}'.format(num, playlen))
    # logging.info('Exiting worker {} and len is {}'.format(num, playlen))
    return winner, play_data
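# Hedged usage sketch for single_game_play above. start_self_play returns
# a one-shot zip iterator that cannot be pickled back from a worker
# process, so this wrapper (a hypothetical helper, not part of the
# original) materializes it into a list before returning.
from multiprocessing import Pool

def single_game_play_listed(num, initmode):
    winner, play_data = single_game_play(num, initmode)
    return winner, list(play_data)

def collect_selfplay_parallel(n_games, initmode, data_buffer):
    with Pool() as pool:
        results = pool.starmap(single_game_play_listed,
                               [(i, initmode) for i in range(n_games)])
    for winner, play_data in results:
        data_buffer.extend(play_data)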
def __init__(self, init_model='./current_policy.hdf5'):
    # board params
    self.board_width = 8
    self.board_height = 8
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # playout budget of the pure MCTS used for policy evaluation
    self.pure_mcts_playout_num = 2000
    if init_model:
        # continue training from an existing network
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new network
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def policy_evaluate(self, n_games=10): """ Evaluate the trained policy by playing against the pure MCTS player Note: this is only for monitoring the progress of training """ print('4') current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout) pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num) win_cnt = 0 for i in range(n_games): winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i % 2, is_shown=0) win_cnt += 1 win_ratio = win_cnt / n_games print("num_playouts:{}, win: {}".format(self.pure_mcts_playout_num, win_cnt)) return win_ratio
def policy_evaluate(self, n_games=10): """ Evaluate the trained policy by playing games against the pure MCTS player Note: this is only for monitoring the progress of training """ current_mcts_player = MCTSPlayer( self.policy_value_net.policy_value_func, c_puct=self.c_puct, n_play_out=self.n_play_out) pure_mcts_player = MCTS_Pure(c_puct=5, n_play_out=self.pure_mcts_play_out_number) win_cnt = defaultdict(int) results = self.pool.map(self.game.start_play, [(current_mcts_player, pure_mcts_player, i) for i in range(n_games)]) for winner in results: win_cnt[winner] += 1 win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games print_log("number_play_outs:{}, win: {}, lose: {}, tie:{}".format( self.pure_mcts_play_out_number, win_cnt[1], win_cnt[2], win_cnt[-1])) return win_ratio
def run():
    width, height = 9, 9
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height)
        game = Game(board)
        # ############### human VS AI ###################
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=800)
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 8  # width of chessboard (6 and 10 also used)
    self.board_height = 8  # height of chessboard (6 and 10 also used)
    self.n_in_row = 5  # stones in a row needed to win (4 or 5)
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3  # learning rate
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000  # max number of elements in the queue
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)  # queue size
    self.play_batch_size = 1  # collect a set of data per self-play game
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025  # KL target
    # check frequency: every 50 self-play games, evaluate the current AI
    # model by pitting it against MCTS-pure AI (based on random rollouts)
    # for 10 rounds
    self.check_freq = 50
    self.game_batch_num = 200  # the number of training batches
    self.best_win_ratio = 0.0  # historical best winning rate
    # num of simulations used for the pure mcts, which is used as the
    # opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net;
        # pickle.load deserializes the stored network parameters
        # ('rb' reads the file in binary mode)
        policy_param = pickle.load(open(init_model, 'rb'))
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               net_params=policy_param)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    n = 5
    width, height = 15, 15
    model_file = 'best_policy_3000.pt'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a
        # MCTS player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except Exception:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player1 = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # pure MCTS player (much weaker even with a larger n_playout)
        mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(mcts_player1, human, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def main(config):
    try:
        game = Game.from_config(config)
        # ############### human VS AI ###################
        # load the trained policy_value_net in PyTorch
        policy_value_net = PolicyValueNet(config.size,
                                          model_file=config.model_file)
        mcts_player = MCTSPlayer(
            policy_value_net,
            c_puct=config.c_puct,
            n_playout=config.n_playout,
            temperature=HUMAN_PLAY_TEMPERATURE,
        )
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, display=1)
    except KeyboardInterrupt:
        print("\n\rquit")
def run():
    n = 6
    width, height = 9, 9
    model_file = 'best_policy.model'  # model to load
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run(states, sensible_moves, currentPlayer, lastMove):
    # number of consecutive stones needed to win
    n = 5
    # board width and height
    width, height = 8, 8
    board = Board(width=width, height=height, n_in_row=n)
    board.init_board()
    board.states = states
    board.availables = sensible_moves
    board.current_player = currentPlayer
    board.last_move = lastMove
    # policy-value network
    best_policy = PolicyValueNetNumpy(width, height, policy_param)
    # MCTS player guided by the policy-value network
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=400)
    # ask MCTS for the next move
    nextmove = mcts_player.get_action(board)
    return nextmove
def run():
    n = 5
    width, height = 8, 8
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # (model_file is assumed to be defined at module scope)
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=500)
        # human player, input your move in the format: 2,3
        human1 = Human()
        # set start_player=0 for human first
        game.start_play(human1, mcts_player, start_player=1, is_shown=1)
        # game.start_play(human1, human2, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def get_mcts_player(player_index=1):
    """
    Get an mcts player. An index of 1 corresponds to the first player
    (typically human) and an index of 2 corresponds to the second player
    (typically AI opponent).
    """
    board = Board()
    board.init_board()
    size = 8
    model_file = '../AlphaZero_Gomoku/best_policy_8_8_5.model'
    try:
        policy_param = pickle.load(open(model_file, 'rb'))
    except Exception:
        policy_param = pickle.load(open(model_file, 'rb'),
                                   encoding='bytes')
    best_policy = PolicyValueNetNumpy(size, size, policy_param)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=200)
    mcts_player.set_player_ind(player_index)
    return mcts_player
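# Hedged usage sketch: get_mcts_player() above returns a ready-to-move AI.
# The Board constructor arguments here are an assumption; they must match
# the 8x8, five-in-a-row configuration the model was trained on.
board = Board(width=8, height=8, n_in_row=5)
board.init_board()
ai_player = get_mcts_player(player_index=2)
move = ai_player.get_action(board)  # flat move index
board.do_move(move)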
def __init__(self, init_model=None, is_shown=0):
    # params of the board and the game
    self.board_width = 15
    self.board_height = 15
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.is_shown = is_shown
    self.game = Game_UI(self.board, is_shown)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjusted based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50  # evaluate the policy every 50 batches
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000  # playout budget of the pure-MCTS opponent
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, n: int, init_model=None):
    # params of the board and the game
    self.n = n
    self.board = Board(self.n)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_play_out = 400  # number of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.epochs = 5  # number of train_steps for each update
    self.kl_target = 0.025
    self.check_freq = 50
    self.game_batch_number = 10000
    self.best_win_ratio = 0.0
    self.episode_length = 0
    self.pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    self.last_batch_number = 0
    # number of simulations used for the pure mcts, which is used as the
    # opponent to evaluate the trained policy
    self.pure_mcts_play_out_number = 1000
    if init_model:
        # start training from an initial policy-value net
        policy_param = pickle.load(open(init_model, 'rb'))
        self.policy_value_net = PolicyValueNet(self.n,
                                               net_params=policy_param)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.n)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_func,
                                  c_puct=self.c_puct,
                                  n_play_out=self.n_play_out,
                                  is_self_play=1)
def run():
    n = 5
    width, height = 9, 9
    model_file = 'output/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n,
                      forbidden_hands=True)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the trained model into the residual policy-value net
        best_policy = PolicyValueNetRes30(width, height, 'l+',
                                          model_file=model_file)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, game_batch_num, model_file=None):
    # params of the board and the game
    self.size = BOARD_SIZE
    use_gpu = False
    board = Board(size=self.size, n_in_row=N_IN_ROW)
    self.game = Game(board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=10000)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = game_batch_num
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if model_file:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(size=self.size,
                                               model_file=model_file,
                                               use_gpu=use_gpu)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(size=self.size,
                                               use_gpu=use_gpu)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as the
    # opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # start training from a given policy-value net
    # policy_param = pickle.load(open('current_policy.model', 'rb'))
    # self.policy_value_net = PolicyValueNet(self.board_width,
    #                                        self.board_height,
    #                                        net_params=policy_param)
    # start training from a new policy-value net
    self.policy_value_net = PolicyValueNet(self.board_width,
                                           self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    n = 5
    width, height = 15, 15
    # model_file = 'best_policy_8_8_5.model'
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # MCTS player with the policy_value_net trained by AlphaZero algorithm
        # policy_param = pickle.load(open(model_file, 'rb'))
        # best_policy = PolicyValueNet(width, height, net_params=policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # MCTS player with the trained policy_value_net written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except Exception:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn,
            c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        # human player, input your move in the format: 2,3
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')