def __init__(self, model_name, loss_function, forbidden_hands, init_model=None):
    # params of the board and the game
    self.board_width = 9
    self.board_height = 9
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row,
                       forbidden_hands=forbidden_hands)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 1000  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 3000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    self.model_name = model_name
    if init_model:
        # start training from an initial policy-value net
        if self.model_name == 'baseline':
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   loss_function,
                                                   model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNetRes30(self.board_width,
                                                        self.board_height,
                                                        loss_function,
                                                        model_file=init_model)
    else:
        # start training from a new policy-value net
        if self.model_name == 'baseline':
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   loss_function)
        else:
            self.policy_value_net = PolicyValueNetRes30(self.board_width,
                                                        self.board_height,
                                                        loss_function)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
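A minimal driver for this constructor, as a sketch only: the enclosing class name (TrainPipeline below) and its run() training loop are assumptions, since only __init__ is shown here; the 'l+' loss tag is borrowed from other snippets in this section.

# Hypothetical usage; TrainPipeline and run() are assumed names.
if __name__ == '__main__':
    pipeline = TrainPipeline(model_name='baseline',   # or a ResNet variant
                             loss_function='l+',      # loss tag used elsewhere in this section
                             forbidden_hands=False,
                             init_model=None)         # or a path to resume from
    pipeline.run()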
def run():
    n = 5
    width, height = 15, 15
    model_file = 'models/resnet/output318/current_policy.model'
    model_file1 = 'models/resnet/output196/current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        best_policy = PolicyValueNet(width, height,
                                     model_file=model_file,
                                     output='output/')
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=1, n_playout=1500)

        tf.reset_default_graph()
        best_policy1 = PolicyValueNet(width, height,
                                      model_file=model_file1,
                                      output='output/')
        mcts_player1 = MCTSPlayer(best_policy1.policy_value_fn,
                                  c_puct=1, n_playout=1500)

        # load the provided model (trained in Theano/Lasagne) into an MCTS
        # player written in pure numpy:
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # larger n_playout plays better

        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # human = MCTS_Pure(c_puct=5, n_playout=5000)

        # human player, input your move in the format: 2,3
        human = Human()  # or MCTS_Pure(c_puct=5, n_playout=1000)

        # this match pits the two loaded models against each other;
        # start_player=0 lets mcts_player1 move first
        game.start_play(mcts_player1, mcts_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
class SelfPlayer(Process):
    def __init__(self, config, sample_queue, model_queue):
        super(SelfPlayer, self).__init__()
        self.config = config
        self.temp = config['temperature']
        self.sample_queue = sample_queue
        self.model_queue = model_queue
        self.board = Board(width=config['board_width'],
                           height=config['board_height'],
                           n_in_row=config['n_in_row'])
        self.game = Game(self.board)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        samples = []
        for i in range(n_games):
            _, play_data = self.game.start_self_play(self.mcts_player,
                                                     temp=self.temp)
            samples.extend(list(play_data))
        return samples

    def run(self):
        # the net is built inside run() so it lives in the child process
        self.policy_value_net = PolicyValueNet(
            self.config['board_width'],
            self.config['board_height'],
            model_file=self.config['init_model'])
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.config['c_puct'],
                                      n_playout=self.config['n_playout'],
                                      is_selfplay=1)
        print("running")
        while True:
            # always use the latest weights
            weights = None
            while not self.model_queue.empty():
                weights = self.model_queue.get()
            if weights:
                self.policy_value_net.set_weight(weights)
            # sample
            samples = self.collect_selfplay_data()
            # put the new samples on the sample queue
            self.sample_queue.put(samples)
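For context, a minimal sketch of how such a worker is launched; it mirrors the trainer __init__ later in this section, which builds one model queue per self-player plus a shared sample queue. The config dict is assumed to carry the keys SelfPlayer reads above.

from multiprocessing import Queue

sample_queue = Queue()  # workers push lists of self-play samples here
model_queue = Queue()   # the trainer pushes fresh weights here
player = SelfPlayer(config, sample_queue, model_queue)
player.start()          # Process.start() runs SelfPlayer.run() in a child process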
def __init__(self, init_model=None, is_remote=False):
    # params of the board and the game
    self.board_width = 8
    self.board_height = 8
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.n_playout_self_play = 1000
    self.c_puct = 5
    self.buffer_size = 2000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 100
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.td_step = 2
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if is_remote:
        self.path = '/content/drive/My Drive/'
    else:
        self.path = './'
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=self.path + init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout_self_play,
                                  is_selfplay=1)
def collect_selfplay_data_thread(self, thread_id, shared_queue, net_lock,
                                 data_lock):
    logging.info('selfplay process start: {}'.format(thread_id))
    os.environ["CUDA_VISIBLE_DEVICES"] = str(thread_id % 6 + 2)
    from policy_value_net_tensorflow import PolicyValueNet
    # load the model file, under the net lock
    with net_lock:
        current_policy = PolicyValueNet(self.board_width,
                                        self.board_height,
                                        model_file=current_model_name)
    local_board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
    local_game = Game(local_board)
    local_mcts_player = MCTSPlayer(current_policy.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
    winner, play_data = local_game.start_self_play(local_mcts_player,
                                                   temp=self.temp)
    play_data = list(play_data)
    play_data = self.get_equi_data(play_data)
    # append the self-play data, under the data lock
    with data_lock:
        shared_queue.extend(play_data)
        while len(shared_queue) > self.buffer_num:
            shared_queue.pop(0)
    logging.info('selfplay process finished: {}'.format(thread_id))
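get_equi_data is called above but never defined in this section. A sketch consistent with the reference AlphaZero-Gomoku implementation, assuming each sample is a (state_planes, flat_mcts_prob, winner_z) tuple: every sample is expanded into its four rotations and their horizontal flips, since Gomoku positions are equivalent under those symmetries.

import numpy as np

def get_equi_data(self, play_data):
    """augment the data set by rotation and flipping
    play_data: [(state, mcts_prob, winner_z), ...]
    """
    extend_data = []
    for state, mcts_prob, winner in play_data:
        for i in [1, 2, 3, 4]:
            # rotate counterclockwise by i * 90 degrees
            equi_state = np.array([np.rot90(s, i) for s in state])
            equi_mcts_prob = np.rot90(np.flipud(
                mcts_prob.reshape(self.board_height, self.board_width)), i)
            extend_data.append((equi_state,
                                np.flipud(equi_mcts_prob).flatten(),
                                winner))
            # flip horizontally
            equi_state = np.array([np.fliplr(s) for s in equi_state])
            equi_mcts_prob = np.fliplr(equi_mcts_prob)
            extend_data.append((equi_state,
                                np.flipud(equi_mcts_prob).flatten(),
                                winner))
    return extend_data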
def run():
    n = 5
    width, height = 9, 9
    model_file = 'best_policy_9_9_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into an MCTS
        # player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # support python3
        best_policy = PolicyValueNet(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=1000)  # larger n_playout plays better

        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, startplayer=0):
    model_file = 'models/resnet/output318/current_policy.model'
    policy_param = None
    self.height = 15
    self.width = 15
    # alternative loader for a pickled Theano/Lasagne model:
    # if model_file is not None:
    #     print('loading...', model_file)
    #     try:
    #         policy_param = pickle.load(open(model_file, 'rb'))
    #     except:
    #         policy_param = pickle.load(open(model_file, 'rb'),
    #                                    encoding='bytes')
    policy_value_net = PolicyValueNet(self.height, self.width,
                                      model_file=model_file,
                                      output='output/')
    self.mcts_player = MCTSPlayer(policy_value_net.policy_value_fn,
                                  c_puct=1, n_playout=1000)
    self.board = Board(width=self.width, height=self.height, n_in_row=5)
    self.board.init_board(startplayer)
    self.game = Game(self.board)
    p1, p2 = self.board.players
    print('players:', p1, p2)
    self.mcts_player.set_player_ind(p1)
def __init__(self, init_model):
    self.init_model = init_model
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if os.path.isdir(init_model):
        self.is_init = True
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        self.is_init = False
        # portable replacement for os.system('mkdir ...')
        os.makedirs(init_model, exist_ok=True)
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
    if not os.path.isdir(init_model + 'best'):
        os.makedirs(init_model + 'best')
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 7
    self.board_height = 7
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 1500  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 150000
    self.batch_size = 2048  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    if os.path.exists("data_buffer.pkl"):
        with open("data_buffer.pkl", "rb") as f:
            self.data_buffer = pickle.load(f)
            print("Load data, size = %d" % len(self.data_buffer))
    self.play_batch_size = 1
    self.epochs = 10  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 1500
    self.save_freq = 500
    self.game_batch_num = 10000
    self.best_win_ratio = 0.0
    self.episode_len = 0
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
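The buffer is restored from data_buffer.pkl above; the matching save step is not shown in this section, but given self.save_freq it is presumably a periodic dump along these lines (hypothetical placement inside the training loop):

# Hypothetical counterpart to the load above, e.g. run every
# self.save_freq game batches inside the training loop.
with open("data_buffer.pkl", "wb") as f:
    pickle.dump(self.data_buffer, f)
print("Saved data, size = %d" % len(self.data_buffer))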
def get_net_player_next_action(
        player,
        i,
        shared_board_states,
        shared_board_availables,
        shared_board_last_move,
        shared_board_current_player,
        game_continue,
        winner,
        play_lock,
        net_lock,
):
    from policy_value_net_tensorflow import PolicyValueNet
    from mcts_alphaZero import MCTSPlayer
    # `self`, `start_player` and `is_shown` are captured from the enclosing scope
    local_board = Board(width=self.board.width,
                        height=self.board.height,
                        n_in_row=self.board.n_in_row)
    local_board.init_board(start_player)
    with net_lock:
        policy = PolicyValueNet(local_board.width,
                                local_board.height,
                                model_file=player)
        mcts_player = MCTSPlayer(policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400,
                                 is_selfplay=0)
    while game_continue.value == 1:
        if shared_board_current_player.value == i:
            with play_lock:
                # the shared board state must be copied into the local board
                # by hand (tedious, but required across processes)
                for k, v in shared_board_states.items():
                    local_board.states[k] = v
                local_board.availables = []
                for availables in shared_board_availables:
                    local_board.availables.append(availables)
                local_board.last_move = shared_board_last_move.value
                local_board.current_player = shared_board_current_player.value
                # synchronization done

                move = mcts_player.get_action(local_board)
                local_board.do_move(move)
                # print('player {} do move {}'.format(i, move))
                if is_shown:
                    self.graphic(local_board)
                end, win = local_board.game_end()
                if end:
                    if win != -1:
                        print("Game end. Winner is", win)
                    else:
                        print("Game end. Tie")
                    game_continue.value = 0
                    winner.value = win
                # write the move back to the shared state
                shared_board_states[move] = shared_board_current_player.value
                shared_board_availables.remove(move)
                shared_board_last_move.value = move
                shared_board_current_player.value = \
                    1 - shared_board_current_player.value
        time.sleep(0.2)
def init_tensorflow_net(self, model_file=None):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    logging.info('init tf net')
    from policy_value_net_tensorflow import PolicyValueNet
    # `model_dir` is a module-level setting
    policy_value_net = PolicyValueNet(self.board_width,
                                      self.board_height,
                                      model_dir,
                                      model_file=model_file)
    logging.info('init tf net finished')
    # hand the net back to the caller (the original discarded the local)
    return policy_value_net
def __init__(self, init_model=None):
    # params of the board and the game
    self.board_width = 5
    self.board_height = 6
    self.board = Board()
    self.game = Game(self.board)
    # training params
    # learning rate 2e-3
    self.learn_rate = 2e-3
    # adaptively adjust the learning rate based on KL: the KL divergence
    # measures how close two probability distributions are; the multiplier
    # is tuned so each update stays near the KL target
    self.lr_multiplier = 1.0
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 50  # reduced from 1500 for a quick run
    self.best_win_ratio = 0.0
    # num of simulations used for the pure MCTS, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    model_file = './current_policy.model'
    width = 5
    height = 6
    board = Board()
    game = Game(board)
    best_policy = PolicyValueNet(width, height, model_file)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5, n_playout=400)
    human = Human()
    # set start_player=0 for human first
    game.start_play(human, mcts_player, start_player=1, is_shown=1)
def update_net(self, shared_queue, net_lock, data_lock, stop_update_process):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    logging.info('update process start')
    # net used to load and save the model file
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height,
                                              model_dir)
    current_policy_value_net.save_model(current_model_name)
    i = 0
    best_win_ratio = 0
    pure_mcts_playout_num = 1000
    get_enough_train_data = False
    while stop_update_process.value == 0:
        time.sleep(1)
        if get_enough_train_data:
            i += 1
            logging.info('update process start {}th self train'.format(i))
            self.policy_update(current_policy_value_net, shared_queue,
                               net_lock, data_lock, i)
            logging.info('update process end {}th self train'.format(i))
            # refresh the latest model file here
            if (i + 1) % self.update_freq == 0:
                logging.info('update process ask net lock')
                with net_lock:
                    logging.info('update process get net lock')
                    current_policy_value_net.save_model(current_model_name)
                logging.info('update process release net lock')
            # play against pure MCTS here to measure the win ratio and
            # update the best model file
            if (i + 1) % self.check_freq == 0:
                logging.info("Game {}: AlphaZero VS PURE MCTS".format(i + 1))
                win_ratio = self.policy_evaluate(pure_mcts_playout_num,
                                                 current_policy_value_net)
                if win_ratio >= best_win_ratio:
                    logging.info("update process New best policy!!!!!!!!")
                    best_win_ratio = win_ratio
                    # update the best_policy
                    current_policy_value_net.save_model(best_model_name)
                    if (best_win_ratio == 1.0 and
                            pure_mcts_playout_num < 5000):
                        pure_mcts_playout_num += 1000
                        best_win_ratio = 0.0
        else:
            with data_lock:
                get_enough_train_data = len(shared_queue) >= self.batch_size
    logging.info('update process finished')
def run(self):
    self.policy_value_net = PolicyValueNet(
        self.config['board_width'],
        self.config['board_height'],
        model_file=self.config['init_model'])
    while True:
        weight = self.queue.get()
        self.policy_value_net.set_weight(weight)
        win_ratio = self.policy_evaluate()
        self.policy_value_net.save_model(self.config['current_policy_name'])
        if win_ratio > self.best_win_ratio:
            print("New best policy!!!!!!!!")
            self.best_win_ratio = win_ratio
            # update the best_policy
            self.policy_value_net.save_model(self.config['best_policy_name'])
            if (self.best_win_ratio == 1.0 and
                    self.pure_mcts_playout_num < 10000):
                self.pure_mcts_playout_num += 1000
                self.best_win_ratio = 0.0
def __init__(self, config):
    self.config = config
    self.lr_multiplier = self.config['lr_multiplier']
    self.data_buffer = deque(maxlen=config['buffer_size'])
    # sample queue: self-players put their samples on this queue
    # model queues: the train process puts updated models on these queues
    self.sample_queue = Queue()
    self.model_queues = []
    self.self_players = []
    self.evaluator_queue = Queue()
    self.evaluator = Evaluator(self.config, self.evaluator_queue)
    for _ in range(self.config['selfplayer_num']):
        model_queue = Queue()
        self.model_queues.append(model_queue)
        self.self_players.append(
            SelfPlayer(config, self.sample_queue, model_queue))
    self.policy_value_net = PolicyValueNet(config['board_width'],
                                           config['board_height'],
                                           model_file=config['init_model'])
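The keys read from config here and in the SelfPlayer/Evaluator run loops above imply a dict along these lines; the values below are illustrative placeholders, not the authors' settings.

config = {
    'board_width': 9,
    'board_height': 9,
    'n_in_row': 5,
    'temperature': 1.0,             # self-play temperature
    'c_puct': 5,
    'n_playout': 400,
    'lr_multiplier': 1.0,
    'buffer_size': 10000,
    'selfplayer_num': 4,            # number of SelfPlayer processes
    'init_model': None,             # or a path to an existing model file
    'current_policy_name': 'current_policy.model',
    'best_policy_name': 'best_policy.model',
}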
def __init__(self, init_model=None, is_shown=0):
    # params of the board and the game
    self.board_width = 15
    self.board_height = 15
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.is_shown = is_shown
    self.game = Game_UI(self.board, is_shown)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    # num of simulations used for the pure MCTS evaluation opponent
    self.pure_mcts_playout_num = 1000
    if init_model:
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    model_file = './current_policy.model'
    best_policy = PolicyValueNet(6, 6, model_file)
    config = GameConfig()
    board = Board(config)
    game = Game(board)
    mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5, n_playout=1000)
    mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1000)
    mcts_player3 = MCTS_Pure(c_puct=5, n_playout=1000)
    human = Human(config)
    human2 = Human(config)
    human3 = Human(config)
    # three-player match: pure MCTS vs human vs pure MCTS;
    # mcts_player1, human2 and human3 are built but unused here
    game.start_play(mcts_player3, human, mcts_player2)
def run(self):
    self.policy_value_net = PolicyValueNet(
        self.config['board_width'],
        self.config['board_height'],
        model_file=self.config['init_model'])
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.config['c_puct'],
                                  n_playout=self.config['n_playout'],
                                  is_selfplay=1)
    print("running")
    while True:
        # always use the latest weights
        weights = None
        while not self.model_queue.empty():
            weights = self.model_queue.get()
        if weights:
            self.policy_value_net.set_weight(weights)
        # sample
        samples = self.collect_selfplay_data()
        # put the new samples on the sample queue
        self.sample_queue.put(samples)
def run():
    # n = 5
    width, height = 5, 5  # for pure_mcts
    board = Board(width=width, height=height)
    game = Game(board)
    width, height = real_dim(width, height)
    # model_file = 'best_policy_8_8_5.model'
    try:
        # board = Board(width=width, height=height, n_in_row=n)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        model_file = './tmp/best_policy.model'
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                                  c_puct=5, n_playout=500)
        mcts_player2 = MCTSPlayer(best_policy.policy_value_fn,
                                  c_puct=5, n_playout=500)

        # load the provided model (trained in Theano/Lasagne) into an MCTS
        # player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # larger n_playout plays better

        # uncomment the following lines to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player1 = MCTS_Pure(c_puct=5, n_playout=500)
        # mcts_player2 = MCTS_Pure(c_puct=5, n_playout=500)

        # human player, input your move in the format: 2,3
        # human = Human()
        # set start_player=0 for human first
        # game.start_play(human, mcts_player2, start_player=1, is_shown=0)
        game.start_play(mcts_player1, mcts_player2, start_player=1, is_shown=0)
    except KeyboardInterrupt:
        print('\n\rquit')
def update_net_thread(self, shared_queue, net_lock, data_lock,
                      stop_update_process, update_best_model):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    logging.info('update process start')
    # net used to load and save the model file
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height,
                                              model_dir)
    current_policy_value_net.save_model(current_model_name)
    current_policy_value_net.save_model(best_model_name)
    best_win_ratio = 0
    get_enough_train_data = False
    global_update_step = 0
    lr_multiplier = 1.0
    while stop_update_process.value == 0:
        time.sleep(1)
        if get_enough_train_data:
            global_update_step += 1
            logging.info('update process start {}th self train'.format(
                global_update_step))
            lr_multiplier = self.policy_update(current_policy_value_net,
                                               shared_queue, net_lock,
                                               data_lock, global_update_step,
                                               lr_multiplier)
            logging.info('update process end {}th self train'.format(
                global_update_step))
            # refresh the latest model file here
            logging.info('update process ask net lock')
            with net_lock:
                logging.info('update process get net lock')
                current_policy_value_net.save_model(current_model_name)
            logging.info('update process release net lock')
            if (global_update_step + 1) % self.update_freq == 0:
                update_best_model.value = 1
        else:
            with data_lock:
                get_enough_train_data = len(shared_queue) >= self.batch_size
    logging.info('update process finished')
def run():
    n = 5
    width, height = 15, 15
    model_file = 'dist/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game_UI(board, is_shown=1)
        # ############### human vs machine ###################
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=400)
        human = Human()
        game.start_play_mouse(human, mcts_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def update_net(self, shared_queue, net_lock, update_best_model,
               global_update_step, lr_multiplier, stop_update_process,
               update_or_selfplay):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    from policy_value_net_tensorflow import PolicyValueNet
    current_policy_value_net = PolicyValueNet(self.board_width,
                                              self.board_height,
                                              model_dir)
    current_policy_value_net.save_model(current_model_name)
    current_policy_value_net.save_model(best_model_name)
    while global_update_step.value <= self.game_batch_num:
        if update_or_selfplay.value == 0:
            if len(shared_queue) >= self.batch_size:
                for _ in range(self.epochs):
                    global_update_step.value += 1
                    logging.info(
                        'update current model process start self train: {}'
                        .format(global_update_step.value))
                    self.policy_update(current_policy_value_net, shared_queue,
                                       net_lock, global_update_step,
                                       lr_multiplier)
                    if global_update_step.value % self.check_freq == 0:
                        update_best_model.value = 1
                # refresh the latest model file here
                with net_lock:
                    logging.info('update process update current model')
                    current_policy_value_net.save_model(current_model_name)
                update_or_selfplay.value = 1
        else:
            time.sleep(1)
    stop_update_process.value = 1
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing against the current best policy.
    Note: this is only for monitoring the progress of training.
    """
    print("evaluating...")
    current_mcts_player = MCTSPlayer(self.policy_value_net_train.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.pure_mcts_playout_num)
    best_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        winner = self.game.start_play(current_mcts_player,
                                      best_mcts_player,
                                      start_player=i % 2,
                                      is_shown=0)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
    # save the current model
    self.policy_value_net_train.save_model('/data/output/current_policy.model')
    if win_ratio > self.best_win_ratio:
        print("New best policy!!!!!!!!")
        # update the best_policy
        self.policy_value_net_train.save_model('/data/output/best_policy.model')
        self.g1 = tf.Graph()
        with self.g1.as_default():
            self.policy_value_net = PolicyValueNet(
                self.board_width,
                self.board_height,
                model_file='/data/output/best_policy.model',
                graph=self.g1,
                output='/data/data/')
    return win_ratio
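The win-ratio formula above counts a tie (winner == -1) as half a win for the training net:

# Worked example: 10 games with 6 wins, 2 losses, 2 ties gives
# win_ratio = (6 + 0.5 * 2) / 10 = 0.7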
def __init__(self, current_model, baseline_model):
    # params of the board and the game
    self.board_width = 9
    self.board_height = 9
    self.n_in_row = 5
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row,
                       forbidden_hands=True)
    self.game = Game(self.board)
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.baseline_policy_value_net = PolicyValueNet(self.board_width,
                                                    self.board_height,
                                                    'l+',
                                                    model_file=baseline_model)
    self.current_policy_value_net = PolicyValueNetRes30(self.board_width,
                                                        self.board_height,
                                                        'l+',
                                                        model_file=current_model)
def local_thread_func(thread_id, shared_queue, net_lock, data_lock):
    from policy_value_net_tensorflow import PolicyValueNet
    # load the model file, under the net lock
    logging.info('selfplay process {} ask net lock'.format(thread_id))
    with net_lock:
        logging.info('selfplay process {} get net lock'.format(thread_id))
        current_policy = PolicyValueNet(self.board_width,
                                        self.board_height,
                                        model_dir,
                                        model_file=current_model_name)
    logging.info('selfplay process {} release net lock'.format(thread_id))
    local_board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
    local_game = Game(local_board)
    local_mcts_player = MCTSPlayer(current_policy.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
    # `index` is captured from the enclosing scope
    logging.info('selfplay process {} start {}th selfplay'.format(
        thread_id, index))
    winner, play_data = local_game.start_self_play(local_mcts_player,
                                                   temp=self.temp)
    logging.info('selfplay process {} finish {}th selfplay'.format(
        thread_id, index))
    play_data = list(play_data)
    play_data = self.get_equi_data(play_data)
    # append the self-play data, under the data lock
    logging.info('selfplay process {} ask data lock'.format(thread_id))
    with data_lock:
        logging.info('selfplay process {} get data lock'.format(thread_id))
        shared_queue.extend(play_data)
        while len(shared_queue) > self.buffer_num:
            shared_queue.pop(0)
    logging.info('selfplay process {} release data lock'.format(thread_id))
def run():
    n = 5
    # width, height = 8, 8
    width, height = 16, 16
    # model_file = 'best_policy_8_8_5.model'
    model_file = './tfData/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file=model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5, n_playout=400)

        # load the policy
        best_policy = PolicyValueNet(width, height, model_file)
        # build the MCTS player from the policy function
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # larger n_playout plays better

        # uncomment the following line to play with pure MCTS
        # (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        winner, play_data = game.start_play(human, mcts_player,
                                            start_player=1, is_shown=1)
        play_data = list(play_data)
        play_data = get_equi_data(play_data, height, width)
        backupSave(play_data, "human")
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self):
    self.board = chessBoard()
    self.game_print = StringVar()
    self.game_print.set("")
    # 9x9 two-dimensional list, so indexing never goes out of range
    self.db = [([2] * 9) for i in range(9)]
    # move-order list used for undo
    self.order = []
    # stone color
    self.color_count = 0
    self.color = 'black'
    # win/clear flags: 1 means already won / already cleared
    self.flag_win = 1
    self.flag_empty = 1
    self.start_player = 0
    width, height, n_in_row = 9, 9, 5
    model_file = 'output/best_policy.model'
    baseline_file = 'output/baseline_policy.model'
    board = Board(width=width, height=height, n_in_row=n_in_row,
                  forbidden_hands=False)
    self.game = Game(board)
    self.game.board.init_board(self.start_player)
    self.best_policy = PolicyValueNetRes30(width, height, 'l+',
                                           model_file=model_file)
    self.baseline_policy = PolicyValueNet(width, height, 'l+',
                                          model_file=baseline_file)
    # set larger n_playout for better performance
    self.mcts_player = MCTSPlayer(self.best_policy.policy_value_fn,
                                  c_puct=5, n_playout=500)
    self.mcts_baseline_player = MCTSPlayer(self.baseline_policy.policy_value_fn,
                                           c_puct=5, n_playout=500)
    self.human_player = Human()
    self.human_player.set_player_ind(1)
    # self.mcts_baseline_player.set_player_ind(1)
    self.mcts_player.set_player_ind(2)
    self.players = {1: self.human_player, 2: self.mcts_player}
    # self.players = {1: self.mcts_baseline_player, 2: self.mcts_player}
    self.options()
def run():
    n = 5
    width, height = 8, 8
    # assumed path: `model_file` was undefined in the original snippet
    model_file = 'best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne,
        # PyTorch or TensorFlow
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=500)
        # human player, input your move in the format: 2,3
        human1 = Human()
        # set start_player=0 for human first
        game.start_play(human1, mcts_player, start_player=1, is_shown=1)
        # game.start_play(human1, human2, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def run():
    n = 5
    width, height = 10, 10
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        graphic = Graphic()
        # graphic.run()
        # thread1 = threading.Thread(target=graphic.run, args=())
        best_policy = PolicyValueNet(width, height,
                                     model_file='./model/' + model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5, n_playout=1000)
        human = Human(graphic)
        # set start_player=0 for human first; run the game on a daemon
        # thread so the GUI can own the main thread
        thread2 = threading.Thread(target=game.start_play,
                                   args=(human, mcts_player, graphic, 1, 1))
        thread2.daemon = True
        thread2.start()
        graphic.run()
        # thread1.join()
        # thread2.join()
        # game.start_play(human, mcts_player, graphic, start_player=0, is_shown=1)
        # thread.start_new_thread(game.start_play, (human, mcts_player, graphic, 0, 1))
        # thread.start_new_thread(graphic.run, ())
    except KeyboardInterrupt:
        print('\n\rquit')