def run(config=None):
    if config is None:
        config = load_config(file_name=root_data_file + 'resnet_6_6_4.model', only_load_param=True)
    try:
        board = Board(width=config.board_width, height=config.board_height, n_in_row=config.n_in_row)
        game = Game(board)
        # --------------- human VS AI ----------------
        # set up which Network to use based on the net_params
        best_policy = PolicyValueNet(config.board_width, config.board_height,
                                     Network=config.network, net_params=config.policy_param)
        # set a larger nplays for better performance
        mcts_player = AlphaZeroPlayer(best_policy.predict, c_puct=config.c_puct,
                                      nplays=100, add_noise=True)
        # uncomment the following line to play against pure MCTS instead
        # mcts_player2 = RolloutPlayer(nplays=1000, c_puct=config.c_puct)
        # human player, input your move in the format: 2,3
        human = HumanPlayer()
        # set who_first=0 for the human to move first
        game.start_game(human, mcts_player, who_first=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
def __init__(self, config=None):
    # params of the board and the game
    self.config = config if config else Config()
    if not hasattr(self.config, "use_gpu"):
        setattr(self.config, "use_gpu", False)  # stay compatible with old config files that lack use_gpu
    # Network wrapper
    self.policy_value_net = PolicyValueNet(self.config.board_width, self.config.board_height,
                                           net_params=self.config.policy_param,
                                           Network=self.config.network,
                                           use_gpu=self.config.use_gpu)
    # pass the policy_value_net's predict function to guide the MCTS search
    self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                       nplays=self.config.n_playout, is_selfplay=True)
def run(config=None):
    if config is None:
        config = load_config(file_name=root_data_file + 'resnet_6_6_4.model', only_load_param=True)
    try:
        board = Board(width=config.board_width, height=config.board_height, n_in_row=config.n_in_row)
        # -------------------- 1. set players: AlphaZero vs. human --------------------#
        # set up which Network to use based on the net_params
        best_policy = PolicyValueNet(config.board_width, config.board_height,
                                     Network=config.network, net_params=config.policy_param)
        # set a larger nplays for better performance
        player1 = AlphaZeroPlayer(best_policy.predict, c_puct=config.c_puct, nplays=1000)
        # uncomment the following line to play with pure MCTS
        # player2 = RolloutPlayer(nplays=1000, c_puct=config.c_puct)
        player2 = HumanPlayer()
        # -------------------- 2. set order --------------------#
        who_first = 0  # 0 means player1 moves first, otherwise player2 moves first
        # -------------------- 3. start game --------------------#
        game = Game(board, is_visualize=True)
        t = threading.Thread(target=game.start_game, args=(player1, player2, who_first))
        t.start()
        game.show()
    except:
        print('\n\rquit')
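A minimal entry-point sketch for the interactive script above; the `__main__` guard below is illustrative and not part of the original source:

if __name__ == '__main__':
    # with no argument, run() falls back to the bundled resnet_6_6_4.model config
    run()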
class TrainPipeline:
    def __init__(self, modelPath=None):
        # board and game
        self.boardWidth = 4
        self.boardHeight = 4
        self.game = Game()
        # training params
        self.learningRate = 5e-3
        self.learningRateMultiplier = 1.0  # adjusted adaptively
        try:
            self.learningRateMultiplier = float(Util.getNewestLearningRateMultiplier(
                type='from_db' if modelPath is None else 'from_self_play'))
        except Exception as e:
            print(str(e))
        self.temperature = 1.0  # temperature; see item 2 of 参考资料.txt
        self.playoutTimes = 500  # number of MCTS simulations per move
        self.polynomialUpperConfidenceTreesConstant = 5  # c_puct in the paper; see item 2 of 参考资料.txt
        self.dataDequeSize = 10000
        self.trainBatchSize = 512  # training batch size; originally 512, temporarily 50 while debugging
        self.dataDeque = deque(maxlen=self.dataDequeSize)  # once maxlen is exceeded, elements are dropped from the other end
        self.playBatchSize = 1
        self.epochs = 5  # number of fitting steps per update
        self.KLDParam = 0.025
        self.checkFrequency = 1000  # was 100, raised to 1000 because evaluation takes too much time
        self.gameBatchSize = 200000  # at 5000 the win ratio against MCTS-1500 was already 0.9, so it was raised again
        self.maxWinRatio = 0.0
        self.pureMctsPlayoutTimes = 1500  # initially 500, now raised to 1500
        self.pureMctsPlayoutTimesAddend = 500
        self.maxPureMctsPlayoutTimes = 3000
        self.modelPath = modelPath
        self.trainedGameCountInDB = Util.readGameCount(type='train')
        self.lossDataCount = 12184  # number of game records deleted by a hacker; this data cannot be recovered
        if modelPath is not None:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight,
                                                 logPath=Util.getTrainLogPath(isFromDB=False),
                                                 modelPath=modelPath)
            self.trainedGameCount = self.trainedGameCountInDB + self.lossDataCount
        else:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight,
                                                 logPath=Util.getTrainLogPath(isFromDB=True))
            self.trainedGameCount = 0 + self.lossDataCount
        self.zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                     polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                     playoutTimes=self.playoutTimes, isSelfPlay=1)

    def generateEquivalentData(self, stateProbScore):
        """
        Generate equivalent data to speed up training.
        Rotations plus left-right flips yield 8 equivalent data sets.
        The score is not rotated: it is a single scalar for the whole board.
        :param stateProbScore: list of tuples [(states, mctsProbabilities, scores), ..., ...]
        """
        extendedData = []
        for states, probabilities, scores in stateProbScore:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equivalentState = np.array([np.rot90(state, i) for state in states])
                # the 4 below means each of the 16 points has 4 move directions;
                # the probabilities are flipped along the boardHeight axis first, because the board
                # states were stored vertically flipped and the two must stay consistent
                equivalentProbabilities = np.rot90(
                    np.flipud(probabilities.reshape(self.boardHeight, self.boardWidth, 4)), i)
                extendedData.append((equivalentState, np.flipud(equivalentProbabilities).flatten(), scores))
                # flip horizontally
                equivalentState = np.array([np.fliplr(state) for state in equivalentState])
                equivalentProbabilities = np.fliplr(equivalentProbabilities)
                extendedData.append((equivalentState, np.flipud(equivalentProbabilities).flatten(), scores))
        return extendedData

    def collectOneSelfPlayData(self, times=1):
        """Collect training data."""
        for i in range(times):
            _, stateProbScore = self.game.doOneSelfPlay(self.zeroPlayer, printMove=False,
                                                        temperature=self.temperature)
            stateProbScore = list(stateProbScore)[:]
            self.episodeSize = len(stateProbScore)
            # enlarge the training set with equivalent data
            stateProbScore = self.generateEquivalentData(stateProbScore)
            self.dataDeque.extend(stateProbScore)

    def updatePolicy(self, type):
        """Update the policy network."""
        batchSample = random.sample(self.dataDeque, self.trainBatchSize)
        batchState = [stateProbScore[0] for stateProbScore in batchSample]
        batchProbability = [data[1] for data in batchSample]
        batchScore = [data[2] for data in batchSample]
        oldLearningRateMultiplier = self.learningRateMultiplier
        oldLearningRate = oldLearningRateMultiplier * self.learningRate
        oldProbability, oldScore = self.policyValueNet.doPolicyValueFunction(batchState)
        for i in range(self.epochs):
            loss, entropy = self.policyValueNet.doOneTrain(batchState, batchProbability, batchScore, oldLearningRate)
            newProbability, newScore = self.policyValueNet.doPolicyValueFunction(batchState)
            Kullback_Leibler_Divergence = np.mean(np.sum(
                oldProbability * (np.log(oldProbability + 1e-10) - np.log(newProbability + 1e-10)), axis=1))
            if Kullback_Leibler_Divergence > self.KLDParam * 4:  # stop early if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if Kullback_Leibler_Divergence > self.KLDParam * 2 and self.learningRateMultiplier > 0.1:
            self.learningRateMultiplier /= 1.5
        elif Kullback_Leibler_Divergence < self.KLDParam / 2 and self.learningRateMultiplier < 10:
            self.learningRateMultiplier *= 1.5
        # explained variance
        oldVariance = 1 - np.var(np.array(batchScore) - oldScore.flatten()) / np.var(np.array(batchScore))
        newVariance = 1 - np.var(np.array(batchScore) - newScore.flatten()) / np.var(np.array(batchScore))
        # print("Kullback_Leibler_Divergence:{:.5f},learningRateMultiplier:{:.3f},loss:{},entropy:{},oldVariance:{:.3f},newVariance:{:.3f}".format(
        #     Kullback_Leibler_Divergence, self.learningRateMultiplier, loss, entropy, oldVariance, newVariance))
        Util.savePolicyUpdate(uuid=uuid.uuid1(),
                              KullbackLeiblerDivergence=Kullback_Leibler_Divergence,
                              oldLearningRateMultiplier=oldLearningRateMultiplier,
                              newLearningRateMultiplier=self.learningRateMultiplier,
                              oldLearningRate=oldLearningRate,
                              newLearningRate=self.learningRateMultiplier * self.learningRate,
                              loss=loss, entropy=entropy,
                              oldVariance=oldVariance, newVariance=newVariance,
                              insertTime=Util.getTimeNowStr(), type=type)
        return loss, entropy

    def doPolicyEvaluate(self, times=10):
        """
        Evaluate the policy network by playing against the pure MCTS player.
        This is only used to monitor training progress.
        :param times: number of games to play
        """
        zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                playoutTimes=self.playoutTimes)
        zeroPlayer.setName('AlphaZero_' + str(Util.readGameCount(type='train')))
        zeroPlayer.setNetworkVersion(1)
        purePlayer = PurePlayer(polynomialUpperConfidenceTreesConstant=5,
                                playoutTimes=self.pureMctsPlayoutTimes)
        winTimes = defaultdict(int)
        for i in range(times):
            # startPlayer=i%2 was changed to 0, i.e. black always moves first: training always had black
            # moving first, the "play white while white moves first" case never occurs, and the side to
            # move first is one of the network inputs
            if 0 == i % 2:
                winner = self.game.startPlay(zeroPlayer, purePlayer, startPlayer=0, printMove=1, type='evaluation')
            else:
                winner = self.game.startPlay(purePlayer, zeroPlayer, startPlayer=0, printMove=1, type='evaluation')
            if winner == -1:  # tie
                winTimes['tie'] += 1
            elif winner == 0:  # black wins
                if 0 == i % 2:
                    winTimes['zero'] += 1
                else:
                    winTimes['pure'] += 1
            else:  # white wins
                if 0 == i % 2:
                    winTimes['pure'] += 1
                else:
                    winTimes['zero'] += 1
        winRatio = 1.0 * (winTimes['zero'] + 0.5 * winTimes['tie']) / times
        print("PlayoutTimes:{}, win: {}, lose: {}, tie:{}".format(
            self.pureMctsPlayoutTimes, winTimes['zero'], winTimes['pure'], winTimes['tie']))
        return winRatio

    @staticmethod
    def toListOfNumpyArray(lst: list):
        for i in range(len(lst)):
            lst[i] = np.array(lst[i])
        return lst

    def trainByDataFromDB(self):
        # read the data from the DB in pages; the data set is large and reading it all at once stalls here
        pageSize = 10
        readTimes = self.trainedGameCountInDB // pageSize  # use // because / does true division in Python
        if self.trainedGameCountInDB % pageSize != 0:  # don't miss the remaining games
            readTimes += 1
        for pageIndex in range(readTimes):
            gameDatas = Util.readGameFromDB(offset=pageIndex * pageSize, size=pageSize, readAll=False, type='train')
            for i in range(len(gameDatas)):
                gameData = gameDatas[i]
                # print(gameData)
                states = self.toListOfNumpyArray(json.loads(gameData[1]))
                probabilities = self.toListOfNumpyArray(json.loads(gameData[2]))
                scores = np.array(json.loads(gameData[3]))
                stateProbScore = zip(states, probabilities, scores)
                stateProbScore = list(stateProbScore)[:]
                self.episodeSize = len(stateProbScore)
                # enlarge the training set with equivalent data
                stateProbScore = self.generateEquivalentData(stateProbScore)
                self.dataDeque.extend(stateProbScore)
                print("Train from DB Batch i:{}, episodeSize:{}".format(i + 1, self.episodeSize))
                if len(self.dataDeque) > self.trainBatchSize:
                    self.updatePolicy(type='from_db')
                    self.policyEvaluate(index=i,
                                        currentModelSavedPath=Util.getPathToSaveModel(False, True, True),
                                        willDoPolicyEvaluate=False)
                self.trainedGameCount += 1

    def run(self):
        """Run the training pipeline."""
        try:
            if self.modelPath is None:
                # if no model file is given, first train on the data stored in the database
                self.trainByDataFromDB()
            for i in range(self.trainedGameCount, self.gameBatchSize):
                self.collectOneSelfPlayData(self.playBatchSize)
                print("Batch i:{}, episodeSize:{}".format(i + 1, self.episodeSize))
                if len(self.dataDeque) > self.trainBatchSize:
                    self.updatePolicy(type='from_self_play')
                    self.policyEvaluate(i)
        except KeyboardInterrupt:
            print('\n\rquit')

    def policyEvaluate(self, index,
                       currentModelSavedPath=Util.getPathToSaveModel(False, True, False),
                       bestModelSavedPath=Util.getPathToSaveModel(False, False, False) + '_' + str(Util.readGameCount(type='train')),
                       willDoPolicyEvaluate=True):
        """Save the model params to file and check the performance of the current model."""
        self.policyValueNet.saveModel(currentModelSavedPath)
        if willDoPolicyEvaluate and (index + 1) % self.checkFrequency == 0:
            print("Self play batch: {}".format(index + 1))
            # there used to be a bug here: during evaluation start_player alternated between 0 and 1,
            # which let white move first, a situation training never produces; restricting the first
            # mover to black is entirely reasonable
            winRatio = self.doPolicyEvaluate()
            if winRatio >= self.maxWinRatio:  # changed > to >=
                print("New best policy with win ratio: {}".format(winRatio))
                self.maxWinRatio = winRatio
                # update the best model
                self.policyValueNet.saveModel(bestModelSavedPath)
                if self.maxWinRatio == 1.0 and self.pureMctsPlayoutTimes < self.maxPureMctsPlayoutTimes:
                    self.pureMctsPlayoutTimes += self.pureMctsPlayoutTimesAddend
                    self.maxWinRatio = 0.0
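A minimal launch sketch for the pipeline above; the entry point is illustrative and not part of the original source (pass a modelPath to resume from a saved checkpoint instead of bootstrapping from the database):

if __name__ == '__main__':
    trainPipeline = TrainPipeline(modelPath=None)  # None: train on the DB games first, then continue with self-play
    trainPipeline.run()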
class TrainPipeline():
    def __init__(self, config=None):
        # params of the board and the game
        self.config = config if config else Config()
        # Network wrapper
        self.policy_value_net = PolicyValueNet(self.config.board_width, self.config.board_height,
                                               net_params=self.config.policy_param,
                                               Network=self.config.network)
        # pass in policy_value_net's predict method; the neural network guides the MCTS search
        self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                           nplays=self.config.n_playout, is_selfplay=True)

    def self_play(self, n_games=1):
        """
        collect self-play data for training
        n_games: play n_games self-play games before updating the network
        """
        self.episode_len = 0
        self.augmented_len = 0
        for i in range(n_games):
            winner, play_data, episode_len = self.config.game.start_self_play_game(self.mcts_player,
                                                                                   temp=self.config.temp)
            self.episode_len += episode_len  # episode_len is the number of moves played in this game
            # augment the data
            play_data = self.augment_data(play_data)
            self.augmented_len += len(play_data)
            self.config.data_buffer.extend(play_data)

    def optimize(self, iteration):
        """update the policy-value net"""
        mini_batch = random.sample(self.config.data_buffer, self.config.batch_size)
        state_batch, mcts_probs_batch, winner_batch = list(zip(*mini_batch))
        if self.config.is_adjust_lr and iteration % self.config.adjust_lr_freq == 0:
            old_probs, old_v = self.policy_value_net.predict_many(state_batch)  # used for adjusting lr
        for i in range(self.config.per_game_opt_times):  # number of optimization steps
            loss_info = self.policy_value_net.fit(state_batch, mcts_probs_batch, winner_batch,
                                                  self.config.learn_rate * self.config.lr_multiplier)
        if self.config.is_adjust_lr and iteration % self.config.adjust_lr_freq == 0:
            # adaptively adjust the learning rate
            self.adjust_learning_rate(old_probs, old_v, state_batch, winner_batch)
            # self.adjust_learning_rate_2(iteration)
        print("combined loss:{0:.5f}, value loss:{1:.5f}, policy loss:{2:.5f}, entropy:{3:.5f}".format(
            loss_info['combined_loss'], loss_info['value_loss'], loss_info['policy_loss'], loss_info['entropy']))
        return loss_info

    def adjust_learning_rate(self, old_probs, old_v, state_batch, winner_batch):
        '''
        reference paper: PPO (Proximal Policy Optimization)
        adjust the learning rate based on KL divergence
        '''
        new_probs, new_v = self.policy_value_net.predict_many(state_batch)
        kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))  # KL
        if kl > self.config.kl_targ * 2 and self.config.lr_multiplier > 0.1:
            # KL increased: the new move probability distribution deviates a lot from the original one,
            # which we don't want -- maybe due to a too-large lr
            self.config.lr_multiplier /= 1.5
        elif kl < self.config.kl_targ / 2 and self.config.lr_multiplier < 10:
            # KL decreased: learning is very stable but slow
            self.config.lr_multiplier *= 1.5
        explained_var_old = 1 - np.var(np.array(winner_batch) - old_v.flatten()) / np.var(np.array(winner_batch))
        explained_var_new = 1 - np.var(np.array(winner_batch) - new_v.flatten()) / np.var(np.array(winner_batch))
        print("kl:{:.5f},lr:{:.7f},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
            kl, self.config.learn_rate * self.config.lr_multiplier, explained_var_old, explained_var_new))

    def adjust_learning_rate_2(self, iteration):
        '''decay schedule'''
        if (iteration + 1) % self.config.lr_decay_per_iterations == 0:
            self.config.lr_multiplier /= self.config.lr_decay_speed
            print("lr:{}".format(self.config.learn_rate * self.config.lr_multiplier))

    def evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                              nplays=self.config.n_playout)
        if self.config.evaluate_opponent == 'Pure':  # opponent is the RolloutPlayer
            print("Begin evaluation, Opponent is RolloutMCTSPlayer")
            opponent_mcts_player = RolloutPlayer(c_puct=5, nplays=self.config.pure_mcts_playout_num)
        else:  # opponent is an AlphaZeroPlayer
            print("Begin evaluation, Opponent is AlphaZeroMCTSPlayer")
            opponent_mcts_player = load_current_best_player(self.config.cur_best_alphazero_store_filename)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print("evaluate game %d" % i)
            winner = self.config.game.start_game(current_mcts_player, opponent_mcts_player,
                                                 who_first=i % 2, is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.config.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def augment_data(self, play_data):
        """
        augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            '''
            state: 3*3 board's moves like:
                6 7 8
                3 4 5
                0 1 2
            mcts_prob: flattened over 0,1,2,3,4,5,6,7,8
            winner: 1 or -1
            '''
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise (i=4 reproduces the original data)
                equi_state = np.array([np.rot90(s, i) for s in state])
                # flip vertically into board shape; each cell holds the move probability for that position
                equi_mcts_prob = np.rot90(np.flipud(mcts_prob.reshape(self.config.board_height,
                                                                      self.config.board_width)), i)
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally, keeping equi_mcts_prob consistent with equi_state
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def save_model(self, win_ratio, epochs):
        # save the model if necessary:
        # - if the opponent is the Rollout player, require win_ratio > best_win_pure_so_far
        # - if the opponent is the strongest Rollout player, require win_ratio == 1.0
        # - else (AlphaZero opponent), require win_ratio >= win_ratio_alphazero
        if (self.config.evaluate_opponent == 'Pure' and win_ratio > self.config.best_win_pure_so_far) or \
                (self.config.evaluate_opponent == 'Pure' and self.config.pure_mcts_playout_num == 5000 and win_ratio == 1.0) or \
                (self.config.evaluate_opponent == 'AlphaZero' and win_ratio >= self.config.win_ratio_alphazero):
            print("New best policy!!!!!!!!")
            # load network parameters
            self.config.policy_param = self.policy_value_net.get_policy_param()  # get model params
            self.config.cur_best_alphazero_store_filename = "tmp/epochs-{0}-opponent-{1}-win-{2:.2f}.pkl".format(
                epochs, self.config.evaluate_opponent, win_ratio)
            pickle.dump(self.config, open(self.config.cur_best_alphazero_store_filename, 'wb'))
            pickle.dump(self.config, open(self.config.local_model_path + self.config.cur_best_alphazero_store_filename, 'wb'))

        # --------------- Adjust Opponent ---------------------#
        # First, make the Rollout player stronger (increase pure_mcts_playout_num).
        # Second, when the Rollout player is already the strongest version (mcts_num=5000) but still loses
        # change_opponent_continuous_times times in a row, switch the opponent to the AlphaZero player.

        # if the opponent is the RolloutPlayer, make it stronger
        if self.config.evaluate_opponent == 'Pure' and win_ratio > self.config.best_win_pure_so_far:
            if win_ratio == 1.0 and self.config.pure_mcts_playout_num < 5000:
                self.config.pure_mcts_playout_num += 1000  # stronger
                self.config.best_win_pure_so_far = 0.0  # reset win_ratio
        # current model continuously wins (or ties) against the strongest pure MCTS player (mcts_playout >= 5000)
        if self.config.evaluate_opponent == 'Pure' and self.config.pure_mcts_playout_num >= 5000 \
                and win_ratio == 1.0:  # note: includes equality
            self.config.continuous_win_pure_times += 1
        # change the opponent
        if self.config.evaluate_opponent == 'Pure' and \
                self.config.continuous_win_pure_times >= self.config.change_opponent_continuous_times:
            print('Change Opponent:AlphaZero')
            self.config.evaluate_opponent = 'AlphaZero'

    def check_loss_change(self):
        '''
        Check the loss every self.config.check_freq steps.
        Record the current minimum of the mean loss over each check_freq window.
        If the mean loss fails to decrease relative to that minimum for
        adjust_lr_increase_loss_times windows in a row, shrink the initial learn_rate.
        '''
        combined_loss_list = [loss['combined_loss'] for loss in self.config.loss_records]
        last_check_freq_mean_loss = np.mean(combined_loss_list[-self.config.check_freq:])
        if self.config.min_mean_loss_every_check_freq is None or \
                last_check_freq_mean_loss < self.config.min_mean_loss_every_check_freq:
            if self.config.min_mean_loss_every_check_freq is not None:
                print('decrease loss by {0:.4f}'.format(
                    self.config.min_mean_loss_every_check_freq - last_check_freq_mean_loss))
            self.config.min_mean_loss_every_check_freq = last_check_freq_mean_loss  # update the minimum
            self.config.increase_mean_loss_times = 0  # reset to zero
        else:
            print('increase loss by {0:.4f}'.format(
                last_check_freq_mean_loss - self.config.min_mean_loss_every_check_freq))
            self.config.increase_mean_loss_times += 1
            if self.config.increase_mean_loss_times >= self.config.adjust_lr_increase_loss_times:
                self.config.learn_rate /= 10  # shrink the initial lr by a factor of 10
                self.config.kl_targ /= 10  # shrink kl_targ too, so that the lr tends to stay smaller
                # self.config.increase_mean_loss_times = 0  # reset again
                print('decrease init lr by 10x, now init lr is {0:.5f}'.format(self.config.learn_rate))

    def run(self):
        """run the training pipeline"""
        print("start training from game:{}".format(self.config.start_game_num))
        try:
            for i in range(self.config.start_game_num, self.config.game_batch_num):
                self.self_play(self.config.play_batch_size)  # big step 1
                print("iteration i:{}, episode_len:{}, augmented_len:{}, current_buffer_len:{}".format(
                    i + 1, self.episode_len, self.augmented_len, len(self.config.data_buffer)))
                # episode_records is a newly added parameter, so check for old config files
                if not hasattr(self.config, "episode_records"):
                    setattr(self.config, "episode_records", [])
                self.config.episode_records.append(self.episode_len)
                if len(self.config.data_buffer) > self.config.batch_size:
                    loss_info = self.optimize(iteration=i + 1)  # big step 2
                    self.config.loss_records.append(loss_info)
                self.config.start_game_num = i + 1  # update so training can restart from here
                # check the performance of the current model, and save the model params
                if (i + 1) % self.config.check_freq == 0:
                    print("current iteration: {}".format(i + 1))
                    win_ratio = self.evaluate()  # big step 3
                    self.check_loss_change()  # check the loss, and adjust the initial lr if necessary
                    self.save_model(win_ratio, i + 1)
        except KeyboardInterrupt:
            print('\n\rquit')
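The vertical-flip bookkeeping in `augment_data` (and in `generateEquivalentData` above, which additionally carries a 4-direction move channel) is easy to get wrong, so here is a small self-contained check, not taken from the original source, that verifies the transformation on a 3x3 board with a one-hot probability vector:

import numpy as np

# Illustration of the 8-fold symmetry trick used by augment_data: the board plane is
# rotated directly, while the flat MCTS probability vector is reshaped and flipped
# vertically first (move index 0 is the bottom-left corner), transformed the same way,
# then flipped back and flattened.
H = W = 3
move = 1                           # bottom row, middle column in the 0..8 move numbering
plane = np.zeros((H, W))           # board plane with a single stone at `move`
plane[H - 1 - move // W, move % W] = 1.0
probs = np.zeros(H * W)            # one-hot "MCTS probabilities" for the same move
probs[move] = 1.0

for i in [1, 2, 3, 4]:             # four counterclockwise rotations (i=4 is the identity)
    equi_plane = np.rot90(plane, i)
    equi_probs = np.rot90(np.flipud(probs.reshape(H, W)), i)
    restored = np.flipud(equi_probs).flatten()
    # the transformed probability mass must sit on the same square as the transformed stone
    r, c = np.argwhere(equi_plane == 1.0)[0]
    assert restored[(H - 1 - r) * W + c] == 1.0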
class TrainPipeline():
    def __init__(self):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = ShogiBoard()
        # training params
        self.learn_rate = 5e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.025
        self.check_freq = 50
        self.game_batch_num = 3000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure MCTS, which is used as the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        # start training from a given policy-value net
        # policy_param = pickle.load(open('current_policy.model', 'rb'))
        # self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params=policy_param)
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct,
                                      n_playout=self.n_playout, is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, stepCounts, play_data = start_self_play(self.board, self.mcts_player, temp=self.temp)
            print("train-collect_selfplay_data: winner = %d" % winner)
            self.episode_len = stepCounts
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(state_batch, mcts_probs_batch, winner_batch,
                                                             self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
        explained_var_old = 1 - np.var(np.array(winner_batch) - old_v.flatten()) / np.var(np.array(winner_batch))
        explained_var_new = 1 - np.var(np.array(winner_batch) - new_v.flatten()) / np.var(np.array(winner_batch))
        print("kl:{:.5f},lr_multiplier:{:.3f},loss:{},entropy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}"
              .format(kl, self.lr_multiplier, loss, entropy, explained_var_old, explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print("train-policy_evaluate: game = %d" % i)
            winner = start_play(self.board, current_mcts_player, pure_mcts_player, startPlayer=i % 2)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            curTime = datetime.datetime.now()
            writeTrainingLog("train-run: {}".format(curTime))
            for i in range(self.game_batch_num):
                print("train-run: train round %d" % i)
                writeTrainingLog("train-run: train round %d" % i)
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(i + 1, self.episode_len))
                writeTrainingLog("batch i:{}, episode_len:{}".format(i + 1, self.episode_len))
                print("train-run: len of data_buffer = {}".format(len(self.data_buffer)))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model, and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    writeTrainingLog("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    net_params = self.policy_value_net.get_policy_param()  # get model params
                    pickle.dump(net_params, open('current_policy.model', 'wb'))  # save model params to file
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        writeTrainingLog("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        pickle.dump(net_params, open('best_policy.model', 'wb'))  # update the best_policy
                        if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
            writeTrainingLog('\n\rquit')
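A minimal launch sketch for the pipeline above; the `__main__` guard is illustrative and not part of the original source:

if __name__ == '__main__':
    training_pipeline = TrainPipeline()
    training_pipeline.run()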