예제 #1
0
 def __init__(self):
     """Create the board, set the training hyper-parameters, and build a
     fresh policy-value network together with its self-play MCTS player."""
     # -- board / game geometry --
     self.board_width = self.board_height = 6
     self.n_in_row = 4
     self.board = ShogiBoard()

     # -- optimisation settings --
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adjusted adaptively from the KL divergence
     self.kl_targ = 0.025
     self.epochs = 5  # number of train steps per update
     self.batch_size = 512  # mini-batch size used for training

     # -- self-play / tree-search settings --
     self.temp = 1.0  # temperature parameter
     self.n_playout = 400  # simulations per move
     self.c_puct = 5
     self.play_batch_size = 1

     # -- replay buffer (bounded by buffer_size) --
     self.buffer_size = 10000
     self.data_buffer = deque(maxlen=self.buffer_size)

     # -- schedule and evaluation --
     self.check_freq = 50
     self.game_batch_num = 3000
     self.best_win_ratio = 0.0
     # Simulation budget of the pure-MCTS opponent used to evaluate the
     # trained policy.
     self.pure_mcts_playout_num = 1000

     # Training starts from a brand-new policy-value net. To resume from
     # saved parameters instead, load them (e.g. from
     # 'current_policy.model') and pass them to PolicyValueNet.
     self.policy_value_net = PolicyValueNet()
     self.mcts_player = MCTSPlayer(
         self.policy_value_net.policy_value_fn,
         c_puct=self.c_puct,
         n_playout=self.n_playout,
         is_selfplay=1,
     )
예제 #2
0
    def __init__(self, config=None):
        """Build the network wrapper and the self-play player from a config.

        :param config: configuration object; when it is falsy (e.g. ``None``)
            a default ``Config()`` is created and used instead.
        """
        if not config:
            config = Config()
        self.config = config
        cfg = self.config

        # Wrapper around the policy-value network described by the config.
        self.policy_value_net = PolicyValueNet(
            cfg.board_width,
            cfg.board_height,
            net_params=cfg.policy_param,
            Network=cfg.network,
        )

        # The network's predict method is handed to the player so that the
        # neural network can assist the MCTS search.
        self.mcts_player = AlphaZeroPlayer(
            self.policy_value_net.predict,
            c_puct=cfg.c_puct,
            nplays=cfg.n_playout,
            is_selfplay=True,
        )
예제 #3
0
def run(config=None):
    """Play one human-vs-AlphaZero game.

    :param config: configuration object providing ``board_width``,
        ``board_height``, ``n_in_row``, ``network``, ``policy_param`` and
        ``c_puct``. When ``None``, it is loaded from
        ``root_data_file + 'resnet_6_6_4.model'``.

    A ``KeyboardInterrupt`` raised while setting up or playing is caught and
    reported as ``quit``.
    """
    # Identity test: ``== None`` would go through any custom ``__eq__``.
    if config is None:
        config = load_config(file_name=root_data_file + 'resnet_6_6_4.model',
                             only_load_param=True)
    try:
        board = Board(width=config.board_width,
                      height=config.board_height,
                      n_in_row=config.n_in_row)
        game = Game(board)

        # --------------- human VS AI ----------------
        best_policy = PolicyValueNet(
            config.board_width,
            config.board_height,
            Network=config.network,
            net_params=config.policy_param
        )  # setup which Network to use based on the net_params

        mcts_player = AlphaZeroPlayer(
            best_policy.predict,
            c_puct=config.c_puct,
            nplays=100,
            add_noise=True)  # set larger nplays for better performance

        # uncomment the following line to play with pure MCTS
        # mcts_player2 = RolloutPlayer(nplays=1000, c_puct=config.c_puct)

        # human player, input your move in the format: 2,3
        human = HumanPlayer()

        # set who_first=0 for human first
        game.start_game(human, mcts_player, who_first=1, is_shown=1)

    except KeyboardInterrupt:
        print('\n\rquit')
예제 #4
0
    def __init__(self, config=None):
        """Build the network wrapper and the self-play player from a config.

        :param config: configuration object; when it is falsy (e.g. ``None``)
            a default ``Config()`` is created and used instead.
        """
        # params of the board and the game
        self.config = config if config else Config()
        if not hasattr(self.config, "use_gpu"):
            # Compatibility with old-version configs that predate ``use_gpu``.
            # The attribute must be set on ``self.config``: the raw ``config``
            # argument may be None when the default Config() is in use.
            setattr(self.config, "use_gpu", False)
        # Network wrapper
        self.policy_value_net = PolicyValueNet(
            self.config.board_width,
            self.config.board_height,
            net_params=self.config.policy_param,
            Network=self.config.network,
            use_gpu=self.config.use_gpu)

        # Pass the network's predict function to the player for use during
        # MCTS simulation.
        self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict,
                                           c_puct=self.config.c_puct,
                                           nplays=self.config.n_playout,
                                           is_selfplay=True)
예제 #5
0
 def __init__(self, modelPath=None):
     """Set up the game, the training parameters, the network and the player.

     :param modelPath: path of a saved model to continue from; when ``None``
         the network is created without a model file and the "from DB"
         log path is used.
     """
     # Board and game
     self.boardWidth = self.boardHeight = 4
     self.game = Game()

     # Training parameters
     self.learningRate = 5e-3
     self.learningRateMultiplier = 1.0  # adaptive; default if nothing is stored
     multiplierSource = 'from_self_play' if modelPath is not None else 'from_db'
     try:
         storedMultiplier = Util.getNewestLearningRateMultiplier(type=multiplierSource)
         self.learningRateMultiplier = float(storedMultiplier)
     except Exception as error:
         # Best effort: report the problem and keep the default multiplier.
         print(str(error))
     self.temperature = 1.0  # temperature; see item 2 of 参考资料.txt
     self.playoutTimes = 500  # number of simulations
     # c_puct from the paper; see item 2 of 参考资料.txt
     self.polynomialUpperConfidenceTreesConstant = 5
     self.dataDequeSize = 10000
     self.trainBatchSize = 512  # mini-batch size for training
     # A bounded deque drops elements from the opposite end once full.
     self.dataDeque = deque(maxlen=self.dataDequeSize)
     self.playBatchSize = 1
     self.epochs = 5  # number of fitting steps per update
     self.KLDParam = 0.025
     self.checkFrequency = 1000  # was 100; raised because evaluation is slow
     self.gameBatchSize = 200000
     self.maxWinRatio = 0.0
     self.pureMctsPlayoutTimes = 1500  # started at 500
     self.pureMctsPlayoutTimesAddend = 500
     self.maxPureMctsPlayoutTimes = 3000

     # Bookkeeping of how many games have been trained on
     self.modelPath = modelPath
     self.trainedGameCountInDB = Util.readGameCount(type='train')
     # Number of game records deleted by a hacker; they cannot be recovered.
     self.lossDataCount = 12184

     resumed = modelPath is not None
     logPath = Util.getTrainLogPath(isFromDB=not resumed)
     if resumed:
         self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight,
                                              logPath=logPath, modelPath=modelPath)
         self.trainedGameCount = self.trainedGameCountInDB + self.lossDataCount
     else:
         self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight,
                                              logPath=logPath)
         self.trainedGameCount = self.lossDataCount

     self.zeroPlayer = ZeroPlayer(
         self.policyValueNet.policyValueFunction,
         polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
         playoutTimes=self.playoutTimes,
         isSelfPlay=1,
     )
예제 #6
0
def run(config=None):
    """Play one AlphaZero-vs-human game with visualization.

    :param config: configuration object providing ``board_width``,
        ``board_height``, ``n_in_row``, ``network``, ``policy_param`` and
        ``c_puct``. When ``None``, it is loaded from
        ``root_data_file + 'resnet_6_6_4.model'``.

    The game loop runs in a worker thread while ``game.show()`` runs in the
    calling thread. Any failure ends with ``quit`` being printed, as before,
    but genuine errors now print their traceback first instead of being
    silently discarded.
    """
    # Identity test: ``== None`` would go through any custom ``__eq__``.
    if config is None:
        config = load_config(file_name=root_data_file + 'resnet_6_6_4.model',
                             only_load_param=True)
    try:
        board = Board(width=config.board_width,
                      height=config.board_height,
                      n_in_row=config.n_in_row)

        #--------------------1.set player:alphazero VS human---------------------#
        best_policy = PolicyValueNet(
            config.board_width,
            config.board_height,
            Network=config.network,
            net_params=config.policy_param
        )  # setup which Network to use based on the net_params

        player1 = AlphaZeroPlayer(
            best_policy.predict, c_puct=config.c_puct,
            nplays=1000)  # set larger nplays for better performance

        # uncomment the following line to play with pure MCTS
        #player2 = RolloutPlayer(nplays=1000, c_puct=config.c_puct)
        player2 = HumanPlayer()
        # --------------------2.set order---------------------#
        who_first = 0  # 0 means player1 first, otherwise player2 first

        # --------------------3.start game--------------------#
        game = Game(board, is_visualize=True)
        t = threading.Thread(target=game.start_game,
                             args=(player1, player2, who_first))
        t.start()
        game.show()

    except (KeyboardInterrupt, SystemExit):
        # Deliberate exit requested by the user or by the UI.
        print('\n\rquit')
    except Exception:
        # Keep the original "never crash" behaviour, but surface the cause.
        import traceback
        traceback.print_exc()
        print('\n\rquit')
예제 #7
0
class TrainPipeline:
    """Training pipeline: collect game data, update the network, evaluate it.

    Data comes either from games stored in the database
    (``trainByDataFromDB``) or from self-play (``collectOneSelfPlayData``);
    ``run`` ties the steps together.
    """

    def __init__(self, modelPath=None):
        """Set up the game, the training parameters, the network and the player.

        :param modelPath: path of a saved model to continue from; when
            ``None`` the network is created without a model file and ``run``
            first trains on the games stored in the database.
        """
        # Board and game
        self.boardWidth = 4
        self.boardHeight = 4
        self.game = Game()
        # Training parameters
        self.learningRate = 5e-3
        self.learningRateMultiplier = 1.0  # adaptive
        try:
            self.learningRateMultiplier = float(Util.getNewestLearningRateMultiplier(type='from_db' if modelPath is None else 'from_self_play'))
        except Exception as e:
            # Best effort: report the problem and keep the default multiplier.
            print(str(e))
        self.temperature = 1.0  # temperature; see item 2 of 参考资料.txt
        self.playoutTimes = 500  # number of simulations
        self.polynomialUpperConfidenceTreesConstant = 5  # c_puct from the paper; see item 2 of 参考资料.txt
        self.dataDequeSize = 10000
        self.trainBatchSize = 512  # mini-batch size for training
        self.dataDeque = deque(maxlen=self.dataDequeSize)  # once full, elements are dropped from the opposite end
        self.playBatchSize = 1
        self.epochs = 5  # number of fitting steps per update
        self.KLDParam = 0.025
        self.checkFrequency = 1000  # was 100; raised because evaluation is slow
        self.gameBatchSize = 200000
        self.maxWinRatio = 0.0
        self.pureMctsPlayoutTimes = 1500  # started at 500
        self.pureMctsPlayoutTimesAddend = 500
        self.maxPureMctsPlayoutTimes = 3000
        self.modelPath = modelPath
        self.trainedGameCountInDB = Util.readGameCount(type='train')
        self.lossDataCount = 12184  # game records deleted by a hacker; they cannot be recovered
        if modelPath is not None:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight, logPath=Util.getTrainLogPath(isFromDB=False), modelPath=modelPath)
            self.trainedGameCount = self.trainedGameCountInDB + self.lossDataCount
        else:
            self.policyValueNet = PolicyValueNet(self.boardWidth, self.boardHeight, logPath=Util.getTrainLogPath(isFromDB=True))
            self.trainedGameCount = 0 + self.lossDataCount
        self.zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                     polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                     playoutTimes=self.playoutTimes, isSelfPlay=1)

    def generateEquivalentData(self, stateProbScore):
        """Augment samples with rotations and horizontal flips.

        Every input sample yields 8 samples (4 rotations, each with and
        without a left-right flip); the 4th rotation reproduces the input.
        The score is a scalar for the whole position and is copied unchanged.

        :param stateProbScore: iterable of ``(states, mctsProbabilities, scores)``
            tuples; the probabilities must be reshapeable to
            ``(boardHeight, boardWidth, 4)``.
        :return: list of ``(states, flatProbabilities, scores)`` tuples.
        """
        # NOTE(review): only the spatial axes are rotated/flipped; the last
        # axis (4 move directions per point) is left as is. Confirm whether
        # the direction channels also need permuting for the move encoding.
        extendedData = []
        for states, probabilities, scores in stateProbScore:
            for i in [1, 2, 3, 4]:
                # Rotate counter-clockwise by i quarter turns.
                equivalentState = np.array([np.rot90(state, i) for state in states])
                # The trailing 4 is the number of move directions per point.
                # The grid is flipped vertically first because the board
                # state planes were stored flipped vertically.
                equivalentProbabilities = np.rot90(np.flipud(probabilities.reshape(self.boardHeight, self.boardWidth, 4)), i)
                extendedData.append((equivalentState, np.flipud(equivalentProbabilities).flatten(), scores))
                # Horizontal flip of the rotated sample.
                equivalentState = np.array([np.fliplr(state) for state in equivalentState])
                equivalentProbabilities = np.fliplr(equivalentProbabilities)
                extendedData.append((equivalentState, np.flipud(equivalentProbabilities).flatten(), scores))
        return extendedData

    def collectOneSelfPlayData(self, times=1):
        """Play ``times`` self-play games and add their augmented data to the buffer.

        Also records the length of the last game in ``self.episodeSize``.
        """
        for _ in range(times):
            _, stateProbScore = self.game.doOneSelfPlay(self.zeroPlayer, printMove=False, temperature=self.temperature)
            stateProbScore = list(stateProbScore)
            self.episodeSize = len(stateProbScore)
            # Enlarge the training set with equivalent (rotated/flipped) data.
            stateProbScore = self.generateEquivalentData(stateProbScore)
            self.dataDeque.extend(stateProbScore)

    def updatePolicy(self, type):
        """Train the network on one random mini-batch and adapt the learning rate.

        :param type: label stored with the update record (the callers in this
            class pass ``'from_db'`` or ``'from_self_play'``).
        :return: ``(loss, entropy)`` of the last training step.
        """
        batchSample = random.sample(self.dataDeque, self.trainBatchSize)
        batchState = [data[0] for data in batchSample]
        batchProbability = [data[1] for data in batchSample]
        batchScore = [data[2] for data in batchSample]
        oldLearningRateMultiplier = self.learningRateMultiplier
        oldLearningRate = oldLearningRateMultiplier * self.learningRate
        oldProbability, oldScore = self.policyValueNet.doPolicyValueFunction(batchState)
        for i in range(self.epochs):
            loss, entropy = self.policyValueNet.doOneTrain(batchState, batchProbability, batchScore, oldLearningRate)
            newProbability, newScore = self.policyValueNet.doPolicyValueFunction(batchState)
            Kullback_Leibler_Divergence = np.mean(np.sum(oldProbability * (np.log(oldProbability + 1e-10) - np.log(newProbability + 1e-10)), axis=1))
            if Kullback_Leibler_Divergence > self.KLDParam * 4:  # stop early if the KL divergence has grown too much
                break
        # Adapt the learning-rate multiplier, kept within (0.1, 10).
        if Kullback_Leibler_Divergence > self.KLDParam * 2 and self.learningRateMultiplier > 0.1:
            self.learningRateMultiplier /= 1.5
        elif Kullback_Leibler_Divergence < self.KLDParam / 2 and self.learningRateMultiplier < 10:
            self.learningRateMultiplier *= 1.5
        # Explained variance of the value head before and after the update.
        scoreArray = np.array(batchScore)
        scoreVariance = np.var(scoreArray)
        oldVariance = 1 - np.var(scoreArray - oldScore.flatten()) / scoreVariance
        newVariance = 1 - np.var(scoreArray - newScore.flatten()) / scoreVariance
        Util.savePolicyUpdate(uuid=uuid.uuid1(), KullbackLeiblerDivergence=Kullback_Leibler_Divergence, oldLearningRateMultiplier=oldLearningRateMultiplier, newLearningRateMultiplier=self.learningRateMultiplier, oldLearningRate=oldLearningRate, newLearningRate=self.learningRateMultiplier * self.learningRate, loss=loss, entropy=entropy, oldVariance=oldVariance, newVariance=newVariance, insertTime=Util.getTimeNowStr(), type=type)
        return loss, entropy

    def doPolicyEvaluate(self, times=10):
        """Evaluate the network by playing against a pure-MCTS player.

        This is only used to monitor training progress.

        :param times: number of evaluation games.
        :return: win ratio of the network player, a tie counting as half a win.
        """
        zeroPlayer = ZeroPlayer(self.policyValueNet.policyValueFunction,
                                polynomialUpperConfidenceTreesConstant=self.polynomialUpperConfidenceTreesConstant,
                                playoutTimes=self.playoutTimes)
        zeroPlayer.setName('AlphaZero_' + str(Util.readGameCount(type='train')))
        zeroPlayer.setNetworkVersion(1)
        purePlayer = PurePlayer(polynomialUpperConfidenceTreesConstant=5, playoutTimes=self.pureMctsPlayoutTimes)
        winTimes = defaultdict(int)
        for i in range(times):
            # startPlayer is always 0 (black moves first) instead of i % 2:
            # training only ever has black moving first, and the side to move
            # is one of the network inputs. The players swap seats instead.
            zeroPlaysBlack = (i % 2 == 0)
            if zeroPlaysBlack:
                winner = self.game.startPlay(zeroPlayer, purePlayer, startPlayer=0, printMove=1, type='evaluation')
            else:
                winner = self.game.startPlay(purePlayer, zeroPlayer, startPlayer=0, printMove=1, type='evaluation')
            if winner == -1:  # tie
                winTimes['tie'] += 1
            elif winner == 0:  # black wins
                winTimes['zero' if zeroPlaysBlack else 'pure'] += 1
            else:  # white wins
                winTimes['pure' if zeroPlaysBlack else 'zero'] += 1
        winRatio = 1.0 * (winTimes['zero'] + 0.5 * winTimes['tie']) / times
        print("PlayoutTimes:{}, win: {}, lose: {}, tie:{}".format(self.pureMctsPlayoutTimes, winTimes['zero'], winTimes['pure'], winTimes['tie']))
        return winRatio

    @staticmethod
    def toListOfNumpyArray(lst: list):
        """Convert every element of ``lst`` to a numpy array in place and return ``lst``."""
        for i, item in enumerate(lst):
            lst[i] = np.array(item)
        return lst

    def trainByDataFromDB(self):
        """Train on the games stored in the database, reading them page by page."""
        # Read in pages of ``pageSize`` games: the data set is large and
        # reading it all at once would stall here.
        pageSize = 10
        # Floor division, then one extra page for any remainder.
        readTimes = self.trainedGameCountInDB // pageSize
        if self.trainedGameCountInDB % pageSize != 0:
            readTimes += 1
        for pageIndex in range(readTimes):
            gameDatas = Util.readGameFromDB(offset=pageIndex * pageSize, size=pageSize, readAll=False, type='train')
            for i, gameData in enumerate(gameDatas):
                # Columns 1..3 hold JSON-encoded states, probabilities and scores.
                states = self.toListOfNumpyArray(json.loads(gameData[1]))
                probabilities = self.toListOfNumpyArray(json.loads(gameData[2]))
                scores = np.array(json.loads(gameData[3]))
                stateProbScore = list(zip(states, probabilities, scores))
                self.episodeSize = len(stateProbScore)
                # Enlarge the training set with equivalent (rotated/flipped) data.
                stateProbScore = self.generateEquivalentData(stateProbScore)
                self.dataDeque.extend(stateProbScore)
                # ``i`` is the index within the current page.
                print("Train from DB Batch i:{}, episodeSize:{}".format(i + 1, self.episodeSize))
                if len(self.dataDeque) > self.trainBatchSize:
                    self.updatePolicy(type='from_db')
                self.policyEvaluate(index=i, currentModelSavedPath=Util.getPathToSaveModel(False, True, True), willDoPolicyEvaluate=False)
                self.trainedGameCount += 1

    def run(self):
        """Run the training pipeline until ``gameBatchSize`` games or Ctrl-C."""
        try:
            if self.modelPath is None:  # no model file given: train on the database games first
                self.trainByDataFromDB()
            for i in range(self.trainedGameCount, self.gameBatchSize):
                self.collectOneSelfPlayData(self.playBatchSize)
                print("Batch i:{}, episodeSize:{}".format(i + 1, self.episodeSize))
                if len(self.dataDeque) > self.trainBatchSize:
                    self.updatePolicy(type='from_self_play')
                self.policyEvaluate(i)
        except KeyboardInterrupt:
            print('\n\rquit')

    def policyEvaluate(self, index, currentModelSavedPath=None,
                       bestModelSavedPath=None,
                       willDoPolicyEvaluate=True):
        """Save the current model and periodically evaluate it.

        :param index: zero-based batch index; evaluation happens when
            ``(index + 1)`` is a multiple of ``checkFrequency``.
        :param currentModelSavedPath: where to save the current model;
            ``None`` means ``Util.getPathToSaveModel(False, True, False)``.
        :param bestModelSavedPath: where to save a new best model; ``None``
            means ``Util.getPathToSaveModel(False, False, False)`` suffixed
            with ``_<train game count>``.
        :param willDoPolicyEvaluate: set to ``False`` to only save the model.

        The default paths are resolved at call time. As argument defaults
        they were evaluated once when the class was defined, which froze the
        game count in the best-model file name.
        """
        if currentModelSavedPath is None:
            currentModelSavedPath = Util.getPathToSaveModel(False, True, False)
        self.policyValueNet.saveModel(currentModelSavedPath)
        if willDoPolicyEvaluate and (index + 1) % self.checkFrequency == 0:
            print("Self play batch: {}".format(index + 1))
            # Historical note: evaluation used to alternate the start player,
            # letting white move first, which never happens in training.
            # doPolicyEvaluate now always lets black move first.
            winRatio = self.doPolicyEvaluate()
            if winRatio >= self.maxWinRatio:  # ">=" rather than ">"
                print("New best policy with win ratio: {}".format(winRatio))
                self.maxWinRatio = winRatio
                # Update the best model.
                if bestModelSavedPath is None:
                    bestModelSavedPath = Util.getPathToSaveModel(False, False, False) + '_' + str(Util.readGameCount(type='train'))
                self.policyValueNet.saveModel(bestModelSavedPath)
                # Once the opponent is always beaten, make it stronger and
                # restart the win-ratio tracking.
                if self.maxWinRatio == 1.0 and self.pureMctsPlayoutTimes < self.maxPureMctsPlayoutTimes:
                    self.pureMctsPlayoutTimes += self.pureMctsPlayoutTimesAddend
                    self.maxWinRatio = 0.0
예제 #8
0
class TrainPipeline():
    """Config-driven training pipeline: self-play, optimisation, evaluation.

    All hyper-parameters and mutable training state (data buffer, loss
    records, learning-rate multiplier, ...) live on ``self.config``.
    """

    def __init__(self, config=None):
        """Build the network wrapper and the self-play player from a config.

        :param config: configuration object; when it is falsy (e.g. ``None``)
            a default ``Config()`` is created and used instead.
        """
        # params of the board and the game
        self.config = config if config else Config()

        # Network wrapper
        self.policy_value_net = PolicyValueNet(self.config.board_width, self.config.board_height,
                                               net_params=self.config.policy_param,
                                               Network=self.config.network)

        # Pass the network's predict method to the player so the neural
        # network can assist the MCTS search.
        self.mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                           nplays=self.config.n_playout, is_selfplay=True)

    def self_play(self, n_games=1):
        """Collect self-play data for training.

        :param n_games: number of self-play games to play before the network
            is updated.

        Sets ``self.episode_len`` (total moves played) and
        ``self.augmented_len`` (samples added after augmentation).
        """
        self.episode_len = 0
        self.augmented_len = 0
        for _ in range(n_games):
            winner, play_data, episode_len = self.config.game.start_self_play_game(self.mcts_player, temp=self.config.temp)
            self.episode_len += episode_len  # number of moves in this game
            # augment the data
            play_data = self.augment_data(play_data)
            self.augmented_len += len(play_data)
            self.config.data_buffer.extend(play_data)

    def optimize(self, iteration):
        """Update the policy-value net on one random mini-batch.

        :param iteration: 1-based iteration number; the learning rate is
            adjusted when it is a multiple of ``config.adjust_lr_freq`` and
            ``config.is_adjust_lr`` is set.
        :return: the loss-info dict of the last ``fit`` call.
        """
        mini_batch = random.sample(self.config.data_buffer, self.config.batch_size)
        state_batch, mcts_probs_batch, winner_batch = list(zip(*mini_batch))

        adjust_lr = self.config.is_adjust_lr and iteration % self.config.adjust_lr_freq == 0
        if adjust_lr:
            # Predictions before training, needed for the KL-based adjustment.
            old_probs, old_v = self.policy_value_net.predict_many(state_batch)

        for _ in range(self.config.per_game_opt_times):  # number of opt times
            loss_info = self.policy_value_net.fit(state_batch, mcts_probs_batch, winner_batch,
                                                  self.config.learn_rate * self.config.lr_multiplier)
        if adjust_lr:
            # adaptively adjust the learning rate
            self.adjust_learning_rate(old_probs, old_v, state_batch, winner_batch)
            #self.adjust_learning_rate_2(iteration)

        print("combined loss:{0:.5f}, value loss:{1:.5f}, policy loss:{2:.5f}, entropy:{3:.5f}".
              format(loss_info['combined_loss'], loss_info['value_loss'], loss_info['policy_loss'], loss_info['entropy']))

        return loss_info

    def adjust_learning_rate(self, old_probs, old_v, state_batch, winner_batch):
        '''
        Adjust the learning-rate multiplier based on the KL divergence
        between the predictions before and after training.

        Reference paper: PPO (Proximal Policy Optimization).
        '''
        new_probs, new_v = self.policy_value_net.predict_many(state_batch)
        kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))  # KL
        if kl > self.config.kl_targ * 2 and self.config.lr_multiplier > 0.1:
            # Large KL: the new move distribution deviates a lot from the old
            # one, possibly because the learning rate is too large.
            self.config.lr_multiplier /= 1.5
        elif kl < self.config.kl_targ / 2 and self.config.lr_multiplier < 10:
            # Small KL: learning is very stable and slow.
            self.config.lr_multiplier *= 1.5

        winners = np.array(winner_batch)
        winner_var = np.var(winners)
        explained_var_old = 1 - np.var(winners - old_v.flatten()) / winner_var
        explained_var_new = 1 - np.var(winners - new_v.flatten()) / winner_var

        print("kl:{:.5f},lr:{:.7f},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
                kl, self.config.learn_rate * self.config.lr_multiplier, explained_var_old, explained_var_new))

    def adjust_learning_rate_2(self, iteration):
        '''Decay schedule: divide the multiplier every lr_decay_per_iterations iterations.'''
        if (iteration+1) % self.config.lr_decay_per_iterations == 0:
            self.config.lr_multiplier /= self.config.lr_decay_speed
        print ("lr:{}".format(self.config.learn_rate * self.config.lr_multiplier))

    def evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the opponent
        selected by ``config.evaluate_opponent`` (pure MCTS or the stored
        best AlphaZero player).
        Note: this is only for monitoring the progress of training

        :return: win ratio, a tie (winner ``-1``) counting as half a win.
        """
        current_mcts_player = AlphaZeroPlayer(self.policy_value_net.predict, c_puct=self.config.c_puct,
                                              nplays=self.config.n_playout)

        if self.config.evaluate_opponent == 'Pure':
            # opponent is the rollout player
            print("Begin evaluation, Opponent is RolloutMCTSPlayer")
            opponent_mcts_player = RolloutPlayer(c_puct=5, nplays=self.config.pure_mcts_playout_num)
        else:
            # opponent is an AlphaZero player
            print("Begin evaluation, Opponent is AlphaZeroMCTSPlayer")
            opponent_mcts_player = load_current_best_player(self.config.cur_best_alphazero_store_filename)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            print ("evaluate game %d" %i)
            winner = self.config.game.start_game(current_mcts_player, opponent_mcts_player, who_first=i % 2, is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(self.config.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
                                                                  win_cnt[-1]))
        return win_ratio

    def augment_data(self, play_data):
        """
        Augment the data set by rotation and flipping.

        :param play_data: [(state, mcts_prob, winner_z), ..., ...]
        :return: list with 8 samples per input sample (4 rotations, each
            with and without a horizontal flip).
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            # state: moves of e.g. a 3*3 board are laid out as
            #     6 7 8
            #     3 4 5
            #     0 1 2
            # mcts_porb: flattened, 0,1,2,3,4,5,6,7,8
            # winner: 1 or -1
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise; i=4 gives back the original data
                equi_state = np.array([np.rot90(s, i) for s in state])
                # Flip vertically into board layout so each cell holds the
                # move probability of that position, then rotate.
                equi_mcts_prob = np.rot90(np.flipud(mcts_porb.reshape(self.config.board_height, self.config.board_width)),
                                          i)
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally; probabilities follow the state
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def save_model(self, win_ratio, epochs):
        """Save the config (including network params) if the model improved,
        then strengthen or switch the evaluation opponent.

        :param win_ratio: win ratio returned by ``evaluate``.
        :param epochs: iteration number used in the stored file name.
        """
        # save model if necessary
        # if opponent is Rollout Player, then win_ratio > best_win_pure_so_far
        # if opponent is the Strongest Rollout Player, then win_ratio must be 1.0
        # else win_ratio >= win_ratio_alphazero
        if (self.config.evaluate_opponent == 'Pure' and win_ratio > self.config.best_win_pure_so_far) or \
                (self.config.evaluate_opponent == 'Pure' and self.config.pure_mcts_playout_num == 5000 and win_ratio == 1.0) or \
                (self.config.evaluate_opponent == 'AlphaZero' and win_ratio >= self.config.win_ratio_alphazero):

            print("New best policy!!!!!!!!")
            # store the current network parameters on the config
            self.config.policy_param = self.policy_value_net.get_policy_param()  # get model params

            self.config.cur_best_alphazero_store_filename = "tmp/epochs-{0}-opponent-{1}-win-{2:.2f}.pkl".format(epochs,
                                                                                                                 self.config.evaluate_opponent,
                                                                                                                 win_ratio)
            # ``with`` closes (and flushes) each file even if dumping fails.
            with open(self.config.cur_best_alphazero_store_filename, 'wb') as store_file:
                pickle.dump(self.config, store_file)
            with open(self.config.local_model_path + self.config.cur_best_alphazero_store_filename, 'wb') as store_file:
                pickle.dump(self.config, store_file)

        #---------------Adjust Opponent---------------------#
        # Firstly, make the rollout player stronger (increase pure_mcts_playout_num).
        # Secondly, once the strongest rollout player (mcts_num=5000) has been
        # beaten config.change_opponent_continuous_times times, change the
        # opponent to the AlphaZero player.

        # if opponent is RolloutPlayer, then make it stronger
        if self.config.evaluate_opponent =='Pure' and win_ratio > self.config.best_win_pure_so_far:
            if win_ratio == 1.0 and self.config.pure_mcts_playout_num < 5000:
                self.config.pure_mcts_playout_num += 1000  # stronger
                self.config.best_win_pure_so_far = 0.0  # reset win_ratio

        # current model wins against the strongest pure mcts player (mcts_play_out>=5000)
        if self.config.evaluate_opponent == 'Pure' and self.config.pure_mcts_playout_num >= 5000 \
                and win_ratio == 1.0:
            self.config.continuous_win_pure_times += 1

        # change the opponent
        if self.config.evaluate_opponent == 'Pure' and \
                self.config.continuous_win_pure_times  >= self.config.change_opponent_continuous_times:
            print ('Change Opponent:AlphaZero')
            self.config.evaluate_opponent = 'AlphaZero'

    def check_loss_change(self):
        '''
        Check the loss trend every config.check_freq steps.

        Tracks the minimum of the mean combined loss over the last
        config.check_freq records. When the mean fails to drop below that
        minimum config.adjust_lr_increase_loss_times times, the base learning
        rate and kl_targ are divided by 10. Does nothing when no loss has
        been recorded yet.
        '''
        combined_loss_list = [loss['combined_loss'] for loss in self.config.loss_records]
        if not combined_loss_list:
            # The mean of an empty list is NaN, which would poison the
            # running minimum (every later comparison would be False).
            return
        last_check_freq_mean_loss = np.mean(combined_loss_list[-self.config.check_freq:])
        if self.config.min_mean_loss_every_check_freq is None or \
                last_check_freq_mean_loss < self.config.min_mean_loss_every_check_freq:
            if self.config.min_mean_loss_every_check_freq is not None:
                print('decrease loss by {0:.4f}'.format(self.config.min_mean_loss_every_check_freq-last_check_freq_mean_loss))
            self.config.min_mean_loss_every_check_freq = last_check_freq_mean_loss  # update
            self.config.increase_mean_loss_times = 0  # reset to zero
        else:
            print('increase loss by {0:.4f}'.format(last_check_freq_mean_loss-self.config.min_mean_loss_every_check_freq))
            self.config.increase_mean_loss_times += 1

        if self.config.increase_mean_loss_times >= self.config.adjust_lr_increase_loss_times:
            self.config.learn_rate /= 10  # decrease the base lr tenfold
            self.config.kl_targ /= 10  # decrease kl_targ, so that the lr tends to be smaller
            # NOTE(review): the counter is not reset here, so every later
            # non-improving check divides the rate again - confirm intended.
            #self.config.increase_mean_loss_times = 0 # reset again
            print('decrease lr by 10x, now init lr is {0:.5f}'.format(self.config.learn_rate))

    def run(self):
        """Run the training pipeline until game_batch_num iterations or Ctrl-C."""
        print ("start training from game:{}".format(self.config.start_game_num))
        try:
            for i in range(self.config.start_game_num, self.config.game_batch_num):

                self.self_play(self.config.play_batch_size)  # big step 1
                print("iteration i:{}, episode_len:{}, augmented_len:{}, current_buffer_len:{}".format(i + 1,
                                                            self.episode_len, self.augmented_len, len(self.config.data_buffer)))
                # Newly added attribute: old config files do not have it.
                # It must be created on self.config (there is no local
                # ``config`` name in this method).
                if not hasattr(self.config, "episode_records"):
                    self.config.episode_records = []
                self.config.episode_records.append(self.episode_len)

                if len(self.config.data_buffer) > self.config.batch_size:
                    loss_info = self.optimize(iteration=i+1)  # big step 2
                    self.config.loss_records.append(loss_info)

                self.config.start_game_num = i + 1  # update for restart

                # check the performance of the current model, and save the model params
                if (i + 1) % self.config.check_freq == 0:
                    print("current iteration: {}".format(i + 1))
                    win_ratio = self.evaluate()  # big step 3
                    self.check_loss_change()  # check loss, and adjust init lr if necessary
                    self.save_model(win_ratio, i + 1)

        except KeyboardInterrupt:
            print('\n\rquit')
예제 #9
0
class TrainPipeline():
    """Self-play training pipeline for a policy-value network.

    The pipeline alternates between collecting self-play games with an
    MCTS player guided by the network, updating the network on mini-batches
    sampled from a replay buffer, and periodically evaluating the network
    against a pure-MCTS opponent while checkpointing its parameters.
    """

    def __init__(self):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = ShogiBoard()
        # training params
        self.learn_rate = 5e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.025
        self.check_freq = 50
        self.game_batch_num = 3000
        self.best_win_ratio = 0.0
        # step count of the most recent self-play game; initialised here so
        # run() can report it even when no game was played in a round
        self.episode_len = 0
        # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        # start training from a given policy-value net
        #        policy_param = pickle.load(open('current_policy.model', 'rb'))
        #        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    @staticmethod
    def _save_params(net_params, path):
        """Pickle ``net_params`` to ``path`` and close the file afterwards."""
        with open(path, 'wb') as model_file:
            pickle.dump(net_params, model_file)

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and add their data to the buffer.

        After each game ``self.episode_len`` holds that game's step count and
        the returned play data is appended to ``self.data_buffer``.
        """
        for _ in range(n_games):
            winner, stepCounts, play_data = start_self_play(self.board,
                                                            self.mcts_player,
                                                            temp=self.temp)
            print("train-collect_selfplay_data: winner = %d" % winner)
            self.episode_len = stepCounts
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """Update the policy-value net on one sampled mini-batch.

        Samples ``self.batch_size`` entries from the data buffer and runs up
        to ``self.epochs`` training steps, stopping early once the KL
        divergence between the pre-update and current policy outputs exceeds
        ``4 * self.kl_targ``. Afterwards ``self.lr_multiplier`` is scaled
        down or up by 1.5 depending on the final KL value.

        Returns:
            The ``(loss, entropy)`` pair reported by the last training step.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for _ in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            # mean KL(old || new) over the batch; 1e-10 keeps log() finite
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        # explained variance of the value predictions before/after the update
        winners = np.array(winner_batch)
        winners_var = np.var(winners)
        explained_var_old = 1 - np.var(winners - old_v.flatten()) / winners_var
        explained_var_new = 1 - np.var(winners - new_v.flatten()) / winners_var
        print(
            "kl:{:.5f},lr_multiplier:{:.3f},loss:{},entropy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}"
            .format(kl, self.lr_multiplier, loss, entropy, explained_var_old,
                    explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training

        Plays ``n_games`` games, alternating the starting player, and returns
        the win ratio where a result of 1 counts as a win and a result of 0
        counts as half a win (reported as a tie in the summary line).
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print("train-policy_evaluate: game = %d" % (i))
            winner = start_play(self.board,
                                current_mcts_player,
                                pure_mcts_player,
                                startPlayer=i % 2)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
        return win_ratio

    def run(self):
        """Run the training pipeline.

        For each of ``self.game_batch_num`` rounds: collect self-play data,
        update the network once the buffer holds more than one mini-batch,
        and every ``self.check_freq`` rounds evaluate the policy and write
        ``current_policy.model`` (plus ``best_policy.model`` when the win
        ratio improves). A ``KeyboardInterrupt`` ends training quietly.
        """
        try:
            writeTrainingLog("train-run: {}".format(datetime.datetime.now()))
            for i in range(self.game_batch_num):
                round_msg = "train-run: train round %d" % i
                print(round_msg)
                writeTrainingLog(round_msg)
                self.collect_selfplay_data(self.play_batch_size)
                batch_msg = "batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len)
                print(batch_msg)
                writeTrainingLog(batch_msg)
                print("train-run: len of data_buffer = {}".format(
                    len(self.data_buffer)))
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()
                # check the performance of the current model,and save the model params
                if (i + 1) % self.check_freq == 0:
                    check_msg = "current self-play batch: {}".format(i + 1)
                    print(check_msg)
                    writeTrainingLog(check_msg)
                    win_ratio = self.policy_evaluate()
                    net_params = self.policy_value_net.get_policy_param(
                    )  # get model params
                    # save model param to file
                    self._save_params(net_params, 'current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        writeTrainingLog("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self._save_params(net_params, 'best_policy.model')
                        # once the pure-MCTS opponent is beaten every game,
                        # strengthen it and restart the best-ratio tracking
                        if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
            writeTrainingLog('\n\rquit')