Example #1
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 10
        self.board_height = 10
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # 设置训练参数
        self.learn_rate = 2e-3  # base learning rate
        self.lr_multiplier = 1.2  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature parameter
        self.n_playout = 400  # number of MCTS simulations per move
        self.c_puct = 5  # coefficient balancing exploration and exploitation
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)  # deque used as a fixed-size replay buffer
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02  # KL target used for early stopping
        self.check_freq = 50  # evaluate the policy-value net every 50 batches
        self.game_batch_num = 2000  # total number of self-play batches to train
        self.best_win_ratio = 0.0  # current best win ratio, used to decide whether a better model was found
        # number of playouts for the weak pure-MCTS AI that serves as the opponent of the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start from the policy-value net given by init_model
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model,
                                                   use_gpu=True)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   use_gpu=True)
        # AI player; is_selfplay=1 because the training data comes from self-play
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    # augment the data set by rotation and flipping; play_data: [(state, mcts_prob, winner_z), ...]
    def get_equi_data(self, play_data):
        extend_data = []
        for state, mcts_porb, winner in play_data:
            # expand to the 4 rotations, each also flipped horizontally
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    # collect self-play data for training
    def collect_selfplay_data(self, n_games=1):
        for i in range(n_games):
            # play a self-play game with the MCTS player
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            # record how many moves were played
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    # update the policy-value net
    def policy_update(self):
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        # save old_probs and old_v before the update
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            # one training step; returns loss and entropy
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            # feed the state batch to get the new action probabilities and state values
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            # KL divergence between the old and new policy distributions
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate multiplier lr_multiplier
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    # evaluate the trained policy over 10 games against the pure-MCTS opponent; returns the win ratio (win=1, loss=0, tie=0.5)
    def policy_evaluate(self, n_games=10):
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            # play against the weak pure-MCTS AI without visualization (is_shown=0); alternate who moves first (start_player=i % 2)
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        # compute the win ratio; a tie counts as 0.5
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        # start training
        try:
            # run game_batch_num batches; each batch plays play_batch_size self-play games
            for i in range(self.game_batch_num):
                # collect self-play data
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model and save the best model
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    # save the current policy
                    self.policy_value_net.save_model('./current_policy.model')
                    if win_ratio > self.best_win_ratio:
                        print("发现新的最优策略,进行策略更新")
                        self.best_win_ratio = win_ratio
                        # update the best policy
                        self.policy_value_net.save_model('./best_policy.model')
                        if (self.best_win_ratio == 1.0
                                and self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
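
For context, a minimal launcher sketch for Example #1 (it assumes the usual imports of this pipeline, such as numpy, random, deque, defaultdict, Board, Game, PolicyValueNet, MCTSPlayer and MCTS_Pure, are present in the same module; the init_model path mentioned in the comment is only illustrative):

if __name__ == '__main__':
    # start from scratch, or pass e.g. init_model='./best_policy.model' to resume training
    training_pipeline = TrainPipeline(init_model=None)
    training_pipeline.run()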
Example #2
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 200
        self.game_batch_num = 5000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        #init_model = "best_policy.pt"
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model,
                                                   use_gpu=True)

        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   use_gpu=True)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=10):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.episode_len = len(play_data)
            # augment the data
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=15):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            print("running for game", i)
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate()
                    self.policy_value_net.save_model('./current_policy.model')
                    if win_ratio >= self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model('./best_policy.pt')
                        if (self.best_win_ratio == 1.0
                                and self.pure_mcts_playout_num < 5000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
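
All of these examples share the same get_equi_data augmentation. The standalone sketch below replays it on a toy 3x3 plane (numpy only; the flipud calls mirror the move-index convention assumed by the code above, in which row 0 of the flat probability vector corresponds to the bottom row of the board):

import numpy as np

plane = np.arange(9).reshape(3, 3)       # one toy state plane
probs = np.arange(9, dtype=float) / 36   # toy flat move probabilities (sum to 1)

samples = []
for i in [1, 2, 3, 4]:
    rot_plane = np.rot90(plane, i)                            # rotate the state plane
    rot_probs = np.rot90(np.flipud(probs.reshape(3, 3)), i)   # rotate the prob plane in board coordinates
    samples.append((rot_plane, np.flipud(rot_probs).flatten()))
    # the horizontally flipped copy of the same rotation
    samples.append((np.fliplr(rot_plane),
                    np.flipud(np.fliplr(rot_probs)).flatten()))
assert len(samples) == 8   # the 8 symmetries of a square board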
Example #3
class TrainPipeline():
    def __init__(self, init_model=None, board_width=6, board_height=6,
                 n_in_row=4, n_playout=400, use_gpu=False, is_shown=False,
                 output_file_name="", game_batch_number=1500):
        # params of the board and the game
        self.board_width = board_width
        self.board_height = board_height
        self.n_in_row = n_in_row
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3  # learning rate alpha: 0.002
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL divergence
        self.temp = 1.0  # the temperature parameter t
        self.n_playout = n_playout  # number of playouts simulated per move
        self.c_puct = 5  # the c_puct constant
        self.buffer_size = 10000  # replay buffer size
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 100
        self.game_batch_num = game_batch_number  # total number of training batches
        self.best_win_ratio = 0.0
        # pure MCTS parameters; the pure MCTS player
        # serves as the opponent for evaluating the trained model
        self.pure_mcts_playout_num = 7000  # number of playouts for the pure MCTS
        self.use_gpu = use_gpu
        self.is_shown = is_shown
        self.output_file_name = output_file_name  # suffix of the output txt file names
        # initialize the policy-value network
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model,
                                               use_gpu=self.use_gpu
                                               )
        #
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)


    def get_equi_data(self, play_data):
        """旋转和平移,增大本次play_data(因为同一个局面对应着其他三个等价局面)
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])  # rotate the board planes by 90*i degrees
                equi_mcts_prob = np.rot90(np.flipud(
                    mcts_porb.reshape(self.board_height, self.board_width)), i)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state,
                                    np.flipud(equi_mcts_prob).flatten(),
                                    winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """为训练收集self-play的数据"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]  # convert the zip object into a list
            self.episode_len = len(play_data)  # number of moves in this episode
            # augment the data with the equivalent symmetric positions
            play_data = self.get_equi_data(play_data)
            self.data_buffer.extend(play_data)  # store the augmented data in the buffer

    def policy_update(self):
        """使用训练结果取更新策略网络"""
        mini_batch = random.sample(self.data_buffer, self.batch_size) #从data_buffer中,随机获取batch_size个元素
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                    state_batch,
                    mcts_probs_batch,
                    winner_batch,
                    self.learn_rate*self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(np.sum(old_probs * (
                    np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                    axis=1)
            )
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}"
               ).format(kl,
                        self.lr_multiplier,
                        loss,
                        entropy,
                        explained_var_old,
                        explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the current policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=self.is_shown)
            win_cnt[winner] += 1
        win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
                self.pure_mcts_playout_num,
                win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """开始训练"""
        #打开两个txt文件
        with open("info/"+str(self.board)+"_loss_"+self.output_file_name+".txt",'w') as loss_file:
            loss_file.write("self-play次数,loss,entropy\n")
        with open("info/"+str(self.board)+"_win_ration"+self.output_file_name+".txt", 'w') as win_ratio_file:
            win_ratio_file.write("self-play次数, pure_MCTS战力, 胜率\n")
        
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)  # run one self-play game to completion and collect its data
                print("batch i:{}, episode_len (number of moves):{}".format(
                        i+1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                    with open("info/" + str(self.board) + "_loss_" + self.output_file_name + ".txt", 'a') as loss_file:
                        loss_file.write(str(i+1)+','+str(loss)+','+str(entropy)+'\n')
                # check the performance of the current model,
                # and save the model params
                if (i+1) % self.check_freq == 0:
                    print("当前的 self-play 对局: {}".format(i+1))
                    win_ratio = self.policy_evaluate()
                    with open("info/" + str(self.board) + "_win_ration" + self.output_file_name + ".txt",
                              'a') as win_ratio_file:
                        win_ratio_file.write(str(i+1)+','+str(self.pure_mcts_playout_num)+','+str(win_ratio)+'\n')
                    self.policy_value_net.save_model('./model/'+str(self.board_height)
                                                     +'_'+str(self.board_width)
                                                     +'_'+str(self.n_in_row)+
                                                     '_current_policy_'+self.output_file_name+'.model')
                    if win_ratio >= self.best_win_ratio:
                        print("更好的策略产生!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model('./model/'+str(self.board_height)
                                                     +'_'+str(self.board_width)
                                                     +'_'+str(self.n_in_row)+
                                                     '_best_policy_'+self.output_file_name+'.model')
                        if (self.best_win_ratio == 1.0 and
                                self.pure_mcts_playout_num < 50000):
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
        # note: the 'with' blocks above already closed these files, so these calls are redundant but harmless
        loss_file.close()
        win_ratio_file.close()
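
The KL-based learning-rate schedule is duplicated in several of these pipelines. A distilled helper, restated as a sketch from the logic above (numpy only; kl_targ and the 1.5 factor are the values used in the examples):

import numpy as np

def adjust_lr_multiplier(old_probs, new_probs, lr_multiplier, kl_targ=0.02):
    # mean KL divergence between the old and new policy distributions
    kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) -
                                     np.log(new_probs + 1e-10)), axis=1))
    if kl > kl_targ * 2 and lr_multiplier > 0.1:
        lr_multiplier /= 1.5   # the policy moved too far: shrink the step
    elif kl < kl_targ / 2 and lr_multiplier < 10:
        lr_multiplier *= 1.5   # the policy barely moved: grow the step
    return kl, lr_multiplier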
Example #4
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 8  #6
        self.board_height = 8  #6
        self.n_in_row = 5  #4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 5e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        # c_puct is the MCTS parameter controlling the exploration-exploitation trade-off:
        # larger values bias the search toward uniform exploration, smaller values toward the most-visited branches
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.025
        # how often to evaluate the current policy: every 50 training batches it plays against the pure-MCTS opponent,
        # and the current model is saved if a better policy is found
        self.check_freq = 50
        # number of training iterations
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
        # number of playouts for the pure MCTS opponent, initialized to 1000 (increased later during training)
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            policy_param = pickle.load(open(init_model, 'rb'))
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   net_params=policy_param)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """
        augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]"""
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data_zip2list = list(play_data)  # add by haward
            self.episode_len = len(play_data_zip2list)
            # augment the data
            play_data = self.get_equi_data(play_data_zip2list)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            # KL divergence (relative entropy): measures how different two probability distributions over the same event space are
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        # adjust the learning rate by comparing the KL divergence between the old and new network outputs
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = 1 - np.var(
            np.array(winner_batch) - old_v.flatten()) / np.var(
                np.array(winner_batch))
        explained_var_new = 1 - np.var(
            np.array(winner_batch) - new_v.flatten()) / np.var(
                np.array(winner_batch))
        print(
            "kl:{:.5f},lr_multiplier:{:.3f},loss:{},entropy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}"
            .format(kl, self.lr_multiplier, loss, entropy, explained_var_old,
                    explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        # compute the win ratio: 1 point per win, 0.5 per tie, 0 per loss, divided by the number of games
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                # check the performance of the current model and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    # play ten games against the pure MCTS player and compute the win ratio
                    win_ratio = self.policy_evaluate()
                    net_params = self.policy_value_net.get_policy_param(
                    )  # get model params
                    pickle.dump(
                        net_params, open('current_policy_8_8_5_new.model',
                                         'wb'),
                        pickle.HIGHEST_PROTOCOL)  # save model param to file
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        pickle.dump(
                            net_params,
                            open('best_policy_8_8_5_new.model', 'wb'),
                            pickle.HIGHEST_PROTOCOL)  # update the best_policy
                        # if the current policy-value net wins every game, increase the pure MCTS playouts and keep training
                        if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
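
The explained_var_old / explained_var_new numbers printed by policy_update are the usual explained-variance diagnostic for the value head. A standalone sketch of what the expressions above compute (1.0 means the value head predicts the game outcomes perfectly; values near 0 mean it is uninformative):

import numpy as np

def explained_variance(winner_batch, value_batch):
    winners = np.asarray(winner_batch, dtype=float)
    values = np.asarray(value_batch, dtype=float).flatten()
    return 1 - np.var(winners - values) / np.var(winners)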
Example #5
class TrainPipeline:
    def __init__(self, config: Config):
        # params of the game

        self.config = config
        self.game = Game.from_config(config)

        # training params

        self.buffer_size = 10000
        self.min_data_to_collect = 128
        self.batch_size = 128  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)

        self.save_freq = 20
        self.eval_freq = self.save_freq * 100
        self.num_total_iter = self.eval_freq * 4
        assert self.num_total_iter % self.save_freq == 0
        assert self.num_total_iter % self.eval_freq == 0

        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.policy_value_net = PolicyValueNet(self.config.size,
                                               model_file=config.model_file)
        self.mcts_player = MCTSPlayer(
            self.policy_value_net,
            c_puct=config.c_puct,
            n_playout=config.n_playout,
            is_selfplay=True,
        )

    def collect_selfplay_data(self):
        """collect self-play data for training"""
        n_game = 0

        while len(self.data_buffer) <= self.min_data_to_collect:
            _, play_data = self.game.start_self_play(self.mcts_player,
                                                     display=True)
            play_data = [(data[0], data[1], data[2]) for data in play_data]
            self.data_buffer.extend(play_data)
            n_game += 1

        return len(self.data_buffer), n_game

    def transform_data(self, board_input_batch, mcts_probs_batch):
        """rotate or flip the original data"""
        original_shape = (board_input_batch.shape, mcts_probs_batch.shape)

        for i in range(board_input_batch.shape[0]):
            n_rotate = np.random.randint(4)
            flip = np.random.randint(2)

            # skip samples that need neither a rotation nor a flip
            if not n_rotate and not flip:
                continue

            board_input = board_input_batch[i]
            # the spatial dimensions are the last two axes, matching the
            # rot90 call below which rotates axes (1, 2)
            board_size = board_input.shape[-1]
            mcts_probs = mcts_probs_batch[i]
            mcts_place_probs = np.reshape(
                mcts_probs[0:board_size * board_size],
                (board_size, board_size))
            mcts_pass_move_prob = mcts_probs[-1]

            if n_rotate:
                board_input = np.rot90(board_input, n_rotate, axes=(1, 2))
                mcts_place_probs = np.rot90(mcts_place_probs, n_rotate)
            if flip:
                # transpose only the spatial axes of the input planes
                board_input = np.transpose(board_input, (0, 2, 1))
                mcts_place_probs = mcts_place_probs.T

            # write the transformed sample back into the batch
            # (np.put would only overwrite a single flattened element)
            board_input_batch[i] = board_input
            mcts_probs_batch[i] = np.append(mcts_place_probs.flatten(),
                                            mcts_pass_move_prob)

        transformed_shape = (board_input_batch.shape, mcts_probs_batch.shape)
        assert original_shape == transformed_shape
        return board_input_batch, mcts_probs_batch

    def policy_update(self):
        """update the policy-value net"""
        random.shuffle(self.data_buffer)
        n_batchs = len(self.data_buffer) // self.batch_size

        for i in range(n_batchs):
            mini_batch = list(
                itertools.islice(self.data_buffer, i * self.batch_size,
                                 (i + 1) * self.batch_size))

            board_input_batch = np.array(
                [get_current_input(data[0]) for data in mini_batch])
            mcts_probs_batch = np.array([data[1] for data in mini_batch])
            winner_batch = np.array([data[2] for data in mini_batch])
            board_input_batch, mcts_probs_batch = self.transform_data(
                board_input_batch, mcts_probs_batch)

            old_probs, old_v = self.policy_value_net.policy_value(
                board_input_batch)

            self.policy_value_net.set_train_mode()

            loss, entropy = self.policy_value_net.train_step(
                board_input_batch, mcts_probs_batch, winner_batch)
            new_probs, new_v = self.policy_value_net.policy_value(
                board_input_batch)
            kl = np.mean(
                np.sum(
                    old_probs *
                    (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                    axis=1,
                ))

            explained_var_old = 1 - np.var(
                np.array(winner_batch) - old_v.flatten()) / np.var(
                    np.array(winner_batch))
            explained_var_new = 1 - np.var(
                np.array(winner_batch) - new_v.flatten()) / np.var(
                    np.array(winner_batch))
            print(("batch:{}, "
                   "kl:{:.5f}, "
                   "loss:{:.5f}, "
                   "entropy:{:.5f}, "
                   "explained_var_old:{:.3f}, "
                   "explained_var_new:{:.3f}").format(
                       i,
                       kl,
                       loss,
                       entropy,
                       explained_var_old,
                       explained_var_new,
                   ))

    def run(self):
        """run the training pipeline"""
        np.random.seed(0)
        try:
            for i in range(self.num_total_iter):
                n_data, n_game = self.collect_selfplay_data()

                print("iteration {}: total {} data collected from {} game(s)".
                      format(i, n_data, n_game))

                self.policy_value_net.set_train_mode()
                self.policy_update()
                self.data_buffer.clear()
                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.save_freq == 0:
                    print("saving current model at {}: file={}".format(
                        i + 1, self.config.get_current_model_name()))
                    self.policy_value_net.save_model(
                        self.config.get_current_model_name())
                if (i + 1) % self.eval_freq == 0:
                    print("evalutating current model: {}".format(i + 1))
                    current_mcts_player = MCTSPlayer(
                        self.policy_value_net,
                        c_puct=self.config.c_puct,
                        n_playout=self.config.n_playout,
                    )
                    win_ratio = evaluate.evaluate_policy(
                        self.game, current_mcts_player)
                    if win_ratio > self.best_win_ratio:
                        print(
                            "saving the new best policy at {}! win_ratio={}, file={}"
                            .format(
                                i,
                                win_ratio,
                                self.config.get_best_model_name(),
                            ))
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(
                            self.config.get_best_model_name())
        except KeyboardInterrupt:
            print("\n\rquit")
Example #6
class AlphaZeroAgent:
    def __init__(self, lr_factor=1, kl_datum=0.02):
        self.agent = PVNet(BOARD_SIZE)
        self.reply_buffer = deque(maxlen=REPLAY_BUFF_CAPACITY)
        self.lr_factor = lr_factor
        self.kl_datum = kl_datum

    def load(self, model_path):
        self.agent.restore_model(model_path)
        return self

    def self_play(self, n):
        res_lst, step_lst = [], []
        s_lst, c_lst, p_lst, z_lst = [], [], [], []
        for _ in range(n):
            step, res = 0, 0  # number of moves played and the game result
            mcts = Mcts(self.agent)
            while res == 0:  # game not finished
                mcts.simulate(SELF_PLAY_SIMULATION_NUM - 10)
                s_lst.append(mcts.board.current_state())
                c_lst.append(mcts.board.color)
                tau = TAU if step < 20 else TAU_INFINITE
                pi, act, idx = mcts.play(tau, with_noise=True)
                p_lst.append(pi)
                mcts.shift_root(act, idx)
                res = mcts.board.judge()
                step += 1

            res_lst.append(res)
            step_lst.append(step)
            # board result codes (consistent with the environment): -1 black wins, 0 game in progress, 1 white wins, 2 draw
            if res == 2:  # draw
                z = 0
            elif res == -1:  # black wins
                z = -1
            else:  # white wins
                z = 1
            z_lst.extend([z * color for color in c_lst])
        # logging
        logger.info('self_play(%d games) black vs white: [%d/%d] game lengths: %s' %
                    (n, res_lst.count(-1), res_lst.count(1), str(step_lst)))
        return zip(s_lst, p_lst, z_lst)

    @staticmethod
    def free_play(bord, agent):
        mcts = Mcts(agent, bord)
        mcts.simulate(FREE_PLAY_SIMULATION_NUM)
        _, act, _ = mcts.play(TAU_INFINITE)
        return act

    @staticmethod
    def eval(conn):
        """
        :param conn:
        :return:
        """

        players = [
            PVNet(BOARD_SIZE, device_str='cuda:1'),
            PVNet(BOARD_SIZE, device_str='cuda:1')
        ]
        while True:
            scores = [0, 0]
            model_paths = conn.recv()
            if model_paths is None:
                conn.close()
                break
            players[0].restore_model(model_paths[0])
            players[1].restore_model(model_paths[1])
            for i in range(EVAL_GAME_NUM):
                idx, _ = AlphaZeroAgent.duiyi(Board(), players, i % 2)
                if idx is not None:
                    scores[idx] += 1
            conn.send(scores)

    @staticmethod
    def duiyi(chess_board, players, pidx=0):
        """ 对弈
        :param chess_board: 初始棋盘
        :param players: 双方棋手
        :param pidx: 先手的索引: 0 or 1, 默认第一个棋手先走
        :return: 平局:None 否则返回胜者的索引
        """
        assert len(players) == 2

        act_lst = []
        board = chess_board.clone()
        res = board.judge()
        while res == 0:
            act = AlphaZeroAgent.free_play(board, players[pidx % 2])
            act_lst.append(act)
            pidx += 1
            row, col = board.action_1_to_2(act)
            board.move(row, col)
            res = board.judge()

        if res == 2:  # draw
            winner = None
        else:
            winner = (pidx + 1) % 2

        return winner, act_lst

    def self_play_training_pipeline(self):
        # self-play
        play_data_lst = _augment_play_data(self.self_play(SELF_PLAY_GAME_NUM))
        random.shuffle(play_data_lst)
        self.reply_buffer.extend(play_data_lst)
        if len(self.reply_buffer) < BATCH_SIZE:
            return

        # optimize
        for _ in range(TRAIN_BATCH_NUM):
            batch = random.sample(self.reply_buffer, BATCH_SIZE)
            states, probs, zs = zip(*batch)
            loss, kl, entropy = self.agent.train_step(states, probs, zs,
                                                      self.lr_factor * LR)

            # adaptively adjust the learning rate
            if kl > self.kl_datum * 1.5 and self.lr_factor > 0.01:
                self.lr_factor /= 1.5
            elif kl < self.kl_datum / 2 and self.lr_factor < 100:
                self.lr_factor *= 1.5

            logger.info('loss: %.3f kl: %.3f entropy: %.3f lr_factor: %.3f' %
                        (loss, kl, entropy, self.lr_factor))

        # slow window trick
        if len(self.reply_buffer) < self.reply_buffer.maxlen:
            [self.reply_buffer.popleft() for _ in range(2 * 8)]
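
In Example #6, self_play labels every recorded position with z * color, i.e. the final result seen from the perspective of the player to move. A standalone sketch of that labelling (the +1/-1 colour encoding of Board.color is an assumption here; the result codes follow the comment above):

def label_outcomes(colors, result):
    # result codes: -1 black wins, 1 white wins, 2 draw
    z = 0 if result == 2 else (-1 if result == -1 else 1)
    # +1 for positions where the player to move eventually won, -1 where they lost
    return [z * color for color in colors]

# e.g. black (color=-1) to move at step 0, white (+1) at step 1, and black wins:
assert label_outcomes([-1, 1], -1) == [1, -1]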
Example #7
class TrainPipeline():
    def __init__(self, init_model=None):

        self.writer = SummaryWriter(WRITER_DIR)

        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4

        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)

        self.game = Game(self.board)

        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param

        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02

        self.check_freq = 50
        self.game_batch_num = 5000

        self.improvement_counter = 1000
        self.best_win_ratio = 0.0

        self.input_plains_num = INPUT_PLANES_NUM

        self.c_puct = 5
        self.n_playout = 50  # num of simulations for each move
        self.shutter_threshold_availables = 1
        self.full_boards_selfplay = False

        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 200
        self.pure_mcts_playout_num_step = 200

        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(
                self.board_width,
                self.board_height,
                self.input_plains_num,
                model_file=init_model,
                shutter_threshold_availables=self.shutter_threshold_availables)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(
                self.board_width,
                self.board_height,
                self.input_plains_num,
                shutter_threshold_availables=self.shutter_threshold_availables)

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]
        """
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1, 2, 3, 4]:
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, i) for s in state])
                equi_mcts_prob = np.rot90(
                    np.flipud(
                        mcts_porb.reshape(self.board_height,
                                          self.board_width)), i)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def collect_selfplay_data(self, n_games=1):

        self.episode_len = 0
        self.episode_len_full_1 = 0
        self.episode_len_full_2 = 0
        self.episode_len_full_3 = 0
        self.episode_len_full_4 = 0
        self.episode_len_full_5 = 0

        if self.full_boards_selfplay:
            """collect self-play data for training"""
            for i in range(n_games):

                #EMPTY BOARD:
                winner, play_data = self.game.start_self_play(
                    self.mcts_player,
                    temp=self.temp,
                    is_last_move=(self.input_plains_num == 4),
                    start_player=i % 2 + 1)
                play_data = list(play_data)[:]
                self.episode_len += len(play_data) / n_games

                # augment the data
                play_data = self.get_equi_data(play_data)
                self.data_buffer.extend(play_data)

                if self.board_width == 6:
                    #BOARD 1 FULL
                    board = copy.deepcopy(BOARD_1_FULL[0])
                    board = np.flipud(board)
                    i_board_1 = np.zeros(
                        (2, self.board_width, self.board_height))
                    i_board_1[0] = board == 1
                    i_board_1[1] = board == 2

                    winner_full_1, play_data_full_1 = self.game.start_self_play(
                        self.mcts_player,
                        temp=self.temp,
                        is_last_move=(self.input_plains_num == 4),
                        initial_state=i_board_1)

                    play_data_full_1 = list(play_data_full_1)[:]
                    self.episode_len_full_1 += len(play_data_full_1) / n_games

                    # augment the data
                    play_data_full_1 = self.get_equi_data(play_data_full_1)
                    self.data_buffer.extend(play_data_full_1)

                    # BOARD 2 FULL
                    board = copy.deepcopy(BOARD_2_FULL[0])
                    board = np.flipud(board)
                    i_board_2 = np.zeros(
                        (2, self.board_width, self.board_height))
                    i_board_2[0] = board == 1
                    i_board_2[1] = board == 2

                    winner_full_2, play_data_full_2 = self.game.start_self_play(
                        self.mcts_player,
                        temp=self.temp,
                        is_last_move=(self.input_plains_num == 4),
                        initial_state=i_board_2)

                    play_data_full_2 = list(play_data_full_2)[:]
                    self.episode_len_full_2 += len(play_data_full_2) / n_games

                    # augment the data
                    play_data_full_2 = self.get_equi_data(play_data_full_2)
                    self.data_buffer.extend(play_data_full_2)

                else:
                    # BOARD 3 FULL
                    board = copy.deepcopy(BOARD_3_FULL[0])
                    board = np.flipud(board)
                    i_board_3 = np.zeros(
                        (2, self.board_width, self.board_height))
                    i_board_3[0] = board == 1
                    i_board_3[1] = board == 2

                    winner_full_3, play_data_full_3 = self.game.start_self_play(
                        self.mcts_player,
                        temp=self.temp,
                        is_last_move=(self.input_plains_num == 4),
                        initial_state=i_board_3)

                    play_data_full_3 = list(play_data_full_3)[:]
                    self.episode_len_full_3 += len(play_data_full_3) / n_games

                    # augment the data
                    play_data_full_3 = self.get_equi_data(play_data_full_3)
                    self.data_buffer.extend(play_data_full_3)

                    # BOARD 4 FULL
                    board = copy.deepcopy(BOARD_4_FULL[0])
                    board = np.flipud(board)
                    i_board_4 = np.zeros(
                        (2, self.board_width, self.board_height))
                    i_board_4[0] = board == 1
                    i_board_4[1] = board == 2

                    winner_full_4, play_data_full_4 = self.game.start_self_play(
                        self.mcts_player,
                        temp=self.temp,
                        is_last_move=(self.input_plains_num == 4),
                        initial_state=i_board_4)

                    play_data_full_4 = list(play_data_full_4)[:]
                    self.episode_len_full_4 += len(play_data_full_4) / n_games

                    # augment the data
                    play_data_full_4 = self.get_equi_data(play_data_full_4)
                    self.data_buffer.extend(play_data_full_4)

                    # BOARD 5 FULL
                    board = copy.deepcopy(BOARD_5_FULL[0])
                    board = np.flipud(board)
                    i_board_5 = np.zeros(
                        (2, self.board_width, self.board_height))
                    i_board_5[0] = board == 1
                    i_board_5[1] = board == 2

                    winner_full_5, play_data_full_5 = self.game.start_self_play(
                        self.mcts_player,
                        temp=self.temp,
                        is_last_move=(self.input_plains_num == 4),
                        initial_state=i_board_5)

                    play_data_full_5 = list(play_data_full_5)[:]
                    self.episode_len_full_5 += len(play_data_full_5) / n_games

                    # augment the data
                    play_data_full_5 = self.get_equi_data(play_data_full_5)
                    self.data_buffer.extend(play_data_full_5)

        else:
            for i in range(n_games):
                # EMPTY BOARD:
                winner, play_data = self.game.start_self_play(
                    self.mcts_player,
                    temp=self.temp,
                    is_last_move=(self.input_plains_num == 4),
                    start_player=i % 2 + 1)
                play_data = list(play_data)[:]
                self.episode_len += len(play_data) / n_games

                # augment the data
                play_data = self.get_equi_data(play_data)
                self.data_buffer.extend(play_data)

    def policy_update(self, iteration):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)

        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)

            new_probs, new_v = self.policy_value_net.policy_value(state_batch)

            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))

            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
            # adaptively adjust the learning rate

        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        explained_var_old = (1 -
                             np.var(np.array(winner_batch) - old_v.flatten()) /
                             np.var(np.array(winner_batch)))
        explained_var_new = (1 -
                             np.var(np.array(winner_batch) - new_v.flatten()) /
                             np.var(np.array(winner_batch)))

        train_str = ("kl:{:.5f}, "
                     "lr_multiplier:{:.3f}, "
                     "loss:{}, "
                     "entropy:{}, "
                     "explained_var_old:{:.3f}, "
                     "explained_var_new:{:.3f}").format(
                         kl, self.lr_multiplier, loss, entropy,
                         explained_var_old, explained_var_new)

        print(train_str)

        self.writer.add_scalar('lr multiplier', self.lr_multiplier,
                               iteration + 1)
        self.writer.add_scalar('kl_', kl, iteration + 1)
        self.writer.add_scalar('explained var old', explained_var_old,
                               iteration + 1)
        self.writer.add_scalar('explained var new', explained_var_new,
                               iteration + 1)
        self.writer.add_scalar('training loss', loss, iteration + 1)
        self.writer.add_scalar('training entropy', entropy, iteration + 1)

        # self.writer.add_scalars("training tracking", {'lr multiplier': self.lr_multiplier,
        #                                               'kl': kl,
        #                                               'explained var old': explained_var_old,
        #                                               'explained var new': explained_var_new,
        #                                               'training loss':loss,
        #                                               'training entropy':entropy},
        #                                                i+1)

        return loss, entropy

    def policy_evaluate(self, iteration, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)

        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2 + 1,
                                          is_shown=0,
                                          savefig=False)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts: {}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))

        self.writer.add_text(
            tag='evaluation results',
            text_string=
            f"num_playouts: {self.pure_mcts_playout_num}, win: {win_cnt[1]}, lose: {win_cnt[2]}, tie:{win_cnt[-1]}",
            global_step=iteration + 1)

        return win_ratio

    def run(self):
        """run the training pipeline"""

        if not os.path.exists(MODEL_DIR):
            os.makedirs(MODEL_DIR)

        try:
            improvement_counter_local = 0

            for i in range(self.game_batch_num):

                self.writer.add_scalar('MCTS playouts num',
                                       self.pure_mcts_playout_num, i + 1)

                self.collect_selfplay_data(self.play_batch_size)

                if self.full_boards_selfplay:

                    if self.board_width == 6:
                        print(
                            "batch i:{}, episode_len:{}, episode len full 1: {}, episode len full 2: {}"
                            .format(i + 1, self.episode_len,
                                    self.episode_len_full_1,
                                    self.episode_len_full_2))
                        self.writer.add_scalar('episode len full 1',
                                               self.episode_len_full_1, i + 1)
                        self.writer.add_scalar('episode len full 2',
                                               self.episode_len_full_2, i + 1)
                        self.writer.add_scalar('episode len', self.episode_len,
                                               i + 1)

                    else:
                        print(
                            "batch i:{}, episode_len:{}, episode len full 3: {}, episode len full 4: {}, episode len full 5: {}"
                            .format(i + 1, self.episode_len,
                                    self.episode_len_full_3,
                                    self.episode_len_full_4,
                                    self.episode_len_full_5))

                        self.writer.add_scalar('episode len full 3',
                                               self.episode_len_full_3, i + 1)
                        self.writer.add_scalar('episode len full 4',
                                               self.episode_len_full_4, i + 1)
                        self.writer.add_scalar('episode len full 5',
                                               self.episode_len_full_5, i + 1)

                        self.writer.add_scalar('episode len', self.episode_len,
                                               i + 1)

                else:
                    print("batch i:{}, episode_len:{}".format(
                        i + 1, self.episode_len))
                    self.writer.add_scalar('episode len', self.episode_len,
                                           i + 1)

                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update(iteration=i)

                # check the performance of the current model,
                # and save the model params
                if (i + 1) % self.check_freq == 0:
                    print("current self-play batch: {}".format(i + 1))
                    win_ratio = self.policy_evaluate(iteration=i)

                    self.policy_value_net.save_model(
                        f'{MODEL_DIR}/current_policy_{i + 1}.model')

                    if win_ratio > self.best_win_ratio:

                        self.writer.add_text('best model savings',
                                             'better model found', i + 1)

                        print("New best policy!!!!!!!!")

                        improvement_counter_local = 0
                        self.best_win_ratio = win_ratio

                        # update the best_policy
                        # self.policy_value_net.save_model(f'{MODEL_DIR}/best_policy.model')

                        # if (self.best_win_ratio == 1.0 and
                        #         self.pure_mcts_playout_num < 5000):

                        if self.best_win_ratio == 1.0:
                            self.pure_mcts_playout_num += self.pure_mcts_playout_num_step
                            self.best_win_ratio = 0.0

                    else:
                        improvement_counter_local += 1
                        if improvement_counter_local == self.improvement_counter:
                            print(
                                f"No better policy was found in the last {self.improvement_counter} "
                                f"checks. Ending training. ")

                            self.writer.add_text(
                                'best model savings',
                                f"No better policy was found "
                                f"in the last {self.improvement_counter} "
                                f"checks. Ending training. ", i + 1)
                            break

        except KeyboardInterrupt:
            print('\n\rquit')
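
As a side note on policy_update above: the KL divergence between the network's move distribution before and after the gradient steps drives both the early stop and the learning-rate multiplier. Below is a minimal, self-contained sketch of that rule; the helper name and the toy probability arrays are illustrative only, not part of the pipeline:

import numpy as np

def adjust_lr_multiplier(old_probs, new_probs, lr_multiplier, kl_targ=0.02):
    # mean KL divergence between the old and new move distributions
    kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10)
                                     - np.log(new_probs + 1e-10)), axis=1))
    if kl > kl_targ * 2 and lr_multiplier > 0.1:
        lr_multiplier /= 1.5   # the update moved the policy too far: slow down
    elif kl < kl_targ / 2 and lr_multiplier < 10:
        lr_multiplier *= 1.5   # the update barely moved the policy: speed up
    return kl, lr_multiplier

# toy check: two nearly identical policies give a tiny KL, so the multiplier grows
old = np.array([[0.5, 0.3, 0.2]])
new = np.array([[0.49, 0.31, 0.2]])
print(adjust_lr_multiplier(old, new, lr_multiplier=1.0))
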
Example #8
0
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.data_buffer = deque(maxlen=1000)
        self.batch_size = 10
        self.temp = 1.0  # the temperature param
        self.n_playout = 40  # num of simulations for each move
        self.c_puct = 5
        self.epochs = 50

        self.pure_mcts_playout_num = 2
        self.best_win_ratio = 0.0

        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        print("Phase 1: Collecting Data")
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player,
                                                          temp=self.temp)
            play_data = list(play_data)[:]
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """update the policy-value net"""
        print("Phase 2: Updating the Network")
        mini_batch = random.sample(self.data_buffer, self.batch_size)

        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]

        for i in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch, self.learn_rate)
            #print("Loss is {}, Entropy is {}".format(loss,entropy))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """

        print("Phase 3: Evaluatiing the Network")
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio

    def run(self):
        """run the training pipeline"""
        try:
            for i in range(5):
                print("Then {} / {} Training".format(i, 5))
                self.collect_selfplay_data(5)
                if len(self.data_buffer) > self.batch_size:
                    loss, entropy = self.policy_update()
                    print("Final Loss : {} , Final Entropy: {}".format(
                        loss, entropy))

                win_ratio = self.policy_evaluate(7)
                print("Win-Ratio: ", win_ratio)
                #self.policy_value_net.save_model('./current_policy.model')
                if win_ratio > self.best_win_ratio:
                    print("New best policy!!!!!!!!")
                    self.best_win_ratio = win_ratio
            self.policy_value_net.save_model('./best_policy.model')
        except KeyboardInterrupt:
            print('\n\rquit')
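
For reference, the win-ratio bookkeeping that every policy_evaluate variant here shares: winner 1 is the trained player, 2 is the pure-MCTS opponent, and -1 is a tie, with a tie worth half a win. A tiny sketch with made-up game outcomes:

from collections import defaultdict

winners = [1, 1, 2, -1, 1, 2, 1]   # hypothetical results of 7 evaluation games
win_cnt = defaultdict(int)
for w in winners:
    win_cnt[w] += 1
win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / len(winners)
print(win_ratio)                   # 4 wins + half a tie over 7 games ~= 0.643
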
Example #9
0
class TrainPipeline():
    def __init__(self, init_model=None):
        # params of the board and the game
        #width of chessboard
        self.board_width = 8  #6 #10
        #height of chessboard
        self.board_height = 8 #6 #10
        #conditions for victory
        self.n_in_row = 5     #4 #5
        self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params 
        self.learn_rate = 5e-3   #learning rate
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0 # the temperature param
        self.n_playout = 400 # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000  # maximum number of samples kept in the buffer
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)  # experience buffer (bounded deque)
        self.play_batch_size = 1  # number of self-play games collected per iteration
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.025  # KL target
        # check frequency: evaluate the current AI model every 50 rounds of self-play
        # by playing 10 games between the latest model and the pure-MCTS AI (based on random rollouts)
        self.check_freq = 50  #50
        self.game_batch_num = 200 #the number of training batches
        self.best_win_ratio = 0.0 #historical best winning rate
        # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000  
        if init_model:
            # start training from an initial policy-value net
            # pickle.load(file) deserializes the object: the bytes in the file are parsed back into a PyTorch object
            policy_param = pickle.load(open(init_model, 'rb'))  # open with 'rb' to read in binary mode
            self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width, self.board_height) 
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)

    def get_equi_data(self, play_data):
        """
        augment the data set by rotation and flipping
        play_data: [(state, mcts_prob, winner_z), ..., ...]"""
        extend_data = []
        for state, mcts_porb, winner in play_data:
            for i in [1,2,3,4]:
                # rotate counterclockwise 
                equi_state = np.array([np.rot90(s,i) for s in state])
                equi_mcts_prob = np.rot90(np.flipud(mcts_porb.reshape(self.board_height, self.board_width)), i)
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append((equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data
                
    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        for i in range(n_games):
            winner, play_data = self.game.start_self_play(self.mcts_player, temp=self.temp)
            play_data_zip2list = list(play_data)  # add by haward
            self.episode_len = len(play_data_zip2list)
            # augment the data
            play_data = self.get_equi_data(play_data_zip2list)
            self.data_buffer.extend(play_data)
                        
    def policy_update(self):
        """update the policy-value net"""
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]            
        old_probs, old_v = self.policy_value_net.policy_value(state_batch) 
        for i in range(self.epochs): 
            loss, entropy = self.policy_value_net.train_step(state_batch, mcts_probs_batch, winner_batch, self.learn_rate*self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(np.sum(old_probs * (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)), axis=1))  
            if kl > self.kl_targ * 4:   # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5
            
        explained_var_old =  1 - np.var(np.array(winner_batch) - old_v.flatten())/np.var(np.array(winner_batch))
        explained_var_new = 1 - np.var(np.array(winner_batch) - new_v.flatten())/np.var(np.array(winner_batch))        
        print("kl:{:.5f},lr_multiplier:{:.3f},loss:{},entropy:{},explained_var_old:{:.3f},explained_var_new:{:.3f}".format(
                kl, self.lr_multiplier, loss, entropy, explained_var_old, explained_var_new))
        return loss, entropy
        
    def policy_evaluate(self, n_games=10, batch=0):
        """
        Evaluate the trained policy by playing games against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i%2, is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1])/n_games
        print("batch_i:{}, num_playouts:{}, win: {}, lose: {}, tie:{}".format(batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        logging.debug("batch_i {} num_playouts {} win {} lose {} tie {}".format(batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio
    
    def run(self):
        """run the training pipeline"""
        try:
            for i in range(self.game_batch_num):                
                self.collect_selfplay_data(self.play_batch_size)  # collect one batch of self-play data
                print("batch_i:{}, episode_len:{}".format(i+1, self.episode_len))
                #logging.debug("batch_i:{}, episode_len:{}".format(i+1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    # once the buffer holds more samples than one mini-batch, randomly sample a mini-batch from it to train and update the network
                    loss, entropy = self.policy_update()
                    logging.debug("batch_i {} loss {}".format(i+1, loss))
                #check the performance of the current model and save the model params
                if (i+1) % self.check_freq == 0:  # evaluate the current AI model every 50 rounds of self-play
                    print("current self-play batch: {}".format(i+1))
                    win_ratio = self.policy_evaluate(batch= i+1)
                    net_params = self.policy_value_net.get_policy_param() # get model params
                    # save the model: serialize the network params and write the resulting byte stream to a file
                    pickle.dump(net_params, open('current_policy_8_8_5_new.model', 'wb'), pickle.HIGHEST_PROTOCOL) # save model param to file
                    if win_ratio > self.best_win_ratio:  # the win ratio improved, so keep these params as the new best model
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        pickle.dump(net_params, open('best_policy_8_8_5_new.model', 'wb'), pickle.HIGHEST_PROTOCOL) # update the best_policy
                        if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
                            # once the win ratio reaches 100%, strengthen the pure-MCTS opponent by 1000 playouts and reset the best win ratio to 0
                            self.pure_mcts_playout_num += 1000
                            self.best_win_ratio = 0.0
        except KeyboardInterrupt:
            print('\n\rquit')
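
Finally, a compact sketch of the symmetry augmentation that get_equi_data implements in the examples above: every (state, mcts_prob) pair is expanded into the 8 symmetries of the square board (4 rotations, each also flipped horizontally). The probability vector is reshaped to the board, flipped upside down into board coordinates, transformed together with the state planes, and flattened back. The helper name and the 3x3 toy arrays are purely illustrative:

import numpy as np

def augment(state, mcts_prob, width, height):
    extend = []
    # move probabilities in board coordinates (flipud matches the move indexing)
    prob_board = np.flipud(mcts_prob.reshape(height, width))
    for i in [1, 2, 3, 4]:
        equi_state = np.array([np.rot90(s, i) for s in state])   # rotate every plane
        equi_prob = np.rot90(prob_board, i)
        extend.append((equi_state, np.flipud(equi_prob).flatten()))
        # the same position flipped horizontally
        flipped_state = np.array([np.fliplr(s) for s in equi_state])
        extend.append((flipped_state, np.flipud(np.fliplr(equi_prob)).flatten()))
    return extend

state = np.zeros((2, 3, 3)); state[0, 0, 0] = 1   # a single stone for one player
probs = np.arange(9, dtype=float) / 36.0          # dummy move probabilities, sum to 1
print(len(augment(state, probs, 3, 3)))           # 8 symmetric training samples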