Exemple #1
0
 def __init__(self, init_model=None):
     """Build the game, the training hyper-parameters, the net and the self-play player.

     Args:
         init_model: model file forwarded to ``PolicyValueNet`` as
             ``model_file``. When falsy, ``PolicyValueNet()`` is created
             without a model file.
     """
     # --- environment ---
     self.board = CSB_Game()
     self.game = Game(self.board)

     # --- optimisation hyper-parameters ---
     self.learn_rate = 0.001
     self.lr_multiplier = 1.0  # learning-rate multiplier, adapted from KL
     self.kl_targ = 0.02
     self.epochs = 20  # train steps performed per update
     self.batch_size = 50  # mini-batch size used for training

     # --- self-play / search settings ---
     self.temp = 1.0  # temperature parameter
     self.n_playout = 50  # simulations per move
     self.c_puct = 5
     self.play_batch_size = 1

     # --- replay buffer (bounded, oldest samples are dropped first) ---
     self.buffer_size = 10000
     self.data_buffer = deque(maxlen=self.buffer_size)

     # --- schedule / evaluation ---
     self.check_freq = 100000000000000000000000
     self.game_batch_num = 200000000
     self.best_win_ratio = 0.0
     # playouts given to the pure-MCTS opponent that evaluates the trained policy
     self.pure_mcts_playout_num = 1000

     # Resume from ``init_model`` when supplied, otherwise start from a new net.
     net_kwargs = {'model_file': init_model} if init_model else {}
     self.policy_value_net = PolicyValueNet(**net_kwargs)

     # self-play player driven by the network's policy/value function
     self.mcts_player = MCTSPlayer(
         self.policy_value_net.policy_value_fn,
         c_puct=self.c_puct,
         n_playout=self.n_playout,
         is_selfplay=1,
     )
Exemple #2
0
 def __init__(self, flag_is_shown=False, flag_is_train=True):
     """Create the game, the network and the MCTS player.

     Args:
         flag_is_shown: stored on the instance and passed to ``Game``.
         flag_is_train: stored on the instance and passed to ``Game``; when
             false, saved parameters are loaded from ``./paras/policy.model``.
     """
     self.flag_is_shown, self.flag_is_train = flag_is_shown, flag_is_train
     self.game = Game(flag_is_shown, flag_is_train)

     # network input shape: 4 planes over the game's board dimensions
     input_shape = (4, self.game.board_width, self.game.board_height)
     self.NN = PolicyValueNet(input_shape)
     if not flag_is_train:
         # outside training, run with the previously saved parameters
         self.NN.load_model("./paras/policy.model")

     self.mcts_player = MCTSPlayer(self.NN.propagation)
Exemple #3
0
    def __init__(self, init_model=None):
        """Configure a 15x15 five-in-a-row training pipeline.

        Args:
            init_model: model file forwarded to ``PolicyValueNet`` as
                ``model_file``. When truthy, ``best_win_ratio`` starts at 0.6
                instead of 0.0; when falsy a new network is created.
        """
        # --- workers: every CPU but one ---
        # NOTE(review): this evaluates to 0 on a single-CPU host; confirm
        # that whatever consumes n_workers copes with that.
        self.n_workers = multiprocessing.cpu_count() - 1
        self.worker_pool = None

        # --- bookkeeping counters ---
        self.episode_len = -1
        self.batch_i = 0

        # --- board and game ---
        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # --- optimisation hyper-parameters ---
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # learning-rate multiplier, adapted from KL
        self.kl_targ = 0.02
        self.epochs = 5  # train steps performed per update
        self.batch_size = 1024  # mini-batch size used for training

        # --- self-play / search settings ---
        self.temp = 1.0  # temperature parameter
        self.n_playout = 400  # simulations per move
        self.c_puct = 5
        self.play_batch_size = 1
        self.pre_data_size = 5

        # --- replay buffer ---
        self.buffer_size = 36000
        self.data_buffer = deque(maxlen=self.buffer_size)

        # --- schedule / evaluation ---
        # 30 when using predefined data (the original kept 50 as a
        # commented-out alternative)
        self.check_freq = 30
        self.game_batch_num = 300
        # playouts given to the pure-MCTS opponent that evaluates the trained policy
        self.pure_mcts_playout_num = 1000
        # a resumed run starts with a higher win-ratio bar than a fresh one
        self.best_win_ratio = 0.6 if init_model else 0.0

        # Resume from ``init_model`` when supplied, otherwise start from a new net.
        net_kwargs = {'model_file': init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **net_kwargs)

        # self-play player driven by the network's policy/value function
        self.mcts_player = MCTSPlayer(
            self.policy_value_net.policy_value_fn,
            c_puct=self.c_puct,
            n_playout=self.n_playout,
            is_selfplay=1,
        )
Exemple #4
0
def run():
    """Play games between a ``Client`` and the network-backed MCTS player in a loop.

    Publishes ``winner``, ``game`` and ``BlockingThread`` as module globals.
    After every game the loop sleeps until ``BlockingThread`` becomes falsy
    (it is never cleared inside this function). Ctrl-C ends the loop.
    """
    global winner, game, BlockingThread

    n_in_row = 5
    board_w, board_h = 9, 9
    weights_path = './current_model_9_9_5.h5'
    try:
        game = Game(Board(width=board_w, height=board_h, n_in_row=n_in_row))

        # AI opponent backed by the stored policy-value network
        net = PolicyValueNet(board_w, board_h, model_file=weights_path)
        ai_player = MCTSPlayer(net.policy_value_fn, c_puct=5, n_playout=400)

        while True:
            BlockingThread = True  # raise the gate before the game starts
            print("new game starts")
            # start_player=0 lets the client side move first
            winner = game.start_play(Client(),
                                     ai_player,
                                     start_player=0,
                                     is_shown=1,
                                     send_step=send_step)
            has_winner(winner)
            eventlet.sleep(1)
            print("game end")
            # park here until the gate is lowered from elsewhere
            while BlockingThread:
                eventlet.sleep(2)
    except KeyboardInterrupt:
        print('\n\rquit')
Exemple #5
0
    def __init__(self, init_model = None, last_iteration = None):
        """Configure a 9x9 five-in-a-row training pipeline.

        Args:
            init_model: model file forwarded to ``PolicyValueNet`` as
                ``model_file``; when falsy a new network is created.
            last_iteration: stored as ``self.last_iteration`` only when
                ``init_model`` is truthy; otherwise ``self.last_iteration``
                is 0.
        """
        # --- board and game ---
        self.board_width = 9
        self.board_height = 9
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # --- optimisation hyper-parameters ---
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # learning-rate multiplier, adapted from KL
        self.kl_targ = 0.02
        self.epochs = 5  # train steps performed per update
        self.batch_size = 512  # mini-batch size used for training

        # --- self-play / search settings ---
        self.temp = 1.0  # temperature parameter
        self.n_playout = 400  # simulations per move
        self.c_puct = 5
        self.play_batch_size = 1

        # --- replay buffer ---
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)

        # --- schedule / evaluation ---
        self.check_freq = 200
        self.game_batch_num = 1500
        self.best_win_ratio = 0.95
        # playouts given to the pure-MCTS opponent that evaluates the trained policy
        self.pure_mcts_playout_num = 3500

        # Resume from ``init_model`` when supplied, otherwise start from a new net.
        net_kwargs = {'model_file': init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **net_kwargs)
        # the iteration counter is only carried over together with a model
        self.last_iteration = last_iteration if init_model else 0

        # self-play player driven by the network's policy/value function
        self.mcts_player = MCTSPlayer(
            self.policy_value_net.policy_value_fn,
            c_puct=self.c_puct,
            n_playout=self.n_playout,
            is_selfplay=1,
        )
Exemple #6
0
class TrainPipeline:
    """Drive self-play training, or plain play, of a network-backed MCTS player."""

    save_ParaFreq = 200  # parameters are saved every this many episodes
    MAX_EPISODES = 2000  # number of loop iterations performed by train()

    def __init__(self, flag_is_shown=False, flag_is_train=True):
        """Create the game, the network and the MCTS player.

        Args:
            flag_is_shown: stored on the instance and passed to ``Game``.
            flag_is_train: stored on the instance and passed to ``Game``;
                when false, saved parameters are loaded from
                ``./paras/policy.model``.
        """
        self.flag_is_shown, self.flag_is_train = flag_is_shown, flag_is_train
        self.game = Game(flag_is_shown, flag_is_train)

        # network input shape: 4 planes over the game's board dimensions
        input_shape = (4, self.game.board_width, self.game.board_height)
        self.NN = PolicyValueNet(input_shape)
        if not flag_is_train:
            self.NN.load_model("./paras/policy.model")

        self.mcts_player = MCTSPlayer(self.NN.propagation)

    def train(self):
        """Run ``MAX_EPISODES`` episodes of training or, when not training, of play."""
        for episode in range(self.MAX_EPISODES):
            if not self.flag_is_train:
                # play-only mode: just run a game with the MCTS player
                self.game.start_play(self.mcts_player)
                continue

            # one self-play game; its samples go into the network's buffer
            _, play_data = self.game.start_self_play(self.mcts_player)
            self.NN.memory(play_data)

            if len(self.NN.data_buffer) <= self.NN.batch_size:
                # not enough samples yet: report the fill level instead
                fill_percent = len(self.NN.data_buffer) / self.NN.batch_size * 100
                print("Collecting data: %d%%, " % fill_percent, end="")
            else:
                self.NN.policy_update()

            # periodic checkpoint of the model parameters
            if (episode + 1) % self.save_ParaFreq == 0:
                self.NN.save_model('./paras/policy.model')
            print("episode = %d" % episode)
Exemple #7
0
def single_game_play(num, initmode):
    """Play one self-play game inside a worker and return its result.

    Board size, ``n_in_row``, ``c_puct``, ``n_playout`` and ``temp`` are read
    from module-level names.

    Args:
        num: worker number, only used in the start-up message.
        initmode: model file forwarded to ``PolicyValueNet`` as
            ``model_file``; when falsy a new network is created.

    Returns:
        The ``(winner, play_data)`` pair produced by ``Game.start_self_play``.
    """
    print('Starting worker {} '.format(num))

    game = Game(Board(width=board_width,
                      height=board_height,
                      n_in_row=n_in_row))

    net_kwargs = {'model_file': initmode} if initmode else {}
    policy_value_net = PolicyValueNet(board_width, board_height, **net_kwargs)

    self_play_player = MCTSPlayer(policy_value_net.policy_value_fn,
                                  c_puct=c_puct,
                                  n_playout=n_playout,
                                  is_selfplay=1)

    winner, play_data = game.start_self_play(self_play_player, temp=temp)
    # Per the original author's note, play_data comes from zip() and is an
    # iterator, not a list: taking its length here (e.g. for logging) would
    # consume it, so it is returned untouched.
    return winner, play_data
Exemple #8
0
 def __init__(self, init_model='./current_policy.hdf5'):
     """Configure an 8x8 five-in-a-row training pipeline.

     Args:
         init_model: model file forwarded to ``PolicyValueNet`` as
             ``model_file`` (defaults to ``./current_policy.hdf5``); when
             falsy a new network is created.
     """
     # --- board parameters ---
     self.board_width = 8
     self.board_height = 8
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)

     # --- training parameters ---
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0
     self.kl_targ = 0.02
     self.epochs = 5  # training steps per update
     self.batch_size = 512

     # --- self-play / search settings ---
     self.temp = 1.0  # temperature parameter
     self.n_playout = 400  # simulations per move
     self.c_puct = 5
     self.play_batch_size = 1

     # --- replay buffer ---
     self.buffer_size = 10000
     self.data_buffer = deque(maxlen=self.buffer_size)

     # --- schedule / evaluation ---
     self.check_freq = 50
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # playouts of the MCTS used to evaluate the policy
     self.pure_mcts_playout_num = 2000

     # Continue from an existing network when a model file is given,
     # otherwise train a new one.
     net_kwargs = {'model_file': init_model} if init_model else {}
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height,
                                            **net_kwargs)

     self.mcts_player = MCTSPlayer(
         self.policy_value_net.policy_value_fn,
         c_puct=self.c_puct,
         n_playout=self.n_playout,
         is_selfplay=1,
     )
    def get_action(self, board):
        """Pick a move on ``board`` with an MCTS player backed by the stored network.

        The network is loaded from ``./best_model_9_9_5.h5`` on every call.
        Any exception while loading or searching is printed and treated as an
        invalid move; an invalid move (``-1`` or not in ``board.availables``)
        triggers another attempt.

        Args:
            board: board object exposing ``availables``.

        Returns:
            A move contained in ``board.availables``.
        """
        print("AI's turn")
        try:
            model_file = './best_model_9_9_5.h5'
            best_policy = PolicyValueNet(9, 9, model_file=model_file)
            mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                     c_puct=5,
                                     n_playout=400)
            # Fix: ``move`` used to be left unassigned on this path, so the
            # validity check below raised UnboundLocalError.
            # NOTE(review): assumes MCTSPlayer.get_action(board) returns a
            # single move; confirm against the player class.
            move = mcts_player.get_action(board)
        except Exception as e:
            print(e)
            move = -1
        if move == -1 or move not in board.availables:
            print(f"invalid move: {move}")
            # NOTE(review): the retry is unbounded; a persistent failure
            # (e.g. a missing model file) recurses until RecursionError.
            move = self.get_action(board)
        return move
Exemple #10
0
    def run(self):
        """Play one game between ``self.client`` and the robot-side AI player.

        Loads the network from ``./best_model_9_9_5.h5``, wraps it in a
        ``BraccioPlayer``, initialises the ``braccio_player`` module with
        ``self.testMode`` and starts a 9x9 five-in-a-row game in which
        ``self.who_first`` decides the starting player. The winner is printed
        and, when ``self.parent`` is set, reported via
        ``self.parent.end_game``.

        OpenCV windows are closed on every exit path, including when the game
        aborts with an exception (previously they were left open in that
        case). Ctrl-C is swallowed after printing ``quit``.
        """
        n = 5
        width, height = 9, 9
        model_file = './best_model_9_9_5.h5'
        print('Game start.')
        try:
            board = Board(width=width, height=height, n_in_row=n)
            game = Game(board)

            # AI side: trained policy-value network driving the BraccioPlayer.
            best_policy = PolicyValueNet(width, height, model_file=model_file)
            mcts_player = BraccioPlayer(best_policy.policy_value_fn,
                                        c_puct=5,
                                        n_playout=400)
            # Kept as a function-scope import, as in the original code.
            from braccio_player import init
            init(self.testMode)

            # Human side is whatever client object this instance holds.
            human = self.client

            # start_player=0 means the human moves first
            winner = game.start_play(human,
                                     mcts_player,
                                     start_player=self.who_first,
                                     is_shown=1)

            print(f'[Play with Robot] winner: {winner}')
            if self.parent is not None:
                self.parent.end_game(winner)
        except KeyboardInterrupt:
            print('\n\rquit')
        finally:
            # always release the OpenCV windows, even on unexpected errors
            cv2.destroyAllWindows()
Exemple #11
0
def run():
    """Let two MCTS players sharing one trained network play a 9x9 game.

    The network is loaded from the ``current_policy`` checkpoint of
    iteration 1000 under ``./model``. Ctrl-C ends the game quietly.
    """
    n = 5
    width, height = 9, 9
    iteration = 1000

    # checkpoint name is built from height, width, n and the iteration number
    model_file = './model/current_policy_{}_{}_{}_iteration{}.model'.format(height,width,n,iteration)
    try:
        game = Game(Board(width=width, height=height, n_in_row=n))

        # one network, shared by both AI players
        shared_net = PolicyValueNet(width, height, model_file=model_file)
        second_ai = MCTSPlayer(shared_net.policy_value_fn,
                               c_puct=5,
                               n_playout=400)

        # constructed as in the original, but not handed to start_play below
        human = Human()

        first_ai = MCTSPlayer(shared_net.policy_value_fn,
                              c_puct=5,
                              n_playout=400)
        game.start_play(first_ai, second_ai, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Exemple #12
0
def run():
    """Play a human-vs-AI game on a 15x15 board with the model named on the command line.

    Exactly one command-line argument is expected: the model file handed to
    ``PolicyValueNet``. With any other argument count, ``sys.argv`` and a
    usage message are printed and the process exits.

    Ctrl-C during the game is swallowed after printing ``quit``.
    """
    if len(sys.argv) != 2:
        print(sys.argv)
        print("Need to provide one argument, the model which to play with")
        # NOTE(review): exit status 0 is kept for compatibility, although
        # this is an error path.
        sys.exit(0)

    n = 5
    width, height = 15, 15
    model_file = sys.argv[1]
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # AI side: trained policy-value network driving an MCTS player.
        # The former eager ``pickle.load(open(model_file, 'rb'))`` was
        # removed: its result was never used, it left file handles open,
        # and it aborted the script for model files that are not pickles.
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # human player
        human = Human()

        # start_player=0 would let the human move first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Exemple #13
0
def run():
    """Play a human-vs-AI game on a 9x9 five-in-a-row board.

    The AI is an ``MCTSPlayer`` backed by the network stored in
    ``./n400-o/current_model_9_9_5_o_50.h5``. Ctrl-C ends the game quietly.
    """
    n_in_row = 5
    board_w, board_h = 9, 9
    weights_path = './n400-o/current_model_9_9_5_o_50.h5'
    try:
        game = Game(Board(width=board_w, height=board_h, n_in_row=n_in_row))

        # AI side: trained policy-value network driving an MCTS player
        net = PolicyValueNet(board_w, board_h, model_file=weights_path)
        ai_player = MCTSPlayer(net.policy_value_fn, c_puct=5, n_playout=400)

        # human player
        human_player = Human()

        # start_player=0 would let the human move first
        game.start_play(human_player, ai_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Exemple #14
0
def run():
    """Run the pygame front-end with two MCTS players sharing one network.

    Loads the ``current_policy`` checkpoint of iteration 1000 under
    ``./model``, builds the ``Game`` titled "AlphaZero Gomoku" and then loops:
    advance the game, refresh the display and dispatch pygame events. Closing
    the window quits pygame and exits; mouse clicks are forwarded to the
    game. Ctrl-C ends the loop quietly.
    """
    n = 5
    width, height = 9, 9
    iteration = 1000

    # checkpoint name is built from height, width, n and the iteration number
    model_file = './model/current_policy_{}_{}_{}_iteration{}.model'.format(
        height, width, n, iteration)
    try:
        board = Board(width=width, height=height, n_in_row=n)

        # one network, shared by both AI players
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        ai_players = [
            MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)
            for _ in range(2)
        ]
        # constructed as in the original, but not handed to the game
        human = Human()

        game = Game("AlphaZero Gomoku", board, *ai_players)
        while True:
            game.play()
            pygame.display.update()

            for event in pygame.event.get():
                if event.type == pygame.QUIT:
                    pygame.quit()
                    exit()
                if event.type != pygame.MOUSEBUTTONDOWN:
                    continue
                # forward the click position to the board and to the buttons
                click_x, click_y = pygame.mouse.get_pos()
                game.mouseClick(click_x, click_y)
                game.check_buttons(click_x, click_y)

    except KeyboardInterrupt:
        print('\n\rquit')
Exemple #15
0
 def __init__(self):
     """
     Initial settings for training.

     Note: kl is used to compute lr (the learning rate).
     """
     # run() -----------------------------------------------------------------------------------
     self.game_batch_num = -1  # repetitions of one training run; a negative value means no limit
     self.play_batch_size = 1  # number of self-play runs
     # amount of data per training step; the policy is updated once
     # data_buffer holds more than this many entries
     self.batch_size = 1024
     self.check_freq = 50  # play against MCTS every check_freq trainings
     self.save_freq = 50  # save every save_freq trainings

     # collect_selfplay_data() -----------------------------------------------------------------
     self.buffer_size = 10000
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.kl_targ = 0.02

     # policy_update() -------------------------------------------------------------------------
     self.epochs = 5  # number of epochs per update

     # board -----------------------------------------------------------------------------------
     self.board_width = 9  # width
     self.board_height = 9  # height
     self.n_in_row = 5  # stones in a row needed to win
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)

     # keras -----------------------------------------------------------------------------------
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adapts the learning rate based on KL
     self.temp = 1.0  # temperature; too small makes training insufficiently broad

     file_folder = './n400-o'
     model_tag = '9_9_5_o'
     self.current_model= f'{file_folder}/current_model_{model_tag}.h5'
     self.best_model= f'{file_folder}/best_model_{model_tag}.h5'

     # resume from the current model only when its file already exists
     init_model = self.current_model
     model_file = init_model if os.path.exists(init_model) else None
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height,
                                            model_file=model_file)

     # output files kept next to the models
     self.progress = file_folder + '/progress.csv'
     self.evaluate_path = file_folder + '/evaluate.csv'
     self.history_path = file_folder + '/history.csv'
     self.history = []

     # MCTS ------------------------------------------------------------------------------------
     self.c_puct = 5  # search preference of MCTS
     # when saving, a loss below this value increases the training n_playout
     self.loss_goal = 0
     self.pure_mcts_playout_num = 1000  # simulations per MCTS move, raised as the model gets stronger
     self.pure_mcts_playout_num_upgrade = 1000  # simulations added as the model gets stronger
     self.best_win_ratio = 0.0

     # simulations per move for the network player: the larger, the more the
     # result relies on MCTS rather than on the network's own judgement
     self.n_playout = 400
     self.n_playout_training = 400
     self.n_playout_growth = 0
     self.n_playout_limit = 2000

     # must run last: every setting above is in place by now
     self.MCTS_levelup()
Exemple #16
0
                    logging.info("New best policy!!!!!!!!")
                    best_win_ratio = win_ratio
                    # update the best_policy
                    policy_value_net.save_model('./best_policy.model')
                    if (best_win_ratio == 1.0 and
                            pure_mcts_playout_num < 5000):
                        pure_mcts_playout_num += 1000
                        best_win_ratio = 0.0
    except KeyboardInterrupt:
        print('\n\rquit')


if __name__ == '__main__':
    # Script entry point: resume from ./current_policy.model when it exists,
    # otherwise start with a new network, then launch the run.
    if not os.path.exists('./current_policy.model'):
        # no checkpoint on disk: fresh network
        initmode = None
        policy_value_net = PolicyValueNet(board_width, board_height)
        win_ratio = 0.1
    else:
        # checkpoint found: resume from it
        initmode = './current_policy.model'
        policy_value_net = PolicyValueNet(board_width,
                                          board_height,
                                          model_file=initmode)
        logging.info('use existing model file')
        win_ratio = 0.6

    do_run()


# todo

# load trained model to continue
# save record of auto-play(at least when vs pure mcts ) ->sgf format
# simple gui to load record and show -> parse sgf
    def __init__(self):
        """
        Initial settings for training.

        Note: kl is used to compute lr (the learning rate).
        """
        # run() -----------------------------------------------------------------------------------
        self.game_batch_num = -1  # repetitions of one training run; a negative value means no limit
        self.play_batch_size = 1  # number of self-play runs
        # amount of data per training step; the policy is updated once
        # data_buffer holds more than this many entries
        self.batch_size = 4096
        self.check_freq = 500  # play against MCTS every check_freq trainings
        self.save_freq = 50  # save every save_freq trainings

        # collect_selfplay_data() -----------------------------------------------------------------
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.kl_targ = 0.02

        # policy_update() -------------------------------------------------------------------------
        self.epochs = 20  # training attempts made before each lr update

        # board -----------------------------------------------------------------------------------
        self.board_width = 13  # width
        self.board_height = 13  # height
        self.n_in_row = 5  # stones in a row needed to win
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # keras -----------------------------------------------------------------------------------
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adapts the learning rate based on KL
        self.temp = 2.0  # temperature; too small makes training insufficiently broad

        file_folder = './n400'
        model_tag = '13_13_5'
        self.current_model = f'{file_folder}/current_model_{model_tag}.h5'
        self.best_model = f'{file_folder}/best_model_{model_tag}.h5'

        # resume from the current model only when its file already exists
        init_model = self.current_model
        model_file = init_model if os.path.exists(init_model) else None
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=model_file)

        # output files kept next to the models
        self.progress = file_folder + '/progress.csv'
        self.evaluate_path = file_folder + '/evaluate.csv'
        self.history_path = file_folder + '/history.csv'
        self.history = []

        # MCTS ------------------------------------------------------------------------------------
        self.c_puct = 5  # search preference of MCTS
        # simulations per move for the network player: the larger, the more
        # the result relies on MCTS rather than on the network's own judgement
        self.n_playout = 400

        # comparison against MCTS only starts once the loss is below this
        # value, to save training time
        self.loss_goal = 4.0
        self.pure_mcts_playout_num = 1000  # simulations per MCTS move, raised as the model gets stronger
        self.pure_mcts_playout_num_upgrade = 500  # simulations added as the model gets stronger
        self.best_win_ratio = 0.0
        self.mcts_player = MCTSPlayer(
            self.policy_value_net.policy_value_fn,
            c_puct=self.c_puct,
            n_playout=self.n_playout,
            is_selfplay=1,
        )

        # once the loss has dropped to one of these levels, the poor data
        # generated by older models is discarded and training restarts on
        # fresh data
        self.flush_gate = [5.5, 5.0, 4.4, 4.0, 3.6, 3.2, 2.8, 2.6, 2.4, 2.2]
        self.flushTimes = 0
class TrainPipeline:
    """Self-play training pipeline for a policy-value net on a 13x13 board.

    The pipeline repeatedly plays self-play games with an MCTS player guided
    by the network, trains the network on the (augmented) game records,
    periodically evaluates it against a pure MCTS opponent, and persists its
    progress (model weights plus three CSV files) so training can be resumed.
    """

    def __init__(self):
        """Set up hyper-parameters, the game, the network and the player.

        Note: the KL divergence measured in ``policy_update`` is what drives
        the adaptive learning-rate multiplier.
        """
        # run() -----------------------------------------------------------------------------------
        # Number of training batches in one run; a negative value means no
        # limit (the loop in run() stops only when i equals this value).
        self.game_batch_num = -1
        self.play_batch_size = 1  # self-play games collected per batch
        # Mini-batch size; the policy is updated once data_buffer holds more
        # samples than this.
        self.batch_size = 4096
        self.check_freq = 500  # evaluate against pure MCTS every check_freq batches
        self.save_freq = 50  # save a checkpoint every save_freq batches

        # collect_selfplay_data() -----------------------------------------------------------------
        self.buffer_size = 10000
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.kl_targ = 0.02

        # policy_update() -------------------------------------------------------------------------
        self.epochs = 20  # max train steps per update before the lr is adjusted

        # board -----------------------------------------------------------------------------------
        self.board_width = 13
        self.board_height = 13
        self.n_in_row = 5  # stones in a row needed to win
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # keras -----------------------------------------------------------------------------------
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adapted from the measured KL divergence
        self.temp = 2.0  # temperature; too small makes training less exploratory

        file_folder = './n400'
        model_tag = '13_13_5'
        self.current_model = f'{file_folder}/current_model_{model_tag}.h5'
        self.best_model = f'{file_folder}/best_model_{model_tag}.h5'
        init_model = self.current_model

        # Resume from the current model when it exists, otherwise start fresh.
        self.policy_value_net = PolicyValueNet(
            self.board_width,
            self.board_height,
            model_file=init_model if os.path.exists(init_model) else None)

        self.progress = file_folder + '/progress.csv'
        self.evaluate_path = file_folder + '/evaluate.csv'

        self.history_path = file_folder + '/history.csv'
        self.history = []  # per-update metric rows not yet flushed to history_path

        # MCTS ------------------------------------------------------------------------------------
        self.c_puct = 5  # MCTS exploration preference
        # Simulations per move for the network-guided player: larger values
        # lean more on the MCTS search, smaller ones on the network itself.
        self.n_playout = 400

        # Only evaluate against pure MCTS once the loss is below this value,
        # to save training time.
        self.loss_goal = 4.0
        # Simulations per move for the pure MCTS opponent; raised as the
        # model gets stronger.
        self.pure_mcts_playout_num = 1000
        self.pure_mcts_playout_num_upgrade = 500  # size of each such raise
        self.best_win_ratio = 0.0
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

        # Each time the loss drops below the next threshold, the data produced
        # by older (weaker) models is discarded and training continues on
        # freshly generated games.
        self.flush_gate = [5.5, 5.0, 4.4, 4.0, 3.6, 3.2, 2.8, 2.6, 2.4, 2.2]
        self.flushTimes = 0  # number of thresholds already passed
        # -----------------------------------------------------------------------------------------

    def run(self):
        """Run the training loop until ``game_batch_num`` batches or Ctrl-C.

        Resumes from ``progress.csv`` when both the progress and the history
        files exist; otherwise starts from batch 0 and (re)creates them.
        """
        try:
            reset = False  # flip to True to ignore saved progress and start over
            can_resume = (os.path.exists(self.progress)
                          and os.path.exists(self.history_path) and not reset)
            if can_resume:
                self._load_progress()
            else:
                self.i = 0
                self.save_progress()
                with open(self.history_path, 'w', newline='') as csvfile:
                    csv.writer(csvfile).writerow([
                        'i', 'kl', 'lr_multiplier', 'loss', 'entropy',
                        'explained_var_old', 'explained_var_new'
                    ])

            while self.i != self.game_batch_num:
                self.i += 1
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    self.i, self.episode_len))

                if len(self.data_buffer) <= self.batch_size:
                    # Not enough data to train yet: this batch does not count.
                    self.i -= 1
                    continue

                # Enough data accumulated: update the policy and get the loss.
                self.loss, _ = self.policy_update()

                if self.i % self.save_freq == 0:
                    self._save_checkpoint()

                # Evaluate the current model and keep the best one.
                if self.i % self.check_freq == 0 and self.loss < self.loss_goal:
                    print("current self-play batch: {}".format(self.i))
                    win_ratio = self.policy_evaluate()
                    if win_ratio > self.best_win_ratio:
                        print("New best policy!!!!!!!!")
                        self.best_win_ratio = win_ratio
                        # update the best_policy
                        self.policy_value_net.save_model(self.best_model)
                        if (self.best_win_ratio == 1.0
                                and self.pure_mcts_playout_num < 5000):
                            # Perfect score: strengthen the opponent and
                            # restart the win-ratio tracking.
                            self.pure_mcts_playout_num += self.pure_mcts_playout_num_upgrade
                            self.best_win_ratio = 0.0
                    self._save_checkpoint()

                # Drop stale data once the loss passes the next flush gate.
                if self.flushTimes < len(self.flush_gate):
                    if self.loss < self.flush_gate[self.flushTimes]:
                        print(
                            f'loss {self.loss} < flush gate {self.flush_gate[self.flushTimes]}, clear old data'
                        )
                        self.data_buffer.clear()
                        self.flushTimes += 1

        except KeyboardInterrupt:
            print('\n\rquit')

    def _load_progress(self):
        """Restore the counters written by ``save_progress``."""
        # Fallback so a progress file that only holds the header row does not
        # leave self.i undefined.
        self.i = 0
        with open(self.progress, 'r', newline='') as f:
            for row in csv.DictReader(f):
                self.i = int(row['i'])
                self.pure_mcts_playout_num = int(row['pure_mcts_playout_num'])
                self.best_win_ratio = float(row['best_win_ratio'])
                self.flushTimes = int(row['flushTimes'])
        print(
            f'continue training: i = {self.i}, pure_mcts_playout_num = {self.pure_mcts_playout_num}, best_win_ratio = {self.best_win_ratio}, flushTimes = {self.flushTimes}'
        )

    def _save_checkpoint(self):
        """Save the current model, flush pending history rows, save progress."""
        self.policy_value_net.save_model(self.current_model)
        with open(self.history_path, 'a', newline='') as f:
            csv.writer(f).writerows(self.history)
        self.history = []
        self.save_progress()

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and add their data to the buffer.

        Also sets ``self.episode_len`` to the mean number of recorded steps
        per game (printed by ``run``).
        """
        lengths = []
        for _ in range(n_games):
            _winner, play_data = self.game.start_self_play(self.mcts_player,
                                                           temp=self.temp)
            play_data = list(play_data)  # materialise (shallow copy) the records
            lengths.append(len(play_data))
            # Augment with rotations/mirrors, then append to the right of the deque.
            self.data_buffer.extend(self.get_equi_data(play_data))
        self.episode_len = np.array(lengths).mean()

    def get_equi_data(self, play_data):
        """Augment the data set by rotation and flipping.

        Args:
            play_data: iterable of ``(state, mcts_prob, winner_z)`` where
                ``state`` is a stack of board planes and ``mcts_prob`` is a
                flat array of ``board_height * board_width`` probabilities.

        Returns:
            A list with 8 entries per input record: the four 90-degree
            rotations, each followed by its horizontal mirror.
        """
        extend_data = []
        for state, mcts_prob, winner in play_data:
            # NOTE(review): the flipud before and after the transform
            # presumably converts between the move-index row order and the
            # plane row order; confirm against Board.
            prob_grid = np.flipud(
                mcts_prob.reshape(self.board_height, self.board_width))
            for turns in (1, 2, 3, 4):
                # rotate counterclockwise
                equi_state = np.array([np.rot90(s, turns) for s in state])
                equi_mcts_prob = np.rot90(prob_grid, turns)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
                # flip horizontally
                equi_state = np.array([np.fliplr(s) for s in equi_state])
                equi_mcts_prob = np.fliplr(equi_mcts_prob)
                extend_data.append(
                    (equi_state, np.flipud(equi_mcts_prob).flatten(), winner))
        return extend_data

    def policy_update(self):
        """Train the policy-value net on one mini-batch.

        Runs up to ``epochs`` train steps on the same sampled batch, stopping
        early when the KL divergence from the pre-update policy exceeds
        ``4 * kl_targ``, then adapts ``lr_multiplier`` and records the metrics
        in ``self.history``.

        Returns:
            ``(loss, entropy)`` from the last train step.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)

        # Split the samples into their three components.
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]

        # Predictions before the update. Per the original author's note this
        # wraps Keras' ``predict_on_batch`` (predictions for a single batch of
        # samples, returned as NumPy arrays).
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)

        for _ in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch,
                self.learn_rate * self.lr_multiplier)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
            if kl > self.kl_targ * 4:  # early stopping if D_KL diverges badly
                break
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        winners = np.array(winner_batch)
        winners_var = np.var(winners)
        explained_var_old = 1 - np.var(winners - old_v.flatten()) / winners_var
        explained_var_new = 1 - np.var(winners - new_v.flatten()) / winners_var
        self.history.append([
            self.i, kl, self.lr_multiplier, loss, entropy, explained_var_old,
            explained_var_new
        ])
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{:.8f},"
               "entropy:{:.5f},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Evaluate the trained policy by playing against a pure MCTS player.

        Note: this is only used to monitor training progress. The result is
        printed, sent through ``send_msg`` and appended to ``evaluate_path``.

        Returns:
            The win ratio over ``n_games``, counting a tie as half a win.
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            # Alternate who moves first.
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        # Outcome keys as reported below: 1 = win, 2 = lose, -1 = tie.
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        summary = "num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1])
        print(summary)
        send_msg(summary)

        self._record_evaluation(win_cnt[1], win_cnt[2], win_cnt[-1])
        return win_ratio

    def _record_evaluation(self, wins, losses, ties):
        """Append one evaluation result to ``evaluate_path``.

        The header is written (terminated by a newline, so the first data row
        starts on its own line) when the file does not exist yet.
        """
        is_new_file = not os.path.exists(self.evaluate_path)
        with open(self.evaluate_path, 'a') as f:
            if is_new_file:
                f.write('i, num_playouts, win, lose, tie\n')
            f.write(
                f'{self.i}, {self.pure_mcts_playout_num}, {wins}, {losses}, {ties}\n'
            )

    def save_progress(self):
        """Overwrite ``self.progress`` with the current training counters."""
        # Make sure the output folder exists so a first run does not fail.
        os.makedirs(os.path.dirname(self.progress) or '.', exist_ok=True)
        with open(self.progress, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows([
                ['i', 'pure_mcts_playout_num', 'best_win_ratio', 'flushTimes'],
                [
                    self.i, self.pure_mcts_playout_num, self.best_win_ratio,
                    self.flushTimes
                ],
            ])
Exemple #19
0
class TrainPipeline():
    """Self-play training pipeline for a policy-value net on an 8x8 board."""

    def __init__(self, init_model='./current_policy.hdf5'):
        """Create the game, the network and the self-play MCTS player.

        Args:
            init_model: path of a saved model to continue from; a falsy value
                starts from a freshly initialised network.
        """
        # Board parameters.
        self.board_width = 8
        self.board_height = 8
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # Training parameters.
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.temp = 1.0  # temperature parameter
        self.n_playout = 400  # simulations per move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # train steps per update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # Playouts of the pure MCTS used as evaluation opponent.
        self.pure_mcts_playout_num = 2000

        # Continue from an existing network when a path is given, otherwise
        # train a new one.
        self.policy_value_net = (
            PolicyValueNet(self.board_width,
                           self.board_height,
                           model_file=init_model)
            if init_model else
            PolicyValueNet(self.board_width, self.board_height))
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """Augment the data with rotations and mirror images.

        Every ``(state, mcts_prob, winner)`` record yields 8 records: the four
        90-degree rotations, each followed by its horizontal mirror.
        """
        grid_shape = (self.board_height, self.board_width)
        augmented = []
        for state, mcts_prob, winner in play_data:
            prob_grid = np.flipud(mcts_prob.reshape(grid_shape))
            for quarter_turns in range(1, 5):
                turned_state = np.array(
                    [np.rot90(plane, quarter_turns) for plane in state])
                turned_prob = np.rot90(prob_grid, quarter_turns)
                augmented.append(
                    (turned_state, np.flipud(turned_prob).flatten(), winner))

                mirrored_state = np.array(
                    [np.fliplr(plane) for plane in turned_state])
                mirrored_prob = np.fliplr(turned_prob)
                augmented.append(
                    (mirrored_state, np.flipud(mirrored_prob).flatten(),
                     winner))
        return augmented

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and store their augmented data.

        ``self.episode_len`` is left holding the length of the last game.
        """
        for _ in range(n_games):
            _winner, records = self.game.start_self_play(self.mcts_player,
                                                         temp=self.temp)
            records = list(records)
            self.episode_len = len(records)
            self.data_buffer.extend(self.get_equi_data(records))

    def policy_update(self):
        """Update the policy-value network on one sampled mini-batch.

        Returns:
            ``(loss, entropy)`` from the last train step.
        """
        state_batch, mcts_probs_batch, winner_batch = [], [], []
        for sample in random.sample(self.data_buffer, self.batch_size):
            state_batch.append(sample[0])
            mcts_probs_batch.append(sample[1])
            winner_batch.append(sample[2])

        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        old_log_probs = np.log(old_probs + 1e-10)
        step_lr = self.learn_rate * self.lr_multiplier
        for _ in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch, step_lr)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            divergence = old_probs * (old_log_probs -
                                      np.log(new_probs + 1e-10))
            kl = np.mean(np.sum(divergence, axis=1))
            # Stop early when the policy has moved too far (KL > 4x target).
            if kl > self.kl_targ * 4:
                break

        # Adapt the learning-rate multiplier to the measured KL.
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        targets = np.array(winner_batch)
        target_var = np.var(targets)
        explained_var_old = 1 - np.var(targets - old_v.flatten()) / target_var
        explained_var_new = 1 - np.var(targets - new_v.flatten()) / target_var
        report = ("kl:{:.5f},"
                  "lr_multiplier:{:.3f},"
                  "loss:{},"
                  "entropy:{},"
                  "explained_var_old:{:.3f},"
                  "explained_var_new:{:.3f}")
        print(report.format(kl, self.lr_multiplier, loss, entropy,
                            explained_var_old, explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Evaluate the trained policy by playing against pure MCTS.

        Returns:
            The win ratio over ``n_games``, a tie counting as half a win.
        """
        challenger = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                c_puct=self.c_puct,
                                n_playout=self.n_playout)
        baseline = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for game_idx in range(n_games):
            outcome = self.game.start_play(challenger,
                                           baseline,
                                           start_player=game_idx % 2,
                                           is_shown=0)
            win_cnt[outcome] += 1
        wins, losses, ties = win_cnt[1], win_cnt[2], win_cnt[-1]
        win_ratio = 1.0 * (wins + 0.5 * ties) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, wins, losses, ties))
        return win_ratio

    def run(self):
        """Run ``game_batch_num`` batches of self-play, training and checks."""
        for batch_no in range(1, self.game_batch_num + 1):
            self.collect_selfplay_data(self.play_batch_size)
            print("batch i:{}, episode_len:{}".format(batch_no,
                                                      self.episode_len))
            if len(self.data_buffer) > self.batch_size:
                self.policy_update()

            if batch_no % self.check_freq != 0:
                continue

            # Periodic check: evaluate the current model and save it.
            print("current self-play batch: {}".format(batch_no))
            win_ratio = self.policy_evaluate()
            self.policy_value_net.save_model('./current_policy.hdf5')
            if win_ratio <= self.best_win_ratio:
                continue

            print("New best policy!!!!!!!!")
            self.best_win_ratio = win_ratio
            # update the best_policy
            self.policy_value_net.save_model('./best_policy.hdf5')
            if (self.best_win_ratio == 1.0
                    and self.pure_mcts_playout_num < 5000):
                # Perfect score: make the opponent stronger and start over.
                self.pure_mcts_playout_num += 1000
                self.best_win_ratio = 0.0
Exemple #20
0
class TrainPipeline():
    """Self-play training loop for a policy-value network (8x8, 5 in a row)."""

    def __init__(self, init_model=None):
        """Build the game, the network and the self-play player.

        Args:
            init_model: optional path to a saved model to start from; when
                falsy a new policy-value net is created.
        """
        # Board and game.
        self.board_width = 8
        self.board_height = 8
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)

        # Training hyper-parameters.
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adjusted adaptively from the KL divergence
        self.temp = 1.0  # temperature
        self.n_playout = 400  # simulations per move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # train steps per update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        # Simulations used by the pure MCTS opponent during evaluation.
        self.pure_mcts_playout_num = 1000

        # Start from the given model when there is one, else from scratch.
        net_kwargs = {'model_file': init_model} if init_model else {}
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               **net_kwargs)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def get_equi_data(self, play_data):
        """Expand the data set with rotated and mirrored copies.

        play_data: [(state, mcts_prob, winner_z), ..., ...]
        Returns 8 records per input record, in the order rotation 1, its
        mirror, rotation 2, its mirror, and so on.
        """
        rows, cols = self.board_height, self.board_width
        result = []
        for state, mcts_prob, winner in play_data:
            base_prob = np.flipud(mcts_prob.reshape(rows, cols))
            for k in (1, 2, 3, 4):
                # counterclockwise rotation by k quarter turns
                state_rot = np.array([np.rot90(layer, k) for layer in state])
                prob_rot = np.rot90(base_prob, k)
                result.append(
                    (state_rot, np.flipud(prob_rot).flatten(), winner))
                # left-right mirror of the rotated sample
                state_flip = np.array([np.fliplr(layer) for layer in state_rot])
                prob_flip = np.fliplr(prob_rot)
                result.append(
                    (state_flip, np.flipud(prob_flip).flatten(), winner))
        return result

    def collect_selfplay_data(self, n_games=1):
        """Generate self-play games and push their augmented data to the buffer.

        ``self.episode_len`` ends up as the length of the last game played.
        """
        for _ in range(n_games):
            _winner, game_records = self.game.start_self_play(
                self.mcts_player, temp=self.temp)
            game_records = list(game_records)
            self.episode_len = len(game_records)
            augmented = self.get_equi_data(game_records)
            self.data_buffer.extend(augmented)

    def policy_update(self):
        """Run one update of the policy-value net; return (loss, entropy)."""
        samples = random.sample(self.data_buffer, self.batch_size)
        state_batch = [sample[0] for sample in samples]
        mcts_probs_batch = [sample[1] for sample in samples]
        winner_batch = [sample[2] for sample in samples]

        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        log_old = np.log(old_probs + 1e-10)
        current_lr = self.learn_rate * self.lr_multiplier
        for _ in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch, current_lr)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            per_sample_kl = np.sum(
                old_probs * (log_old - np.log(new_probs + 1e-10)), axis=1)
            kl = np.mean(per_sample_kl)
            if kl > self.kl_targ * 4:
                # The policy diverged too far from the old one: stop early.
                break

        # Shrink or grow the learning-rate multiplier depending on the KL.
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
            self.lr_multiplier *= 1.5

        z = np.array(winner_batch)
        z_var = np.var(z)
        explained_var_old = 1 - np.var(z - old_v.flatten()) / z_var
        explained_var_new = 1 - np.var(z - new_v.flatten()) / z_var
        template = ("kl:{:.5f},"
                    "lr_multiplier:{:.3f},"
                    "loss:{},"
                    "entropy:{},"
                    "explained_var_old:{:.3f},"
                    "explained_var_new:{:.3f}")
        print(template.format(kl, self.lr_multiplier, loss, entropy,
                              explained_var_old, explained_var_new))
        return loss, entropy

    def policy_evaluate(self, n_games=10):
        """Play the current policy against pure MCTS and return the win ratio.

        Only used to monitor training progress; a tie counts as half a win.
        """
        net_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                c_puct=self.c_puct,
                                n_playout=self.n_playout)
        pure_player = MCTS_Pure(c_puct=5,
                                n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for round_no in range(n_games):
            result = self.game.start_play(net_player,
                                          pure_player,
                                          start_player=round_no % 2,
                                          is_shown=0)
            win_cnt[result] += 1
        n_win, n_lose, n_tie = win_cnt[1], win_cnt[2], win_cnt[-1]
        win_ratio = 1.0 * (n_win + 0.5 * n_tie) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, n_win, n_lose, n_tie))
        return win_ratio

    def _check_and_save(self, batch_no):
        """Evaluate the current model, save it, and track the best policy."""
        print("current self-play batch: {}".format(batch_no))
        win_ratio = self.policy_evaluate()
        self.policy_value_net.save_model('./current_policy.model')
        if win_ratio <= self.best_win_ratio:
            return
        print("New best policy!!!!!!!!")
        self.best_win_ratio = win_ratio
        # update the best_policy
        self.policy_value_net.save_model('./best_policy.model')
        if self.best_win_ratio == 1.0 and self.pure_mcts_playout_num < 5000:
            # Unbeaten: raise the opponent's strength and reset the record.
            self.pure_mcts_playout_num += 1000
            self.best_win_ratio = 0.0

    def run(self):
        """Run the training pipeline until done or interrupted with Ctrl-C."""
        try:
            for step in range(self.game_batch_num):
                batch_no = step + 1
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    batch_no, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()
                # Periodically check the model's strength and save it.
                if batch_no % self.check_freq == 0:
                    self._check_and_save(batch_no)
        except KeyboardInterrupt:
            print('\n\rquit')
Exemple #21
0
class TrainPipeline():
    """Self-play training pipeline for a policy-value network.

    The pipeline repeatedly plays games against itself with an MCTS player
    guided by the current network, stores the resulting samples in a bounded
    replay buffer, and trains the network on random mini-batches drawn from
    that buffer.
    """

    def __init__(self, init_model=None):
        """Set up the game, hyper-parameters, network and self-play player.

        Args:
            init_model: optional model file passed to ``PolicyValueNet`` to
                resume from; when falsy a fresh network is created.
        """
        self.board = CSB_Game()
        self.game = Game(self.board)
        # training params
        self.learn_rate = .001
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 50  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 50  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 20  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 100000000000000000000000
        self.save_freq = 100  # save a checkpoint every this many batches
        self.game_batch_num = 200000000
        self.best_win_ratio = 0.0
        # length of the most recent self-play game; defined up front so that
        # run() can report it even if no game has been played yet
        self.episode_len = 0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(model_file=init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet()
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

    def collect_selfplay_data(self, n_games=1):
        """Play ``n_games`` self-play games and append their samples to the buffer.

        ``self.episode_len`` is updated to the number of samples produced by
        the most recent game.  No data augmentation is applied.
        """
        for _ in range(n_games):
            _, play_data = self.game.start_self_play(self.mcts_player,
                                                     temp=self.temp)
            # materialise once: the length is needed before extending
            play_data = list(play_data)
            self.episode_len = len(play_data)
            self.data_buffer.extend(play_data)

    def policy_update(self):
        """Train the policy-value net on one random mini-batch.

        Runs ``self.epochs`` training steps on the same mini-batch using the
        effective learning rate ``learn_rate * lr_multiplier``, then adapts
        ``lr_multiplier`` from the KL divergence between the pre-update and
        post-update policies and prints training diagnostics.

        Returns:
            The ``(loss, entropy)`` pair reported by the last training step.
        """
        mini_batch = random.sample(self.data_buffer, self.batch_size)
        state_batch = [data[0] for data in mini_batch]
        mcts_probs_batch = [data[1] for data in mini_batch]
        winner_batch = [data[2] for data in mini_batch]
        old_probs, old_v = self.policy_value_net.policy_value(state_batch)
        # The multiplier is only adapted after the loop, so the effective
        # rate is constant for every step of this update.  Without applying
        # it here the KL-based adaptation below would have no effect.
        learn_rate = self.learn_rate * self.lr_multiplier
        for _ in range(self.epochs):
            loss, entropy = self.policy_value_net.train_step(
                state_batch, mcts_probs_batch, winner_batch, learn_rate)
            new_probs, new_v = self.policy_value_net.policy_value(state_batch)
            # KL(old || new), averaged over the batch; 1e-10 guards log(0)
            kl = np.mean(
                np.sum(old_probs *
                       (np.log(old_probs + 1e-10) - np.log(new_probs + 1e-10)),
                       axis=1))
        # adaptively adjust the learning rate
        if kl > self.kl_targ * 2 and self.lr_multiplier > 0.01:
            self.lr_multiplier /= 1.5
        elif kl < self.kl_targ / 2 and self.lr_multiplier < 100:
            self.lr_multiplier *= 1.5

        # explained variance of the value head before and after the update;
        # eps keeps the ratio finite when every target in the batch is equal
        eps = 0.00000000001
        winners = np.array(winner_batch)
        target_var = np.var(winners) + eps
        explained_var_old = 1 - np.var(winners - old_v.flatten()) / target_var
        explained_var_new = 1 - np.var(winners - new_v.flatten()) / target_var
        print(("kl:{:.5f},"
               "lr_multiplier:{:.3f},"
               "loss:{},"
               "entropy:{},"
               "explained_var_old:{:.3f},"
               "explained_var_new:{:.3f}").format(kl, self.lr_multiplier, loss,
                                                  entropy, explained_var_old,
                                                  explained_var_new))
        return loss, entropy

    def run(self):
        """Run the training pipeline until done or interrupted.

        Each batch collects self-play data and, once the buffer holds more
        than ``batch_size`` samples, performs a policy update.  The current
        network is saved to ``./current_policy.model`` every ``save_freq``
        batches.  A ``KeyboardInterrupt`` ends the loop quietly.
        """
        try:
            for i in range(self.game_batch_num):
                self.collect_selfplay_data(self.play_batch_size)
                print("batch i:{}, episode_len:{}".format(
                    i + 1, self.episode_len))
                if len(self.data_buffer) > self.batch_size:
                    self.policy_update()
                # periodically save the model params
                if (i + 1) % self.save_freq == 0:
                    self.policy_value_net.save_model('./current_policy.model')
        except KeyboardInterrupt:
            print('\n\rquit')