Example 1
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2,
                                       is_shown=0)
         win_cnt[winner] += 1
     win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
             self.pure_mcts_playout_num,
             win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
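
The win_ratio returned by policy_evaluate is normally consumed by the surrounding training loop. Below is a minimal sketch of that loop, assuming method names from the reference AlphaZero_Gomoku TrainPipeline (collect_selfplay_data, policy_update and save_model are assumptions, not part of the example above):

 def run(self):
     """Hypothetical training loop showing where policy_evaluate fits in."""
     for i in range(self.game_batch_num):
         # self-play games feed the replay buffer
         self.collect_selfplay_data(self.play_batch_size)
         if len(self.data_buffer) > self.batch_size:
             self.policy_update()
         # every check_freq batches, evaluate against the pure MCTS opponent
         if (i + 1) % self.check_freq == 0:
             win_ratio = self.policy_evaluate()
             if win_ratio > self.best_win_ratio:
                 self.best_win_ratio = win_ratio
                 # keep the strongest model seen so far (file name is an assumption)
                 self.policy_value_net.save_model('./best_policy.model')
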
Example 2
    def play(self):

        model_file = "current.model"
        best_policy = PolicyValueNet(self.width, self.height, model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=300)
        pure_player = MCTS_Pure(c_puct=5, n_playout=300)

        human1 = Human()
        human2 = Human()
        # self.show()

        win_cnt = defaultdict(int)
        for i in range(10):
            winner = self.start_play(mcts_player,
                                     pure_player,
                                     start_player=(i % 2),
                                     is_shown=1)
            win_cnt[winner] += 1
        print("win", win_cnt[1], "lose", win_cnt[2], "tie", win_cnt[0])
Example 3
def run():
    n_row = 5
    width, height = 11, 11

    try:
        board = Board(width=width, height=height, n_in_row=n_row)
        game = Game(board)

        ################ human VS AI ###################

        best_policy = PolicyValueNet(width, height, n_row)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 4
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2,
                                       is_shown=0)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example 5
def MCTSput(board, who, n_playout=400):
    # print("n_playout=", n_playout)
    # input("Press any key to continue")
    player = MCTSPlayer(c_puct=5, n_playout=n_playout)
    # Set the current player; only needed when using do_move
    # board.set_current_player(who)
    # If this is the opening move, play at a random spot
    last = board.getLast()
    if last == [-1, -1]:
        row = random.randint(2, 5)
        col = random.randint(2, 5)
        if board[row][col] == 0:
            move = board.location_to_move((row, col))
            if board.do_move(move):
                return True
        return False
    # Not the opening move
    move = player.get_action(board)
    #print(board.current_player, who)
    #    input("Press any key to continue")
    return board.do_move(move)
Example 6
def run():
    n = 5
    width, height = 8, 8
    model_file = './best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        # human vs. AI
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
        # uncomment for AI self-play
        # game.start_self_play(mcts_player,is_shown=1)
        # uncomment for AI vs. AI
        # game.start_play(mcts_player2,mcts_player,is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 7
 def __init__(self, init_model=None):
     self.board_width = 6
     self.board_height = 6
     self.config = GameConfig()
     self.board = Board(self.config)
     self.game = Game(self.board)
     # training params
     # learning rate 0.002
     self.learn_rate = 2e-3
     # Adjust the learning rate automatically: KL divergence measures how close two
     # probability distributions are, and the learning rate is tuned to keep it near
     # the target value.
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 1500  # num of simulations for each move
     self.c_puct = 5  #UCTK
     self.buffer_size = 10000
     self.batch_size = 200  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     #    self.check_freq = 25
     #    self.game_batch_num = 1500
     self.game_batch_num = 5000
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 5000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example 8
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 10
     self.board_height = 10
     self.n_in_row = 4
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3  # base learning rate
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5  # trade-off between exploitation and exploration
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)  # double-ended replay buffer
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02  # KL target used as an early-stopping check
     self.check_freq = 50  # evaluate the policy every 50 self-play batches
     self.game_batch_num = 500  # number of self-play batches to train for
     self.best_win_ratio = 0.0  # best win ratio so far, used to decide whether to keep a model
     # num of playouts for the weak pure-MCTS opponent used to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # initialize the policy-value net from init_model
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # train a new policy-value net from scratch
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     # AI player with is_selfplay=1: the games are self-play, since we are training
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
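
The kl_targ and lr_multiplier fields above cooperate inside policy_update: the learning rate is scaled up or down depending on how far a single update moved the policy. A hedged sketch of that rule, following the reference implementation (the local variable kl, the KL divergence between the policy before and after the update, is assumed to be computed earlier in that method):

     # shrink the learning rate when the update moved the policy too far,
     # grow it again when the update was too conservative
     if kl > self.kl_targ * 2 and self.lr_multiplier > 0.1:
         self.lr_multiplier /= 1.5
     elif kl < self.kl_targ / 2 and self.lr_multiplier < 10:
         self.lr_multiplier *= 1.5
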
Example 9
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 15
     self.board_height = 15
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 800  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)  # stores the (augmented) MCTS self-play data
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update # should be 400 or 800 here
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000  # set to 1000 here
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example 10
def run():
    n = 5
    # width, height = 8, 8
    width, height = 16, 16
    # model_file = 'best_policy_8_8_5.model'
    model_file = './tfData/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy

        # load the trained policy
        best_policy = PolicyValueNet(width, height, model_file)
        # build the MCTS player from the policy function
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        winner, play_data = game.start_play(human, mcts_player, start_player=1, is_shown=1)
        play_data = list(play_data)
        play_data = get_equi_data(play_data, height, width)
        backupSave(play_data, "human")

    except KeyboardInterrupt:
        print('\n\rquit')
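
Example 10 relies on a module-level get_equi_data helper that is not shown. The sketch below follows the augmentation step of the reference AlphaZero_Gomoku code, adapted to the (play_data, height, width) signature used above; the layout of each sample as (state, mcts_prob, winner) is an assumption:

import numpy as np

def get_equi_data(play_data, height, width):
    """Augment (state, mcts_prob, winner) samples by rotation and mirroring."""
    extend_data = []
    for state, mcts_prob, winner in play_data:
        for i in [1, 2, 3, 4]:
            # rotate the state planes and the move-probability grid together
            equi_state = np.array([np.rot90(s, i) for s in state])
            equi_prob = np.rot90(np.flipud(mcts_prob.reshape(height, width)), i)
            extend_data.append((equi_state, np.flipud(equi_prob).flatten(), winner))
            # flip the rotated sample horizontally
            equi_state = np.array([np.fliplr(s) for s in equi_state])
            equi_prob = np.fliplr(equi_prob)
            extend_data.append((equi_state, np.flipud(equi_prob).flatten(), winner))
    return extend_data
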
Example 11
 def local_thread_func(thread_id, shared_queue, net_lock, data_lock):
     from policy_value_net_tensorflow import PolicyValueNet
     # load the model file while holding the net lock
     logging.info("selfplay process {} ask net lock".format(thread_id))
     with net_lock:
         logging.info(
             'selfplay process {} get net lock'.format(thread_id))
         current_policy = PolicyValueNet(self.board_width,
                                         self.board_height,
                                         model_dir,
                                         model_file=current_model_name)
     logging.info(
         'selfplay process {} release net lock'.format(thread_id))
     local_board = Board(width=self.board_width,
                         height=self.board_height,
                         n_in_row=self.n_in_row)
     local_game = Game(local_board)
     local_mcts_player = MCTSPlayer(current_policy.policy_value_fn,
                                    c_puct=self.c_puct,
                                    n_playout=self.n_playout,
                                    is_selfplay=1)
     logging.info("selfplay process {} start {}th selfplay".format(
         thread_id, index))
     winner, play_data = local_game.start_self_play(local_mcts_player,
                                                    temp=self.temp)
     logging.info("selfplay process {} finish {}th selfplay".format(
         thread_id, index))
     play_data = list(play_data)
     play_data = self.get_equi_data(play_data)
     # append the self-play data while holding the data lock
     logging.info('selfplay process {} ask data lock'.format(thread_id))
     with data_lock:
         logging.info(
             'selfplay process {} get data lock'.format(thread_id))
         shared_queue.extend(play_data)
         while len(shared_queue) > self.buffer_num:
             shared_queue.pop(0)
     logging.info(
         'selfplay process {} release data lock'.format(thread_id))
Example 12
 def __init__(self, init_model=None):
     # board params
     self.board_width = 8
     self.board_height = 8
     # self.n_in_row = 5
     self.board = chessboard(row=self.board_width, col=self.board_height)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0
     self.temp = 1.0
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train steps for each update
     self.kl_targ = 0.02
     self.check_freq = 2
     # number of self-play games
     self.game_batch_num = 1000
     self.best_win_ratio = 0.0
     # pure MCTS playouts, used as the evaluation baseline
     self.pure_mcts_playout_num = 400
     # if a pre-trained model is provided
     if init_model:
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model,
                                                use_gpu=True)
     else:
         # train from scratch
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                use_gpu=True)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example 13
def single_game_play(num,initmode):
    print('Starting worker {} '.format(num))
    board = Board(width=board_width,
                  height=board_height,
                  n_in_row=n_in_row)
    game = Game(board)
    if initmode:
        policy_value_net = PolicyValueNet(board_width,board_height,model_file=initmode)
    else:
        policy_value_net = PolicyValueNet(board_width,board_height)

    mcts_player = MCTSPlayer(policy_value_net.policy_value_fn,
                                 c_puct=c_puct,
                                 n_playout=n_playout,
                                 is_selfplay=1)

    winner, play_data = game.start_self_play(mcts_player,temp=temp)
    # Do not do the following here: zip returns an iterator, not a static data structure like a list,
    # so consuming it in the worker would leave nothing for the caller.
    #playlen = len(list(play_data))
    #print('Exiting worker{} and len is {}'.format(num,playlen))
    #logging.info('Exiting worker{} and len is {}'.format(num,playlen))
    return winner, play_data
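
The comment above warns that play_data is a zip iterator and must not be consumed inside the worker. A minimal consumer-side sketch, assuming the caller materializes the samples once and keeps them in a bounded replay buffer (the buffer size and the direct call without a process pool are assumptions):

from collections import deque

data_buffer = deque(maxlen=10000)
winner, play_data = single_game_play(0, initmode=None)
samples = list(play_data)  # materialize the iterator exactly once, in the consumer
data_buffer.extend(samples)
print('worker 0 finished, winner={}, samples={}'.format(winner, len(samples)))
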
Example 14
 def __init__(self, init_model='./current_policy.hdf5'):
     # board params
     self.board_width = 8
     self.board_height = 8
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # num of pure-MCTS playouts used for policy evaluation
     self.pure_mcts_playout_num = 2000
     if init_model:
         # continue training from an existing net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example 15
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     print('4')
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = 0
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2,
                                       is_shown=0)
         if winner == 1:  # only count games the trained player actually won
             win_cnt += 1
     win_ratio = win_cnt / n_games
     print("num_playouts:{}, win: {}".format(self.pure_mcts_playout_num,
                                             win_cnt))
     return win_ratio
Example 16
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing games against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(
         self.policy_value_net.policy_value_func,
         c_puct=self.c_puct,
         n_play_out=self.n_play_out)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_play_out=self.pure_mcts_play_out_number)
     win_cnt = defaultdict(int)
     results = self.pool.map(self.game.start_play,
                             [(current_mcts_player, pure_mcts_player, i)
                              for i in range(n_games)])
     for winner in results:
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print_log("number_play_outs:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_play_out_number, win_cnt[1], win_cnt[2],
         win_cnt[-1]))
     return win_ratio
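
Note that Pool.map passes each tuple to game.start_play as a single positional argument, so the call above only works if that start_play was written to unpack its argument itself. If it takes separate parameters, as in the other examples, Pool.starmap is the usual fit; a hedged sketch:

     results = self.pool.starmap(
         self.game.start_play,
         [(current_mcts_player, pure_mcts_player, i % 2) for i in range(n_games)])
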
Example 17
def run():
    width, height = 9, 9
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height)
        game = Game(board)

        # ############### human VS AI ###################

        best_policy = PolicyValueNet(width, height, model_file = model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=800)

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 18
 def __init__(self, init_model=None):
     # params of the board and the game
     #width of chessboard
     self.board_width = 8  #6 #10
     #height of chessboard
     self.board_height = 8 #6 #10
     #conditions for victory
     self.n_in_row = 5     #4 #5
     self.board = Board(width=self.board_width, height=self.board_height, n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params 
     self.learn_rate = 5e-3   #learning rate
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0 # the temperature param
     self.n_playout = 400 # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000 #The number of maximum elements in the queue
     self.batch_size = 512 # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size) # queue size      
     self.play_batch_size = 1 # collect a set of data if it self-play once
     self.epochs = 5 # num of train_steps for each update
     self.kl_targ = 0.025 #KL target
     #check frequency: evaluate the game and current AI model every 50 times of self-play
     #The evaluation method is to use the latest AI model and MCTs-pure AI (based on random roll out) to fight 10 rounds
     self.check_freq = 50  #50
     self.game_batch_num = 200 #the number of training batches
     self.best_win_ratio = 0.0 #historical best winning rate
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000  
     if init_model:
         # start training from an initial policy-value net
         # pickle.load(file) deserializes the saved network parameters from the file
         policy_param = pickle.load(open(init_model, 'rb'))  # 'rb' reads the file in binary mode
         self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width, self.board_height) 
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout, is_selfplay=1)
Example 19
def run():
    n = 5
    width, height = 15, 15
    model_file = 'best_policy_3000.pt'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3

        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player1 = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()
        print("!!!!!")

        # set start_player=0 for human first
        game.start_play(mcts_player1, human, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 20
def main(config):
    try:
        game = Game.from_config(config)

        # ############### human VS AI ###################
        # load the trained policy_value_net in PyTorch

        policy_value_net = PolicyValueNet(config.size,
                                          model_file=config.model_file)
        mcts_player = MCTSPlayer(
            policy_value_net,
            c_puct=config.c_puct,
            n_playout=config.n_playout,
            temperature=HUMAN_PLAY_TEMPERATURE,
        )

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, display=1)
    except KeyboardInterrupt:
        print("\n\rquit")
Example 21
def run():
    n = 6
    width, height = 9, 9
    model_file = 'best_policy.model'  # load the trained model
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)      
        
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'), encoding = 'bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)  # set larger n_playout for better performance
        
        
        # human player, input your move in the format: 2,3
        human = Human()                   
        
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 22
def run(states, sensible_moves, currentPlayer, lastMove):
    # number of consecutive stones needed to win
    n = 5
    # board width and height
    width, height = 8, 8
    board = Board(width=width, height=height, n_in_row=n)
    board.init_board()

    board.states = states
    board.availables = sensible_moves
    board.current_player = currentPlayer
    board.last_move = lastMove
    # policy-value network
    best_policy = PolicyValueNetNumpy(width, height, policy_param)
    # MCTS player guided by the policy-value network
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5,
                             n_playout=400)

    # get the next move from the MCTS search
    nextmove = mcts_player.get_action(board)

    return nextmove
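
A minimal calling sketch for the bridge function above, assuming the surrounding engine tracks moves as flat board indices in the AlphaZero_Gomoku convention (move = row * width + col) and that policy_param is already loaded at module level; the concrete moves are made up for illustration:

states = {27: 1, 36: 2}  # move index -> player who placed a stone there
sensible_moves = [m for m in range(8 * 8) if m not in states]
next_move = run(states, sensible_moves, currentPlayer=1, lastMove=36)
row, col = divmod(next_move, 8)  # convert the flat index back to board coordinates
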
Example 23
def run():
    n = 5
    width, height = 8, 8
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=500)

        # human player, input your move in the format: 2,3
        human1 = Human()

        # set start_player=0 for human first
        game.start_play(human1, mcts_player, start_player=1, is_shown=1)
        # game.start_play(human1, human2, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 24
def get_mcts_player(player_index=1):
    """
    Get an mcts player, an index of 1 corresponds to first player (typically
    human) and an index of 2 corresponds to the second player (typically AI
    opponent).
    """
    board = Board()
    board.init_board()

    size = 8
    model_file = '../AlphaZero_Gomoku/best_policy_8_8_5.model'

    try:
        policy_param = pickle.load(open(model_file, 'rb'))
    except Exception:
        policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')

    best_policy = PolicyValueNetNumpy(size, size, policy_param)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5,
                             n_playout=200)
    mcts_player.set_player_ind(player_index)

    return mcts_player
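
A short usage sketch for get_mcts_player, assuming a Board instance named board is maintained by the calling code (get_action and do_move are the standard AlphaZero_Gomoku player/board methods):

ai_player = get_mcts_player(player_index=2)  # AI plays as the second player
move = ai_player.get_action(board)           # ask the MCTS player for a move
board.do_move(move)                          # apply it to the shared board
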
Example 25
    def __init__(self, init_model=None, is_shown=0):

        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.is_shown = is_shown
        self.game = Game_UI(self.board, is_shown)

        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = 1000
        if init_model:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Example 26
 def __init__(self, n: int, init_model=None):
     # params of the board and the game
     self.n = n
     self.board = Board(self.n)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_play_out = 400  # number of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.epochs = 5  # number of train_steps for each update
     self.kl_target = 0.025
     self.check_freq = 50
     self.game_batch_number = 10000
     self.best_win_ratio = 0.0
     self.episode_length = 0
     self.pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
     # number of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     self.last_batch_number = 0
     self.pure_mcts_play_out_number = 1000
     if init_model:
         # start training from an initial policy-value net
         policy_param = pickle.load(open(init_model, 'rb'))
         self.policy_value_net = PolicyValueNet(self.n,
                                                net_params=policy_param)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.n)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_func,
                                   c_puct=self.c_puct,
                                   n_play_out=self.n_play_out,
                                   is_self_play=1)
Example 27
def run():
    n = 5
    width, height = 9, 9
    model_file = 'output/best_policy.model'
    try:
        board = Board(width=width,
                      height=height,
                      n_in_row=n,
                      forbidden_hands=True)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        best_policy = PolicyValueNetRes30(width,
                                          height,
                                          'l+',
                                          model_file=model_file)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example 28
 def __init__(self, game_batch_num, model_file=None):
     # params of the board and the game
     self.size = BOARD_SIZE
     use_gpu = False
     board = Board(size=self.size, n_in_row=N_IN_ROW)
     self.game = Game(board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=10000)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = game_batch_num
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if model_file:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(size=self.size,
                                                model_file=model_file,
                                                use_gpu=use_gpu)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(size=self.size,
                                                use_gpu=use_gpu)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example 29
 def __init__(self):
     # params of the board and the game
     self.board_width = 6
     self.board_height = 6
     self.n_in_row = 4
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 5e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.025
     self.check_freq = 50
     self.game_batch_num = 1500
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     # start training from a given policy-value net
     #        policy_param = pickle.load(open('current_policy.model', 'rb'))
     #        self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params = policy_param)
     # start training from a new policy-value net
     self.policy_value_net = PolicyValueNet(self.board_width,
                                            self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Example 30
def run():
    n = 5
    width, height = 15, 15
    # model_file = 'best_policy_8_8_5.model'
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        ################ human VS AI ###################
        # MCTS player with the policy_value_net trained by AlphaZero algorithm
        #        policy_param = pickle.load(open(model_file, 'rb'))
        #        best_policy = PolicyValueNet(width, height, net_params = policy_param)
        #        mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # MCTS player with the trained policy_value_net written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        #        mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
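
In the reference scripts these run() helpers are executed directly via the standard entry-point guard:

if __name__ == '__main__':
    run()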