Example #1
def run():
    model_file = './current_policy.model'
    best_policy = PolicyValueNet(6, 6, model_file)
    config = GameConfig()
    board = Board(config)
    game = Game(board)
    mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5,
                              n_playout=1000)
    mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1000)
    mcts_player3 = MCTS_Pure(c_puct=5, n_playout=1000)
    human = Human(config)
    human2 = Human(config)
    human3 = Human(config)
    # the third positional argument of start_play is start_player in the other
    # examples, so pass a player index here instead of a third player object
    game.start_play(mcts_player3, human, start_player=1)
Example #2
    def policy_evaluate(self, iteration, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)

        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2 + 1,
                                          is_shown=0,
                                          savefig=False)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts: {}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))

        self.writer.add_text(
            tag='evaluation results',
            text_string=
            f"num_playouts: {self.pure_mcts_playout_num}, win: {win_cnt[1]}, lose: {win_cnt[2]}, tie:{win_cnt[-1]}",
            global_step=iteration + 1)

        return win_ratio
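The bookkeeping above is the convention used throughout these examples: start_play returns 1 or 2 for the winning side, ties are usually stored under key -1 (a few variants use 0), and a tie is scored as half a win for the evaluated player. A minimal, self-contained sketch of the same computation; the winner codes are taken from the examples, not from the MCTS_Pure API itself:

from collections import defaultdict

def win_ratio_from_results(results, n_games):
    # results: winner code per game; 1 = evaluated player, 2 = opponent, -1 = tie
    win_cnt = defaultdict(int)
    for winner in results:
        win_cnt[winner] += 1
    # a tie counts as half a win
    return (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games

# e.g. 6 wins, 2 losses and 2 ties over 10 games give a score of 0.7
print(win_ratio_from_results([1] * 6 + [2] * 2 + [-1] * 2, 10))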
Example #3
 def policy_evaluate(self,
                     pure_mcts_playout_num,
                     current_policy_value_net,
                     n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(
         current_policy_value_net.policy_value_fn,
         c_puct=self.c_puct,
         n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     board = Board(width=self.board_width,
                   height=self.board_height,
                   n_in_row=self.n_in_row)
     game = Game(board)
     logging.info('update process alphazero with pure mcts game start')
     for i in range(n_games):
         winner = game.start_play(current_mcts_player,
                                  pure_mcts_player,
                                  start_player=i % 2,
                                  is_shown=0)
         win_cnt[winner] += 1
     logging.info('update process alphazero with pure mcts game finished')
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     logging.info(
         "update process num_playouts:{}, win: {}, lose: {}, tie:{}".format(
             pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #4
def run():
    n = N
    width, height = SIZE, SIZE

    # best_policy_1 = PolicyValueNet(width, height, model_file=MODEL_1)
    # player_1 = MCTSPlayer(best_policy_1.policy_value_fn,
    #                          c_puct=5,
    #                          n_playout=400)  # set larger n_playout for better performance
    #
    # best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
    # player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
    #                          c_puct=5,
    #                          n_playout=400)  # set larger n_playout for better performance

    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        print("Benchmarking the following two models:" + MODEL_1 +
              " Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        print("Benchmarking the following two models:" + MODEL_1 + " Human")
    else:
        # in this branch player_2 is expected to be the MODEL_2-based MCTS player
        # from the commented-out block above; otherwise policy_evaluate below fails
        print("Benchmarking the following two models:" + MODEL_1 + " " +
              MODEL_2)

    player_1 = Human()

    result = policy_evaluate(player_1, player_2)
    print("The win ratio for " + MODEL_1 + " is: ", str(100 * result) + "%")
Example #5
def single_game_play():
    board = Board(width=15, height=15, n_in_row=5)
    game = Game(board)
    temp = 1.0
    player = MCTS_Pure()
    winner, play_data = game.start_self_play(player, temp=temp)
    return winner, play_data
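A hedged usage sketch for the helper above; the module layout and the exact shape of play_data are assumptions based on the surrounding examples (play_data is treated as an iterable of per-move training samples):

# collect self-play records from a few games (sketch, module layout assumed)
if __name__ == '__main__':
    records = []
    for _ in range(3):
        winner, play_data = single_game_play()
        records.extend(play_data)
        print('winner:', winner, 'samples so far:', len(records))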
Example #6
    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy network by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))

        send_msg("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))

        if not os.path.exists(self.evaluate_path):
            with open(self.evaluate_path, 'w') as f:
                f.write('i, num_playouts, win, lose, tie\n')  # newline so appended rows start on their own line
        with open(self.evaluate_path, 'a') as f:
            f.write(
                f'{self.i}, {self.pure_mcts_playout_num}, {win_cnt[1]}, {win_cnt[2]}, {win_cnt[-1]}\n'
            )
        return win_ratio
Example #7
    def policy_evaluate(self, current_batch, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)
        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)
        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games

        output = "current_batch:{},num_playouts:{},win:{},lose:{},tie:{},win_ratio:{}".format(
            current_batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2],
            win_cnt[-1], win_ratio)

        utils.log(output, SCORE_OUTPUT)

        return win_ratio
Example #8
    def policy_evaluate(self, n_games=10):
        '''
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        '''
        current_mcts_player = MCTSPlayer(
            policy_value_function=self.policy_value_net.policy_value_fn_random,
            action_fc=self.policy_value_net.action_fc_test,
            evaluation_fc=self.policy_value_net.evaluation_fc2_test,
            c_puct=5,
            n_playout=400,
            is_selfplay=False)

        test_player = MCTS_Pure(c_puct=5, n_playout=3000)

        win_cnt = defaultdict(int)
        # 5 white stone games, 5 black stone games
        for i in range(n_games):
            winner = self.game.start_play(player1=current_mcts_player,
                                          player2=test_player,
                                          start_player=i % 2,
                                          is_shown=0,
                                          print_prob=False)
            win_cnt[winner] += 1

        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio
Example #9
    def policy_evaluate(self, n_games=10):

        #print "_____policy__evaluation________"

        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.n_playout)

        pure_mcts_player = MCTS_Pure(c_puct=5,
                                     n_playout=self.pure_mcts_playout_num)

        win_cnt = defaultdict(int)

        for i in range(n_games):

            winner = self.game.start_play(current_mcts_player,
                                          pure_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            print("winner", winner)
            win_cnt[winner] += 1

        # compute once after all games; ties are stored under key 0 in this variant
        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games

        print("win ratio =", win_ratio)
        print("num_playout:{}, win: {}, lose: {}, tie:{}".format(
            self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
        return win_ratio
Example #10
    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        self.evaluate_game = Game(
            Board(width=self.config['board_width'],
                  height=self.config['board_height'],
                  n_in_row=self.config['n_in_row']))

        current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                         c_puct=self.config['c_puct'],
                                         n_playout=self.config['n_playout'])

        pure_mcts_player = MCTS_Pure(
            c_puct=5, n_playout=self.config['pure_mcts_playout_num'])

        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.evaluate_game.start_play(current_mcts_player,
                                                   pure_mcts_player,
                                                   start_player=i % 2,
                                                   is_shown=0)
            win_cnt[winner] += 1

        win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
            self.config['pure_mcts_playout_num'], win_cnt[1], win_cnt[2],
            win_cnt[-1]))
        return win_ratio
Example #11
    def policy_evaluate(self, n_games=10, num=0, self_evaluate=0):
        '''
        Evaluate the trained policy, either by playing against the pure MCTS
        player or by playing against the last best network.
        Pure MCTS games are only for monitoring the progress of training;
        games against the last best net evaluate whether the current model
        becomes the best one, which is then used to collect data.
        '''
        # fix the number of playouts per move to 400
        current_mcts_player = MCTSPlayer(policy_value_function=self.policy_value_net.policy_value_fn_random,
                                         action_fc=self.policy_value_net.action_fc_test,
                                         evaluation_fc=self.policy_value_net.evaluation_fc2_test,
                                         c_puct=self.c_puct,
                                         n_playout=400,
                                         is_selfplay=False)
        if self_evaluate:
            self.policy_value_net.load_numpy(self.policy_value_net.network_oppo_all_params)

            mcts_player_oppo = MCTSPlayer(policy_value_function=self.policy_value_net.policy_value_fn_random,
                                          action_fc=self.policy_value_net.action_fc_test_oppo,
                                          evaluation_fc=self.policy_value_net.evaluation_fc2_test_oppo,
                                          c_puct=self.c_puct,
                                          n_playout=400,
                                          is_selfplay=False)

        else:
            test_player = MCTS_Pure(c_puct=5,n_playout=self.pure_mcts_playout_num)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            if self_evaluate:
                print('+' * 80 + 'rank: {},  epoch:{}, game:{} , now situation : {} , self evaluating ...'.format(rank, num,i,win_cnt))
                winner = self.game.start_play(player1=current_mcts_player,
                                              player2=mcts_player_oppo,
                                              start_player=i%2,
                                              is_shown=0,
                                              print_prob =False)
            else:
                print('+'*80+'pure mcts playout: {},  rank: {},  epoch:{}, game:{}  evaluating ...'.format(self.pure_mcts_playout_num,rank,num,i))
                print()
                winner = self.game.start_play(player1=current_mcts_player,
                                              player2=test_player,
                                              start_player=i % 2,
                                              is_shown=0,
                                              print_prob=False)
            win_cnt[winner] += 1
        win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
        #win for 1,tie for 0.5
        if self_evaluate:
            print("-"*150+"win: {}, lose: {}, tie:{}".format(win_cnt[1], win_cnt[2], win_cnt[-1]))
        else:
            print("-"*80+"num_playouts:{}, win: {}, lose: {}, tie:{}".format(
                    self.pure_mcts_playout_num,
                    win_cnt[1], win_cnt[2], win_cnt[-1]))
        return win_ratio
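A hedged sketch of how the two modes of the method above might be driven; the trainer object and the promotion threshold are placeholders, not part of the original code:

# trainer is a hypothetical training-pipeline object that owns policy_evaluate
epoch = 0  # placeholder for the current epoch / game batch

# monitoring run against the pure MCTS baseline
ratio_vs_pure = trainer.policy_evaluate(n_games=10, num=epoch, self_evaluate=0)

# head-to-head against the last best network, e.g. to decide on promotion
ratio_vs_best = trainer.policy_evaluate(n_games=10, num=epoch, self_evaluate=1)
if ratio_vs_best > 0.55:  # illustrative threshold, not from the original code
    print('candidate network looks stronger than the previous best')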
Example #12
def run():
    n = 5
    width, height = 8, 8
    model_file = 'best_policy_8_8_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance

        #pure mcts player
        #make quick_play=True to enable a weaker but much faster roll-out player without mcts
        pure_mcts_player = MCTS_Pure(c_puct=1, n_playout=600, quick_play=False)
        roll_out_player = MCTS_Pure(quick_play=True)

        #1.run with two human player
        game.start_play_with_UI()

        #2.run with alpha zero neural network AI, and my quick roll-out AI
        #game.start_play_with_UI(AI=mcts_player, AI2 = roll_out_player)

        #3.run with alpha zero neural network AI, and my pure mcts AI
        #game.start_play_with_UI(AI=mcts_player, AI2 = pure_mcts_player)

    except KeyboardInterrupt:
        print('\n\rquit')
Example #13
 def __init__(self):
     self.temp = 1e-3  # the temperature param
     self.n_playout = 200  # num of simulations for each move
     self.c_puct = 5
     self.board_width = 8
     self.board_height = 8
     self.model_path = os.path.join("./models/curr_model_100rollout.pt")
     #self.policy_value_net = PolicyValueNet(self.board_width, self.board_height, net_params=None)
     #self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout)
     self.mcts_player = MCTS_Pure(c_puct=5, n_playout=self.n_playout)
     self.env = gym.make("Reversi8x8-v0")
     self.init_model()
Example #14
def run():
    n = 5
    width, height = 12, 12
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        mcts_player = MCTS_Pure(c_puct=5, n_playout=10000)  # set larger n_playout for better performance

        human = Human()

        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #15
def run():
    # n = 5
    width, height = 5, 5
    # model_file = 'best_policy_8_8_5.model'
    try:
        ## board = Board(width=width, height=height, n_in_row=n)
        board = Board(width=width, height=height)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        mcts_player1 = MCTS_Pure(c_puct=5, n_playout=500)
        mcts_player2 = MCTS_Pure(c_puct=5, n_playout=500)
        # human player, input your move in the format: 2,3
        # human = Human()

        # set start_player=0 for human first
        game.start_play(mcts_player1, mcts_player2, start_player=1, is_shown=0)
        # game.start_play(human, mcts_player2, start_player=1, is_shown=0)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #16
 def policy_evaluate(self, n_games=10):
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct, n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         # the AI plays against the weaker pure-MCTS AI without visualization (is_shown=0);
         # the two sides take turns playing black (start_player=i % 2)
         winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i % 2, is_shown=0)
         win_cnt[winner] += 1
     # compute the win ratio; a tie counts as 0.5
     win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
             self.pure_mcts_playout_num,
             win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #17
 def policy_evaluate(self, n_games=10, batch=0):
     """
     Evaluate the trained policy by playing games against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn, c_puct=self.c_puct, n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player, pure_mcts_player, start_player=i%2, is_shown=0)
         win_cnt[winner] += 1
     win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1])/n_games
     print("batch_i:{}, num_playouts:{}, win: {}, lose: {}, tie:{}".format(batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     logging.debug("batch_i {} num_playouts {} win {} lose {} tie {}".format(batch, self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #18
def run():
    n = N
    width, height = SIZE, SIZE

    if MCTS_PURE:
        player_2 = MCTS_Pure(c_puct=5, n_playout=PLAYOUT)
        # print ("Benchmarking the following two models:"+MODEL_1+" Pure MCTS")
    elif HUMAN:
        player_2 = Human()
        # print ("Benchmarking the following two models:"+MODEL_1+" Human")
    else:
        pass
        # print ("Benchmarking the following two models:"+MODEL_1+" "+MODEL_2)

    #
    # best_policy_2 = PolicyValueNet(width, height, model_file=MODEL_2)
    # player_2 = MCTSPlayer(best_policy_2.policy_value_fn,
    #                          c_puct=5,
    #                          n_playout=400)  # set larger n_playout for better performance
    # player_1=Human()

    win_ratios = []
    game_batchs = range(50, 1501, 100)
    for game_batch in game_batchs:
        model = './models/iter_' + str(game_batch) + '.model'
        print(model)

        policy = PolicyValueNet(width, height, model_file=model)
        player_1 = MCTSPlayer(
            policy.policy_value_fn, c_puct=5,
            n_playout=400)  # set larger n_playout for better performance
        win_ratio = policy_evaluate(player_1, player_2)
        win_ratios.append(win_ratio)
        print("The win ratio for " + model + " is: ",
              str(100 * win_ratio) + "%")

    print(list(zip(win_ratios, game_batchs)))  # materialize so the pairs are actually printed (Python 3)

    fig, ax = plt.subplots()
    ax.plot(game_batchs, win_ratios)

    ax.set(
        xlabel='iterations',
        ylabel='win ratios',
        title='Win ratio of models trained by 5 input states vs. MCTS player')
    ax.grid()

    fig.savefig("win_ratio.png")
Example #19
def run():
    n = 5
    width, height = 8, 8
    # model_file = 'best_policy_8_8_5.model'
    # model_file = 'best_policy_6_6_4.model'
    model_file = 'current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        # human player, input your move in the format: 2,3
        human = Human()

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # Add FORBIDDEN move player
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3

        # ################ ORIGINAL POLICY and PLAYER ################
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)
        mcts_pure = MCTS_Pure(c_puct=5, n_playout=1000)

        # set start_player=0 for human first

        # game.start_play(human, mcts_player, start_player=1, is_shown=1)

        # ############## IMPLEMENTED PURE RL PLAYER ##############
        adv_player = QPlayer(board)
        # game.start_play(human, adv_player, start_player=1, is_shown=1)
        game.start_play(human, adv_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Example #20
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #21
def run(states, sensible_moves, currentPlayer, lastMove):
    n = 5
    width, height = 8, 8
    board = Board(width=width, height=height, n_in_row=n)
    board.init_board()

    board.states = states
    board.availables = sensible_moves
    board.current_player = currentPlayer
    board.last_move = lastMove

    #best_policy = PolicyValueNetNumpy(width, height, policy_param)
    #mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

    # use pure MCTS only
    mcts_player = MCTS_Pure(c_puct=5, n_playout=4000)  # the n_playout parameter sets the number of simulations per move

    nextmove = mcts_player.get_action(board)

    return nextmove
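A hedged usage sketch for the entry point above; the move encoding (move = row * width + col) and the states mapping of move index to player follow the usual AlphaZero-Gomoku Board layout and are assumptions here:

# player 1 has played (0, 0) -> move 0, player 2 has played (1, 1) -> move 9 on the 8x8 board
states = {0: 1, 9: 2}
sensible_moves = [m for m in range(8 * 8) if m not in states]
next_move = run(states, sensible_moves, currentPlayer=1, lastMove=9)
print('suggested move:', next_move)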
Example #22
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.chess_mcts_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.mcts_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.run_game.play_game(current_mcts_player, pure_mcts_player)
         print(winner)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.mcts_num,
         win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #23
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in tqdm(range(n_games), ascii=True, desc='Policy Evaluate'):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2,
                                       is_shown=self.is_shown)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #24
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the current policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2,
                                       is_shown=self.is_shown)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
     return win_ratio
Example #25
    def play(self):

        model_file = "current.model"
        best_policy = PolicyValueNet(self.width, self.height, model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=300)
        pure_player = MCTS_Pure(c_puct=5, n_playout=300)

        human1 = Human()
        human2 = Human()
        # self.show()

        win_cnt = defaultdict(int)
        for i in range(10):
            winner = self.start_play(mcts_player,
                                     pure_player,
                                     start_player=(i % 2),
                                     is_shown=1)
            win_cnt[winner] += 1
        print("win", win_cnt[1], "lose", win_cnt[2], "tie", win_cnt[0])
Example #26
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_player = self.player
     win_ratios = {}
     for playout_num in self.pure_mcts_playout_num:
         pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=playout_num)
         win_cnt = defaultdict(int)
         for i in range(n_games):
             winner = self.game.start_play(current_player,
                                           pure_mcts_player,
                                           start_player=i % 2,
                                           is_shown=0)
             win_cnt[winner] += 1
         win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
         print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
             playout_num, win_cnt[1], win_cnt[2], win_cnt[-1]))
         win_ratios[str(playout_num)] = win_ratio
     return win_ratios
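Unlike the other variants, this one sweeps several pure-MCTS playout budgets and returns a dict keyed by the stringified playout count. A hedged sketch of reading that result; the trainer object and the ratios shown are placeholders:

ratios = trainer.policy_evaluate(n_games=10)  # e.g. {'1000': 0.8, '3000': 0.55} (illustrative values)
for playout_num, ratio in sorted(ratios.items(), key=lambda kv: int(kv[0])):
    print('vs pure MCTS with {} playouts: {:.0%}'.format(playout_num, ratio))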
Example #27
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     print('4')
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = 0
     for i in range(n_games):
         winner = self.game.start_play(current_mcts_player,
                                       pure_mcts_player,
                                       start_player=i % 2,
                                       is_shown=0)
         if winner == 1:  # count only games actually won by the current policy
             win_cnt += 1
     win_ratio = win_cnt / n_games
     print("num_playouts:{}, win: {}".format(self.pure_mcts_playout_num,
                                             win_cnt))
     return win_ratio
Example #28
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing games against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_playout=self.pure_mcts_playout_num)
     win_cnt = defaultdict(int)
     for i in range(n_games):
         print("train-policy_evaluate: game = %d" % (i))
         winner = start_play(self.board,
                             current_mcts_player,
                             pure_mcts_player,
                             startPlayer=i % 2)
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
     print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
     return win_ratio
Example #29
 def policy_evaluate(self, n_games=10):
     """
     Evaluate the trained policy by playing games against the pure MCTS player
     Note: this is only for monitoring the progress of training
     """
     current_mcts_player = MCTSPlayer(
         self.policy_value_net.policy_value_func,
         c_puct=self.c_puct,
         n_play_out=self.n_play_out)
     pure_mcts_player = MCTS_Pure(c_puct=5,
                                  n_play_out=self.pure_mcts_play_out_number)
     win_cnt = defaultdict(int)
     results = self.pool.map(self.game.start_play,
                             [(current_mcts_player, pure_mcts_player, i)
                              for i in range(n_games)])
     for winner in results:
         win_cnt[winner] += 1
     win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
     print_log("number_play_outs:{}, win: {}, lose: {}, tie:{}".format(
         self.pure_mcts_play_out_number, win_cnt[1], win_cnt[2],
         win_cnt[-1]))
     return win_ratio
Example #30
        def parse_agent(agent_type, filename):
            if agent_type == 'mcts_a0':
                model_file = 'best_policy_8_8_5.model'
                if filename:
                    model_file = filename
                # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

                # best_policy = PolicyValueNet(width, height, model_file = model_file)
                # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

                # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
                try:
                    policy_param = pickle.load(open(model_file, 'rb'))
                except:
                    policy_param = pickle.load(
                        open(model_file,
                             'rb'), encoding='bytes')  # To support python3
                best_policy = PolicyValueNetNumpy(width, height, policy_param)
                player = MCTSPlayer(
                    best_policy.policy_value_fn, c_puct=5, n_playout=400
                )  # set larger n_playout for better performance
            elif agent_type == 'mcts_pure':
                player = MCTS_Pure(c_puct=5, n_playout=1000)
            elif agent_type == 'minmax':
                player = Minimax()
            elif agent_type == 'dqn':
                model_file = 'output/v_1/epoch_100/agent_2.pkl'
                if filename:
                    model_file = filename
                player = DQNPlayer(model_file)
            elif agent_type == 'human':
                player = Human()
            else:
                player = Human()
                print('Illegal Agent Type. Defaulting to human player.')
            return player
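A hedged usage sketch of how the enclosing code might call parse_agent; the width/height values, the Board/Game setup, and the start_play call are assumptions modeled on the earlier examples, since this snippet only shows the agent factory itself:

        # inside the enclosing setup code (sketch; board and game assumed as in the examples above)
        width, height = 8, 8
        board = Board(width=width, height=height, n_in_row=5)
        game = Game(board)
        player_1 = parse_agent('mcts_a0', None)    # AlphaZero-style MCTS player from the bundled model
        player_2 = parse_agent('mcts_pure', None)  # pure MCTS baseline
        game.start_play(player_1, player_2, start_player=0, is_shown=1)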