Code example #1
    def __init__(self,
                 model_name,
                 loss_function,
                 forbidden_hands,
                 init_model=None):
        # params of the board and the game
        self.board_width = 9
        self.board_height = 9
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row,
                           forbidden_hands=forbidden_hands)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 1000  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 3000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        self.model_name = model_name
        if init_model:
            # start training from an initial policy-value net
            if self.model_name == 'baseline':
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       loss_function,
                                                       model_file=init_model)
            else:
                self.policy_value_net = PolicyValueNetRes30(
                    self.board_width,
                    self.board_height,
                    loss_function,
                    model_file=init_model)
        else:
            # start training from a new policy-value net
            if self.model_name == 'baseline':
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       loss_function)
            else:
                self.policy_value_net = PolicyValueNetRes30(
                    self.board_width, self.board_height, loss_function)

        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Code example #2
File: human_play.py  Project: xzabg/AlphaZero_Gobang
def run():
    n = 5
    width, height = 15, 15
    model_file = 'models/resnet/output318/current_policy.model'
    model_file1 = 'models/resnet/output196/current_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        best_policy = PolicyValueNet(width,
                                     height,
                                     model_file=model_file,
                                     output='output/')
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=1,
                                 n_playout=1500)

        tf.reset_default_graph()

        best_policy1 = PolicyValueNet(width,
                                      height,
                                      model_file=model_file1,
                                      output='output/')
        mcts_player1 = MCTSPlayer(best_policy1.policy_value_fn,
                                  c_puct=1,
                                  n_playout=1500)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        '''try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNetNumpy(width, height, policy_param)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # set larger n_playout for better performance
        '''
        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # human = MCTS_Pure(c_puct=5, n_playout=5000)

        # human player, input your move in the format: 2,3
        human = Human()  # MCTS_Pure(c_puct=5, n_playout=1000)  #

        # set start_player=0 for human first
        game.start_play(mcts_player1, mcts_player, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
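The tf.reset_default_graph() call above swaps in a fresh default graph so that the second PolicyValueNet's variables do not collide with the first one's. A minimal alternative sketch, assuming the PolicyValueNet variant that accepts a graph argument (as used in code example #24 below) and reusing the names from this example, keeps each model in its own explicit graph instead:

import tensorflow as tf

g1, g2 = tf.Graph(), tf.Graph()
with g1.as_default():
    # each net builds its variables into its own graph, so no global reset is needed
    best_policy = PolicyValueNet(width, height, model_file=model_file,
                                 graph=g1, output='output/')
with g2.as_default():
    best_policy1 = PolicyValueNet(width, height, model_file=model_file1,
                                  graph=g2, output='output/')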
Code example #3
class SelfPlayer(Process):
    def __init__(self, config, sample_queue, model_queue):
        super(SelfPlayer, self).__init__()

        self.config = config
        self.temp = config['temperature']

        self.sample_queue = sample_queue
        self.model_queue = model_queue

        self.board = Board(width=config['board_width'],
                           height=config['board_height'],
                           n_in_row=config['n_in_row'])
        self.game = Game(self.board)

    def collect_selfplay_data(self, n_games=1):
        """collect self-play data for training"""
        samples = []
        for i in range(n_games):
            _, play_data = self.game.start_self_play(self.mcts_player,
                                                     temp=self.temp)
            samples.extend(list(play_data)[:])
        return samples

    def run(self):

        self.policy_value_net = PolicyValueNet(
            self.config['board_width'],
            self.config['board_height'],
            model_file=self.config['init_model'])
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.config['c_puct'],
                                      n_playout=self.config['n_playout'],
                                      is_selfplay=1)

        print("running")
        while True:
            # always use the latest weight
            weights = None
            while not self.model_queue.empty():
                weights = self.model_queue.get()
            if weights:
                self.policy_value_net.set_weight(weights)

            # sample
            samples = self.collect_selfplay_data()
            # put the new sample to sample queue
            self.sample_queue.put(samples)
Code example #4
    def __init__(self, init_model=None, is_remote=False):
        # params of the board and the game
        self.board_width = 8
        self.board_height = 8
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.n_playout_self_play = 1000
        self.c_puct = 5
        self.buffer_size = 2000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 100
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.td_step = 2
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if is_remote:
            self.path = '/content/drive/My Drive/'
        else:
            self.path = './'

        if init_model:
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=self.path +
                                                   init_model)
        else:
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout_self_play,
                                      is_selfplay=1)
Code example #5
 def collect_selfplay_data_thread(self, thread_id, shared_queue, net_lock, data_lock):
     logging.info('selfplay process start: {}'.format(thread_id))
     os.environ["CUDA_VISIBLE_DEVICES"] = str(thread_id % 6 + 2)
     from policy_value_net_tensorflow import PolicyValueNet
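     # NOTE: current_model_name is not defined in this snippet; it is assumed to be a module-level name in the source project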
     # load the model file (under the lock)
     with net_lock:
         current_policy = PolicyValueNet(self.board_width, self.board_height, model_file=current_model_name)
     local_board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     local_game = Game(local_board)
     local_mcts_player = MCTSPlayer(current_policy.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
     winner, play_data = local_game.start_self_play(local_mcts_player,
                                                    temp=self.temp)
     play_data = list(play_data)
     play_data = self.get_equi_data(play_data)
     # append the game data (under the lock)
     with data_lock:
         shared_queue.extend(play_data)
         while len(shared_queue) > self.buffer_num:
             shared_queue.pop(0)
     logging.info('selfplay process finished: {}'.format(thread_id))
Code example #6
File: human_play.py  Project: hyyh28/Jordan
def run():
    n = 5
    width, height = 9, 9
    model_file = 'best_policy_9_9_5.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        ################ human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        try:
            policy_param = pickle.load(open(model_file, 'rb'))
        except:
            policy_param = pickle.load(open(model_file, 'rb'),
                                       encoding='bytes')  # To support python3
        best_policy = PolicyValueNet(width, height, policy_param)
        mcts_player = MCTSPlayer(
            best_policy.policy_value_fn, c_puct=5,
            n_playout=1000)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=1, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Code example #7
File: yixin_play.py  Project: xzabg/AlphaZero_Gobang
 def __init__(self, startplayer=0):
     model_file = 'models/resnet/output318/current_policy.model'
     policy_param = None
     self.height = 15
     self.width = 15
     '''if model_file is not None:
         print('loading...', model_file)
         try:
             policy_param = pickle.load(open(model_file, 'rb'))
         except:
             policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')'''
     policy_value_net = PolicyValueNet(self.height,
                                       self.width,
                                       model_file=model_file,
                                       output='output/')
     self.mcts_player = MCTSPlayer(policy_value_net.policy_value_fn,
                                   c_puct=1,
                                   n_playout=1000)
     self.board = Board(width=self.width, height=self.height, n_in_row=5)
     self.board.init_board(startplayer)
     self.game = Game(self.board)
     p1, p2 = self.board.players
     print('players:', p1, p2)
     self.mcts_player.set_player_ind(p1)
     pass
Code example #8
File: train.py  Project: zxcqwe4906/AlphaZero_Gomoku
    def __init__(self, init_model):
        self.init_model = init_model
        # params of the board and the game
        self.board_width = 6
        self.board_height = 6
        self.n_in_row = 4
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.game = Game(self.board)
        # training params
        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
        self.temp = 1.0  # the temperature param
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512  # mini-batch size for training
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5  # num of train_steps for each update
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1000
        self.best_win_ratio = 0.0
        # num of simulations used for the pure mcts, which is used as
        # the opponent to evaluate the trained policy
        self.pure_mcts_playout_num = 1000
        if os.path.isdir(init_model):
            self.is_init = True
            # start training from an initial policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.is_init = False
            os.system('mkdir ' + init_model)
            # start training from a new policy-value net
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)

        if not os.path.isdir(init_model + 'best'):
            os.system('mkdir ' + init_model + 'best')
Code example #9
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 7
     self.board_height = 7
     self.n_in_row = 5
     self.board = Board(width=self.board_width,
                        height=self.board_height,
                        n_in_row=self.n_in_row)
     self.game = Game(self.board)
     # training params
     self.learn_rate = 2e-3
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 1500  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 150000
     self.batch_size = 2048  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     if os.path.exists("data_buffer.pkl"):
         with open("data_buffer.pkl", "rb") as f:
             self.data_buffer = pickle.load(f)
             print("Load data, size = %d" % len(self.data_buffer))
     self.play_batch_size = 1
     self.epochs = 10  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 1500
     self.save_freq = 500
     self.game_batch_num = 10000
     self.best_win_ratio = 0.0
     self.episode_len = 0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
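The constructor above only restores data_buffer.pkl; the matching save step is not shown in this snippet. A minimal sketch of such a helper (the method name save_data_buffer is hypothetical; it uses only the standard pickle API, which can serialize a deque directly) that could be called every self.save_freq game batches:

import pickle

def save_data_buffer(self, path="data_buffer.pkl"):
    # persist the accumulated self-play samples so training can resume after a restart
    with open(path, "wb") as f:
        pickle.dump(self.data_buffer, f)
    print("Saved data, size = %d" % len(self.data_buffer))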
Code example #10
File: game.py  Project: liyaozong1991/Gomoku-AI
 def get_net_player_next_action(
     player,
     i,
     shared_board_states,
     shared_board_availables,
     shared_board_last_move,
     shared_board_current_player,
     game_continue,
     winner,
     play_lock,
     net_lock,
 ):
     from policy_value_net_tensorflow import PolicyValueNet
     from mcts_alphaZero import MCTSPlayer
     local_board = Board(width=self.board.width,
                         height=self.board.height,
                         n_in_row=self.board.n_in_row)
     local_board.init_board(start_player)
     with net_lock:
         policy = PolicyValueNet(local_board.width,
                                 local_board.height,
                                 model_file=player)
     mcts_player = MCTSPlayer(policy.policy_value_fn,
                              c_puct=5,
                              n_playout=400,
                              is_selfplay=0)
     while game_continue.value == 1:
         if shared_board_current_player.value == i:
             with play_lock:
                 # the shared board state must be synchronized by hand (tedious)
                 for k, v in shared_board_states.items():
                     local_board.states[k] = v
                 local_board.availables = []
                 for availables in shared_board_availables:
                     local_board.availables.append(availables)
                 local_board.last_move = shared_board_last_move.value
                 local_board.current_player = shared_board_current_player.value
                 # synchronization finished
                 move = mcts_player.get_action(local_board)
                 local_board.do_move(move)
                 #print('player {} do move {}'.format(i, move))
                 if is_shown:
                     self.graphic(local_board)
                 end, win = local_board.game_end()
                 if end:
                     if win != -1:
                         print("Game end. Winner is", win)
                     else:
                         print("Game end. Tie")
                     game_continue.value = 0
                     winner.value = win
                 # continue synchronizing (write the move back to the shared state)
                 shared_board_states[
                     move] = shared_board_current_player.value
                 shared_board_availables.remove(move)
                 shared_board_last_move.value = move
                 shared_board_current_player.value = 1 - shared_board_current_player.value
         time.sleep(0.2)
Code example #11
 def init_tensorflow_net(self, model_file=None):
     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     logging.info('init tf net')
     from policy_value_net_tensorflow import PolicyValueNet
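     # NOTE: model_dir is not defined in this snippet; it is assumed to come from elsewhere in the source project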
     policy_value_net = PolicyValueNet(self.board_width,
                                       self.board_height,
                                       model_dir,
                                       model_file=model_file)
     logging.info('init tf net finished')
Code example #12
File: train.py  Project: QilongPan/flippingChess
 def __init__(self, init_model=None):
     # params of the board and the game
     self.board_width = 5
     self.board_height = 6
     self.board = Board()
     self.game = Game(self.board)
     # training params
     # learning rate: 0.002
     self.learn_rate = 2e-3
     # adjust the learning rate automatically: KL divergence measures how close two probability distributions are; within a given range, the parameters that minimize the KL divergence are the optimal ones we want
     self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
     self.temp = 1.0  # the temperature param
     self.n_playout = 400  # num of simulations for each move
     self.c_puct = 5
     self.buffer_size = 10000
     self.batch_size = 512  # mini-batch size for training
     self.data_buffer = deque(maxlen=self.buffer_size)
     self.play_batch_size = 1
     self.epochs = 5  # num of train_steps for each update
     self.kl_targ = 0.02
     self.check_freq = 50
     self.game_batch_num = 50  # reduced from the original 1500
     self.best_win_ratio = 0.0
     # num of simulations used for the pure mcts, which is used as
     # the opponent to evaluate the trained policy
     self.pure_mcts_playout_num = 1000
     if init_model:
         # start training from an initial policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height,
                                                model_file=init_model)
     else:
         # start training from a new policy-value net
         self.policy_value_net = PolicyValueNet(self.board_width,
                                                self.board_height)
     self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                   c_puct=self.c_puct,
                                   n_playout=self.n_playout,
                                   is_selfplay=1)
Code example #13
File: human_play.py  Project: QilongPan/flippingChess
def run():
    model_file = './current_policy.model'
    width = 5
    height = 6

    board = Board()
    game = Game(board)
    best_policy = PolicyValueNet(width, height, model_file)
    mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                             c_puct=5,
                             n_playout=400)
    human = Human()
    game.start_play(human, mcts_player, start_player=1, is_shown=1)
Code example #14
 def update_net(self, shared_queue, net_lock, data_lock,
                stop_update_process):
     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     from policy_value_net_tensorflow import PolicyValueNet
     logging.info('update process start')
     # read and write the model files
     current_policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height, model_dir)
     current_policy_value_net.save_model(current_model_name)
     i = 0
     best_win_ratio = 0
     pure_mcts_playout_num = 1000
     get_enough_train_data = False
     while stop_update_process.value == 0:
         time.sleep(1)
         if get_enough_train_data:
             i += 1
             logging.info('update process start {} th self train'.format(i))
             self.policy_update(current_policy_value_net, shared_queue,
                                net_lock, data_lock, i)
             logging.info('update process end {} th self train'.format(i))
             # update the latest model file here
             if (i + 1) % self.update_freq == 0:
                 logging.info('update process ask net lock')
                 with net_lock:
                     logging.info('update process get net lock')
                     current_policy_value_net.save_model(current_model_name)
                 logging.info('update process release net lock')
             # play against pure MCTS here, check the win ratio, and update the best model file
             if (i + 1) % self.check_freq == 0:
                 logging.info("Game {}: AlphagZero VS PURE MCTS".format(i +
                                                                        1))
                 win_ratio = self.policy_evaluate(pure_mcts_playout_num,
                                                  current_policy_value_net)
                 if win_ratio >= best_win_ratio:
                     logging.info("update process New best policy!!!!!!!!")
                     best_win_ratio = win_ratio
                     # update the best_policy
                     current_policy_value_net.save_model(best_model_name)
                     if (best_win_ratio == 1.0
                             and pure_mcts_playout_num < 5000):
                         pure_mcts_playout_num += 1000
                         best_win_ratio = 0.0
         else:
             with data_lock:
                 get_enough_train_data = len(
                     shared_queue) >= self.batch_size
     logging.info('update process finished')
Code example #15
    def run(self):
        self.policy_value_net = PolicyValueNet(
            self.config['board_width'],
            self.config['board_height'],
            model_file=self.config['init_model'])

        while True:
            weight = self.queue.get()
            self.policy_value_net.set_weight(weight)
            win_ratio = self.policy_evaluate()
            self.policy_value_net.save_model(
                self.config['current_policy_name'])

            if win_ratio > self.best_win_ratio:
                print("New best policy!!!!!!!!")
                self.best_win_ratio = win_ratio
                # update the best_policy
                self.policy_value_net.save_model(
                    self.config['best_policy_name'])
                if (self.best_win_ratio == 1.0
                        and self.pure_mcts_playout_num < 10000):
                    self.pure_mcts_playout_num += 1000
                    self.best_win_ratio = 0.0
Code example #16
    def __init__(self, config):
        self.config = config
        self.lr_multiplier = self.config['lr_multiplier']

        self.data_buffer = deque(maxlen=config['buffer_size'])

        # sample queue: Self player will put the samples to this queue
        # model queue: Train process will put the update model to this queue
        self.sample_queue = Queue()
        self.model_queues = []
        self.self_players = []
        self.evaluator_queue = Queue()
        self.evaluator = Evaluator(self.config, self.evaluator_queue)

        for _ in range(self.config['selfplayer_num']):
            model_queue = Queue()
            self.model_queues.append(model_queue)
            self.self_players.append(
                SelfPlayer(config, self.sample_queue, model_queue))

        self.policy_value_net = PolicyValueNet(config['board_width'],
                                               config['board_height'],
                                               model_file=config['init_model'])
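For context, a rough sketch of how this coordinating class might be launched (the class name Coordinator is an assumption, and the config keys are collected from this example together with code examples #3 and #15; start() is the standard multiprocessing.Process launch call):

if __name__ == '__main__':
    config = {
        'board_width': 8, 'board_height': 8, 'n_in_row': 5,
        'temperature': 1.0, 'c_puct': 5, 'n_playout': 400,
        'buffer_size': 10000, 'lr_multiplier': 1.0,
        'selfplayer_num': 2,
        'init_model': None,  # or a path to an existing checkpoint, if the net requires one
        'current_policy_name': 'current_policy.model',
        'best_policy_name': 'best_policy.model',
    }
    coordinator = Coordinator(config)  # hypothetical name for the class defined above
    for player in coordinator.self_players:
        player.start()  # each SelfPlayer is a multiprocessing.Process
    coordinator.evaluator.start()  # assuming Evaluator is also a Process, as its run() method suggests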
Code example #17
    def __init__(self, init_model=None, is_shown=0):

        self.board_width = 15
        self.board_height = 15
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row)
        self.is_shown = is_shown
        self.game = Game_UI(self.board, is_shown)

        self.learn_rate = 2e-3
        self.lr_multiplier = 1.0
        self.temp = 1.0
        self.n_playout = 400
        self.c_puct = 5
        self.buffer_size = 10000
        self.batch_size = 512
        self.data_buffer = deque(maxlen=self.buffer_size)
        self.play_batch_size = 1
        self.epochs = 5
        self.kl_targ = 0.02
        self.check_freq = 50
        self.game_batch_num = 1500
        self.best_win_ratio = 0.0
        self.pure_mcts_playout_num = 1000
        if init_model:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   model_file=init_model)
        else:
            self.policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
Code example #18
def run():
    model_file = './current_policy.model'
    best_policy = PolicyValueNet(6, 6, model_file)
    config = GameConfig()
    board = Board(config)
    game = Game(board)
    mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                              c_puct=5,
                              n_playout=1000)
    mcts_player2 = MCTS_Pure(c_puct=5, n_playout=1000)
    mcts_player3 = MCTS_Pure(c_puct=5, n_playout=1000)
    human = Human(config)
    human2 = Human(config)
    human3 = Human(config)
    game.start_play(mcts_player3, human, mcts_player2)
Code example #19
    def run(self):

        self.policy_value_net = PolicyValueNet(
            self.config['board_width'],
            self.config['board_height'],
            model_file=self.config['init_model'])
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.config['c_puct'],
                                      n_playout=self.config['n_playout'],
                                      is_selfplay=1)

        print("running")
        while True:
            # always use the latest weight
            weights = None
            while not self.model_queue.empty():
                weights = self.model_queue.get()
            if weights:
                self.policy_value_net.set_weight(weights)

            # sample
            samples = self.collect_selfplay_data()
            # put the new sample to sample queue
            self.sample_queue.put(samples)
Code example #20
def run():
    # n = 5
    width, height = 5, 5  # for pure_mcts
    board = Board(width=width, height=height)
    game = Game(board)
    width, height = real_dim(width, height)
    # model_file = 'best_policy_8_8_5.model'
    try:
        ## board = Board(width=width, height=height, n_in_row=n)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow
        model_file = './tmp/best_policy.model'
        best_policy = PolicyValueNet(width, height, model_file=model_file)

        mcts_player1 = MCTSPlayer(best_policy.policy_value_fn,
                                  c_puct=5,
                                  n_playout=500)

        mcts_player2 = MCTSPlayer(best_policy.policy_value_fn,
                                  c_puct=5,
                                  n_playout=500)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy
        # try:
        #     policy_param = pickle.load(open(model_file, 'rb'))
        # except:
        #     policy_param = pickle.load(open(model_file, 'rb'),
        #                                encoding='bytes')  # To support python3
        # best_policy = PolicyValueNetNumpy(width, height, policy_param)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn,
        #                          c_puct=5,
        #                          n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player1 = MCTS_Pure(c_puct=5, n_playout=500)
        # mcts_player2 = MCTS_Pure(c_puct=5, n_playout=500)
        # human player, input your move in the format: 2,3
        # human = Human()

        # set start_player=0 for human first
        # game.start_play(mcts_player1, mcts_player2, start_player=1, is_shown=0)
        # game.start_play(human, mcts_player2, start_player=1, is_shown=0)
        game.start_play(mcts_player1, mcts_player2, start_player=1, is_shown=0)
    except KeyboardInterrupt:
        print('\n\rquit')
Code example #21
 def update_net_thread(self, shared_queue, net_lock, data_lock,
                       stop_update_process, update_best_model):
     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     from policy_value_net_tensorflow import PolicyValueNet
     logging.info('update process start')
     # read and write the model files
     current_policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height, model_dir)
     current_policy_value_net.save_model(current_model_name)
     current_policy_value_net.save_model(best_model_name)
     best_win_ratio = 0
     get_enough_train_data = False
     global_update_step = 0
     lr_multiplier = 1.0
     while stop_update_process.value == 0:
         time.sleep(1)
         if get_enough_train_data:
             global_update_step += 1
             logging.info('update process start {} th self train'.format(
                 global_update_step))
             lr_multiplier = self.policy_update(current_policy_value_net,
                                                shared_queue, net_lock,
                                                data_lock,
                                                global_update_step,
                                                lr_multiplier)
             logging.info('update process end {} th self train'.format(
                 global_update_step))
             # update the latest model file here
             logging.info('update process ask net lock')
             with net_lock:
                 logging.info('update process get net lock')
                 current_policy_value_net.save_model(current_model_name)
             logging.info('update process release net lock')
             if (global_update_step + 1) % self.update_freq == 0:
                 update_best_model.value = 1
         else:
             with data_lock:
                 get_enough_train_data = len(
                     shared_queue) >= self.batch_size
     logging.info('update process finished')
Code example #22
def run():
    n = 5
    width, height = 15, 15
    model_file = 'dist/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game_UI(board, is_shown=1)

        # ############### Human-machine ###################
        best_policy = PolicyValueNet(width, height, model_file=model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)

        human = Human()

        game.start_play_mouse(human, mcts_player, start_player=0, is_shown=1)

    except KeyboardInterrupt:
        print('\n\rquit')
Code example #23
 def update_net(self, shared_queue, net_lock, update_best_model, global_update_step, lr_multiplier, stop_update_process, update_or_selfplay):
     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
     from policy_value_net_tensorflow import PolicyValueNet
     current_policy_value_net = PolicyValueNet(self.board_width, self.board_height, model_dir)
     current_policy_value_net.save_model(current_model_name)
     current_policy_value_net.save_model(best_model_name)
     while global_update_step.value <= self.game_batch_num:
         if update_or_selfplay.value == 0:
             if len(shared_queue) >= self.batch_size:
                 for _ in range(self.epochs):
                     global_update_step.value += 1
                     logging.info('update current model process start self train: {}'.format(global_update_step.value))
                     self.policy_update(current_policy_value_net, shared_queue, net_lock, global_update_step, lr_multiplier)
                     if (global_update_step.value) % self.check_freq == 0:
                         update_best_model.value = 1
                 # update the latest model file here
                 with net_lock:
                     logging.info('update process update current model')
                     current_policy_value_net.save_model(current_model_name)
             update_or_selfplay.value = 1
         else:
             time.sleep(1)
     stop_update_process.value = 1
Code example #24
File: train.py  Project: xzabg/AlphaZero_Gobang
    def policy_evaluate(self, n_games=10):
        """
        Evaluate the trained policy by playing against the pure MCTS player
        Note: this is only for monitoring the progress of training
        """
        print("evaluating...")
        current_mcts_player = MCTSPlayer(self.policy_value_net_train.policy_value_fn,
                                         c_puct=self.c_puct,
                                         n_playout=self.pure_mcts_playout_num)
        best_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct,
                                      n_playout=self.pure_mcts_playout_num)

        win_cnt = defaultdict(int)
        for i in range(n_games):
            winner = self.game.start_play(current_mcts_player,
                                          best_mcts_player,
                                          start_player=i % 2,
                                          is_shown=0)
            win_cnt[winner] += 1
        win_ratio = 1.0*(win_cnt[1] + 0.5*win_cnt[-1]) / n_games
        print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
                self.pure_mcts_playout_num,
                win_cnt[1], win_cnt[2], win_cnt[-1]))

        # save the current_model
        self.policy_value_net_train.save_model('/data/output/current_policy.model')
        if win_ratio > self.best_win_ratio:
            print("New best policy!!!!!!!!")
            # update the best_policy
            self.policy_value_net_train.save_model('/data/output/best_policy.model')
            self.g1 = tf.Graph()
            with self.g1.as_default():
                self.policy_value_net = PolicyValueNet(self.board_width,
                                                       self.board_height,
                                                       model_file='/data/output/best_policy.model',
                                                       graph=self.g1,
                                                       output='/data/data/')

        return win_ratio
Code example #25
    def __init__(self, current_model, baseline_model):
        # params of the board and the game
        self.board_width = 9
        self.board_height = 9
        self.n_in_row = 5
        self.board = Board(width=self.board_width,
                           height=self.board_height,
                           n_in_row=self.n_in_row,
                           forbidden_hands=True)
        self.game = Game(self.board)
        self.n_playout = 400  # num of simulations for each move
        self.c_puct = 5        
        
        self.baseline_policy_value_net = PolicyValueNet(self.board_width,
                                                   self.board_height,
                                                   'l+', 
                                                   model_file=baseline_model)

        self.current_policy_value_net = PolicyValueNetRes30(self.board_width,
                                                  self.board_height,
                                                  'l+', 
                                                   model_file=current_model)
Code example #26
 def local_thread_func(thread_id, shared_queue, net_lock, data_lock):
     from policy_value_net_tensorflow import PolicyValueNet
     # load the model file (under the lock)
     logging.info("selfplay process {} ask net lock".format(thread_id))
     with net_lock:
         logging.info(
             'selfplay process {} get net lock'.format(thread_id))
         current_policy = PolicyValueNet(self.board_width,
                                         self.board_height,
                                         model_dir,
                                         model_file=current_model_name)
     logging.info(
         'selfplay process {} release net lock'.format(thread_id))
     local_board = Board(width=self.board_width,
                         height=self.board_height,
                         n_in_row=self.n_in_row)
     local_game = Game(local_board)
     local_mcts_player = MCTSPlayer(current_policy.policy_value_fn,
                                    c_puct=self.c_puct,
                                    n_playout=self.n_playout,
                                    is_selfplay=1)
     logging.info("selfplay process {} start {}th selfplay".format(
         thread_id, index))
     winner, play_data = local_game.start_self_play(local_mcts_player,
                                                    temp=self.temp)
     logging.info("selfplay process {} finish {}th selfplay".format(
         thread_id, index))
     play_data = list(play_data)
     play_data = self.get_equi_data(play_data)
     # append the game data (under the lock)
     logging.info('selfplay process {} ask data lock'.format(thread_id))
     with data_lock:
         logging.info(
             'selfplay process {} get data lock'.format(thread_id))
         shared_queue.extend(play_data)
         while len(shared_queue) > self.buffer_num:
             shared_queue.pop(0)
     logging.info(
         'selfplay process {} release data lock'.format(thread_id))
Code example #27
def run():
    n = 5
    # width, height = 8, 8
    width, height = 16, 16
    # model_file = 'best_policy_8_8_5.model'
    model_file = './tfData/best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        # best_policy = PolicyValueNet(width, height, model_file = model_file)
        # mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=400)

        # load the provided model (trained in Theano/Lasagne) into a MCTS player written in pure numpy

        # get the policy (value network)
        best_policy = PolicyValueNet(width, height, model_file)
        # get the policy function
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=400)  # set larger n_playout for better performance

        # uncomment the following line to play with pure MCTS (it's much weaker even with a larger n_playout)
        # mcts_player = MCTS_Pure(c_puct=5, n_playout=1000)

        # human player, input your move in the format: 2,3
        human = Human()

        # set start_player=0 for human first
        winner, play_data = game.start_play(human, mcts_player, start_player=1, is_shown=1)
        play_data = list(play_data)[:]
        play_data = get_equi_data(play_data, height, width)
        backupSave(play_data, "human")

    except KeyboardInterrupt:
        print('\n\rquit')
Code example #28
    def __init__(self):
        self.board = chessBoard()
        self.game_print = StringVar()
        self.game_print.set("")
        # 2D list pre-filled with 2s so indexing never goes out of range
        self.db = [([2] * 9) for i in range(9)]
        # move-order list, used for taking back moves
        self.order = []
        # stone color
        self.color_count = 0
        self.color = 'black'
        # win/clear flags: 1 means already won / already cleared
        self.flag_win = 1
        self.flag_empty = 1

        self.start_player = 0
        width, height, n_in_row = 9, 9, 5
        model_file = 'output/best_policy.model'
        baseline_file = 'output/baseline_policy.model'
        board = Board(width=width, height=height, n_in_row=n_in_row, forbidden_hands=False)
        self.game = Game(board)
        self.game.board.init_board(self.start_player)
        self.best_policy = PolicyValueNetRes30(width, height, 'l+', model_file=model_file)
        self.baseline_policy = PolicyValueNet(width, height, 'l+', model_file=baseline_file)
        self.mcts_player = MCTSPlayer(self.best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=500)  # set larger n_playout for better performance
        self.mcts_baseline_player = MCTSPlayer(self.baseline_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=500)  # set larger n_playout for better performance
        self.human_player = Human()
        self.human_player.set_player_ind(1)
        #self.mcts_baseline_player.set_player_ind(1)
        self.mcts_player.set_player_ind(2)
        self.players = {1:self.human_player, 2:self.mcts_player}
        #self.players = {1:self.mcts_baseline_player, 2:self.mcts_player}

        self.options()
Code example #29
def run():
    n = 5
    width, height = 8, 8
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)

        # ############### human VS AI ###################
        # load the trained policy_value_net in either Theano/Lasagne, PyTorch or TensorFlow

        best_policy = PolicyValueNet(width, height, model_file=model_file)
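        # NOTE: model_file is not defined in this snippet; it is assumed to come from elsewhere in the source module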
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=500)

        # human player, input your move in the format: 2,3
        human1 = Human()

        # set start_player=0 for human first
        game.start_play(human1, mcts_player, start_player=1, is_shown=1)
        # game.start_play(human1, human2, start_player=0, is_shown=1)
    except KeyboardInterrupt:
        print('\n\rquit')
Code example #30
File: human_play.py  Project: Dancy679/-
def run():
    n = 5
    width, height = 10, 10
    model_file = 'best_policy.model'
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        graphic = Graphic()
        # graphic.run()
        print(1111)
        # thread1 = threading.Thread(target=graphic.run, args=())
        best_policy = PolicyValueNet(width,
                                     height,
                                     model_file='./model/' + model_file)
        mcts_player = MCTSPlayer(best_policy.policy_value_fn,
                                 c_puct=5,
                                 n_playout=1000)
        print("hhh")
        human = Human(graphic)
        # set start_player=0 for human first
        thread2 = threading.Thread(target=game.start_play,
                                   args=(human, mcts_player, graphic, 1, 1))
        # game.start_play(human, mcts_player, graphic, start_player=0, is_shown=1)
        # thread1.setDaemon(True)
        # thread1.start()
        thread2.setDaemon(True)
        thread2.start()
        graphic.run()
        # thread1.join()
        # thread2.join()
        # game.start_play(human, mcts_player, graphic, start_player=0, is_shown=1)

        # thread.start_new_thread(game.start_play, (human, mcts_player, graphic, 0, 1))
        # thread.start_new_thread(graphic.run, ())
    except KeyboardInterrupt:
        print('\n\rquit')