Example 1
    def play(self, board):
        if self.stone_val == -1:
            board_self = rule.flip_board(board)
        else:
            board_self = board.copy()

        def _play():
            action, q, opp_q = self.play_process.predict(
                board_self, self.stone_val)
            logger.info('recv: action:%s', action)
            if action is None:
                logger.info('_play thread stop...')
                return
            from_, act = action
            to_ = tuple(np.add(from_, rule.actions_move[act]))
            q_table = np.zeros((5, 5, 4))
            for (f, a), q_ in q:
                q_table[f][a] = q_
            if self.stone_val == -1:
                from_ = rule.flip_location(from_)
                to_ = rule.flip_location(to_)
                q_table = rule.flip_action_probs(q_table)
            # self.play_func(board, self.stone_val, from_, to_, q_table)
            opp_q_table = np.zeros((5, 5, 4))
            for (f, a), q_ in opp_q:
                opp_q_table[f][a] = q_
            self.play_func(self.stone_val,
                           from_,
                           to_,
                           q_table,
                           opp_q=opp_q_table)

        Thread(target=_play).start()
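
Note: every example funnels the position through the same flip helpers so the model only ever evaluates boards from player +1's point of view (the loops in Examples 3, 7, and 13 flip the board each turn for the same reason). The rule module itself is not shown anywhere in these snippets; the following is only a plausible sketch of those helpers, assuming a 5x5 board, stones encoded as +1/-1, and a hypothetical up/down/left/right action ordering:

    import numpy as np

    # Hypothetical action ordering: up, down, left, right
    actions_move = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    def flip_board(board):
        # Mirror the rows and swap stone ownership, so player -1
        # sees itself as +1
        return -board[::-1]

    def flip_location(loc):
        # Mirror a (row, col) coordinate on the 5x5 board
        return (4 - loc[0], loc[1])

    def flip_action_probs(table):
        # Mirror a (5, 5, 4) table and swap the up/down action planes
        flipped = table[::-1].copy()
        flipped[:, :, [0, 1]] = flipped[:, :, [1, 0]]
        return flipped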
Example 2
 def __init__(self, policy_model, value_model, init_board, first_player,
              player):
     PlayProcess.__init__(self, model_fuc=None)
     self.policy_model = policy_model
     self.value_model = value_model
     self.first_player = first_player
     self.player = player
     if first_player == -1:
         init_board = rule.flip_board(init_board)
     self.init_board = init_board
Example 3
 def simulate(self, ts, board, player):
     from record import Record
     from value_network import NoActionException
     records = Record()
     while True:
         try:
             bd = board.copy()
             board_str = util.board_str(board)
             valid_action = rule.valid_actions(board, player)
             while True:
                 (from_,
                  act), q = self.epsilon_greedy(board, player, valid_action,
                                                ts)
                 if (board_str, from_, act) not in self.predicts or len(
                         ts.root.sub_edge) == 1:
                     break
                 ts.root.sub_edge = [
                     e for e in ts.root.sub_edge if e.a != (from_, act)
                 ]
                 valid_action.remove((from_, act))
             assert board[from_] == player
             ts.move_down(board, player, action=(from_, act))
             if self.episode % 10 == 0:
                 logger.info('action:%s,%s', from_, act)
                 logger.info('q is %s', q)
             to_ = tuple(np.add(from_, rule.actions_move[act]))
             command, eat = rule.move(board, from_, to_)
             records.add3(bd, from_, act, len(eat), win=command == rule.WIN)
         except NoActionException:
              # After a random initial position, one side has no legal move
             return Record(), 0
         except Exception as ex:
             logging.warning('board is:\n%s', board)
             logging.warning('player is: %s', player)
             valid = rule.valid_actions(board, player)
             logging.warning('valid is:\n%s', valid)
             logging.warning('from_:%s, act:%s', from_, act)
             ts.show_info()
             records.save('records/train/1st_')
             raise ex
         if command == rule.WIN:
             logging.info('%s WIN, step use: %s, epsilon:%s', str(player),
                          records.length(), self.epsilon)
             return records, player
         if records.length() > 10000:
              logging.info('too many moves: %s', records.length())
             return Record(), 0
         player = -player
         board = rule.flip_board(board)
Example 4
 def __init__(self, upper_node, a, v, p, lambda_):
     self.upper_node = upper_node
     self.a = a
     self.v = v
     self.l = lambda_
     self.p = p
     self.n = 0
     self.w = 0
     board, player = upper_node.board.copy(), upper_node.player
     result, _ = rule.move_by_action(board, *a)
     self.down_node = Node(board=rule.flip_board(board),
                           player=-player,
                           tree=upper_node.tree,
                           level=upper_node.level + 1,
                           parent_edge=self,
                           final=result == rule.WIN)
Example 5
 def opponent_play(self, board, from_, to_):
     """
     对手走的棋
     :param board:   对手走棋之前的局面
     :param from_:
     :param to_:
     """
     player = board[from_]
     assert player == -self.stone_val, str(board) + '\nfrom:' + str(
         from_) + ' to:' + str(to_)
     act = tuple(np.subtract(to_, from_))
     a = rule.actions_move.index(act)
     action = (from_, a)
     if player == -1:
         board = rule.flip_board(board)
         action = rule.flip_action(action)
     self.play_process.opponent_play(board, player, action)
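
Example 5 runs the coordinate math in the opposite direction from the other snippets: instead of adding rule.actions_move[act] to from_, it recovers the action index from the (from_, to_) delta. A round-trip sketch using the same hypothetical action list as above:

    import numpy as np

    actions_move = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # hypothetical ordering

    from_ = (2, 3)
    act = 1                                            # "down" in this ordering
    to_ = tuple(np.add(from_, actions_move[act]))      # (3, 3)
    assert actions_move.index(tuple(np.subtract(to_, from_))) == act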
Example 6
    def play(self, board):
        logger.info('%s play...', self.name)
        board_self = rule.flip_board(
            board) if self.stone_val == -1 else board.copy()
        from_, action, vp, p = self.play_process.predict(
            board_self, self.stone_val)
        to_ = tuple(np.add(from_, rule.actions_move[action]))
        if self.stone_val == -1:
            from_ = rule.flip_location(from_)
            to_ = rule.flip_location(to_)
            # vp = rule.flip_action_probs(vp)
            p = rule.flip_action_probs(p)
        logger.info('from %s to %s', from_, to_)

        rule.move(board, from_, to_)
        opp_q_table = self.predict_opponent(board)
        logger.debug(opp_q_table)
        self.play_func(self.stone_val, from_, to_, p, opp_q_table)
Example 7
def simulate(nw0, nw1, init='fixed'):
    board = rule.init_board() if init == 'fixed' else rule.random_init_board()
    player = 1
    records = Record()
    while True:
        nw = nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            from_, action, vp, p = nw.policy(board, player)
            # print('>', from_, action)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            records.add(bd, from_, action, reward, vp, win=command == rule.WIN)
        except NoActionException:
            return Record(), 0
        except Exception as e:
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_action(board, player)
            logging.info('predict is:\n%s', nw.p)
            logging.info('sum is: %s', nw.p.sum())
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('p * valid is:\n%s', nw.vp)
            logging.info('from:%s, action:%s', from_, action)
            logging.info('prob is: %s', valid[from_][action])
            records.save('records/train/1st_')
            raise e
        # if eat:
        #     print(player, from_, to_, eat, N)
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s', str(player), records.length())
            return records, player
        if records.length() > 10000:
            logging.info('too many moves: %s', records.length())
            return Record(), 0
        player = -player
        board = rule.flip_board(board)
Example 8
 def read(self, filepath):
     """
     读取棋谱
     :param filepath:
     """
     need_flip = '1st' in filepath
     with open(filepath) as f:
         for line in f:
             board, from_, action, reward = line.split(',')
             board = (np.array([int(i) for i in board]) - 1).reshape(5, 5)
             from_ = tuple(map(int, from_))
             action = int(action)
             reward = float(reward)
             player = board[from_]
             if need_flip and player == -1:
                 board = rule.flip_board(board)
                 from_ = rule.flip_location(from_)
                 action = rule.flip_action(action)
             self.records.append([board, from_, action, reward])
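
The parser implies a simple comma-separated record format: a 25-digit board string (digits 0/1/2, shifted down to -1/0/1), the from-coordinate as two bare digits, then the action index and the reward. A hypothetical line and how read() decodes it:

    import numpy as np

    line = '1112111111111111111111111,03,1,0.0'  # hypothetical record
    board, from_, action, reward = line.split(',')
    board = (np.array([int(i) for i in board]) - 1).reshape(5, 5)
    from_ = tuple(map(int, from_))   # (0, 3)
    action = int(action)             # 1
    reward = float(reward)           # 0.0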
Example 9
 def default_policy(self):
     board = self.board.copy()
     player = self.player
     step = 0
     while True:
         try:
             from_, action, *_ = self.tree.worker.policy(
                 board, player)  # worker.predict(board, player)
             command, eat = rule.move_by_action(board, from_, action)
         except Exception as e:
             logger.info('board is:\n%s', board)
             logger.info('player is: %s', player)
             logger.info('from:%s, action:%s', from_, action)
             raise e
         if command == rule.WIN:
             logging.info('%s WIN, step use: %s', str(player), step)
             return player
         player = -player
         board = rule.flip_board(board)
         step += 1
Example 10
 def move_down(self, board, player, action):
     assert np.all(self.root.board == board), 'root_board:\n' + str(
         self.root.board) + '\nboard:\n' + str(board)
     assert self.root.player == player, 'root_player:%s, player:%s' % (
         self.root.player, player)
     node = self.get_node(action)
     logger.debug('get_node(%s):\n%s', action, node)
     if node is None:
         logger.info('node is None, new Node()')
         board = board.copy()
         rule.move_by_action(board, *action)
         node = Node(rule.flip_board(board), -player, tree=self)
     if not node.expanded:
         node.expansion()
     self.root = node
     self.root.parent_edge = None
     self.root.level = 1
     self.n_node = 1
     self.depth = 1
     self.update_tree_info(self.root)
     logger.debug('move down to node:%s', action)
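
Example 3 shows the intended call site: move_down is invoked with the chosen action before the board is mutated, promoting the matching child to the new root so its visit statistics are reused. Sketch of that sequence, as in Example 3:

    ts.move_down(board, player, action=(from_, act))    # advance the search tree
    to_ = tuple(np.add(from_, rule.actions_move[act]))  # then apply the move
    command, eat = rule.move(board, from_, to_)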
Example 11
 def __init__(self, upper_node, a, v, p, lambda_):
     self.upper_node = upper_node
     self.a = a
     self.v = v
     self.v_ = v
     self.l = lambda_
     self.p = p
     self.n = 1
     self.n_update = 1
     self.w = 1
     board, player = upper_node.board.copy(), upper_node.player
     result, _ = rule.move_by_action(board, *a)
     self.down_node = Node(board=rule.flip_board(board),
                           player=-player,
                           tree=upper_node.tree,
                           level=upper_node.level + 1,
                           parent_edge=self,
                           final=result == rule.WIN)
     self.win = result == rule.WIN
      # NaN never compares equal, so `p != np.nan` is always True;
      # use np.isnan to actually catch bad values
      assert not np.isnan(p)
      assert not np.isnan(v)
      if self.win:
          self.v = 1 + 1e-15  # nudge just above 1.0 so winning edges rank first
Example 12
    def play(self, board):
        logger.info('%s play...', self.name)
        board_self = rule.flip_board(
            board) if self.stone_val == -1 else board.copy()
        (from_,
         action), (valid,
                   q) = self.play_process.predict(board_self, self.stone_val)
        logger.debug('valid is:%s', valid)
        logger.debug('q is:%s', q)
        logger.debug('from:%s, action:%s', from_, action)
        to_ = tuple(np.add(from_, rule.actions_move[action]))
        q_table = np.zeros((5, 5, 4))
        for (f, a), q1 in zip(valid, q):
            q_table[f][a] = q1
        if self.stone_val == -1:
            from_ = rule.flip_location(from_)
            to_ = rule.flip_location(to_)
            q_table = rule.flip_action_probs(q_table)
        logger.info('from %s to %s', from_, to_)

        rule.move(board, from_, to_)
        opp_q_table = self.predict_opponent(board)
        logger.debug(opp_q_table)
        self.play_func(self.stone_val, from_, to_, q_table, opp_q_table)
Example 13
def simulate(nw0, nw1, activation, init='fixed'):
    np.random.seed(util.rand_int32())
    player = 1 if np.random.random() > 0.5 else -1
    logger.info('init:%s, player:%s', init, player)
    board = rule.init_board(
        player) if init == 'fixed' else rule.random_init_board()
    records = Record()
    # full_records = Record()
    boards = set()  # {(board_str, player)}
    nws = [None, nw0, nw1]
    n_steps = 0
    while True:
        nw = nws[player]  # nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            board_str = util.board_str(board)

            if (board_str, player) in boards:
                # Found a repetition cycle: retrain its positions toward a
                # draw target of 0.5, then drop the cycle from the history
                found = False
                records2 = Record()
                for i in range(records.length() - 1, -1, -1):
                    b, f, a, _, _ = records[i]
                    if (b == board).all() and b[f] == player:
                        found = True
                        break
                assert found, (board, player)
                records2.records = records.records[i:]
                records2.draw()
                nw0.train(records2)
                nw1.train(records2)

                # Remove the cycle's entries from the running history
                records.records = records.records[:i]
                for b, f, a, _, _ in records2:
                    boards.remove((util.board_str(b), b[f]))
                logger.info('cycle:%s, records:%s, epsilon:%s', len(records2),
                            records.length(), nw.epsilon)
            boards.add((board_str, player))

            from_, action = nw.policy(board, player)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            if activation == 'sigmoid':
                records.add3(bd,
                             from_,
                             action,
                             reward,
                             win=command == rule.WIN)
                # full_records.add3(bd, from_, action, reward, win=command==rule.WIN)
            elif activation == 'linear':
                records.add2(bd,
                             from_,
                             action,
                             reward,
                             win=command == rule.WIN)
                # full_records.add2(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'selu':
                records.add4(bd,
                             from_,
                             action,
                             reward,
                             win=command == rule.WIN)
                # full_records.add4(bd, from_, action, reward, win=command == rule.WIN)
            else:
                raise ValueError
            if command == rule.WIN:
                logging.info('%s WIN, stone:%s, step use: %s, epsilon:%s',
                             str(player), (board == player).sum(),
                             records.length(), nw.epsilon)
                return records, player
            if n_steps - records.length() > 500:
                logging.info('too many looping moves: %s', records.length())
                # Too many moves: declare a draw
                records.clear()
                return records, 0

            player = -player
            if init == 'fixed':
                board = rule.flip_board(board)
            n_steps += 1
        except NoActionException:
            # After a random initial position, one side has no legal move
            return Record(), 0
        except Exception as e:
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.info('valid is:\n%s', valid)
            logging.info('predict is:\n%s', nw.q_value)
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('from:%s, action:%s', from_, action)
            records.save('records/train/1st_')
            raise e
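
Examples 3 and 13 depend on an epsilon schedule for exploration but never show the selection rule itself. A minimal epsilon-greedy sketch over the valid-action list, assuming q is an array of predicted values aligned with valid_actions (neither name comes from the originals):

    import numpy as np

    def epsilon_greedy(valid_actions, q, epsilon):
        # With probability epsilon take a uniformly random valid action,
        # otherwise the action with the highest predicted value
        if np.random.random() < epsilon:
            i = np.random.randint(len(valid_actions))
        else:
            i = int(np.argmax(q))
        return valid_actions[i], q[i]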