Example 1
 def feature(board, from_, action):
     """
     Board features from the current player's (first-person) perspective.
     :param board:   the board
     :param from_:   which piece to move
     :param action:  the action, i.e. which direction to move in
     :return: features of the current action (5x5xN)
     """
     player = board[from_]
     to_ = tuple(np.add(from_, rule.actions_move[action]))
     # Board features: empty squares / own pieces / opponent pieces
     space = (board == 0).astype(np.int8).reshape((5, 5, 1))
     own = (board == player).astype(np.int8).reshape((5, 5, 1))
     opponent = (board == -player).astype(np.int8).reshape((5, 5, 1))
     # Action features: one-hot planes for the source and destination squares
     from_location = np.zeros((5, 5, 1))
     from_location[from_] = 1
     to_location = np.zeros((5, 5, 1))
     to_location[to_] = 1
     # Board after the move
     board = board.copy()
     result, _ = rule.move(board, from_, to_)
     space2 = (board == 0).astype(np.int8).reshape((5, 5, 1))
     own2 = (board == player).astype(np.int8).reshape((5, 5, 1))
     opponent2 = (board == -player).astype(np.int8).reshape((5, 5, 1))
     # Whether the move wins the game
     is_win = np.ones((5, 5, 1)) if result == rule.WIN else np.zeros((5, 5, 1))
     # Bias plane of constant ones
     bias = np.ones((5, 5, 1))
     return np.concatenate((space, own, opponent, from_location, to_location,
                            space2, own2, opponent2, is_win, bias), axis=2)
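
A minimal usage sketch (not from the source): each call to feature() yields one 5x5x10 tensor, one plane per channel listed in the return statement. Assuming rule.valid_actions(board, player) returns the (from_, action) pairs used in the later examples, the per-move features could be batched for a network like this:

import numpy as np

def batch_features(board, player):
    # Stack one 5x5x10 feature tensor per legal (from_, action) pair.
    # rule.valid_actions and feature are taken from the examples above.
    pairs = rule.valid_actions(board, player)
    planes = [feature(board, f, a) for f, a in pairs]
    return pairs, np.stack(planes)  # shape: (len(pairs), 5, 5, 10)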
Example 2
 def move_to(self, stone, to_loc):
     """
     Move the stone to to_loc and remove any stones captured by the move
     :param stone:   the stone to move
     :param to_loc:  the destination location
     :return: the result code from rule.move (e.g. rule.ACCQUIRE or rule.WIN)
     """
     old_board = self.board()
     from_ = stone.loc
     result, del_stone_loc = rule.move(self.board(), stone.loc, to_loc)
     if result == rule.ACCQUIRE or result == rule.WIN:
         self.move_to_loc(stone, to_loc)
         for loc in del_stone_loc:
             self.del_stone(self.stone(loc))
         logger.info('from %s to %s, result:%s, del:%s', from_, to_loc,
                     result, del_stone_loc)
         action = rule.actions_move.index(tuple(np.subtract(to_loc, from_)))
         logger.debug('action is: %s', action)
         self.record.add(old_board,
                         from_,
                         action,
                         len(del_stone_loc),
                         None,
                         win=(result == rule.WIN))
     return result
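
Both examples above convert between an action index and a board offset through rule.actions_move: np.add maps (from_, action) to the destination square, and .index recovers the action from the (from_, to_) difference. A minimal sketch of such a table, assuming four orthogonal moves (the actual ordering in rule.actions_move is not shown in the source):

import numpy as np

# Hypothetical direction table; the real rule.actions_move may order these differently.
actions_move = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up, down, left, right

from_ = (2, 3)
action = 3                                        # move right
to_ = tuple(np.add(from_, actions_move[action]))  # (2, 4)
assert actions_move.index(tuple(np.subtract(to_, from_))) == action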
Example 3
    def play(self, board):
        logger.info('%s play...', self.name)
        board_self = rule.flip_board(
            board) if self.stone_val == -1 else board.copy()
        from_, action, vp, p = self.play_process.predict(
            board_self, self.stone_val)
        to_ = tuple(np.add(from_, rule.actions_move[action]))
        if self.stone_val == -1:
            from_ = rule.flip_location(from_)
            to_ = rule.flip_location(to_)
            # vp = rule.flip_action_probs(vp)
            p = rule.flip_action_probs(p)
        logger.info('from %s to %s', from_, to_)

        rule.move(board, from_, to_)
        opp_q_table = self.predict_opponent(board)
        logger.debug(opp_q_table)
        self.play_func(self.stone_val, from_, to_, p, opp_q_table)
Example 4
 def simulate(self, ts, board, player):
     from record import Record
     from value_network import NoActionException
     records = Record()
     while True:
         try:
             bd = board.copy()
             board_str = util.board_str(board)
             valid_action = rule.valid_actions(board, player)
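             # keep sampling until we get an action not yet predicted for this
             # position, or only one candidate edge remains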
             while True:
                 (from_,
                  act), q = self.epsilon_greedy(board, player, valid_action,
                                                ts)
                 if (board_str, from_, act) not in self.predicts or len(
                         ts.root.sub_edge) == 1:
                     break
                 ts.root.sub_edge = [
                     e for e in ts.root.sub_edge if e.a != (from_, act)
                 ]
                 valid_action.remove((from_, act))
             assert board[from_] == player
             ts.move_down(board, player, action=(from_, act))
             if self.episode % 10 == 0:
                 logger.info('action:%s,%s', from_, act)
                 logger.info('q is %s', q)
             to_ = tuple(np.add(from_, rule.actions_move[act]))
             command, eat = rule.move(board, from_, to_)
             records.add3(bd, from_, act, len(eat), win=command == rule.WIN)
         except NoActionException:
             # after a random initial position, one side has no legal moves
             return Record(), 0
         except Exception as ex:
             logging.warning('board is:\n%s', board)
             logging.warning('player is: %s', player)
             valid = rule.valid_actions(board, player)
             logging.warning('valid is:\n%s', valid)
             logging.warning('from_:%s, act:%s', from_, act)
             ts.show_info()
             records.save('records/train/1st_')
             raise ex
         if command == rule.WIN:
             logging.info('%s WIN, steps used: %s, epsilon:%s', str(player),
                          records.length(), self.epsilon)
             return records, player
         if records.length() > 10000:
             logging.info('too many moves: %s', records.length())
             return Record(), 0
         player = -player
         board = rule.flip_board(board)
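
Example 4 chooses moves through self.epsilon_greedy, which is not shown here. A minimal sketch of an epsilon-greedy selection over (from_, action) pairs, purely illustrative and using an assumed q_values mapping rather than the class's real internals:

import random

def epsilon_greedy_choice(q_values, valid_actions, epsilon):
    # q_values: dict mapping (from_, act) -> estimated action value.
    # Explore a random legal move with probability epsilon, otherwise exploit.
    if random.random() < epsilon:
        choice = random.choice(valid_actions)
    else:
        choice = max(valid_actions, key=lambda fa: q_values[fa])
    return choice, q_values[choice]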
Example 5
    def play(self, board):
        logger.info('%s play...', self.name)
        board_self = rule.flip_board(
            board) if self.stone_val == -1 else board.copy()
        (from_,
         action), (valid,
                   q) = self.play_process.predict(board_self, self.stone_val)
        logger.debug('valid is:%s', valid)
        logger.debug('q is:%s', q)
        logger.debug('from:%s, action:%s', from_, action)
        to_ = tuple(np.add(from_, rule.actions_move[action]))
        q_table = np.zeros((5, 5, 4))
        for (f, a), q1 in zip(valid, q):
            q_table[f][a] = q1
        if self.stone_val == -1:
            from_ = rule.flip_location(from_)
            to_ = rule.flip_location(to_)
            q_table = rule.flip_action_probs(q_table)
        logger.info('from %s to %s', from_, to_)

        rule.move(board, from_, to_)
        opp_q_table = self.predict_opponent(board)
        logger.debug(opp_q_table)
        self.play_func(self.stone_val, from_, to_, q_table, opp_q_table)
def simulate(nw0, nw1, init='fixed'):
    board = rule.init_board() if init == 'fixed' else rule.random_init_board()
    player = 1
    records = Record()
    while True:
        nw = nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            from_, action, vp, p = nw.policy(board, player)
            # print('>', from_, action)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            records.add(bd, from_, action, reward, vp, win=command == rule.WIN)
        except NoActionException:
            return Record(), 0
        except Exception as e:
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_action(board, player)
            logging.info('predict is:\n%s', nw.p)
            logging.info('sum is: %s', nw.p.sum())
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('p * valid is:\n%s', nw.vp)
            logging.info('from:%s, action:%s', from_, action)
            logging.info('prob is: %s', valid[from_][action])
            records.save('records/train/1st_')
            raise e
        # if eat:
        #     print(player, from_, to_, eat, N)
        if command == rule.WIN:
            logging.info('%s WIN, steps used: %s', str(player), records.length())
            return records, player
        if records.length() > 10000:
            logging.info('too many moves: %s', records.length())
            return Record(), 0
        player = -player
        board = rule.flip_board(board)
def simulate(nw0, nw1, activation, init='fixed'):
    np.random.seed(util.rand_int32())
    player = 1 if np.random.random() > 0.5 else -1
    logger.info('init:%s, player:%s', init, player)
    board = rule.init_board(
        player) if init == 'fixed' else rule.random_init_board()
    records = Record()
    # full_records = Record()
    boards = set()  # {(board,player)}
    nws = [None, nw0, nw1]
    n_steps = 0
    while True:
        nw = nws[player]  # nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            board_str = util.board_str(board)

            if (board_str, player) in boards:
                # A repeated (board, player) means a cycle: set the target to
                # 0.5 (a draw), train on it, then remove the cycle
                found = False
                records2 = Record()
                # records and boards grow in lockstep, so len(boards) also
                # bounds the indices into records
                for i in range(len(boards) - 1, -1, -1):
                    b, f, a, _, _ = records[i]
                    if (b == board).all() and b[f] == player:
                        found = True
                        break
                assert found, (board, player)
                records2.records = records.records[i:]
                records2.draw()
                nw0.train(records2)
                nw1.train(records2)

                # remove the cycle's records from the history and the board set
                records.records = records.records[:i]
                for b, f, a, _, _ in records2:
                    boards.remove((util.board_str(b), b[f]))
                logger.info('cycle:%s, records:%s, epsilon:%s', len(records2),
                            records.length(), nw.epsilon)
            boards.add((board_str, player))

            from_, action = nw.policy(board, player)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            if activation == 'sigmoid':
                records.add3(bd,
                             from_,
                             action,
                             reward,
                             win=command == rule.WIN)
                # full_records.add3(bd, from_, action, reward, win=command==rule.WIN)
            elif activation == 'linear':
                records.add2(bd,
                             from_,
                             action,
                             reward,
                             win=command == rule.WIN)
                # full_records.add2(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'selu':
                records.add4(bd,
                             from_,
                             action,
                             reward,
                             win=command == rule.WIN)
                # full_records.add4(bd, from_, action, reward, win=command == rule.WIN)
            else:
                raise ValueError
            if command == rule.WIN:
                logging.info('%s WIN, stone:%s, steps used: %s, epsilon:%s',
                             str(player), (board == player).sum(),
                             records.length(), nw.epsilon)
                return records, player
            if n_steps - records.length() > 500:
                logging.info('too many cycling moves: %s', records.length())
                # too many moves spent in cycles, declare a draw
                records.clear()
                return records, 0

            player = -player
            if init == 'fixed':
                board = rule.flip_board(board)
            n_steps += 1
        except NoActionException:
            # after a random initial position, one side has no legal moves
            return Record(), 0
        except Exception as e:
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.info('valid is:\n%s', valid)
            logging.info('predict is:\n%s', nw.q_value)
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('from:%s, action:%s', from_, action)
            records.save('records/train/1st_')
            raise e
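
The simulate functions return (records, winner), with winner 0 for a draw or an aborted game. A minimal self-play driver sketch built on the last variant, assuming the train() method seen in the cycle-handling code accepts a Record (the loop itself is not part of the source):

import logging

def self_play_loop(nw0, nw1, activation='sigmoid', n_games=1000):
    # Repeatedly self-play and train both networks on the resulting records.
    for episode in range(n_games):
        records, winner = simulate(nw0, nw1, activation, init='fixed')
        if records.length() == 0:
            continue  # draw or aborted game, nothing to train on
        nw0.train(records)
        nw1.train(records)
        logging.info('episode %s finished, winner: %s', episode, winner)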