def probabilities(self, board, player):
    x = self.feature_1st(board, player)
    valid = rule.valid_action(board, player)
    x = np.array([x])
    p = self.model.predict(x)[0]
    p = p.reshape(5, 5, 4)
    vp = p * valid  # probabilities of all possible moves
    if vp.max() == 0:
        # the network assigned zero to every legal move: fall back to uniform
        vp = valid / valid.sum()
    else:
        vp = vp / vp.sum()
    return vp
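# The masking trick above is worth seeing in isolation: the network's raw
# 5x5x4 output is multiplied by a 0/1 mask of legal moves, then renormalized;
# if every legal move got zero probability, it falls back to a uniform
# distribution over the legal moves. A minimal, self-contained sketch with a
# made-up mask and random "network output" (no project code assumed):
import numpy as np

def masked_distribution(p, valid):
    """Zero out illegal moves and renormalize; uniform fallback if all-zero."""
    vp = p * valid
    if vp.max() == 0:
        return valid / valid.sum()  # uniform over legal moves
    return vp / vp.sum()

p = np.random.rand(5, 5, 4).astype(np.float32)
valid = np.zeros((5, 5, 4), dtype=np.int8)
valid[0, 0, 1] = valid[2, 3, 0] = 1  # pretend only two moves are legal
vp = masked_distribution(p, valid)
assert abs(vp.sum() - 1.0) < 1e-6   # a proper distribution over legal moves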
def feature_1st(board, player):
    """
    Board features from the first-person perspective
    :param board: the board
    :param player: the current player
    :return: features of the current position (5x5xN)
    """
    space = (board == 0).astype(np.int8).reshape((5, 5, 1))
    own = (board == player).astype(np.int8).reshape((5, 5, 1))
    opponent = (board == -player).astype(np.int8).reshape((5, 5, 1))
    v_locations = rule.valid_location(board, player).reshape((5, 5, 1))
    v_actions = rule.valid_action(board, player)
    bias = np.ones((5, 5, 1))
    return np.concatenate(
        (space, own, opponent, v_locations, v_actions, bias), axis=2)
def feature(board, player):
    """
    Features of the position
    :param board: the board
    :param player: the current player
    :return: features of the current position (5x5x10)
    """
    space = (board == 0).astype(np.int8).reshape((5, 5, 1))
    black = (board == 1).astype(np.int8).reshape((5, 5, 1))
    white = (board == -1).astype(np.int8).reshape((5, 5, 1))
    who = np.ones((5, 5, 1)) if player == 1 else np.zeros((5, 5, 1))
    v_locations = rule.valid_location(board, player).reshape((5, 5, 1))
    v_actions = rule.valid_action(board, player)
    bias = np.ones((5, 5, 1))
    return np.concatenate(
        (space, black, white, who, v_locations, v_actions, bias), axis=2)
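# For reference, a hedged sketch of how the 5x5x10 tensor above comes
# together. The valid_location / valid_action planes come from the project's
# rule module, which is not reproduced here, so they are stubbed with zeros;
# only the channel layout and shapes are illustrated.
import numpy as np

board = np.zeros((5, 5), dtype=np.int8)
board[0, 0] = 1    # a black stone
board[4, 4] = -1   # a white stone
player = 1

space = (board == 0).astype(np.int8).reshape((5, 5, 1))
black = (board == 1).astype(np.int8).reshape((5, 5, 1))
white = (board == -1).astype(np.int8).reshape((5, 5, 1))
who = np.ones((5, 5, 1)) if player == 1 else np.zeros((5, 5, 1))
v_locations = np.zeros((5, 5, 1))  # stub for rule.valid_location(...)
v_actions = np.zeros((5, 5, 4))    # stub for rule.valid_action(...): 4 move directions
bias = np.ones((5, 5, 1))

x = np.concatenate((space, black, white, who, v_locations, v_actions, bias), axis=2)
assert x.shape == (5, 5, 10)  # 1+1+1+1+1+4+1 channels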
def simulate(nw0, nw1, init='fixed'):
    board = rule.init_board() if init == 'fixed' else rule.random_init_board()
    player = 1
    records = Record()
    while True:
        nw = nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            from_, action, vp, p = nw.policy(board, player)
            # print('>', from_, action)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            records.add(bd, from_, action, reward, vp, win=command == rule.WIN)
        except NoActionException:
            return Record(), 0
        except Exception as e:
            logging.info('board is:')
            logging.info(board)
            logging.info('player is: %s', player)
            valid = rule.valid_action(board, player)
            logging.info('predict is:')
            print(nw.p)
            logging.info('sum is: %s', nw.p.sum())
            logging.info('valid action is:')
            logging.info(nw.valid)
            logging.info('p * valid is:')
            logging.info(nw.vp)
            logging.info('from:%s, action:%s', from_, action)
            logging.info('prob is: %s', valid[from_][action])
            records.save('records/train/1st_')
            raise e
        # if eat:
        #     print(player, from_, to_, eat, N)
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s', str(player), records.length())
            return records, player
        if records.length() > 10000:
            logging.info('too many moves: %s', records.length())
            return Record(), 0
        player = -player
        board = rule.flip_board(board)
def _play(self, player, from_, to_, p, opp_q=None):
    logger.info('from:%s, to_:%s', from_, to_)
    logger.debug('p:\n%s', p)
    board = self.board()
    valid_action = rule.valid_action(board, player)
    logger.debug('valid_action:\n%s', valid_action)
    self.show_qtext(p, valid_action)
    self.show_select(from_, to_)
    stone = self.stone(from_)

    def play_later():
        result = self.move_to(stone, to_)
        if opp_q is not None:
            opp_valid = rule.valid_action(self.board(), -player)
            self.show_qtext(opp_q, opp_valid, hide=False)
        if result == rule.ACCQUIRE:
            # the opponent moves next
            self.switch_player_and_play()
        elif result == rule.WIN:
            logger.info('GAME OVER, WINNER IS %s', stone.player.name)
            self.game_over(stone.player)

    self.play_timer = self.window.after(int(self.period * 1000), play_later)
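# _play defers the actual move with Tkinter's after() so the board can first
# render the selection highlight; play_later then runs on the UI event loop
# one animation period later. A minimal standalone sketch of the same
# pattern (the widget names and the 1000 ms period here are illustrative,
# not taken from the project):
import tkinter as tk

root = tk.Tk()
label = tk.Label(root, text='thinking...')
label.pack()

def play_later():
    # runs on the Tk event loop after the delay, like the nested
    # play_later above
    label.config(text='moved')

# schedule the callback one period (here 1000 ms) in the future
timer = root.after(1000, play_later)
root.mainloop()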
def policy_1st(self, board, player):
    x = self.feature_1st(board, player)
    valid = rule.valid_action(board, player)
    self.set_dropout(0)
    return self._policy(x, board, valid)
def policy(self, board, player):
    x = self.feature_1st(board, player).flatten()
    valid = rule.valid_action(board, player)
    return self._policy(x, board, player, valid)