def play(self, board):
    """Ask the play process for a move on a background thread.

    The engine always reasons from player 1's perspective, so the board is
    flipped first when this player holds the -1 stones; the chosen move and
    its q-table are flipped back before being handed to ``play_func``.
    """
    board_self = rule.flip_board(board) if self.stone_val == -1 else board.copy()

    def _as_table(pairs):
        # Scatter ((from, act), q) pairs into a dense 5x5x4 q-table.
        table = np.zeros((5, 5, 4))
        for (loc, act), value in pairs:
            table[loc][act] = value
        return table

    def _worker():
        action, q, opp_q = self.play_process.predict(board_self, self.stone_val)
        logger.info('resv: action:%s', action)
        if action is None:
            # The play process signalled shutdown; nothing to play.
            logger.info('_play thread stop...')
            return
        from_, act = action
        to_ = tuple(np.add(from_, rule.actions_move[act]))
        q_table = _as_table(q)
        if self.stone_val == -1:
            # Map the move and q-table back to the real board orientation.
            from_ = rule.flip_location(from_)
            to_ = rule.flip_location(to_)
            q_table = rule.flip_action_probs(q_table)
        opp_q_table = _as_table(opp_q)
        self.play_func(self.stone_val, from_, to_, q_table, opp_q=opp_q_table)

    Thread(target=_worker).start()
def __init__(self, policy_model, value_model, init_board, first_player, player):
    """Set up a play process backed by separate policy and value models.

    The initial board is stored from player 1's point of view: when the
    first player is -1 the board is flipped before being kept.
    """
    PlayProcess.__init__(self, model_fuc=None)
    self.policy_model = policy_model
    self.value_model = value_model
    self.first_player = first_player
    self.player = player
    self.init_board = (rule.flip_board(init_board)
                       if first_player == -1 else init_board)
def simulate(self, ts, board, player):
    """Self-play one game guided by the search tree ``ts``.

    Moves are chosen epsilon-greedily; already-predicted (board, move)
    pairs are pruned from the root unless only one edge remains. Returns
    ``(records, winner)`` where winner is 1/-1, or ``(Record(), 0)`` on a
    draw (no legal moves, or more than 10000 plies).
    """
    from record import Record
    from value_network import NoActionException
    records = Record()
    while True:
        try:
            bd = board.copy()  # snapshot of the position before moving
            board_str = util.board_str(board)
            valid_action = rule.valid_actions(board, player)
            while True:
                (from_, act), q = self.epsilon_greedy(board, player, valid_action, ts)
                # Accept the move if it was never predicted for this board,
                # or if it is the only edge left under the root.
                if (board_str, from_, act) not in self.predicts or len(
                        ts.root.sub_edge) == 1:
                    break
                # Otherwise drop this edge and re-sample among the rest.
                ts.root.sub_edge = [
                    e for e in ts.root.sub_edge if e.a != (from_, act)
                ]
                valid_action.remove((from_, act))
            assert board[from_] == player
            ts.move_down(board, player, action=(from_, act))
            if self.episode % 10 == 0:
                logger.info('action:%s,%s', from_, act)
                logger.info('q is %s', q)
            to_ = tuple(np.add(from_, rule.actions_move[act]))
            command, eat = rule.move(board, from_, to_)
            records.add3(bd, from_, act, len(eat), win=command == rule.WIN)
        except NoActionException:
            # One side has no legal moves after a random initial position.
            return Record(), 0
        except Exception as ex:
            # Dump full context before re-raising so the failure can be replayed.
            logging.warning('board is:\n%s', board)
            logging.warning('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.warning('valid is:\n%s', valid)
            logging.warning('from_:%s, act:%s', from_, act)
            ts.show_info()
            records.save('records/train/1st_')
            raise ex
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s, epsilon:%s', str(player),
                         records.length(), self.epsilon)
            return records, player
        if records.length() > 10000:
            logging.info('走子数过多: %s', records.length())
            return Record(), 0
        # Hand the turn to the opponent; the board is flipped so the mover
        # is always seen from the same perspective.
        player = -player
        board = rule.flip_board(board)
def __init__(self, upper_node, a, v, p, lambda_):
    """Create an edge out of ``upper_node`` for action ``a``.

    ``v`` is the value estimate, ``p`` the prior probability and
    ``lambda_`` a mixing weight. The child node is built by applying the
    action and flipping the board to the opponent's perspective.
    """
    self.upper_node = upper_node
    self.a = a
    self.v = v
    self.p = p
    self.l = lambda_
    self.n = 0  # visit count
    self.w = 0  # accumulated value
    child_board = upper_node.board.copy()
    result, _ = rule.move_by_action(child_board, *a)
    self.down_node = Node(
        board=rule.flip_board(child_board),
        player=-upper_node.player,
        tree=upper_node.tree,
        level=upper_node.level + 1,
        parent_edge=self,
        final=result == rule.WIN)
def opponent_play(self, board, from_, to_):
    """Forward the opponent's move to the play process.

    :param board: board position before the opponent moved
    :param from_: source location of the opponent's stone
    :param to_: destination location
    """
    player = board[from_]
    assert player == -self.stone_val, str(board) + '\nfrom:' + str(
        from_) + ' to:' + str(to_)
    delta = tuple(np.subtract(to_, from_))
    action = (from_, rule.actions_move.index(delta))
    if player == -1:
        # Normalize to player 1's perspective before handing off.
        board = rule.flip_board(board)
        action = rule.flip_action(action)
    self.play_process.opponent_play(board, player, action)
def play(self, board):
    """Pick a move with the play process, apply it to ``board``,
    predict the opponent's reply, and report everything via ``play_func``."""
    logger.info('%s play...', self.name)
    if self.stone_val == -1:
        board_self = rule.flip_board(board)
    else:
        board_self = board.copy()
    from_, action, vp, p = self.play_process.predict(board_self, self.stone_val)
    to_ = tuple(np.add(from_, rule.actions_move[action]))
    if self.stone_val == -1:
        # Map the move and probabilities back to the real orientation.
        from_, to_ = rule.flip_location(from_), rule.flip_location(to_)
        p = rule.flip_action_probs(p)
    logger.info('from %s to %s', from_, to_)
    rule.move(board, from_, to_)
    opp_q_table = self.predict_opponent(board)
    logger.debug(opp_q_table)
    self.play_func(self.stone_val, from_, to_, p, opp_q_table)
def simulate(nw0, nw1, init='fixed'):
    """Self-play one game between networks ``nw0`` (player 1) and ``nw1``.

    Returns ``(records, winner)`` with winner 1/-1, or ``(Record(), 0)``
    on a draw (no legal moves, or more than 10000 plies).
    """
    board = rule.init_board() if init == 'fixed' else rule.random_init_board()
    player = 1
    records = Record()
    while True:
        nw = nw0 if player == 1 else nw1
        try:
            bd = board.copy()  # snapshot of the position before moving
            from_, action, vp, p = nw.policy(board, player)
            # print('>', from_, action)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            records.add(bd, from_, action, reward, vp, win=command == rule.WIN)
        except NoActionException:
            # One side has no legal moves; score the game as a draw.
            return Record(), 0
        except Exception as e:
            # Dump full context before re-raising so the failure can be replayed.
            logging.info('board is:')
            logging.info(board)
            logging.info('player is: %s', player)
            # NOTE(review): other simulate variants in this file call
            # rule.valid_actions (plural); here valid is indexed as
            # valid[from_][action], so this appears to be a different
            # API — confirm rule.valid_action exists.
            valid = rule.valid_action(board, player)
            logging.info('predict is:')
            print(nw.p)
            logging.info('sum is: %s', nw.p.sum())
            logging.info('valid action is:')
            logging.info(nw.valid)
            logging.info('p * valid is:')
            logging.info(nw.vp)
            logging.info('from:%s, action:%s', from_, action)
            logging.info('prob is: %s', valid[from_][action])
            records.save('records/train/1st_')
            raise e
        # if eat:
        #     print(player, from_, to_, eat, N)
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s', str(player), records.length())
            return records, player
        if records.length() > 10000:
            logging.info('走子数过多: %s', records.length())
            return Record(), 0
        # Hand the turn to the opponent; flip so the mover is always
        # seen from the same perspective.
        player = -player
        board = rule.flip_board(board)
def read(self, filepath):
    """Load a game-record file and append its rows to ``self.records``.

    Each line is ``board,from,action,reward`` where the board is a string
    of 25 digits (cell value + 1). Files whose name contains ``'1st'``
    store positions from the mover's perspective, so rows played by
    player -1 are flipped back before being stored.

    :param filepath: path of the record file to read
    """
    need_flip = '1st' in filepath
    with open(filepath) as f:
        for line in f:
            cells, loc, act, rew = line.split(',')
            board = np.array([int(c) - 1 for c in cells]).reshape(5, 5)
            from_ = tuple(int(c) for c in loc)
            action = int(act)
            reward = float(rew)
            player = board[from_]
            if need_flip and player == -1:
                board = rule.flip_board(board)
                from_ = rule.flip_location(from_)
                action = rule.flip_action(action)
            self.records.append([board, from_, action, reward])
def default_policy(self):
    """Roll the game out from this node using the tree's worker policy.

    Moves alternate, flipping the board each turn, until a WIN command is
    produced.

    :return: the player value (1 or -1) that won the rollout
    """
    board = self.board.copy()
    player = self.player
    step = 0
    # BUG FIX: pre-bind so the except-block can log these even when the
    # policy call itself raises — previously that path hit an
    # UnboundLocalError which masked the original exception.
    from_, action = None, None
    while True:
        try:
            from_, action, *_ = self.tree.worker.policy(
                board, player)  # worker.predict(board, player)
            command, eat = rule.move_by_action(board, from_, action)
        except Exception as e:
            logger.info('board is:\n%s', board)
            logger.info('player is: %s', player)
            logger.info('from:%s, action:%s', from_, action)
            raise e
        if command == rule.WIN:
            logging.info('%s WIN, step use: %s', str(player), step)
            return player
        player = -player
        board = rule.flip_board(board)
        step += 1
def move_down(self, board, player, action):
    """Advance the tree root to the child reached by ``action``.

    ``board`` and ``player`` must match the current root. If the action
    was never expanded from the root, a fresh node is created for the
    resulting position.
    """
    assert np.all(self.root.board == board), 'root_board:\n' + str(
        self.root.board) + '\nboard:\n' + str(board)
    assert self.root.player == player, 'root_player:%s, player:%s' % (
        self.root.player, player)
    child = self.get_node(action)
    logger.debug('get_node(%s):\n%s', action, child)
    if child is None:
        # Action not present in the tree: build the resulting position by hand.
        logger.info('node is None, new Node()')
        next_board = board.copy()
        rule.move_by_action(next_board, *action)
        child = Node(rule.flip_board(next_board), -player, tree=self)
    if not child.expanded:
        child.expansion()
    # Re-root the tree at the child and reset bookkeeping counters.
    child.parent_edge = None
    child.level = 1
    self.root = child
    self.n_node = 1
    self.depth = 1
    self.update_tree_info(self.root)
    logger.debug('move down to node:%s', action)
def __init__(self, upper_node, a, v, p, lambda_):
    """Edge from ``upper_node`` for action ``a``.

    :param upper_node: parent node this edge hangs off
    :param a: action as (from_location, move_index)
    :param v: value estimate for taking ``a``
    :param p: prior probability of ``a``
    :param lambda_: mixing weight stored on the edge
    """
    self.upper_node = upper_node
    self.a = a
    self.v = v
    self.v_ = v  # keep the original value estimate alongside the working one
    self.l = lambda_
    self.p = p
    self.n = 1  # visit count
    self.n_update = 1  # number of value updates
    self.w = 1  # accumulated value
    board, player = upper_node.board.copy(), upper_node.player
    result, _ = rule.move_by_action(board, *a)
    self.down_node = Node(board=rule.flip_board(board),
                          player=-player,
                          tree=upper_node.tree,
                          level=upper_node.level + 1,
                          parent_edge=self,
                          final=result == rule.WIN)
    self.win = result == rule.WIN
    # BUG FIX: the original asserts compared with ``!= np.nan``, which is
    # always True (NaN compares unequal to everything, including itself),
    # so NaN priors/values were never caught. Use np.isnan instead.
    assert not np.isnan(p)
    assert not np.isnan(v)
    if self.win:
        # Nudge winning edges just above 1 — presumably so they rank above
        # any ordinary value during selection (confirm with selection code).
        self.v = 1 + 1e-15
def play(self, board):
    """Pick a move with the play process, apply it to ``board``, predict
    the opponent's reply q-table, and report everything via ``play_func``."""
    logger.info('%s play...', self.name)
    if self.stone_val == -1:
        board_self = rule.flip_board(board)
    else:
        board_self = board.copy()
    (from_, action), (valid, q) = self.play_process.predict(board_self,
                                                            self.stone_val)
    logger.debug('valid is:%s', valid)
    logger.debug('q is:%s', q)
    logger.debug('from:%s, action:%s', from_, action)
    to_ = tuple(np.add(from_, rule.actions_move[action]))
    # Scatter the valid-move q values into a dense 5x5x4 table.
    q_table = np.zeros((5, 5, 4))
    for (loc, act), value in zip(valid, q):
        q_table[loc][act] = value
    if self.stone_val == -1:
        # Map the move and q-table back to the real orientation.
        from_, to_ = rule.flip_location(from_), rule.flip_location(to_)
        q_table = rule.flip_action_probs(q_table)
    logger.info('from %s to %s', from_, to_)
    rule.move(board, from_, to_)
    opp_q_table = self.predict_opponent(board)
    logger.debug(opp_q_table)
    self.play_func(self.stone_val, from_, to_, q_table, opp_q_table)
def simulate(nw0, nw1, activation, init='fixed'):
    """Self-play one game between ``nw0`` (player 1) and ``nw1`` (player -1).

    Repeated positions (cycles) are detected, trained as draws on both
    networks, then removed from the running record. ``activation`` selects
    which Record.add* variant stores each step. Returns ``(records,
    winner)`` with winner 1/-1, or an empty/cleared record and 0 on a draw.
    """
    np.random.seed(util.rand_int32())
    player = 1 if np.random.random() > 0.5 else -1
    logger.info('init:%s, player:%s', init, player)
    board = rule.init_board(
        player) if init == 'fixed' else rule.random_init_board()
    records = Record()
    # full_records = Record()
    boards = set()  # {(board,player)}
    nws = [None, nw0, nw1]  # index by player value: nws[1] / nws[-1]
    n_steps = 0
    while True:
        nw = nws[player]  # nw0 if player == 1 else nw1
        try:
            bd = board.copy()  # snapshot of the position before moving
            board_str = util.board_str(board)
            if (board_str, player) in boards:
                # Find the cycle, train it with a draw target (0.5), then
                # remove the cycle's data.
                finded = False
                records2 = Record()
                # NOTE(review): this walks indices derived from
                # len(boards) but subscripts ``records`` — it relies on
                # boards and records staying the same length; confirm.
                for i in range(len(boards) - 1, -1, -1):
                    b, f, a, _, _ = records[i]
                    if (b == board).all() and b[f] == player:
                        finded = True
                        break
                assert finded, (board, player)
                records2.records = records.records[i:]
                records2.draw()
                nw0.train(records2)
                nw1.train(records2)
                # Clear the cycle's data out of the running record.
                records.records = records.records[:i]
                for b, f, a, _, _ in records2:
                    boards.remove((util.board_str(b), b[f]))
                logger.info('环:%s, records:%s, epsilon:%s', len(records2),
                            records.length(), nw.epsilon)
            boards.add((board_str, player))
            from_, action = nw.policy(board, player)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            # Store the step with the add* variant matching the network's
            # output activation.
            if activation == 'sigmoid':
                records.add3(bd, from_, action, reward,
                             win=command == rule.WIN)
                # full_records.add3(bd, from_, action, reward, win=command==rule.WIN)
            elif activation == 'linear':
                records.add2(bd, from_, action, reward,
                             win=command == rule.WIN)
                # full_records.add2(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'selu':
                records.add4(bd, from_, action, reward,
                             win=command == rule.WIN)
                # full_records.add4(bd, from_, action, reward, win=command == rule.WIN)
            else:
                raise ValueError
            if command == rule.WIN:
                logging.info('%s WIN, stone:%s, step use: %s, epsilon:%s',
                             str(player), (board == player).sum(),
                             records.length(), nw.epsilon)
                return records, player
            if n_steps - records.length() > 500:
                # Too many moves spent cycling: score the game as a draw.
                logging.info('循环走子数过多: %s', records.length())
                records.clear()
                return records, 0
            player = -player
            if init == 'fixed':
                board = rule.flip_board(board)
            n_steps += 1
        except NoActionException:
            # One side has no legal moves after a random initial position.
            return Record(), 0
        except Exception as e:
            # Dump full context before re-raising so the failure can be replayed.
            logging.info('board is:\n%s', board)
            logging.info('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logging.info('valid is:\n%s', valid)
            logging.info('predict is:\n%s', nw.q_value)
            logging.info('valid action is:\n%s', nw.valid)
            logging.info('from:%s, action:%s', from_, action)
            records.save('records/train/1st_')
            raise e