# Imports assumed by this excerpt (the methods below come from several
# classes; `Edge`, `ValueNetwork`, and the tree/lock attributes they use
# are expected to be in scope in their own modules).
import logging

import numpy as np

import rule
import util
from record import Record
from value_network import NoActionException

logger = logging.getLogger(__name__)


def simulate(self, ts, board, player):
    records = Record()
    while True:
        try:
            bd = board.copy()
            board_str = util.board_str(board)
            valid_action = rule.valid_actions(board, player)
            # Re-sample until we get an action this position has not produced
            # before, unless only one edge is left at the root.
            while True:
                (from_, act), q = self.epsilon_greedy(board, player, valid_action, ts)
                if (board_str, from_, act) not in self.predicts or len(ts.root.sub_edge) == 1:
                    break
                ts.root.sub_edge = [e for e in ts.root.sub_edge if e.a != (from_, act)]
                valid_action.remove((from_, act))
            assert board[from_] == player
            ts.move_down(board, player, action=(from_, act))
            if self.episode % 10 == 0:
                logger.info('action:%s,%s', from_, act)
                logger.info('q is %s', q)
            to_ = tuple(np.add(from_, rule.actions_move[act]))
            command, eat = rule.move(board, from_, to_)
            records.add3(bd, from_, act, len(eat), win=command == rule.WIN)
        except NoActionException:
            # After a random initial position one side may have no legal move.
            return Record(), 0
        except Exception as ex:
            logger.warning('board is:\n%s', board)
            logger.warning('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logger.warning('valid is:\n%s', valid)
            logger.warning('from_:%s, act:%s', from_, act)
            ts.show_info()
            records.save('records/train/1st_')
            raise ex
        if command == rule.WIN:
            logger.info('%s WIN, step use: %s, epsilon:%s',
                        str(player), records.length(), self.epsilon)
            return records, player
        if records.length() > 10000:
            logger.info('too many moves: %s', records.length())
            return Record(), 0
        player = -player
        board = rule.flip_board(board)
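
# util.board_str is presumably the same flattening used inline in
# policy_by_epsilon_greedy below; a minimal sketch under that assumption:
def board_str_sketch(board):
    # serialize the 5x5 board into a hashable key for repeat detection
    return ''.join(map(str, board.flatten()))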

def probabilities(self, board, player):
    # Q-values for every valid action, laid out as a (5, 5, 4) tensor
    # indexed by source square (row, col) and move direction.
    valid = rule.valid_actions(board, player)
    qs = [self.q(board, from_, action) for from_, action in valid]
    q2 = np.zeros((5, 5, 4))
    for (from_, action), q in zip(valid, qs):
        q2[from_][action] = q
    return q2
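
# Usage sketch (`policy`, `board`, `player` are stand-ins): the (5, 5, 4)
# layout is what lets expansion() below read a prior with a plain index.
q2 = policy.probabilities(board, player)
for from_, act in rule.valid_actions(board, player):
    prior = q2[from_][act]  # value for moving the piece at from_ in direction act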

def expansion(self):
    board, player = self.board, self.player
    actions = rule.valid_actions(board, player)
    # actions_ = list(filter(lambda a: (self.board_str, *a) not in walked, actions))
    # if len(actions_) == 0:  # all of them already walked; re-select
    #     actions_ = actions
    if self.player == self.tree.player:
        with self.tree.value_model_lock:
            values = [self.tree.value_model.q(board, from_, act)
                      for from_, act in actions]
    else:
        with self.tree.opp_value_model_lock:
            values = [self.tree.opp_value_model.q(board, from_, act)
                      for from_, act in actions]
    probs = ValueNetwork.value_to_probs(values)
    for a, v, p in zip(actions, values, probs):
        e = Edge(upper_node=self, a=a, v=v, p=p, lambda_=self.tree.lambda_)
        self.add_edge(e)
    self.expanded = True
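
# ValueNetwork.value_to_probs is not shown in this excerpt; a minimal
# sketch assuming it is a numerically stable softmax over the Q-values
# (the actual normalization may differ):
def value_to_probs_sketch(values):
    values = np.asarray(values, dtype=np.float64)
    exp = np.exp(values - values.max())  # subtract the max for stability
    return exp / exp.sum()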

def policy_by_probs(self, board, player):
    valid = rule.valid_actions(board, player)
    q = None
    self.set_pre(q, valid, q)
    if len(valid) == 0:
        raise NoActionException
    qs = [self.q(board, from_, act) for from_, act in valid]
    probs = self.value_to_probs(qs)
    # sample an action proportionally to the value-derived probabilities
    action = util.select_by_prob(valid, probs)
    if self.episode % 10 == 0:
        logger.info('action:%s', action)
        logger.info('q:%s', qs)
        logger.info('probs:%s', probs)
    return action

def policy_by_epsilon_greedy(self, board, player):
    valid = rule.valid_actions(board, player)
    q = None
    self.set_pre(q, valid, q)
    if len(valid) == 0:
        raise NoActionException
    board_str = ''.join(map(str, board.flatten()))
    (from_, action), q = self.epsilon_greedy_probs(board, valid, q)
    self.predicts.add((board_str, from_, action))
    self.set_pre(q, valid, None)
    if self.episode % 10 == 0:
        logger.info('action:%s,%s', from_, action)
        # logger.info('valid:%s', valid)
        logger.info('q:%s', q)
    return from_, action
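
# The epsilon-greedy selection itself is not shown in this excerpt. A
# minimal sketch of the assumed behaviour: with probability epsilon pick
# a uniformly random valid action, otherwise the Q-argmax. The no-repeat
# variant below pops entries from the returned Q list, so the sketch
# returns the whole list alongside the chosen action.
def epsilon_greedy_sketch(self, board, valid, q):
    qs = q if q is not None else [self.q(board, f, a) for f, a in valid]
    if np.random.random() < self.epsilon:
        i = np.random.randint(len(valid))
    else:
        i = int(np.argmax(qs))
    return valid[i], qs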

def expansion(self):
    board, player = self.board.copy(), self.player
    probs = self.tree.policy.probabilities(board, player)
    for action in rule.valid_actions(board, player):
        v = 0
        from_, act = action
        # skip actions this position has already produced during self-play
        if (self.board_str, player, action) in self.tree.worker.predicts:
            continue
        p = probs[from_][act]
        e = Edge(upper_node=self, a=action, v=v, p=p, lambda_=self.tree.lambda_)
        self.add_edge(e)
    self.expanded = True
    assert len(self.sub_edge) > 0, \
        'board:\n' + str(self.board) + '\nplayer:' + str(self.player)

def policy_by_epsilon_greedy_no_repeat(self, board, player):
    valid = rule.valid_actions(board, player)
    q = None
    self.set_pre(q, valid, q)
    if len(valid) == 0:
        raise NoActionException
    board_str = ''.join(map(str, board.flatten()))
    while True:
        (from_, action), q = self.epsilon_greedy(board, valid, q)
        if (board_str, from_, action) not in self.predicts or len(valid) == 1:
            self.predicts.add((board_str, from_, action))
            self.set_pre(q, valid, None)
            if self.episode % 10 == 0:
                logger.info('action:%s,%s', from_, action)
                # logger.info('valid:%s', valid)
                logger.info('q:%s', q)
            return from_, action
        else:
            # drop the already-walked action and choose another one
            idx = valid.index((from_, action))
            valid.pop(idx)
            if q:
                q.pop(idx)

def simulate(nw0, nw1, activation, init='fixed'):
    np.random.seed(util.rand_int32())
    player = 1 if np.random.random() > 0.5 else -1
    logger.info('init:%s, player:%s', init, player)
    board = rule.init_board(player) if init == 'fixed' else rule.random_init_board()
    records = Record()
    # full_records = Record()
    boards = set()  # {(board_str, player)}
    nws = [None, nw0, nw1]
    n_steps = 0
    while True:
        nw = nws[player]  # nw0 if player == 1 else nw1
        try:
            bd = board.copy()
            board_str = util.board_str(board)
            if (board_str, player) in boards:
                # A cycle was found: train both networks on it with the
                # target set to 0.5 (a draw), then drop the cycle.
                found = False
                records2 = Record()
                # boards and records grow in lockstep, so len(boards) is
                # also the number of recorded moves.
                for i in range(len(boards) - 1, -1, -1):
                    b, f, a, _, _ = records[i]
                    if (b == board).all() and b[f] == player:
                        found = True
                        break
                assert found, (board, player)
                records2.records = records.records[i:]
                records2.draw()
                nw0.train(records2)
                nw1.train(records2)
                # remove the cycle's data
                records.records = records.records[:i]
                for b, f, a, _, _ in records2:
                    boards.remove((util.board_str(b), b[f]))
                logger.info('cycle:%s, records:%s, epsilon:%s',
                            len(records2), records.length(), nw.epsilon)
            boards.add((board_str, player))
            from_, action = nw.policy(board, player)
            assert board[from_] == player
            to_ = tuple(np.add(from_, rule.actions_move[action]))
            command, eat = rule.move(board, from_, to_)
            reward = len(eat)
            if activation == 'sigmoid':
                records.add3(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add3(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'linear':
                records.add2(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add2(bd, from_, action, reward, win=command == rule.WIN)
            elif activation == 'selu':
                records.add4(bd, from_, action, reward, win=command == rule.WIN)
                # full_records.add4(bd, from_, action, reward, win=command == rule.WIN)
            else:
                raise ValueError('unknown activation: %s' % activation)
            if command == rule.WIN:
                logger.info('%s WIN, stone:%s, step use: %s, epsilon:%s',
                            str(player), (board == player).sum(),
                            records.length(), nw.epsilon)
                return records, player
            if n_steps - records.length() > 500:
                # more than 500 moves were discarded as cycles: call it a draw
                logger.info('too many cyclic moves: %s', records.length())
                records.clear()
                return records, 0
            player = -player
            if init == 'fixed':
                board = rule.flip_board(board)
            n_steps += 1
        except NoActionException:
            # after a random initial position one side may have no legal move
            return Record(), 0
        except Exception as e:
            logger.info('board is:\n%s', board)
            logger.info('player is: %s', player)
            valid = rule.valid_actions(board, player)
            logger.info('valid is:\n%s', valid)
            logger.info('predict is:\n%s', nw.q_value)
            logger.info('valid action is:\n%s', nw.valid)
            logger.info('from:%s, action:%s', from_, action)
            records.save('records/train/1st_')
            raise e
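
# Usage sketch for one self-play game; the ValueNetwork constructor
# arguments are hypothetical, only simulate()'s signature and the
# train() call come from the code above.
nw0, nw1 = ValueNetwork(), ValueNetwork()
records, winner = simulate(nw0, nw1, activation='sigmoid', init='fixed')
if winner != 0:  # 0 means a draw or an aborted game
    nw0.train(records)
    nw1.train(records)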

def predict(self, board, player):
    valid = rule.valid_actions(board, player)
    q = [self.q(board, from_, action) for from_, action in valid]
    return self.pi_star(valid, q), (valid, q)
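
# pi_star is not shown in this excerpt; a sketch assuming it is the
# greedy policy, i.e. the valid action with the highest Q-value (the
# real implementation may handle ties differently):
def pi_star_sketch(self, valid, q):
    return valid[int(np.argmax(q))]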

def maxq(self, board, player):
    q = [self.q(board, from_, action)
         for from_, action in rule.valid_actions(board, player)]
    return max(q)