def main(args):
    config = GameConfig(num_players=2, sandbox=args.sandbox,
                        feature_type=args.ftype, device=args.device)
    tree = GameTree(train=True)

    # Rollout model input size and hidden size derived from the feature vector length.
    D_in = config.feature_size
    H = (config.feature_size + 1) // 2

    # Both seats are played by the same MCTSPlayer instance, sharing one tree (self-play).
    player = MCTSPlayer(rollout=init_rollouts(args.rollout, D_in=D_in, H=H)[0], tree=tree)
    players = [player, player]

    env = DefaultEnvironment(config, players)
    train_mcts(env, tree, args.n,
               save_epochs=args.save_epochs,
               train_epochs=args.train_epochs,
               train_epochs_interval=args.train_epochs_interval,
               path=args.path,
               rollout_path=args.rollout_path,
               capacity=args.buffer_cap)
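# Hedged usage sketch (not part of the original): main(args) only needs an argparse-style
# namespace exposing the attributes it reads (sandbox, ftype, device, rollout, n,
# save_epochs, train_epochs, train_epochs_interval, path, rollout_path, buffer_cap).
# Every concrete value below is a placeholder assumption, not a recommended setting.
from argparse import Namespace

example_args = Namespace(sandbox=True, ftype='full', device='cpu', rollout='linear',
                         n=1000, save_epochs=100, train_epochs=10, train_epochs_interval=20,
                         path='models/mcts', rollout_path='models/rollout', buffer_cap=100000)
main(example_args)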
def run():
    parse = argparse.ArgumentParser(description="gomoku program")
    parse.add_argument("player1", type=int, choices=[1, 2, 3, 4],
                       help="1.Human; 2.MCTS; 3.Random; 4.Expert")
    parse.add_argument("player2", type=int, choices=[1, 2, 3, 4],
                       help="1.Human; 2.MCTS; 3.Random; 4.Expert")
    parse.add_argument("--size", type=int, default=8,
                       help="The board size, default is 8*8")
    parse.add_argument("--simulate_time", type=int, default=2,
                       help="The MCTS playout simulation time, default is 2s")
    args = parse.parse_args()

    chess = Gomoku(board_size=args.size)
    p1 = {
        1: HumanPlayer(chess),
        2: MCTSPlayer(chess, simulate_time=args.simulate_time),
        3: RandomPlayer(chess),
        4: ExpertPlayer(chess)
    }
    p2 = {
        1: HumanPlayer(chess),
        2: MCTSPlayer(chess, simulate_time=args.simulate_time),
        3: RandomPlayer(chess),
        4: ExpertPlayer(chess)
    }
    chess.play(p1[args.player1], p2[args.player2], isShow=True)
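# Hedged usage sketch (not part of the original): run() parses sys.argv, so assuming the
# function lives in a script (the module name "gomoku.py" is hypothetical), an
# MCTS-vs-Random game on a 9x9 board could be started from the shell with
#     python gomoku.py 2 3 --size 9 --simulate_time 5
# or, equivalently, from Python by setting sys.argv before calling run():
import sys

if __name__ == '__main__':
    sys.argv = ['gomoku.py', '2', '3', '--size', '9', '--simulate_time', '5']  # placeholder arguments
    run()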
def construct_player_model(self, player_model_str):
    if player_model_str == 'random':
        return RandomPlayer(draft=self)
    elif player_model_str.startswith('mcts'):
        max_iters, c = parse_mcts_maxiter_c(player_model_str)
        return MCTSPlayer(name=player_model_str, draft=self, maxiters=max_iters, c=c)
    elif player_model_str == 'assocrule':
        return AssocRulePlayer(draft=self)
    elif player_model_str == 'hwr':
        return HighestWinRatePlayer(draft=self)
    else:
        raise NotImplementedError
def generate_model(choice, param):
    if choice == 'player':
        from player import Player
        return Player(**param)
    elif choice == 'random':
        from player import RandomBot
        return RandomBot(**param)
    elif choice == 'mcts':
        from player import MCTSPlayer
        return MCTSPlayer(name=param['name'], c_puct=5, n_playout=1000, max_step=1000)
    else:
        from player import MyPolicy
        return MyPolicy(**param)
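# Hedged usage sketch (not part of the original): for choice == 'mcts' the function only
# reads param['name'] and hard-codes c_puct, n_playout and max_step, so a minimal call
# looks like this; the 'name' value is a placeholder. The other branches forward **param
# to constructors whose signatures are not shown here.
mcts_bot = generate_model('mcts', {'name': 'mcts_baseline'})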
def policy_evaluate(self, n_playout_ai=400, n_playout_mcts=100, n_games=10):
    """
    Policy evaluation: play n_games between the model and a pure-MCTS player and report the win rate.
    n_playout_ai    number of MCTS simulations per action for the AI player
    n_playout_mcts  number of MCTS simulations per action for the pure-MCTS player
    n_games         number of games used to estimate the win rate
    """
    logging.info("__policy_evaluate__")
    # AI player (uses the policy-value network to guide tree search and evaluate leaf nodes)
    ai_player = AIPlayer(self.policy_value_net.policy_value_fn, n_playout=n_playout_ai)
    # Pure MCTS player
    mcts_player = MCTSPlayer(n_playout=n_playout_mcts)
    win_cnt = {'ai': 0, 'mcts': 0, 'tie': 0}
    for i in range(n_games):
        # Alternate who moves first
        if i % 2 == 0:
            # AI moves first
            logging.info("policy evaluate start: {}, ai use W".format(i + 1))
            winner = self.game.start_play(ai_player, mcts_player)
            if winner == 0:
                win_cnt['ai'] += 1
            elif winner == 1:
                win_cnt['mcts'] += 1
            else:
                win_cnt['tie'] += 1
        else:
            # Pure MCTS moves first
            logging.info("policy evaluate start: {}, ai use B".format(i + 1))
            winner = self.game.start_play(mcts_player, ai_player)
            if winner == 0:
                win_cnt['mcts'] += 1
            elif winner == 1:
                win_cnt['ai'] += 1
            else:
                win_cnt['tie'] += 1
        # win_cnt[winner] += 1
        logging.info("policy evaluate res: {},{}".format(i + 1, win_cnt))
    # Win rate (ties count as half a win)
    win_ratio = 1.0 * (win_cnt['ai'] + 0.5 * win_cnt['tie']) / n_games
    logging.info(
        "evaluate n_playout_mcts:{}, win: {}, lose: {}, tie:{}".format(
            n_playout_mcts, win_cnt['ai'], win_cnt['mcts'], win_cnt['tie']))
    return win_ratio
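# Hedged usage sketch (not part of the original): `trainer` stands for a hypothetical
# instance of the class that owns policy_evaluate. A common pattern is to evaluate
# periodically and only keep a checkpoint when the win rate against the pure-MCTS
# baseline improves; `best_win_ratio` is a placeholder tracked by the caller.
win_ratio = trainer.policy_evaluate(n_playout_ai=400, n_playout_mcts=1000, n_games=10)
if win_ratio > best_win_ratio:
    best_win_ratio = win_ratio  # remember the strongest evaluation result so far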
# print(g.get_parents_to_root(">10"))
# keras.backend.clear_session()
# m = MCTSPlayer(numplayouts=20, movetime=10, ep=1.4142135623730950488)
# m = MinimaxPlayer(ev=None, depth=9)
# t = TicTacToe(r, m, verbose=True)
# print(t.play())

# Play 10 games between an MCTS player and a human and store the result in `dic`, keyed by `unit`.
m = HumanPlayer()
dic = {}
unit = 1.414
r = MCTSPlayer()
match = Match(r, m, True)
res = match.play(10)
dic[unit] = res
print(dic)

# m.startGame()
# b = Board()
# b.pushMove(0)
# b.pushMove(2)
# b.pushMove(3)
# b.pushMove(4)
# b.pushMove(6)
# print(m.board_already_in_gt(b))
class Blackjack():
    def __init__(self, game_state, policy_value_net):
        self.state = game_state
        self.figures = self.init_figures()
        self.buffer_value = []
        self.policy_value_net = policy_value_net
        self.p1 = MCTSPlayer(self.state, 'p1', self.policy_value_net.policy_value_fn,
                             n_playout=100, is_selfplay=1)   # self-play player, used for training
        self.p2 = MCTSPlayer(self.state, 'p2', self.policy_value_net.policy_value_fn,
                             n_playout=1000, is_selfplay=0)  # used for actually playing
        self.human = Human_Player('human')
        self.random_player = Player('random')
        self.pure_tree_playre = Pure_MCTS_Player(
            self.state, 'pure_tree', self.policy_value_net.policy_value_fn,
            n_playout=1000, is_selfplay=0)

    # Initialize the number pool and the availability mask
    def init_figures(self):
        figures = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        for _ in range(2):
            figures.append(random.randint(1, 10))
        self.figures = np.array(figures)
        self.availabel_figures = np.ones(12)
        self.state.update_current_state(self.figures, self.availabel_figures,
                                        p1_num=False, p2_num=False, p1_choi=-1, p2_choi=-1)

    # Initialize each player's starting number
    def init_player_figures(self):
        a = random.randint(15, 21)
        b = random.randint(19, 27)
        # Their sum must not exceed 21 * 2
        if a + b >= 21 * 2:
            a = a - (a + b - 21 * 2) / 2
            b = b - (a + b - 21 * 2) / 2
            a = int(a) - 1
            b = int(b) - 1
        # The sum must be odd
        if (a + b) % 2 == 0:
            b = b - 1
        # Randomly assign the two numbers to P1 and P2
        if random.random() >= .5:
            self.p1_num = a
            self.p2_num = b
        else:
            self.p1_num = b
            self.p2_num = a
        self.state.current_state[2] = self.p1_num
        self.state.current_state[3] = self.p2_num
        return self.p1_num, self.p2_num

    def who_first(self):
        if self.p1.num >= self.p2.num:
            return self.p1, self.p2
        else:
            return self.p2, self.p1

    def get_winner(self):
        if count_one(self.state.current_state[1], 1) <= 2:
            if self.state.current_state[2][0] <= 21 and self.state.current_state[3][0] <= 21:
                if self.state.current_state[2][0] >= self.state.current_state[3][0]:
                    winner = 0
                else:
                    winner = 1
            elif self.state.current_state[3][0] > 21:
                winner = 0
            else:
                winner = 1
        return winner

    # Play a game (pure-tree player vs. MCTS player)
    def start_game(self):
        print('=========START GAME==========')
        self.init_figures()          # initialize the number pool
        self.init_player_figures()   # initialize both players' numbers
        self.state.save_current_state()
        # print(self.state.current_state)
        for i in range(5):
            # ===== print the state =====
            print('********ROUND %i*********' % (i + 1))
            print(self.state.current_state[0])
            print(self.state.current_state[1])
            print(self.state.current_state[2])
            print(self.state.current_state[3])
            if self.state.current_state[2][0] > self.state.current_state[3][0]:
                # If row 3 is greater than row 4, the pure-tree player moves first.
                act, num = self.pure_tree_playre.get_action(self.state.current_state)  # pure-tree player's choice
                # act, num = self.random_player.get_action(self.state.current_state)   # random player's choice
                self.state.do_move(act)
                print('PTreePlayer Select No.%i fig: %i ' % (act, self.state.current_state[0][act - 1]))
                act_2nd, num_2nd = self.p2.get_action(self.state.current_state)  # MCTS player's choice
                self.state.do_move(act_2nd)
                print('MCTSPlayer Select No.%i fig: %i ' % (act_2nd, self.state.current_state[0][act_2nd - 1]))
            else:
                act_2nd, num_2nd = self.p2.get_action(self.state.current_state)  # MCTS player's choice
                self.state.do_move(act_2nd)
                print('MCTSPlayer Select No.%i fig: %i ' % (act_2nd, self.state.current_state[0][act_2nd - 1]))
                act, num = self.pure_tree_playre.get_action(self.state.current_state)  # pure-tree player's choice
                # act, num = self.random_player.get_action(self.state.current_state)
                self.state.do_move(act)
                print('PTreePlayer Select No.%i fig: %i ' % (act, self.state.current_state[0][act - 1]))
        if count_one(self.state.current_state[1], 1) <= 2:  # check whether the game has ended
            if self.state.current_state[2][0] <= 21 and self.state.current_state[3][0] <= 21:
                # If both totals are <= 21, the higher one wins
                if self.state.current_state[2][0] >= self.state.current_state[3][0]:
                    winner = 0  # pure tree
                else:
                    winner = 1  # MCTS
            elif self.state.current_state[3][0] > 21:
                winner = 0  # pure tree
            else:
                winner = 1  # MCTS
        return winner

    # Play a game against a human player
    def start_game_human(self):
        print('=========START GAME==========')
        self.init_figures()
        self.init_player_figures()
        self.state.save_current_state()
        # print(self.state.current_state)
        for i in range(5):
            print('********ROUND %i*********' % (i + 1))
            num_list = []
            for j in range(12):
                if self.state.current_state[1][j] == 1:
                    num_list.append(int(self.state.current_state[0][j]))
                else:
                    num_list.append(0)
            print('Number list: ', num_list)
            print('Action list: ', list(range(1, 13)))
            print('your number: ', self.state.current_state[2][0])
            print('opponent number: ', self.state.current_state[3][0])
            if self.state.current_state[2][0] > self.state.current_state[3][0]:
                # P1 moves first; P1 is the random player or the human
                act, num = self.human.get_action(self.state.current_state)
                self.state.do_move(act)
                print('Your choice: [%i]  number: [%i]' % (act + 1, self.state.current_state[0][act]))
                act_2nd, num_2nd = self.p2.get_action(self.state.current_state)
                self.state.do_move(act_2nd)
                print('Opponent choice: [%i]  number: [%i]' % (act_2nd + 1, self.state.current_state[0][act_2nd]))
            else:
                act_2nd, num_2nd = self.p2.get_action(self.state.current_state)
                self.state.do_move(act_2nd)
                print('Opponent choice: [%i]  number: [%i]' % (act_2nd + 1, self.state.current_state[0][act_2nd]))
                act, num = self.human.get_action(self.state.current_state)
                self.state.do_move(act)
                print('Your choice: [%i]  number: [%i]' % (act + 1, self.state.current_state[0][act]))
        if count_one(self.state.current_state[1], 1) <= 2:
            if self.state.current_state[2][0] <= 21 and self.state.current_state[3][0] <= 21:
                if self.state.current_state[2][0] >= self.state.current_state[3][0]:
                    winner = 0
                else:
                    winner = 1
            elif self.state.current_state[3][0] > 21:
                winner = 0
            else:
                winner = 1
        return winner

    # Self-play to generate training data
    def start_self_play(self):
        states, mcts_probs, current_players, buffer_value = [], [], [], []
        run_down_list = []
        self.init_figures()              # initialize the shared number pool
        self.init_player_figures()       # initialize each player's own number
        self.state.save_current_state()  # save into current_state
        # ===== start a self-play game =====
        for _ in range(5):  # play 5 rounds
            # Decide who moves first from the state.
            if self.state.current_state[2][0] > self.state.current_state[3][0]:
                # run_down_list records which player moved first each round
                run_down_list.append(0)
                run_down_list.append(1)
            else:
                run_down_list.append(1)
                run_down_list.append(0)
            # Key step: choose one action; the choice comes from the MCTS simulations.
            act1, act1_porbs = self.p1.get_action(self.state.current_state)
            self.state.do_move(act1)  # execute the action and move to the next state
            states.append((copy.copy(self.state.current_state)).reshape(
                -1, 6, 12, 1).astype('float32'))  # save the state for later network training
            mcts_probs.append(np.array(act1_porbs).astype('float32'))  # save act1_porbs for later network training
            # print('======change player========')
            act2, act2_porbs = self.p1.get_action(self.state.current_state)
            self.state.do_move(act2)
            states.append((copy.copy(self.state.current_state)).reshape(
                -1, 6, 12, 1).astype('float32'))
            mcts_probs.append(np.array(act2_porbs).astype('float32'))
        # After 5 rounds, compute the winner
        winner = self.get_winner()
        if winner == 0:
            print('winner: p1')
        else:
            print('winner: p2')
        # Assign the value targets according to the result
        for p in run_down_list:
            if p != winner:
                # if p == winner:
                buffer_value.append(np.ones(12).astype('float32'))
            else:
                buffer_value.append((np.ones(12) * (-1)).astype('float32'))
        self.p1.reset_player()
        self.p2.reset_player()
        # Return the states, action probabilities, and value targets.
        return zip(states, mcts_probs, buffer_value)
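# Hedged usage sketch (not part of the original): a self-play data-collection loop built
# on the Blackjack class above. `game_state` and `policy_value_net` are assumed to be
# constructed elsewhere (their constructors are not shown), and `n_selfplay_games` is a
# placeholder.
game = Blackjack(game_state, policy_value_net)
data_buffer = []
for _ in range(n_selfplay_games):
    play_data = list(game.start_self_play())  # [(state, mcts_probs, value), ...]
    data_buffer.extend(play_data)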
class MCTS:
    def __init__(self, T: int, n: int, tau: float, rollout: Rollout, eps: float):
        # initialize game config
        self.game_config = GameConfig(StartingSplit.StartingRandomSplit,
                                      prosperity=False, num_players=1, sandbox=True)
        self.supply = Supply(self.game_config)
        self.game = None
        # max number of turns in a game
        self.T = T
        self.expanded = False
        self.rollout_model = rollout
        self.data = MCTSData()
        self.player = None
        self.iter = 0
        self.iters = n

        if self.rollout_model == Rollout.Random:
            self.rollout = RandomRollout()
        elif rollout == Rollout.HistoryHeuristic:
            self.rollout_cards = []
            self.rollout = HistoryHeuristicRollout(tau=tau, train=True)
        elif rollout == Rollout.LinearRegression:
            self.rollout = LinearRegressionRollout(self.iters, self.supply,
                                                   tau=tau, train=True, eps=eps)

        self.player = MCTSPlayer(rollout=self.rollout, train=True)

    def run(self):
        s = self.game.state
        d: DecisionState = s.decision
        tree_score = 0

        # run the game up to game end or turn limit reached
        while d.type != DecisionType.DecisionGameOver and s.player_states[0]._turns < self.T:
            if d.text:
                logging.info(d.text)
            response = DecisionResponse([])
            player = self.game.players[d.controlling_player]
            next_node = player.controller.makeDecision(s, response)

            if s.phase == Phase.BuyPhase:
                # apply selection until leaf node is reached
                if next_node:
                    assert next_node == self.player.node
                    self.player.node.n += 1
                elif not self.expanded:
                    # expand one node
                    cards = list(filter(lambda x: not isinstance(x, Curse), d.card_choices + [None]))
                    self.player.node.add_unique_children(cards)
                    self.expanded = True
                    self.player.node = self.player.node.get_child_node(response.single_card)
                    self.player.node.n += 1
                    # Uncomment to track UCT score within the tree
                    tree_score = self.game.get_player_scores()[0]
                    self.data.update_split_scores(tree_score, False, self.iter)
                elif self.rollout_model == Rollout.HistoryHeuristic:
                    self.rollout_cards.append(response.single_card)

            s.process_decision(response)
            s.advance_next_decision()

        score = self.game.get_player_scores()[0]
        # update data
        self.data.update_split_scores(score - tree_score, True, self.iter)

        # backpropagate
        delta = score
        self.player.node.v += delta
        self.player.node = self.player.node.parent
        while self.player.node != self.player.root:
            self.player.node.update_v(lambda x: sum(x) / len(x))
            self.player.node = self.player.node.parent

        # update history heuristic
        if self.rollout_model == Rollout.HistoryHeuristic:
            self.rollout.update(cards=self.rollout_cards, score=score)
        elif self.rollout_model == Rollout.LinearRegression:
            counts = self.game.state.get_card_counts(0)
            self.rollout.update(counts=counts, score=score, i=self.iter)

        return self.game.get_player_scores()[0]

    def reset(self, i: int):
        self.expanded = False
        self.rollout_cards = []
        self.iter = i
        self.game_config = GameConfig(StartingSplit.StartingRandomSplit,
                                      prosperity=False, num_players=1, sandbox=True)
        self.game = Game(self.game_config, [self.player])
        self.game.new_game()
        self.game.state.advance_next_decision()
        self.player.reset(self.game.state.player_states[0])

    def train(self, n: int, output_iters: int, save_model=False, model_dir=model_dir,
              model_name='mcts', save_data=False, data_dir=data_dir, data_name='data'):
        avg = 0
        for i in tqdm(range(n)):
            # initialize new game
            self.reset(i)
            self.run()
            self.data.update(self.game, self.player, i)
            avg = sum(self.data.scores) / (i + 1)
            if i > 0 and i % output_iters == 0:
                print(f'Last {output_iters} avg: {sum(self.data.scores[i-output_iters:i]) / output_iters}')
                print(f'Total {i} avg: {avg}')

        if save_model:
            save(os.path.join(model_dir, model_name), self.player.root)
            save(os.path.join(model_dir, f'{model_name}_rollout'), self.rollout)

        if save_data:
            self.data.update_dataframes()
            self.data.augment_avg_scores(100)
            save(os.path.join(data_dir, data_name), self.data)
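# Hedged usage sketch (not part of the original): a driver consistent with the
# constructor and train() signatures above; the hyperparameter values are placeholders.
mcts = MCTS(T=30, n=10000, tau=0.5, rollout=Rollout.Random, eps=0.1)
mcts.train(n=10000, output_iters=100, save_model=True, save_data=True)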
def start_infer(self, vs_type='human-vs-ai', n_playout=400, best_model=None):
    """
    Start a head-to-head game.
    Params:
        vs_type     matchup type
        n_playout   number of MCTS simulations per action for the AI player
        best_model  model used by the AIPlayer
    """
    logging.info("__start_vsplay__")
    # 1. Initialize the board
    self.board.init_board()
    # 2. Initialize the players
    # Initialize the AI player
    # from net.policy_value_net_keras import PolicyValueNet  # Keras
    from net.policy_value_net_tensorflow import PolicyValueNet  # Tensorflow
    best_policy = PolicyValueNet(self.board.action_ids_size, model_file=best_model)
    ai_player = AIPlayer(best_policy.policy_value_fn, n_playout=n_playout)
    # Initialize the pure MCTS player
    mcts_player = MCTSPlayer(n_playout=n_playout)
    # Initialize the human player; move commands use the format: Nf3
    human_player = HumanPlayer()
    # Initialize the MiniMax player
    minimax_player = MiniMaxPlayer(depth=4)
    # Initialize the Stockfish player
    stockfish_player = StockfishPlayer()
    # 3. Start the game
    logging.info("vsplay start: {}".format(vs_type))
    if vs_type == 'human-vs-ai':
        self.start_play(human_player, ai_player, vsprint=True)
    elif vs_type == 'human-vs-mcts':
        self.start_play(human_player, mcts_player, vsprint=True)
    elif vs_type == 'human-vs-minimax':
        self.start_play(human_player, minimax_player, vsprint=True)
    elif vs_type == 'human-vs-stockfish':
        self.start_play(human_player, stockfish_player, vsprint=True)
    elif vs_type == 'ai-vs-human':
        self.start_play(ai_player, human_player, vsprint=True, angle_player=Board.BLACK)
    elif vs_type == 'mcts-vs-human':
        self.start_play(mcts_player, human_player, vsprint=True, angle_player=Board.BLACK)
    elif vs_type == 'minimax-vs-human':
        self.start_play(minimax_player, human_player, vsprint=True, angle_player=Board.BLACK)
    elif vs_type == 'stockfish-vs-human':
        self.start_play(stockfish_player, human_player, vsprint=True, angle_player=Board.BLACK)
    else:
        exit("undefined vs-type: {}".format(vs_type))
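# Hedged usage sketch (not part of the original): `game` stands for a hypothetical
# instance of the class that owns start_infer, and the checkpoint path is a placeholder.
game.start_infer(vs_type='human-vs-ai', n_playout=400, best_model='models/best_policy.model')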