def main(debug=False):
    model_file = os.path.join(curr_dir, "../model/best_model_15_5.pth")
    policy_value_net = PolicyValueNet(size, model_file=model_file)
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind("tcp://*:5555")
    print("Server started on port 5555")
    while True:
        message = socket.recv()
        try:
            message = message.decode('utf-8')
            actions = json.loads(message)
            print("Received: %s" % message)
            start = datetime.now()
            mcts_player = MCTSPlayer(policy_value_net.policy_value_fn, c_puct=c_puct,
                                     n_playout=n_playout, is_selfplay=0)
            # result = predict
            game = FiveChess(size=size, n_in_row=n_in_row)
            for act in actions:
                step = (act[0], act[1])
                game.step_nocheck(step)
            action, value = mcts_player.get_action(game, return_value=1)
            result = {"action": action, "value": value}
            print(result)
            print('time used: {} sec'.format((datetime.now() - start).total_seconds()))
            socket.send_string(json.dumps(result, ensure_ascii=False))
        except Exception as e:
            traceback.print_exc()
            socket.send_string(json.dumps({"error": str(e)}, ensure_ascii=False))
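# A minimal client sketch for the server above (hypothetical, not part of the
# original source; it assumes the JSON protocol shown there: the request body
# is a list of [x, y] moves, the reply is {"action": ..., "value": ...} or
# {"error": ...}).
import json
import zmq

def request_move(actions, endpoint="tcp://localhost:5555"):
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect(endpoint)
    socket.send_string(json.dumps(actions))  # e.g. [[7, 7], [8, 8]]
    reply = json.loads(socket.recv().decode('utf-8'))
    return reply

if __name__ == '__main__':
    print(request_move([[7, 7], [8, 8]]))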
def policy_evaluate(self):
    """
    Evaluate the trained policy by rolling out a molecule-editing episode
    Note: this is only for monitoring the progress of training
    """
    player = MCTSPlayer(self.policy_value_net.policy_value,
                        c_puct=self.c_puct, n_playout=30)
    environment = Molecule(["C", "O", "N"],
                           init_mol=self.mol,
                           allow_removal=True,
                           allow_no_modification=False,
                           allow_bonds_between_rings=False,
                           allowed_ring_sizes=[5, 6],
                           max_steps=10,
                           target_fn=None,
                           record_path=False)
    environment.initialize()
    environment.init_qed = QED.qed(Chem.MolFromSmiles(self.mol))
    moves, fp, _S_P, _Qs = player.get_action(environment, temp=self.temp,
                                             return_prob=1, rand=False)
    return moves, _S_P, _Qs
def run():
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(curr_dir, './model/')
    model_file = os.path.join(model_dir, 'model.pth')
    try:
        agent = Agent()
        # agent.limit_piece_count = 8
        # agent.limit_max_height = 10
        env = TetrominoEnv(agent.tetromino)
        # policy-value network
        net_policy = PolicyValueNet(10, 20, 5, model_file=model_file)
        mcts_ai_player = MCTSPlayer(net_policy.policy_value_fn, c_puct=1, n_playout=64)
        # agent.start_play(mcts_ai_player, env)
        while not agent.terminal:
            if agent.curr_player == 0:
                # act_probs, value = net_policy.policy_value_fn(agent)
                # act = max(act_probs, key=lambda act_prob: act_prob[1])[0]
                # print(act, act_probs, value)
                act = mcts_ai_player.get_action(agent)
            else:
                act = 4
            agent.step(act, env)
            agent.print()
    except KeyboardInterrupt:
        print('quit')
def collect_selfplay_data(self, i):
    """Collect self-play data for training."""
    # Self-play using MCTS (Monte Carlo tree search)
    logging.info("TRAIN Self Play starting ...")
    agent = Agent(size, n_in_row, is_shown=0)
    # Create an MCTS player that uses the policy-value network to guide
    # tree search and evaluate leaf nodes
    if i % 2 == 0:
        mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                 c_puct=self.c_puct,
                                 n_playout=self.n_playout,
                                 is_selfplay=1)
        pure_mcts_player = None
        mcts_player.mcts._limit_max_var = False
    else:
        if os.path.exists(best_model_file):
            best_policy_value_net = PolicyValueNet(size, model_file=best_model_file)
        else:
            best_policy_value_net = self.policy_value_net
        mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                 c_puct=self.c_puct,
                                 n_playout=self.n_playout,
                                 is_selfplay=1)
        pure_mcts_player = MCTSPlayer(best_policy_value_net.policy_value_fn,
                                      c_puct=self.c_puct + 0.5,
                                      n_playout=self.n_playout,
                                      is_selfplay=1)
        mcts_player.mcts._limit_max_var = False
        pure_mcts_player.mcts._limit_max_var = False
    # Start playing
    winner, play_data = agent.start_self_play(mcts_player, pure_mcts_player,
                                              temp=self.temp)
    agent.game.print()
    if winner is None or play_data is None:
        print("give up this agent")
        return
    if pure_mcts_player is not None:
        if winner == mcts_player.player:
            self.c_puct_win[0] = self.c_puct_win[0] + 1
        else:
            self.c_puct_win[1] = self.c_puct_win[1] + 1
    play_data = list(play_data)[:]
    # Augment the dataset with flipped boards
    play_data = self.get_equi_data(play_data)
    logging.info("Self Play end. length:%s saving ..." % len(play_data))
    logging.info("c_puct:{}/{} = {}/{}".format(self.c_puct, self.c_puct + 0.5,
                                               self.c_puct_win[0], self.c_puct_win[1]))
    # Save the training data
    for obj in play_data:
        self.save_wait_data(obj)
def initPlayers(self):
    self.width = 9
    self.height = 9
    self.board = Board(width=self.width, height=self.height, n_in_row=5)
    self.mcts_player = MCTSPlayer(c_puct=5, n_playout=1000)
    self.human_player = HumanPlayer()
    self.start_player = 0  # 0 - human, 1 - mcts_player
    self.board.init_board(self.start_player)
    p1, p2 = self.board.players
    self.human_player.set_player_id(p1)
    self.mcts_player.set_player_id(p2)
    self.players = {p2: self.mcts_player, p1: self.human_player}
    self.board.show(self.human_player.playerId, self.mcts_player.playerId)
def init(self):
    self.dataset = list()
    computerA = MCTSPlayer(value_function=self.ai.value_function,
                           c_puct=self.c_puct,
                           n_playout=self.n_playout,
                           is_selfplay=True,
                           role='Self_A',
                           verbose=self.verbose)
    computerB = MCTSPlayer(value_function=self.ai.value_function,
                           c_puct=self.c_puct,
                           n_playout=self.n_playout,
                           is_selfplay=True,
                           role='Self_B',
                           verbose=self.verbose)
    self.gameenigne = GameEngine(playerA=computerA, playerB=computerB,
                                 verbose=self.verbose)
def __init__(self, init_model=True):
    self.config = GomokuConfig()
    # params of the board and the game
    self.board = GomokuBase(width=self.config.board_width,
                            height=self.config.board_height,
                            n_to_win=self.config.n_to_win,
                            use_forbidden=self.config.use_forbidden)
    self.game = GomokuServer(self.board)
    # training params
    self.data_buffer = deque(maxlen=self.config.buffer_size)
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    # self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.config.board_width,
                                               self.config.board_height,
                                               model_file=self.config.model_path)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.config.board_width,
                                               self.config.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.config.c_puct,
                                  n_playout=self.config.n_playout,
                                  is_selfplay=True)
def run():
    size = 15     # board size
    n_in_row = 5  # number of stones in a row to win
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(curr_dir, './model/')
    model_file = os.path.join(model_dir, 'model_%s_%s.pth' % (size, n_in_row))
    try:
        agent = Agent(size=size, n_in_row=n_in_row)
        # ############### human VS AI ###################
        # policy-value network
        net_policy = PolicyValueNet(size, model_file=model_file)
        mcts_ai_player = MCTSPlayer(net_policy.policy_value_fn, c_puct=4,
                                    n_playout=500, is_selfplay=0)
        # pure MCTS player
        # mcts_player = MCTSPurePlayer(c_puct=5, n_playout=2000)
        # human player
        human = Human(agent, is_show=1)
        # set start_player=0 so the AI moves first
        agent.start_play(mcts_ai_player, human, start_player=0)
        agent.game.print()
        agent.env.close()
        # agent.start_play(human, human, start_player=0 if random.random() > 0.5 else 1)
    except KeyboardInterrupt:
        print('quit')
def __init__(self, board_width, board_height, net_params=None):
    # init network parameters
    self.learning_rate = 5e-3
    self.l2_const = 1e-4  # coef of l2 penalty
    self.lr_multiplier = 1.0
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # number of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # number of train steps for each update
    self.kl_targ = 0.025
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    # init env
    self.board = Board()
    self.game = Game(self.board)
    self.board_width = board_width
    self.board_height = board_height
    self.create_policy_value_net()
    self._loss_train_op()
    # init mcts player
    self.mcts_player = MCTSPlayer(self.policy_value_fn,
                                  self.board.get_current_player(),
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    # board params
    self.game = Quoridor()
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0
    self.n_playout = 400
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 128  # use 1 for testing
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 1500
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet()
    # set up the AI player
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None):
    self.game = Quoridor()
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0
    self.temp = 1.0
    self.n_playout = 200
    self.c_puct = 5
    self.buffer_size = 10000
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.kl_targ = 0.02
    self.check_freq = 10
    self.game_batch_num = 1000
    self.best_win_ratio = 0.0
    self.pure_mcts_playout_num = 1000
    self.old_probs = 0
    self.new_probs = 0
    self.first_trained = False
    if init_model:
        self.policy_value_net = PolicyValueNet(model_file=init_model)
    else:
        self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self, init_model=None, size=8):
    # 8x8 board, five in a row to win
    self.board_width = size
    self.board_height = size
    self.n_in_row = 5  # number of stones in a row to win
    self.policy_evaluate_size = 10  # number of games played when evaluating the policy win rate
    self.game_batch_num = 10000  # number of self-play games
    self.batch_size = 512  # start model training once data_buffer holds more than this many games
    self.check_freq = 50  # every n games, check the win rate of the current model vs. the old model
    self.board = Board(width=self.board_width,
                       height=self.board_height,
                       n_in_row=self.n_in_row)
    self.game = Game(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptive learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # number of simulations per move
    self.buffer_size = 10000  # number of cached game records
    self.data_buffer = deque(maxlen=self.buffer_size)  # full game history used for training
    self.play_batch_size = 1
    self.epochs = 5  # number of training steps per policy-value network update
    self.kl_targ = 0.02  # KL target for the policy-value network
    self.best_win_ratio = 0.0
    # number of playouts for the pure MCTS opponent used to evaluate the policy model
    self.pure_mcts_playout_num = 1000
    self.c_puct = 5  # exploration constant for MCTS child selection
    if init_model:
        # use a pre-trained policy-value network
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # use a new policy-value network
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    # Create an MCTS player that uses the policy-value network to guide
    # tree search and evaluate leaf nodes
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def __init__(self):
    # params of the board and the game
    self.board_width = 6
    self.board_height = 6
    self.n_in_row = 4
    self.board = ShogiBoard()
    # training params
    self.learn_rate = 5e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.025
    self.check_freq = 50
    self.game_batch_num = 3000
    self.best_win_ratio = 0.0
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    # start training from a given policy-value net
    # policy_param = pickle.load(open('current_policy.model', 'rb'))
    # self.policy_value_net = PolicyValueNet(self.board_width, self.board_height,
    #                                        net_params=policy_param)
    # start training from a new policy-value net
    self.policy_value_net = PolicyValueNet()
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def policy_evaluate(self):
    """Policy win-rate evaluation: play the current model against the best model for n games."""
    # If no best model exists yet, save the current model as the best model
    if not os.path.exists(best_model_file):
        self.policy_value_net.save_model(best_model_file)
        return
    # The currently trained model
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    if self.best_policy_value_net is None:
        self.best_policy_value_net = PolicyValueNet(size, model_file=best_model_file)
    best_mcts_player = MCTSPlayer(self.best_policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout)
    current_mcts_player.mcts._limit_max_var = False
    best_mcts_player.mcts._limit_max_var = False
    agent = Agent(size, n_in_row, is_shown=0)
    winner, play_data = agent.start_self_evaluate(current_mcts_player,
                                                  best_mcts_player,
                                                  temp=self.temp,
                                                  start_player=sum(self.best_win) % 2)
    if winner == current_mcts_player.player:
        self.best_win[0] = self.best_win[0] + 1
        print("Curr Model Win!", "win:", self.best_win[0], "lost:", self.best_win[1])
    if winner == best_mcts_player.player:
        self.best_win[1] = self.best_win[1] + 1
        print("Curr Model Lost!", "win:", self.best_win[0], "lost:", self.best_win[1])
    agent.game.print()
    # Save the training data
    play_data = list(play_data)[:]
    play_data = self.get_equi_data(play_data)
    logging.info("Eval Play end. length:%s saving ..." % len(play_data))
    for obj in play_data:
        self.save_wait_data(obj)
def main():
    width = 15
    height = 15
    net = PolicyValueNet(width, height)
    player = MCTSPlayer(net, n_playout=1000, is_selfplay=True)
    trainer = Trainer(width, height, net)
    for i in range(1500):
        print("episode " + str(i) + "...\n")
        board = NewBoard(width, height)
        trainer.simulate(board, player)
        trainer.train()
class Engine():
    def __init__(self, time=5, policy=None):
        self.player = MCTSPlayer(time, policy)

    def set_komi(self, komi):
        go.KOMI = komi

    def set_size(self, size):
        go.init(size)

    def clear(self):
        self.player.clear()

    def debug(self, info=''):
        print(self.player.debug_info + info)

    def move(self, color, vertex=None):
        legal = True
        if vertex is None:
            vertex = self.player.move()
            legal = vertex is not None
        else:
            legal = go.move(vertex)
        if legal:
            take = go.get_take(go.POSITION)
            return go.toJI(go.POSITION.vertex), {go.toJI(v) for v in take}
        else:
            return None, set()

    def get_score(self):
        return go.POSITION.result()

    def save(self):
        return go.POSITION.toJSON()

    def load(self, json_str):
        go.POSITION.fromJSON(json_str)
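# A usage sketch for the Engine above (a hypothetical driver, not part of the
# original source). It assumes the go module supports the board size given to
# set_size and that MCTSPlayer.move() returns a vertex or None.
engine = Engine(time=5)
engine.set_size(19)
engine.set_komi(7.5)
vertex, captures = engine.move('b')  # let the MCTS player pick a move for black
print(vertex, captures)
print(engine.get_score())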
def run_play(cmd_line_args=None):
    # Set initial conditions
    policy = simplenet.PolicyValue(simplenet.PolicyValue.create_network())
    policy.load()
    boardsize = policy.model.input_shape[-1]
    best_player = MCTSPlayer(policy.eval_value_state, policy.eval_policy_state,
                             n_playout=10, evaluating=True)
    human_player = Human(boardsize)
    run_a_game(best_player, human_player, boardsize)
def run():
    curr_dir = os.path.dirname(os.path.abspath(__file__))
    model_dir = os.path.join(curr_dir, './model/')
    model_file = os.path.join(model_dir, 'model-cnn.pth')
    try:
        agent = Agent()
        agent.limit_piece_count = 0
        agent.limit_max_height = 10
        # env = TetrominoEnv(agent.tetromino)
        # policy-value network
        net_policy = PolicyValueNet(10, 20, 5, model_file=model_file)
        mcts_ai_player = MCTSPlayer(net_policy.policy_value_fn, c_puct=1, n_playout=64)
        # agent.start_play(mcts_ai_player, env)
        while not agent.terminal:
            act = mcts_ai_player.get_action(agent)
            # agent.step(act, env)
            agent.step(act)
            print(agent.get_availables())
            agent.print2(True)
    except KeyboardInterrupt:
        print('quit')
def __init__(self, init_model=None):
    # Set the parameters of the board and the game
    '''
    self.node1 = node({'cpu':20, 'memory':20, 'gpu':0})
    self.node2 = node({'cpu':20, 'memory':20, 'gpu':0})
    self.node3 = node({'cpu':50, 'memory':50, 'gpu':50})
    self.node_dict = {'node1':self.node1, 'node2':self.node2, 'node3':self.node3}
    self.data_name = 'gpu'
    self.c_puct_list = [0.03,0.3,3]
    self.n_job_thread_list = [0,5]
    self.probability_1_list = [0,0.03,0.3]
    self.probability_2_list = [0.3,0.6,0.9]
    '''
    self.node1 = node({'cpu': 30, 'memory': 30, 'gpu': 30, 'fpga': 0})
    self.node2 = node({'cpu': 30, 'memory': 30, 'gpu': 0, 'fpga': 30})
    self.node3 = node({'cpu': 50, 'memory': 50, 'gpu': 50, 'fpga': 50})
    self.node4 = node({'cpu': 30, 'memory': 30, 'gpu': 0, 'fpga': 0})
    self.node5 = node({'cpu': 30, 'memory': 30, 'gpu': 0, 'fpga': 0})
    # proportionally, larger values should show the effect more clearly
    self.node_dict = {
        'node1': self.node1,
        'node2': self.node2,
        'node3': self.node3,
        'node4': self.node4,
        'node5': self.node5
    }
    self.data_name = 'fpga_gpu'
    self.c_puct_list = [0.03, 0.3, 3]
    self.n_job_thread_list = [0, 5]
    self.probability_1_list = [0, 0.03, 0.3]
    self.probability_2_list = [0.3, 0.6, 0.9]
    # self.weight = {'cpu': 0.3, 'memory': 0.2, 'gpu': 0.5}
    self.weight = None
    self.state = State(self.node_dict)
    self.game = Game(self.node_dict, self.weight)
    # Set the training parameters
    self.n_playout = 1000  # number of simulated steps per move
    self.c_puct = 1  # trade-off coefficient between exploitation and exploration
    self.game_batch_num = 3
    self.n_job_thread = 6  # 0
    self.probability_1 = 0  # 0
    self.probability_2 = 0.2  # 0.2
    # self.path = r'D:\科研\论文\High effient resource scheduling for cloud based on modified MCTS\programing\parameter_check_on_have_fpga.pkl'
    # AI player; is_selfplay=1 for self-play, since we are training
    self.mcts_player = MCTSPlayer(c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def run():
    n = 5
    width, height = 8, 8
    try:
        board = Board(width=width, height=height, n_in_row=n)
        game = Game(board)
        mcts_player = MCTSPlayer(c_puct=5, n_playout=400)
        human = Human()
        # set start_player=0 for human first
        game.start_play(human, mcts_player, start_player=0)
        # After each game, persist the state-node pairs
        store_object(mcts_player.state_node_pairs, "Data")
    except KeyboardInterrupt:
        print('\n\rquit')
def policy_evaluate(self, n_games=10):
    """Policy win-rate evaluation: play the model against a pure MCTS player for n games."""
    # AlphaGo Zero-style MCTS player (uses the policy-value network to guide
    # tree search and evaluate leaf nodes)
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    # Pure MCTS player
    pure_mcts_player = MCTSPurePlayer(c_puct=5, n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        # Play one game
        winner = self.game.start_play(current_mcts_player, pure_mcts_player,
                                      start_player=i % 2, is_shown=0)
        win_cnt[winner] += 1
    # Win ratio: a tie (winner == -1) counts as half a win
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[-1]) / n_games
    logging.info("TRAIN Num_playouts:{}, win: {}, lose: {}, tie:{}, win_ratio:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[-1], win_ratio))
    return win_ratio
def __init__(self, mol=None, init_model=None):
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 30  # num of simulations for each move
    self.c_puct = 1
    self.buffer_size = 200
    self.batch_size = 200  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.epochs = 50  # num of train_steps for each update
    self.kl_targ = 0.2
    self.check_freq = 5
    self.mol = mol
    self.play_batch_size = 1
    self.game_batch_num = 15
    self.in_dim = 1024
    self.n_hidden_1 = 1024
    self.n_hidden_2 = 1024
    self.out_dim = 1
    self.output_smi = []
    self.output_qed = []
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.in_dim, self.n_hidden_1,
                                               self.n_hidden_2, self.out_dim,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.in_dim, self.n_hidden_1,
                                               self.n_hidden_2, self.out_dim)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def collect_selfplay_data(self):
    """Collect self-play data for training."""
    # Self-play using MCTS (Monte Carlo tree search)
    logging.info("TRAIN Self Play starting ...")
    # Game agent
    agent = Agent()
    # Create an MCTS player that uses the policy-value network to guide
    # tree search and evaluate leaf nodes
    mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                             c_puct=self.c_puct,
                             n_playout=self.n_playout,
                             is_selfplay=1)
    # Start playing
    winner, play_data = agent.start_self_play(mcts_player, temp=self.temp)
    play_data = list(play_data)[:]
    episode_len = len(play_data)
    # Add flipped-board data to the dataset
    # play_data = self.get_equi_data(play_data)
    logging.info("TRAIN Self Play end. length:%s saving ..." % episode_len)
    # Save the self-play data to the dataset
    for obj in play_data:
        self.dataset.save(obj)
def __init__(self, player1=GomokuPlayer.Human, player2=GomokuPlayer.Human):
    # params of the board and the game
    self.config = GomokuConfig()
    self.board = GomokuBase(width=self.config.board_width,
                            height=self.config.board_height,
                            n_to_win=self.config.n_to_win)
    self.game = GomokuServer(self.board, player1, player2)
    # load the trained policy-value net if either side is an AI
    if player1 == GomokuPlayer.AI or player2 == GomokuPlayer.AI:
        self.policy_value_net = PolicyValueNet(self.config.board_width,
                                               self.config.board_height,
                                               model_file=self.config.model_path)
        self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                      c_puct=self.config.c_puct,
                                      n_playout=self.config.n_playout_play,
                                      is_selfplay=False)
def __init__(self, size=(8, 8), init_model=None):
    # params of the board and the game
    print(size)
    self.board_width = size[1]
    self.board_height = size[0]
    self.board = GomokuBoard(size=(self.board_width, self.board_height))
    self.game = GomokuGame(self.board)
    # training params
    self.learn_rate = 2e-3
    self.lr_multiplier = 1.0  # adaptively adjust the learning rate based on KL
    self.temp = 1.0  # the temperature param
    self.n_playout = 400  # num of simulations for each move
    self.c_puct = 5
    self.buffer_size = 10000
    self.batch_size = 512  # mini-batch size for training
    self.data_buffer = deque(maxlen=self.buffer_size)
    self.play_batch_size = 1
    self.epochs = 5  # num of train_steps for each update
    self.kl_targ = 0.02
    self.check_freq = 50
    self.game_batch_num = 3000
    self.best_win_ratio = 0.0
    self.all_loss = []
    # num of simulations used for the pure mcts, which is used as
    # the opponent to evaluate the trained policy
    self.pure_mcts_playout_num = 1000
    if init_model:
        # start training from an initial policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height,
                                               model_file=init_model)
    else:
        # start training from a new policy-value net
        self.policy_value_net = PolicyValueNet(self.board_width,
                                               self.board_height)
    self.mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                  c_puct=self.c_puct,
                                  n_playout=self.n_playout,
                                  is_selfplay=1)
def policy_evaluate(self, n_games=10):
    """
    Evaluate the trained policy by playing games against the pure MCTS player
    Note: this is only for monitoring the progress of training
    """
    current_mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                                     c_puct=self.c_puct,
                                     n_playout=self.n_playout)
    pure_mcts_player = MCTS_Pure(c_puct=5, n_playout=self.pure_mcts_playout_num)
    win_cnt = defaultdict(int)
    for i in range(n_games):
        print("train-policy_evaluate: game = %d" % (i))
        winner = start_play(self.board, current_mcts_player, pure_mcts_player,
                            startPlayer=i % 2)
        win_cnt[winner] += 1
    win_ratio = 1.0 * (win_cnt[1] + 0.5 * win_cnt[0]) / n_games
    print("num_playouts:{}, win: {}, lose: {}, tie:{}".format(
        self.pure_mcts_playout_num, win_cnt[1], win_cnt[2], win_cnt[0]))
    return win_ratio
from game import AlternateTurnGame
from helper import User, RandomPlayer
from mcts import MCTSPlayer
from tictactoe import TTTState
from connectn import ConnectNState

board1 = TTTState()
game1 = AlternateTurnGame(2, board1)
board2 = ConnectNState()
game2 = AlternateTurnGame(2, board2)

# p1 = MCTSPlayer()
# players = [RandomPlayer(), RandomPlayer()]
players = [MCTSPlayer(), MCTSPlayer()]
print(game1.play_games(1, players))
# print(game2.play_games(1000, [RandomPlayer()]*2))
# print(game1.play_games(1000, [RandomPlayer()]*2))
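# A hedged follow-up sketch (assumption: play_games(n, players) accepts any mix
# of player objects, as the commented-out RandomPlayer runs above suggest):
# pit MCTSPlayer against RandomPlayer to gauge how much the tree search helps.
mixed = [MCTSPlayer(), RandomPlayer()]
print(game1.play_games(100, mixed))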
def __init__(self, time=5, policy=None):
    self.player = MCTSPlayer(time, policy)
model_file = '{}/model/best_policy_{}x{}.model'.format(CUR_PATH, size, size)
try:
    # Initialize the board
    board = Board(width=size, height=size, n_in_row=5)
    game = Game(board)
    # Initialize the AI player
    best_policy = PolicyValueNet(size, size, model_file=model_file)
    """
    # Load the trained model with numpy (only for models trained with Theano/Lasagne)
    try:
        policy_param = pickle.load(open(model_file, 'rb'))
    except:
        policy_param = pickle.load(open(model_file, 'rb'), encoding='bytes')  # To support python3
    best_policy = PolicyValueNetNumpy(size, size, policy_param)
    """
    mcts_player = MCTSPlayer(best_policy.policy_value_fn, c_puct=5, n_playout=900)
    # Pure MCTS player
    # mcts_player = MCTSPurePlayer(c_puct=5, n_playout=4000)
    # Initialize the human player; move input format: 2,3
    human_player = HumanPlayer()
    # Start the game (start_player=0: human first / 1: machine first)
    game.start_play(human_player, mcts_player, start_player=1, is_shown=1)
except KeyboardInterrupt:
    print('\n\rquit')
def collect_selfplay_data(self):
    """Collect self-play data for training."""
    # Self-play using MCTS (Monte Carlo tree search)
    logging.info("TRAIN Self Play starting ...")
    # Game agent
    agent = Agent()
    # Create an MCTS player that uses the policy-value network to guide
    # tree search and evaluate leaf nodes
    mcts_player = MCTSPlayer(self.policy_value_net.policy_value_fn,
                             c_puct=self.c_puct,
                             n_playout=self.n_playout,
                             is_selfplay=1)
    for _ in range(3):
        # Start playing
        reward, piececount, agentcount, play_data = agent.start_self_play(
            mcts_player, temp=self.temp)
        play_data = list(play_data)[:]
        episode_len = len(play_data)
        # Add flipped-board data to the dataset
        # play_data = self.get_equi_data(play_data)
        logging.info("TRAIN Self Play end. length:%s saving ..." % episode_len)
        # Save the self-play data to the wait directory
        for obj in play_data:
            filename = "{}.pkl".format(uuid.uuid1())
            savefile = os.path.join(data_wait_dir, filename)
            pickle.dump(obj, open(savefile, "wb"))
            # self.dataset.save(obj)
        if agent.limit_max_height == 10:
            jsonfile = os.path.join(data_dir, "result.json")
            if os.path.exists(jsonfile):
                result = json.load(open(jsonfile, "r"))
            else:
                result = {"reward": 0, "steps": 0, "agent": 0}
            if "1k" not in result:
                result["1k"] = {"reward": 0, "steps": 0, "agent": 0}
            result["reward"] = result["reward"] + reward
            result["steps"] = result["steps"] + piececount
            result["agent"] = result["agent"] + agentcount
            result["1k"]["reward"] = result["1k"]["reward"] + reward
            result["1k"]["steps"] = result["1k"]["steps"] + piececount
            result["1k"]["agent"] = result["1k"]["agent"] + agentcount
            if result["agent"] > 0 and result["agent"] % 100 <= 1:
                result[str(result["agent"])] = {
                    "reward": result["1k"]["reward"] / result["1k"]["agent"],
                    "steps": result["1k"]["steps"] / result["1k"]["agent"]
                }
            if result["agent"] > 0 and result["agent"] % 1000 == 0:
                # Save an extra snapshot of the model
                steps = round(result["1k"]["steps"] / result["1k"]["agent"])
                model_file = os.path.join(model_dir, 'model_%s.pth' % steps)
                self.policy_value_net.save_model(model_file)
                for key in list(result.keys()):
                    if key.isdigit():
                        c = int(key)
                        if c % 1000 > 10:
                            del result[key]
                result["1k"] = {"reward": 0, "steps": 0, "agent": 0}
            json.dump(result, open(jsonfile, "w"), ensure_ascii=False)
        if reward >= 1:
            break