class AlphaBot():
    def __init__(self, path, args):
        self.type = 'computer'
        self.net = AlphaNet(11, device='cuda:0')
        self.net.load_net(path)
        self.mcts = MCTS(self.net, args)

    def get_action(self, env, info):
        probs = self.mcts.getProbs(env, temp=0)
        action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
        return action // env.size, action % env.size
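A minimal usage sketch for AlphaBot, assuming the TicTacToeEnv and the MCTS args dict used in the training script further below; the checkpoint path and the None passed for info are placeholders, not values taken from this project.

# Hedged usage sketch: the checkpoint path and the `info` argument are placeholders;
# the args keys mirror the training script further below.
from env import TicTacToeEnv

args = {'c': 1., 'num_sims': 25, 'sleep_time': 0}
bot = AlphaBot('AlphaZero/models/net_updates_2000.pth', args)

env = TicTacToeEnv()
row, col = bot.get_action(env, info=None)
env.step((row, col))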
def selfPlay(self, mcts: MCTS, queue):
    dataOneGame = []
    mcts.reset()
    while not mcts.game.isTerminated():
        mcts.expandMaxNodes()
        Pi = mcts.Pi()
        if not len(Pi) > 0:
            break
        stepData = TrainData()
        stepData.inputPlanes = mcts.game.getInputPlanes()
        stepData.inputPolicyMask = mcts.game.getInputPolicyMask()
        stepData.predictionProbability = Pi
        actionIndex = self.selectActionIndex(Pi, stepData.inputPolicyMask)
        action = mcts.play(actionIndex)
        assert action
        dataOneGame.append(stepData)
        print('_', end='')
        sys.stdout.flush()
    resultValue = mcts.game.getTerminateValue()
    dataOneGame.reverse()
    for data in dataOneGame:
        data.predictionValue = resultValue
        resultValue = -resultValue
    for data in dataOneGame:
        queue.put(data)
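The loop after the game ends assigns the terminal value to the last position and flips the sign for each earlier one, so every sample is valued from the perspective of the side to move. A self-contained toy illustration of that back-fill (the history and terminal value here are made up):

# Standalone illustration of the value back-fill above (toy numbers only):
# the side that made the final move gets the terminal value, the opponent gets -value, etc.
history = ['move1', 'move2', 'move3']   # positions in play order
terminal_value = 1.0                    # result from the last mover's perspective

values = []
v = terminal_value
for _ in reversed(history):             # walk backwards through the game
    values.append(v)
    v = -v
values.reverse()                        # back to play order
print(values)                           # [1.0, -1.0, 1.0]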
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in train.py.
    """

    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor (previous-iteration) network
        self.args = args
        # Monte Carlo Tree Search
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        # whether to skip the first round of self-play
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            # MCTS-improved policy for the current position
            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            # boards and policies for the position's symmetries,
            # e.g. a two-element list [(b1, p1), (b2, p2)]
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                # b is a board, p the matching policy
                trainExamples.append([b, self.curPlayer, p, None])

            # sample the next move according to pi
            action = np.random.choice(len(pi), p=pi)
            # advance the board and switch players
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)

            # game result from the current player's perspective
            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                # (board, policy, value) triples
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenofQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            log.info(f'Starting Iter #{i} ...')
            # examples of the iteration, gathered via self-play
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                log.warning(
                    f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training:
            # concatenate all historical samples, then shuffle
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            # the opposing MCTS uses the previously trained network
            pmcts = MCTS(self.game, self.pnet, self.args)

            # train the new network
            self.nnet.train(trainExamples)
            # MCTS backed by the newly trained network
            nmcts = MCTS(self.game, self.nnet, self.args)

            log.info('PITTING AGAINST PREVIOUS VERSION')
            # pit the new model against the previous one
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            log.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                log.info('REJECTING NEW MODEL')
                # the new model did not improve enough; restore the old weights
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                log.info('ACCEPTING NEW MODEL')
                # the new model improved; keep it
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            log.warning(f'File "{examplesFile}" with trainExamples not found!')
            r = input("Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            log.info("File with trainExamples found. Loading it...")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            log.info('Loading done!')

            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
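For context, a hedged sketch of how Coach is typically driven; the Game and NNet classes mirror the ConnectX script further below, and the hyperparameter values are illustrative defaults rather than values taken from this project.

# Hedged driver sketch: Game/NNet classes mirror the ConnectX script below,
# and the hyperparameter values are illustrative only.
game = ConnectX_AlphaZero()
nnet = NNet(game)

args = dotdict({
    'numIters': 100,
    'numEps': 50,                  # self-play games per iteration
    'tempThreshold': 15,           # moves played with temp=1 before switching to temp=0
    'updateThreshold': 0.55,       # minimum win fraction to accept the new net
    'maxlenOfQueue': 200000,
    'numMCTSSims': 50,
    'arenaCompare': 40,            # games played in the new-vs-old arena
    'cpuct': 1.0,
    'checkpoint': './temp',
    'numItersForTrainExamplesHistory': 20,
})

coach = Coach(game, nnet, args)
coach.learn()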
def createMCTS(self, network) -> MCTS:
    config = self.createMCTSConfig()
    game = self.createGame(network)
    mcts = MCTS(game, config)
    return mcts
class Trainer():
    def __init__(self, agent, mcts_args, temp_thres=16, num_eps=20):
        self.temp_thres = temp_thres
        self.mcts = MCTS(agent, mcts_args)
        self.num_eps = num_eps

    def play_game(self, agent):
        train_samples = []
        episode_step = 0
        env = TicTacToeEnv()
        while True:
            episode_step += 1
            # explore (temp=1) for the first temp_thres moves, then play greedily
            temp = int(episode_step < self.temp_thres)
            probs = self.mcts.getProbs(env, temp=temp)
            action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
            # store every symmetry of the position together with the player to move
            for board_s, probs_s in getBoardSims(env.getPBoard(), probs):
                train_samples.append([board_s, probs_s, env.fpt])
            _, reward, done, _ = env.step((action // env.size, action % env.size))
            if done:
                final_player = env.fpt
                # value is +reward for the side that just moved, -reward for the other side
                return [(x[0], x[1], reward if final_player != x[2] else -reward)
                        for x in train_samples]

    def execute_update(self, agent, mcts_args):
        train_samples = []
        for _ in range(self.num_eps):
            self.mcts = MCTS(agent, mcts_args)  # reset the search tree for each episode
            train_samples += self.play_game(agent)
        agent.updateNet(np.array(train_samples))
        return train_samples

    def arena(self, agent1, agent2, mcts_args, games_to_play=10):
        mcts1 = MCTS(agent1, mcts_args)
        mcts2 = MCTS(agent2, mcts_args)
        results = []
        for i in range(games_to_play):
            # alternate which agent moves first
            if i % 2 == 0:
                player1, player2 = mcts1, mcts2
            else:
                player1, player2 = mcts2, mcts1
            env = TicTacToeEnv()
            done = False
            while not done:
                first_player_move = env.fpt
                if first_player_move:
                    probs = player1.getProbs(env, temp=0)
                else:
                    probs = player2.getProbs(env, temp=0)
                action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
                _, reward, done, _ = env.step((action // env.size, action % env.size))
                if reward == -1:
                    print('Repeated move!')
                if done:
                    # report the result from the first player's perspective
                    results.append(reward if first_player_move else -1 * reward)
        return results
            if display:
                env.render()
            if reward == -1:
                print('You lose!')
                lose_count += 1
            elif reward == 1:
                print('You won!')
                won_count += 1
            elif reward == -10:
                exit()
            break
    print('%d-%d-%d' % (won_count, round - (won_count + lose_count), lose_count))
    pass


g = ConnectX_AlphaZero()
# the neural network
n = NNet(g)
n.load_checkpoint('./temp', 'best.pth.tar')
mcts = MCTS(g, n, dotdict({'numMCTSSims': 50, 'cpuct': 1.0}))
# action selection: pick the move with the highest MCTS visit count
alpha_zero_action = lambda x: np.argmax(mcts.getActionProb(x, temp=0))

# play_with_coumpter(alpha_zero_action, player=1, agent_level=2, round=100)
# play_with_coumpter(alpha_zero_action, player=2, agent_level=2, round=100)
# play_with_coumpter(alpha_zero_action, player=1, agent_level=1, round=100)
# play_with_coumpter(alpha_zero_action, player=2, agent_level=1, round=100)
play_with_human(alpha_zero_action, 1)
import os

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

from env import TicTacToeEnv
from utils import *
from AlphaZero.AlphaNet import AlphaNet
from AlphaZero.MCTS import MCTS
from AlphaZero.trainAlphaZero import Trainer
import torch
import numpy as np

if __name__ == '__main__':
    net = AlphaNet(11, device='cuda:0')
    net.load_net('AlphaZero/models/net_updates_2000.pth')
    args = {'c': 1., 'num_sims': 25, 'sleep_time': 0}
    mcts = MCTS(net, args)
    coach = Trainer(net, args, num_eps=100)

    for num_update in range(2001, 3001):
        print('Starting update {}'.format(num_update))
        coach.execute_update(net, args)

        # pit the updated net against the previous checkpoint
        net_old = AlphaNet(11, device='cuda:0')
        net_old.load_net('AlphaZero/models/net_updates_' + str(num_update - 1) + '.pth')
        results = coach.arena(net, net_old, args, games_to_play=40)

        # arena results are reported from the first player's perspective, and the new
        # net moves first in even-indexed games, so its wins show up as (-1)**i
        num_of_wins = np.sum(
            [1 if results[i] == (-1) ** i else 0 for i in range(len(results))])
        num_of_losses = np.sum(
            [1 if results[i] == (-1) ** (i + 1) else 0 for i in range(len(results))])
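A hedged sketch of one way the arena tally above could gate the next checkpoint, mirroring the updateThreshold logic in Coach.learn; the 0.55 threshold is illustrative and the save_net method is an assumption (the counterpart of load_net).

        # Hedged continuation sketch (not from the source): gate the checkpoint on the
        # arena win rate, mirroring the updateThreshold logic in Coach.learn above.
        # `save_net` is an assumed counterpart of `load_net`.
        win_rate = num_of_wins / max(num_of_wins + num_of_losses, 1)
        if win_rate >= 0.55:  # illustrative acceptance threshold
            net.save_net('AlphaZero/models/net_updates_' + str(num_update) + '.pth')
        else:
            net.load_net('AlphaZero/models/net_updates_' + str(num_update - 1) + '.pth')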