def logCurrentCapabilities(game, iter_num, args):
    gpus = args.setGPU.split(',')
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus[iter_num % len(gpus)]

    # improved nnet player
    n2 = nn(game)
    n2.load_checkpoint('./temp/', 'best.pth.tar')
    mcts2 = MCTS(game, n2, args)
    n2p = lambda b, p: np.argmax(mcts2.getActionProb(b, p, temp=0))

    # heuristic player
    heuristic = Heuristic(game).random_play
    # random player
    rp = RandomPlayer(game).play

    arena = Arena(n2p, heuristic, game, display=display)
    resultHeur = "{} {}".format(*arena.playGames(40, verbose=False)[:2])

    arena = Arena(n2p, rp, game, display=display)
    resultRand = "{} {}".format(*arena.playGames(40, verbose=False)[:2])

    MyLogger.info("Iter:{} Heuristic: {} Random: {}".format(iter_num, resultHeur, resultRand))
    print("Iter:{} Heuristic: {} Random: {}\n".format(iter_num, resultHeur, resultRand))
def play_games(game, args, processID, enemy):
    np.random.seed(processID)

    # assign this worker process to one of the configured GPUs
    gpus = args.setGPU.split(',')
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus[processID % len(gpus)]

    # let GPU memory grow on demand (TensorFlow 1.x API)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # players
    heuristic = Heuristic(game).random_play
    policy = PolicyPlayer(game).play
    rp = RandomPlayer(game).play

    if enemy == "heuristic":
        second_player = heuristic
    elif enemy == "rp":
        second_player = rp
    elif enemy == "n1p":
        # improved nnet player
        n1 = nn(game)
        n1.load_checkpoint('./temp/', 'best.pth.tar')
        mcts1 = MCTS(game, n1, args, lambdaHeur=args.lambdaHeur)
        n1p = lambda b, p: np.argmax(mcts1.getActionProb(b, p, temp=0))
        second_player = n1p

    # every enemy choice plays against the policy player in the same arena
    arena = Arena(policy, second_player, game, display=display)
    return arena.playGames(args.numPerProcessAgainst, verbose=False)
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        log.info(f'Starting Iter #{i} ...')

        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                iterationTrainExamples += self.executeEpisode()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            log.warning(
                f"Removing the oldest entry in trainExamples. "
                f"len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        log.info('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x, y: np.argmax(pmcts.getActionProb(x, y, temp=0, player=1)),
                      lambda x, y: np.argmax(nmcts.getActionProb(x, y, temp=0, player=-1)),
                      self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            log.info('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            log.info('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
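# Many of these snippets call self.getCheckpointFile(i) without showing its
# definition. A minimal sketch of such a helper in the alpha-zero-general
# style (the exact filename scheme is an assumption):
def getCheckpointFile(self, iteration):
    # one checkpoint file per accepted training iteration
    return 'checkpoint_' + str(iteration) + '.pth.tar'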
def pitter(self):
    # snapshot the current network and load the opponent snapshot
    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best1.pth.tar')
    self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='best2.pth.tar')
    pmcts = MCTS(self.game, self.pnet, self.args)
    nmcts = MCTS(self.game, self.nnet, self.args)

    print('PITTING AGAINST PREVIOUS VERSION')
    arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                  lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
    pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

    print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
    if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
        print('REJECTING NEW MODEL')
        # roll back to the snapshot saved at the top of this method
        self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='best1.pth.tar')
    else:
        print('ACCEPTING NEW MODEL')
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
    self.aws_s3_sync()
def PitAgents(agent1, agent2, boardSize: int, gameCount: int):
    game = Game(boardSize)
    print('(P1 = {0}) vs. (P2 = {1})'.format(agent1.name, agent2.name))
    arena = Arena(agent1.playFunc, agent2.playFunc, game)
    p1wins, p2wins, draws = arena.playGames(gameCount)
    print('P1/P2 WINS : %d / %d ; DRAWS : %d' % (p1wins, p2wins, draws))
def pitting(self, previous_weights, current_weights, games_num):
    """Pit the previous-generation agent against the current-generation agent.

    Args:
        previous_weights (numpy.array): weights of the previous generation neural network
        current_weights (numpy.array): weights of the current generation neural network
        games_num (int): number of games to play

    Returns:
        tuple of (games won by previous agent, games won by current agent, draws)
    """
    # update the weights of the previous and current neural networks
    self.previous_agent.set_weights(previous_weights)
    self.current_agent.set_weights(current_weights)

    # reset the node state of each MCTS
    self.previous_mcts = MCTS(self.game, self.previous_agent, self.args)
    self.current_mcts = MCTS(self.game, self.current_agent, self.args)

    arena = Arena(
        lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)),
        lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)),
        self.game)
    previous_wins, current_wins, draws = arena.playGames(games_num)

    return (previous_wins, current_wins, draws)
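# A hypothetical call site for pitting(), assuming Keras-style get_weights()
# on the agent networks and an `evaluator` object exposing the method above
# (all names here are illustrative, not from the source):
prev_w = previous_net.get_weights()
curr_w = current_net.get_weights()
prev_wins, curr_wins, draws = evaluator.pitting(prev_w, curr_w, games_num=40)
print('PREV/CURR WINS : %d / %d ; DRAWS : %d' % (prev_wins, curr_wins, draws))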
def optimize_and_evaluate(self, trainExamples, i):
    # trainExamples and the iteration index i must be passed in; both are used below

    # training new network, keeping a copy of the old one
    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
    self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
    pmcts = MCTS(self.game, self.pnet, self.args)

    self.nnet.train(trainExamples)
    nmcts = MCTS(self.game, self.nnet, self.args)

    # If self.args.arenaCompare is large enough, both nmcts and pmcts consume a
    # huge amount of RAM. RAM consumption also depends on your game
    # implementation, particularly on game.getActionSize and the size of
    # game.stringRepresentation.
    print('PITTING AGAINST PREVIOUS VERSION')
    arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                  lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
    pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

    print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
    if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
        print('REJECTING NEW MODEL')
        self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
    else:
        print('ACCEPTING NEW MODEL')
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

    print("")
    print("Previous NN MCTS stats")
    pmcts.print_stats()
    pmcts = None
    print("New NN MCTS stats")
    nmcts.print_stats()
    nmcts = None
def callgreedy(num, q, args):
    """
    :param num: number of games
    :param q: the queue that will store the results
    :param args: configs
    :return: nothing; the results are stored in q

    Plays the specified number of games with the greedy agent.
    """
    from tictactoe.TicTacToeGame import TicTacToeGame as Game
    from tictactoe.tensorflow.NNet import NNetWrapper as nn

    done = False
    while not done:
        try:
            g = Game(3)
            nnet = nn(g, 0.06)
            filenameCurrent = ("currentforprocess:temp:iter" + str(args.numIters) +
                               ":eps" + str(args.numEps) + ":dim" + str(g.n) + ".pth.tar")
            nnet.load_checkpoint(folder=args.checkpoint, filename=filenameCurrent)
            gp = returnplayer(args, "greedy", g)
            nmcts1 = MCTS(g, nnet, args)
            arenagreedy = Arena(lambda x: np.argmax(nmcts1.getActionProb(x, temp=0)), gp, g)
            pwins, nwins, drawwins = arenagreedy.playGames(num)
            q.put((pwins, nwins, drawwins))
            nmcts1.clear()
            done = True
        except Exception:
            # retry until a full run succeeds
            done = False
def main():
    game = Connect4Game()
    config = Config()

    rp = RandomPlayer(game).play
    oslp = OneStepLookaheadPlayer(game).play
    hp = HumanConnect4Player(game).play
    mctsp = MCTSPlayer(game, config).play

    c4config = C4Config()

    nn = NNetWrapper(game, c4config)
    ckpt = ('./trained/connect4', 'connect4_best_34.pth.tar')
    nn.load_checkpoint(ckpt[0], ckpt[1])
    nnp = NNetPlayer(game, nn, c4config).play

    nn2 = NNetWrapper(game, c4config)
    ckpt2 = ('./trained/connect4', 'connect4_checkpoint_26.pth.tar')
    nn2.load_checkpoint(ckpt2[0], ckpt2[1])
    nnp2 = NNetPlayer(game, nn2, c4config).play

    arena = Arena(hp, nnp2, game, display=display)
    out = arena.playGames(50, verbose=True)
    print(out)
def test_play_games_cumulative_score():
    mock_player = mock.Mock()
    mock_game = mock.Mock()
    mock_game.getGameEnded.side_effect = [1, 1, 1, 1, 1, 5, 5, 5, 5, 5]

    arena = Arena(mock_player, mock_player, mock_game)
    p1_score, p2_score = arena.playGames(10, verbose=False)

    assert p1_score == 5
    assert p2_score == 25
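# The test above assumes an Arena.playGames that accumulates raw game scores
# (the value returned by getGameEnded) rather than counting wins, crediting
# the first half of the games to player 1 and the second half, with sides
# swapped, to player 2. A minimal sketch of that contract (an assumption, not
# the canonical alpha-zero-general Arena):
def playGames(self, num, verbose=False):
    p1_score, p2_score = 0, 0
    for _ in range(num // 2):
        p1_score += self.playGame(verbose=verbose)  # player1 moves first
    self.player1, self.player2 = self.player2, self.player1
    for _ in range(num // 2):
        p2_score += self.playGame(verbose=verbose)  # player2 moves first
    return p1_score, p2_score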
def learn(self):
    for i in range(1, self.args.numIters + 1):
        self.iter = i
        # bookkeeping
        print('------ITER ' + str(i) + '------')

        if self.args.use_pitting:
            # training new network, keeping a copy of the old one
            if os.path.exists(os.path.join(self.args.checkpoint, 'best.pth.tar.index')):
                self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
            else:
                self.pnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
            pmcts_game = self.game.__class__()
            pmcts_game.getInitBoard()
            pmcts = MCTS(pmcts_game, self.pnet, self.args)

        if self.args.comment_training:
            self.learn_comment_iter()
        if not self.args.nn_args['is_train'] and not self.args.use_pitting:
            break
        if self.args.chess_training:
            self.learn_chess_iter()
        if self.args.use_self_play:
            self.learn_self_play_iter()

        self.mcts = MCTS(self.game, self.nnet, self.args)

        if self.args.save_model:
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
        if self.args['comment_training']:
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='new.pth.tar')

        if self.args.use_pitting:
            nmcts_game = self.game.__class__()
            nmcts_game.getInitBoard()
            nmcts = MCTS(nmcts_game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

            if self.args.use_self_play:
                self.trainExampleSelfPlay = []
                self.selfplaynum += 1

        nmcts = None
        pmcts = None
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    trainExamples = deque([], maxlen=self.args.maxlenOfQueue)
    for i in range(self.args.numIters):
        # bookkeeping
        print('------ITER ' + str(i + 1) + '------')
        eps_time = AverageMeter()
        bar = Bar('Self Play', max=self.args.numEps)
        end = time.time()

        for eps in range(self.args.numEps):
            self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
            trainExamples += self.executeEpisode()

            # bookkeeping + plot progress
            eps_time.update(time.time() - end)
            end = time.time()
            bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                total=bar.elapsed_td, eta=bar.eta_td)
            bar.next()
        bar.finish()

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pnet = self.nnet.__class__(self.game)
        pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        print('NEW/PREV WINS : ' + str(nwins) + '/' + str(pwins) + ' ; DRAWS : ' + str(draws))
        if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.nnet = pnet
        else:
            print('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='checkpoint_' + str(i) + '.pth.tar')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
def Async_Play(game, args, iter_num, bar):
    bar.suffix = "iter:{i}/{x} | Total: {total:} | ETA: {eta:}".format(
        i=iter_num + 1, x=args.numPlayGames, total=bar.elapsed_td, eta=bar.eta_td)
    bar.next()

    # set gpu
    if args.multiGPU:
        if iter_num % 2 == 0:
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.setGPU

    # let GPU memory grow on demand (TensorFlow 1.x API)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # create the two networks and try to load their weights
    model1 = NNet(game)
    model2 = NNet(game)
    try:
        model1.load_checkpoint(folder=args.model1Folder, filename=args.model1FileName)
    except Exception:
        print("load model1 fail")
    try:
        model2.load_checkpoint(folder=args.model2Folder, filename=args.model2FileName)
    except Exception:
        print("load model2 fail")

    # create one MCTS per model
    mcts1 = MCTS(game, model1, args)
    mcts2 = MCTS(game, model2, args)

    # each process plays 2 games
    arena = Arena(lambda x: np.argmax(mcts1.getActionProb(x, temp=0)),
                  lambda x: np.argmax(mcts2.getActionProb(x, temp=0)), game)
    arena.displayBar = False
    oneWon, twoWon, draws = arena.playGames(2)
    return oneWon, twoWon, draws
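# The ConfigProto/Session calls above are the TensorFlow 1.x way of enabling
# on-demand GPU memory growth. Under TensorFlow 2.x the equivalent per-process
# setup would be roughly the following (a sketch, assuming TF 2.x is available):
import tensorflow as tf

for gpu in tf.config.list_physical_devices('GPU'):
    # must be called before any tensors are allocated on the GPU
    tf.config.experimental.set_memory_growth(gpu, True)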
def AsyncAgainst(nnet, game, args, iter_num):
    os.environ["CUDA_VISIBLE_DEVICES"] = '3'
    minimax = minimaxAI(game, depth=7)
    local_args = dotdict({'numMCTSSims': 200, 'cpuct': 1.0})
    mcts = MCTS(game, nnet, local_args, eval=True)
    arena = Arena(lambda x: np.argmax(mcts.getActionProb(x, temp=0)),
                  minimax.get_move, game)
    arena.displayBar = False
    net_win, minimax_win, draws = arena.playGames(2)
    return net_win, minimax_win, draws
def AsyncAgainst(game, args, iter_num, bar):
    # create separate seeds for each worker
    np.random.seed(iter_num)

    if args.displaybar:
        bar.suffix = "iter:{i}/{x} | Total: {total:} | ETA: {eta:}".format(
            i=iter_num + 1, x=args.numAgainstPlayProcess,
            total=bar.elapsed_td, eta=bar.eta_td)
        bar.next()

    # set gpu
    gpus = args.setGPU.split(',')
    os.environ["CUDA_VISIBLE_DEVICES"] = gpus[iter_num % len(gpus)]

    # let GPU memory grow on demand (TensorFlow 1.x API)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # create the networks and load their weights
    nnet = nn(game, args.displaybar)
    pnet = nn(game, args.displaybar)
    try:
        nnet.load_checkpoint(folder=args.checkpoint, filename='train.pth.tar')
    except Exception:
        print("load train model fail")
    try:
        pnet.load_checkpoint(folder=args.checkpoint, filename='best.pth.tar')
    except Exception:
        print("load old model fail")
        # no best checkpoint yet: write one so later workers can load it
        pnet.save_checkpoint(folder=args.checkpoint, filename='best.pth.tar')

    pmcts = MCTS(game, pnet, args, args.lambdaHeur)
    nmcts = MCTS(game, nnet, args, args.lambdaHeur)

    arena = Arena(lambda b, p: np.argmax(pmcts.getActionProb(board=b, curPlayer=p, temp=1)),
                  lambda b, p: np.argmax(nmcts.getActionProb(board=b, curPlayer=p, temp=1)),
                  game, displaybar=args.displaybar)
    # each against-process plays numPerProcessAgainst games
    pwins, nwins, draws = arena.playGames(args.numPerProcessAgainst)
    return pwins, nwins, draws
def AsyncAgainst(game, args, iter_num, bar):
    bar.suffix = "iter:{i}/{x} | Total: {total:} | ETA: {eta:}".format(
        i=iter_num + 1, x=args.numAgainstPlayProcess,
        total=bar.elapsed_td, eta=bar.eta_td)
    bar.next()

    # set gpu
    if args.multiGPU:
        if iter_num % 2 == 0:
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.setGPU

    # let GPU memory grow on demand (TensorFlow 1.x API)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    # create the networks and load their weights
    nnet = nn(game)
    pnet = nn(game)
    try:
        nnet.load_checkpoint(folder=args.checkpoint, filename='train.pth.tar')
    except Exception:
        print("load train model fail")
    try:
        pnet.load_checkpoint(folder=args.checkpoint, filename='best.pth.tar')
    except Exception:
        print("load old model fail")

    pmcts = MCTS(game, pnet, args)
    nmcts = MCTS(game, nnet, args)

    arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                  lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), game)
    arena.displayBar = True
    # each against-process plays numPerProcessAgainst games
    pwins, nwins, draws = arena.playGames(args.numPerProcessAgainst)
    return pwins, nwins, draws
def pitAgainstOpponents(self):
    print('COMPARING AGAINST DEFAULT PLAYERS')
    for player in self.opponents:
        print(f'PITTING AGAINST {player}')
        nmcts = MCTS(self.game, self.nnet, self.args)
        opponent = player(self.game)

        def bot(x):
            return opponent.play(x)

        def model(x):
            return np.argmax(nmcts.getActionProb(x, temp=0))

        arena = Arena(bot, model, self.game, render=self.args.render)
        fwins, swins, draws = arena.playGames(self.args.arenaCompare // len(self.opponents))
        print('MODEL/PLAYER WINS : %d / %d ; DRAWS : %d' % (swins, fwins, draws))
def AsyncAgainst(nnet, game, args, gameth):
    logging.debug("play self test game " + str(gameth))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.setGPU

    # create the MCTS player around the given network
    minimax = minimaxAI(game)
    local_args = dotdict({'numMCTSSims': 100, 'cpuct': 1.0})
    mcts = MCTS(game, nnet, local_args, eval=True)

    arena = Arena(lambda x: np.argmax(mcts.getActionProb(x, temp=0)),
                  minimax.get_move, game)
    arena.displayBar = False
    net_win, minimax_win, draws = arena.playGames(2)
    return net_win, minimax_win, draws
def eval(self, iter):
    print('Evaluating against random play...')

    def mplay(board):
        mcts = MCTS(self.game, self.nnet, self.args)  # fresh search tree per move
        return np.argmax(mcts.getActionProb(board, temp=0))

    def rplay(board):
        a = np.random.randint(self.game.getActionSize())
        valids = self.game.getValidMoves(board, 1)
        while valids[a] != 1:
            a = np.random.randint(self.game.getActionSize())
        return a

    arena = Arena(mplay, rplay, self.game)
    mwins, rwins, draws = arena.playGames(20)

    self.evalResults.append((mwins / 20.0, iter))
    print('Saving eval results:')
    print(self.evalResults)
    self.saveEvalResults()
def PitNetworks(gameCount: int):
    pred = OthelloPredictor(6, 'trainedModels/othello/pred_othello_087.pth', 100000)
    g_pred = Game(6, predictor=pred)
    g_regular = Game(6)

    nnet1 = nn(g_regular)
    nnet2 = nn(g_regular)
    nnet1.load_checkpoint('AlphaZeroModels', 'predictor_87_ep131.pth.tar')
    nnet2.load_checkpoint('AlphaZeroModels', 'pretrained_ep153.pth.tar')

    mcts1 = MCTS(g_regular, nnet1, args)
    mcts2 = MCTS(g_regular, nnet2, args)

    print('PITTING AGAINST PREVIOUS VERSION')
    arena = Arena(lambda x: np.argmax(mcts1.getActionProb(x, temp=0)),
                  lambda x: np.argmax(mcts2.getActionProb(x, temp=0)), g_regular)
    p1wins, p2wins, draws = arena.playGames(gameCount)
    print('P1/P2 WINS : %d / %d ; DRAWS : %d' % (p1wins, p2wins, draws))
def pit(self, iteration, proc_num):
    self.pnet = self.nnet.__class__(self.game)  # the competitor network
    if iteration != 1:
        self.pnet.load_checkpoint(folder=self.args["checkpoint"],
                                  filename="checkpoint_%d.pth.tar" % (iteration - 1))
    nmcts = MCTS(self.game, self.nnet, self.args)
    pmcts = MCTS(self.game, self.pnet, self.args)

    print('PITTING AGAINST PREVIOUS VERSION')
    arena = Arena(lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                  lambda x: np.argmax(pmcts.getActionProb(x, temp=0)), self.game)
    nwins, pwins, draws = arena.playGames(
        self.args["arenaCompare"] // self.args["genFilesPerIteration"])

    print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
    self.savePit(iteration, proc_num, nwins, pwins, draws)
def async_against(game, args, iter_num):
    import tensorflow as tf

    # set gpu
    if args.multiGPU:
        if iter_num % 2 == 0:
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.setGPU

    # let GPU memory grow on demand (TensorFlow 1.x API)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    _ = tf.Session(config=config)

    # create the networks and load their weights
    nnet = nn(game)
    pnet = nn(game)
    try:
        nnet.load_checkpoint(folder=args.checkpoint, filename='train.pth.tar')
    except Exception:
        print("load train model fail")
    try:
        pnet.load_checkpoint(folder=args.checkpoint, filename='best.pth.tar')
    except Exception:
        print("load old model fail")

    pmcts = MCTS(game, pnet, args)
    nmcts = MCTS(game, nnet, args)

    arena = Arena(lambda x: np.argmax(pmcts.get_action_prob(x, temp=0)),
                  lambda x: np.argmax(nmcts.get_action_prob(x, temp=0)), game)
    arena.displayBar = False
    pwins, nwins, draws = arena.playGames(2)
    return pwins, nwins, draws
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        print('------ITER ' + str(i) + '------')

        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            eps_time = AverageMeter()
            bar = Bar('Self Play', max=self.args.numEps)
            end = time.time()

            for eps in range(self.args.numEps):
                self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                iterationTrainExamples += self.executeEpisode()

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                    total=bar.elapsed_td, eta=bar.eta_td)
                bar.next()
            bar.finish()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)
            trainStats = [0, 0, 0]
            for _, _, res in iterationTrainExamples:
                trainStats[res] += 1
            print(trainStats)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
args = dotdict({
    # total should be doubled: each process plays 2 games
    'numPlayGames': 10,
    'numPlayPool': 5,  # number of processes in the pool
    'model1Folder': '/workspace/CU_Makhos/models/',
    'model1FileName': 'best.pth.tar',
    'model2Folder': '/workspace/CU_Makhos/models/',
    'model2FileName': 'best.pth.tar',
})

g = ThaiCheckersGame()
minimax = minimaxAI(game=g, depth=7).get_move

# nnet players
n1 = NNet(g, gpu_num=0)
n1.load_checkpoint('models_minimax/', 'train_iter_268.pth.tar')
args1 = dotdict({'numMCTSSims': 100, 'cpuct': 1.0})
mcts1 = MCTS(g, n1, args1, eval=True, verbose=True)

def n1p(x):
    return np.random.choice(32 * 32, p=mcts1.getActionProb(x, temp=0))

n2 = NNet(g, gpu_num=0)
n2.load_checkpoint('models_minimax/', 'train_iter_140.pth.tar')
args2 = dotdict({'numMCTSSims': 100, 'cpuct': 1.0})
mcts2 = MCTS(g, n2, args2, eval=True)

def n2p(x):
    return np.random.choice(32 * 32, p=mcts2.getActionProb(x, temp=0))

arena = Arena(n1p, n2p, g, display=display)
print(arena.playGames(2, verbose=True))
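# dotdict, used throughout these snippets for attribute-style access to plain
# dicts, is a small helper from alpha-zero-general's utils; its definition is
# essentially:
class dotdict(dict):
    def __getattr__(self, name):
        return self[name]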
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        time_begin_iter = time.time()
        # bookkeeping
        log.info(f'Starting Iter #{i} ...')

        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                iterationTrainExamples += self.executeEpisode()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            log.warning(
                f"Removing the oldest entry in trainExamples. "
                f"len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        losses = self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        log.info('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
        pwins, nwins, draws, avg_moves = arena.playGames(self.args.arenaCompare)

        self.df_stats = self.log_to_file(
            file=self.log_file, args=self.args, it=i, trainExamples=trainExamples,
            time_begin_iter=time_begin_iter, nwins=nwins, df_stats=self.df_stats,
            nb_model_improv=self.nb_model_improv, avg_nb_moves=avg_moves,
            train_losses=losses)
        self.df_stats.to_feather(
            os.path.join(self.args.log_file_location, f"{self.args.log_run_name}.feather"))

        log.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            log.info('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            log.info('ACCEPTING NEW MODEL')
            self.nb_model_improv += 1
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

            if self.nb_model_improv % self.args.nb_of_new_model_for_random_player == 0:
                # periodically benchmark the best model against a random player
                game_simul = SantoriniGame(5, 4)
                rp = RandomPlayer(game_simul).play
                n_simul = NNet(game_simul, self.nn_args)
                n_simul.load_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
                mcts_simul = MCTS(game_simul, n_simul, self.args)
                n1_simul = lambda x: np.argmax(mcts_simul.getActionProb(x, temp=0))

                arena_simul = Arena(n1_simul, rp, game_simul, display=False)
                nnwins, _, _, avg_nb_moves = arena_simul.playGames(
                    self.args.nb_of_game_agaisnt_random_player, verbose=False)
                self.df_stats = self.log_to_file(
                    file=self.log_file, args=self.args, it=i, trainExamples=trainExamples,
                    time_begin_iter=time_begin_iter, nwins=nwins, df_stats=self.df_stats,
                    nb_model_improv=self.nb_model_improv,
                    nb_game_rdm=self.args.nb_of_game_agaisnt_random_player,
                    nnwins=nnwins, only_random=True, avg_nb_moves=avg_nb_moves)
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        print('------ITER ' + str(i) + '------')

        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            saveSelfPlayTimeLog('------ITER ' + str(i) + '------')
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)
            selfPlayStartTime = time.time()

            if self.multiprocessing:
                pool = Pool(processes=self.cpu, maxtasksperchild=self.maxtasksperchild)
                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                    re = pool.starmap(selfPlay, [(str(eps), self.args)])
                    iterationTrainExamples += re[0]
                    gc.collect()
                pool.close()
                pool.join()
            else:
                for eps in range(self.args.numEps):
                    selfStartTime = time.time()
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                    re = self.executeEpisode()
                    iterationTrainExamples += re
                    print('Episode ', eps, ' eps cost time = %.3f' % (time.time() - selfStartTime), ' sec')
                    saveSelfPlayTimeLog('Episode ' + str(eps) +
                                        ' eps cost time = %.3f' % (time.time() - selfStartTime) + ' sec')

            print('SelfPlay total cost time = %.3f' % (time.time() - selfPlayStartTime), ' sec')
            saveSelfPlayTimeLog('SelfPlay total cost time = %.3f' % (time.time() - selfPlayStartTime) + ' sec')

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)

        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        saveStart = time.time()
        self.saveTrainExamples(i - 1)
        saveEnd = time.time()
        self.saveTimeLog(i, saveEnd - saveStart)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
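# pool.starmap above assumes a module-level selfPlay function (bound methods do
# not pickle cleanly under multiprocessing). A minimal sketch of the assumed
# signature; the body here is hypothetical and only illustrates the contract:
def selfPlay(eps_id, args):
    game = Game()                      # each worker builds its own game,
    nnet = nn(game)                    # network, and search tree
    nnet.load_checkpoint(folder=args.checkpoint, filename='temp.pth.tar')
    mcts = MCTS(game, nnet, args)
    return executeEpisode(game, nnet, mcts, args)  # hypothetical helper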
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    # Generate a fixed sensing matrix if the option is toggled to True:
    # 1) A is fixed. Also set arena_game_args.sensing_matrix equal to that of
    #    coach.game_args so the arena uses the same sensing matrix.
    # 2) The folder which saves the fixed sensing matrix is empty.
    if self.args['fixed_matrix'] == True:
        if self.args['load_existing_matrix'] == True:
            self.game_args.sensing_matrix = np.load(self.args['fixed_matrix_filepath'] + '/sensing_matrix.npy')
            self.arena_game_args.sensing_matrix = np.load(self.args['fixed_matrix_filepath'] + '/sensing_matrix.npy')
        else:
            # not loading an existing matrix from self.args['fixed_matrix_filepath'],
            # so generate a new sensing matrix of the given type self.args['matrix_type']
            self.game_args.generateSensingMatrix(self.args['m'], self.args['n'], self.args['matrix_type'])
            self.arena_game_args.sensing_matrix = self.game_args.sensing_matrix
            # save the fixed matrix
            self.game_args.save_Matrix(self.args['fixed_matrix_filepath'])

    for i in range(1, self.args['numIters'] + 1):
        print('------ITER ' + str(i) + '------')

        # self.skipFirstSelfPlay defaults to False; it is set to True when training
        # is loaded from file, which lets us load the latest nn_model together with
        # the latest set of train examples.
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args['maxlenOfQueue'])

            # bookkeeping objects contained in pytorch_classification.utils
            eps_time = AverageMeter()
            bar = Bar('Self Play', max=self.args['numEps'])
            end = time.time()

            # IMPORTANT: generate a new A and a new y here; each self-play game
            # has a different A and y.
            for eps in range(self.args['numEps']):
                # initialize a new game by setting A, x, y, then execute a single
                # game of self-play with self.executeEpisode()
                if self.args['fixed_matrix'] == False:
                    # repeatedly generate sensing matrices if the matrix is not fixed
                    self.game_args.generateSensingMatrix(self.args['m'], self.args['n'], self.args['matrix_type'])
                # generate a new observed vector y (this assumes a matrix has been
                # loaded in self.game_args!)
                self.game_args.generateNewObsVec(self.args['x_type'], self.args['sparsity'])
                # create a new search tree for each game we play
                self.mcts = MCTS(self.game, self.nnet, self.args, self.game_args, self.skip_nnet)
                # play a new game with the newly generated y; iterationTrainExamples
                # is a deque containing the states of each generated self-play game
                iterationTrainExamples += self.executeEpisode()

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1, maxeps=self.args['numEps'], et=eps_time.avg,
                    total=bar.elapsed_td, eta=bar.eta_td)
                bar.next()
            bar.finish()

            # save the iteration examples to the history; trainExamplesHistory is a
            # list of deques, where each deque contains all the states from numEps
            # self-play games
            self.trainExamplesHistory.append(iterationTrainExamples)

        # jump to here on the first iteration if an existing file was loaded into
        # self.trainExamplesHistory by the loadTrainExamples method
        if len(self.trainExamplesHistory) > self.args['numItersForTrainExamplesHistory']:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)

        # backup history to a file in the self.args['checkpoint'] folder;
        # the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training: flatten the list of deques in
        # trainExamplesHistory into a single shuffled list of training samples
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # The Arena --------------------------------------------------------
        if self.args['Arena'] == True:
            # copy the old neural network into the new one
            self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='temp')
            self.pnet.load_checkpoint(folder=self.args['network_checkpoint'], filename='temp')

            # convert trainExamples into a format recognizable by the network and
            # train; the weights of self.nnet are updated here
            trainExamples = self.nnet.constructTraining(trainExamples)
            self.nnet.train(trainExamples[0], trainExamples[1])

            # pit the two networks self.pnet and self.nnet in the arena. Arena pits
            # pnet against nnet while the Game_args A and y change constantly. Next
            # iteration, arena references a new object, so the old one is garbage
            # collected once unreferenced.
            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(self.pnet, self.nnet, self.game, self.args, self.arena_game_args)
            pwins, nwins, draws = arena.playGames()

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args['updateThreshold']:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args['network_checkpoint'], filename='temp')
            else:
                # saves the weights (.h5) and model (.json) twice: creates
                # nnet_checkpoint(i-1)_model.json and nnet_checkpoint(i-1)_weights.h5,
                # and rewrites best_model.json and best_weights.h5
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='nnet_checkpoint' + str(i - 1))
                self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='best')
        # ------------------------------------------------------------------
        else:
            # without the Arena we just train the network, rewrite best, and write a
            # new 'nnet_checkpoint' + str(i-1) file
            print('TRAINING NEW NEURAL NETWORK...')
            trainExamples = self.nnet.constructTraining(trainExamples)
            self.nnet.train(trainExamples[0], trainExamples[1],
                            folder=self.args['network_checkpoint'],
                            filename='trainHistDict' + str(i - 1))
            self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='nnet_checkpoint' + str(i - 1))
            self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='best')
def learn(self):
    # Generate or load a matrix if fixed_matrix is True. We keep a Game_args
    # object in Coach in case A is fixed, so that when we initialize multiple
    # MCTS objects below we do not store multiple copies of A.
    if self.args['fixed_matrix'] == True:
        if self.args['load_existing_matrix'] == True:
            self.game_args.sensing_matrix = np.load(self.args['fixed_matrix_filepath'] + '/sensing_matrix.npy')
        else:
            self.game_args.generateSensingMatrix(self.args['m'], self.args['n'], self.args['matrix_type'])
            self.game_args.save_Matrix(self.args['fixed_matrix_filepath'])

    # keep track of learning time
    learning_start = time.time()

    # start training iterations
    for i in range(1, self.args['numIters'] + 1):
        print('------ITER ' + str(i) + '------')

        # if we are not loading a set of training data:
        if not self.skipFirstSelfPlay or i > 1:
            # 1) initialize an empty deque for storing the training data produced
            #    by every episode in the iteration
            iterationTrainExamples = deque([], maxlen=self.args['maxlenOfQueue'])

            # 2) start the search. A single search consists of a synchronous search
            #    over ALL episodes in the current batch, so the number of MCTS trees
            #    maintained at once equals the number of episodes per batch.
            for j in range(self.args['num_batches']):
                iterationTrainExamples += self.playAllGames(self.args['eps_per_batch'])

            # add the training samples generated in this iteration to
            # self.trainExamplesHistory (last statement of the self-play block)
            self.trainExamplesHistory.append(iterationTrainExamples)

        # jump to here if skipFirstSelfPlay is True and i <= 1. Once
        # iterationTrainExamples is complete, it is used to retrain the network.
        if len(self.trainExamplesHistory) > self.args['numItersForTrainExamplesHistory']:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)

        # save trainExamplesHistory
        self.saveTrainExamples(i - 1)

        # move all training samples from trainExamplesHistory into trainExamples
        # and shuffle them
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # The Arena --------------------------------------------------------
        if self.args['Arena'] == True:
            # copy the old neural network into the new one
            self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='temp')
            self.pnet.load_checkpoint(folder=self.args['network_checkpoint'], filename='temp')

            # convert trainExamples into a format recognizable by the network and
            # train; the weights of self.nnet are updated here
            trainExamples = self.nnet.constructTraining(trainExamples)
            self.nnet.train(trainExamples[0], trainExamples[1])

            # pit the two networks self.pnet and self.nnet in the arena; the
            # Game_args A and y change constantly during the match
            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(self.pnet, self.nnet, self.game, self.args, self.arena_game_args)
            pwins, nwins, draws = arena.playGames()

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args['updateThreshold']:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args['network_checkpoint'], filename='temp')
            else:
                # saves the weights (.h5) and model (.json) twice: creates
                # nnet_checkpoint(i-1)_model.json and nnet_checkpoint(i-1)_weights.h5,
                # and rewrites best_model.json and best_weights.h5
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='nnet_checkpoint' + str(i - 1))
                self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='best')
        # ------------------------------------------------------------------
        else:
            # without the Arena we just train the network, rewrite best, and write a
            # new 'nnet_checkpoint' + str(i-1) file
            print('TRAINING NEW NEURAL NETWORK...')
            trainExamples = self.nnet.constructTraining(trainExamples)
            self.nnet.train(trainExamples[0], trainExamples[1],
                            folder=self.args['network_checkpoint'],
                            filename='trainHistDict' + str(i - 1))
            self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='nnet_checkpoint' + str(i - 1))
            self.nnet.save_checkpoint(folder=self.args['network_checkpoint'], filename='best')

    # compute total time to run AlphaZero
    learning_end = time.time()
    print('----------TRAINING COMPLETE----------')
    print('Total training time: ', learning_end - learning_start)
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        print('------ITER ' + str(i) + '------')

        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)
            tracker = ParallelRuntimes(self.args.mcts_workers)
            bar = Bar('Self Play', max=self.args.numEps)

            # multiprocess self-play
            proccesses = []
            work_queue = mp.Queue()
            done_queue = mp.Queue()

            print("[Master] Spawning Workers...")
            # spawn workers
            for ep in range(self.args.mcts_workers):
                tup = (work_queue, done_queue, ep)
                proc = mp.Process(target=self.coach_worker, args=tup)
                proc.start()
                proccesses.append(proc)

            print("[Master] Adding work...")
            # add work to the queue
            for eps in range(self.args.numEps):
                data = dict()
                data["i"] = eps
                data["game"] = copy.deepcopy(self.game)
                work_queue.put(data)

            print("[Master] Waiting for results...")
            # wait for results to come in
            for ep in range(self.args.numEps):
                runtime, examples = done_queue.get()

                # drop most draws: keep a drawn game only with probability
                # (1 - filter_draw_rate)
                to_add = False
                loss_rate = self.args.filter_draw_rate
                if abs(examples[0][2]) != 1:
                    if random.random() >= loss_rate:
                        to_add = True
                else:
                    to_add = True
                if to_add:
                    iterationTrainExamples += examples

                tracker.update(runtime)
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=ep + 1, maxeps=self.args.numEps, et=tracker.avg(),
                    total=bar.elapsed_td, eta=tracker.eta(ep + 1, self.args.numEps))
                bar.next()

            print("[Master] Killing workers...")
            # kill workers
            for p in proccesses:
                p.terminate()
                p.join()

            print("[Master] iter={} adding {} examples".format(i, len(iterationTrainExamples)))
            self.trainExamplesHistory.append(iterationTrainExamples)
            bar.finish()

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        # (normal network, don't use parallel code)
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(copy.deepcopy(self.game), self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(copy.deepcopy(self.game), self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION (player1 = previous, player2 = new)')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                      self.game, num_workers=self.args.mcts_workers)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

        # load so all nnets are updated accordingly
        self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
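# The master loop above assumes a coach_worker method that pulls job dicts from
# work_queue and pushes (runtime, examples) tuples onto done_queue. A minimal
# sketch of such a worker (an assumption -- the real implementation is not
# shown here):
def coach_worker(self, work_queue, done_queue, worker_id):
    while True:                                  # terminated by the master
        data = work_queue.get()                  # blocks until work arrives
        start = time.time()
        self.game = data["game"]
        self.mcts = MCTS(self.game, self.nnet, self.args)  # fresh search tree
        examples = self.executeEpisode()
        done_queue.put((time.time() - start, examples))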
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        print('------ITER ' + str(i) + '------')
        print(str(self.game.innerN) + "x" + str(self.game.innerM))

        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            eps_time = AverageMeter()
            bar = Bar('Self Play', max=self.args.numEps)
            end = time.time()

            for eps in range(self.args.numEps):
                self.mcts = MCTS(self.nnet, self.args)  # reset search tree
                iterationTrainExamples += self.executeEpisode()

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                    total=bar.elapsed_td, eta=bar.eta_td)
                bar.next()
            bar.finish()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        tempfile = 'temp.pth.tar'
        bestfile = 'best.pth.tar'

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=tempfile)
        self.nnet.train(trainExamples)

        if self.arenaEnabled:
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)
            pmcts = MCTS(self.pnet, self.args)
            nmcts = MCTS(self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x, y: pmcts.getActionProb(x, y, temp=0),
                          lambda x, y: nmcts.getActionProb(x, y, temp=0), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=bestfile)
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in tqdm(range(1, self.args.numIters + 1), desc='Iteration'):
        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            for eps in tqdm(range(self.args.numEps), desc='mcts.Episode'):
                iterationTrainExamples += self.executeEpisode()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples, self.writer)
        self.writer.set_step(i - 1, "learning")
        nmcts = MCTS(self.game, self.nnet, self.args)

        print("PITTING AGAINST METRIC COMPONENTS")
        for metric_opponent in self.args.metric_opponents:
            arena = Arena(lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          metric_opponent(self.game).play, self.game)
            nwins, owins, draws = arena.playGames(self.args.metricArenaCompare)
            print('%s WINS : %d / %d ; DRAWS : %d' % (metric_opponent.__name__, nwins, owins, draws))
            if nwins + owins == 0:
                win_prct = 0
            else:
                win_prct = float(nwins) / (nwins + owins)
            self.writer.add_scalar('{}_win'.format(metric_opponent.__name__), win_prct)
            # reset nmcts between opponents
            nmcts = MCTS(self.game, self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)
        if nwins + pwins == 0:
            win_prct = 0
        else:
            win_prct = float(nwins) / (nwins + pwins)
        self.writer.add_scalar('self_win', win_prct)

        # calculate Elo score for self-play; flip signs so that positive results
        # are wins for the new network
        results = [-x for x in arena.get_results()]
        nelo, pelo = elo(self.elo, self.elo, results)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.elo = pelo
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.elo = nelo
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
        self.writer.add_scalar('self_elo', self.elo)
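# elo(...) above is an external helper, not a standard library call. A minimal
# sketch of a two-player Elo update over a list of game results (+1 = new net
# won, -1 = lost, 0 = draw), using the usual logistic expectation and an
# assumed K-factor of 32:
def elo(new_rating, prev_rating, results, k=32):
    for r in results:
        # expected score of the new network against the previous one
        expected_new = 1.0 / (1.0 + 10 ** ((prev_rating - new_rating) / 400.0))
        score_new = {1: 1.0, 0: 0.5, -1: 0.0}[r]
        new_rating += k * (score_new - expected_new)
        prev_rating += k * ((1.0 - score_new) - (1.0 - expected_new))
    return new_rating, prev_rating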