class GamePlay(object):
    def __init__(self, policy_net_path, value_net_path):
        self.mcts = MCTS(policy_net_path, value_net_path, time_limit=20)

    def play(self, game):
        self.mcts.set_game(game)
        return self.mcts.start()
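# A minimal usage sketch for GamePlay; the checkpoint paths and the Game
# class below are placeholders for illustration, not names taken from the
# snippet above.
player = GamePlay('models/policy.pth', 'models/value.pth')
game = Game()                  # hypothetical game object holding the position to search from
best_move = player.play(game)  # runs MCTS (up to time_limit=20 seconds) and returns its move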
mcts2 = MCTS(g, n2, args2)
n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

for p in range(1, 5):
    print("iter:%d" % p)
    for i in range(70, 1, -1):
        n1 = NNet(g, argsNN)
        try:
            n1.load_checkpoint('/content/drive/My Drive/model/', 'checkpoint_%d.pth.tar' % i)
        except:
            print("no model:%d" % i)
            continue
        args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
        mcts1 = MCTS(g, n1, args1)
        n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))  # exploitation
        for j in range(70, 1, -1):
            if i <= j:
                continue
            n2 = NNet(g, argsNN)
            try:
                n2.load_checkpoint('/content/drive/My Drive/model/', 'checkpoint_%d.pth.tar' % j)
            except:
                print("no model:%d" % j)
                continue
            args2 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
            mcts2 = MCTS(g, n2, args2)
            n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))
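            # Sketch of a plausible continuation for this pairing loop: pit the
            # two checkpoint agents, following the Arena usage in the pit
            # scripts below. Arena.Arena, display and the game count are
            # assumptions, not part of the original fragment.
            arena = Arena.Arena(n1p, n2p, g, display=display)
            oneWon, twoWon, draws = arena.playGames(40, verbose=False)
            print('checkpoint %d vs %d : %d / %d / %d' % (i, j, oneWon, twoWon, draws))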
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor network
        self.args = args
        # self.mcts = MCTS(self.game, self.nnet, self.args)
        self.mcts = MCTS(self.nnet, self.args)
        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()
        self.arenaEnabled = args.arena == "true"

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game ends,
        the outcome of the game is used to assign values to each example in
        trainExamples.

        It uses temp=1 if episodeStep < tempThreshold, and thereafter temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        self.game.reset_game()
        self.curPlayer = 1
        episodeStep = 0
        players = []

        while True:
            players.append(self.curPlayer)
            episodeStep += 1
            # canonicalBoard = self.game.getCanonicalForm()
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(self.game, self.curPlayer, temp=temp)
            sym = self.game.getSymmetries(pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p])
            # trainExamples.append([self.game.boxes, self.curPlayer, pi, None])

            # random choice based on the pi array as weights
            action = np.random.choice(len(pi), p=pi)
            self.curPlayer = self.game.getNextState(self.curPlayer, action)

            r = self.game.getGameEnded()

            if r != 0:
                # print(r)
                # print(self.curPlayer)
                # for _, p, _ in trainExamples[::-8]:
                #     print("Last Player: {} Winner: {} Score: {} Player Example {} \t Value {}".format(
                #         self.curPlayer, r, self.game.score, p, r * p))
                # return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]
                return [(x[0], x[2], r * x[1]) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains the neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """
        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            print(str(self.game.innerN) + "x" + str(self.game.innerM))
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                for eps in range(self.args.numEps):
                    # self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                    self.mcts = MCTS(self.nnet, self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                        total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            tempfile = 'temp.pth.tar'
            bestfile = 'best.pth.tar'

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=tempfile)
            self.nnet.train(trainExamples)

            if self.arenaEnabled:
                self.pnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)

                pmcts = MCTS(self.pnet, self.args)
                nmcts = MCTS(self.nnet, self.args)

                print('PITTING AGAINST PREVIOUS VERSION')
                # arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                #               lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
                arena = Arena(lambda x, y: pmcts.getActionProb(x, y, temp=0),
                              lambda x, y: nmcts.getActionProb(x, y, temp=0), self.game)
                pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

                print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
                if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                    print('REJECTING NEW MODEL')
                    self.nnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)
                else:
                    print('ACCEPTING NEW MODEL')
                    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=bestfile)

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder, self.args.load_file)
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
from MCTS import MCTS
from connect4.Connect4Game import Connect4Game, display
from connect4.Connect4Players import HumanConnect4Player
from connect4.tensorflows.NNet import NNetWrapper as NNet
from utils import dotdict
import numpy as np

if __name__ == '__main__':
    goingFirst = True
    folder = "H:\\alpha-zero-trained\\final\\h2\\mcts_visits_tanh\\default\\1\\"

    game = Connect4Game()
    nn = NNet(game)
    nn.load_checkpoint(folder, 'best.pth.tar')
    args = dotdict({'numMCTSSims': 25, 'cpuct': 1})
    mcts1 = MCTS(game, nn, args)
    AI = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))
    human = HumanConnect4Player(game).play

    # players[curPlayer + 1]: index 0 is player -1, index 2 is player +1
    if goingFirst:
        players = [AI, None, human]
    else:
        players = [human, None, AI]

    curPlayer = 1
    board = game.getInitBoard()
    while game.getGameEnded(board, curPlayer) == 0:
        display(board, symbols=True)
        action = players[curPlayer + 1](game.getCanonicalForm(board, curPlayer))
        board, curPlayer = game.getNextState(board, curPlayer, action)
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor network
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game ends,
        the outcome of the game is used to assign values to each example in
        trainExamples.

        It uses temp=1 if episodeStep < tempThreshold, and thereafter temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains the neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """
        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                        total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
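# A tiny illustration of the value assignment used in executeEpisode above:
# r is the result from the perspective of self.curPlayer (the player to move
# when the game ended), and each stored example flips the sign if it was
# recorded from the other player's perspective. The concrete values below are
# made up for demonstration.
r = 1            # the player to move at game end won
curPlayer = -1   # the player to move when the game ended
for examplePlayer in (1, -1):
    v = r * ((-1) ** (examplePlayer != curPlayer))
    print(examplePlayer, v)  # prints: 1 -1 (opponent, flipped), then -1 1 (same perspective)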
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.board = game.getInitBoard()
        self.nnet = nnet
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game ends,
        the outcome of the game is used to assign values to each example in
        trainExamples.

        It uses temp=1 if episodeStep < tempThreshold, and thereafter temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        self.board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(self.board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            self.board, self.curPlayer = self.game.getNextState(self.board, self.curPlayer, action)

            r = self.game.getGameEnded(self.board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains the neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """
        trainExamples = deque([], maxlen=self.args.maxlenOfQueue)
        for i in range(self.args.numIters):
            # bookkeeping
            print('------ITER ' + str(i + 1) + '------')
            eps_time = AverageMeter()
            bar = Bar('Self Play', max=self.args.numEps)
            end = time.time()

            for eps in range(self.args.numEps):
                trainExamples += self.executeEpisode()

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                    total=bar.elapsed_td, eta=bar.eta_td)
                bar.next()
            bar.finish()

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            pnet = self.nnet.__class__(self.game)
            pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            pmcts = MCTS(self.game, pnet, self.args)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : ' + str(nwins) + '/' + str(pwins))
            if float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet = pnet
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='checkpoint_' + str(i) + '.pth.tar')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
            self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, white_nnet, black_nnet, args):
        self.game = game
        self.white_nnet = white_nnet
        self.black_nnet = black_nnet
        self.white_pnet = self.white_nnet.__class__(self.game)  # the competitor network
        self.black_pnet = self.black_nnet.__class__(self.game)
        self.args = args
        self.mcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)
        # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.trainExamplesHistory_white = []
        self.trainExamplesHistory_black = []

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game ends,
        the outcome of the game is used to assign values to each example in
        trainExamples.

        It uses temp=1 if episodeStep < tempThreshold, and thereafter temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples_white = []
        trainExamples_black = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            # print("turn " + str(episodeStep))
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)
            try:
                pi = self.mcts.getActionProb(canonicalBoard, self.curPlayer, temp=temp)
            except ZeroDivisionError:
                print("ZeroDivisionError while building training example; continuing with the next episode")
                return [], []
            sym = self.game.getSymmetries(canonicalBoard, pi, canonicalBoard.king_position)
            player_train_examples = trainExamples_white if self.curPlayer == Player.white else trainExamples_black
            for b, p, scalar_values in sym:
                player_train_examples.append([b, self.curPlayer, p, scalar_values])

            action = np.random.choice(len(pi), p=pi)
            if action == 0:
                print(pi)
            board.print_game_over_reason = False
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)
            board.print_game_over_reason = False

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                # if board.outcome == Outcome.black:
                #     print(" black wins")
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer)), x[3]) for x in trainExamples_white], \
                       [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer)), x[3]) for x in trainExamples_black]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains the neural networks with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """
        self.game.prune_prob = self.args.prune_starting_prob
        train_black = self.args.train_black_first
        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.args.skip_first_self_play or i > 1:
                iterationTrainExamples_white = deque([], maxlen=self.args.maxlenOfQueue)
                iterationTrainExamples_black = deque([], maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                if self.args.profile_coach:
                    prof = cProfile.Profile()
                    prof.enable()

                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)  # reset search tree
                    white_examples, black_examples = self.executeEpisode()
                    iterationTrainExamples_white += white_examples
                    iterationTrainExamples_black += black_examples

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                        total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()

                if self.args.profile_coach:
                    prof.disable()
                    prof.print_stats(sort=2)

                # save the iteration examples to the history
                self.trainExamplesHistory_white.append(iterationTrainExamples_white)
                self.trainExamplesHistory_black.append(iterationTrainExamples_black)

            while len(self.trainExamplesHistory_white) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory_white),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory_white.pop(0)
                self.trainExamplesHistory_black.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # training new network, keeping a copy of the old one
            self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
            self.white_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            self.black_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
            pmcts = MCTS(self.game, self.white_pnet, self.black_pnet, self.args)

            if not self.args.train_both:
                if train_black:
                    # shuffle examples before training
                    trainExamples = []
                    for e in self.trainExamplesHistory_black:
                        trainExamples.extend(e)
                    shuffle(trainExamples)
                    self.black_nnet.train(trainExamples)
                else:
                    # shuffle examples before training
                    trainExamples = []
                    for e in self.trainExamplesHistory_white:
                        trainExamples.extend(e)
                    shuffle(trainExamples)
                    self.white_nnet.train(trainExamples)
            else:
                # shuffle examples before training
                trainExamples = []
                for e in self.trainExamplesHistory_black:
                    trainExamples.extend(e)
                shuffle(trainExamples)
                self.black_nnet.train(trainExamples)

                trainExamples = []
                for e in self.trainExamplesHistory_white:
                    trainExamples.extend(e)
                shuffle(trainExamples)
                self.white_nnet.train(trainExamples)

            nmcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda board, turn_player: np.argmax(pmcts.getActionProb(board, turn_player, temp=0)),
                          lambda board, turn_player: np.argmax(nmcts.getActionProb(board, turn_player, temp=0)),
                          self.game)
            pwins, nwins, draws, pwins_white, pwins_black, nwins_white, nwins_black \
                = arena.playGames(self.args.arenaCompare, self.args.profile_arena)

            print('NEW/PREV WINS (white, black) : (%d,%d) / (%d,%d) ; DRAWS : %d'
                  % (nwins_white, nwins_black, pwins_white, pwins_black, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold \
                    or nwins_black < pwins_black or nwins_white < pwins_white:
                print('REJECTING NEW MODEL')
                if not self.args.train_both:
                    if train_black:
                        self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
                    else:
                        self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
                else:
                    self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
                    self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                if not self.args.train_both:
                    if train_black:
                        # self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.black))
                        self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar')
                        # if nwins_white == 0 or nwins_black / nwins_white >= self.args.train_other_network_threshold:
                        #     train_black = False
                        print("training white neural net next")
                        train_black = False
                    else:
                        # self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.white))
                        self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar')
                        # if nwins_black == 0 or nwins_white / nwins_black > self.args.train_other_network_threshold:
                        #     train_black = True
                        print("training black neural net next")
                        train_black = True
                else:
                    self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar')
                    self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar')

            self.game.prune_prob += self.args.prune_prob_gain_per_iteration
            self.args.arenaCompare = math.floor(self.args.arenaCompare * 1.05)
            # self.args.numEps = math.floor(self.args.numEps * 1.1)
            self.args.numMCTSSims = math.floor(self.args.numMCTSSims * 1.1)
            print("prune probability: " + str(self.game.prune_prob)
                  + ", episodes: " + str(self.args.numEps)
                  + ", sims: " + str(self.args.numMCTSSims)
                  + ", arena compare: " + str(self.args.arenaCompare))

    def getCheckpointFile(self, iteration, player=None):
        return ('checkpoint_'
                + ('white_' if player == Player.white else 'black_' if player == Player.black else '')
                + str(iteration) + '.pth.tar')

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename_white = os.path.join(folder, "training_white.examples")
        filename_black = os.path.join(folder, "training_black.examples")
        with open(filename_white, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory_white)
        with open(filename_black, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory_black)

    def loadTrainExamples(self):
        folder = self.args.checkpoint
        filename_white = os.path.join(folder, "training_white.examples")
        filename_black = os.path.join(folder, "training_black.examples")
        if not os.path.isfile(filename_white) or not os.path.isfile(filename_black):
            print(filename_white)
            print(filename_black)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(filename_white, "rb") as f:
                self.trainExamplesHistory_white = Unpickler(f).load()
            with open(filename_black, "rb") as f:
                self.trainExamplesHistory_black = Unpickler(f).load()
            # examples based on the model were already collected (loaded)

    def load_expert_examples(self):
        white, black = read_data(self.args)
        self.trainExamplesHistory_white.extend(white)
        self.trainExamplesHistory_black.extend(black)
import numpy as np

from Shared.Functions import xy_to_index
# BLACK and MCTS are assumed to come from this project's own modules.


class Model:
    def eval(self, board):
        # dummy network: uniform policy over 26 moves, constant value
        P = np.ones(26) / 26
        V = 0.1
        return P, V


model = Model()
boards = np.array([[[0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0],
                    [0, 0, 0, 0, 0]],
                   [[0, 0, 0, 0, 0],
                    [0, -1, 0, 0, 0],
                    [0, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0],
                    [0, 0, 0, 0, 0]]])
P = np.ones(26) / 26
player = BLACK

mcts = MCTS(model, player, start_boards=boards)
pi = mcts.search_for_pi(iterations=250)
print(pi)
move = np.random.choice(len(pi), p=pi)
mcts.set_move(move)
print(move)
pi = mcts.search_for_pi(iterations=250)
print(pi)
def learn(self):
    """
    Performs numIters iterations with numEps episodes of self-play in each
    iteration. After every iteration, it retrains the neural network with
    examples in trainExamples (which has a maximum length of maxlenOfQueue).
    It then pits the new neural network against the old one and accepts it
    only if it wins >= updateThreshold fraction of games.
    """
    for i in range(1, self.args.numIters + 1):
        # bookkeeping
        log.info(f'Starting Iter #{i} ...')
        # examples of the iteration
        if not self.skipFirstSelfPlay or i > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                iterationTrainExamples += self.executeEpisode()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            log.warning(
                f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(i - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        log.info('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        log.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
            log.info('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
        else:
            log.info('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
def evaluate_trained(self):
    if self.verbose >= 1:
        print("Evaluate")
    balance = 0  # + if trained won more, - if best won more

    # best begins
    for i in range(EVAL_NR // 2):
        if self.verbose >= 2:
            print("i: ", i)
        g = Game()
        nr = 0
        while g.winner == 0:
            if nr % 2 == 0:
                mcts = MCTS(g, self.model_best.evaluate)
            else:
                mcts = MCTS(g, self.model_trained.evaluate)
            x, y, _ = mcts.select_move()
            g.move(x, y)
            nr += 1
        if i == 0 and self.verbose >= 4:
            g.print()
        if g.winner == 1:
            balance -= 1
        elif g.winner == 2:
            balance += 1
        if self.verbose >= 2:
            print("Result:", g.winner)

    # trained begins
    for i in range(EVAL_NR // 2):
        if self.verbose >= 2:
            print("i: ", i)
        g = Game()
        nr = 0
        while g.winner == 0:
            if nr % 2 == 0:
                mcts = MCTS(g, self.model_trained.evaluate)
            else:
                mcts = MCTS(g, self.model_best.evaluate)
            x, y, _ = mcts.select_move()
            g.move(x, y)
            nr += 1
        if i == 0 and self.verbose >= 4:
            g.print()
        if g.winner == 1:
            balance += 1
        elif g.winner == 2:
            balance -= 1
        if self.verbose >= 2:
            print("Result:", g.winner)

    if self.verbose >= 1:
        print("Total result: ", balance, " Thr.: ", THRESHOLD)
    return balance >= THRESHOLD
def self_play(strategy, read_file=None):
    n = PolicyNetwork(use_cpu=True)
    if strategy == 'random':
        instance = RandomPlayer()
    elif strategy == 'policy':
        instance = PolicyNetworkBestMovePlayer(n, read_file)
    elif strategy == 'randompolicy':
        instance = PolicyNetworkRandomMovePlayer(n, read_file)
    elif strategy == 'mcts':
        instance = MCTS(n, read_file)
    else:
        sys.stderr.write("Unknown strategy")
        sys.exit()
    # instance is the neural-network-backed player
    gtp_engine = gtp_lib.Engine(instance)
    sys.stderr.write("GTP engine ready\n")
    sys.stderr.flush()

    p1 = -1
    save = ''
    n = 500
    while n > 0:
        # black moves on odd counts, white on even counts
        inpt = 'genmove b' if n % 2 == 1 else 'genmove w'
        try:
            cmd_list = inpt.split("\n")
        except:
            cmd_list = [inpt]
        for cmd in cmd_list:
            engine_reply = gtp_engine.send(cmd)
            sys.stdout.write(engine_reply)
            if engine_reply == '= pass\n\n':
                n = 0
            else:
                if len(engine_reply) == 7:
                    o1 = engine_reply[3] + engine_reply[4]
                else:
                    o1 = engine_reply[3]
                tag = 'B' if n % 2 == 1 else 'W'
                # append the move to the SGF body
                save = save + ';' + tag + '[' + ch.change(engine_reply[2]) + ch.change(o1) + ']'
        sys.stdout.flush()
        n = n - 1

    p7 = instance.position.result()
    save2 = '(;GM[1]\n SZ[19]\nPB[go1]\nPW[go2]\nKM[6.50]\nRE[' + p7[0] + ']\n'
    save2 = save2 + save + ')'
    wenjian = str(time.time())  # filename stem ("wenjian" means "file")
    p3 = '4'
    save_t.make_folder(wenjian + '_selfplay')
    save_t.save_txt(wenjian + '_selfplay', p3, save2)
"""Use this script to play manually against a Blooms agent. """ import numpy as np import Arena from MCTS import MCTS from blooms.BloomsGame import BloomsGame from blooms.BloomsPlayers import * from blooms.pytorch.NNet import NNetWrapper as NNet from utils import * # WARNING: The game size and score target should match the chosen agent game = BloomsGame(size=5, score_target=20) human = HumanBloomsPlayer(game).play # WARNING: The chosen agent should match the game size and score target model = NNet(game) model.load_checkpoint('./notebooks/results/chkpts_board5_24hrs', 'best.pth.tar') args = dotdict({'numMCTSSims': 100, 'cpuct': 1.0}) mcts = MCTS(game, model, args) agent = lambda x: np.argmax(mcts.getActionProb(x, temp=0)) arena = Arena.Arena(agent, human, game) print(arena.playGames(2, verbose=True, display=False))
def PlayGame():
    menu_def = [['&File', ['&Do nothing', 'E&xit']],
                ['&Help', '&About...'], ]

    # sg.SetOptions(margins=(0,0))
    sg.ChangeLookAndFeel('GreenTan')
    # create initial board setup
    psg_board = copy.deepcopy(initial_board)
    # the main board display layout
    board_layout = [[sg.T(' ')] + [sg.T('{}'.format(a), pad=((23, 27), 0), font='Any 13') for a in 'efgh']]
    # loop though board and create buttons with images
    for i in range(8):
        row = [sg.T(str(8 - i) + ' ', font='Any 13')]
        for j in range(4):
            piece_image = images[psg_board[i][j]]
            row.append(render_square(piece_image, key=(i, j), location=(i, j)))
        row.append(sg.T(str(8 - i) + ' ', font='Any 13'))
        board_layout.append(row)
    # add the labels across bottom of board
    board_layout.append([sg.T(' ')] + [sg.T('{}'.format(a), pad=((23, 27), 0), font='Any 13') for a in 'efgh'])

    # setup the controls on the right side of screen
    openings = ('Any', 'Defense', 'Attack', 'Trap', 'Gambit', 'Counter',
                'Sicilian', 'English', 'French', 'Queen\'s openings',
                'King\'s Openings', 'Indian Openings')

    board_controls = [[sg.RButton('New Game', key='New Game'), sg.RButton('Draw')],
                      [sg.RButton('Resign Game'), sg.RButton('Set FEN')],
                      [sg.RButton('Player Odds'), sg.RButton('Training')],
                      [sg.Drop(openings), sg.Text('Opening/Style')],
                      [sg.CBox('Play As White', key='_white_')],
                      [sg.Text('Move List')],
                      [sg.Multiline([], do_not_clear=True, autoscroll=True, size=(15, 10), key='_movelist_')],
                      ]

    # layouts for the tabs
    controls_layout = [[sg.Text('Performance Parameters', font='_ 20')],
                       [sg.T('Put stuff like AI engine tuning parms on this tab')]]

    statistics_layout = [[sg.Text('Statistics', font=('_ 20'))],
                         [sg.T('Game statistics go here?')]]

    board_tab = [[sg.Column(board_layout)]]

    # the main window layout
    layout = [[sg.Menu(menu_def, tearoff=False)],
              [sg.TabGroup([[sg.Tab('Board', board_tab),
                             sg.Tab('Controls', controls_layout),
                             sg.Tab('Statistics', statistics_layout)]], title_color='red'),
               sg.Column(board_controls)],
              [sg.Text('Click anywhere on board for next move', font='_ 14')]]

    window = sg.Window('Chess',
                       default_button_element_size=(12, 1),
                       auto_size_buttons=False,
                       icon='kingb.ico').Layout(layout)

    g = HalfchessGame.HalfchessGame()
    nn = NNet(g)
    nn.load_checkpoint(nn_filepath, nn_filename)
    args = dotdict({'numMCTSSims': numMCTSSims, 'cpuct': cpuct})
    mcts = MCTS(g, nn, args)
    nnp = lambda x: np.argmax(mcts.getActionProb(x, temp=temp))
    board = g.getInitBoard()

    move_count = curPlayer = 1
    move_state = move_from = move_to = 0
    # ---===--- Loop taking in user input --- #
    while g.getGameEnded(board, curPlayer) == 0:
        canonicalBoard = g.getCanonicalForm(board, curPlayer)
        if curPlayer == human:
            # human_player(board)
            move_state = 0
            while True:
                button, value = window.Read()
                if button in (None, 'Exit'):
                    exit()
                if button == 'New Game':
                    sg.Popup('You have to restart the program to start a new game... sorry....')
                    break
                    # unreachable: in-place restart is not supported in this version
                    psg_board = copy.deepcopy(initial_board)
                    redraw_board(window, psg_board)
                    move_state = 0
                    break
                if type(button) is tuple:
                    if move_state == 0:
                        move_from = button
                        row, col = move_from
                        piece = psg_board[row][col]  # get the move-from piece
                        button_square = window.FindElement(key=(row, col))
                        button_square.Update(button_color=('white', 'red'))
                        move_state = 1
                    elif move_state == 1:
                        move_to = button
                        row, col = move_to
                        if move_to == move_from:  # cancelled move
                            color = '#B58863' if (row + col) % 2 else '#F0D9B5'
                            button_square.Update(button_color=('white', color))
                            move_state = 0
                            continue

                        picked_move = '{}{}{}{}'.format('efgh'[move_from[1]], 8 - move_from[0],
                                                        'efgh'[move_to[1]], 8 - move_to[0])
                        action = moveset[picked_move]
                        valids = g.getValidMoves(canonicalBoard, 1)

                        if valids[action] != 0:
                            board, curPlayer = g.getNextState(board, curPlayer, action)
                        else:
                            print('Illegal move')
                            move_state = 0
                            color = '#B58863' if (move_from[0] + move_from[1]) % 2 else '#F0D9B5'
                            button_square.Update(button_color=('white', color))
                            continue

                        psg_board[move_from[0]][move_from[1]] = BLANK  # place blank where piece was
                        psg_board[row][col] = piece  # place piece in the move-to square
                        redraw_board(window, psg_board)
                        move_count += 1

                        window.FindElement('_movelist_').Update(picked_move + '\n', append=True)
                        break
        else:
            best_move = nnp(canonicalBoard)
            move_str = moveset[best_move]
            if curPlayer == -1:
                move_str = HalfchessGame.mirrored_move(move_str)
            from_col = ord(move_str[0]) - ord('e')
            from_row = 8 - int(move_str[1])
            to_col = ord(move_str[2]) - ord('e')
            to_row = 8 - int(move_str[3])

            window.FindElement('_movelist_').Update(move_str + '\n', append=True)

            piece = psg_board[from_row][from_col]
            psg_board[from_row][from_col] = BLANK
            psg_board[to_row][to_col] = piece
            redraw_board(window, psg_board)

            board, curPlayer = g.getNextState(board, curPlayer, best_move)
            move_count += 1

    sg.Popup('Game over!', 'Thank you for playing')
def train(self, iteration=None, board=None, numeps=None):
    # bookkeeping
    # examples of the iteration
    numeps = self.args.numEps
    if not self.skipFirstSelfPlay or iteration > 1:
        iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

        eps_time = AverageMeter()
        bar = Bar('Self Play', max=numeps)
        end = time.time()
        # for clif_state in self.board
        for eps in range(numeps):
            self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
            if board is None:
                iterationTrainExamples += self.executeEpisode()
            else:
                iterationTrainExamples += self.executeEpisode(board)
            # print(iterationTrainExamples[0])

            # bookkeeping + plot progress
            eps_time.update(time.time() - end)
            end = time.time()
            bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                eps=eps + 1, maxeps=numeps, et=eps_time.avg,
                total=bar.elapsed_td, eta=bar.eta_td)
            bar.next()
        bar.finish()

        # save the iteration examples to the history
        self.trainExamplesHistory.append(iterationTrainExamples)

    if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
        print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
              " => remove the oldest trainExamples")
        self.trainExamplesHistory.pop(0)
    # backup history to a file
    # NB! the examples were collected using the model from the previous iteration, so (i-1)
    self.saveTrainExamples(iteration - 1)

    # shuffle examples before training
    trainExamples = []
    for e in self.trainExamplesHistory:
        trainExamples.extend(e)
    shuffle(trainExamples)

    # training new network, keeping a copy of the old one
    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
    self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
    pmcts = MCTS(self.game, self.pnet, self.args)

    self.nnet.train(trainExamples)
    nmcts = MCTS(self.game, self.nnet, self.args)

    print('PITTING AGAINST PREVIOUS VERSION')
    arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                  lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
    pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

    print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
    if pwins + nwins > 0 and float(nwins) / (pwins + nwins) < self.args.updateThreshold:
        print('REJECTING NEW MODEL')
        self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
    else:
        print('ACCEPTING NEW MODEL')
        self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                  filename=self.getCheckpointFile(iteration))
        self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
def __init__(self, game, network, playouts):
    self.mcts = MCTS(game, network, playouts)
    self.gd = GameData()
import numpy as np

# The imports below follow the usual alpha-zero-general layout; adjust the
# module paths if your project differs.
import Arena
from MCTS import MCTS
from othello.OthelloGame import OthelloGame, display
from othello.OthelloPlayers import *
from othello.pytorch.NNet import NNetWrapper as NNet
from utils import *

"""
use this script to play any two agents against each other, or play manually
with any agent.
"""

g = OthelloGame(6)

# all players
rp = RandomPlayer(g).play
gp = GreedyOthelloPlayer(g).play
hp = HumanOthelloPlayer(g).play

# nnet players
n1 = NNet(g)
n1.load_checkpoint('./pretrained_models/', '6x100x25_best.pth.tar')
args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts1 = MCTS(g, n1, args1)
n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))

# n2 = NNet(g)
# n2.load_checkpoint('/dev/8x50x25/', 'best.pth.tar')
# args2 = dotdict({'numMCTSSims': 25, 'cpuct': 1.0})
# mcts2 = MCTS(g, n2, args2)
# n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

arena = Arena.Arena(n1p, hp, g, display=display)
print(arena.playGames(2, verbose=True))
import sys
sys.path.append("../")

from MCTS import MCTS
import numpy as np

game_state = np.zeros((3, 3))
game_state[0, 0] = 1

mcts = MCTS(n_iterations=255, depth=10, exploration_constant=10,
            game_board=game_state, win_mark=3, player='O')
print(mcts._is_terminal(game_state))

# leaf_node_id, depth = mcts.selection()
# print(leaf_node_id, depth)
# child_node_id = mcts.expansion(leaf_node_id)
# print(child_node_id)
# winner = mcts.simulation(child_node_id)
# print(winner)
# mcts.backprop(child_node_id, winner)
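# One hand-driven MCTS cycle, expanding the commented-out calls above
# (selection -> expansion -> simulation -> backprop). The method names and
# signatures are taken from those comments and assume this repo's MCTS
# implementation; they may differ elsewhere.
leaf_node_id, depth = mcts.selection()
child_node_id = mcts.expansion(leaf_node_id)
winner = mcts.simulation(child_node_id)
mcts.backprop(child_node_id, winner)
print(leaf_node_id, child_node_id, winner)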
class Coach:
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__()  # the competitor network
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self, x):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played until it ends. After the game ends,
        the outcome of the game is used to assign values to each example in
        trainExamples.

        It uses temp=1 if episodeStep < tempThreshold, and thereafter temp=0.

        Returns:
            trainExamples: a list of examples of the form (state, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        logger = logging.getLogger("fireplace")
        logger.setLevel(logging.WARNING)
        trainExamples = []
        # print(id(self.game))
        # game = copy.deepcopy(self.game)
        current_game = self.game.getInitGame()
        # self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
        # -> not needed since we get a copy of self.mcts in this child process,
        #    and MCTS itself uses a deep clone of this copy
        self.curPlayer = 1 if current_game.current_player.name == 'Player1' else -1
        episodeStep = 0

        # timing
        # start = time.time()
        while True:
            episodeStep += 1
            # print('---Episode step ' + str(episodeStep) + '--- ' + current_process().name)
            # print('TIME TAKEN : {0:03f}'.format(time.time() - start))
            # start = time.time()
            state = self.game.getState(current_game)  # state is from the current player's perspective
            temp = int(episodeStep < self.args.tempThreshold)
            # TODO: No MCTS reset? Should work because a player switch leads to a new
            # mirror state and info in the tree could possibly be reused. Otherwise
            # starting a new tree could speed things up here (faster lookups through
            # smaller MCTS lists)!
            pi = self.mcts.getActionProb(state, temp=temp)  # pi is from the current player's perspective
            # print(self.mcts)
            pi_reshape = np.reshape(pi, (21, 18))
            # sym = self.game.getSymmetries(state, pi)
            trainExamples.append([state, self.curPlayer, pi, None])  # TODO: Is None still needed?
            # for b, p in sym:
            #     trainExamples.append([b, self.curPlayer, p, None])
            action = np.random.choice(len(pi), p=pi)
            a, b = np.unravel_index(np.ravel(action, np.asarray(pi).shape), pi_reshape.shape)
            next_state, self.curPlayer = self.game.getNextState(self.curPlayer, (a[0], b[0]), current_game)

            # returns 0 if game has not ended, 1 if curPlayer won, -1 if curPlayer lost
            r = self.game.getGameEnded(current_game, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains the neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """
        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # Find and load newest model (name scheme 'x.pth.tar' with highest x)
            modelfile = self.get_newest_model()
            print("Loading newest model:", modelfile)
            nnet.load_checkpoint(args.modelspath, modelfile)
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)
                # with ProcessPoolExecutor(self.args.numThreads) as executor:
                #     results = list(tqdm(executor.map(self.executeEpisode, range(self.args.numEps)),
                #                         total=self.args.numEps, desc='Self-play matches'))
                #     iterationTrainExamples = [r for r in results]
                # with Pool(self.args.numThreads) as pool:
                #     for result in list(tqdm(pool.imap(self.executeEpisode, range(self.args.numEps)),
                #                             total=self.args.numEps, desc='Self-play matches')):
                #         iterationTrainExamples += result
                for result in parallel_process(self.executeEpisode, range(self.args.numEps),
                                               workers=self.args.numThreads, desc='Self-play matches'):
                    iterationTrainExamples += result

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            # if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            #     print("len(trainExamplesHistory) =", len(self.trainExamplesHistory),
            #           " => remove the oldest trainExamples")
            #     self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(modelfile, i - 1)

    def getCheckpointFile(self, modelfile, iteration):
        return modelfile + '_' + str(iteration)

    def saveTrainExamples(self, modelfile, iteration):
        folder = self.args.examplespath
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(modelfile, iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f, protocol=pickle.HIGHEST_PROTOCOL).dump(self.trainExamplesHistory)

    # def loadTrainExamples(self):
    #     modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
    #     examplesFile = modelFile + ".examples"
    #     if not os.path.isfile(examplesFile):
    #         print(examplesFile)
    #         r = input("File with trainExamples not found. Continue? [y|n]")
    #         if r != "y":
    #             sys.exit()
    #     else:
    #         print("File with trainExamples found. Read it.")
    #         with open(examplesFile, "rb") as f:
    #             self.trainExamplesHistory = Unpickler(f).load()
    #         # examples based on the model were already collected (loaded)
    #         self.skipFirstSelfPlay = True

    def list_files(self, directory, extension):
        return (f for f in os.listdir(directory) if f.endswith('.' + extension))

    def get_modelnumber(self, filename):
        file_name = os.path.basename(filename)
        index_of_dot = file_name.index('.')
        file_name_without_extension = file_name[:index_of_dot]
        return int(file_name_without_extension)

    def get_newest_model(self):
        files = self.list_files(args.modelspath, "pth.tar")
        modelnumbers = list(map(self.get_modelnumber, files))
        maximum = max(modelnumbers)
        return str(maximum) + ".pth.tar"
import sqlite3
from datetime import datetime

# NNet, MCTS, AzulGame, dotdict and numpy (np) are assumed to be imported
# from the project's modules, as in the other pit scripts here.

"""
use this script to play any two agents against each other, or play manually
with any agent.
"""

g = AzulGame(shouldRandomize=False)

# all players
n1 = NNet(g)
n2 = NNet(g)
n1.load_checkpoint('./temp/', 'best.pth.tar')
n2.load_checkpoint('./temp/', 'best.pth.tar')
args = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts = MCTS(g, n1, args)
mcts2 = MCTS(g, n2, args)
n1p = lambda x: np.argmax(mcts.getActionProb(x, temp=0))
n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

player1 = n1p
player2 = n2p
numberGames = 1
verbose = False

conn = sqlite3.connect("AzulGamesSite/AzulGameViewer/db.sqlite3")
for _ in range(numberGames):
    curTime = datetime.now().strftime("%m/%d/%y - %H:%M:%S")
    gameID = conn.cursor().execute(
res = {'random': {}, 'abp1': {}, 'abp2': {}, 'abp3': {}}
num = 10
cps = [1, 2, 5, 9, 17, 24, 36, 50, 63, 74, 85, 95, 99]
full_cps = [1, 2, 3, 5, 7, 8, 9, 11, 13, 17, 21, 24, 28, 29, 30, 31, 33, 36, 38,
            39, 40, 41, 42, 44, 48, 50, 57, 59, 60, 61, 63, 67, 68, 69, 71, 72, 73,
            74, 78, 79, 85, 89, 91, 95, 99]
cur_cps = [36, 38, 39, 40, 41, 42, 44, 48, 50, 57, 59, 60, 61, 63, 67, 68, 69, 71,
           72, 73, 74, 78, 79, 85, 89, 91, 95, 99]

for cp in cur_cps:
    n1 = NNet(g)
    n1.load_checkpoint('./pretrained_models/hex/pytorch/temp/', 'Copy of checkpoint_{}.pth.tar'.format(cp))
    args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
    mcts = MCTS(g, n1, args1)
    azp = lambda x, player: np.argmax(mcts.getActionProb(x, player, temp=0))

    arena = Arena.Arena(azp, rp.play, g, display=display)
    print('=========== playing check point {} vs {} ==========='.format(cp, 'random'))
    az_won, rp_won, draws = arena.playGames(num, verbose=True)
    print((az_won, rp_won, draws))
    total_turn = arena.total_turn
    print('sim count MCTS all', mcts.sim_count,
          'avg game', mcts.sim_count / num,
          'avg turn', mcts.sim_count / total_turn)
    res['random'][cp] = (az_won, num)

    for depth in [1, 2]:
        player = abps[depth]
        player.sim_count = 0
def learn(self): """ Performs numIters iterations with numEps episodes of self-play in each iteration. After every iteration, it retrains neural network with examples in trainExamples (which has a maximium length of maxlenofQueue). It then pits the new neural network against the old one and accepts it only if it wins >= updateThreshold fraction of games. """ self.game.prune_prob = self.args.prune_starting_prob train_black = self.args.train_black_first for i in range(1, self.args.numIters+1): # bookkeeping print('------ITER ' + str(i) + '------') # examples of the iteration if not self.args.skip_first_self_play or i>1: iterationTrainExamples_white = deque([], maxlen=self.args.maxlenOfQueue) iterationTrainExamples_black = deque([], maxlen=self.args.maxlenOfQueue) eps_time = AverageMeter() bar = Bar('Self Play', max=self.args.numEps) end = time.time() if self.args.profile_coach: prof = cProfile.Profile() prof.enable() for eps in range(self.args.numEps): self.mcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args) # reset search tree white_examples, black_examples = self.executeEpisode() iterationTrainExamples_white += white_examples iterationTrainExamples_black += black_examples # bookkeeping + plot progress eps_time.update(time.time() - end) end = time.time() bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg, total=bar.elapsed_td, eta=bar.eta_td) bar.next() bar.finish() if self.args.profile_coach: prof.disable() prof.print_stats(sort=2) # save the iteration examples to the history self.trainExamplesHistory_white.append(iterationTrainExamples_white) self.trainExamplesHistory_black.append(iterationTrainExamples_black) while len(self.trainExamplesHistory_white) > self.args.numItersForTrainExamplesHistory: print("len(trainExamplesHistory) =", len(self.trainExamplesHistory_white), " => remove the oldest trainExamples") self.trainExamplesHistory_white.pop(0) self.trainExamplesHistory_black.pop(0) # backup history to a file # NB! 
the examples were collected using the model from the previous iteration, so (i-1) self.saveTrainExamples(i-1) # training new network, keeping a copy of the old one self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar') self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar') self.white_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar') self.black_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar') pmcts = MCTS(self.game, self.white_pnet, self.black_pnet, self.args) if not self.args.train_both: if train_black: # shuffle examples before training trainExamples = [] for e in self.trainExamplesHistory_black: trainExamples.extend(e) shuffle(trainExamples) self.black_nnet.train(trainExamples) else: # shuffle examples before training trainExamples = [] for e in self.trainExamplesHistory_white: trainExamples.extend(e) shuffle(trainExamples) self.white_nnet.train(trainExamples) else: # shuffle examples before training trainExamples = [] for e in self.trainExamplesHistory_black: trainExamples.extend(e) shuffle(trainExamples) self.black_nnet.train(trainExamples) # shuffle examples before training trainExamples = [] for e in self.trainExamplesHistory_white: trainExamples.extend(e) shuffle(trainExamples) self.white_nnet.train(trainExamples) nmcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args) print('PITTING AGAINST PREVIOUS VERSION') arena = Arena(lambda board, turn_player: np.argmax(pmcts.getActionProb(board, turn_player, temp=0)), lambda board, turn_player: np.argmax(nmcts.getActionProb(board, turn_player, temp=0)), self.game) pwins, nwins, draws, pwins_white, pwins_black, nwins_white, nwins_black \ = arena.playGames(self.args.arenaCompare, self.args.profile_arena) print('NEW/PREV WINS (white, black) : (%d,%d) / (%d,%d) ; DRAWS : %d' % (nwins_white, nwins_black, pwins_white, pwins_black, draws)) if pwins+nwins == 0 or float(nwins)/(pwins+nwins) < self.args.updateThreshold \ or nwins_black < pwins_black or nwins_white < pwins_white: print('REJECTING NEW MODEL') if not self.args.train_both: if train_black: self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar') else: self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar') else: self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar') self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar') else: print('ACCEPTING NEW MODEL') if not self.args.train_both: if train_black: # self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.black)) self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar') # if nwins_white == 0 or nwins_black / nwins_white >= self.args.train_other_network_threshold: # train_black = False print("training white neural net next") train_black = False else: # self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.white)) self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar') # if nwins_black == 0 or nwins_white / nwins_black > self.args.train_other_network_threshold: # train_black = True print("training black neural net next") train_black = True else: self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar') 
self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar') self.game.prune_prob += self.args.prune_prob_gain_per_iteration self.args.arenaCompare = math.floor(self.args.arenaCompare * 1.05) # self.args.numEps = math.floor(self.args.numEps * 1.1) self.args.numMCTSSims = math.floor(self.args.numMCTSSims * 1.1) print("prune probability: " + str(self.game.prune_prob) + ", episodes: " + str(self.args.numEps) + ", sims: " + str(self.args.numMCTSSims) + ", arena compare: " + str(self.args.arenaCompare))
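# Because arenaCompare and numMCTSSims are rescaled (and floored) on every
# iteration above, the effective schedule is easy to tabulate up front. A
# stand-alone sketch; the starting values are assumptions, not values from
# this repo's config.
import math

arena_compare, num_sims = 40, 25   # assumed starting values
for it in range(1, 11):
    print('iter {:>2}: arenaCompare={:>4}, numMCTSSims={:>4}'.format(
        it, arena_compare, num_sims))
    arena_compare = math.floor(arena_compare * 1.05)
    num_sims = math.floor(num_sims * 1.1)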
    'numItersForTrainExamplesHistory': 20,
})

g = Game(6)
nnet = nn(g)

if args.load_model:
    nnet.load_checkpoint(args.load_folder_file[0], args.load_folder_file[1])

c = Coach(g, nnet, args)

board = g.getInitBoard()
player = 1
while g.getGameEnded(board, player) == 0:
    # computer (player 1) moves first, with a fresh search tree each turn
    mcts = MCTS(g, nnet, args)
    action = mcts.get_action_prob(g.getCanonicalForm(board, 1))
    print(action)
    board, player = g.getNextState(board, 1, np.argmax(action))
    g.display(board)

    # human move: prompt until a legal action index is entered
    print([(g.action_dict[x], x) for x in np.where(g.getValidMoves(board, player) == 1)[0]])
    mode = -1
    while mode not in np.where(g.getValidMoves(board, player) == 1)[0]:
        try:
            mode = int(input('Input:'))
        except ValueError:
            print("Not a number")
    board, player = g.getNextState(board, player, mode)
    g.display(board)
class Coach(): """ This class executes the self-play + learning. It uses the functions defined in Game and NeuralNet. args are specified in main_.py. """ def __init__(self, game, nnet, args): self.game = game self.nnet = nnet self.pnet = self.nnet.__class__(self.game) # the competitor network self.args = args self.mcts = MCTS(self.game, self.nnet, self.args) self.trainExamplesHistory = [] # history of examples from args.numItersForTrainExamplesHistory latest iterations self.skipFirstSelfPlay = False # can be overriden in loadTrainExamples() def executeEpisode(self): """ This function executes one episode of self-play, starting with player 1. As the game is played, each turn is added as a training example to trainExamples. The game is played till the game ends. After the game ends, the outcome of the game is used to assign values to each example in trainExamples. It uses a temp=1 if episodeStep < tempThreshold, and thereafter uses temp=0. Returns: trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v) pi is the MCTS informed policy vector, v is +1 if the player eventually won the game, else -1. """ trainExamples = [] board = self.game.getInitBoard() self.curPlayer = 1 episodeStep = 0 while True: episodeStep += 1 canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer) # This is good, return depending on the payer # print('---------coach-------------') # print(board) temp = int(episodeStep < self.args.tempThreshold) pi = self.mcts.getActionProb(canonicalBoard, obj_board=board, player=self.curPlayer, temp=temp) # To check it returns what sym = self.game.getSymmetries(canonicalBoard, pi) for b, p in sym: trainExamples.append([b, self.curPlayer, p, None]) to_validate = self.game.getValidMoves(board, self.curPlayer) action = np.random.choice(len(pi), p=pi) if not to_validate[action] == 1: action = np.argmax(self.game.getValidMoves(board, self.curPlayer)) board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action) # board.apply_mirror() r = self.game.getGameEnded(board, self.curPlayer) if r != 0: return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples] def learn(self): """ Performs numIters iterations with numEps episodes of self-play in each iteration. After every iteration, it retrains neural network with examples in trainExamples (which has a maximum length of maxlenofQueue). It then pits the new neural network against the old one and accepts it only if it wins >= updateThreshold fraction of games. """ for i in range(1, self.args.numIters + 1): # bookkeeping log.info(f'Starting Iter #{i} ...') # examples of the iteration if not self.skipFirstSelfPlay or i > 1: iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue) for _ in tqdm(range(self.args.numEps), desc="Self Play"): self.mcts = MCTS(self.game, self.nnet, self.args) # reset search tree iterationTrainExamples += self.executeEpisode() # save the iteration examples to the history self.trainExamplesHistory.append(iterationTrainExamples) if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory: log.warning( f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}") self.trainExamplesHistory.pop(0) # backup history to a file # NB! 
the examples were collected using the model from the previous iteration, so (i-1) self.saveTrainExamples(i - 1) # shuffle examples before training trainExamples = [] for e in self.trainExamplesHistory: trainExamples.extend(e) shuffle(trainExamples) # training new network, keeping a copy of the old one self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') # self.loadTrainExamples() # trainExamples = self.trainExamplesHistory pmcts = MCTS(self.game, self.pnet, self.args) self.nnet.train(trainExamples) nmcts = MCTS(self.game, self.nnet, self.args) log.info('PITTING AGAINST PREVIOUS VERSION') arena = Arena(lambda x, y: np.argmax(pmcts.getActionProb(x, y, temp=0, player=1)), lambda x, y: np.argmax(nmcts.getActionProb(x, y, temp=0, player=-1)), self.game) pwins, nwins, draws = arena.playGames(self.args.arenaCompare) print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws)) if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold: log.info('REJECTING NEW MODEL') self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') else: log.info('ACCEPTING NEW MODEL') self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i)) self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar') def getCheckpointFile(self, iteration): return 'checkpoint_' + str(iteration) + '.pth.tar' def saveTrainExamples(self, iteration): folder = self.args.checkpoint if not os.path.exists(folder): os.makedirs(folder) filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples") with open(filename, "wb+") as f: Pickler(f).dump(self.trainExamplesHistory) f.closed def loadTrainExamples(self): modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1]) examplesFile = modelFile + ".examples" if not os.path.isfile(examplesFile): log.warning(f'File "{examplesFile}" with trainExamples not found!') r = input("Continue? [y|n]") if r != "y": sys.exit() else: log.info("File with trainExamples found. Loading it...") with open(examplesFile, "rb") as f: self.trainExamplesHistory = Unpickler(f).load() log.info('Loading done!') # examples based on the model were already collected (loaded) self.skipFirstSelfPlay = True
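# The accept/reject condition used in learn() above reduces to a small
# predicate. A sketch of the same gating rule in isolation; the name
# should_accept is ours, not the repo's.
def should_accept(nwins, pwins, update_threshold):
    """Accept the new net only if it won a sufficient fraction of the
    decisive (non-draw) arena games."""
    decisive = nwins + pwins
    if decisive == 0:
        return False  # every game drawn: keep the previous net
    return nwins / decisive >= update_threshold

# e.g. 33 new wins, 20 old wins, threshold 0.6 -> 33/53 ~ 0.62 -> accept
assert should_accept(33, 20, 0.6)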
def __init__(self, game, n_mcts_per_step):
    self.mcts = MCTS(game)
    self.n_mcts_per_step = n_mcts_per_step
def learn(self): """ Performs numIters iterations with numEps episodes of self-play in each iteration. After every iteration, it retrains neural network with examples in trainExamples (which has a maximium length of maxlenofQueue). It then pits the new neural network against the old one and accepts it only if it wins >= updateThreshold fraction of games. """ for i in range(1, self.args.numIters + 1): # bookkeeping print('------ITER ' + str(i) + '------') # examples of the iteration if not self.skipFirstSelfPlay or i > 1: iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue) eps_time = AverageMeter() bar = Bar('Self Play', max=self.args.numEps) end = time.time() for eps in range(self.args.numEps): self.mcts = MCTS(self.game, self.nnet, self.args) # reset search tree iterationTrainExamples += self.executeEpisode() # bookkeeping + plot progress eps_time.update(time.time() - end) end = time.time() bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format( eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg, total=bar.elapsed_td, eta=bar.eta_td) bar.next() bar.finish() # save the iteration examples to the history self.trainExamplesHistory.append(iterationTrainExamples) if len(self.trainExamplesHistory ) > self.args.numItersForTrainExamplesHistory: print("len(trainExamplesHistory) =", len(self.trainExamplesHistory), " => remove the oldest trainExamples") self.trainExamplesHistory.pop(0) # backup history to a file # NB! the examples were collected using the model from the previous iteration, so (i-1) self.saveTrainExamples(i - 1) # shuffle examlpes before training trainExamples = [] for e in self.trainExamplesHistory: trainExamples.extend(e) shuffle(trainExamples) # training new network, keeping a copy of the old one self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') pmcts = MCTS(self.game, self.pnet, self.args) self.nnet.train(trainExamples) nmcts = MCTS(self.game, self.nnet, self.args) print('PITTING AGAINST PREVIOUS VERSION') arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)), lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game) pwins, nwins, draws = arena.playGames(self.args.arenaCompare) print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws)) if pwins + nwins > 0 and float(nwins) / ( pwins + nwins) < self.args.updateThreshold: print('REJECTING NEW MODEL') self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar') else: print('ACCEPTING NEW MODEL') self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i)) self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
class Coach(): """ This class executes the self-play + learning. It uses the functions defined in Game and NeuralNet. args are specified in main.py. """ def __init__(self, game, nnet, args): self.game = game self.nnet = nnet self.pnet = self.nnet.__class__(self.game) # the competitor network self.args = args self.mcts = MCTS(self.game, self.nnet, self.args) self.trainExamplesHistory = [ ] # history of examples from args.numItersForTrainExamplesHistory latest iterations self.skipFirstSelfPlay = False # can be overriden in loadTrainExamples() def executeEpisode(self): """ This function executes one episode of self-play, starting with player 1. As the game is played, each turn is added as a training example to trainExamples. The game is played till the game ends. After the game ends, the outcome of the game is used to assign values to each example in trainExamples. It uses a temp=1 if episodeStep < tempThreshold, and thereafter uses temp=0. Returns: trainExamples: a list of examples of the form (canonicalBoard,pi,v) pi is the MCTS informed policy vector, v is +1 if the player eventually won the game, else -1. """ trainExamples = [] #move history of this single episode board = self.game.getInitBoard() #load the gam setup self.curPlayer = WHITE #WHITE goes first episodeStep = 0 #record the truns of self play #star playing the game while True: # turn objective board into self.curlPlayer POV's board. Ie: black, white -> friend, enemy # two kinds of board: # 1: Objective board: Black and White # 2: CanonicalBoard: Friendly and Enemy canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer) # print("Received CanonicalBoard:\n%s"%canonicalBoard.reshape(8,8)) # print("Current player:%s"%self.curPlayer) # a = input() # if episodes > tempThreshold, MCTS will stop updating probs, and just return best move #NOTE: mainly for spped up I guess? currently disable, as episodeStep = 24, args.tempThreshold = 25 temp = int(episodeStep < self.args.tempThreshold) # create probability of winning for each action on board for self.currPlayer's POV pi = self.mcts.getActionProb(canonicalBoard, episodeStep, temp=temp) # one board situation can generate two tranining example # as symmetric does not matter sym = self.game.getSymmetries(canonicalBoard, pi) #adding tranning example #BUG: not showing correctly for b, policyVector in sym: # (canonicalBoard,player, polivy vector) trainExamples.append( [b, self.curPlayer, policyVector, episodeStep]) # #DEBUG: # probs_display = [round(x,2) for x in pi] # print("curr_player:%s turn:%s, probs:\n%s"%(self.curPlayer, episodeStep, np.array(probs_display).reshape(8,8))) #choose action with highest winning probability action = np.random.choice(len(pi), p=pi) if self.curPlayer == BLACK: action = self.game.blackActionConverter(action) #DEBUG # print("in player point of view \n player %s going to take action %s in turn %s board:\n%s"%(self.curPlayer, action, episodeStep, canonicalBoard.reshape(8,8))) #self.curPlayer turn to next player, objective board update, turn update board, self.curPlayer = self.game.getNextState( board, self.curPlayer, action) #regardless of friendly or enemy, show objective episodeStep += 1 #DEBUG # print("after action, objective board \n ") # print( board.reshape(8,8)) # print("next player %s next turn %s"%(self.curPlayer, episodeStep)) #check the new board status #return 0 if game continue, 1 if WHITE win, -1 if BLACK win. 
#thgouth the last turn was BLACK's Move, we updated self.curPlayer after Black action #so we judge result in WHITE's POV # the last turn after black does not added to the trainExample, as we already know who won # we add winning result in next if Statement r = self.game.getGameEnded(board, self.curPlayer, episodeStep, end_Evaluate=False) #in WHITE's POV if r != 0 and self.curPlayer == WHITE: #DEBUG # print("Objective board") # print("game has ended, player %s result %s board:\n%s"%(self.curPlayer, r, board.reshape(8,8))) #return board winning result, who won it #(canonicalBoard,policyVector,v) # x[0] cannonical board, x[2] policy vector x[1] self.curlPlayer of cannonical board # x[1] is BLACK, return -result as -result is in BLACK's POV # x[1] is WHITE, return result, as result is in WHITE's POV # x[2] policyVector from mcts # x[3] turn #TODO do I need to add turn for poliicy vector as well? r = self.game.getGameEnded(board, self.curPlayer, episodeStep, end_Evaluate=True) generatedTraining = [ (x[0], x[2], r * ((-1)**(x[1] != WHITE)), x[3]) for x in trainExamples ] #add turn as a input, no need to make change for pi # generatedTraining = [(x[0],x[2],r*((-1)**(x[1]!=WHITE))) for x in trainExamples] # generatedTraining = [(x[0],x[2],r*((-1)**(x[1]!=self.curPlayer))) for x in trainExamples] #DEBUG # lastResult = generatedTraining[-1] # print("Input to trainExample") # print("result:%s, cannonicalboard:\n%s "%(lastResult[2], lastResult[0].reshape(8,8))) # a = input() return generatedTraining def learn(self): """ Performs numIters iterations with numEps episodes of self-play in each iteration. After every iteration, it retrains neural network with examples in trainExamples (which has a maximium length of maxlenofQueue). It then pits the new neural network against the old one and accepts it only if it wins >= updateThreshold fraction of games. """ for i in range(1, self.args.numIters + 1): #for number of rounds # bookkeeping print('------ITER ' + str(i) + '------') # examples of the iteration if not self.skipFirstSelfPlay or i > 1: iterationTrainExamples = deque( [], maxlen=self.args.maxlenOfQueue ) #remove the previous training example eps_time = AverageMeter() bar = Bar('Self Play', max=self.args.numEps) end = time.time() for eps in range( self.args.numEps): #for each self-play of this rounds self.mcts = MCTS(self.game, self.nnet, self.args) # reset search tree #reutrn [(canonicalBoard,pi,v), (canonicalBoard,pi,v)] # v is the result selfPlayResult = self.executeEpisode() #play one game, adding the gaming history iterationTrainExamples += selfPlayResult # bookkeeping + plot progress eps_time.update(time.time() - end) end = time.time() bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format( eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg, total=bar.elapsed_td, eta=bar.eta_td) bar.next() bar.finish() # save the iteration examples to the history self.trainExamplesHistory.append(iterationTrainExamples) #self-play finished, updating the move history if len(self.trainExamplesHistory ) > self.args.numItersForTrainExamplesHistory: print("len(trainExamplesHistory) =", len(self.trainExamplesHistory), " => remove the oldest trainExamples") self.trainExamplesHistory.pop( 0) #remove the oldest gaming history # backup history to a file # NB! 
the examples were collected using the model from the previous iteration, so (i-1) self.saveTrainExamples(i - 1) # shuffle examlpes before training trainExamples = [] for e in self.trainExamplesHistory: trainExamples.extend(e) #adding new move record shuffle(trainExamples) # training new network, keeping a copy of the old one self.nnet.save_checkpoint( folder=self.args.checkpoint, filename='temp.pth.tar') #save the previous net self.pnet.load_checkpoint( folder=self.args.checkpoint, filename='temp.pth.tar') #read the previous net pmcts = MCTS(self.game, self.pnet, self.args) #reset previous models' mcts #using new data to train the new model self.nnet.train( trainExamples) #trin the network with new move record nmcts = MCTS(self.game, self.nnet, self.args) #rest new models' mcts #OLD VS NEW print('PITTING AGAINST PREVIOUS VERSION') arena = Arena( lambda board, turn: np.argmax( pmcts.getActionProb(board, turn, temp=0)), lambda board, turn: np.argmax( nmcts.getActionProb(board, turn, temp=0)), self.game) pwins, nwins, draws = arena.playGames( self.args.arenaCompare) #playing new mode against old models print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws)) if pwins + nwins > 0 and float(nwins) / ( pwins + nwins) < self.args.updateThreshold: #OLD WIN! print('REJECTING NEW MODEL') self.nnet.load_checkpoint( folder=self.args.checkpoint, filename='temp.pth.tar' ) #using previous mode, as it beat new model else: #NEW WIN! print('ACCEPTING NEW MODEL') self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i)) self.nnet.save_checkpoint( folder=self.args.checkpoint, filename='best.pth.tar' ) #save the new model, as this is the best def getCheckpointFile(self, iteration): #reading the tranined network return 'checkpoint_' + str(iteration) + '.pth.tar' def saveTrainExamples(self, iteration): folder = self.args.checkpoint if not os.path.exists(folder): os.makedirs(folder) filename = os.path.join( folder, self.getCheckpointFile(iteration) + ".examples") with open(filename, "wb+") as f: Pickler(f).dump(self.trainExamplesHistory) f.closed def loadTrainExamples(self): modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1]) examplesFile = modelFile + ".examples" if not os.path.isfile(examplesFile): print(examplesFile) r = input("File with trainExamples not found. Continue? [y|n]") if r != "y": sys.exit() else: print("File with trainExamples found. Read it.") with open(examplesFile, "rb") as f: self.trainExamplesHistory = Unpickler(f).load() f.closed # examples based on the model were already collected (loaded) self.skipFirstSelfPlay = True
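# The r * ((-1) ** (x[1] != WHITE)) factor above flips the final result into
# each stored example's own perspective. A tiny worked sketch of the same sign
# rule; the WHITE/BLACK = 1/-1 encoding is assumed from the snippet.
WHITE, BLACK = 1, -1  # assumed encoding


def value_from_perspective(r, example_player):
    """r is the final result in WHITE's POV; flip it for BLACK's examples."""
    return r * ((-1) ** (example_player != WHITE))

# WHITE won (r = +1): WHITE's examples get +1, BLACK's get -1
assert value_from_perspective(+1, WHITE) == +1
assert value_from_perspective(+1, BLACK) == -1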
human_vs_cpu = True

g = HalfchessGame()

# all players
rp = RandomPlayer(g).play
# gp = GreedyOthelloPlayer(g).play
hp = HumanPlayer(g).play

# nnet player
nn = NNet(g)
# nn.load_checkpoint('./temp/', 'best.pth.tar')
nn.load_checkpoint('./pretrained_models/halfchess', '43it.pth.tar')
args = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts = MCTS(g, nn, args)
nnp = lambda x: np.argmax(mcts.getActionProb(x, temp=1))

if human_vs_cpu:
    player1 = hp
else:
    n2 = NNet(g)
    n2.load_checkpoint('./pretrained_models/halfchess/', '26it_fixed_logic.pth.tar')
    args2 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
    mcts2 = MCTS(g, n2, args2)
    n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))
    # player2 = n2p  # Player 2 is a neural network if it's cpu vs cpu.

arena = Arena.Arena(hp, nnp, g, display=str)
def onevsonegame(budget1, random1, counter1, usecounter_in_rollout_1,
                 budget2, random2, counter2, usecounter_in_rollout_2,
                 whostarts, index):
    import random
    random.seed()
    np.random.seed()

    if whostarts == 'budget1':
        modulo = 1
    elif whostarts == 'budget2':
        modulo = 0

    # init tree, root, game
    tree = MCTS()
    c_uct = 1
    game = Game()
    turn = 0
    gameover = 0
    rootnode = tree.createNode(game.state)
    currentnode = rootnode

    # main loop
    while gameover == 0:
        turn = turn + 1
        if turn % 2 == modulo:  # player 1
            sim_number = budget1
            usecounterinrollout = usecounter_in_rollout_1
            counter = counter1
            rd = random1
        else:  # player 2
            sim_number = budget2
            usecounterinrollout = usecounter_in_rollout_2
            counter = counter2
            rd = random2

        if rd:
            # completely random play / or random + counter
            if counter:
                currentnode, existscounter = getcountermove(currentnode, tree)
                if existscounter == False:
                    if len(currentnode.children) == 0:
                        tree.expand_all(currentnode)
                    randindex = int(random.random() * (len(currentnode.children)))
                    currentnode = currentnode.children[randindex]
            else:
                if len(currentnode.children) == 0:
                    tree.expand_all(currentnode)
                randindex = int(random.random() * (len(currentnode.children)))
                currentnode = currentnode.children[randindex]
        else:
            if counter:
                currentnode, existscounter = getcountermove(currentnode, tree)
                if existscounter == False:
                    for sims in range(0, sim_number):
                        tree.simulate(currentnode, UCT_simu, c_uct, usecounterinrollout)
                    visits = np.array([child.N for child in currentnode.children])
                    max_visits = np.where(visits == np.max(visits))[0]
                    imax = max_visits[int(random.random() * len(max_visits))]
                    currentnode = currentnode.children[imax]
            else:
                for sims in range(0, sim_number):
                    tree.simulate(currentnode, UCT_simu, c_uct, usecounterinrollout)
                visits = np.array([child.N for child in currentnode.children])
                max_visits = np.where(visits == np.max(visits))[0]
                imax = max_visits[int(random.random() * len(max_visits))]
                currentnode = currentnode.children[imax]

        # then reinit the tree from the chosen position
        game = Game(currentnode.state)
        tree = MCTS()
        rootnode = tree.createNode(game.state)
        currentnode = rootnode
        gameover, winner = game.gameover()

    # print('end of game')
    # map the winner back to the budget that played that colour
    if winner == 0:
        toreturn = 'draw'
    elif winner == 1:
        toreturn = 'budget1' if whostarts == 'budget1' else 'budget2'
    elif winner == -1:
        toreturn = 'budget2' if whostarts == 'budget1' else 'budget1'

    monresult = {'result': toreturn}
    filename = './data/game' + str(index) + '.txt'
    with open(filename, 'wb') as file:
        pickle.dump(monresult, file)
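# Since each game dumps a one-entry dict into ./data/game{index}.txt, a small
# companion sketch can tally the outcomes afterwards. The glob pattern is
# assumed from the filenames written above.
import glob
import pickle
from collections import Counter

tally = Counter()
for path in glob.glob('./data/game*.txt'):
    with open(path, 'rb') as f:
        tally[pickle.load(f)['result']] += 1
print(dict(tally))  # e.g. {'budget1': ..., 'budget2': ..., 'draw': ...}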
from utils import *

"""
use this script to play any two agents against each other, or play manually with any agent.
"""

g = OthelloGame(6)

# all players
rp = RandomPlayer(g).play
gp = GreedyOthelloPlayer(g).play
hp = HumanOthelloPlayer(g).play

# nnet players
n1 = NNet(g)
n1.load_checkpoint('./pretrained_models/othello/pytorch/', '6x100x25_best.pth.tar')
args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts1 = MCTS(g, n1, args1)
n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))

# n2 = NNet(g)
# n2.load_checkpoint('/dev/8x50x25/', 'best.pth.tar')
# args2 = dotdict({'numMCTSSims': 25, 'cpuct': 1.0})
# mcts2 = MCTS(g, n2, args2)
# n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

arena = Arena.Arena(n1p, hp, g, display=display)
print(arena.playGames(2, verbose=True))
end = datetime.datetime.now()
print("Elapsed:", end - start)

for sims in np.arange(100, 2000, 100):
    good = 0
    for i in range(n_games):
        if i % 10 == 0:
            print("- Game", i, datetime.datetime.now())
        args = dotdict({
            'numMCTSSims': sims,
            'n_nodes': N_NODES,
            'max_dist': 100,
            'cpuct': 1,
        })
        mcts = MCTS(games[i], None, args)
        state = [0]
        R = 0
        Actions = []
        while not games[i].getGameEnded(state):
            action = np.argmax(mcts.getActionProb(state))
            state, reward = games[i].getNextState(state, action)
            Actions.append(action)
            R += reward
        score_of_mcts_path = games[i].path_pay(Actions)
        if score_of_mcts_path / optimal[i] < 1.1:
            good += 1
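# As written, `good` is overwritten on every pass of the sims loop and never
# stored. A sketch of the missing bookkeeping; `play_one_game` stands in for
# the inner game loop above and is hypothetical, as is the sweep() name.
def sweep(budgets, n_games, play_one_game):
    """play_one_game(sims, i) -> True if the MCTS path for game i is within
    10% of optimal (stand-in for the inner loop above)."""
    success_rate = {}  # sims -> fraction of near-optimal games
    for sims in budgets:
        good = sum(play_one_game(sims, i) for i in range(n_games))
        success_rate[int(sims)] = good / n_games
    return success_rate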
    'cpuct': 1,
    'checkpoint': './temp/',
    'load_model': True,
    'load_folder_file': ('/media/nmo/3E26969E2696572D/Martin/Programmieren/Machine-Learning/reinforcement_learning/alpha_go_zero/temp', 'best.pth.tar'),
    'numItersForTrainExamplesHistory': 20,
})

if __name__ == "__main__":
    game = Connect4Proxy(6, 7)
    net = NNetWrapper(game)
    net.load_checkpoint(folder='./temp/', filename='best.pth.tar')

    board = game.getInitBoard()
    nmcts = MCTS(game, net, args)
    curPlayer = 1
    it = 0

    def computer_move(canonical_board, valids):
        action = np.argmax(nmcts.getActionProb(canonical_board, temp=0))
        # the search must never return an invalid action
        assert valids[action] > 0
        return action

    def human_move(canonical_board, valids):
        while True:  # break once a legal move is entered
            move = input("Enter column: ")
            if valids[int(move)] == 1:
                break
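# The human_move loop above feeds raw input straight into int() and the
# valids index, so a non-numeric or out-of-range entry raises before the
# validity check. A hardened sketch of the same prompt (ours, not the repo's):
def human_move_safe(canonical_board, valids):
    """Prompt until the user enters a legal column index."""
    while True:
        move = input("Enter column: ")
        if not move.strip().isdigit():
            print("Please enter a column number.")
            continue
        col = int(move)
        if 0 <= col < len(valids) and valids[col] == 1:
            return col
        print("Illegal column, try again.")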