Example No. 1
class GamePlay(object):
    def __init__(self, policy_net_path, value_net_path):
        self.mcts = MCTS(policy_net_path, value_net_path, time_limit=20)

    def play(self, game):
        self.mcts.set_game(game)
        return self.mcts.start()
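A minimal usage sketch for the class above; the Game class and the checkpoint paths are placeholders for whatever the surrounding project provides, not part of the original snippet:

# Hypothetical usage of GamePlay; Game and the checkpoint paths are placeholders.
game = Game()
player = GamePlay('policy_net.pth', 'value_net.pth')
best_move = player.play(game)   # MCTS search with the 20-second time limit set above
print(best_move)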
Example No. 2
 def __init__(self, game, nnet, args):
     self.game = game
     self.nnet = nnet
     self.pnet = self.nnet.__class__(self.game)  # the competitor network
     self.args = args
     # self.mcts = MCTS(self.game, self.nnet, self.args)
     self.mcts = MCTS(self.nnet, self.args)
     self.trainExamplesHistory = []    # history of examples from args.numItersForTrainExamplesHistory latest iterations
     self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()
     self.arenaEnabled = args.arena == "true"
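For context, a sketch of the kind of args object this constructor expects. The values are illustrative; only the keys actually referenced in the snippets on this page (and the dotdict helper used elsewhere) are assumed:

# Illustrative args for the constructor above; values are made up, keys are
# taken from the snippets on this page (dotdict comes from the project's utils).
args = dotdict({
    'numMCTSSims': 50,                       # passed through to MCTS
    'cpuct': 1.0,
    'numItersForTrainExamplesHistory': 20,   # how many iterations of examples to keep
    'arena': 'true',                         # compared as a string in __init__
})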
Example No. 3
# mcts2 = MCTS(g, n2, args2)
# n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

for p in range(1, 5):
    print("iter:%d" % p)
    for i in range(70, 1, -1):
        n1 = NNet(g, argsNN)
        try:
            n1.load_checkpoint('/content/drive/My Drive/model/',
                               'checkpoint_%d.pth.tar' % i)
        except Exception:
            print("no model:%d" % i)
            continue
        args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
        mcts1 = MCTS(g, n1, args1)
        n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))  # exploitation

        for j in range(70, 1, -1):
            if (i <= j): continue
            n2 = NNet(g, argsNN)
            try:
                n2.load_checkpoint('/content/drive/My Drive/model/',
                                   'checkpoint_%d.pth.tar' % j)
            except Exception:
                print("no model:%d" % j)
                continue
            args2 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
            mcts2 = MCTS(g, n2, args2)
            n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))
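The snippet is cut off before the two loaded checkpoints are actually compared; a plausible continuation, modelled on the Arena usage in the other examples on this page (the number of games and result handling are illustrative):

            # Illustrative continuation: pit checkpoint i against checkpoint j.
            # The Arena interface mirrors the other examples on this page.
            arena = Arena(n1p, n2p, g)
            oneWon, twoWon, draws = arena.playGames(40)
            print("checkpoint %d vs %d -> %d/%d wins, %d draws" % (i, j, oneWon, twoWon, draws))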
Example No. 4
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor network
        self.args = args
        # self.mcts = MCTS(self.game, self.nnet, self.args)
        self.mcts = MCTS(self.nnet, self.args)
        self.trainExamplesHistory = []    # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()
        self.arenaEnabled = args.arena == "true"

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        self.game.reset_game()
        self.curPlayer = 1
        episodeStep = 0

        players = []
        while True:
            players.append(self.curPlayer)
            episodeStep += 1
            # canonicalBoard = self.game.getCanonicalForm()
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(self.game, self.curPlayer, temp=temp)
            sym = self.game.getSymmetries(pi)

            for b,p in sym:
                trainExamples.append([b, self.curPlayer, p])
            #
            # trainExamples.append([self.game.boxes, self.curPlayer, pi, None])

            # Random Choice Based on the pi array as weights
            action = np.random.choice(len(pi), p=pi)
            self.curPlayer = self.game.getNextState(self.curPlayer, action)


            r = self.game.getGameEnded()

            if r!=0:
            #     print(r)
            #     print(self.curPlayer)
            #     for _,p,_ in trainExamples[::-8]:
                    # print("Last Player: {} Winner:: {} Score: {} Player Example {} \t Value {}".format(self.curPlayer, r, self.game.score, p,r*p))
                # return [(x[0],x[2],r*((-1)**(x[1]!=self.curPlayer))) for x in trainExamples]
                return [(x[0],x[2],r*x[1]) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters+1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            print(str(self.game.innerN) + "x" + str(self.game.innerM))
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()
    
                for eps in range(self.args.numEps):
                    # self.mcts = MCTS(self.game, self.nnet, self.args)   # reset search tree
                    self.mcts = MCTS(self.nnet, self.args)   # reset search tree
                    iterationTrainExamples += self.executeEpisode()

    
                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix  = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(eps=eps+1, maxeps=self.args.numEps, et=eps_time.avg,
                                                                                                               total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history 
                self.trainExamplesHistory.append(iterationTrainExamples)
                
            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory), " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)  
            self.saveTrainExamples(i-1)
            
            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            tempfile =  'temp.pth.tar'
            bestfile =  'best.pth.tar'

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=tempfile)
            self.nnet.train(trainExamples)

            if self.arenaEnabled:
                self.pnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)

                pmcts = MCTS(self.pnet, self.args)
                nmcts = MCTS(self.nnet, self.args)

                print('PITTING AGAINST PREVIOUS VERSION')
                # arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                #               lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
                arena = Arena(lambda x, y: pmcts.getActionProb(x, y, temp=0),
                              lambda x, y: nmcts.getActionProb(x, y, temp=0), self.game)
                pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

                print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
                if pwins+nwins > 0 and float(nwins)/(pwins+nwins) < self.args.updateThreshold:
                    print('REJECTING NEW MODEL')
                    self.nnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)
                else:
                    print('ACCEPTING NEW MODEL')
                    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=bestfile)
                # self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                # self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=bestfile)

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration)+".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder, self.args.load_file)
        examplesFile = modelFile+".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
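For completeness, a sketch of how a Coach like this one is typically driven from a main script. Game, NNetWrapper and the argument values are placeholders modelled on the other examples on this page, not part of this snippet:

# Illustrative driver (hypothetical main.py); Game, NNetWrapper and the
# argument values are placeholders based on the other examples above.
if __name__ == '__main__':
    game = Game()
    nnet = NNetWrapper(game)
    args = dotdict({
        'numIters': 100, 'numEps': 25, 'tempThreshold': 15,
        'maxlenOfQueue': 200000, 'numMCTSSims': 50, 'cpuct': 1.0,
        'updateThreshold': 0.6, 'arenaCompare': 40, 'arena': 'true',
        'checkpoint': './temp/', 'numItersForTrainExamplesHistory': 20,
    })
    coach = Coach(game, nnet, args)
    coach.learn()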
Example No. 5
    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters+1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            print(str(self.game.innerN) + "x" + str(self.game.innerM))
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()
    
                for eps in range(self.args.numEps):
                    # self.mcts = MCTS(self.game, self.nnet, self.args)   # reset search tree
                    self.mcts = MCTS(self.nnet, self.args)   # reset search tree
                    iterationTrainExamples += self.executeEpisode()

    
                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix  = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(eps=eps+1, maxeps=self.args.numEps, et=eps_time.avg,
                                                                                                               total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history 
                self.trainExamplesHistory.append(iterationTrainExamples)
                
            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory), " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)  
            self.saveTrainExamples(i-1)
            
            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            tempfile =  'temp.pth.tar'
            bestfile =  'best.pth.tar'

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=tempfile)
            self.nnet.train(trainExamples)

            if self.arenaEnabled:
                self.pnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)

                pmcts = MCTS(self.pnet, self.args)
                nmcts = MCTS(self.nnet, self.args)

                print('PITTING AGAINST PREVIOUS VERSION')
                # arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                #               lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
                arena = Arena(lambda x, y: pmcts.getActionProb(x, y, temp=0),
                              lambda x, y: nmcts.getActionProb(x, y, temp=0), self.game)
                pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

                print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
                if pwins+nwins > 0 and float(nwins)/(pwins+nwins) < self.args.updateThreshold:
                    print('REJECTING NEW MODEL')
                    self.nnet.load_checkpoint(folder=self.args.checkpoint, filename=tempfile)
                else:
                    print('ACCEPTING NEW MODEL')
                    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                    self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=bestfile)
Example No. 6
from MCTS import MCTS
from connect4.Connect4Game import Connect4Game, display
from connect4.Connect4Players import HumanConnect4Player
from connect4.tensorflows.NNet import NNetWrapper as NNet
from utils import dotdict
import numpy as np

if __name__ == '__main__':
    goingFirst = True
    folder = "H:\\alpha-zero-trained\\final\\h2\\mcts_visits_tanh\\default\\1\\"

    game = Connect4Game()
    nn = NNet(game)
    nn.load_checkpoint(folder, 'best.pth.tar')
    args = dotdict({'numMCTSSims': 25, 'cpuct': 1})
    mcts1 = MCTS(game, nn, args)
    AI = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))

    human = HumanConnect4Player(game).play

    if goingFirst:
        players = [AI, None, human]
    else:
        players = [human, None, AI]

    curPlayer = 1
    board = game.getInitBoard()
    while game.getGameEnded(board, curPlayer) == 0:
        display(board, symbols=True)

        # the original snippet is cut off mid-call; the turn is completed below
        # following the same pattern as the other play loops on this page
        action = players[curPlayer + 1](game.getCanonicalForm(board, curPlayer))
        board, curPlayer = game.getNextState(board, curPlayer, action)
Example No. 7
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor network
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = [
        ]  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(
                board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
                        for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([],
                                               maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.nnet,
                                     self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps + 1,
                        maxeps=self.args.numEps,
                        et=eps_time.avg,
                        total=bar.elapsed_td,
                        eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =",
                      len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                  (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint,
                                          filename='temp.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='best.pth.tar')

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(
            folder,
            self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0],
                                 self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
Example No. 8
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.board = game.getInitBoard()
        self.nnet = nnet
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        self.board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(
                self.board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            self.board, self.curPlayer = self.game.getNextState(
                self.board, self.curPlayer, action)

            r = self.game.getGameEnded(self.board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
                        for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        trainExamples = deque([], maxlen=self.args.maxlenOfQueue)
        for i in range(self.args.numIters):
            # bookkeeping
            print('------ITER ' + str(i + 1) + '------')
            eps_time = AverageMeter()
            bar = Bar('Self Play', max=self.args.numEps)
            end = time.time()

            for eps in range(self.args.numEps):
                trainExamples += self.executeEpisode()

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1,
                    maxeps=self.args.numEps,
                    et=eps_time.avg,
                    total=bar.elapsed_td,
                    eta=bar.eta_td)
                bar.next()
            bar.finish()

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            pnet = self.nnet.__class__(self.game)
            pnet.load_checkpoint(folder=self.args.checkpoint,
                                 filename='temp.pth.tar')
            pmcts = MCTS(self.game, pnet, self.args)
            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          self.game)
            pwins, nwins = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : ' + str(nwins) + '/' + str(pwins))
            if pwins + nwins == 0 or float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet = pnet

            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='checkpoint_' + str(i) +
                                          '.pth.tar')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='best.pth.tar')
                self.mcts = MCTS(self.game, self.nnet,
                                 self.args)  # reset search tree
Example No. 9
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, white_nnet, black_nnet, args):
        self.game = game
        self.white_nnet = white_nnet
        self.black_nnet = black_nnet
        self.white_pnet = self.white_nnet.__class__(self.game)  # the competitor network
        self.black_pnet = self.black_nnet.__class__(self.game)
        self.args = args
        self.mcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)
        # self.trainExamplesHistory = []  ###########
        self.trainExamplesHistory_white = []    # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.trainExamplesHistory_black = []    # history of examples from args.numItersForTrainExamplesHistory latest iterations

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples_white = []
        trainExamples_black = []
        # trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            # print("turn " + str(episodeStep))
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            try:
                pi = self.mcts.getActionProb(canonicalBoard, self.curPlayer, temp=temp)
            except ZeroDivisionError:
                print("ZeroDivisionError while building training example. continue with next iteration")
                return [], []
            sym = self.game.getSymmetries(canonicalBoard, pi, canonicalBoard.king_position)

            player_train_examples = trainExamples_white if self.curPlayer == Player.white else trainExamples_black
            for b,p, scalar_values in sym:
                player_train_examples.append([b, self.curPlayer, p, scalar_values])

            action = np.random.choice(len(pi), p=pi)
            if action == 0:
                print(pi)

            board.print_game_over_reason = False
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)
            board.print_game_over_reason = False

            r = self.game.getGameEnded(board, self.curPlayer)

            if r!=0:
                # if board.outcome == Outcome.black:
                #     print(" black wins")
                return [(x[0],x[2],r*((-1)**(x[1]!=self.curPlayer)), x[3]) for x in trainExamples_white], \
                       [(x[0],x[2],r*((-1)**(x[1]!=self.curPlayer)), x[3]) for x in trainExamples_black]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        self.game.prune_prob = self.args.prune_starting_prob
        train_black = self.args.train_black_first

        for i in range(1, self.args.numIters+1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.args.skip_first_self_play or i>1:
                iterationTrainExamples_white = deque([], maxlen=self.args.maxlenOfQueue)
                iterationTrainExamples_black = deque([], maxlen=self.args.maxlenOfQueue)
    
                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                if self.args.profile_coach:
                    prof = cProfile.Profile()
                    prof.enable()

                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)   # reset search tree

                    white_examples, black_examples = self.executeEpisode()

                    iterationTrainExamples_white += white_examples
                    iterationTrainExamples_black += black_examples

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                                                                                                               total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()
                if self.args.profile_coach:
                    prof.disable()
                    prof.print_stats(sort=2)

                # save the iteration examples to the history 
                self.trainExamplesHistory_white.append(iterationTrainExamples_white)
                self.trainExamplesHistory_black.append(iterationTrainExamples_black)
                
            while len(self.trainExamplesHistory_white) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory_white), " => remove the oldest trainExamples")
                self.trainExamplesHistory_white.pop(0)
                self.trainExamplesHistory_black.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)  
            self.saveTrainExamples(i-1)

            # training new network, keeping a copy of the old one
            self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
            self.white_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            self.black_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')

            pmcts = MCTS(self.game, self.white_pnet, self.black_pnet, self.args)

            if not self.args.train_both:
                if train_black:
                    # shuffle examples before training
                    trainExamples = []
                    for e in self.trainExamplesHistory_black:
                        trainExamples.extend(e)
                    shuffle(trainExamples)
                    self.black_nnet.train(trainExamples)
                else:
                    # shuffle examples before training
                    trainExamples = []
                    for e in self.trainExamplesHistory_white:
                        trainExamples.extend(e)
                    shuffle(trainExamples)
                    self.white_nnet.train(trainExamples)
            else:
                # shuffle examples before training
                trainExamples = []
                for e in self.trainExamplesHistory_black:
                    trainExamples.extend(e)
                shuffle(trainExamples)
                self.black_nnet.train(trainExamples)

                # shuffle examples before training
                trainExamples = []
                for e in self.trainExamplesHistory_white:
                    trainExamples.extend(e)
                shuffle(trainExamples)
                self.white_nnet.train(trainExamples)

            nmcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda board, turn_player: np.argmax(pmcts.getActionProb(board, turn_player, temp=0)),
                          lambda board, turn_player: np.argmax(nmcts.getActionProb(board, turn_player, temp=0)),
                          self.game)
            pwins, nwins, draws, pwins_white, pwins_black, nwins_white, nwins_black \
                = arena.playGames(self.args.arenaCompare, self.args.profile_arena)

            print('NEW/PREV WINS (white, black) : (%d,%d) / (%d,%d) ; DRAWS : %d' % (nwins_white, nwins_black, pwins_white, pwins_black, draws))

            if pwins+nwins == 0 or float(nwins)/(pwins+nwins) < self.args.updateThreshold \
                    or nwins_black < pwins_black or nwins_white < pwins_white:
                print('REJECTING NEW MODEL')
                if not self.args.train_both:
                    if train_black:
                        self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
                    else:
                        self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
                else:
                    self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
                    self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                if not self.args.train_both:
                    if train_black:
                        # self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.black))
                        self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar')
                        # if nwins_white == 0 or nwins_black / nwins_white >= self.args.train_other_network_threshold:
                        #     train_black = False
                        print("training white neural net next")
                        train_black = False
                    else:
                        # self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.white))
                        self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar')
                        # if nwins_black == 0 or nwins_white / nwins_black > self.args.train_other_network_threshold:
                        #     train_black = True
                        print("training black neural net next")
                        train_black = True
                else:
                    self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar')
                    self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar')
                self.game.prune_prob += self.args.prune_prob_gain_per_iteration
                self.args.arenaCompare = math.floor(self.args.arenaCompare * 1.05)
            # self.args.numEps = math.floor(self.args.numEps * 1.1)
            self.args.numMCTSSims = math.floor(self.args.numMCTSSims * 1.1)
            print("prune probability: " + str(self.game.prune_prob) + ", episodes: " + str(self.args.numEps) +
                  ", sims: " + str(self.args.numMCTSSims) + ", arena compare: " + str(self.args.arenaCompare))

    def getCheckpointFile(self, iteration, player=None):
        return 'checkpoint_' + ('white_' if player == Player.white else 'black_' if player == Player.black else '') + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename_white = os.path.join(folder, "training_white.examples")
        filename_black = os.path.join(folder, "training_black.examples")
        with open(filename_white, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory_white)
        with open(filename_black, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory_black)

    def loadTrainExamples(self):
        folder = self.args.checkpoint
        filename_white = os.path.join(folder, "training_white.examples")
        filename_black = os.path.join(folder, "training_black.examples")
        if not os.path.isfile(filename_white) or not os.path.isfile(filename_black):
            print(filename_white)
            print(filename_black)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(filename_white, "rb") as f:
                self.trainExamplesHistory_white = Unpickler(f).load()
            with open(filename_black, "rb") as f:
                self.trainExamplesHistory_black = Unpickler(f).load()
            # examples based on the model were already collected (loaded)

    def load_expert_examples(self):
        white, black = read_data(self.args)
        self.trainExamplesHistory_white.extend(white)
        self.trainExamplesHistory_black.extend(black)
Example No. 10
import numpy as np

from Shared.Functions import xy_to_index
# NOTE: MCTS and the BLACK player constant are assumed to come from the
# project's own modules; their imports are not part of the original snippet.


class Model:
    def eval(self, board):
        P = np.ones(26) / 26
        V = 0.1
        return P, V


model = Model()

boards = np.array([[[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0], [0, 0, 0, 0, 0]],
                   [[0, 0, 0, 0, 0], [0, -1, 0, 0, 0], [0, 0, 0, 0, 0],
                    [0, 1, 0, 0, 0], [0, 0, 0, 0, 0]]])

P = np.ones(26) / 26
player = BLACK

mcts = MCTS(model, player, start_boards=boards)

pi = mcts.search_for_pi(iterations=250)
print(pi)

move = np.random.choice(len(pi), p=pi)
mcts.set_move(move)
print(move)

pi = mcts.search_for_pi(iterations=250)
print(pi)
Example No. 11
 def __init__(self, game, nnet, args):
     self.game = game
     self.board = game.getInitBoard()
     self.nnet = nnet
     self.args = args
     self.mcts = MCTS(self.game, self.nnet, self.args)
Example No. 12
    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            log.info(f'Starting Iter #{i} ...')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([],
                                               maxlen=self.args.maxlenOfQueue)

                for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                    self.mcts = MCTS(self.game, self.nnet,
                                     self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                log.warning(
                    f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}"
                )
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            log.info('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            log.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                     (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                log.info('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint,
                                          filename='temp.pth.tar')
            else:
                log.info('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='best.pth.tar')
Example No. 13
    def evaluate_trained(self):
        if self.verbose >= 1:
            print("Evaluate")

        balance = 0  # positive if the trained model won more, negative if the best model won more
        
        #best begins
        for i in range(EVAL_NR//2):
            if self.verbose >= 2:
                print("i: ",i)
            g  = Game()
            nr = 0
            while (g.winner==0):
                if nr%2==0:
                    mcts = MCTS(g, self.model_best.evaluate)
                    x,y,_ = mcts.select_move()
                    g.move(x,y)
                else:
                    mcts = MCTS(g, self.model_trained.evaluate)
                    x,y,_ = mcts.select_move()
                    g.move(x,y)
                nr+=1
                if i==0 and self.verbose>=4:
                    g.print()
                
            if g.winner == 1:
                balance -= 1
            elif g.winner == 2:
                balance += 1

            if self.verbose >= 2:
                print("Result:", g.winner)

        #trained begins
        for i in range(EVAL_NR//2):
            if self.verbose >= 2:
                print("i: ",i)
            g  = Game()
            nr = 0
            while (g.winner==0):
                if nr%2==0:
                    mcts = MCTS(g, self.model_trained.evaluate)
                    x,y,_ = mcts.select_move()
                    g.move(x,y)
                else:
                    mcts = MCTS(g, self.model_best.evaluate)
                    x,y,_ = mcts.select_move()
                    g.move(x,y)
                nr+=1
                if i==0 and self.verbose>=4:
                    g.print()
            if g.winner == 1:
                balance += 1
            elif g.winner == 2:
                balance -= 1

            if self.verbose >= 2:
                print("Result:", g.winner)

                
        if self.verbose >= 1:
            print("Total result: ",balance," Thr.: ",THRESHOLD)
                
        return balance >= THRESHOLD
Example No. 14
def self_play(strategy, read_file=None):
    n = PolicyNetwork(use_cpu=True)
    if strategy == 'random':
        instance = RandomPlayer()
    elif strategy == 'policy':
        instance = PolicyNetworkBestMovePlayer(n, read_file)
    elif strategy == 'randompolicy':
        instance = PolicyNetworkRandomMovePlayer(n, read_file)
    elif strategy == 'mcts':
        instance = MCTS(n, read_file)
    else:
        sys.stderr.write("Unknown strategy")
        sys.exit()
        # instance is the neural network
    gtp_engine = gtp_lib.Engine(instance)
    sys.stderr.write("GTP engine ready\n")
    sys.stderr.flush()

    p1 = -1
    save = ''
    inpt = 'genmove b'
    n = 500
    while n > 0:
        if n % 2 == 1:
            inpt = 'genmove b'
        else:
            inpt = 'genmove w'
        try:
            cmd_list = inpt.split("\n")
        except Exception:
            cmd_list = [inpt]
        for cmd in cmd_list:
            engine_reply = gtp_engine.send(cmd)
            sys.stdout.write(engine_reply)
            if engine_reply == '= pass\n\n':
                #engine_reply == '= pass\n\n'
                n = 0
            else:
                o1 = ''
                if len(engine_reply) == 7:
                    o1 = engine_reply[3] + engine_reply[4]
                else:
                    o1 = engine_reply[3]

                if n % 2 == 1:
                    o2 = ch.change(engine_reply[2]) + ch.change(o1)
                    save = save + ';B[' + ch.change(
                        engine_reply[2]) + ch.change(o1) + ']'
                else:
                    o2 = ch.change(engine_reply[2]) + ch.change(o1)
                    save = save + ';W[' + ch.change(
                        engine_reply[2]) + ch.change(o1) + ']'

            sys.stdout.flush()

        n = n - 1
    p7 = instance.position.result()
    save2 = '(;GM[1]\n SZ[19]\nPB[go1]\nPW[go2]\nKM[6.50]\nRE[' + p7[0] + ']\n'

    save2 = save2 + save + ')'

    wenjian = ''

    wenjian = str(time.time())
    p3 = '4'
    save_t.make_folder(wenjian + '_selfplay')
    save_t.save_txt(wenjian + '_selfplay', p3, save2)
Example No. 15
"""Use this script to play manually against a Blooms agent.
"""
import numpy as np

import Arena
from MCTS import MCTS
from blooms.BloomsGame import BloomsGame
from blooms.BloomsPlayers import *
from blooms.pytorch.NNet import NNetWrapper as NNet
from utils import *

# WARNING: The game size and score target should match the chosen agent
game = BloomsGame(size=5, score_target=20)
human = HumanBloomsPlayer(game).play

# WARNING: The chosen agent should match the game size and score target
model = NNet(game)
model.load_checkpoint('./notebooks/results/chkpts_board5_24hrs',
                      'best.pth.tar')

args = dotdict({'numMCTSSims': 100, 'cpuct': 1.0})
mcts = MCTS(game, model, args)
agent = lambda x: np.argmax(mcts.getActionProb(x, temp=0))

arena = Arena.Arena(agent, human, game)

print(arena.playGames(2, verbose=True, display=False))
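To benchmark two checkpoints without a human in the loop, the same objects can be reused for an agent-vs-agent Arena; a hedged sketch in which the second checkpoint path is a placeholder:

# Hypothetical variant: agent-vs-agent evaluation with the same Arena API.
# The second checkpoint path is a placeholder, not part of the original script.
model2 = NNet(game)
model2.load_checkpoint('./notebooks/results/chkpts_board5_24hrs', 'best.pth.tar')
mcts2 = MCTS(game, model2, args)
agent2 = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

arena2 = Arena.Arena(agent, agent2, game)
print(arena2.playGames(10, verbose=False, display=False))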
Example No. 16
def PlayGame():
    menu_def = [
        ['&File', ['&Do nothing', 'E&xit']],
        ['&Help', '&About...'],
    ]

    # sg.SetOptions(margins=(0,0))
    sg.ChangeLookAndFeel('GreenTan')
    # create initial board setup
    psg_board = copy.deepcopy(initial_board)
    # the main board display layout
    board_layout = [[sg.T('     ')] + [
        sg.T('{}'.format(a), pad=((23, 27), 0), font='Any 13') for a in 'efgh'
    ]]
    # loop though board and create buttons with images
    for i in range(8):
        row = [sg.T(str(8 - i) + '   ', font='Any 13')]
        for j in range(4):
            piece_image = images[psg_board[i][j]]
            row.append(render_square(piece_image, key=(i, j), location=(i, j)))
        row.append(sg.T(str(8 - i) + '   ', font='Any 13'))
        board_layout.append(row)
    # add the labels across bottom of board
    board_layout.append([sg.T('     ')] + [
        sg.T('{}'.format(a), pad=((23, 27), 0), font='Any 13') for a in 'efgh'
    ])

    # setup the controls on the right side of screen
    openings = ('Any', 'Defense', 'Attack', 'Trap', 'Gambit', 'Counter',
                'Sicilian', 'English', 'French', 'Queen\'s openings',
                'King\'s Openings', 'Indian Openings')

    board_controls = [
        [sg.RButton('New Game', key='New Game'),
         sg.RButton('Draw')],
        [sg.RButton('Resign Game'),
         sg.RButton('Set FEN')],
        [sg.RButton('Player Odds'),
         sg.RButton('Training')],
        [sg.Drop(openings), sg.Text('Opening/Style')],
        [sg.CBox('Play As White', key='_white_')],
        [sg.Text('Move List')],
        [
            sg.Multiline([],
                         do_not_clear=True,
                         autoscroll=True,
                         size=(15, 10),
                         key='_movelist_')
        ],
    ]

    # layouts for the tabs
    controls_layout = [[
        sg.Text('Performance Parameters', font='_ 20')
    ], [sg.T('Put stuff like AI engine tuning parms on this tab')]]

    statistics_layout = [[sg.Text('Statistics', font=('_ 20'))],
                         [sg.T('Game statistics go here?')]]

    board_tab = [[sg.Column(board_layout)]]

    # the main window layout
    layout = [[sg.Menu(menu_def, tearoff=False)],
              [
                  sg.TabGroup([[
                      sg.Tab('Board', board_tab),
                      sg.Tab('Controls', controls_layout),
                      sg.Tab('Statistics', statistics_layout)
                  ]],
                              title_color='red'),
                  sg.Column(board_controls)
              ],
              [sg.Text('Click anywhere on board for next move', font='_ 14')]]

    window = sg.Window('Chess',
                       default_button_element_size=(12, 1),
                       auto_size_buttons=False,
                       icon='kingb.ico').Layout(layout)

    g = HalfchessGame.HalfchessGame()
    nn = NNet(g)
    nn.load_checkpoint(nn_filepath, nn_filename)
    args = dotdict({'numMCTSSims': numMCTSSims, 'cpuct': cpuct})
    mcts = MCTS(g, nn, args)
    nnp = lambda x: np.argmax(mcts.getActionProb(x, temp=temp))

    board = g.getInitBoard()
    move_count = curPlayer = 1
    move_state = move_from = move_to = 0
    # ---===--- Loop taking in user input --- #
    while g.getGameEnded(board, curPlayer) == 0:

        canonicalBoard = g.getCanonicalForm(board, curPlayer)

        if curPlayer == human:
            # human_player(board)
            move_state = 0
            while True:
                button, value = window.Read()
                if button in (None, 'Exit'):
                    exit()
                if button == 'New Game':
                    sg.Popup(
                        'You have to restart the program to start a new game... sorry....'
                    )
                    break
                    # NOTE: the reset code below is unreachable because of the break above
                    # psg_board = copy.deepcopy(initial_board)
                    # redraw_board(window, psg_board)
                    # move_state = 0
                    # break

                if type(button) is tuple:
                    if move_state == 0:
                        move_from = button
                        row, col = move_from
                        piece = psg_board[row][col]  # get the move-from piece
                        button_square = window.FindElement(key=(row, col))
                        button_square.Update(button_color=('white', 'red'))
                        move_state = 1
                    elif move_state == 1:
                        move_to = button
                        row, col = move_to
                        if move_to == move_from:  # cancelled move
                            color = '#B58863' if (row + col) % 2 else '#F0D9B5'
                            button_square.Update(button_color=('white', color))
                            move_state = 0
                            continue

                        picked_move = '{}{}{}{}'.format(
                            'efgh'[move_from[1]], 8 - move_from[0],
                            'efgh'[move_to[1]], 8 - move_to[0])

                        action = moveset[picked_move]

                        valids = g.getValidMoves(canonicalBoard, 1)

                        if valids[action] != 0:
                            board, curPlayer = g.getNextState(
                                board, curPlayer, action)
                        else:
                            print('Illegal move')
                            move_state = 0
                            color = '#B58863' if (
                                move_from[0] + move_from[1]) % 2 else '#F0D9B5'
                            button_square.Update(button_color=('white', color))
                            continue

                        psg_board[move_from[0]][move_from[1]] = BLANK  # place blank where piece was
                        psg_board[row][col] = piece  # place piece in the move-to square
                        redraw_board(window, psg_board)
                        move_count += 1

                        window.FindElement('_movelist_').Update(picked_move +
                                                                '\n',
                                                                append=True)

                        break
        else:

            best_move = nnp(canonicalBoard)
            move_str = moveset[best_move]

            if curPlayer == -1:
                move_str = HalfchessGame.mirrored_move(move_str)

            from_col = ord(move_str[0]) - ord('e')
            from_row = 8 - int(move_str[1])
            to_col = ord(move_str[2]) - ord('e')
            to_row = 8 - int(move_str[3])

            window.FindElement('_movelist_').Update(move_str + '\n',
                                                    append=True)

            piece = psg_board[from_row][from_col]
            psg_board[from_row][from_col] = BLANK
            psg_board[to_row][to_col] = piece
            redraw_board(window, psg_board)

            board, curPlayer = g.getNextState(board, curPlayer, best_move)
            move_count += 1
    sg.Popup('Game over!', 'Thank you for playing')
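# Hedged sketch (not part of the original script): the click handler above maps
# GUI coordinates to algebraic half-chess squares, columns 0-3 -> files 'e'-'h'
# and GUI row 0 -> rank 8. These two hypothetical helpers restate that mapping
# and its inverse.
FILES = 'efgh'

def coords_to_square(row, col):
    # GUI (row, col), row 0 at the top -> square name such as 'e8'
    return '{}{}'.format(FILES[col], 8 - row)

def square_to_coords(square):
    # square name -> GUI (row, col)
    return 8 - int(square[1]), FILES.index(square[0])

assert coords_to_square(*square_to_coords('f3')) == 'f3'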
Ejemplo n.º 17
0
    def train(self, iteration=None, board=None, numeps=None):

        # bookkeeping

        # examples of the iteration
        if numeps is None:
            numeps = self.args.numEps

        if not self.skipFirstSelfPlay or iteration > 1:
            iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

            eps_time = AverageMeter()
            bar = Bar('Self Play', max=numeps)
            end = time.time()
            #for clif_state in self.board
            for eps in range(numeps):
                self.mcts = MCTS(self.game, self.nnet,
                                 self.args)  # reset search tree
                if board is None:
                    iterationTrainExamples += self.executeEpisode()
                else:
                    iterationTrainExamples += self.executeEpisode(board)
                #print iterationTrainExamples[0]

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1,
                    maxeps=numeps,
                    et=eps_time.avg,
                    total=bar.elapsed_td,
                    eta=bar.eta_td)
                bar.next()
            bar.finish()

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)

        if len(self.trainExamplesHistory
               ) > self.args.numItersForTrainExamplesHistory:
            print("len(trainExamplesHistory) =",
                  len(self.trainExamplesHistory),
                  " => remove the oldest trainExamples")
            self.trainExamplesHistory.pop(0)
        # backup history to a file
        # NB! the examples were collected using the model from the previous iteration, so (i-1)
        self.saveTrainExamples(iteration - 1)

        # shuffle examples before training
        trainExamples = []
        for e in self.trainExamplesHistory:
            trainExamples.extend(e)
        shuffle(trainExamples)

        # training new network, keeping a copy of the old one
        self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                  filename='temp.pth.tar')
        self.pnet.load_checkpoint(folder=self.args.checkpoint,
                                  filename='temp.pth.tar')
        pmcts = MCTS(self.game, self.pnet, self.args)

        self.nnet.train(trainExamples)
        nmcts = MCTS(self.game, self.nnet, self.args)

        print('PITTING AGAINST PREVIOUS VERSION')
        arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                      lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                      self.game)
        pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

        print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
        if pwins + nwins > 0 and float(nwins) / (
                pwins + nwins) < self.args.updateThreshold:
            print('REJECTING NEW MODEL')
            self.nnet.load_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
        else:
            print('ACCEPTING NEW MODEL')
            self.nnet.save_checkpoint(
                folder=self.args.checkpoint,
                filename=self.getCheckpointFile(iteration))
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='best.pth.tar')
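# Hedged sketch: the accept/reject gate at the end of train() reduces to the
# rule below. should_accept is a hypothetical helper, shown only to make the
# decision explicit: when every arena game is a draw the code above falls
# through to ACCEPT, otherwise the new net must win at least updateThreshold
# of the decisive games.
def should_accept(nwins, pwins, update_threshold):
    decisive = nwins + pwins
    if decisive == 0:
        return True   # all games drawn: the condition above is False, so ACCEPT
    return float(nwins) / decisive >= update_threshold

assert should_accept(25, 15, 0.6) is True    # 62.5% of decisive games won
assert should_accept(20, 20, 0.6) is False   # only 50% -> rejected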
Ejemplo n.º 18
0
 def __init__(self, game, network, playouts):
     self.mcts = MCTS(game, network, playouts)
     self.gd = GameData()
Ejemplo n.º 19
0
import numpy as np
from utils import *
"""
use this script to play any two agents against each other, or play manually with
any agent.
"""

g = OthelloGame(6)

# all players
rp = RandomPlayer(g).play
gp = GreedyOthelloPlayer(g).play
hp = HumanOthelloPlayer(g).play

# nnet players
n1 = NNet(g)
n1.load_checkpoint('./pretrained_models/', '6x100x25_best.pth.tar')
args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts1 = MCTS(g, n1, args1)
n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))

#n2 = NNet(g)
#n2.load_checkpoint('/dev/8x50x25/','best.pth.tar')
#args2 = dotdict({'numMCTSSims': 25, 'cpuct':1.0})
#mcts2 = MCTS(g, n2, args2)
#n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

arena = Arena.Arena(n1p, hp, g, display=display)
print(arena.playGames(2, verbose=True))
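# Hedged sketch: the same Arena can compare any pair of the players defined
# above without a human in the loop. This round-robin is illustrative only and
# assumes playGames(num) returns (oneWon, twoWon, draws) as in the other
# examples in this collection.
contestants = {'random': rp, 'greedy': gp, 'alphazero': n1p}
names = list(contestants)
for i in range(len(names)):
    for j in range(i + 1, len(names)):
        a, b = names[i], names[j]
        one, two, draws = Arena.Arena(contestants[a], contestants[b], g,
                                      display=display).playGames(4)
        print('%s vs %s -> %d/%d/%d' % (a, b, one, two, draws))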
Ejemplo n.º 20
0
import sys

sys.path.append("../")
from MCTS import MCTS
import numpy as np

game_state = np.zeros((3, 3))
game_state[0, 0] = 1
mcts = MCTS(n_iterations=255,
            depth=10,
            exploration_constant=10,
            game_board=game_state,
            win_mark=3,
            player='O')

print(mcts._is_terminal(game_state))
# leaf_node_id,depth = mcts.selection()
# print(leaf_node_id,depth)
# child_node_id = mcts.expansion(leaf_node_id)
# print(child_node_id)
# winner = mcts.simulation(child_node_id)
# print(winner)
# mcts.backprop(child_node_id,winner)
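# Hedged sketch of how the four phases commented out above are usually chained
# into a single search loop. The method names selection/expansion/simulation/
# backprop and their signatures are taken from those comments and are not
# verified against this particular MCTS implementation.
def run_search(mcts, n_iterations):
    for _ in range(n_iterations):
        leaf_node_id, depth = mcts.selection()
        child_node_id = mcts.expansion(leaf_node_id)
        winner = mcts.simulation(child_node_id)
        mcts.backprop(child_node_id, winner)

# run_search(mcts, 255)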
Ejemplo n.º 21
0
    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        trainExamples = deque([], maxlen=self.args.maxlenOfQueue)
        for i in range(self.args.numIters):
            # bookkeeping
            print('------ITER ' + str(i + 1) + '------')
            eps_time = AverageMeter()
            bar = Bar('Self Play', max=self.args.numEps)
            end = time.time()

            for eps in range(self.args.numEps):
                trainExamples += self.executeEpisode()

                # bookkeeping + plot progress
                eps_time.update(time.time() - end)
                end = time.time()
                bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                    eps=eps + 1,
                    maxeps=self.args.numEps,
                    et=eps_time.avg,
                    total=bar.elapsed_td,
                    eta=bar.eta_td)
                bar.next()
            bar.finish()

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            pnet = self.nnet.__class__(self.game)
            pnet.load_checkpoint(folder=self.args.checkpoint,
                                 filename='temp.pth.tar')
            pmcts = MCTS(self.game, pnet, self.args)
            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          self.game)
            pwins, nwins = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : ' + str(nwins) + '/' + str(pwins))
            if pwins + nwins == 0 or float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet = pnet

            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='checkpoint_' + str(i) +
                                          '.pth.tar')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='best.pth.tar')
                self.mcts = MCTS(self.game, self.nnet,
                                 self.args)  # reset search tree
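# Hedged sketch: the temperature schedule described in the docstring boils down
# to sampling from the visit-count policy early in a game and playing greedily
# afterwards. pick_action is hypothetical and only illustrates how the output
# of getActionProb is consumed; in the real code the sharpening happens inside
# getActionProb itself when temp=0.
import numpy as np

def pick_action(pi, episode_step, temp_threshold):
    if episode_step < temp_threshold:       # exploratory phase (temp = 1)
        return int(np.random.choice(len(pi), p=pi))
    return int(np.argmax(pi))               # deterministic phase (temp = 0)

print(pick_action([0.1, 0.7, 0.2], episode_step=3, temp_threshold=15))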
Ejemplo n.º 22
0
class Coach:
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__()  # the competitor network
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = [
        ]  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self, x):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (state,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        logger = logging.getLogger("fireplace")
        logger.setLevel(logging.WARNING)
        trainExamples = []
        #print(id(self.game))
        #game = copy.deepcopy(self.game)
        current_game = self.game.getInitGame()
        #self.mcts = MCTS(self.game, self.nnet, self.args)   # reset search tree -> not needed since we get a copy of self.mcts in this child process, and MCTS itself uses a deep clone of this copy
        #curPlayer = 1 if f'{current_game.current_player}' == 'Player1' else -1
        #self.curPlayer = 1 if f'{current_game.current_player}' == 'Player1' else -1
        self.curPlayer = 1 if current_game.current_player.name == 'Player1' else -1
        #print(id(self.curPlayer))
        episodeStep = 0
        # timing
        # start = time.time()

        while True:
            episodeStep += 1
            # print('---Episode step ' + str(episodeStep) + '--- ' + current_process().name) #os.getpid())
            # print('TIME TAKEN : {0:03f}'.format(time.time()-start))
            # start = time.time()
            state = self.game.getState(
                current_game)  # state is from the current player's perspective
            temp = int(episodeStep < self.args.tempThreshold)

            # TODO: No MCTS reset? Should work because player switch leads to new mirror state and info in tree could possibly be reused. Otherwise starting a new tree could speed things up here (faster lookups through smaller mcts lists)!
            pi = self.mcts.getActionProb(
                state,
                temp=temp)  # pi is from the current player's perspective
            # print(self.mcts)
            pi_reshape = np.reshape(pi, (21, 18))
            # sym = self.game.getSymmetries(state, pi)
            trainExamples.append([state, self.curPlayer, pi,
                                  None])  # TODO: Is None still needed?
            # for b,p in sym:
            #     trainExamples.append([b, self.curPlayer, p, None])
            action = np.random.choice(len(pi), p=pi)
            a, b = np.unravel_index(np.ravel(action), pi_reshape.shape)
            next_state, self.curPlayer = self.game.getNextState(
                self.curPlayer, (a[0], b[0]), current_game)

            r = self.game.getGameEnded(
                current_game, self.curPlayer
            )  # returns 0 if game has not ended, 1 if curPlayer won, -1 if curPlayer lost

            if r != 0:
                return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
                        for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')

            # Find and load newest model (name scheme 'x.pth.tar' with highest x)
            modelfile = self.get_newest_model()
            print("Loading newest model:", modelfile)
            self.nnet.load_checkpoint(self.args.modelspath, modelfile)

            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([],
                                               maxlen=self.args.maxlenOfQueue)

                # with ProcessPoolExecutor(self.args.numThreads) as executor:
                #     results = list(tqdm(executor.map(self.executeEpisode, range(self.args.numEps)), total=self.args.numEps, desc='Self-play matches'))
                # iterationTrainExamples = [r for r in results]

                # with Pool(self.args.numThreads) as pool:
                #     for result in list(tqdm(pool.imap(self.executeEpisode, range(self.args.numEps)), total=self.args.numEps, desc='Self-play matches')):
                #         iterationTrainExamples += result

                for result in parallel_process(self.executeEpisode,
                                               range(self.args.numEps),
                                               workers=self.args.numThreads,
                                               desc='Self-play matches'):
                    iterationTrainExamples += result

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            # if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
            #     print("len(trainExamplesHistory) =", len(self.trainExamplesHistory), " => remove the oldest trainExamples")
            #     self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(modelfile, i - 1)

    def getCheckpointFile(self, modelfile, iteration):
        return modelfile + '_' + str(iteration)

    def saveTrainExamples(self, modelfile, iteration):
        folder = self.args.examplespath
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(
            folder,
            self.getCheckpointFile(modelfile, iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f, protocol=pickle.HIGHEST_PROTOCOL).dump(
                self.trainExamplesHistory)
        f.closed

    # def loadTrainExamples(self):
    #     modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
    #     examplesFile = modelFile+".examples"
    #     if not os.path.isfile(examplesFile):
    #         print(examplesFile)
    #         r = input("File with trainExamples not found. Continue? [y|n]")
    #         if r != "y":
    #             sys.exit()
    #     else:
    #         print("File with trainExamples found. Read it.")
    #         with open(examplesFile, "rb") as f:
    #             self.trainExamplesHistory = Unpickler(f).load()
    #         f.closed
    #         # examples based on the model were already collected (loaded)
    #         self.skipFirstSelfPlay = True

    def list_files(self, directory, extension):
        return (f for f in os.listdir(directory)
                if f.endswith('.' + extension))

    def get_modelnumber(self, filename):
        file_name = os.path.basename(filename)
        index_of_dot = file_name.index('.')
        file_name_without_extension = file_name[:index_of_dot]
        return int(file_name_without_extension)

    def get_newest_model(self):
        files = self.list_files(self.args.modelspath, "pth.tar")
        modelnumbers = list(map(self.get_modelnumber, files))
        maximum = max(modelnumbers)
        return str(maximum) + ".pth.tar"
Ejemplo n.º 23
0
from datetime import datetime
"""
use this script to play any two agents against each other, or play manually with
any agent.
"""
g = AzulGame(shouldRandomize=False)

# all players
n1 = NNet(g)
n2 = NNet(g)

n1.load_checkpoint('./temp/', 'best.pth.tar')
n2.load_checkpoint('./temp/', 'best.pth.tar')

args = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts = MCTS(g, n1, args)
mcts2 = MCTS(g, n2, args)
n1p = lambda x: np.argmax(mcts.getActionProb(x, temp=0))
n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

player1 = n1p
player2 = n2p

numberGames = 1
verbose = False

conn = sqlite3.connect("AzulGamesSite/AzulGameViewer/db.sqlite3")

for _ in range(numberGames):
    curTime = datetime.now().strftime("%m/%d/%y - %H:%M:%S")
    gameID = conn.cursor().execute(
Ejemplo n.º 24
0
res = {'random': {}, 'abp1': {}, 'abp2': {}, 'abp3': {}}

num = 10
cps = [1, 2, 5, 9, 17, 24, 36, 50, 63, 74, 85, 95, 99]
full_cps = [1, 2, 3, 5, 7, 8, 9, 11, 13, 17, 21, 24, 28, 29, 30, 31, 33, 36, 38,
            39, 40, 41, 42, 44, 48, 50, 57, 59, 60, 61, 63, 67, 68, 69, 71, 72, 73,
            74, 78, 79, 85, 89, 91, 95, 99]
cur_cps = [36, 38, 39, 40, 41, 42, 44, 48, 50, 57, 59, 60, 61, 63, 67, 68, 69, 71,
           72, 73, 74, 78, 79, 85, 89, 91, 95, 99]

for cp in cur_cps:
    n1 = NNet(g)
    n1.load_checkpoint('./pretrained_models/hex/pytorch/temp/',
                       'Copy of checkpoint_{}.pth.tar'.format(cp))
    args1 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
    mcts = MCTS(g, n1, args1)
    azp = lambda x, player: np.argmax(mcts.getActionProb(x, player, temp=0))

    arena = Arena.Arena(azp, rp.play, g, display=display)
    print('=========== playing check point {} vs {} ==========='.format(
        cp, 'random'))
    az_won, rp_won, draws = arena.playGames(num, verbose=True)
    print((az_won, rp_won, draws))
    total_turn = arena.total_turn
    print('sim count MCTS all', mcts.sim_count, 'avg game',
          mcts.sim_count / num, 'avg turn', mcts.sim_count / total_turn)
    res['random'][cp] = (az_won, num)

    for depth in [1, 2]:
        player = abps[depth]
        player.sim_count = 0
Ejemplo n.º 25
0
    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        self.game.prune_prob = self.args.prune_starting_prob
        train_black = self.args.train_black_first

        for i in range(1, self.args.numIters+1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.args.skip_first_self_play or i>1:
                iterationTrainExamples_white = deque([], maxlen=self.args.maxlenOfQueue)
                iterationTrainExamples_black = deque([], maxlen=self.args.maxlenOfQueue)
    
                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                if self.args.profile_coach:
                    prof = cProfile.Profile()
                    prof.enable()

                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)   # reset search tree

                    white_examples, black_examples = self.executeEpisode()

                    iterationTrainExamples_white += white_examples
                    iterationTrainExamples_black += black_examples

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(eps=eps + 1, maxeps=self.args.numEps, et=eps_time.avg,
                                                                                                               total=bar.elapsed_td, eta=bar.eta_td)
                    bar.next()
                bar.finish()
                if self.args.profile_coach:
                    prof.disable()
                    prof.print_stats(sort=2)

                # save the iteration examples to the history 
                self.trainExamplesHistory_white.append(iterationTrainExamples_white)
                self.trainExamplesHistory_black.append(iterationTrainExamples_black)
                
            while len(self.trainExamplesHistory_white) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =", len(self.trainExamplesHistory_white), " => remove the oldest trainExamples")
                self.trainExamplesHistory_white.pop(0)
                self.trainExamplesHistory_black.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)  
            self.saveTrainExamples(i-1)

            # training new network, keeping a copy of the old one
            self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
            self.white_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            self.black_pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')

            pmcts = MCTS(self.game, self.white_pnet, self.black_pnet, self.args)

            if not self.args.train_both:
                if train_black:
                    # shuffle examples before training
                    trainExamples = []
                    for e in self.trainExamplesHistory_black:
                        trainExamples.extend(e)
                    shuffle(trainExamples)
                    self.black_nnet.train(trainExamples)
                else:
                    # shuffle examples before training
                    trainExamples = []
                    for e in self.trainExamplesHistory_white:
                        trainExamples.extend(e)
                    shuffle(trainExamples)
                    self.white_nnet.train(trainExamples)
            else:
                # shuffle examples before training
                trainExamples = []
                for e in self.trainExamplesHistory_black:
                    trainExamples.extend(e)
                shuffle(trainExamples)
                self.black_nnet.train(trainExamples)

                # shuffle examples before training
                trainExamples = []
                for e in self.trainExamplesHistory_white:
                    trainExamples.extend(e)
                shuffle(trainExamples)
                self.white_nnet.train(trainExamples)

            nmcts = MCTS(self.game, self.white_nnet, self.black_nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda board, turn_player: np.argmax(pmcts.getActionProb(board, turn_player, temp=0)),
                          lambda board, turn_player: np.argmax(nmcts.getActionProb(board, turn_player, temp=0)),
                          self.game)
            pwins, nwins, draws, pwins_white, pwins_black, nwins_white, nwins_black \
                = arena.playGames(self.args.arenaCompare, self.args.profile_arena)

            print('NEW/PREV WINS (white, black) : (%d,%d) / (%d,%d) ; DRAWS : %d' % (nwins_white, nwins_black, pwins_white, pwins_black, draws))

            if pwins+nwins == 0 or float(nwins)/(pwins+nwins) < self.args.updateThreshold \
                    or nwins_black < pwins_black or nwins_white < pwins_white:
                print('REJECTING NEW MODEL')
                if not self.args.train_both:
                    if train_black:
                        self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
                    else:
                        self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
                else:
                    self.black_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_black.pth.tar')
                    self.white_nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp_white.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                if not self.args.train_both:
                    if train_black:
                        # self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.black))
                        self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar')
                        # if nwins_white == 0 or nwins_black / nwins_white >= self.args.train_other_network_threshold:
                        #     train_black = False
                        print("training white neural net next")
                        train_black = False
                    else:
                        # self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i, Player.white))
                        self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar')
                        # if nwins_black == 0 or nwins_white / nwins_black > self.args.train_other_network_threshold:
                        #     train_black = True
                        print("training black neural net next")
                        train_black = True
                else:
                    self.black_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_black.pth.tar')
                    self.white_nnet.save_checkpoint(folder=self.args.checkpoint, filename='best_white.pth.tar')
                self.game.prune_prob += self.args.prune_prob_gain_per_iteration
                self.args.arenaCompare = math.floor(self.args.arenaCompare * 1.05)
            # self.args.numEps = math.floor(self.args.numEps * 1.1)
            self.args.numMCTSSims = math.floor(self.args.numMCTSSims * 1.1)
            print("prune probability: " + str(self.game.prune_prob) + ", episodes: " + str(self.args.numEps) +
                  ", sims: " + str(self.args.numMCTSSims) + ", arena compare: " + str(self.args.arenaCompare))
Ejemplo n.º 26
0
    'numItersForTrainExamplesHistory': 20,
})

g = Game(6)
nnet = nn(g)

if args.load_model:
    nnet.load_checkpoint(args.load_folder_file[0], args.load_folder_file[1])

c = Coach(g, nnet, args)
board = g.getInitBoard()

player = 1

while g.getGameEnded(board, player) == 0:
    mcts = MCTS(g, nnet, args)
    action = mcts.get_action_prob(g.getCanonicalForm(board, 1))
    print(action)
    board, player = g.getNextState(board, 1, np.argmax(action))
    g.display(board)
    print([(g.action_dict[x], x)
           for x in np.where(g.getValidMoves(board, player) == 1)[0]])
    mode = -1
    while mode not in np.where(g.getValidMoves(board, player) == 1)[0]:
        try:
            mode = int(input('Input:'))
        except ValueError:
            print("Not a number")
    board, player = g.getNextState(board, player, mode)
    g.display(board)
Ejemplo n.º 27
0
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main_.py.
    """

    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor network
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)  # returns the board from the current player's perspective
            # print('---------coach-------------')
            # print(board)
            temp = int(episodeStep < self.args.tempThreshold)

            pi = self.mcts.getActionProb(canonicalBoard, obj_board=board, player=self.curPlayer, temp=temp)  # TODO: verify exactly what this returns
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                trainExamples.append([b, self.curPlayer, p, None])

            to_validate = self.game.getValidMoves(board, self.curPlayer)
            action = np.random.choice(len(pi), p=pi)
            if not to_validate[action] == 1:
                action = np.argmax(self.game.getValidMoves(board, self.curPlayer))
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)

            # board.apply_mirror()

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            log.info(f'Starting Iter #{i} ...')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                log.warning(
                    f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            # self.loadTrainExamples()
            # trainExamples = self.trainExamplesHistory
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            log.info('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x, y: np.argmax(pmcts.getActionProb(x, y, temp=0, player=1)),
                          lambda x, y: np.argmax(nmcts.getActionProb(x, y, temp=0, player=-1)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                log.info('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                log.info('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)
        f.closed

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            log.warning(f'File "{examplesFile}" with trainExamples not found!')
            r = input("Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            log.info("File with trainExamples found. Loading it...")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            log.info('Loading done!')

            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
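# Hedged sketch of an alternative to the argmax fallback used in executeEpisode
# above: instead of replacing an invalid sampled action with the first valid
# move, the MCTS policy can be masked by the valid-move vector and renormalised
# before sampling, so invalid actions are never drawn in the first place.
import numpy as np

def masked_policy(pi, valids):
    masked = np.asarray(pi, dtype=float) * np.asarray(valids, dtype=float)
    total = masked.sum()
    if total > 0:
        return masked / total
    # degenerate case: fall back to a uniform distribution over valid moves
    valids = np.asarray(valids, dtype=float)
    return valids / valids.sum()

pi = masked_policy([0.2, 0.5, 0.3], valids=[1, 0, 1])
action = np.random.choice(len(pi), p=pi)   # only indices 0 or 2 can be drawn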
Ejemplo n.º 28
0
 def __init__(self, game, n_mcts_per_step):
     self.mcts = MCTS(game)
     self.n_mcts_per_step = n_mcts_per_step
Ejemplo n.º 29
0
    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([],
                                               maxlen=self.args.maxlenOfQueue)

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                for eps in range(self.args.numEps):
                    self.mcts = MCTS(self.game, self.nnet,
                                     self.args)  # reset search tree
                    iterationTrainExamples += self.executeEpisode()

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps + 1,
                        maxeps=self.args.numEps,
                        et=eps_time.avg,
                        total=bar.elapsed_td,
                        eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =",
                      len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(0)
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint,
                                      filename='temp.pth.tar')
            pmcts = MCTS(self.game, self.pnet, self.args)

            self.nnet.train(trainExamples)
            nmcts = MCTS(self.game, self.nnet, self.args)

            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)),
                          self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                  (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(folder=self.args.checkpoint,
                                          filename='temp.pth.tar')
            else:
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename='best.pth.tar')
Ejemplo n.º 30
0
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in main.py.
    """
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor network
        self.args = args
        self.mcts = MCTS(self.game, self.nnet, self.args)
        self.trainExamplesHistory = [
        ]  # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard,pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []  # move history of this single episode
        board = self.game.getInitBoard()  # load the game setup
        self.curPlayer = WHITE  # WHITE goes first
        episodeStep = 0  # record the turns of self-play

        # start playing the game
        while True:
            # turn the objective board into self.curPlayer's POV board, i.e. black/white -> friend/enemy
            #       two kinds of board:
            #           1: Objective board: Black and White
            #           2: CanonicalBoard:  Friendly and Enemy
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            # print("Received CanonicalBoard:\n%s"%canonicalBoard.reshape(8,8))
            # print("Current player:%s"%self.curPlayer)
            # a = input()

            # once episodeStep >= tempThreshold, MCTS stops returning a soft distribution and just plays the best move
            # NOTE: mainly a speed-up, I guess? effectively disabled here, as episodeStep only reaches 24 while args.tempThreshold = 25
            temp = int(episodeStep < self.args.tempThreshold)

            # get the MCTS move probabilities for every action on the board, from self.curPlayer's POV

            pi = self.mcts.getActionProb(canonicalBoard,
                                         episodeStep,
                                         temp=temp)

            # one board position can generate several training examples, since symmetric positions are equivalent
            sym = self.game.getSymmetries(canonicalBoard, pi)

            # add training examples
            # BUG: not showing correctly
            for b, policyVector in sym:
                # (canonicalBoard, player, policy vector)
                trainExamples.append(
                    [b, self.curPlayer, policyVector, episodeStep])

            # #DEBUG:
            # probs_display = [round(x,2) for x in pi]
            # print("curr_player:%s turn:%s, probs:\n%s"%(self.curPlayer, episodeStep, np.array(probs_display).reshape(8,8)))

            # sample an action from the MCTS policy
            action = np.random.choice(len(pi), p=pi)

            if self.curPlayer == BLACK:
                action = self.game.blackActionConverter(action)

            #DEBUG
            # print("in player point of view \n player %s going to take action %s in turn %s board:\n%s"%(self.curPlayer, action, episodeStep, canonicalBoard.reshape(8,8)))

            # advance to the next player and update the objective board and turn counter
            board, self.curPlayer = self.game.getNextState(
                board, self.curPlayer,
                action)  #regardless of friendly or enemy, show objective
            episodeStep += 1

            #DEBUG
            # print("after action, objective board \n ")
            # print( board.reshape(8,8))
            # print("next player %s next turn %s"%(self.curPlayer, episodeStep))

            # check the new board status
            # returns 0 if the game continues, 1 if WHITE wins, -1 if BLACK wins
            # though the last turn was BLACK's move, we updated self.curPlayer after Black's action,
            # so we judge the result from WHITE's POV
            # the position after Black's last move is not added to trainExamples, as we already know who won;
            # we add the winning result in the next if statement
            r = self.game.getGameEnded(board,
                                       self.curPlayer,
                                       episodeStep,
                                       end_Evaluate=False)  #in WHITE's POV

            if r != 0 and self.curPlayer == WHITE:
                #DEBUG
                # print("Objective board")
                # print("game has ended, player %s result %s board:\n%s"%(self.curPlayer, r, board.reshape(8,8)))

                #return board winning result, who won it
                #(canonicalBoard,policyVector,v)
                # x[0] canonical board, x[2] policy vector, x[1] self.curPlayer of the canonical board
                # x[1] is BLACK: return -result, as -result is in BLACK's POV
                # x[1] is WHITE: return result, as result is in WHITE's POV
                # x[2] policyVector from MCTS
                # x[3] turn
                # TODO: do I need to add the turn to the policy vector as well?
                r = self.game.getGameEnded(board,
                                           self.curPlayer,
                                           episodeStep,
                                           end_Evaluate=True)
                generatedTraining = [
                    (x[0], x[2], r * ((-1)**(x[1] != WHITE)), x[3])
                    for x in trainExamples
                ]  # add turn as an input; no change needed for pi
                # generatedTraining = [(x[0],x[2],r*((-1)**(x[1]!=WHITE))) for x in trainExamples]
                # generatedTraining = [(x[0],x[2],r*((-1)**(x[1]!=self.curPlayer))) for x in trainExamples]

                #DEBUG
                # lastResult = generatedTraining[-1]
                # print("Input to trainExample")
                # print("result:%s, cannonicalboard:\n%s "%(lastResult[2], lastResult[0].reshape(8,8)))

                # a = input()
                return generatedTraining

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):  #for number of rounds
            # bookkeeping
            print('------ITER ' + str(i) + '------')
            # examples of the iteration
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque(
                    [], maxlen=self.args.maxlenOfQueue
                )  # start a fresh buffer, discarding the previous iteration's examples

                eps_time = AverageMeter()
                bar = Bar('Self Play', max=self.args.numEps)
                end = time.time()

                for eps in range(
                        self.args.numEps):  # for each self-play game of this round
                    self.mcts = MCTS(self.game, self.nnet,
                                     self.args)  # reset search tree

                    # returns [(canonicalBoard, pi, v), (canonicalBoard, pi, v), ...]
                    # v is the result
                    selfPlayResult = self.executeEpisode()
                    # play one game and add its history
                    iterationTrainExamples += selfPlayResult

                    # bookkeeping + plot progress
                    eps_time.update(time.time() - end)
                    end = time.time()
                    bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format(
                        eps=eps + 1,
                        maxeps=self.args.numEps,
                        et=eps_time.avg,
                        total=bar.elapsed_td,
                        eta=bar.eta_td)
                    bar.next()
                bar.finish()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            #self-play finished, updating the move history
            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                print("len(trainExamplesHistory) =",
                      len(self.trainExamplesHistory),
                      " => remove the oldest trainExamples")
                self.trainExamplesHistory.pop(
                    0)  #remove the oldest gaming history
            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)  #adding new move record
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.nnet.save_checkpoint(
                folder=self.args.checkpoint,
                filename='temp.pth.tar')  #save the previous net
            self.pnet.load_checkpoint(
                folder=self.args.checkpoint,
                filename='temp.pth.tar')  #read the previous net
            pmcts = MCTS(self.game, self.pnet,
                         self.args)  # MCTS for the previous model

            # use the new data to train the new model
            self.nnet.train(
                trainExamples)  # train the network with the new move records
            nmcts = MCTS(self.game, self.nnet,
                         self.args)  # MCTS for the newly trained model

            #OLD VS NEW
            print('PITTING AGAINST PREVIOUS VERSION')
            arena = Arena(
                lambda board, turn: np.argmax(
                    pmcts.getActionProb(board, turn, temp=0)),
                lambda board, turn: np.argmax(
                    nmcts.getActionProb(board, turn, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(
                self.args.arenaCompare)  #playing new mode against old models

            print('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                  (nwins, pwins, draws))
            if pwins + nwins > 0 and float(nwins) / (
                    pwins + nwins) < self.args.updateThreshold:
                #OLD WIN!
                print('REJECTING NEW MODEL')
                self.nnet.load_checkpoint(
                    folder=self.args.checkpoint, filename='temp.pth.tar'
                )  # revert to the previous model, since it beat the new one
            else:
                #NEW WIN!
                print('ACCEPTING NEW MODEL')
                self.nnet.save_checkpoint(folder=self.args.checkpoint,
                                          filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(
                    folder=self.args.checkpoint, filename='best.pth.tar'
                )  #save the new model, as this is the best

    def getCheckpointFile(self, iteration):  # name of the trained checkpoint file
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(
            folder,
            self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)
        f.closed

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0],
                                 self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            print(examplesFile)
            r = input("File with trainExamples not found. Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            print("File with trainExamples found. Read it.")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            f.closed
            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
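# Hedged sketch of the symmetry idea mentioned in the comments above ("one board
# position can generate several training examples"): for a game whose rules are
# mirror-invariant, every (canonicalBoard, pi) pair can be duplicated as its
# horizontal mirror. mirror_examples is hypothetical and assumes an 8x8 board
# with a 64-entry policy vector and no extra pass move.
import numpy as np

def mirror_examples(board, pi):
    board = np.asarray(board).reshape(8, 8)
    pi_grid = np.asarray(pi).reshape(8, 8)
    return [(board, pi_grid.flatten()),
            (np.fliplr(board), np.fliplr(pi_grid).flatten())]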
Ejemplo n.º 31
0
human_vs_cpu = True

g = HalfchessGame()

# all players
rp = RandomPlayer(g).play
# gp = GreedyOthelloPlayer(g).play
hp = HumanPlayer(g).play

# nnet player
nn = NNet(g)
#nn.load_checkpoint('./temp/','best.pth.tar')
nn.load_checkpoint('./pretrained_models/halfchess', '43it.pth.tar')
args = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
mcts = MCTS(g, nn, args)
nnp = lambda x: np.argmax(mcts.getActionProb(x, temp=1))


if human_vs_cpu:
    player1 = hp
else:
    n2 = NNet(g)
    n2.load_checkpoint('./pretrained_models/halfchess/', '26it_fixed_logic.pth.tar')
    args2 = dotdict({'numMCTSSims': 50, 'cpuct': 1.0})
    mcts2 = MCTS(g, n2, args2)
    n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

#     player2 = n2p  # Player 2 is neural network if it's cpu vs cpu.

arena = Arena.Arena(hp, nnp, g, display=str)
Ejemplo n.º 32
0
def onevsonegame(budget1, random1, counter1, usecounter_in_rollout_1, budget2,
                 random2, counter2, usecounter_in_rollout_2, whostarts, index):

    import random
    random.seed()
    np.random.seed()

    if whostarts == 'budget1':
        modulo = 1
    elif whostarts == 'budget2':
        modulo = 0

    # init tree, root, game
    tree = MCTS()
    c_uct = 1
    game = Game()
    turn = 0
    gameover = 0
    rootnode = tree.createNode(game.state)
    currentnode = rootnode

    # main loop
    while gameover == 0:

        turn = turn + 1

        if turn % 2 == modulo:
            #player = 'player1'
            sim_number = budget1
            usecounterinrollout = usecounter_in_rollout_1
            counter = counter1
            rd = random1

        else:
            #player = 'player2'
            sim_number = budget2
            usecounterinrollout = usecounter_in_rollout_2
            counter = counter2
            rd = random2

        if rd:  #completely random play / or random + counter
            if counter:
                currentnode, existscounter = getcountermove(currentnode, tree)
                if existscounter == False:
                    if len(currentnode.children) == 0:
                        tree.expand_all(currentnode)
                    randindex = int(random.random() *
                                    (len(currentnode.children)))
                    currentnode = currentnode.children[randindex]

            else:
                if len(currentnode.children) == 0:
                    tree.expand_all(currentnode)
                randindex = int(random.random() * (len(currentnode.children)))
                currentnode = currentnode.children[randindex]

        else:
            if counter:
                currentnode, existscounter = getcountermove(currentnode, tree)
                if existscounter == False:
                    for sims in range(0, sim_number):
                        tree.simulate(currentnode, UCT_simu, c_uct,
                                      usecounterinrollout)

                    visits = np.array(
                        [child.N for child in currentnode.children])
                    max_visits = np.where(visits == np.max(visits))[0]
                    imax = max_visits[int(random.random() * len(max_visits))]
                    currentnode = currentnode.children[imax]

            else:

                for sims in range(0, sim_number):
                    tree.simulate(currentnode, UCT_simu, c_uct,
                                  usecounterinrollout)

                visits = np.array([child.N for child in currentnode.children])
                max_visits = np.where(visits == np.max(visits))[0]
                imax = max_visits[int(random.random() * len(max_visits))]
                currentnode = currentnode.children[imax]

        # then reinit tree
        game = Game(currentnode.state)
        tree = MCTS()
        rootnode = tree.createNode(game.state)
        currentnode = rootnode
        gameover, winner = game.gameover()

    #print('end of game')
    if winner == 0:
        toreturn = 'draw'

    elif winner == 1:
        if whostarts == 'budget1':
            toreturn = 'budget1'
        else:
            toreturn = 'budget2'

    elif winner == -1:
        if whostarts == 'budget1':
            toreturn = 'budget2'
        else:
            toreturn = 'budget1'

    monresult = {'result': toreturn}
    filename = './data/game' + str(index) + '.txt'
    # the with-statement closes the file; no explicit close is needed
    with open(filename, 'wb') as file:
        pickle.dump(monresult, file)
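
# A minimal usage sketch (not part of the original function): run several games
# in parallel and tally the pickled results. The budgets, flags and game count
# below are placeholder assumptions for illustration only.
if __name__ == '__main__':
    import glob
    import os
    import pickle
    from multiprocessing import Pool

    os.makedirs('./data', exist_ok=True)
    n_games = 8
    jobs = [(100, False, False, False, 200, False, False, False,
             'budget1' if i % 2 == 0 else 'budget2', i) for i in range(n_games)]
    with Pool() as pool:
        pool.starmap(onevsonegame, jobs)

    results = []
    for path in glob.glob('./data/game*.txt'):
        with open(path, 'rb') as f:
            results.append(pickle.load(f)['result'])
    print({outcome: results.count(outcome) for outcome in ('budget1', 'budget2', 'draw')})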
Example No. 33
0
# assumed imports, following the standard alpha-zero-general project layout;
# in newer versions of that repo `display` lives at OthelloGame.display instead
import numpy as np

import Arena
from MCTS import MCTS
from othello.OthelloGame import OthelloGame, display
from othello.OthelloPlayers import RandomPlayer, GreedyOthelloPlayer, HumanOthelloPlayer
from othello.pytorch.NNet import NNetWrapper as NNet

from utils import *

"""
use this script to play any two agents against each other, or play manually with
any agent.
"""

g = OthelloGame(6)

# all players
rp = RandomPlayer(g).play
gp = GreedyOthelloPlayer(g).play
hp = HumanOthelloPlayer(g).play

# nnet players
n1 = NNet(g)
n1.load_checkpoint('./pretrained_models/othello/pytorch/','6x100x25_best.pth.tar')
args1 = dotdict({'numMCTSSims': 50, 'cpuct':1.0})
mcts1 = MCTS(g, n1, args1)
n1p = lambda x: np.argmax(mcts1.getActionProb(x, temp=0))


#n2 = NNet(g)
#n2.load_checkpoint('/dev/8x50x25/','best.pth.tar')
#args2 = dotdict({'numMCTSSims': 25, 'cpuct':1.0})
#mcts2 = MCTS(g, n2, args2)
#n2p = lambda x: np.argmax(mcts2.getActionProb(x, temp=0))

arena = Arena.Arena(n1p, hp, g, display=display)
print(arena.playGames(2, verbose=True))
Example No. 34
0
end = datetime.datetime.now()
print("Elapsed:", end - start)

for sims in np.arange(100, 2000, 100):
    good = 0
    for i in range(n_games):
        if i % 10 == 0: print("- Game", i, datetime.datetime.now())
        args = dotdict({
            'numMCTSSims': sims,
            'n_nodes': N_NODES,
            'max_dist': 100,
            'cpuct': 1,
        })

        mcts = MCTS(games[i], None, args)

        state = [0]
        R = 0
        Actions = []

        while not games[i].getGameEnded(state):
            action = np.argmax(mcts.getActionProb(state))
            state, reward = games[i].getNextState(state, action)
            Actions.append(action)
            R += reward

        score_of_mcts_path = games[i].path_pay(Actions)

        if score_of_mcts_path / optimal[i] < 1.1:
            good += 1
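    # Hypothetical reporting step (not in the original snippet): print how often
    # the MCTS path stayed within 10% of the optimal cost for this budget.
    print("numMCTSSims=%d: %d/%d games within 10%% of optimal" % (sims, good, n_games))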
Example No. 35
0
    def __init__(self, policy_net_path, value_net_path):
        self.mcts = MCTS(policy_net_path, value_net_path, time_limit=20)
Example No. 36
0
    'cpuct': 1,

    'checkpoint': './temp/',
    'load_model': True,
    'load_folder_file': ('/media/nmo/3E26969E2696572D/Martin/Programmieren/Machine-Learning/reinforcement_learning/alpha_go_zero/temp','best.pth.tar'),
    'numItersForTrainExamplesHistory': 20,

})

if __name__ == "__main__":
    game = Connect4Proxy(6, 7)
    net = NNetWrapper(game)
    net.load_checkpoint(folder='./temp/', filename='best.pth.tar')
    board = game.getInitBoard()

    nmcts = MCTS(game, net, args)
    curPlayer = 1
    it = 0

    def computer_move(canonical_board, valids):
        action = np.argmax(nmcts.getActionProb(canonical_board, temp=0))
        assert valids[action] > 0, "MCTS selected an invalid move"
        return action

    def human_move(canonical_board, valids):
        while True:
            # break if we make a legal move
            move = input("Enter column: ")
            if valids[int(move)] == 1:
                break