Example No. 1
    def arena(self, agent1, agent2, mcts_args, games_to_play=10):
        mcts1 = MCTS(agent1, mcts_args)
        mcts2 = MCTS(agent2, mcts_args)
        results = []

        for i in range(games_to_play): #tqdm()
            if i % 2 == 0:
                player1 = mcts1
                player2 = mcts2
            else:
                player2 = mcts1
                player1 = mcts2

            env = TicTacToeEnv()

            done = False
            while not done:
                first_player_move = env.fpt
                if first_player_move:
                    probs = player1.getProbs(env, temp=0)
                else:
                    probs = player2.getProbs(env, temp=0)
                
                action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
                _, reward, done, _ = env.step( (action//env.size, action % env.size) )
                if reward == -1:
                    print('Repeated move!')
                if done:
                    results.append( reward if first_player_move else -1*reward )
        return results
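
A minimal sketch of how the returned results list might be summarized, assuming the alternating-first-player convention used in Example No. 12 (results[i] == (-1)**i means agent1 won game i); summarize_arena is a hypothetical helper, not part of the original code:

def summarize_arena(results):
    # results[i] is game i's outcome from the first mover's perspective;
    # agent1 moves first in even-indexed games and second in odd-indexed games.
    wins = sum(1 for i, r in enumerate(results) if r == (-1) ** i)
    losses = sum(1 for i, r in enumerate(results) if r == -((-1) ** i))
    draws = len(results) - wins - losses
    return wins, losses, draws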
Example No. 2
    def execute_update(self, agent, mcts_args):
        train_samples = []
        for _ in range(self.num_eps): #tqdm()
            self.mcts = MCTS(agent, mcts_args)
            train_samples += self.play_game(agent)

        agent.updateNet(np.array(train_samples))
        return train_samples
Example No. 3
    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor (previous-iteration) network
        self.args = args

        # Monte Carlo tree search
        self.mcts = MCTS(self.game, self.nnet, self.args)

        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations

        # whether to skip the first round of self-play
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()
Example No. 4
class AlphaBot():
    def __init__(self, path, args):
        self.type = 'computer'
        self.net = AlphaNet(11, device='cuda:0')
        self.net.load_net(path)
        self.mcts = MCTS(self.net, args)

    def get_action(self, env, info):
        probs = self.mcts.getProbs(env, temp=0)
        action = np.random.choice(probs.shape[0], p=probs.reshape(-1, ))
        return action // env.size, action % env.size
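
A hypothetical usage sketch for AlphaBot: TicTacToeEnv and its step signature are taken from Examples No. 10 and No. 12, and passing info=None is an assumption since get_action does not use it here:

def play_one_game(bot, env):
    # Let the bot choose every move until the game ends; env.step takes a
    # (row, col) tuple and returns (obs, reward, done, info).
    done, reward = False, 0
    while not done:
        row, col = bot.get_action(env, info=None)
        _, reward, done, _ = env.step((row, col))
    return reward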
Example No. 5
 def selfPlay(self, mcts: MCTS, queue):
     dataOneGame = []
     mcts.reset()
     while not mcts.game.isTerminated():
         mcts.expandMaxNodes()
         Pi = mcts.Pi()
         if not len(Pi) > 0:
             break
         stepData = TrainData()
         stepData.inputPlanes = mcts.game.getInputPlanes()
         stepData.inputPolicyMask = mcts.game.getInputPolicyMask()
         stepData.predictionProbability = Pi
         actionIndex = self.selectActionIndex(Pi, stepData.inputPolicyMask)
         action = mcts.play(actionIndex)
         assert action
         dataOneGame.append(stepData)
     print('_', end='')
     sys.stdout.flush()
     resultValue = mcts.game.getTerminateValue()
     dataOneGame.reverse()
     for data in dataOneGame:
         data.predictionValue = resultValue
         resultValue = -resultValue
     for data in dataOneGame:
         queue.put(data)
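
A sketch of one possible consumer for that queue, assuming it is a standard multiprocessing.Queue of TrainData items; drain_queue and its batch-size default are hypothetical:

import queue as queue_module

def drain_queue(q, max_items=4096):
    # Collect items produced by selfPlay until max_items have been gathered
    # or the queue stays empty for one second.
    batch = []
    while len(batch) < max_items:
        try:
            batch.append(q.get(timeout=1.0))
        except queue_module.Empty:
            break
    return batch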
Example No. 6
    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            log.info(f'Starting Iter #{i} ...')
            # examples of the iteration

            # collect self-play data
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree

                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                log.warning(
                    f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
                self.trainExamplesHistory.pop(0)


            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # pool all historical examples, then shuffle before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training the new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            # MCTS for the competitor, using the network from the previous iteration
            pmcts = MCTS(self.game, self.pnet, self.args)

            # train the new network
            self.nnet.train(trainExamples)
            # MCTS with the newly trained network
            nmcts = MCTS(self.game, self.nnet, self.args)

            log.info('PITTING AGAINST PREVIOUS VERSION')
            # pit the new model against the previous one
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            log.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                log.info('REJECTING NEW MODEL')
                # the new model regressed; keep the previous one
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                log.info('ACCEPTING NEW MODEL')
                # the new model improved; keep it
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')
Example No. 7
class Coach():
    """
    This class executes the self-play + learning. It uses the functions defined
    in Game and NeuralNet. args are specified in train.py.
    """

    def __init__(self, game, nnet, args):
        self.game = game
        self.nnet = nnet
        self.pnet = self.nnet.__class__(self.game)  # the competitor (previous-iteration) network
        self.args = args

        # Monte Carlo tree search
        self.mcts = MCTS(self.game, self.nnet, self.args)

        self.trainExamplesHistory = []  # history of examples from args.numItersForTrainExamplesHistory latest iterations

        # whether to skip the first round of self-play
        self.skipFirstSelfPlay = False  # can be overridden in loadTrainExamples()

    def executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game is played, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThreshold, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v)
                           pi is the MCTS informed policy vector, v is +1 if
                           the player eventually won the game, else -1.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThreshold)

            # get the MCTS-improved policy
            pi = self.mcts.getActionProb(canonicalBoard, temp=temp)

            # get the board symmetries together with correspondingly permuted
            # policies, as a list of (board, pi) pairs, e.g. [(b1, p1), (b2, p2)]
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:
                # b is a board, p its policy vector
                trainExamples.append([b, self.curPlayer, p, None])

            # sample an action from the distribution pi
            action = np.random.choice(len(pi), p=pi)

            # advance the board and switch players
            board, self.curPlayer = self.game.getNextState(board, self.curPlayer, action)

            # query the game result (0 while the game is still in progress)
            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                # return (board, policy, value); the value is r from self.curPlayer's
                # perspective, sign-flipped for the other player's positions
                return [(x[0], x[2], r * ((-1) ** (x[1] != self.curPlayer))) for x in trainExamples]

    def learn(self):
        """
        Performs numIters iterations with numEps episodes of self-play in each
        iteration. After every iteration, it retrains neural network with
        examples in trainExamples (which has a maximum length of maxlenOfQueue).
        It then pits the new neural network against the old one and accepts it
        only if it wins >= updateThreshold fraction of games.
        """

        for i in range(1, self.args.numIters + 1):
            # bookkeeping
            log.info(f'Starting Iter #{i} ...')
            # examples of the iteration

            # collect self-play data
            if not self.skipFirstSelfPlay or i > 1:
                iterationTrainExamples = deque([], maxlen=self.args.maxlenOfQueue)

                for _ in tqdm(range(self.args.numEps), desc="Self Play"):
                    self.mcts = MCTS(self.game, self.nnet, self.args)  # reset search tree

                    iterationTrainExamples += self.executeEpisode()

                # save the iteration examples to the history
                self.trainExamplesHistory.append(iterationTrainExamples)

            if len(self.trainExamplesHistory) > self.args.numItersForTrainExamplesHistory:
                log.warning(
                    f"Removing the oldest entry in trainExamples. len(trainExamplesHistory) = {len(self.trainExamplesHistory)}")
                self.trainExamplesHistory.pop(0)


            # backup history to a file
            # NB! the examples were collected using the model from the previous iteration, so (i-1)
            self.saveTrainExamples(i - 1)

            # pool all historical examples, then shuffle before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training the new network, keeping a copy of the old one
            self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            self.pnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            # MCTS for the competitor, using the network from the previous iteration
            pmcts = MCTS(self.game, self.pnet, self.args)

            # train the new network
            self.nnet.train(trainExamples)
            # MCTS with the newly trained network
            nmcts = MCTS(self.game, self.nnet, self.args)

            log.info('PITTING AGAINST PREVIOUS VERSION')
            # pit the new model against the previous one
            arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)),
                          lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game)
            pwins, nwins, draws = arena.playGames(self.args.arenaCompare)

            log.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % (nwins, pwins, draws))
            if pwins + nwins == 0 or float(nwins) / (pwins + nwins) < self.args.updateThreshold:
                log.info('REJECTING NEW MODEL')
                # the new model regressed; keep the previous one
                self.nnet.load_checkpoint(folder=self.args.checkpoint, filename='temp.pth.tar')
            else:
                log.info('ACCEPTING NEW MODEL')
                # the new model improved; keep it
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename=self.getCheckpointFile(i))
                self.nnet.save_checkpoint(folder=self.args.checkpoint, filename='best.pth.tar')

    def getCheckpointFile(self, iteration):
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        folder = self.args.checkpoint
        if not os.path.exists(folder):
            os.makedirs(folder)
        filename = os.path.join(folder, self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)
        # the file is closed automatically by the with-block

    def loadTrainExamples(self):
        modelFile = os.path.join(self.args.load_folder_file[0], self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            log.warning(f'File "{examplesFile}" with trainExamples not found!')
            r = input("Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            log.info("File with trainExamples found. Loading it...")
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            log.info('Loading done!')

            # examples based on the model were already collected (loaded)
            self.skipFirstSelfPlay = True
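
A minimal driver sketch for this Coach: ConnectX_AlphaZero, NNet and dotdict are taken from Example No. 11, the field names match what the Coach code above reads from args, and the specific values are assumptions:

# hypothetical settings; only the field names are taken from the Coach code above
args = dotdict({
    'numIters': 10, 'numEps': 25, 'tempThreshold': 15,
    'updateThreshold': 0.6, 'maxlenOfQueue': 200000,
    'numMCTSSims': 50, 'cpuct': 1.0, 'arenaCompare': 40,
    'checkpoint': './temp/',
    'load_folder_file': ('./temp/', 'best.pth.tar'),
    'numItersForTrainExamplesHistory': 20,
})

g = ConnectX_AlphaZero()   # the game wrapper used in Example No. 11
nnet = NNet(g)
coach = Coach(g, nnet, args)
coach.learn()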
Example No. 8
 def createMCTS(self, network) -> MCTS:
     config = self.createMCTSConfig()
     game = self.createGame(network)
     mcts = MCTS(game, config)
     return mcts
Example No. 9
 def __init__(self, agent, mcts_args, temp_thres=16, num_eps=20):
     self.temp_thres = temp_thres
     self.mcts = MCTS(agent, mcts_args)
     self.num_eps = num_eps 
Example No. 10
class Trainer():

    def __init__(self, agent, mcts_args, temp_thres=16, num_eps=20):
        self.temp_thres = temp_thres
        self.mcts = MCTS(agent, mcts_args)
        self.num_eps = num_eps 

    def play_game(self, agent):
        train_samples = []
        episode_step = 0
        env = TicTacToeEnv()

        while True:
            episode_step += 1
            temp = int(episode_step < self.temp_thres)
            probs = self.mcts.getProbs(env, temp=temp)

            action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
            #probs_orig, val = self.mcts.net.predict(env.getPBoard())
            for board_s, probs_s in getBoardSims(env.getPBoard(), probs):
                train_samples.append([board_s, probs_s, env.fpt]) #, probs_orig, action, val
            _, reward, done, _ = env.step( (action//env.size, action % env.size) )
            if done:
                final_player = env.fpt
                return [ (x[0], x[1], reward if final_player != x[2] else -reward) for x in train_samples] #, x[3], x[4], x[5]

    def execute_update(self, agent, mcts_args):
        train_samples = []
        for _ in range(self.num_eps): #tqdm()
            self.mcts = MCTS(agent, mcts_args)
            train_samples += self.play_game(agent)

        agent.updateNet(np.array(train_samples))
        return train_samples

    def arena(self, agent1, agent2, mcts_args, games_to_play=10):
        mcts1 = MCTS(agent1, mcts_args)
        mcts2 = MCTS(agent2, mcts_args)
        results = []

        for i in range(games_to_play): #tqdm()
            if i % 2 == 0:
                player1 = mcts1
                player2 = mcts2
            else:
                player2 = mcts1
                player1 = mcts2

            env = TicTacToeEnv()

            done = False
            while not done:
                first_player_move = env.fpt
                if first_player_move:
                    probs = player1.getProbs(env, temp=0)
                else:
                    probs = player2.getProbs(env, temp=0)
                
                action = np.random.choice(probs.shape[0], p=probs.reshape(-1,))
                _, reward, done, _ = env.step( (action//env.size, action % env.size) )
                if reward == -1:
                    print('Repeated move!')
                if done:
                    results.append( reward if first_player_move else -1*reward )
        return results
Example No. 11
                if display:
                    env.render()

                if reward == -1:
                    print('You lose!')
                    lose_count += 1
                elif reward == 1:
                    print('You won!')
                    won_count += 1
                elif reward == -10:
                    exit()
                break
    # round is the total number of games played (a parameter of play_with_coumpter)
    print('%d-%d-%d' % (won_count, round - (won_count + lose_count), lose_count))


g = ConnectX_AlphaZero()
# neural network
n = NNet(g)
n.load_checkpoint('./temp', 'best.pth.tar')
mcts = MCTS(g, n, dotdict({'numMCTSSims': 50, 'cpuct': 1.0}))
# action selection: pick the move with the highest MCTS visit count
alpha_zero_action = lambda x: np.argmax(mcts.getActionProb(x, temp=0))

# play_with_coumpter(alpha_zero_action, player=1, agent_level=2, round=100)
# play_with_coumpter(alpha_zero_action, player=2, agent_level=2, round=100)
# play_with_coumpter(alpha_zero_action, player=1, agent_level=1, round=100)
# play_with_coumpter(alpha_zero_action, player=2, agent_level=1, round=100)

play_with_human(alpha_zero_action, 1)
Example No. 12
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

from env import TicTacToeEnv
from utils import *
from AlphaZero.AlphaNet import AlphaNet
from AlphaZero.MCTS import MCTS
from AlphaZero.trainAlphaZero import Trainer
import numpy as np
import torch

if __name__ == '__main__':
    net = AlphaNet(11, device='cuda:0')
    net.load_net('AlphaZero/models/net_updates_2000.pth')
    args = {'c': 1., 'num_sims': 25, 'sleep_time': 0}
    mcts = MCTS(net, args)
    coach = Trainer(net, args, num_eps=100)

    for num_update in range(2001, 3001):
        print('Starting update {}'.format(num_update))
        coach.execute_update(net, args)

        net_old = AlphaNet(11, device='cuda:0')
        net_old.load_net('AlphaZero/models/net_updates_' +
                         str(num_update - 1) + '.pth')

        results = coach.arena(net, net_old, args, games_to_play=40)

        num_of_wins = np.sum(
            [1 if results[i] == (-1)**(i) else 0 for i in range(len(results))])
        num_of_losses = np.sum(
            [1 if results[i] == -((-1)**(i)) else 0 for i in range(len(results))])
Example No. 13
 def __init__(self, path, args):
     self.type = 'computer'
     self.net = AlphaNet(11, device='cuda:0')
     self.net.load_net(path)
     self.mcts = MCTS(self.net, args)