Example #1
    def maxValue(self, alpha, beta, currentDepth, simulator):
        """Negamax search with alpha-beta pruning; returns the value of the
        current position from the point of view of the player to move."""
        self.iter += 1
        if currentDepth == self.depth:
            # Depth limit reached: fall back to the heuristic evaluation.
            return simulator.approxScore()
        value = -1e9
        # timer.startTime("fetch actions")
        availableActions = self.reducedAvailableActions(simulator)
        # timer.endTime("fetch actions")
        for act in availableActions:
            timer.startTime("take action")
            simulator.takeAction(act)
            timer.endTime("take action")
            if simulator.isFinish():
                # The move ends the game; approxScore is relative to the side
                # now to move (the opponent), so negate it for the mover.
                value = -simulator.approxScore()
                simulator.rollbackLastAction()
                break
            # Negamax recursion: negate the child's value and swap/negate the window.
            value = max(
                value,
                -self.maxValue(-beta, -alpha, currentDepth + 1, simulator))
            timer.startTime("rollback")
            simulator.rollbackLastAction()
            timer.endTime("rollback")
            if value >= beta:
                # Beta cutoff: the opponent will never allow this line.
                break
            alpha = max(alpha, value)
        return value
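For context, the method above is plain negamax with alpha-beta pruning; a root-level caller would try each candidate move and keep the one whose negated child value is largest. The driver below is an illustrative sketch rather than code from the project: the window bounds and the starting depth of 1 are assumptions.

    def getAction(self, simulator):
        """Hypothetical root driver for maxValue (not part of the snippet above)."""
        bestValue, bestAction = -1e9, None
        for act in self.reducedAvailableActions(simulator):
            simulator.takeAction(act)
            # The move's value for the side that just moved is the negated child value.
            if simulator.isFinish():
                value = -simulator.approxScore()
            else:
                value = -self.maxValue(-1e9, 1e9, 1, simulator)
            simulator.rollbackLastAction()
            if value > bestValue:
                bestValue, bestAction = value, act
        return bestAction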
Example #2
    def updateFeatureAndScore(self, currentPlayer, action, mode):
        """Measure the runs of currentPlayer's stones through `action` along the
        four line directions and, depending on `mode`, update the feature counts
        and score: mode = +1 when a stone is placed, -1 when it is rolled back,
        0 to only query the importance of the square."""
        # The four line directions: two diagonals, vertical, and horizontal.
        dx = [-1, -1, -1, 0]
        dy = [-1, 0, 1, -1]
        x, y = action[0], action[1]
        lengths = []
        board = self.boardList[currentPlayer]
        timer.startTime("part 2:get length")
        importance = 0
        for i in range(4):
            # l and r count the consecutive own stones on either side of (x, y).
            l, r = 0, 0
            xx, yy = x - dx[i], y - dy[i]
            while l + 1 + 1 <= self.numberForWin and not self.outOfRange(
                    xx, yy) and board[xx][yy]:
                l += 1
                xx -= dx[i]
                yy -= dy[i]

            xx, yy = x + dx[i], y + dy[i]
            while r + l + 1 + 1 <= self.numberForWin and not self.outOfRange(
                    xx, yy) and board[xx][yy]:
                r += 1
                xx += dx[i]
                yy += dy[i]
            # Total run length through (x, y), capped at numberForWin.
            lengths.append(l + r + 1)
            importance += self.lenToScore(l + r + 1)
        timer.endTime("part 2:get length")

        if mode == 0:
            # Query only: report the importance of the square, change nothing.
            return importance

        timer.startTime("part 2:update")
        for length in lengths:
            self.feature[currentPlayer][length] += 1 * mode
            if length == self.numberForWin:
                if self.feature[currentPlayer][length] == 0 or self.feature[
                        currentPlayer][length] == 1:
                    self.playerScore[currentPlayer] += self.lenToScore(
                        length) * mode
            else:
                self.playerScore[currentPlayer] += self.lenToScore(
                    length) * mode
        timer.endTime("part 2:update")
        return importance
Example #3
    def getAction(self, simulator):
        """Pick the move the policy network rates highest, breaking ties at random."""
        TimeId = timer.startTime("GetAction")
        # Query the policy-value network on the current board state.
        self.policy, _ = self.net.getPolicy_Value(
            torch.tensor(simulator.getCurrentState(), dtype=torch.float))
        self.policy = self.policy.tolist()
        bestAction = (-1, -1)
        bestProb = -1e9
        self.actProPair = {}
        for act in simulator.getAvailableActions():
            prob = self.policy[simulator.encodeAction(act)]
            self.actProPair[act] = prob
            if prob > bestProb:
                bestAction = act
                bestProb = prob
            elif prob == bestProb and np.random.random() > 0.5:
                # Break exact ties with a coin flip so play is not fully deterministic.
                bestAction = act
        timer.endTime(TimeId)
        return bestAction
Example #4
    def getAction(self, simulator):
        """Run MCTS, sample a move from its visit-count policy, and record the
        (action, policy, player) triple for later network training."""
        TimeID = timer.startTime("Get action")
        self.mcts.run(self.numOfiterations,
                      simulator,
                      self.network,
                      rolloutFn=self.rollout,
                      balance=self.balance)
        act_pro_pair = self.mcts.getPolicy()
        keys = list(act_pro_pair.keys())
        values = list(act_pro_pair.values())
        # Sample an action in proportion to the MCTS policy.
        action = keys[np.random.choice(len(values), 1, p=values)[0]]
        # Advance the search tree so the chosen child becomes the new root.
        self.mcts.takeAction(action)
        # Expand the sparse policy into a dense vector over all board squares.
        policy = [0 for _ in range(simulator.getSize()**2)]
        for act in act_pro_pair.keys():
            policy[simulator.encodeAction(act)] = act_pro_pair[act]
        self.datalist.append((action, policy, simulator.getCurrentPlayer()))
        timer.endTime(TimeID)
        return action
Example #5
    def expand(self, simulator, expandingFn, rolloutFn=None, balance=1):
        """Walk from the root to a leaf by PUCT, expand the leaf, evaluate it,
        and back up the result along the visited path."""
        simulator = copy.deepcopy(simulator)  # work on a copy so the real game state is untouched
        node = self.currentRootNode
        # Selection: follow the PUCT-best child until a leaf is reached.
        while not node.isLeaf():
            action = node.bestActionByPUCT()
            node = node.children[action]
            simulator.takeAction(action)
        if simulator.isFinish():
            # Terminal leaf: the game result is the value.
            z = 1.0 if simulator.getWinner() == simulator.getCurrentPlayer() else -1.0
        else:
            # Expansion: ask the network for a prior policy and a value estimate.
            Id = timer.startTime('Expanding')
            actions = simulator.getAvailableActions()
            actionProbability, z = expandingFn.getPolicy_Value(simulator.getCurrentState())
            timer.endTime(Id)
            e = 0
            Id = timer.startTime('Rollout')
            if balance > 0:
                # Optional rollout estimate; fall back to the sign of the heuristic score.
                e = (1.0 if simulator.approxScore() > 0 else -1.0) if (rolloutFn is None) else rolloutFn(simulator)
            timer.endTime(Id)
            # Blend rollout and network value: balance = 1 is pure rollout, 0 is pure network.
            z = balance * e + (1 - balance) * z
            for action in actions:
                Id = timer.startTime('Child')
                node.children[action] = TreeNode(node, action, actionProbability[simulator.encodeAction(action)], self.C)
                timer.endTime(Id)
        # Backup: propagate the value towards the root, flipping sign each ply.
        while node != self.currentRootNode:
            node.N += 1
            node.W += -z  # a node's Q stores the value of its parent taking this move
            node.V = node.W / node.N
            z = -z
            node = node.fatherNode
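The selection step above relies on node.bestActionByPUCT(), whose implementation is not shown. The sketch below illustrates the usual PUCT rule using only the fields that do appear in the snippet (N, V, the prior passed to TreeNode, and the constant C); the attribute name P for that prior and the exact formula are assumptions, not taken from the original code.

    def bestActionByPUCT(self):
        """Hypothetical PUCT selection over this node's children (illustrative)."""
        totalN = sum(child.N for child in self.children.values())
        bestScore, bestAction = -1e9, None
        for action, child in self.children.items():
            # Exploitation term (child.V) plus an exploration bonus weighted by the prior.
            u = self.C * child.P * (totalN ** 0.5) / (1 + child.N)
            score = child.V + u
            if score > bestScore:
                bestScore, bestAction = score, action
        return bestAction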
Example #6
def Training():
    """Main self-play / training / evaluation loop."""
    args = argument.get_args()
    logger = get_logger()
    # -1 means start from scratch; otherwise continue from the latest saved network.
    currentModel = -1 if args.overwrite else dataProcessor.getLatestNetworkID()
    trainWorker = NetworkTraining()
    replayBuffer = []
    Loss = []
    WinRate = []

    # Choose the leaf-evaluation scheme: balance0 = 0 uses only the network value,
    # balance0 = 1 uses only the rollout, anything in between mixes the two.
    rollout0 = None
    balance0 = 0
    if args.rolloutMode == 'network':
        rollout0 = None
        balance0 = 0
    elif args.rolloutMode == 'minmax':
        rollout0 = minMaxRolloutFn(1)
        balance0 = 1
    elif args.rolloutMode == 'random':
        rollout0 = randomRolloutFn(20)
        balance0 = 1
    elif args.rolloutMode == 'mix_minmax':
        rollout0 = minMaxRolloutFn(1)
        balance0 = args.balance
    elif args.rolloutMode == 'mix_random':
        rollout0 = randomRolloutFn(30)
        balance0 = args.balance
    else:
        rollout0 = None
        balance0 = 1

    for rd in range(1, args.trainround + 1):
        logger.info("round:%d" % rd)
        if currentModel != -1:
            model = dataProcessor.loadNetwork(args, currentModel)
        else:
            model = PolicyValueFn(args).to(device=args.device)
        # Exploration temperature for self-play: log(trainround) + 1 at round 1, decaying to 1.
        eta = math.log(args.trainround / rd) + 1
        file = os.path.join(args.data_folder, f"selfplay-{currentModel+1}.txt")
        agent1 = Agent.SelfplayAgent(args.numOfIterations,
                                     model,
                                     file,
                                     eta,
                                     rollout=rollout0,
                                     balance=balance0)

        b = Board.Board(args.size, args.numberForWin)
        # The same self-play agent controls both sides.
        g = Game.Game(agent0=agent1, agent1=agent1, simulator=b)

        for i in range(1, args.epochs + 1):
            logger.info("epoch %d" % i)
            TimeID = timer.startTime("play time")
            g.run()
            timer.endTime(TimeID)
            timer.showTime(TimeID)
            if i % args.n_save_step == 0:
                agent1.saveData()
            if args.openReplayBuffer and len(replayBuffer) > args.buffersize:
                # Sample a mini-batch (with replacement) from the replay buffer
                # and run a short training pass without bumping the model ID.
                buffer = [random.choice(replayBuffer) for _ in range(args.buffersize)]
                trainWorker.train(args.miniTrainingEpochs,
                                  currentModel,
                                  buffer,
                                  update=False)
        agent1.saveData()
        dataList = dataProcessor.retrieveData(file)
        replayBuffer = replayBuffer + dataList
        if len(replayBuffer) > args.maxBufferSize:
            # Keep only the most recent positions.
            replayBuffer = replayBuffer[-args.maxBufferSize:]
        currentModel += 1
        TimeID = timer.startTime("network training")
        # Full training pass on this round's self-play data; record the loss.
        Loss.append(trainWorker.train(args.trainepochs, currentModel,
                                      dataList))
        timer.endTime(TimeID)
        timer.showTime(TimeID)

        # Evaluate the newly trained network against the baseline opponent.
        agentTest = Agent.IntelligentAgent(args.numOfIterations,
                                           dataProcessor.loadNetwork(args),
                                           rolloutFn=rollout0,
                                           balance=balance0)

        exp = Experiment()
        WinRate.append(exp.evaluationWithBaseLine(agentTest))
        logger.info("WinRate: %.3f" % WinRate[-1])
    return Loss, WinRate
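The training loop above only needs the rollout factories (randomRolloutFn, minMaxRolloutFn) to return callables with the signature used in Example #5: they receive the simulator at a leaf and return a value in [-1, 1] for the player to move there. A minimal sketch of a random-playout version follows; treating cnt as a cap on the number of random moves, and falling back to the sign of approxScore (assumed to be relative to the side to move) when that cap is hit, are assumptions rather than the project's actual implementation.

import copy
import random

def randomRolloutFn(cnt):
    """Hypothetical factory returning a rollout callable compatible with expand()."""
    def rollout(simulator):
        simulator = copy.deepcopy(simulator)  # never disturb the search state
        player = simulator.getCurrentPlayer()
        for _ in range(cnt):
            if simulator.isFinish():
                break
            simulator.takeAction(random.choice(simulator.getAvailableActions()))
        if simulator.isFinish():
            return 1.0 if simulator.getWinner() == player else -1.0
        # Cap reached: use the heuristic score of the side to move,
        # flipped when that side is not the player we are evaluating for.
        sign = 1.0 if simulator.getCurrentPlayer() == player else -1.0
        return sign * (1.0 if simulator.approxScore() > 0 else -1.0)
    return rollout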