def maxValue(self, alpha, beta, currentDepth, simulator):
    # Negamax search with alpha-beta pruning: the returned value is always
    # from the perspective of the player to move, so the recursive call is
    # negated and the (alpha, beta) window is flipped to (-beta, -alpha).
    self.iter += 1
    if currentDepth == self.depth:
        return simulator.approxScore()
    value = -1e9
    # timer.startTime("fetch actions")
    availableActions = self.reducedAvailableActions(simulator)
    # timer.endTime("fetch actions")
    for act in availableActions:
        timer.startTime("take action")
        simulator.takeAction(act)
        timer.endTime("take action")
        if simulator.isFinish():
            # Terminal position: score it from our side and stop searching
            # this node's remaining actions.
            value = -simulator.approxScore()
            simulator.rollbackLastAction()
            break
        value = max(
            value,
            -self.maxValue(-beta, -alpha, currentDepth + 1, simulator))
        timer.startTime("rollback")
        simulator.rollbackLastAction()
        timer.endTime("rollback")
        if value >= beta:  # beta cutoff: the opponent will never allow this line
            break
        alpha = max(alpha, value)
    return value
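# A minimal sketch of how maxValue could be driven from the root to choose a
# move. rootSearch is hypothetical (not part of the source); it assumes only
# the simulator API already used above (takeAction, rollbackLastAction,
# isFinish, approxScore) and the searcher's reducedAvailableActions.
def rootSearch(self, simulator):
    bestAction, bestValue = None, -1e9
    for act in self.reducedAvailableActions(simulator):
        simulator.takeAction(act)
        if simulator.isFinish():
            value = -simulator.approxScore()
        else:
            # Search the reply with the negated, swapped window.
            value = -self.maxValue(-1e9, -bestValue, 1, simulator)
        simulator.rollbackLastAction()
        if value > bestValue:
            bestAction, bestValue = act, value
    return bestAction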
def updateFeatureAndScore(self, currentPlayer, action, mode):
    # The four line directions through (x, y): two diagonals, vertical,
    # and horizontal; each is scanned in both orientations below.
    dx = [-1, -1, -1, 0]
    dy = [-1, 0, 1, -1]
    x, y = action[0], action[1]
    lengths = []
    board = self.boardList[currentPlayer]
    timer.startTime("part 2:get length")
    importance = 0
    for i in range(4):
        # l/r count consecutive own stones on either side of (x, y); each
        # scan stops once the whole line would exceed numberForWin.
        l, r = 0, 0
        xx, yy = x - dx[i], y - dy[i]
        while l + 1 + 1 <= self.numberForWin and not self.outOfRange(
                xx, yy) and board[xx][yy]:
            l += 1
            xx -= dx[i]
            yy -= dy[i]
        xx, yy = x + dx[i], y + dy[i]
        while r + l + 1 + 1 <= self.numberForWin and not self.outOfRange(
                xx, yy) and board[xx][yy]:
            r += 1
            xx += dx[i]
            yy += dy[i]
        lengths.append(l + r + 1)
        importance += self.lenToScore(l + r + 1)
    timer.endTime("part 2:get length")
    if mode == 0:  # query only: report importance without mutating features
        return importance
    timer.startTime("part 2:update")
    for length in lengths:
        # mode is +1 when a stone is placed and -1 when it is rolled back.
        self.feature[currentPlayer][length] += 1 * mode
        if length == self.numberForWin:
            # A winning line is only scored on the 0 <-> 1 transition so
            # that overlapping winning lines are not double counted.
            if self.feature[currentPlayer][length] in (0, 1):
                self.playerScore[currentPlayer] += self.lenToScore(length) * mode
        else:
            self.playerScore[currentPlayer] += self.lenToScore(length) * mode
    timer.endTime("part 2:update")
    return importance
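# lenToScore is used above but not shown. A minimal sketch of what such a
# mapping could look like; the exponential weighting is an assumption for
# illustration, not the source's actual table. The idea is that longer lines
# are worth exponentially more, and a completed line of numberForWin stones
# dominates any combination of shorter ones.
def lenToScore(self, length):
    if length >= self.numberForWin:
        return 10 ** self.numberForWin  # a win outweighs everything else
    return 10 ** (length - 1)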
def getAction(self, simulator):
    TimeId = timer.startTime("GetAction")
    # Query the policy head once for the current board state, then play the
    # legal action with the highest probability.
    self.policy, _ = self.net.getPolicy_Value(
        torch.tensor(simulator.getCurrentState(), dtype=torch.float))
    self.policy = self.policy.tolist()
    bestAction = (-1, -1)
    bestProb = -1e9
    self.actProPair = {}
    for act in simulator.getAvailableActions():
        prob = self.policy[simulator.encodeAction(act)]
        self.actProPair[act] = prob
        if prob > bestProb:
            bestAction = act
            bestProb = prob
        elif prob == bestProb and np.random.random() > 0.5:
            # Coin-flip tie-breaking. Note this is not uniform over all tied
            # actions: each new tie replaces the incumbent with probability
            # 1/2, which favors later actions when more than two are tied.
            bestAction = act
    timer.endTime(TimeId)
    return bestAction
def getAction(self, simulator):
    TimeID = timer.startTime("Get action")
    # Run MCTS from the current position, then sample a move from the
    # visit-count policy instead of playing greedily, which keeps the
    # self-play data diverse.
    self.mcts.run(self.numOfiterations, simulator, self.network,
                  rolloutFn=self.rollout, balance=self.balance)
    act_pro_pair = self.mcts.getPolicy()
    keys = list(act_pro_pair.keys())
    values = list(act_pro_pair.values())
    action = keys[np.random.choice(len(values), 1, p=values)[0]]
    self.mcts.takeAction(action)
    # Expand the sparse action -> probability map into a dense vector over
    # all size*size board cells for the training set.
    policy = [0 for _ in range(simulator.getSize() ** 2)]
    for act in act_pro_pair:
        policy[simulator.encodeAction(act)] = act_pro_pair[act]
    self.datalist.append((action, policy, simulator.getCurrentPlayer()))
    timer.endTime(TimeID)
    return action
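# A minimal sketch of the action encoding that the dense policy vector above
# relies on. The row-major mapping is an assumption about what
# simulator.encodeAction does, shown as a free function for illustration;
# the source's actual encoding may differ.
def encodeAction(action, size):
    x, y = action
    return x * size + y  # row-major index into the length size*size vector

# Example: on a 15x15 board, the stored triple (action, policy, player) pairs
# the dense policy with the position, and policy[encodeAction(action, 15)] is
# the probability mass MCTS placed on the move actually played.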
def expand(self, simulator, expandingFn, rolloutFn=None, balance=1):
    """
    Reach a leaf by repeatedly following the best PUCT action, expand it,
    and back the resulting value estimate up to the root.
    """
    # Deep-copy so the selection phase can play moves without disturbing
    # the caller's simulator.
    simulator = copy.deepcopy(simulator)
    node = self.currentRootNode
    while not node.isLeaf():
        action = node.bestActionByPUCT()
        node = node.children[action]
        simulator.takeAction(action)
    if simulator.isFinish():
        z = 1.0 if simulator.getWinner() == simulator.getCurrentPlayer() else -1.0
    else:
        Id = timer.startTime('Expanding')
        actions = simulator.getAvailableActions()
        actionProbability, z = expandingFn.getPolicy_Value(simulator.getCurrentState())
        timer.endTime(Id)
        e = 0
        Id = timer.startTime('Rollout')
        if balance > 0:
            # balance blends a rollout estimate (e) with the network's value
            # head (z): balance=0 is pure network, balance=1 pure rollout.
            e = ((1.0 if simulator.approxScore() > 0 else -1.0)
                 if rolloutFn is None else rolloutFn(simulator))
        timer.endTime(Id)
        z = balance * e + (1 - balance) * z
        for action in actions:
            Id = timer.startTime('Child')
            node.children[action] = TreeNode(
                node, action,
                actionProbability[simulator.encodeAction(action)], self.C)
            timer.endTime(Id)
    # Back up: a node's Q stores the value of its parent playing this move,
    # so the sign of z flips at every level.
    while node != self.currentRootNode:
        node.N += 1
        node.W += -z
        node.V = node.W / node.N
        z = -z
        node = node.fatherNode
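# bestActionByPUCT is called above but not shown. A minimal sketch of the
# standard PUCT rule it presumably implements; this is an assumption based on
# the node fields visible here (V as mean value, N as visit count) plus a
# stored prior P and exploration constant C, which TreeNode appears to
# receive in its constructor above.
import math

def bestActionByPUCT(self):
    totalN = sum(child.N for child in self.children.values())
    def puct(child):
        # exploitation (mean value) + exploration (prior, decayed by visits)
        return child.V + child.C * child.P * math.sqrt(totalN) / (1 + child.N)
    return max(self.children, key=lambda a: puct(self.children[a]))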
def Training():
    args = argument.get_args()
    logger = get_logger()
    currentModel = -1 if args.overwrite else dataProcessor.getLatestNetworkID()
    trainWorker = NetworkTraining()
    replayBuffer = []
    Loss = []
    WinRate = []
    # Choose the leaf-evaluation mix: balance0 weighs a rollout estimate
    # against the network's value head (0 = network only, 1 = rollout only).
    if args.rolloutMode == 'network':
        rollout0, balance0 = None, 0
    elif args.rolloutMode == 'minmax':
        rollout0, balance0 = minMaxRolloutFn(1), 1
    elif args.rolloutMode == 'random':
        rollout0, balance0 = randomRolloutFn(20), 1
    elif args.rolloutMode == 'mix_minmax':
        rollout0, balance0 = minMaxRolloutFn(1), args.balance
    elif args.rolloutMode == 'mix_random':
        rollout0, balance0 = randomRolloutFn(30), args.balance
    else:
        rollout0, balance0 = None, 1
    for rd in range(1, args.trainround + 1):
        logger.info("round:%d" % rd)
        if currentModel != -1:
            model = dataProcessor.loadNetwork(args, currentModel)
        else:
            model = PolicyValueFn(args).to(device=args.device)
        # Exploration temperature decays over rounds, from
        # log(trainround) + 1 at rd = 1 down to 1 at the final round.
        eta = math.log(args.trainround / rd) + 1
        file = os.path.join(args.data_folder, f"selfplay-{currentModel + 1}.txt")
        agent1 = Agent.SelfplayAgent(args.numOfIterations, model, file, eta,
                                     rollout=rollout0, balance=balance0)
        b = Board.Board(args.size, args.numberForWin)
        g = Game.Game(agent0=agent1, agent1=agent1, simulator=b)
        for i in range(1, args.epochs + 1):
            logger.info("epoch %d" % i)
            TimeID = timer.startTime("play time")
            g.run()
            timer.endTime(TimeID)
            timer.showTime(TimeID)
            if i % args.n_save_step == 0:
                agent1.saveData()
            if args.openReplayBuffer and len(replayBuffer) > args.buffersize:
                # Interleave a mini-update on samples drawn with replacement
                # from the replay buffer. The underscore avoids shadowing the
                # epoch counter i, which the original loop variable did.
                buffer = [random.choice(replayBuffer) for _ in range(args.buffersize)]
                trainWorker.train(args.miniTrainingEpochs, currentModel,
                                  buffer, update=False)
        agent1.saveData()
        dataList = dataProcessor.retrieveData(file)
        replayBuffer = replayBuffer + dataList
        if len(replayBuffer) > args.maxBufferSize:
            replayBuffer = replayBuffer[-args.maxBufferSize:]  # keep newest games
        currentModel += 1
        TimeID = timer.startTime("network training")
        Loss.append(trainWorker.train(args.trainepochs, currentModel, dataList))
        timer.endTime(TimeID)
        timer.showTime(TimeID)
        # Evaluate the freshly trained network against the baseline.
        agentTest = Agent.IntelligentAgent(args.numOfIterations,
                                           dataProcessor.loadNetwork(args),
                                           rolloutFn=rollout0, balance=balance0)
        exp = Experiment()
        WinRate.append(exp.evaluationWithBaseLine(agentTest))
        logger.info("WinRate: %.3f" % WinRate[-1])
    return Loss, WinRate
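# A minimal sketch of driving the training loop and reporting the curves.
# The __main__ guard and the printout are illustrative additions, not part
# of the source; Training() itself returns the per-round loss and win rate.
if __name__ == "__main__":
    Loss, WinRate = Training()
    for rd, (loss, wr) in enumerate(zip(Loss, WinRate), start=1):
        print(f"round {rd}: loss={loss}, winrate={wr:.3f}")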