async def playTurn(queue, myMcData, actionList, cmdHeader, initMoves):
    request = await queue.get()
    if len(initMoves) > 0:
        action = initMoves[0]
        del initMoves[0]
        print('|c|' + cmdHeader + '|Turn ' + str(i) + ' pre-set action:',
                action, file=file)
    else:
        #figure out what kind of action we need
        state = request[1]['stateHash']
        actions = moves.getMoves(format, request[1])
        #the mcdatasets are all combined, so we can just look at the first
        data = myMcData[0]
        #probs = mc.getProbsExp3(data, state, actions)
        probs = mc.getProbsRM(data, state, actions)
        #remove low-probability moves, which are likely just noise
        #this can zero out every action; if it does, all the probabilities
        #were low anyway, so fall back to a uniform distribution
        normProbs = np.array(
                [p if p > probCutoff else 0 for p in probs])
        normSum = np.sum(normProbs)
        if normSum > 0:
            normProbs = normProbs / normSum
        else:
            normProbs = np.full(len(actions), 1 / len(actions))
        action = np.random.choice(actions, p=normProbs)
    actionList.append(action)
    await game.cmdQueue.put(cmdHeader + action)
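#mc.getProbsRM is external to this snippet. As a hedged sketch only: given
#how the RM data is built elsewhere in this project (a probTable of
#accumulated average-strategy weights keyed by (state, action)), a minimal
#compatible readout could normalize those weights and fall back to uniform
#for unseen states. The real implementation may differ.
import numpy as np

def getProbsRMSketch(data, state, actions):
    probTable = data['probTable']
    weights = np.array([probTable[(state, a)] for a in actions])
    total = np.sum(weights)
    if total <= 0:
        #nothing accumulated for this state yet
        return np.full(len(actions), 1 / len(actions))
    return weights / total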
async def getAction(self, request):
    #get info to replicate the current game state
    seed = request['state']['startingSeed']
    initActions = request['state']['actions']
    searchPs = []
    try:
        searchPs = [await getPSProcess() for i in range(self.numProcesses)]
        searches = []
        for j in range(self.numProcesses):
            search = mc.mcSearchRM(
                    searchPs[j],
                    self.format,
                    self.teams,
                    limit=self.limit,
                    seed=seed,
                    p1InitActions=initActions[0],
                    p2InitActions=initActions[1],
                    mcData=self.mcDataset,
                    pid=j,
                    initExpVal=0,
                    probScaling=2,
                    regScaling=1.5)
            searches.append(search)
        print('searching', file=sys.stderr)
        await asyncio.gather(*searches)
        print('combining', file=sys.stderr)
        self.mcDataset = mc.combineRMData([self.mcDataset],
                self.valueModel)[0]

        #figure out what kind of action we need
        state = request['stateHash']
        actions = moves.getMoves(format, request)
        data = self.mcDataset[1]
        probs = mc.getProbsRM(data, state, actions)

        #remove low-probability moves, which are likely just noise
        #this can zero out every action; if it does, all the probabilities
        #were low anyway, so fall back to a uniform distribution
        normProbs = np.array(
                [p if p > self.probCutoff else 0 for p in probs])
        normSum = np.sum(normProbs)
        if normSum > 0:
            normProbs = normProbs / normSum
        else:
            normProbs = np.full(len(actions), 1 / len(actions))

        action = np.random.choice(actions, p=normProbs)
        return action
    finally:
        #always clean up the simulator processes, even if the search dies
        for ps in searchPs:
            ps.terminate()
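#Worked example of the cutoff-and-renormalize step used above (assumed values,
#for illustration only): with a cutoff of 0.1, the two noisy tail actions are
#zeroed and can never be sampled.
probCutoff = 0.1
probs = [0.6, 0.3, 0.05, 0.05]
normProbs = [p if p > probCutoff else 0 for p in probs]   #[0.6, 0.3, 0, 0]
normProbs = [p / sum(normProbs) for p in normProbs]       #[2/3, 1/3, 0, 0]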
async def randomAgent(queue, cmdHeader, initMoveList):
    while True:
        req = await queue.get()
        if req[0] == Game.END:
            break
        #print('getting actions')
        actions = moves.getMoves(format, req[1])
        state = req[1]['state']
        #print(cmdHeader, 'actions', actions)
        if len(initMoveList) > 0:
            action = initMoveList[0]
            del initMoveList[0]
        else:
            action = random.choice(actions)
        print(cmdHeader, 'picked', action)
        await game.cmdQueue.put(cmdHeader + action)
async def playTurn(num):
    request = await queues[num].get()
    if len(initMoves[num]) > 0:
        #do the given action
        action = initMoves[num][0]
        del initMoves[num][0]
        print('|c|' + cmdHeaders[num] + '|Turn ' + str(i) +
                ' pre-set action:', action, file=file)
    else:
        #let the agent pick the action
        #figure out what kind of action we need
        state = request[1]['stateHash']
        actions = moves.getMoves(format, request[1])
        probs = agent.getProbs(num, state, actions)
        #remove low-probability moves, which are likely just noise
        normProbs = np.array(
                [p if p > probCutoff else 0 for p in probs])
        normSum = np.sum(normProbs)
        if normSum > 0:
            normProbs = normProbs / normSum
        else:
            #every action was cut, so fall back to a uniform distribution
            normProbs = [1 / len(actions) for a in actions]

        #log each action that still has a chance of being picked
        for j in range(len(actions)):
            actionString = moves.prettyPrintMove(actions[j], request[1])
            if normProbs[j] > 0:
                print('|c|' + cmdHeaders[num] + '|Turn ' + str(i) +
                        ' action:', actionString,
                        'prob:', '%.1f%%' % (normProbs[j] * 100), file=file)

        action = np.random.choice(actions, p=normProbs)
    actionLists[num].append(action)
    await game.cmdQueue.put(cmdHeaders[num] + action)
async def cfrRecur(self, ps, game, startSeed, history, iter, depth=0, rollout=False):
    async def endGame():
        side = 'bot1' if iter % 2 == 0 else 'bot2'
        winner = await game.winner
        #have to clear the results out of the queues
        while not game.p1Queue.empty():
            await game.p1Queue.get()
        while not game.p2Queue.empty():
            await game.p2Queue.get()
        #the deep cfr paper uses [-1,1] rather than [0,1] for u,
        #and this version follows the paper
        if winner == side:
            return 1
        else:
            return -1

    if depth >= self.depthLimit:
        rollout = True

    cmdHeaders = ['>p1', '>p2']
    queues = [game.p1Queue, game.p2Queue]
    offPlayer = (iter + 1) % 2
    onPlayer = iter % 2

    #off player
    request = (await queues[offPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]
    state = req['state']
    actions = moves.getMoves(self.format, req)
    #just sample a move
    probs = self.regretMatch(offPlayer, state, actions, -1)
    if depth == 0 and self.pid == 0:
        print('player ' + str(offPlayer) + ' probs',
                list(zip(actions, probs)), file=sys.stderr)
    offAction = np.random.choice(actions, p=probs)
    #and update the average strategy
    #we should be okay adding this for rollouts,
    #but I'm testing skipping rollouts
    if not rollout:
        self.updateProbs(offPlayer, state, actions, probs, iter // 2 + 1)

    #on player
    request = (await queues[onPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]
    state = req['state']
    actions = moves.getMoves(self.format, req)
    probs = self.regretMatch(onPlayer, state, actions, depth)
    if depth == 0 and self.pid == 0:
        print('player ' + str(onPlayer) + ' probs',
                list(zip(actions, probs)), file=sys.stderr)

    if rollout:
        #we pick one action according to the current strategy
        actions = [np.random.choice(actions, p=probs)]
        actionIndices = [0]
    elif self.branchingLimit:
        #select a subset of actions to search
        #chance to play randomly instead of picking the best actions
        exploreProbs = probs # * (0.9) + 0.1 / len(probs)
        #there might be some duplicates, but it shouldn't matter
        actionIndices = np.random.choice(len(actions),
                self.branchingLimit, p=exploreProbs)
    else:
        #we're searching every action
        actionIndices = list(range(len(actions)))

    #get the expected reward for each action
    rewards = []
    gameUsed = False
    for i in range(len(actions)):
        action = actions[i]

        #use a rollout for non-sampled actions
        if i not in actionIndices:
            curRollout = True
        else:
            curRollout = rollout

        #don't have to re-init the game for the first action
        if gameUsed:
            game = Game(ps, self.teams, format=self.format,
                    seed=startSeed, verbose=self.verbose)
            await game.startGame()
            await game.applyHistory(history)
            #need to consume two requests, as we consumed two above
            await game.p1Queue.get()
            await game.p2Queue.get()
        else:
            gameUsed = True

        seed = Game.getSeed()
        if onPlayer == 0:
            onHeader = '>p1'
            offHeader = '>p2'
            historyEntry = (seed, action, offAction)
        else:
            onHeader = '>p2'
            offHeader = '>p1'
            historyEntry = (seed, offAction, action)
        await game.cmdQueue.put('>resetPRNG ' + str(seed))
        await game.cmdQueue.put(onHeader + action)
        await game.cmdQueue.put(offHeader + offAction)

        r = await self.cfrRecur(ps, game, startSeed,
                history + [historyEntry], iter,
                depth=depth + 1, rollout=curRollout)
        rewards.append(r)

    if not rollout:
        #save a sample of the advantages
        stateExpValue = 0
        for p, r in zip(probs, rewards):
            stateExpValue += p * r
        advantages = [r - stateExpValue for r in rewards]

        am = self.advModels[onPlayer]
        am.addSample(state, zip(actions, advantages), iter // 2 + 1)

        if depth == 0 and self.pid == 0:
            print('player', str(onPlayer), file=sys.stderr)
            print('stateExpValue', stateExpValue, 'from',
                    list(zip(probs, rewards)), file=sys.stderr)
            print('advantages', list(zip(actions, advantages)),
                    file=sys.stderr)

        return stateExpValue
    else:
        #we can't calculate advantages, so we can't update anything
        #we only have one reward, so just return it
        return rewards[0]
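#Here regretMatch takes an extra depth argument and presumably queries the
#advantage networks rather than a table. As a hedged sketch of the Deep CFR
#strategy computation (assuming advModel.predict(state, actions) returns one
#estimated advantage per action; the real signatures may differ):
import numpy as np

def regretMatchSketchDeep(advModel, state, actions):
    advantages = np.array(advModel.predict(state, actions))
    positives = np.maximum(advantages, 0)
    total = np.sum(positives)
    if total > 0:
        #play proportionally to the predicted positive advantages
        return positives / total
    #standard Deep CFR fallback: pure strategy on the best predicted action
    probs = np.zeros(len(actions))
    probs[np.argmax(advantages)] = 1
    return probs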
async def rmRecur(self, ps, game, startSeed, iter, depth=0):
    cmdHeaders = ['>p1', '>p2']
    queues = [game.p1Queue, game.p2Queue]

    #all the actions both players can pick
    playerActions = [[], []]
    #the probability of picking each action
    playerProbs = [[], []]
    #the indices of the actions actually picked
    pickedActions = [0, 0]

    #both players make their move
    for i in range(2):
        request = (await queues[i].get())
        if request[0] == Game.END:
            winner = await game.winner
            #have to clear the results out of the queues
            while not game.p1Queue.empty():
                await game.p1Queue.get()
            while not game.p2Queue.empty():
                await game.p2Queue.get()
            if winner == 'bot1':
                return 1
            else:
                return 0
        req = request[1]
        state = req['stateHash']
        #get rm probs for the actions
        actions = moves.getMoves(self.format, req)
        playerActions[i] = actions
        probs = self.regretMatch(i, state, actions)
        #add exploration, which adds a chance to play randomly
        exploreProbs = probs * (
                1 - self.exploration) + self.exploration / len(probs)
        playerProbs[i] = probs
        #and sample one action
        pickedActions[i] = np.random.choice(len(actions), p=exploreProbs)

    #apply the picked actions to the game
    seed = Game.getSeed()
    await game.cmdQueue.put('>resetPRNG ' + str(seed))
    for i in range(2):
        pickedAction = pickedActions[i]
        action = playerActions[i][pickedAction]
        await game.cmdQueue.put(cmdHeaders[i] + action)

    #get the reward so we can update our regrets
    reward = await self.rmRecur(ps, game, startSeed, iter, depth=depth + 1)

    #save the reward
    a1 = playerActions[0][pickedActions[0]]
    a2 = playerActions[1][pickedActions[1]]
    self.addReward(state, a1, a2, reward)

    #need to update both players' regret and strategy
    for i in range(2):
        #update each action's regret and its probability in the average strategy
        rt = self.regretTables[i]
        pt = self.probTables[i]
        actions = playerActions[i]
        for j in range(len(actions)):
            #update the average strategy with this iteration's strategy,
            #which just means adding the current probability of each action
            probScale = ((iter + 1) / (iter + 2))**self.probScaling
            prob = dictGet(pt, (state, actions[j]))
            pt[hash((state, actions[j]))] = probScale * prob + playerProbs[i][j]

            #immediate regret of the picked action is 0, so just skip it
            if j == pickedActions[i]:
                continue

            #get the existing regret so we can add to it
            regret = dictGet(rt, (state, actions[j]))
            if self.regScaling != 0:
                regret *= ((iter + 1)**self.regScaling) / (
                        (iter + 1)**self.regScaling + 1)

            #get i's possible action and -i's actual action,
            #in player order
            if i == 0:
                a1 = actions[j]
                a2 = playerActions[1][pickedActions[1]]
                myReward = reward
            else:
                a1 = playerActions[0][pickedActions[0]]
                a2 = actions[j]
                myReward = 1 - reward

            #get the expected value for the potential turn
            expValue = self.getExpValue(i, state, a1, a2)

            #add the immediate regret
            if self.posReg:
                rt[hash((state, actions[j]))] = max(regret + expValue - myReward, 0)
            else:
                rt[hash((state, actions[j]))] = regret + expValue - myReward

    #pass the actual reward up
    return reward
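#self.regretMatch is defined elsewhere; for context, a tabular sketch that is
#consistent with how rmRecur reads and writes its tables (dictGet is assumed
#to return 0 for unseen keys and tables are keyed by hash((state, action)),
#as the updates above imply). The real method may differ.
import numpy as np

def regretMatchSketch(regretTable, state, actions):
    regrets = np.array(
            [max(0, dictGet(regretTable, (state, a))) for a in actions])
    total = np.sum(regrets)
    if total > 0:
        #play proportionally to positive regret
        return regrets / total
    #no positive regret recorded, so play uniformly at random
    return np.full(len(actions), 1 / len(actions))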
async def cfrRecur(self, ps, game, startSeed, history, q, iter, depth=0, rollout=False):
    #I'm not sure about this q parameter
    #I'm getting better results setting it to 1 in all games
    q = 1

    async def endGame():
        side = 'bot1' if iter % 2 == 0 else 'bot2'
        winner = await game.winner
        #have to clear the results out of the queues
        while not game.p1Queue.empty():
            await game.p1Queue.get()
        while not game.p2Queue.empty():
            await game.p2Queue.get()
        if winner == side:
            return 1 / q
        else:
            return 0

    cmdHeaders = ['>p1', '>p2']
    queues = [game.p1Queue, game.p2Queue]
    offPlayer = (iter + 1) % 2
    onPlayer = iter % 2

    #off player
    request = (await queues[offPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]
    state = req['stateHash']
    actions = moves.getMoves(self.format, req)
    #just sample a move
    probs = self.regretMatch(offPlayer, state, actions)
    #apply the exploration chance to the off player as well
    exploreProbs = probs * (
            1 - self.exploration) + self.exploration / len(actions)
    #or don't
    #exploreProbs = probs
    offAction = np.random.choice(actions, p=exploreProbs)
    #and update the average strategy
    self.updateProbs(offPlayer, state, actions, probs / q, iter)

    #on player
    request = (await queues[onPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]

    #now that we've checked whether the game is over,
    #check the depth before continuing
    if self.depthLimit is not None and depth >= self.depthLimit:
        if self.evaluation == HEURISTIC:
            #immediately return a heuristic-based expected value
            await game.cmdQueue.put('>forcewin p1')
            #clean up the end-of-game messages
            await queues[onPlayer].get()
            await queues[offPlayer].get()
            return expValueHeuristic(onPlayer, req['state']) / q
        elif self.evaluation == ROLLOUT:
            #instead of branching out, find the actual value of a single
            #play-through and use that as the expected value
            rollout = True
            #the rest of the rollout is implemented in the normal code path
        elif self.evaluation == MODEL:
            #TODO
            pass

    state = req['stateHash']
    actions = moves.getMoves(self.format, req)
    #we sometimes bias towards the first or last actions;
    #shuffling fixes that bias
    random.shuffle(actions)

    #probs is the set of sample probabilities, used for traversing
    #iterProbs is the set of probabilities for this iteration's strategy, used for regret
    if rollout:
        #I'm not sure if using regret matching or going uniform random
        #would be better
        #my gut says regret matching
        probs = self.regretMatch(onPlayer, state, actions)
        action = np.random.choice(actions, p=probs)
        actions = [action]
        probs = [1] #would it be better to use the actual probability?
        iterProbs = probs
    elif self.samplingType == EXTERNAL:
        probs = self.regretMatch(onPlayer, state, actions)
        iterProbs = probs
    elif self.samplingType == AVERAGE:
        #we're just using the current iteration's strategy
        #it's simple and it seems to work
        iterProbs = self.regretMatch(onPlayer, state, actions)
        probs = iterProbs + self.exploration

        #this is the average-sampling procedure from some paper
        #it's designed for a large number of samples, so it doesn't really
        #work here: it expects it to be feasible to try every action for the
        #on player on some turns, which usually isn't the case
        """
        stratSum = 0
        strats = []
        pt = self.probTables[onPlayer]
        for a in actions:
            s = dictGet(pt, (state, a))
            stratSum += s
            strats.append(s)

        probs = []
        for a, s in zip(actions, strats):
            if self.bonus + stratSum == 0:
                p = 0
            else:
                p = (self.bonus + self.threshold * s) / (self.bonus + stratSum)
            p = max(self.exploration, p)
            probs.append(p)
        """

    #keep track of how many actions we take from this state
    numTaken = 0
    #get the expected reward for each action
    rewards = []
    gameUsed = False

    self.numActionsSeen += len(actions)

    #whether a specific action is a rollout
    curRollout = rollout
    for action, prob in zip(actions, probs):
        #for ES we just check every action
        #for AS we use a roll to determine whether we search
        if self.samplingType == AVERAGE and not curRollout:
            #instead of skipping, make the skipped entries a rollout
            #like in https://www.aaai.org/ocs/index.php/AAAI/AAAI12/paper/viewFile/4937/5469
            #if we're at the last action and we haven't done anything,
            #do something regardless of the roll
            if (self.bound != 0 and numTaken > self.bound) or \
                    (random.random() >= prob and (action != actions[-1] or gameUsed)):
                curRollout = True
                #rewards.append(0)
                #continue
            else:
                curRollout = rollout
                numTaken += 1
                self.numActionsTaken += 1

        #don't have to re-init the game for the first action
        if gameUsed:
            game = Game(ps, self.teams, format=self.format, seed=startSeed,
                    verbose=self.verbose)
            await game.startGame()
            await game.applyHistory(history)
            #need to consume two requests, as we consumed two above
            await game.p1Queue.get()
            await game.p2Queue.get()
        else:
            gameUsed = True

        seed = Game.getSeed()
        if onPlayer == 0:
            onHeader = '>p1'
            offHeader = '>p2'
            historyEntry = (seed, action, offAction)
        else:
            onHeader = '>p2'
            offHeader = '>p1'
            historyEntry = (seed, offAction, action)
        await game.cmdQueue.put('>resetPRNG ' + str(seed))
        await game.cmdQueue.put(onHeader + action)
        await game.cmdQueue.put(offHeader + offAction)

        r = await self.cfrRecur(ps, game, startSeed, history + [historyEntry],
                q * min(1, max(0.01, prob)), iter,
                depth=depth + 1, rollout=curRollout)
        rewards.append(r)

    #update regrets
    stateExpValue = 0
    for p, r in zip(iterProbs, rewards):
        stateExpValue += p * r
    rt = self.regretTables[onPlayer]
    for a, r in zip(actions, rewards):
        regret = dictGet(rt, (state, a))
        if self.regScaling != 0:
            regret *= ((iter // 2 + 1)**self.regScaling) / (
                    (iter // 2 + 1)**self.regScaling + 1)
        if self.posReg:
            rt[hash((state, a))] = max(0, regret + r - stateExpValue)
        else:
            rt[hash((state, a))] = regret + r - stateExpValue

    return stateExpValue
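#self.updateProbs is not shown here. A hedged sketch of what it plausibly
#does, mirroring the average-strategy updates used by the other searchers in
#this project (discount the accumulated weight, then add this iteration's
#strategy); the real method may differ:
def updateProbsSketch(self, player, state, actions, probs, iter):
    pt = self.probTables[player]
    probScale = ((iter + 1) / (iter + 2))**self.probScaling
    for a, p in zip(actions, probs):
        oldProb = dictGet(pt, (state, a))
        pt[hash((state, a))] = probScale * oldProb + p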
async def mcExp3Impl(requestQueue, cmdQueue, cmdHeader, mcData, format,
        iter=0, initActions=[], verbose=False):
    countTable = mcData['countTable']
    expValueTable = mcData['expValueTable']
    gamma = mcData['gamma']
    seenStates = mcData['seenStates']

    #history so we can update the count and expected-value tables at the end
    history = []
    #we're going to be popping off this
    initActions = copy.deepcopy(initActions)

    running = True
    inInitActions = True
    while running:
        request = await requestQueue.get()
        if verbose:
            print(cmdHeader, 'got request', request)
        if request[0] == Game.REQUEST or request[0] == Game.ERROR:
            req = request[1]
            state = req['stateHash']
            seenStates[state] = True
            actions = moves.getMoves(format, req)

            #check if we ran out of initActions on the previous turn
            #if so, we need to change the PRNG
            if inInitActions and len(initActions) == 0:
                inInitActions = False
                #no problem if both players reset the PRNG
                await cmdQueue.put('>resetPRNG')

            #calculate a probability for each action
            #we need the probs of the initActions so we can update,
            #so we always calculate this
            eta = gamma / len(actions)
            expValues = [expValueTable[(state, action)] for action in actions]
            maxExpValues = max(expValues)
            ws = [v - maxExpValues for v in expValues]
            xs = [math.exp(eta * w) for w in ws]
            xSum = np.sum(xs)
            probs = np.array([(1 - gamma) * x / xSum + gamma / len(actions)
                    for x in xs])
            #illegal moves might have a negative probability, which should just be 0
            probs = np.array([p if p > 0 else 0 for p in probs])
            probs = probs / np.sum(probs)

            if len(initActions) > 0:
                #blindly pick the init action
                bestAction = initActions[0]
                bestActionIndex = actions.index(bestAction)
                bestActionProb = probs[bestActionIndex]
                initActions = initActions[1:]
            else:
                #pick an action based on probs
                bestActionIndex = np.random.choice(len(actions), p=probs)
                bestAction = actions[bestActionIndex]
                bestActionProb = probs[bestActionIndex]

            #save our action
            history.append((state, bestAction, bestActionProb))
            if verbose:
                print('picked', cmdHeader + bestAction)
            await cmdQueue.put(cmdHeader + bestAction)

        elif request[0] == Game.END:
            #update the tables with our history + result
            reward = request[1]
            #rescale reward from [-1,1] to [0,1]
            reward = (reward + 1) / 2
            for state, action, prob in history:
                countTable[(state, action)] += 1
                expValueTable[(state, action)] += reward / prob
            running = False
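#Once many iterations have accumulated, a final strategy can be read out of
#mcData. This is an assumed, illustrative readout (the real getProbsExp3 may
#differ): use the empirical play frequencies from countTable, since Exp3's
#sampling distribution already mixes in exploration.
import numpy as np

def getProbsExp3Sketch(mcData, state, actions):
    countTable = mcData['countTable']
    counts = np.array([countTable[(state, a)] for a in actions])
    total = np.sum(counts)
    if total == 0:
        #state never visited, so nothing to go on
        return np.full(len(actions), 1 / len(actions))
    return counts / total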
async def userTurn(queue, actionList, cmdHeader, initMoves):
    request = await queue.get()
    if len(initMoves) > 0:
        action = initMoves[0]
        del initMoves[0]
        print('|c|' + cmdHeader + '|Turn ' + str(i) + ' pre-set action:',
                action, file=file)
    else:
        #figure out what kind of action we need
        state = request[1]['stateHash']
        actions = moves.getMoves(format, request[1])

        #build a human-readable description of each action
        actionTexts = []
        for j in range(len(actions)):
            action = actions[j].split(',')
            actionText = []
            for k in range(len(action)):
                a = action[k].strip()
                if 'pass' in a:
                    actionText.append('pass')
                elif 'move' in a:
                    parts = a.split(' ')
                    moveNum = int(parts[1])
                    if len(parts) < 3:
                        targetNum = 0
                    else:
                        targetNum = int(parts[2])
                    move = request[1]['active'][k]['moves'][moveNum - 1]['move']
                    if targetNum != 0:
                        actionText.append(move + ' into slot ' + str(targetNum))
                    else:
                        actionText.append(move)
                elif 'team' in a:
                    actionText.append(a)
                elif 'switch' in a:
                    actionText.append(a)
                else:
                    actionText.append('unknown action: ' + a)
            actionString = ','.join(actionText)
            actionTexts.append(actionString)

        #ask the user which action to take
        print('Legal actions:')
        for j in range(len(actions)):
            print(j, actionTexts[j], '(' + actions[j] + ')')
        #humans are dumb and make mistakes
        while True:
            try:
                actionIndex = int(input('Your action:'))
                if actionIndex >= 0 and actionIndex < len(actions):
                    action = actions[actionIndex]
                    break
            except ValueError:
                pass
            print('try again')

    actionList.append(action)
    await game.cmdQueue.put(cmdHeader + action)
async def mcOOSImpl(requestQueue, cmdQueue, cmdHeader, mcData, format,
        playerNum, iter, initActions, pid=0, posReg=False,
        probScaling=0, regScaling=0, verbose=False):
    regretTable = mcData['regretTable']
    seenStates = mcData['seenStates']
    gamma = mcData['gamma']
    probTable = mcData['probTable']

    #stack where our actions/strategies are stored
    history = []

    running = True
    inInitActions = True
    while running:
        request = await requestQueue.get()
        if verbose:
            print('got request', cmdHeader, request)
        if request[0] == Game.REQUEST:
            req = request[1]
            state = req['stateHash']
            seenStates[state] = True
            actions = moves.getMoves(format, req)

            #after doing the init actions, we're in the target state
            #need to reset the PRNG so the bot doesn't cheat
            if inInitActions and len(initActions) == 0:
                inInitActions = False
                await cmdQueue.put('>resetPRNG')

            #generate a strategy
            rSum = 0
            regrets = []
            for action in actions:
                regret = regretTable[(state, action)]
                regrets.append(regret)
                rSum += max(0, regret)
            if rSum > 0:
                #probability according to positive regret
                probs = np.array([max(0, r) / rSum for r in regrets])
                probs = probs / np.sum(probs)
                #use probs to update the strategy
                #use exploreProbs to sample moves
                if iter % 2 == playerNum:
                    exploreProbs = probs * (1 - gamma) + gamma / len(actions)
                else:
                    #we're the off player, so don't explore
                    exploreProbs = probs
            else:
                #everything is new/bad, so play randomly
                probs = np.array([1 / len(actions) for a in actions])
                exploreProbs = probs

            if len(initActions) > 0:
                #blindly pick the init action
                preAction = initActions[0].strip()
                #find a close-enough action in the list
                #the PS client will generate team preview actions that
                #are longer than what we expect, but we can just
                #assume that the equivalent action is a prefix
                bestActionIndex = 0
                while bestActionIndex < len(actions):
                    if preAction.startswith(actions[bestActionIndex].strip()):
                        break
                    bestActionIndex += 1
                bestAction = actions[bestActionIndex]
                initActions = initActions[1:]
            else:
                #pick an action based on exploreProbs
                bestActionIndex = np.random.choice(len(actions), p=exploreProbs)
                bestAction = actions[bestActionIndex]

            #save our action
            history.append(
                    (state, bestActionIndex, actions, probs, exploreProbs))
            if verbose:
                print('picked', cmdHeader + bestAction)
            await cmdQueue.put(cmdHeader + bestAction)

        elif request[0] == Game.END:
            running = False
            #map the reward from [-1,1] to [0,1]
            reward = (request[1] + 1) / 2
            #on player's contribution to the tail probability
            x = 1
            #on player's contribution to the sample probability
            q = 1
            while len(history) > 0:
                state, actionIndex, actions, probs, exploreProbs = history.pop()
                if iter % 2 == playerNum:
                    action = actions[actionIndex]
                    w = reward * x / q
                    p = probs[actionIndex]
                    ep = exploreProbs[actionIndex]
                    #update the picked action's regret
                    regret = regretTable[(state, action)]
                    if regScaling != 0:
                        regret *= ((iter + 1)**regScaling) / (
                                (iter + 1)**regScaling + 1)
                    regretTable[(state, action)] = regret + (1 - p) / ep * w
                    #update the other actions' regrets
                    for i in range(len(actions)):
                        if i == actionIndex:
                            continue
                        regret = regretTable[(state, actions[i])]
                        if regScaling != 0:
                            regret *= ((iter + 1)**regScaling) / (
                                    (iter + 1)**regScaling + 1)
                        if posReg:
                            regretTable[(state, actions[i])] = max(
                                    0, regret - p / ep * w)
                        else:
                            regretTable[(state, actions[i])] = regret - p / ep * w
                    x *= p
                    q *= ep
                else:
                    #update the off player's average strategy
                    probScale = ((iter + 1) / (iter + 2))**probScaling
                    for i in range(len(actions)):
                        oldProb = probTable[(state, actions[i])]
                        probTable[(state, actions[i])] = (
                                probScale * oldProb + probs[i])
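#Worked example of the outcome-sampling regret update above, with assumed
#values: the on player picked action a with strategy probability p = 0.5,
#sampled it with probability ep = 0.6, and the tail-weighted reward is
#w = reward * x / q = 0.8. The picked action is credited and every other
#action is debited by the same importance-weighted amount, then x and q
#shrink before moving to the next state up the stack.
p, ep, w = 0.5, 0.6, 0.8
regretPickedDelta = (1 - p) / ep * w   #+0.667 added to a's regret
regretOtherDelta = -(p / ep) * w       #-0.667 added to each other action
#then x *= p and q *= ep for the next (earlier) state in the history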