Example #1
                async def playTurn(queue, myMcData, actionList, cmdHeader,
                                   initMoves):

                    request = await queue.get()

                    if len(initMoves) > 0:
                        action = initMoves[0]
                        del initMoves[0]
                        print('|c|' + cmdHeader + '|Turn ' + str(i) +
                              ' pre-set action:',
                              action,
                              file=file)
                    else:
                        #figure out what kind of action we need
                        state = request[1]['stateHash']
                        actions = moves.getMoves(format, request[1])

                        #the mcdatasets are all combined, so we can just look at the first
                        data = myMcData[0]
                        #probs = mc.getProbsExp3(data, state, actions)
                        probs = mc.getProbsRM(data, state, actions)
                        #remove low probability moves, likely just noise
                        #this can zero out every action; fall back to a uniform
                        #distribution in that case so we don't divide by zero
                        normProbs = np.array(
                            [p if p > probCutoff else 0 for p in probs])
                        if np.sum(normProbs) > 0:
                            normProbs = normProbs / np.sum(normProbs)
                        else:
                            normProbs = np.array(
                                [1 / len(actions) for a in actions])

                        action = np.random.choice(actions, p=normProbs)

                    actionList.append(action)
                    await game.cmdQueue.put(cmdHeader + action)
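
The cutoff-and-renormalize step above drops low-probability actions as noise and renormalizes what is left; if every probability falls at or below probCutoff the sum goes to zero, so a uniform fallback (as in Example #4) is needed. A minimal, self-contained sketch of that step follows; the helper name and the 0.03 default are illustrative, not taken from the project.

import numpy as np

def filterProbs(probs, probCutoff=0.03):
    #zero out probabilities at or below the cutoff, treating them as noise
    probs = np.asarray(probs, dtype=float)
    normProbs = np.where(probs > probCutoff, probs, 0)
    total = normProbs.sum()
    if total > 0:
        return normProbs / total
    #everything was below the cutoff, fall back to a uniform distribution
    return np.full(len(probs), 1 / len(probs))

#the 1% action is dropped and the remaining mass is renormalized
print(filterProbs([0.50, 0.29, 0.20, 0.01]))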
Example #2
    async def getAction(self, request):
        #get info to replicate the current game state
        seed = request['state']['startingSeed']
        initActions = request['state']['actions']

        searchPs = []
        try:
            #start one simulator process per search so they can run in parallel
            for _ in range(self.numProcesses):
                searchPs.append(await getPSProcess())
            searches = []
            for j in range(self.numProcesses):
                search = mc.mcSearchRM(searchPs[j],
                                       self.format,
                                       self.teams,
                                       limit=self.limit,
                                       seed=seed,
                                       p1InitActions=initActions[0],
                                       p2InitActions=initActions[1],
                                       mcData=self.mcDataset,
                                       pid=j,
                                       initExpVal=0,
                                       probScaling=2,
                                       regScaling=1.5)
                searches.append(search)

            print('searching', file=sys.stderr)
            await asyncio.gather(*searches)

            print('combining', file=sys.stderr)
            self.mcDataset = mc.combineRMData([self.mcDataset],
                                              self.valueModel)[0]

            #figure out what kind of action we need
            state = request['stateHash']
            actions = moves.getMoves(self.format, request)

            data = self.mcDataset[1]
            probs = mc.getProbsRM(data, state, actions)
            #remove low probability moves, likely just noise
            #this can zero out every action; fall back to a uniform
            #distribution in that case so we don't divide by zero
            normProbs = np.array(
                [p if p > self.probCutoff else 0 for p in probs])
            if np.sum(normProbs) > 0:
                normProbs = normProbs / np.sum(normProbs)
            else:
                normProbs = np.array([1 / len(actions) for a in actions])

            action = np.random.choice(actions, p=normProbs)
            return action
        finally:
            for ps in searchPs:
                ps.terminate()
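
Example #2 follows a common asyncio pattern: start one simulator process per search, run all the searches concurrently with asyncio.gather, and terminate every process in a finally block even if a search fails. A self-contained sketch of that pattern; FakeProcess and fakeSearch are stand-ins, not the project's getPSProcess or mc.mcSearchRM.

import asyncio

class FakeProcess:
    #stand-in for the simulator process returned by getPSProcess()
    def terminate(self):
        print('terminated')

async def fakeSearch(pid):
    #stand-in for a search coroutine; just waits a bit and returns its pid
    await asyncio.sleep(0.1)
    return pid

async def runSearches(numProcesses=3):
    searchPs = []
    try:
        for _ in range(numProcesses):
            searchPs.append(FakeProcess())
        results = await asyncio.gather(*[fakeSearch(j) for j in range(numProcesses)])
        return results
    finally:
        #always clean up the processes, even if a search raised
        for ps in searchPs:
            ps.terminate()

print(asyncio.run(runSearches()))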
Example #3
    async def randomAgent(queue, cmdHeader, initMoveList):
        while True:
            req = await queue.get()
            if req[0] == Game.END:
                break

            #print('getting actions')
            actions = moves.getMoves(format, req[1])
            state = req[1]['state']
            #print(cmdHeader, 'actions', actions)

            if len(initMoveList) > 0:
                action = initMoveList[0]
                del initMoveList[0]
            else:
                action = random.choice(actions)
            print(cmdHeader, 'picked', action)
            await game.cmdQueue.put(cmdHeader + action)
Example #4
                async def playTurn(num):

                    request = await queues[num].get()

                    if len(initMoves[num]) > 0:
                        #do the given action
                        action = initMoves[num][0]
                        del initMoves[num][0]
                        print('|c|' + cmdHeaders[num] + '|Turn ' + str(i) +
                              ' pre-set action:',
                              action,
                              file=file)
                    else:
                        #let the agent pick the action
                        #figure out what kind of action we need
                        state = request[1]['stateHash']
                        actions = moves.getMoves(format, request[1])

                        probs = agent.getProbs(num, state, actions)
                        #remove low probability moves, likely just noise
                        normProbs = np.array(
                            [p if p > probCutoff else 0 for p in probs])
                        normSum = np.sum(normProbs)
                        if normSum > 0:
                            normProbs = normProbs / np.sum(normProbs)
                        else:
                            normProbs = [1 / len(actions) for a in actions]

                        for j in range(len(actions)):
                            actionString = moves.prettyPrintMove(
                                actions[j], request[1])
                            if normProbs[j] > 0:
                                print('|c|' + cmdHeaders[num] + '|Turn ' +
                                      str(i) + ' action:',
                                      actionString,
                                      'prob:',
                                      '%.1f%%' % (normProbs[j] * 100),
                                      file=file)

                        action = np.random.choice(actions, p=normProbs)

                    actionLists[num].append(action)
                    await game.cmdQueue.put(cmdHeaders[num] + action)
Example #5
    async def cfrRecur(self,
                       ps,
                       game,
                       startSeed,
                       history,
                       iter,
                       depth=0,
                       rollout=False):
        async def endGame():
            side = 'bot1' if iter % 2 == 0 else 'bot2'
            winner = await game.winner
            #have to clear the results out of the queues
            while not game.p1Queue.empty():
                await game.p1Queue.get()
            while not game.p2Queue.empty():
                await game.p2Queue.get()
            #the deep cfr paper uses [-1,1] rather than [0,1] for u,
            #and this version returns [-1,1] as well
            if winner == side:
                return 1
            else:
                return -1

        if depth >= self.depthLimit:
            rollout = True

        cmdHeaders = ['>p1', '>p2']
        queues = [game.p1Queue, game.p2Queue]
        offPlayer = (iter + 1) % 2
        onPlayer = iter % 2

        #off player
        request = (await queues[offPlayer].get())
        if request[0] == Game.END:
            return await endGame()
        req = request[1]
        state = req['state']
        actions = moves.getMoves(self.format, req)
        #just sample a move
        probs = self.regretMatch(offPlayer, state, actions, -1)
        if depth == 0 and self.pid == 0:
            print('player ' + str(offPlayer) + ' probs',
                  list(zip(actions, probs)),
                  file=sys.stderr)
        offAction = np.random.choice(actions, p=probs)
        #and update the average strategy
        #we should be okay adding this for rollouts,
        #but I'm testing skipping rollouts
        if not rollout:
            self.updateProbs(offPlayer, state, actions, probs, iter // 2 + 1)

        #on player
        request = (await queues[onPlayer].get())
        if request[0] == Game.END:
            return await endGame()
        req = request[1]

        state = req['state']
        actions = moves.getMoves(self.format, req)
        probs = self.regretMatch(onPlayer, state, actions, depth)
        if depth == 0 and self.pid == 0:
            print('player ' + str(onPlayer) + ' probs',
                  list(zip(actions, probs)),
                  file=sys.stderr)
        if rollout:
            #we pick one action according to the current strategy
            actions = [np.random.choice(actions, p=probs)]
            actionIndices = [0]
        elif self.branchingLimit:
            #select a set of actions to pick
            #chance to play randomly instead of picking the best actions
            exploreProbs = probs  # * (0.9) + 0.1 / len(probs)
            #there might be some duplicates but it shouldn't matter
            actionIndices = np.random.choice(len(actions),
                                             self.branchingLimit,
                                             p=exploreProbs)
        else:
            #we're picking every action
            actionIndices = list(range(len(actions)))

        #get expected reward for each action
        rewards = []
        gameUsed = False

        for i in range(len(actions)):
            action = actions[i]

            #use rollout for non-sampled actions
            if i not in actionIndices:
                curRollout = True
            else:
                curRollout = rollout

            #don't have to re-init game for the first action
            if gameUsed:
                game = Game(ps,
                            self.teams,
                            format=self.format,
                            seed=startSeed,
                            verbose=self.verbose)
                await game.startGame()
                await game.applyHistory(history)
                #need to consume two requests, as we consumed two above
                await game.p1Queue.get()
                await game.p2Queue.get()
            else:
                gameUsed = True

            seed = Game.getSeed()
            if onPlayer == 0:
                onHeader = '>p1'
                offHeader = '>p2'
                historyEntry = (seed, action, offAction)
            else:
                onHeader = '>p2'
                offHeader = '>p1'
                historyEntry = (seed, offAction, action)

            await game.cmdQueue.put('>resetPRNG ' + str(seed))
            await game.cmdQueue.put(onHeader + action)
            await game.cmdQueue.put(offHeader + offAction)

            r = await self.cfrRecur(ps,
                                    game,
                                    startSeed,
                                    history + [historyEntry],
                                    iter,
                                    depth=depth + 1,
                                    rollout=curRollout)
            rewards.append(r)

        if not rollout:
            #save sample of advantages
            stateExpValue = 0
            for p, r in zip(probs, rewards):
                stateExpValue += p * r
            advantages = [r - stateExpValue for r in rewards]

            am = self.advModels[onPlayer]
            am.addSample(state, zip(actions, advantages), iter // 2 + 1)

            if depth == 0 and self.pid == 0:
                print('player', str(onPlayer), file=sys.stderr)
                print('stateExpValue',
                      stateExpValue,
                      'from',
                      list(zip(probs, rewards)),
                      file=sys.stderr)
                print('advantages',
                      list(zip(actions, advantages)),
                      file=sys.stderr)

            return stateExpValue
        else:
            #we can't calculate advantage, so we can't update anything
            #we only have one reward, so just return it
            return rewards[0]
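
The sampled-advantage bookkeeping at the end of Example #5 reduces to two lines of arithmetic: the state's expected value is the strategy-weighted average of the per-action rewards, and each action's advantage is its reward minus that expected value. A minimal sketch with an illustrative helper name, not part of the project:

def advantages(probs, rewards):
    #strategy-weighted expected value of the state
    stateExpValue = sum(p * r for p, r in zip(probs, rewards))
    #each action's advantage is its reward relative to that expectation
    return stateExpValue, [r - stateExpValue for r in rewards]

ev, adv = advantages([0.5, 0.25, 0.25], [1, -1, -1])
print(ev)   #0.0
print(adv)  #[1.0, -1.0, -1.0]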
Example #6
    async def rmRecur(self, ps, game, startSeed, iter, depth=0):

        cmdHeaders = ['>p1', '>p2']
        queues = [game.p1Queue, game.p2Queue]
        #all the actions both players can pick
        playerActions = [[], []]
        #the probability of picking each action
        playerProbs = [[], []]
        #the indices of the actions actually picked
        pickedActions = [0, 0]
        #both players make their move
        for i in range(2):
            request = (await queues[i].get())
            if request[0] == Game.END:
                winner = await game.winner
                #have to clear the results out of the queues
                while not game.p1Queue.empty():
                    await game.p1Queue.get()
                while not game.p2Queue.empty():
                    await game.p2Queue.get()
                if winner == 'bot1':
                    return 1
                else:
                    return 0

            req = request[1]
            state = req['stateHash']
            #get rm probs for the actions
            actions = moves.getMoves(self.format, req)
            playerActions[i] = actions
            probs = self.regretMatch(i, state, actions)
            #add exploration, which adds a chance to play randomly
            exploreProbs = probs * (
                1 - self.exploration) + self.exploration / len(probs)
            playerProbs[i] = probs
            #and sample one action
            pickedActions[i] = np.random.choice(len(actions), p=exploreProbs)

        #apply the picked actions to the game
        seed = Game.getSeed()
        await game.cmdQueue.put('>resetPRNG ' + str(seed))
        for i in range(2):
            pickedAction = pickedActions[i]
            action = playerActions[i][pickedAction]
            await game.cmdQueue.put(cmdHeaders[i] + action)

        #get the reward so we can update our regrets
        reward = await self.rmRecur(ps, game, startSeed, iter, depth=depth + 1)

        #save the reward
        a1 = playerActions[0][pickedActions[0]]
        a2 = playerActions[1][pickedActions[1]]
        self.addReward(state, a1, a2, reward)

        #need to update both players' regret and strategy
        for i in range(2):
            #update each action's regret and probability in average strategy
            rt = self.regretTables[i]
            pt = self.probTables[i]
            actions = playerActions[i]
            for j in range(len(actions)):
                #update the average strategy with this iteration's strategy
                #which just means adding the current probability of each action
                probScale = ((iter + 1) / (iter + 2))**self.probScaling
                prob = dictGet(pt, (state, actions[j]))
                pt[hash((state,
                         actions[j]))] = probScale * prob + playerProbs[i][j]

                #immediate regret of picked actions is 0, so just skip
                if j == pickedActions[i]:
                    continue

                #get existing regret so we can add to it
                regret = dictGet(rt, (state, actions[j]))
                if self.regScaling != 0:
                    regret *= ((iter + 1)**self.regScaling) / (
                        (iter + 1)**self.regScaling + 1)
                #get i's possible action and -i's actual action
                #in player order
                if i == 0:
                    a1 = actions[j]
                    a2 = playerActions[1][pickedActions[1]]
                    myReward = reward
                else:
                    a1 = playerActions[0][pickedActions[0]]
                    a2 = actions[j]
                    myReward = 1 - reward

                #get expected value for the potential turn
                expValue = self.getExpValue(i, state, a1, a2)

                #add immediate regret
                if self.posReg:
                    rt[hash((state,
                             actions[j]))] = max(regret + expValue - myReward,
                                                 0)
                else:
                    rt[hash(
                        (state, actions[j]))] = regret + expValue - myReward

        #pass the actual reward up
        return reward
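
regretMatch() in these examples implements standard regret matching: each action is played in proportion to its positive cumulative regret, with a uniform fallback when no regret is positive. A small illustrative sketch of that rule (the real implementation reads its regrets out of regretTables keyed by (state, action)):

import numpy as np

def regretMatch(regrets):
    #keep only positive regrets; negative regret means the action underperformed
    positive = np.maximum(regrets, 0)
    total = positive.sum()
    if total > 0:
        return positive / total
    #no positive regret yet, play uniformly at random
    return np.full(len(regrets), 1 / len(regrets))

print(regretMatch(np.array([3.0, 1.0, -2.0])))  #[0.75 0.25 0.  ]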
Example #7
    async def cfrRecur(self,
                       ps,
                       game,
                       startSeed,
                       history,
                       q,
                       iter,
                       depth=0,
                       rollout=False):
        #I'm not sure about this q parameter
        #I'm getting better results setting it to 1 in all games
        q = 1

        async def endGame():
            side = 'bot1' if iter % 2 == 0 else 'bot2'
            winner = await game.winner
            #have to clear the results out of the queues
            while not game.p1Queue.empty():
                await game.p1Queue.get()
            while not game.p2Queue.empty():
                await game.p2Queue.get()
            if winner == side:
                return 1 / q
            else:
                return 0

        cmdHeaders = ['>p1', '>p2']
        queues = [game.p1Queue, game.p2Queue]
        offPlayer = (iter + 1) % 2
        onPlayer = iter % 2

        #off player
        request = (await queues[offPlayer].get())
        if request[0] == Game.END:
            return await endGame()
        req = request[1]
        state = req['stateHash']
        actions = moves.getMoves(self.format, req)
        #just sample a move
        probs = self.regretMatch(offPlayer, state, actions)
        #apply exploration chance to off-player as well
        exploreProbs = probs * (
            1 - self.exploration) + self.exploration / len(actions)
        #or don't
        #exploreProbs = probs
        offAction = np.random.choice(actions, p=exploreProbs)
        #and update the average strategy
        self.updateProbs(offPlayer, state, actions, probs / q, iter)

        #on player
        request = (await queues[onPlayer].get())
        if request[0] == Game.END:
            return await endGame()
        req = request[1]

        #now that we've checked if the game is over,
        #let's check depth before continuing
        if self.depthLimit is not None and depth >= self.depthLimit:
            if self.evaluation == HEURISTIC:
                #immediately return a heuristic-based expected value
                await game.cmdQueue.put('>forcewin p1')
                #clean up the end game messages
                await queues[onPlayer].get()
                await queues[offPlayer].get()
                return expValueHeuristic(onPlayer, req['state']) / q
            elif self.evaluation == ROLLOUT:
                #instead of branching out, find the actual value of a single
                #play-through and use that as the expected value
                rollout = True
                #rest of rollout is implemented with the normal code path
            elif self.evaluation == MODEL:
                #TODO
                pass

        state = req['stateHash']
        actions = moves.getMoves(self.format, req)
        #we sometimes bias towards the first or last actions
        #this fixes that bias
        random.shuffle(actions)
        #probs is the set of sample probabilities, used for traversing
        #iterProbs is the set of probabilities for this iteration's strategy, used for regret
        if rollout:
            #I'm not sure if using regret matching or going uniform random
            #would be better
            #my gut says regret matching
            probs = self.regretMatch(onPlayer, state, actions)
            action = np.random.choice(actions, p=probs)
            actions = [action]
            probs = [1]  # would it be better to use the actual probability?
            iterProbs = probs
        elif self.samplingType == EXTERNAL:
            probs = self.regretMatch(onPlayer, state, actions)
            iterProbs = probs
        elif self.samplingType == AVERAGE:
            #we're just using the current iteration's strategy
            #it's simple and it seems to work
            iterProbs = self.regretMatch(onPlayer, state, actions)
            probs = iterProbs + self.exploration

            #this is the average-sampling procedure from some paper
            #it's designed for a large number of samples, so it doesn't really
            #work. It expects it to be feasible to try every action for the
            #on player on some turns, which usually isn't the case
            """
            stratSum = 0
            strats = []
            pt = self.probTables[onPlayer]
            for a in actions:
                s = dictGet(pt, (state, a))
                stratSum += s
                strats.append(s)
            probs = []
            for a,s in zip(actions, strats):
                if self.bonus + stratSum == 0:
                    p = 0
                else:
                    p = (self.bonus + self.threshold * s) / (self.bonus + stratSum)
                p = max(self.exploration, p)
                probs.append(p)
            """
            #keep track of how many actions we take from this state
            numTaken = 0

        #get expected reward for each action
        rewards = []
        gameUsed = False
        self.numActionsSeen += len(actions)
        #whether a specific action is a rollout
        curRollout = rollout
        for action, prob in zip(actions, probs):
            #for ES we just check every action
            #for AS use a roll to determine if we search
            if self.samplingType == AVERAGE and not curRollout:
                #instead of skipping, try making the skipped entries a rollout
                #like in https://www.aaai.org/ocs/index.php/AAAI/AAAI12/paper/viewFile/4937/5469
                #if we're at the last action and we haven't done anything, do something regardless of roll
                if (self.bound != 0 and numTaken > self.bound
                    ) or random.random() >= prob and (action != actions[-1]
                                                      or gameUsed):
                    curRollout = True
                    #rewards.append(0)
                    #continue
                else:
                    curRollout = rollout
                    numTaken += 1
            self.numActionsTaken += 1
            #don't have to re-init game for the first action
            if gameUsed:
                game = Game(ps,
                            self.teams,
                            format=self.format,
                            seed=startSeed,
                            verbose=self.verbose)
                await game.startGame()
                await game.applyHistory(history)
                #need to consume two requests, as we consumed two above
                await game.p1Queue.get()
                await game.p2Queue.get()
            else:
                gameUsed = True

            seed = Game.getSeed()
            if onPlayer == 0:
                onHeader = '>p1'
                offHeader = '>p2'
                historyEntry = (seed, action, offAction)
            else:
                onHeader = '>p2'
                offHeader = '>p1'
                historyEntry = (seed, offAction, action)

            await game.cmdQueue.put('>resetPRNG ' + str(seed))
            await game.cmdQueue.put(onHeader + action)
            await game.cmdQueue.put(offHeader + offAction)

            r = await self.cfrRecur(ps,
                                    game,
                                    startSeed,
                                    history + [historyEntry],
                                    q * min(1, max(0.01, prob)),
                                    iter,
                                    depth=depth + 1,
                                    rollout=curRollout)
            rewards.append(r)

        #update regrets
        stateExpValue = 0
        for p, r in zip(iterProbs, rewards):
            stateExpValue += p * r
        rt = self.regretTables[onPlayer]
        for a, r in zip(actions, rewards):
            regret = dictGet(rt, (state, a))
            if self.regScaling != 0:
                regret *= ((iter // 2 + 1)**self.regScaling) / (
                    (iter // 2 + 1)**self.regScaling + 1)
            if self.posReg:
                rt[hash((state, a))] = max(0, regret + r - stateExpValue)
            else:
                rt[hash((state, a))] = regret + r - stateExpValue

        return stateExpValue
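
The regret update at the bottom of Example #7 can be read in isolation: optionally discount the stored regret by t^s / (t^s + 1) (the regScaling factor, with t = iter // 2 + 1), add the sampled counterfactual regret r - stateExpValue, and clip at zero when posReg is set. A sketch under those assumptions, with illustrative names:

def updateRegret(oldRegret, reward, stateExpValue, t, regScaling=1.5, posReg=True):
    #discount the accumulated regret so older iterations matter less as t grows
    if regScaling != 0:
        oldRegret *= (t**regScaling) / (t**regScaling + 1)
    newRegret = oldRegret + reward - stateExpValue
    #CFR+-style clipping: never carry negative cumulative regret
    return max(0, newRegret) if posReg else newRegret

print(updateRegret(2.0, reward=1.0, stateExpValue=0.25, t=10))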
Example #8
async def mcExp3Impl(requestQueue,
                     cmdQueue,
                     cmdHeader,
                     mcData,
                     format,
                     iter=0,
                     initActions=[],
                     verbose=False):

    countTable = mcData['countTable']
    expValueTable = mcData['expValueTable']
    gamma = mcData['gamma']
    seenStates = mcData['seenStates']

    #history so we can update the count and expected-value tables at the end
    history = []

    #we're going to be popping off this
    initActions = copy.deepcopy(initActions)

    running = True
    inInitActions = True
    while running:
        request = await requestQueue.get()

        if verbose:
            print(cmdHeader, 'got request', request)

        if request[0] == Game.REQUEST or request[0] == Game.ERROR:
            req = request[1]
            state = req['stateHash']

            seenStates[state] = True
            actions = moves.getMoves(format, req)

            #check if we ran out of initActions on the previous turn
            #if so, we need to change the PRNG
            if inInitActions and len(initActions) == 0:
                inInitActions = False
                #no problem if both players reset the PRNG
                await cmdQueue.put('>resetPRNG')

            #calculate a probability for each action
            #need the probs from the initActions so we can update,
            #so we always calculate this
            eta = gamma / len(actions)
            expValues = [expValueTable[(state, action)] for action in actions]
            maxExpValues = max(expValues)
            ws = [
                expValueTable[(state, action)] - maxExpValues
                for action in actions
            ]
            xs = [math.exp(eta * w) for w in ws]
            xSum = np.sum(xs)
            probs = np.array([(1 - gamma) * x / xSum + gamma / len(actions)
                              for x in xs])
            #illegal moves might have a negative probability, which should just be 0
            probs = [p if p > 0 else 0 for p in probs]
            probs = probs / np.sum(probs)

            if len(initActions) > 0:
                #blindly pick init action
                bestAction = initActions[0]
                bestActionIndex = actions.index(bestAction)
                bestActionProb = probs[bestActionIndex]
                initActions = initActions[1:]
            else:
                #pick action based on probs
                bestActionIndex = np.random.choice(len(actions), p=probs)
                bestAction = actions[bestActionIndex]
                bestActionProb = probs[bestActionIndex]

            #save our action
            history.append((state, bestAction, bestActionProb))

            if verbose:
                print('picked', cmdHeader + bestAction)

            await cmdQueue.put(cmdHeader + bestAction)

        elif request[0] == Game.END:
            #update the count and expected-value tables with our history + result
            reward = request[1]
            #rescale reward from [-1,1] to [0,1]
            reward = (reward + 1) / 2
            for state, action, prob in history:
                countTable[(state, action)] += 1
                expValueTable[(state, action)] += reward / prob

            running = False
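
The probability calculation in Example #8 is the Exp3 weighting: shift the expected-value estimates by their maximum for numerical stability, exponentiate with eta = gamma / len(actions), and mix the resulting distribution with a uniform exploration term of weight gamma. A self-contained sketch; the gamma value here is just an example.

import math
import numpy as np

def exp3Probs(expValues, gamma=0.3):
    eta = gamma / len(expValues)
    #subtract the max so the exponentials stay numerically stable
    ws = [v - max(expValues) for v in expValues]
    xs = [math.exp(eta * w) for w in ws]
    xSum = sum(xs)
    #mix the exponential weights with uniform exploration
    probs = np.array([(1 - gamma) * x / xSum + gamma / len(expValues) for x in xs])
    return probs / probs.sum()

print(exp3Probs([10.0, 4.0, 0.0]))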
Example #9
                async def userTurn(queue, actionList, cmdHeader, initMoves):

                    request = await queue.get()

                    if len(initMoves) > 0:
                        action = initMoves[0]
                        del initMoves[0]
                        print('|c|' + cmdHeader + '|Turn ' + str(i) +
                              ' pre-set action:',
                              action,
                              file=file)
                    else:
                        #figure out what kind of action we need
                        state = request[1]['stateHash']
                        actions = moves.getMoves(format, request[1])

                        actionTexts = []
                        for j in range(len(actions)):
                            action = actions[j].split(',')
                            actionText = []
                            for k in range(len(action)):
                                a = action[k]
                                a = a.strip()
                                if 'pass' in a:
                                    actionText.append('pass')
                                elif 'move' in a:
                                    parts = a.split(' ')
                                    moveNum = int(parts[1])
                                    if len(parts) < 3:
                                        targetNum = 0
                                    else:
                                        targetNum = int(parts[2])
                                    move = request[1]['active'][k]['moves'][
                                        moveNum - 1]['move']
                                    if targetNum != 0:
                                        actionText.append(move +
                                                          ' into slot ' +
                                                          str(targetNum))
                                    else:
                                        actionText.append(move)
                                elif 'team' in a:
                                    actionText.append(a)
                                elif 'switch' in a:
                                    actionText.append(a)
                                else:
                                    actionText.append('unknown action: ' + a)
                            actionString = ','.join(actionText)
                            actionTexts.append(actionString)

                        #ask the user which action to take
                        print('Legal actions:')
                        for j in range(len(actions)):
                            print(j, actionTexts[j], '(' + actions[j] + ')')
                        #humans are dumb and make mistakes
                        while True:
                            try:
                                actionIndex = int(input('Your action:'))
                                if actionIndex >= 0 and actionIndex < len(
                                        actions):
                                    action = actions[actionIndex]
                                    break
                            except ValueError:
                                pass
                            print('try again')

                    actionList.append(action)
                    await game.cmdQueue.put(cmdHeader + action)
Example #10
async def mcOOSImpl(requestQueue,
                    cmdQueue,
                    cmdHeader,
                    mcData,
                    format,
                    playerNum,
                    iter,
                    initActions,
                    pid=0,
                    posReg=False,
                    probScaling=0,
                    regScaling=0,
                    verbose=False):

    regretTable = mcData['regretTable']
    seenStates = mcData['seenStates']
    gamma = mcData['gamma']
    probTable = mcData['probTable']

    #stack where our actions/strategies are stored
    history = []

    running = True
    inInitActions = True
    while running:
        request = await requestQueue.get()
        if verbose:
            print('got request', cmdHeader, request)

        if request[0] == Game.REQUEST:
            req = request[1]
            state = req['stateHash']

            seenStates[state] = True
            actions = moves.getMoves(format, req)

            #after doing init actions, we're in the target state
            #need to reset the PRNG so the bot doesn't cheat
            if inInitActions and len(initActions) == 0:
                inInitActions = False
                await cmdQueue.put('>resetPRNG')

            #generate a strategy
            rSum = 0
            regrets = []
            for action in actions:
                regret = regretTable[(state, action)]
                regrets.append(regret)
                rSum += max(0, regret)
            if rSum > 0:
                #prob according to regret
                probs = np.array([max(0, r) / rSum for r in regrets])
                probs = probs / np.sum(probs)
                #use probs to update strategy
                #use exploreProbs to sample moves
                if iter % 2 == playerNum:
                    exploreProbs = probs * (1 - gamma) + gamma / len(actions)
                else:
                    #we're the off player, don't explore
                    exploreProbs = probs
            else:
                #everything is new/bad, play randomly
                probs = np.array([1 / len(actions) for a in actions])
                exploreProbs = probs

            if len(initActions) > 0:
                #blindly pick init action
                preAction = initActions[0].strip()
                #find close enough action in list
                #PS client will generate team preview actions that
                #are longer than what we expect, but we can just
                #assume that the equivalent action is a prefix
                bestActionIndex = 0
                while bestActionIndex < len(actions):
                    if preAction.startswith(actions[bestActionIndex].strip()):
                        break
                    bestActionIndex += 1
                bestAction = actions[bestActionIndex]
                initActions = initActions[1:]
            else:
                #pick action based on probs
                bestActionIndex = np.random.choice(len(actions),
                                                   p=exploreProbs)
                bestAction = actions[bestActionIndex]

            #save our action
            history.append(
                (state, bestActionIndex, actions, probs, exploreProbs))

            if verbose:
                print('picked', cmdHeader + bestAction)

            await cmdQueue.put(cmdHeader + bestAction)

        elif request[0] == Game.END:
            running = False
            #map from [-1,1] to [0,1]
            reward = (request[1] + 1) / 2

            #on player's contribution to tail probability
            x = 1
            #on player's contribution to sample probability
            q = 1
            while len(history) > 0:
                state, actionIndex, actions, probs, exploreProbs = history.pop(
                )
                if iter % 2 == playerNum:
                    action = actions[actionIndex]
                    w = reward * x / q
                    p = probs[actionIndex]
                    ep = exploreProbs[actionIndex]
                    #update picked action's regret
                    regret = regretTable[(state, action)]
                    if regScaling != 0:
                        regret *= ((iter + 1)**regScaling) / (
                            (iter + 1)**regScaling + 1)
                    regretTable[(state, action)] = regret + (1 - p) / ep * w
                    #update other actions' regrets
                    for i in range(len(actions)):
                        if i == actionIndex:
                            continue
                        regret = regretTable[(state, actions[i])]
                        if regScaling != 0:
                            regret *= ((iter + 1)**regScaling) / (
                                (iter + 1)**regScaling + 1)
                        if posReg:
                            regretTable[(state, actions[i])] = max(
                                0, regret - p / ep * w)
                        else:
                            regretTable[(state,
                                         actions[i])] = regret - p / ep * w
                    x *= p
                    q *= ep
                else:
                    #update the off player's average strategy
                    probScale = ((iter + 1) / (iter + 2))**probScaling
                    for i in range(len(actions)):
                        oldProb = probTable[(state, actions[i])]
                        probTable[(
                            state,
                            actions[i])] = probScale * oldProb + probs[i]
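
The END branch of Example #10 is an outcome-sampling regret update: w combines the terminal reward with x, the on player's own strategy probability along the tail of the game, and q, the sampling probability of the actions actually taken; the sampled action's regret increases by (1 - p) / ep * w and every other action's decreases by p / ep * w. A condensed, self-contained sketch of a single step; the names follow the example, but this is not the project code.

def oosUpdate(regrets, actionIndex, probs, exploreProbs, reward, x=1.0, q=1.0):
    #importance-weighted reward for this node
    w = reward * x / q
    p = probs[actionIndex]
    ep = exploreProbs[actionIndex]
    updated = list(regrets)
    for i in range(len(regrets)):
        if i == actionIndex:
            #the sampled action gains regret in proportion to how unlikely it was
            updated[i] += (1 - p) / ep * w
        else:
            #unsampled actions lose the sampled action's expected contribution
            updated[i] -= p / ep * w
    return updated

print(oosUpdate([0.0, 0.0], actionIndex=0, probs=[0.5, 0.5],
                exploreProbs=[0.5, 0.5], reward=1.0))  #[1.0, -1.0]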