async def playTurn(queue, myMcData, actionList, cmdHeader, initMoves):
    request = await queue.get()
    if len(initMoves) > 0:
        action = initMoves[0]
        del initMoves[0]
        print('|c|' + cmdHeader + '|Turn ' + str(i) + ' pre-set action:',
                action, file=file)
    else:
        #figure out what kind of action we need
        state = request[1]['stateHash']
        actions = moves.getMoves(format, request[1])
        #the mcdatasets are all combined, so we can just look at the first
        data = myMcData[0]
        #probs = mc.getProbsExp3(data, state, actions)
        probs = mc.getProbsRM(data, state, actions)
        #remove low-probability moves, which are likely just noise
        #this can zero out every action; if it does, all the probabilities
        #were low anyway, so fall back to a uniform distribution
        normProbs = np.array(
                [p if p > probCutoff else 0 for p in probs])
        normSum = np.sum(normProbs)
        if normSum > 0:
            normProbs = normProbs / normSum
        else:
            normProbs = np.full(len(actions), 1 / len(actions))
        action = np.random.choice(actions, p=normProbs)
    actionList.append(action)
    await game.cmdQueue.put(cmdHeader + action)
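#mc.getProbsRM is external to this snippet. As a hedged sketch only: given
#how the RM data is built elsewhere in this project (a probTable of
#accumulated average-strategy weights keyed by (state, action)), a minimal
#compatible readout could normalize those weights and fall back to uniform
#for unseen states. The real implementation may differ.
import numpy as np

def getProbsRMSketch(data, state, actions):
    probTable = data['probTable']
    weights = np.array([probTable[(state, a)] for a in actions])
    total = np.sum(weights)
    if total <= 0:
        #nothing accumulated for this state yet
        return np.full(len(actions), 1 / len(actions))
    return weights / total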
async def getAction(self, request):
    #get info to replicate the current game state
    seed = request['state']['startingSeed']
    initActions = request['state']['actions']
    searchPs = []
    try:
        searchPs = [await getPSProcess() for i in range(self.numProcesses)]
        searches = []
        for j in range(self.numProcesses):
            search = mc.mcSearchRM(
                    searchPs[j],
                    self.format,
                    self.teams,
                    limit=self.limit,
                    seed=seed,
                    p1InitActions=initActions[0],
                    p2InitActions=initActions[1],
                    mcData=self.mcDataset,
                    pid=j,
                    initExpVal=0,
                    probScaling=2,
                    regScaling=1.5)
            searches.append(search)
        print('searching', file=sys.stderr)
        await asyncio.gather(*searches)
        print('combining', file=sys.stderr)
        self.mcDataset = mc.combineRMData([self.mcDataset],
                self.valueModel)[0]

        #figure out what kind of action we need
        state = request['stateHash']
        actions = moves.getMoves(format, request)
        data = self.mcDataset[1]
        probs = mc.getProbsRM(data, state, actions)

        #remove low-probability moves, which are likely just noise
        #this can zero out every action; if it does, all the probabilities
        #were low anyway, so fall back to a uniform distribution
        normProbs = np.array(
                [p if p > self.probCutoff else 0 for p in probs])
        normSum = np.sum(normProbs)
        if normSum > 0:
            normProbs = normProbs / normSum
        else:
            normProbs = np.full(len(actions), 1 / len(actions))

        action = np.random.choice(actions, p=normProbs)
        return action
    finally:
        #always clean up the simulator processes, even if the search dies
        for ps in searchPs:
            ps.terminate()
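#Worked example of the cutoff-and-renormalize step used above (assumed values,
#for illustration only): with a cutoff of 0.1, the two noisy tail actions are
#zeroed and can never be sampled.
probCutoff = 0.1
probs = [0.6, 0.3, 0.05, 0.05]
normProbs = [p if p > probCutoff else 0 for p in probs]   #[0.6, 0.3, 0, 0]
normProbs = [p / sum(normProbs) for p in normProbs]       #[2/3, 1/3, 0, 0]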
async def randomAgent(queue, cmdHeader, initMoveList):
    while True:
        req = await queue.get()
        if req[0] == Game.END:
            break
        #print('getting actions')
        actions = moves.getMoves(format, req[1])
        state = req[1]['state']
        #print(cmdHeader, 'actions', actions)
        if len(initMoveList) > 0:
            action = initMoveList[0]
            del initMoveList[0]
        else:
            action = random.choice(actions)
        print(cmdHeader, 'picked', action)
        await game.cmdQueue.put(cmdHeader + action)
async def playTurn(num):
    request = await queues[num].get()
    if len(initMoves[num]) > 0:
        #do the given action
        action = initMoves[num][0]
        del initMoves[num][0]
        print('|c|' + cmdHeaders[num] + '|Turn ' + str(i) +
                ' pre-set action:', action, file=file)
    else:
        #let the agent pick the action
        #figure out what kind of action we need
        state = request[1]['stateHash']
        actions = moves.getMoves(format, request[1])
        probs = agent.getProbs(num, state, actions)
        #remove low-probability moves, which are likely just noise
        normProbs = np.array(
                [p if p > probCutoff else 0 for p in probs])
        normSum = np.sum(normProbs)
        if normSum > 0:
            normProbs = normProbs / normSum
        else:
            #every action was cut, so fall back to a uniform distribution
            normProbs = [1 / len(actions) for a in actions]

        #log each action that still has a chance of being picked
        for j in range(len(actions)):
            actionString = moves.prettyPrintMove(actions[j], request[1])
            if normProbs[j] > 0:
                print('|c|' + cmdHeaders[num] + '|Turn ' + str(i) +
                        ' action:', actionString,
                        'prob:', '%.1f%%' % (normProbs[j] * 100), file=file)

        action = np.random.choice(actions, p=normProbs)
    actionLists[num].append(action)
    await game.cmdQueue.put(cmdHeaders[num] + action)
async def cfrRecur(self, ps, game, startSeed, history, iter, depth=0, rollout=False):
    async def endGame():
        side = 'bot1' if iter % 2 == 0 else 'bot2'
        winner = await game.winner
        #have to clear the results out of the queues
        while not game.p1Queue.empty():
            await game.p1Queue.get()
        while not game.p2Queue.empty():
            await game.p2Queue.get()
        #the deep cfr paper uses [-1,1] rather than [0,1] for u,
        #and this version follows the paper
        if winner == side:
            return 1
        else:
            return -1

    if depth >= self.depthLimit:
        rollout = True

    cmdHeaders = ['>p1', '>p2']
    queues = [game.p1Queue, game.p2Queue]
    offPlayer = (iter + 1) % 2
    onPlayer = iter % 2

    #off player
    request = (await queues[offPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]
    state = req['state']
    actions = moves.getMoves(self.format, req)
    #just sample a move
    probs = self.regretMatch(offPlayer, state, actions, -1)
    if depth == 0 and self.pid == 0:
        print('player ' + str(offPlayer) + ' probs',
                list(zip(actions, probs)), file=sys.stderr)
    offAction = np.random.choice(actions, p=probs)
    #and update the average strategy
    #we should be okay adding this for rollouts,
    #but I'm testing skipping rollouts
    if not rollout:
        self.updateProbs(offPlayer, state, actions, probs, iter // 2 + 1)

    #on player
    request = (await queues[onPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]
    state = req['state']
    actions = moves.getMoves(self.format, req)
    probs = self.regretMatch(onPlayer, state, actions, depth)
    if depth == 0 and self.pid == 0:
        print('player ' + str(onPlayer) + ' probs',
                list(zip(actions, probs)), file=sys.stderr)

    if rollout:
        #we pick one action according to the current strategy
        actions = [np.random.choice(actions, p=probs)]
        actionIndices = [0]
    elif self.branchingLimit:
        #select a subset of actions to search
        #chance to play randomly instead of picking the best actions
        exploreProbs = probs # * (0.9) + 0.1 / len(probs)
        #there might be some duplicates, but it shouldn't matter
        actionIndices = np.random.choice(len(actions),
                self.branchingLimit, p=exploreProbs)
    else:
        #we're searching every action
        actionIndices = list(range(len(actions)))

    #get the expected reward for each action
    rewards = []
    gameUsed = False
    for i in range(len(actions)):
        action = actions[i]

        #use a rollout for non-sampled actions
        if i not in actionIndices:
            curRollout = True
        else:
            curRollout = rollout

        #don't have to re-init the game for the first action
        if gameUsed:
            game = Game(ps, self.teams, format=self.format,
                    seed=startSeed, verbose=self.verbose)
            await game.startGame()
            await game.applyHistory(history)
            #need to consume two requests, as we consumed two above
            await game.p1Queue.get()
            await game.p2Queue.get()
        else:
            gameUsed = True

        seed = Game.getSeed()
        if onPlayer == 0:
            onHeader = '>p1'
            offHeader = '>p2'
            historyEntry = (seed, action, offAction)
        else:
            onHeader = '>p2'
            offHeader = '>p1'
            historyEntry = (seed, offAction, action)
        await game.cmdQueue.put('>resetPRNG ' + str(seed))
        await game.cmdQueue.put(onHeader + action)
        await game.cmdQueue.put(offHeader + offAction)

        r = await self.cfrRecur(ps, game, startSeed,
                history + [historyEntry], iter,
                depth=depth + 1, rollout=curRollout)
        rewards.append(r)

    if not rollout:
        #save a sample of the advantages
        stateExpValue = 0
        for p, r in zip(probs, rewards):
            stateExpValue += p * r
        advantages = [r - stateExpValue for r in rewards]

        am = self.advModels[onPlayer]
        am.addSample(state, zip(actions, advantages), iter // 2 + 1)

        if depth == 0 and self.pid == 0:
            print('player', str(onPlayer), file=sys.stderr)
            print('stateExpValue', stateExpValue, 'from',
                    list(zip(probs, rewards)), file=sys.stderr)
            print('advantages', list(zip(actions, advantages)),
                    file=sys.stderr)

        return stateExpValue
    else:
        #we can't calculate advantages, so we can't update anything
        #we only have one reward, so just return it
        return rewards[0]
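#Here regretMatch takes an extra depth argument and presumably queries the
#advantage networks rather than a table. As a hedged sketch of the Deep CFR
#strategy computation (assuming advModel.predict(state, actions) returns one
#estimated advantage per action; the real signatures may differ):
import numpy as np

def regretMatchSketchDeep(advModel, state, actions):
    advantages = np.array(advModel.predict(state, actions))
    positives = np.maximum(advantages, 0)
    total = np.sum(positives)
    if total > 0:
        #play proportionally to the predicted positive advantages
        return positives / total
    #standard Deep CFR fallback: pure strategy on the best predicted action
    probs = np.zeros(len(actions))
    probs[np.argmax(advantages)] = 1
    return probs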
async def rmRecur(self, ps, game, startSeed, iter, depth=0):
    cmdHeaders = ['>p1', '>p2']
    queues = [game.p1Queue, game.p2Queue]

    #all the actions both players can pick
    playerActions = [[], []]
    #the probability of picking each action
    playerProbs = [[], []]
    #the indices of the actions actually picked
    pickedActions = [0, 0]

    #both players make their move
    for i in range(2):
        request = (await queues[i].get())
        if request[0] == Game.END:
            winner = await game.winner
            #have to clear the results out of the queues
            while not game.p1Queue.empty():
                await game.p1Queue.get()
            while not game.p2Queue.empty():
                await game.p2Queue.get()
            if winner == 'bot1':
                return 1
            else:
                return 0
        req = request[1]
        state = req['stateHash']
        #get rm probs for the actions
        actions = moves.getMoves(self.format, req)
        playerActions[i] = actions
        probs = self.regretMatch(i, state, actions)
        #add exploration, which adds a chance to play randomly
        exploreProbs = probs * (
                1 - self.exploration) + self.exploration / len(probs)
        playerProbs[i] = probs
        #and sample one action
        pickedActions[i] = np.random.choice(len(actions), p=exploreProbs)

    #apply the picked actions to the game
    seed = Game.getSeed()
    await game.cmdQueue.put('>resetPRNG ' + str(seed))
    for i in range(2):
        pickedAction = pickedActions[i]
        action = playerActions[i][pickedAction]
        await game.cmdQueue.put(cmdHeaders[i] + action)

    #get the reward so we can update our regrets
    reward = await self.rmRecur(ps, game, startSeed, iter, depth=depth + 1)

    #save the reward
    a1 = playerActions[0][pickedActions[0]]
    a2 = playerActions[1][pickedActions[1]]
    self.addReward(state, a1, a2, reward)

    #need to update both players' regret and strategy
    for i in range(2):
        #update each action's regret and its probability in the average strategy
        rt = self.regretTables[i]
        pt = self.probTables[i]
        actions = playerActions[i]
        for j in range(len(actions)):
            #update the average strategy with this iteration's strategy,
            #which just means adding the current probability of each action
            probScale = ((iter + 1) / (iter + 2))**self.probScaling
            prob = dictGet(pt, (state, actions[j]))
            pt[hash((state, actions[j]))] = probScale * prob + playerProbs[i][j]

            #immediate regret of the picked action is 0, so just skip it
            if j == pickedActions[i]:
                continue

            #get the existing regret so we can add to it
            regret = dictGet(rt, (state, actions[j]))
            if self.regScaling != 0:
                regret *= ((iter + 1)**self.regScaling) / (
                        (iter + 1)**self.regScaling + 1)

            #get i's possible action and -i's actual action,
            #in player order
            if i == 0:
                a1 = actions[j]
                a2 = playerActions[1][pickedActions[1]]
                myReward = reward
            else:
                a1 = playerActions[0][pickedActions[0]]
                a2 = actions[j]
                myReward = 1 - reward

            #get the expected value for the potential turn
            expValue = self.getExpValue(i, state, a1, a2)

            #add the immediate regret
            if self.posReg:
                rt[hash((state, actions[j]))] = max(regret + expValue - myReward, 0)
            else:
                rt[hash((state, actions[j]))] = regret + expValue - myReward

    #pass the actual reward up
    return reward
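#self.regretMatch is defined elsewhere; for context, a tabular sketch that is
#consistent with how rmRecur reads and writes its tables (dictGet is assumed
#to return 0 for unseen keys and tables are keyed by hash((state, action)),
#as the updates above imply). The real method may differ.
import numpy as np

def regretMatchSketch(regretTable, state, actions):
    regrets = np.array(
            [max(0, dictGet(regretTable, (state, a))) for a in actions])
    total = np.sum(regrets)
    if total > 0:
        #play proportionally to positive regret
        return regrets / total
    #no positive regret recorded, so play uniformly at random
    return np.full(len(actions), 1 / len(actions))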
async def cfrRecur(self, ps, game, startSeed, history, q, iter, depth=0, rollout=False):
    #I'm not sure about this q parameter
    #I'm getting better results setting it to 1 in all games
    q = 1

    async def endGame():
        side = 'bot1' if iter % 2 == 0 else 'bot2'
        winner = await game.winner
        #have to clear the results out of the queues
        while not game.p1Queue.empty():
            await game.p1Queue.get()
        while not game.p2Queue.empty():
            await game.p2Queue.get()
        if winner == side:
            return 1 / q
        else:
            return 0

    cmdHeaders = ['>p1', '>p2']
    queues = [game.p1Queue, game.p2Queue]
    offPlayer = (iter + 1) % 2
    onPlayer = iter % 2

    #off player
    request = (await queues[offPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]
    state = req['stateHash']
    actions = moves.getMoves(self.format, req)
    #just sample a move
    probs = self.regretMatch(offPlayer, state, actions)
    #apply the exploration chance to the off player as well
    exploreProbs = probs * (
            1 - self.exploration) + self.exploration / len(actions)
    #or don't
    #exploreProbs = probs
    offAction = np.random.choice(actions, p=exploreProbs)
    #and update the average strategy
    self.updateProbs(offPlayer, state, actions, probs / q, iter)

    #on player
    request = (await queues[onPlayer].get())
    if request[0] == Game.END:
        return await endGame()
    req = request[1]

    #now that we've checked whether the game is over,
    #check the depth before continuing
    if self.depthLimit is not None and depth >= self.depthLimit:
        if self.evaluation == HEURISTIC:
            #immediately return a heuristic-based expected value
            await game.cmdQueue.put('>forcewin p1')
            #clean up the end-of-game messages
            await queues[onPlayer].get()
            await queues[offPlayer].get()
            return expValueHeuristic(onPlayer, req['state']) / q
        elif self.evaluation == ROLLOUT:
            #instead of branching out, find the actual value of a single
            #play-through and use that as the expected value
            rollout = True
            #the rest of the rollout is implemented in the normal code path
        elif self.evaluation == MODEL:
            #TODO
            pass

    state = req['stateHash']
    actions = moves.getMoves(self.format, req)
    #we sometimes bias towards the first or last actions;
    #shuffling fixes that bias
    random.shuffle(actions)

    #probs is the set of sample probabilities, used for traversing
    #iterProbs is the set of probabilities for this iteration's strategy, used for regret
    if rollout:
        #I'm not sure if using regret matching or going uniform random
        #would be better
        #my gut says regret matching
        probs = self.regretMatch(onPlayer, state, actions)
        action = np.random.choice(actions, p=probs)
        actions = [action]
        probs = [1] #would it be better to use the actual probability?
        iterProbs = probs
    elif self.samplingType == EXTERNAL:
        probs = self.regretMatch(onPlayer, state, actions)
        iterProbs = probs
    elif self.samplingType == AVERAGE:
        #we're just using the current iteration's strategy
        #it's simple and it seems to work
        iterProbs = self.regretMatch(onPlayer, state, actions)
        probs = iterProbs + self.exploration

        #this is the average-sampling procedure from some paper
        #it's designed for a large number of samples, so it doesn't really
        #work here: it expects it to be feasible to try every action for the
        #on player on some turns, which usually isn't the case
        """
        stratSum = 0
        strats = []
        pt = self.probTables[onPlayer]
        for a in actions:
            s = dictGet(pt, (state, a))
            stratSum += s
            strats.append(s)

        probs = []
        for a, s in zip(actions, strats):
            if self.bonus + stratSum == 0:
                p = 0
            else:
                p = (self.bonus + self.threshold * s) / (self.bonus + stratSum)
            p = max(self.exploration, p)
            probs.append(p)
        """

    #keep track of how many actions we take from this state
    numTaken = 0
    #get the expected reward for each action
    rewards = []
    gameUsed = False

    self.numActionsSeen += len(actions)

    #whether a specific action is a rollout
    curRollout = rollout
    for action, prob in zip(actions, probs):
        #for ES we just check every action
        #for AS we use a roll to determine whether we search
        if self.samplingType == AVERAGE and not curRollout:
            #instead of skipping, make the skipped entries a rollout
            #like in https://www.aaai.org/ocs/index.php/AAAI/AAAI12/paper/viewFile/4937/5469
            #if we're at the last action and we haven't done anything,
            #do something regardless of the roll
            if (self.bound != 0 and numTaken > self.bound) or \
                    (random.random() >= prob and (action != actions[-1] or gameUsed)):
                curRollout = True
                #rewards.append(0)
                #continue
            else:
                curRollout = rollout
                numTaken += 1
                self.numActionsTaken += 1

        #don't have to re-init the game for the first action
        if gameUsed:
            game = Game(ps, self.teams, format=self.format, seed=startSeed,
                    verbose=self.verbose)
            await game.startGame()
            await game.applyHistory(history)
            #need to consume two requests, as we consumed two above
            await game.p1Queue.get()
            await game.p2Queue.get()
        else:
            gameUsed = True

        seed = Game.getSeed()
        if onPlayer == 0:
            onHeader = '>p1'
            offHeader = '>p2'
            historyEntry = (seed, action, offAction)
        else:
            onHeader = '>p2'
            offHeader = '>p1'
            historyEntry = (seed, offAction, action)
        await game.cmdQueue.put('>resetPRNG ' + str(seed))
        await game.cmdQueue.put(onHeader + action)
        await game.cmdQueue.put(offHeader + offAction)

        r = await self.cfrRecur(ps, game, startSeed, history + [historyEntry],
                q * min(1, max(0.01, prob)), iter,
                depth=depth + 1, rollout=curRollout)
        rewards.append(r)

    #update regrets
    stateExpValue = 0
    for p, r in zip(iterProbs, rewards):
        stateExpValue += p * r
    rt = self.regretTables[onPlayer]
    for a, r in zip(actions, rewards):
        regret = dictGet(rt, (state, a))
        if self.regScaling != 0:
            regret *= ((iter // 2 + 1)**self.regScaling) / (
                    (iter // 2 + 1)**self.regScaling + 1)
        if self.posReg:
            rt[hash((state, a))] = max(0, regret + r - stateExpValue)
        else:
            rt[hash((state, a))] = regret + r - stateExpValue

    return stateExpValue
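#self.updateProbs is not shown here. A hedged sketch of what it plausibly
#does, mirroring the average-strategy updates used by the other searchers in
#this project (discount the accumulated weight, then add this iteration's
#strategy); the real method may differ:
def updateProbsSketch(self, player, state, actions, probs, iter):
    pt = self.probTables[player]
    probScale = ((iter + 1) / (iter + 2))**self.probScaling
    for a, p in zip(actions, probs):
        oldProb = dictGet(pt, (state, a))
        pt[hash((state, a))] = probScale * oldProb + p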
async def mcExp3Impl(requestQueue, cmdQueue, cmdHeader, mcData, format,
        iter=0, initActions=[], verbose=False):
    countTable = mcData['countTable']
    expValueTable = mcData['expValueTable']
    gamma = mcData['gamma']
    seenStates = mcData['seenStates']

    #history so we can update the count and expected-value tables at the end
    history = []
    #we're going to be popping off this
    initActions = copy.deepcopy(initActions)

    running = True
    inInitActions = True
    while running:
        request = await requestQueue.get()
        if verbose:
            print(cmdHeader, 'got request', request)
        if request[0] == Game.REQUEST or request[0] == Game.ERROR:
            req = request[1]
            state = req['stateHash']
            seenStates[state] = True
            actions = moves.getMoves(format, req)

            #check if we ran out of initActions on the previous turn
            #if so, we need to change the PRNG
            if inInitActions and len(initActions) == 0:
                inInitActions = False
                #no problem if both players reset the PRNG
                await cmdQueue.put('>resetPRNG')

            #calculate a probability for each action
            #we need the probs of the initActions so we can update,
            #so we always calculate this
            eta = gamma / len(actions)
            expValues = [expValueTable[(state, action)] for action in actions]
            maxExpValues = max(expValues)
            ws = [v - maxExpValues for v in expValues]
            xs = [math.exp(eta * w) for w in ws]
            xSum = np.sum(xs)
            probs = np.array([(1 - gamma) * x / xSum + gamma / len(actions)
                    for x in xs])
            #illegal moves might have a negative probability, which should just be 0
            probs = np.array([p if p > 0 else 0 for p in probs])
            probs = probs / np.sum(probs)

            if len(initActions) > 0:
                #blindly pick the init action
                bestAction = initActions[0]
                bestActionIndex = actions.index(bestAction)
                bestActionProb = probs[bestActionIndex]
                initActions = initActions[1:]
            else:
                #pick an action based on probs
                bestActionIndex = np.random.choice(len(actions), p=probs)
                bestAction = actions[bestActionIndex]
                bestActionProb = probs[bestActionIndex]

            #save our action
            history.append((state, bestAction, bestActionProb))
            if verbose:
                print('picked', cmdHeader + bestAction)
            await cmdQueue.put(cmdHeader + bestAction)

        elif request[0] == Game.END:
            #update the tables with our history + result
            reward = request[1]
            #rescale reward from [-1,1] to [0,1]
            reward = (reward + 1) / 2
            for state, action, prob in history:
                countTable[(state, action)] += 1
                expValueTable[(state, action)] += reward / prob
            running = False
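#Once many iterations have accumulated, a final strategy can be read out of
#mcData. This is an assumed, illustrative readout (the real getProbsExp3 may
#differ): use the empirical play frequencies from countTable, since Exp3's
#sampling distribution already mixes in exploration.
import numpy as np

def getProbsExp3Sketch(mcData, state, actions):
    countTable = mcData['countTable']
    counts = np.array([countTable[(state, a)] for a in actions])
    total = np.sum(counts)
    if total == 0:
        #state never visited, so nothing to go on
        return np.full(len(actions), 1 / len(actions))
    return counts / total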
async def userTurn(queue, actionList, cmdHeader, initMoves):
    request = await queue.get()
    if len(initMoves) > 0:
        action = initMoves[0]
        del initMoves[0]
        print('|c|' + cmdHeader + '|Turn ' + str(i) + ' pre-set action:',
                action, file=file)
    else:
        #figure out what kind of action we need
        state = request[1]['stateHash']
        actions = moves.getMoves(format, request[1])

        #build a human-readable description of each action
        actionTexts = []
        for j in range(len(actions)):
            action = actions[j].split(',')
            actionText = []
            for k in range(len(action)):
                a = action[k].strip()
                if 'pass' in a:
                    actionText.append('pass')
                elif 'move' in a:
                    parts = a.split(' ')
                    moveNum = int(parts[1])
                    if len(parts) < 3:
                        targetNum = 0
                    else:
                        targetNum = int(parts[2])
                    move = request[1]['active'][k]['moves'][moveNum - 1]['move']
                    if targetNum != 0:
                        actionText.append(move + ' into slot ' + str(targetNum))
                    else:
                        actionText.append(move)
                elif 'team' in a:
                    actionText.append(a)
                elif 'switch' in a:
                    actionText.append(a)
                else:
                    actionText.append('unknown action: ' + a)
            actionString = ','.join(actionText)
            actionTexts.append(actionString)

        #ask the user which action to take
        print('Legal actions:')
        for j in range(len(actions)):
            print(j, actionTexts[j], '(' + actions[j] + ')')
        #humans are dumb and make mistakes
        while True:
            try:
                actionIndex = int(input('Your action:'))
                if actionIndex >= 0 and actionIndex < len(actions):
                    action = actions[actionIndex]
                    break
            except ValueError:
                pass
            print('try again')

    actionList.append(action)
    await game.cmdQueue.put(cmdHeader + action)
async def mcOOSImpl(requestQueue, cmdQueue, cmdHeader, mcData, format,
        playerNum, iter, initActions, pid=0, posReg=False,
        probScaling=0, regScaling=0, verbose=False):
    regretTable = mcData['regretTable']
    seenStates = mcData['seenStates']
    gamma = mcData['gamma']
    probTable = mcData['probTable']

    #stack where our actions/strategies are stored
    history = []

    running = True
    inInitActions = True
    while running:
        request = await requestQueue.get()
        if verbose:
            print('got request', cmdHeader, request)
        if request[0] == Game.REQUEST:
            req = request[1]
            state = req['stateHash']
            seenStates[state] = True
            actions = moves.getMoves(format, req)

            #after doing the init actions, we're in the target state
            #need to reset the PRNG so the bot doesn't cheat
            if inInitActions and len(initActions) == 0:
                inInitActions = False
                await cmdQueue.put('>resetPRNG')

            #generate a strategy
            rSum = 0
            regrets = []
            for action in actions:
                regret = regretTable[(state, action)]
                regrets.append(regret)
                rSum += max(0, regret)
            if rSum > 0:
                #probability according to positive regret
                probs = np.array([max(0, r) / rSum for r in regrets])
                probs = probs / np.sum(probs)
                #use probs to update the strategy
                #use exploreProbs to sample moves
                if iter % 2 == playerNum:
                    exploreProbs = probs * (1 - gamma) + gamma / len(actions)
                else:
                    #we're the off player, so don't explore
                    exploreProbs = probs
            else:
                #everything is new/bad, so play randomly
                probs = np.array([1 / len(actions) for a in actions])
                exploreProbs = probs

            if len(initActions) > 0:
                #blindly pick the init action
                preAction = initActions[0].strip()
                #find a close-enough action in the list
                #the PS client will generate team preview actions that
                #are longer than what we expect, but we can just
                #assume that the equivalent action is a prefix
                bestActionIndex = 0
                while bestActionIndex < len(actions):
                    if preAction.startswith(actions[bestActionIndex].strip()):
                        break
                    bestActionIndex += 1
                bestAction = actions[bestActionIndex]
                initActions = initActions[1:]
            else:
                #pick an action based on exploreProbs
                bestActionIndex = np.random.choice(len(actions), p=exploreProbs)
                bestAction = actions[bestActionIndex]

            #save our action
            history.append(
                    (state, bestActionIndex, actions, probs, exploreProbs))
            if verbose:
                print('picked', cmdHeader + bestAction)
            await cmdQueue.put(cmdHeader + bestAction)

        elif request[0] == Game.END:
            running = False
            #map the reward from [-1,1] to [0,1]
            reward = (request[1] + 1) / 2
            #on player's contribution to the tail probability
            x = 1
            #on player's contribution to the sample probability
            q = 1
            while len(history) > 0:
                state, actionIndex, actions, probs, exploreProbs = history.pop()
                if iter % 2 == playerNum:
                    action = actions[actionIndex]
                    w = reward * x / q
                    p = probs[actionIndex]
                    ep = exploreProbs[actionIndex]
                    #update the picked action's regret
                    regret = regretTable[(state, action)]
                    if regScaling != 0:
                        regret *= ((iter + 1)**regScaling) / (
                                (iter + 1)**regScaling + 1)
                    regretTable[(state, action)] = regret + (1 - p) / ep * w
                    #update the other actions' regrets
                    for i in range(len(actions)):
                        if i == actionIndex:
                            continue
                        regret = regretTable[(state, actions[i])]
                        if regScaling != 0:
                            regret *= ((iter + 1)**regScaling) / (
                                    (iter + 1)**regScaling + 1)
                        if posReg:
                            regretTable[(state, actions[i])] = max(
                                    0, regret - p / ep * w)
                        else:
                            regretTable[(state, actions[i])] = regret - p / ep * w
                    x *= p
                    q *= ep
                else:
                    #update the off player's average strategy
                    probScale = ((iter + 1) / (iter + 2))**probScaling
                    for i in range(len(actions)):
                        oldProb = probTable[(state, actions[i])]
                        probTable[(state, actions[i])] = (
                                probScale * oldProb + probs[i])
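#Worked example of the outcome-sampling regret update above, with assumed
#values: the on player picked action a with strategy probability p = 0.5,
#sampled it with probability ep = 0.6, and the tail-weighted reward is
#w = reward * x / q = 0.8. The picked action is credited and every other
#action is debited by the same importance-weighted amount, then x and q
#shrink before moving to the next state up the stack.
p, ep, w = 0.5, 0.6, 0.8
regretPickedDelta = (1 - p) / ep * w   #+0.667 added to a's regret
regretOtherDelta = -(p / ep) * w       #-0.667 added to each other action
#then x *= p and q *= ep for the next (earlier) state in the history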