def getDefenderBaseline(aIds, aMap, aMix, game, dPool):
    expectedUtility = 0
    dOb, aOb = game.getEmptyObservations()
    undoClone = ssg.cloneGame(game)
    clone = ssg.cloneGame(game)
    editableAMix = aMix.copy()
    savedAction = None

    # Calculate the expected utility of attacker myopic play
    for agentIndex in range(len(aMap)):
        if aMix[agentIndex] > 0:
            aAgent = aMap[agentIndex]
            for timestep in range(game.timesteps):
                if timestep == 0:
                    # Calculate the best response against the entire mixed strategy and save it
                    savedAction = getBestResponseAction(ssg.DEFENDER, game, aMap, editableAMix, dPool, dOb, aOb)
                    # Play the defender best response and the attacker agent's action to obtain
                    # a set of observations
                    attackerAction = aAgent.getAction(game, aOb)
                    dOb, aOb, _, _ = game.performActions(savedAction, attackerAction, dOb, aOb)
                else:
                    # For each agent still in the mix:
                    for i in range(len(editableAMix)):
                        if editableAMix[i] > 0:
                            # Play that agent's action on the clone together with the saved defender action
                            attackerAction = aMap[i].getAction(clone, game.previousAttackerObservation)
                            dTestOb, _, _, _ = clone.performActions(savedAction, attackerAction, game.previousDefenderObservation, game.previousAttackerObservation)
                            # Compare the resulting defender observation with the real one. If they
                            # don't match, this agent could not have produced the observed play, so
                            # set its odds to 0 in the editable mix
                            if not np.array_equal(dTestOb, dOb):
                                editableAMix[i] = 0
                            # Reset clone back to undoClone
                            ssg.cloneGameState(clone, undoClone)
                    # Using the filtered mix, compute the best response and save it
                    editableAMix = [float(p) / sum(editableAMix) for p in editableAMix]
                    savedAction = getBestResponseAction(ssg.DEFENDER, game, aMap, editableAMix, dPool, dOb, aOb)
                    # Set clone and undoClone to the current game
                    ssg.cloneGameState(clone, game)
                    ssg.cloneGameState(undoClone, game)
                    # Perform the best response and agent action on the normal game
                    attackerAction = aAgent.getAction(game, aOb)
                    dOb, aOb, _, _ = game.performActions(savedAction, attackerAction, dOb, aOb)

            print(game.defenderUtility)
            expectedUtility += game.defenderUtility * aMix[agentIndex]

            # Reset the mix, the game, and both clones before the next attacker agent
            editableAMix = aMix.copy()
            game.restartGame()
            ssg.cloneGameState(clone, game)
            ssg.cloneGameState(undoClone, game)

    return expectedUtility
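# Illustrative sketch, not part of the solver above: the heart of getDefenderBaseline's
# else-branch is "zero out every attacker whose simulated play would have produced a
# different observation than the one actually seen, then renormalize the surviving
# weights." The helper below reproduces just that filtering/renormalization step on plain
# lists/arrays so it can be checked in isolation. The function name, the fallback when
# every weight gets filtered out, and the toy observations in the usage comment are all
# assumptions made for this example.
def _filterMixByObservation(mix, simulatedObs, realOb):
    import numpy as np  # local import so the sketch stays self-contained

    filtered = list(mix)
    for i, simOb in enumerate(simulatedObs):
        # An agent whose simulated observation disagrees with reality could not
        # have produced the observed play, so its weight is zeroed.
        if filtered[i] > 0 and not np.array_equal(simOb, realOb):
            filtered[i] = 0
    total = sum(filtered)
    if total == 0:
        return list(mix)  # nothing was consistent; fall back to the unfiltered mix
    return [float(p) / total for p in filtered]

# Example (with made-up observations):
#   _filterMixByObservation([0.5, 0.5], [np.zeros(3), np.ones(3)], np.ones(3))
#   -> [0.0, 1.0]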
def updatePayoutMatrix(newDefenderId, newAttackerId, payoutMatrix, dIds, aIds, dMap, aMap, game, newDOracle, newAOracle):
    # Evaluate the new defender oracle against every existing attacker pure strategy
    for aId in aIds:
        value = ssg.expectedPureVPure(newDOracle, aMap[aId], ssg.cloneGame(game))
        payoutMatrix[newDefenderId, aId] = value
    # Evaluate every existing defender pure strategy against the new attacker oracle
    for dId in dIds:
        value = ssg.expectedPureVPure(dMap[dId], newAOracle, ssg.cloneGame(game))
        payoutMatrix[dId, newAttackerId] = value
    # Evaluate the two new oracles against each other
    value = ssg.expectedPureVPure(newDOracle, newAOracle, ssg.cloneGame(game))
    payoutMatrix[newDefenderId, newAttackerId] = value

    # Register the new oracles and advance the id counters
    aIds.append(newAttackerId)
    dIds.append(newDefenderId)
    aMap[newAttackerId] = newAOracle
    dMap[newDefenderId] = newDOracle
    newDefenderId += 1
    newAttackerId += 1

    return newDefenderId, newAttackerId, payoutMatrix
def calculatePayoutMatrix(dIds, aIds, dMap, aMap, game):
    # Build the full payout matrix: the defender's expected utility for every
    # pairing of a pure defender strategy with a pure attacker strategy
    payoutMatrix = {}
    for attackerId in aIds:
        pureAttacker = aMap[attackerId]
        for defenderId in dIds:
            pureDefender = dMap[defenderId]
            value = ssg.expectedPureVPure(pureDefender, pureAttacker, ssg.cloneGame(game))
            payoutMatrix[defenderId, attackerId] = value
            game.restartGame()
    return payoutMatrix
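# A minimal sketch (not the project's own core-game solver) of how the payout matrix built
# above could be turned into a defender maximin mixed strategy with scipy's linprog. It
# only assumes that payoutMatrix[dId, aId] holds the defender's expected utility, as
# produced by calculatePayoutMatrix / updatePayoutMatrix; the function name, the use of
# scipy, and the returned (mix, value) pair are illustrative choices.
def _solveDefenderMaximin(payoutMatrix, dIds, aIds):
    import numpy as np
    from scipy.optimize import linprog

    # Variables: one probability per defender pure strategy, plus the game value v.
    numD = len(dIds)
    c = np.zeros(numD + 1)
    c[-1] = -1.0  # linprog minimizes, so minimize -v to maximize v

    # For every attacker pure strategy a: v - sum_d x_d * U(d, a) <= 0
    A_ub = np.zeros((len(aIds), numD + 1))
    for row, aId in enumerate(aIds):
        for col, dId in enumerate(dIds):
            A_ub[row, col] = -payoutMatrix[dId, aId]
        A_ub[row, -1] = 1.0
    b_ub = np.zeros(len(aIds))

    # Probabilities sum to one; v is unbounded, probabilities are nonnegative.
    A_eq = np.zeros((1, numD + 1))
    A_eq[0, :numD] = 1.0
    b_eq = np.array([1.0])
    bounds = [(0, None)] * numD + [(None, None)]

    result = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq, bounds=bounds)
    dMix = result.x[:numD]
    gameValue = result.x[-1]
    return dMix, gameValue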
def attackerTrain(oracleToTrain, dIds, dMap, dMix, game, aPool, N=300, batchSize=30, C=50, epochs=100, optimizer=None, lossFunction=nn.MSELoss(), showOutput=False, trainingTest=False, writer=None):
    if optimizer is None:
        optimizer = optim.Adam(oracleToTrain.parameters(), lr=0.00001)
    optim.lr_scheduler.ReduceLROnPlateau(optimizer)

    gameClone = ssg.cloneGame(game)
    if trainingTest:
        history = []
        lossHistory = []

    # Initialize the replay memory with limited capacity N
    replayMemory = ReplayMemory(N)

    # Initialize the target network with weights equal to the oracle to train
    targetNetwork = AttackerOracle(oracleToTrain.targetNum)
    targetNetwork.setState(oracleToTrain.getState())

    # An epoch is one iteration over all training data. In our case, that's the one
    # game we're learning on.
    step = 0
    for epoch in range(0, epochs):
        print(f"epoch {epoch} of {epochs}")

        # Initialize the starting values for the game and sample a defender pure
        # strategy from the defender mixed strategy
        dOb, aOb = game.getEmptyObservations()
        defenderAgent = dMap[np.random.choice(dIds, 1, p=dMix)[0]]

        # Play a full game
        for timestep in range(game.timesteps):
            # Choose an action based off of the Q network (oracle to train)
            dAction = defenderAgent.getAction(game, dOb)
            aAction = oracleToTrain.getAction(game, aOb)

            if trainingTest:
                writer.writerow([
                    f"{(timestep + 1) + (game.timesteps * epoch)}",
                    f"{game.getValidActions(ssg.ATTACKER)}",
                    f"{[oracleToTrain.forward(game.previousAttackerObservation, aOb, game.previousAttackerAction, x).item() for x in game.getValidActions(ssg.ATTACKER)]}",
                    f"{aAction}",
                    f"{dAction}",
                ])

            # Execute that action and store the result in replay memory
            ob0 = game.previousAttackerObservation
            action0 = game.previousAttackerAction
            ob1 = aOb
            action1 = aAction
            dOb, aOb, dScore, aScore = game.performActions(dAction, aAction, dOb, aOb)
            replayMemory.push(ob0, action0, ob1, action1, aScore, dOb, game.getValidActions(ssg.ATTACKER))

            # Sample a random minibatch of transitions from replay memory and take
            # one gradient step on the oracle
            avgLoss = sampleMinibatch(replayMemory, game, targetNetwork, oracleToTrain, lossFunction, optimizer, timestep, batchSize=batchSize)

            if trainingTest:
                oracleScore = ssg.expectedPureVMix(ssg.ATTACKER, oracleToTrain, dMap, dMix, gameClone)
                history.append(oracleScore)
                lossHistory.append(avgLoss / batchSize)

            # Every C steps, set Q^ = Q (copy the online weights into the target network)
            step += 1
            if step == C:
                targetNetwork.setState(oracleToTrain.getState())
                step = 0

        game.restartGame()

    if trainingTest:
        return history, lossHistory
    return ssg.expectedPureVMix(ssg.ATTACKER, oracleToTrain, dMap, dMix, gameClone)
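# A minimal sketch of the kind of update sampleMinibatch (called above) is expected to
# perform: for each stored transition, build a Bellman target from the target network
# (reward plus the discounted best Q-value over the next state's valid actions), compare
# it with the trained oracle's Q-value for the action actually taken, and take one
# optimizer step on the summed loss. The transition layout mirrors what replayMemory.push
# stores in attackerTrain; the replayMemory.sample(batchSize) method, the gamma argument,
# the terminal-state fallback, and the returned summed loss are assumptions made for this
# illustration, not the project's actual sampleMinibatch implementation.
def _dqnMinibatchSketch(replayMemory, targetNetwork, oracleToTrain, lossFunction, optimizer, batchSize, gamma=1.0):
    import torch  # local import so the sketch stays self-contained

    transitions = replayMemory.sample(batchSize)  # assumed to return a list of stored tuples
    if not transitions:
        return 0.0

    optimizer.zero_grad()
    totalLoss = 0.0
    for ob0, action0, ob1, action1, reward, nextOb, validNextActions in transitions:
        # Q-value the oracle currently assigns to the action that was actually taken
        predicted = oracleToTrain.forward(ob0, ob1, action0, action1)

        # Bellman target from the (frozen) target network
        with torch.no_grad():
            if len(validNextActions) > 0:
                bestNextQ = max(targetNetwork.forward(ob1, nextOb, action1, a).item()
                                for a in validNextActions)
            else:
                bestNextQ = 0.0  # no valid continuation: treat as terminal
        target = torch.full_like(predicted, float(reward) + gamma * bestNextQ)

        totalLoss = totalLoss + lossFunction(predicted, target)

    totalLoss.backward()
    optimizer.step()
    return totalLoss.item()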