def __call__(self, parameters):
    """Train (or restore) the NN model checkpoint-by-checkpoint.

    For every index in ``self.trainIntervelIndexes`` the model is advanced to
    ``index * self.trainStepsIntervel`` training steps; when a checkpoint for
    that step count already exists on disk it is restored instead of being
    retrained, so interrupted runs resume cheaply.  The freshly initialised
    model is always saved as the step-0 checkpoint first.

    parameters: dict holding at least 'miniBatchSize' and 'learningRate'.
        The caller's dict is left untouched: a local shallow copy carries the
        varying 'trainSteps' entry used to build checkpoint paths (the
        previous version mutated the caller's dict in place).
    """
    print(parameters)
    miniBatchSize = parameters['miniBatchSize']
    learningRate = parameters['learningRate']
    model = self.NNModel
    train = self.getTrain(miniBatchSize, learningRate)

    # Shallow copy so 'trainSteps' bookkeeping does not leak to the caller.
    pathParameters = dict(parameters)
    pathParameters['trainSteps'] = 0
    modelSavePath = self.getModelSavePath(pathParameters)
    saveVariables(model, modelSavePath)

    for trainIntervelIndex in self.trainIntervelIndexes:
        pathParameters['trainSteps'] = trainIntervelIndex * self.trainStepsIntervel
        modelSavePath = self.getModelSavePath(pathParameters)
        # Checkpoints are stored as several files; the '.index' file is used
        # as the existence marker for a completed save.
        if not os.path.isfile(modelSavePath + '.index'):
            trainedModel = train(model, self.trainData)
            saveVariables(trainedModel, modelSavePath)
        else:
            trainedModel = restoreVariables(model, modelSavePath)
        # Continue the next interval from the latest checkpointed model.
        model = trainedModel
def __call__(self, parameters):
    """Sample evaluation trajectories for pretrained MADDPG wolves chasing sheep.

    Builds the multi-agent chasing MDP, restores pretrained sheep and wolf
    MADDPG models from ``data/preTrainModel``, rolls out
    ``self.numTrajectories`` trajectories under the wolf reward, saves them
    via ``self.saveTrajectoryByParameters`` and optionally renders them.

    parameters: dict with keys 'numWolves', 'numSheep',
        'wolfType' ('sharedReward' | 'individualReward') and
        'sheepConcern' ('selfSheep' | 'allSheep').
    """
    print(parameters)
    visualizeTraj = False
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    wolfType = parameters['wolfType']
    sheepConcern = parameters['sheepConcern']

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numWolves + numSheep))
    possibleWolvesIds = wolvesID
    possibleSheepIds = sheepsID
    numAgents = numWolves + numSheep
    # NOTE(review): keeps numWolves + numBlocks == 5 -- confirm this coupling is intended.
    numBlocks = 5 - numWolves
    blocksID = list(range(numAgents, numAgents + numBlocks))
    numEntities = numAgents + numBlocks

    # Per-entity physical properties, ordered wolves, then sheep, then blocks.
    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2
    sheepMaxSpeed = 1.3 * 1
    wolfMaxSpeed = 1.0 * 1
    blockMaxSpeed = None  # blocks never move
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [blockSize] * numBlocks
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # Physics / transition pipeline.
    reshapeActionInTransit = lambda action: action  # identity: actions already in env format
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList,
                                          getCollisionForce, getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeActionInTransit, applyActionForce,
                                       applyEnvironForce, integrateState)

    # Rewards: wolves +1 per catch, sheep -1; only the wolf reward feeds the
    # trajectory sampler below (rewardSheep is built but not sampled from).
    isCollision = IsCollision(getPosFromAgentState)
    collisonRewardWolf = 1
    punishForOutOfBoundForWolf = lambda stata: 0  # wolves not punished for leaving bounds
    rewardWolf = RewardCentralControlPunishBond(wolvesID, sheepsID, entitiesSizeList,
                                                getPosFromAgentState, isCollision,
                                                punishForOutOfBoundForWolf, collisonRewardWolf)
    collisonRewardSheep = -1
    punishForOutOfBoundForSheep = PunishForOutOfBound()
    rewardSheep = RewardCentralControlPunishBond(sheepsID, wolvesID,
                                                 entitiesSizeList, getPosFromAgentState, isCollision,
                                                 punishForOutOfBoundForSheep, collisonRewardSheep)

    forwardOneStep = ForwardOneStep(transit, rewardWolf)
    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    isTerminal = lambda state: False  # fixed-length episodes
    maxRunningSteps = 101
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset, forwardOneStep)

    ## MDP Policy
    worldDim = 2
    actionDim = worldDim * 2 + 1  # +/- per axis plus a no-op dimension
    layerWidth = [64 * (numWolves - 1), 64 * (numWolves - 1)]

    # Sheep Part
    # ------------ model ------------------------
    # 'selfSheep' -> each sheep model observes a single sheep;
    # 'allSheep'  -> the model observes all sheep.
    if sheepConcern == 'selfSheep':
        sheepConcernSelfOnly = 1
    if sheepConcern == 'allSheep':
        sheepConcernSelfOnly = 0
    # NOTE(review): any other sheepConcern value leaves sheepConcernSelfOnly
    # unbound and raises NameError on the next line -- confirm inputs are validated upstream.
    numSheepToObserveWhenSheepSameOrDiff = [numSheep, 1]
    numSheepToObserve = numSheepToObserveWhenSheepSameOrDiff[sheepConcernSelfOnly]
    print(numSheepToObserve)

    # Restore sheep models trained against both wolf reward schemes and pool them.
    sheepModelListOfDiffWolfReward = []
    sheepType = 'mixed'
    if sheepType == 'mixed':
        sheepPrefixList = ['maddpgIndividWolf', 'maddpg']
    else:
        sheepPrefixList = [sheepType]
    for sheepPrefix in sheepPrefixList:
        wolvesIDForSheepObserve = list(range(numWolves))
        sheepsIDForSheepObserve = list(range(numWolves, numSheepToObserve + numWolves))
        blocksIDForSheepObserve = list(range(numSheepToObserve + numWolves,
                                             numSheepToObserve + numWolves + numBlocks))
        observeOneAgentForSheep = lambda agentID: Observe(agentID, wolvesIDForSheepObserve,
                                                          sheepsIDForSheepObserve, blocksIDForSheepObserve,
                                                          getPosFromAgentState, getVelFromAgentState)
        observeSheep = lambda state: [observeOneAgentForSheep(agentID)(state)
                                      for agentID in range(numWolves + numSheepToObserve)]
        obsIDsForSheep = wolvesIDForSheepObserve + sheepsIDForSheepObserve + blocksIDForSheepObserve
        # Probe one reset state to size each agent's observation vector.
        initObsForSheepParams = observeSheep(reset()[obsIDsForSheep])
        obsShapeSheep = [initObsForSheepParams[obsID].shape[0]
                         for obsID in range(len(initObsForSheepParams))]
        buildSheepModels = BuildMADDPGModels(actionDim, numWolves + numSheepToObserve, obsShapeSheep)
        sheepModelsList = [buildSheepModels(layerWidth, agentID)
                           for agentID in range(numWolves, numWolves + numSheepToObserve)]
        dirName = os.path.dirname(__file__)
        maxEpisode = 60000
        print(sheepPrefix)
        sheepFileName = "{}wolves{}sheep{}blocks{}eps_agent".format(numWolves, numSheepToObserve,
                                                                   numBlocks, maxEpisode)
        sheepModelPaths = [os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                                        sheepPrefix + sheepFileName + str(i) + '60000eps')
                           for i in range(numWolves, numWolves + numSheepToObserve)]
        [restoreVariables(model, path) for model, path in zip(sheepModelsList, sheepModelPaths)]
        sheepModelListOfDiffWolfReward = sheepModelListOfDiffWolfReward + sheepModelsList

    # Sheep Policy Function
    reshapeAction = ReshapeAction()
    actOneStepOneModelSheep = ActOneStep(actByPolicyTrainNoisy)

    # Sheep Generate Action
    numAllSheepModels = len(sheepModelListOfDiffWolfReward)

    # Wolves Part
    # ------------ model ------------------------
    wolvesIDForWolfObserve = list(range(numWolves))
    sheepsIDForWolfObserve = list(range(numWolves, numSheep + numWolves))
    blocksIDForWolfObserve = list(range(numSheep + numWolves, numSheep + numWolves + numBlocks))
    observeOneAgentForWolf = lambda agentID: Observe(agentID, wolvesIDForWolfObserve,
                                                     sheepsIDForWolfObserve, blocksIDForWolfObserve,
                                                     getPosFromAgentState, getVelFromAgentState)
    observeWolf = lambda state: [observeOneAgentForWolf(agentID)(state)
                                 for agentID in range(numWolves + numSheep)]
    obsIDsForWolf = wolvesIDForWolfObserve + sheepsIDForWolfObserve + blocksIDForWolfObserve
    initObsForWolfParams = observeWolf(reset()[obsIDsForWolf])
    obsShapeWolf = [initObsForWolfParams[obsID].shape[0]
                    for obsID in range(len(initObsForWolfParams))]
    buildWolfModels = BuildMADDPGModels(actionDim, numWolves + numSheep, obsShapeWolf)
    layerWidthForWolf = [64 * (numWolves - 1), 64 * (numWolves - 1)]
    wolfModelsList = [buildWolfModels(layerWidthForWolf, agentID) for agentID in range(numWolves)]

    # Model file prefix encodes the wolf reward scheme.
    # NOTE(review): an unknown wolfType leaves `prefix` unbound (NameError below).
    if wolfType == 'sharedReward':
        prefix = 'maddpg'
    if wolfType == 'individualReward':
        prefix = 'maddpgIndividWolf'
    wolfFileName = "{}wolves{}sheep{}blocks{}eps_agent".format(numWolves,
                                                              numSheep, numBlocks, maxEpisode)
    wolfModelPaths = [os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                                   prefix + wolfFileName + str(i) + '60000eps')
                      for i in range(numWolves)]
    print(numWolves, obsShapeWolf, wolfModelPaths)
    [restoreVariables(model, path) for model, path in zip(wolfModelsList, wolfModelPaths)]

    # Wolf policy: deterministic net output pushed through a small fixed-cov
    # Gaussian, then sampled.
    actionDimReshaped = 2
    cov = [0.03 ** 2 for _ in range(actionDimReshaped)]
    buildGaussian = BuildGaussianFixCov(cov)
    actOneStepOneModelWolf = ActOneStep(actByPolicyTrainNoNoisy)
    composeWolfPolicy = lambda wolfModel: lambda state: sampleFromContinuousSpace(buildGaussian(
        tuple(reshapeAction(actOneStepOneModelWolf(wolfModel, observeWolf(state))))))
    # Alternative (noisy-policy) wolf variant kept for reference:
    # actOneStepOneModelWolf = ActOneStep(actByPolicyTrainNoisy)
    # composeWolfPolicy = lambda wolfModel: lambda state: tuple(reshapeAction(actOneStepOneModelSheep(wolfModel, observeWolf(state))))
    wolvesSampleActions = [composeWolfPolicy(wolfModel) for wolfModel in wolfModelsList]

    trajectories = []
    for trajectoryId in range(self.numTrajectories):
        # Each sheep plays a model drawn uniformly at random from the pool.
        sheepModelsForPolicy = [sheepModelListOfDiffWolfReward[np.random.choice(numAllSheepModels)]
                                for sheepId in possibleSheepIds]
        if sheepConcernSelfOnly:
            # Wrap the deterministic sheep action as a degenerate distribution.
            composeSheepPolicy = lambda sheepModel: lambda state: {
                tuple(reshapeAction(actOneStepOneModelSheep(sheepModel, observeSheep(state)))): 1}
            sheepChooseActionMethod = sampleFromDistribution
            sheepSampleActions = [SampleActionOnFixedIntention(selfId, possibleWolvesIds,
                                                               composeSheepPolicy(sheepModel),
                                                               sheepChooseActionMethod, blocksID)
                                  for selfId, sheepModel in zip(possibleSheepIds, sheepModelsForPolicy)]
        else:
            composeSheepPolicy = lambda sheepModel: lambda state: tuple(
                reshapeAction(actOneStepOneModelSheep(sheepModel, observeSheep(state))))
            sheepSampleActions = [composeSheepPolicy(sheepModel)
                                  for sheepModel in sheepModelsForPolicy]
        allIndividualSampleActions = wolvesSampleActions + sheepSampleActions
        sampleAction = lambda state: [sampleIndividualAction(state)
                                      for sampleIndividualAction in allIndividualSampleActions]
        trajectory = sampleTrajectory(sampleAction)
        trajectories.append(trajectory)

    trajectoryFixedParameters = {'maxRunningSteps': maxRunningSteps}
    self.saveTrajectoryByParameters(trajectories, trajectoryFixedParameters, parameters)
    print(np.mean([len(tra) for tra in trajectories]))

    # visualize
    if visualizeTraj:
        wolfColor = np.array([0.85, 0.35, 0.35])
        sheepColor = np.array([0.35, 0.85, 0.35])
        blockColor = np.array([0.25, 0.25, 0.25])
        entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheep + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents, getPosFromAgentState)
        trajToRender = np.concatenate(trajectories)
        render(trajToRender)
def __call__(self, parameters):
    """Sample trajectories where one wolf is perturbed toward a fixed sheep goal.

    The unperturbed wolves plan through an imagined-we intention-inference
    pipeline; the wolf at ``perturbedWolfID`` is replaced with a pretrained
    model that only observes sheep ``perturbedWolfGoalID``.  Each sampled
    trajectory is saved together with the per-step intention posteriors via
    ``self.saveTrajectoryByParameters``.

    parameters: dict with keys 'numWolves', 'numSheep', 'wolfType',
        'perturbedWolfID' and 'perturbedWolfGoalID'.
    """
    print(parameters)
    # Inference/planning hyper-parameters.
    valuePriorEndTime = -100
    deviationFor2DAction = 1.0
    rationalityBetaInInference = 1.0
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    wolfType = parameters['wolfType']
    # Shared-agency wolves use the non-selfish (shared) reward variant.
    wolfSelfish = 0.0 if wolfType == 'sharedAgencyBySharedRewardWolf' else 1.0
    perturbedWolfID = parameters['perturbedWolfID']
    perturbedWolfGoalID = parameters['perturbedWolfGoalID']

    ## MDP Env
    numBlocks = 2
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numWolves + numSheep))
    blocksID = list(range(numAgents, numEntities))

    # Per-entity physical properties, ordered wolves, then sheep, then blocks.
    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [blockSize] * numBlocks
    costActionRatio = 0.0
    sheepSpeedMultiplier = 1.0
    sheepMaxSpeed = 1.3 * sheepSpeedMultiplier
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None  # blocks never move
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 1  # for evaluation, count # of bites
    isCollision = IsCollision(getPosFromAgentState)
    rewardAllWolves = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision,
                                 collisionReward, wolfSelfish)
    # Scalar group reward: sum of the individual wolf rewards.
    rewardWolf = lambda state, action, nextState: np.sum(rewardAllWolves(state, action, nextState))

    # Physics / transition pipeline.
    reshapeActionInTransit = lambda action: action  # identity: actions already in env format
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList,
                                          getCollisionForce, getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeActionInTransit, applyActionForce,
                                       applyEnvironForce, integrateState)
    forwardOneStep = ForwardOneStep(transit, rewardWolf)
    reset = ResetMultiAgentChasingWithSeed(numAgents, numBlocks)
    isTerminal = lambda state: False  # fixed-length episodes
    maxRunningStepsToSample = 101
    sampleTrajectory = SampleTrajectory(maxRunningStepsToSample, isTerminal, reset, forwardOneStep)

    ## MDP Policy
    worldDim = 2
    actionDim = worldDim * 2 + 1  # +/- per axis plus a no-op dimension
    layerWidth = [128, 128]
    maxTimeStep = 75
    maxEpisode = 60000
    dirName = os.path.dirname(__file__)

    # ------------ sheep recover variables ------------------------
    # Restore sheep models trained against both selfish and shared wolves
    # (sheepType 0.0 / 1.0 is baked into the checkpoint file name).
    numSheepToObserve = 1
    sheepModelListOfDiffWolfReward = []
    sheepTypeList = [0.0, 1.0]
    for sheepType in sheepTypeList:
        wolvesIDForSheepObserve = list(range(numWolves))
        sheepsIDForSheepObserve = list(range(numWolves, numSheepToObserve + numWolves))
        blocksIDForSheepObserve = list(
            range(numSheepToObserve + numWolves, numSheepToObserve + numWolves + numBlocks))
        observeOneAgentForSheep = lambda agentID: Observe(agentID, wolvesIDForSheepObserve,
                                                          sheepsIDForSheepObserve, blocksIDForSheepObserve,
                                                          getPosFromAgentState, getVelFromAgentState)
        observeSheep = lambda state: [observeOneAgentForSheep(agentID)(state)
                                      for agentID in range(numWolves + numSheepToObserve)]
        obsIDsForSheep = wolvesIDForSheepObserve + sheepsIDForSheepObserve + blocksIDForSheepObserve
        # Probe one reset state to size each agent's observation vector.
        initObsForSheepParams = observeSheep(reset()[obsIDsForSheep])
        obsShapeSheep = [initObsForSheepParams[obsID].shape[0]
                         for obsID in range(len(initObsForSheepParams))]
        buildSheepModels = BuildMADDPGModels(actionDim, numWolves + numSheepToObserve, obsShapeSheep)
        sheepModelsList = [buildSheepModels(layerWidth, agentID)
                           for agentID in range(numWolves, numWolves + numSheepToObserve)]
        sheepFileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}WolfActCost{}individ{}_agent".format(
            numWolves, numSheepToObserve, numBlocks, maxEpisode, maxTimeStep,
            sheepSpeedMultiplier, costActionRatio, sheepType)
        sheepModelPaths = [os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                                        sheepFileName + str(i))
                           for i in range(numWolves, numWolves + numSheepToObserve)]
        [restoreVariables(model, path) for model, path in zip(sheepModelsList, sheepModelPaths)]
        sheepModelListOfDiffWolfReward = sheepModelListOfDiffWolfReward + sheepModelsList

    actOneStep = ActOneStep(actByPolicyTrainNoNoisy)
    numAllSheepModels = len(sheepModelListOfDiffWolfReward)

    # ------------ recover variables for "we" ------------------------
    # The imagined "we" is all wolves plus a single (intended) sheep.
    numAgentsInWe = numWolves
    numSheepInWe = 1
    numBlocksForWe = numBlocks
    wolvesIDForWolfObserve = list(range(numAgentsInWe))
    sheepsIDForWolfObserve = list(range(numAgentsInWe, numSheepInWe + numAgentsInWe))
    blocksIDForWolfObserve = list(
        range(numSheepInWe + numAgentsInWe, numSheepInWe + numAgentsInWe + numBlocksForWe))
    observeOneAgentForWolf = lambda agentID: Observe(agentID, wolvesIDForWolfObserve,
                                                     sheepsIDForWolfObserve, blocksIDForWolfObserve,
                                                     getPosFromAgentState, getVelFromAgentState)
    observeWolf = lambda state: [observeOneAgentForWolf(agentID)(state)
                                 for agentID in range(numAgentsInWe + numSheepInWe)]
    obsIDsForWolf = wolvesIDForWolfObserve + sheepsIDForWolfObserve + blocksIDForWolfObserve
    initObsForWolfParams = observeWolf(reset()[obsIDsForWolf])
    obsShapeWolf = [initObsForWolfParams[obsID].shape[0]
                    for obsID in range(len(initObsForWolfParams))]
    buildWolfModels = BuildMADDPGModels(actionDim, numAgentsInWe + numSheepInWe, obsShapeWolf)
    layerWidthForWolf = [128, 128]
    wolfModelsList = [buildWolfModels(layerWidthForWolf, agentID) for agentID in range(numAgentsInWe)]
    wolfFileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}WolfActCost{}individ{}_agent".format(
        numWolves, numSheepInWe, numBlocks, maxEpisode, maxTimeStep,
        sheepSpeedMultiplier, costActionRatio, wolfSelfish)
    wolfModelPaths = [os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                                   wolfFileName + str(i))
                      for i in range(numAgentsInWe)]
    [restoreVariables(model, path) for model, path in zip(wolfModelsList, wolfModelPaths)]

    # ------------ compose wolves policy no perturbation ------------------------
    actionDimReshaped = 2
    cov = [deviationFor2DAction ** 2 for _ in range(actionDimReshaped)]  # unit variance per axis
    buildGaussian = BuildGaussianFixCov(cov)
    actOneStep = ActOneStep(actByPolicyTrainNoNoisy)
    reshapeAction = ReshapeAction()
    composeCentralControlPolicy = lambda observe: ComposeCentralControlPolicyByGaussianOnDeterministicAction(
        reshapeAction, observe, actOneStep, buildGaussian)
    # Input state, return a list of Gaussian distributions (cov above).
    wolvesCentralControlPolicy = [composeCentralControlPolicy(observeWolf)(wolfModelsList, numAgentsInWe)]

    softPolicyInInference = lambda distribution: distribution  # identity: no softening in inference
    getStateThirdPersonPerspective = lambda state, goalId, weIds: getStateOrActionThirdPersonPerspective(
        state, goalId, weIds, blocksID)
    policyForCommittedAgentsInInference = PolicyForCommittedAgent(wolvesCentralControlPolicy,
                                                                 softPolicyInInference,
                                                                 getStateThirdPersonPerspective)
    concernedAgentsIds = wolvesID
    calCommittedAgentsPolicyLikelihood = CalCommittedAgentsContinuousPolicyLikelihood(
        concernedAgentsIds, policyForCommittedAgentsInInference, rationalityBetaInInference)

    randomActionSpace = [(5, 0), (3.5, 3.5), (0, 5), (-3.5, 3.5), (-5, 0),
                         (-3.5, -3.5), (0, -5), (3.5, -3.5), (0, 0)]
    randomPolicy = RandomPolicy(randomActionSpace)
    getStateFirstPersonPerspective = lambda state, goalId, weIds, selfId: getStateOrActionFirstPersonPerspective(
        state, goalId, weIds, selfId, blocksID)
    # Uncommitted agents act with the uniform random policy.
    policyForUncommittedAgentsInInference = PolicyForUncommittedAgent(wolvesID, randomPolicy,
                                                                     softPolicyInInference,
                                                                     getStateFirstPersonPerspective)
    # All wolves are concerned here, so this likelihood factor is constant 1.
    calUncommittedAgentsPolicyLikelihood = CalUncommittedAgentsPolicyLikelihood(
        wolvesID, concernedAgentsIds, policyForUncommittedAgentsInInference)

    # Joint Likelihood: committed factor times (constant) uncommitted factor.
    calJointLikelihood = lambda intention, state, perceivedAction: calCommittedAgentsPolicyLikelihood(intention, state, perceivedAction) * \
        calUncommittedAgentsPolicyLikelihood(intention, state, perceivedAction)

    # ------------ wolves intention ------------------------
    # Each intention is (goal sheep id, tuple of all wolf ids), e.g.
    # ((3, (0, 1, 2)), (4, (0, 1, 2)), ...).
    intentionSpacesForAllWolves = [tuple(it.product(sheepsID, [tuple(wolvesID)]))
                                   for wolfId in wolvesID]
    print('intentionSpacesForAllWolves', intentionSpacesForAllWolves)
    # Uniform prior over each wolf's intention space.
    wolvesIntentionPriors = [
        {tuple(intention): 1 / len(allPossibleIntentionsOneWolf) for intention in allPossibleIntentionsOneWolf}
        for allPossibleIntentionsOneWolf in intentionSpacesForAllWolves]
    perceptSelfAction = SampleNoisyAction(deviationFor2DAction)
    perceptOtherAction = SampleNoisyAction(deviationFor2DAction)
    perceptAction = PerceptImaginedWeAction(wolvesID, perceptSelfAction,
                                            perceptOtherAction)  # input self, others action

    # Infer and update Intention
    variablesForAllWolves = [[intentionSpace] for intentionSpace in intentionSpacesForAllWolves]
    jointHypothesisSpaces = [pd.MultiIndex.from_product(variables, names=['intention'])
                             for variables in variablesForAllWolves]
    concernedHypothesisVariable = ['intention']
    priorDecayRate = 1  # decay of 1 leaves the prior unchanged between steps
    softPrior = SoftDistribution(priorDecayRate)
    inferIntentionOneStepList = [InferOneStep(jointHypothesisSpace, concernedHypothesisVariable,
                                              calJointLikelihood, softPrior)
                                 for jointHypothesisSpace in jointHypothesisSpaces]
    if numSheep == 1:
        # With one sheep the intention is fixed; skip inference.
        # NOTE(review): hardcoded 3 -- presumably should be numWolves; confirm
        # this experiment only runs with 3 wolves.
        inferIntentionOneStepList = [lambda prior, state, action: prior] * 3
    adjustIntentionPriorGivenValueOfState = lambda state: 1  # no state-value adjustment
    chooseIntention = sampleFromDistribution
    updateIntentions = [UpdateIntention(intentionPrior, valuePriorEndTime,
                                        adjustIntentionPriorGivenValueOfState,
                                        perceptAction, inferIntentionOneStep, chooseIntention)
                        for intentionPrior, inferIntentionOneStep in
                        zip(wolvesIntentionPriors, inferIntentionOneStepList)]

    # reset intention and adjust intention prior attributes tools for multiple trajectory
    intentionResetAttributes = ['timeStep', 'lastState', 'lastAction', 'intentionPrior',
                                'formerIntentionPriors']
    intentionResetAttributeValues = [
        dict(zip(intentionResetAttributes, [0, None, None, intentionPrior, [intentionPrior]]))
        for intentionPrior in wolvesIntentionPriors]
    resetIntentions = ResetObjects(intentionResetAttributeValues, updateIntentions)
    returnAttributes = ['formerIntentionPriors']
    # NOTE(review): [1:] skips wolf 0's intention history -- confirm intended.
    getIntentionDistributions = GetObjectsValuesOfAttributes(returnAttributes, updateIntentions[1:])
    attributesToRecord = ['lastAction']
    recordActionForUpdateIntention = RecordValuesForObjects(attributesToRecord, updateIntentions)

    # Wovels Generate Action
    # TODO: near-zero planning covariance makes planning effectively deterministic.
    covForPlanning = [0.00000001 for _ in range(actionDimReshaped)]
    # covForPlanning = [0.03 ** 2 for _ in range(actionDimReshaped)]
    buildGaussianForPlanning = BuildGaussianFixCov(covForPlanning)
    composeCentralControlPolicyForPlanning = lambda \
        observe: ComposeCentralControlPolicyByGaussianOnDeterministicAction(reshapeAction, observe,
                                                                            actOneStep,
                                                                            buildGaussianForPlanning)
    wolvesCentralControlPoliciesForPlanning = [
        composeCentralControlPolicyForPlanning(observeWolf)(wolfModelsList, numAgentsInWe)]
    # Index 0 holds the policy for this experiment's "we" size.
    centralControlPolicyListBasedOnNumAgentsInWeForPlanning = wolvesCentralControlPoliciesForPlanning

    softPolicyInPlanning = lambda distribution: distribution  # identity: no softening in planning
    policyForCommittedAgentInPlanning = PolicyForCommittedAgent(
        centralControlPolicyListBasedOnNumAgentsInWeForPlanning, softPolicyInPlanning,
        getStateThirdPersonPerspective)
    policyForUncommittedAgentInPlanning = PolicyForUncommittedAgent(wolvesID, randomPolicy,
                                                                   softPolicyInPlanning,
                                                                   getStateFirstPersonPerspective)

    def wolfChooseActionMethod(individualContinuousDistributions):
        # Sample one concrete action per agent from the joint list of
        # per-agent continuous distributions.
        centralControlAction = tuple(
            [tuple(sampleFromContinuousSpace(distribution))
             for distribution in individualContinuousDistributions])
        return centralControlAction

    getSelfActionIDInThirdPersonPerspective = lambda weIds, selfId: list(weIds).index(selfId)
    chooseCommittedAction = GetActionFromJointActionDistribution(
        wolfChooseActionMethod, getSelfActionIDInThirdPersonPerspective)
    chooseUncommittedAction = sampleFromDistribution
    wolvesSampleIndividualActionGivenIntentionList = [
        SampleIndividualActionGivenIntention(selfId, policyForCommittedAgentInPlanning,
                                             policyForUncommittedAgentInPlanning,
                                             chooseCommittedAction, chooseUncommittedAction)
        for selfId in wolvesID]

    # ------------------- recover one wolf model that only concerns sheep 0 -------------------
    numSheepForPerturbedWolf = 1
    wolvesIDForPerturbedWolf = wolvesID
    sheepsIDForPerturbedWolf = [sheepsID[perturbedWolfGoalID]]
    blocksIDForPerturbedWolf = list(range(numWolves + numSheep, numEntities))  # skip the unattended sheep id
    observeOneAgentForPerturbedWolf = lambda agentID: Observe(agentID, wolvesIDForPerturbedWolf,
                                                              sheepsIDForPerturbedWolf,
                                                              blocksIDForPerturbedWolf,
                                                              getPosFromAgentState, getVelFromAgentState)
    observePerturbedWolf = lambda state: [observeOneAgentForPerturbedWolf(agentID)(state)
                                          for agentID in
                                          wolvesIDForPerturbedWolf + sheepsIDForPerturbedWolf]
    initObsForPerturbedWolfParams = observePerturbedWolf(reset())
    obsShapePerturbedWolf = [initObsForPerturbedWolfParams[obsID].shape[0]
                             for obsID in range(len(initObsForPerturbedWolfParams))]
    buildPerturbedWolfModels = BuildMADDPGModels(actionDim, numWolves + numSheepForPerturbedWolf,
                                                 obsShapePerturbedWolf)
    layerWidthForWolf = [128, 128]
    perturbedWolfModel = buildPerturbedWolfModels(layerWidthForWolf, perturbedWolfID)
    perturbedWolfFileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}WolfActCost{}individ{}_agent".format(
        numWolves, numSheepForPerturbedWolf, numBlocks, maxEpisode, maxTimeStep,
        sheepSpeedMultiplier, costActionRatio, wolfSelfish)
    perturbedWolfModelPath = os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                                          perturbedWolfFileName + str(perturbedWolfID))
    restoreVariables(perturbedWolfModel, perturbedWolfModelPath)

    # ------------------- Sample and Save Trajectory -------------------
    wolvesSampleActions = [
        SampleActionOnChangableIntention(updateIntention, wolvesSampleIndividualActionGivenIntention)
        for updateIntention, wolvesSampleIndividualActionGivenIntention in
        zip(updateIntentions, wolvesSampleIndividualActionGivenIntentionList)]
    perturbedWolfSampleActions = lambda state: tuple(
        reshapeAction(actOneStep(perturbedWolfModel, observePerturbedWolf(state))))
    # NOTE(review): `.copy()` is commented out, so this is an alias of
    # wolvesSampleActions; the assignment below also mutates the original
    # list and the "unperturbed" action list built in the loop contains the
    # perturbed wolf as well -- likely a bug; confirm before reuse.
    wolvesSampleActionsPerturbed = wolvesSampleActions#.copy()
    wolvesSampleActionsPerturbed[perturbedWolfID] = perturbedWolfSampleActions

    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        # Each sheep plays a model drawn uniformly at random from the pool.
        sheepModelsForPolicy = [sheepModelListOfDiffWolfReward[np.random.choice(numAllSheepModels)]
                                for sheepId in sheepsID]
        # Wrap the deterministic sheep action as a degenerate distribution.
        composeSheepPolicy = lambda sheepModel: lambda state: {
            tuple(reshapeAction(actOneStep(sheepModel, observeSheep(state)))): 1}
        sheepChooseActionMethod = sampleFromDistribution
        sheepSampleActions = [SampleActionOnFixedIntention(selfId, wolvesID,
                                                           composeSheepPolicy(sheepModel),
                                                           sheepChooseActionMethod, blocksID)
                              for selfId, sheepModel in zip(sheepsID, sheepModelsForPolicy)]
        allIndividualSampleActions = wolvesSampleActions + sheepSampleActions
        sampleActionMultiAgent = SampleActionMultiagent(allIndividualSampleActions,
                                                        recordActionForUpdateIntention)
        allIndividualSampleActionsPerturbed = wolvesSampleActionsPerturbed + sheepSampleActions
        sampleActionMultiAgentPerturbed = SampleActionMultiagent(allIndividualSampleActionsPerturbed,
                                                                 recordActionForUpdateIntention)
        # trajectory = sampleTrajectory(sampleActionMultiAgentPerturbed)
        trajectory = sampleTrajectory(sampleActionMultiAgentPerturbed)
        # Append each step's intention posterior to its (s, a, s', r) record.
        intentionDistributions = getIntentionDistributions()
        trajectoryWithIntentionDists = [tuple(list(SASRPair) + list(intentionDist))
                                        for SASRPair, intentionDist in
                                        zip(trajectory, intentionDistributions)]
        trajectoriesWithIntentionDists.append(tuple(trajectoryWithIntentionDists))
        # trajectoriesWithIntentionDists.append(trajectory)
        resetIntentions()

    trajectoryFixedParameters = {'maxRunningStepsToSample': maxRunningStepsToSample}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists, trajectoryFixedParameters,
                                    parameters)
def __call__(self, parameters):
    """Measure agreement between level-2 hierarchical and central-control wolf policies.

    For ``self.numTrajectories`` random non-terminal start states, samples a
    joint action from the pretrained central-control wolf NN and, per wolf, an
    action from the pretrained level-2 hierarchical NN, then returns the mean
    fraction of wolves whose two actions lie within ``8 * predatorPowerRatio``
    of each other (zero actions excluded).

    parameters: dict with key 'numWolves'.
    """
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = 1

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    numOfAgent = numWolves + numSheep
    reset = Reset(xBoundary, yBoundary, numOfAgent)
    # In this env the sheep occupy the leading state slots.
    possibleSheepIds = list(range(numSheep))
    possibleWolvesIds = list(range(numSheep, numSheep + numWolves))
    getSheepStatesFromAll = lambda state: np.array(state)[possibleSheepIds]
    getWolvesStatesFromAll = lambda state: np.array(state)[possibleWolvesIds]
    killzoneRadius = 50
    isTerminal = IsTerminal(killzoneRadius, getSheepStatesFromAll, getWolvesStatesFromAll)
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(xBoundary, yBoundary)
    interpolateOneFrame = InterpolateOneFrame(stayInBoundaryByReflectVelocity)
    numFramesToInterpolate = 3
    transit = TransitWithTerminalCheckOfInterpolation(numFramesToInterpolate,
                                                      interpolateOneFrame, isTerminal)
    maxRunningSteps = 52
    timeCost = 1 / maxRunningSteps
    terminalBonus = 1
    rewardFunction = RewardFunctionByTerminal(timeCost, terminalBonus, isTerminal)
    forwardOneStep = ForwardOneStep(transit, rewardFunction)
    # NOTE(review): sampleTrajectory is constructed but never used in this method.
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset, forwardOneStep)

    ## MDP Policy
    # Sheep Part
    # Sheep Policy Function
    numSheepPolicyStateSpace = 2 * (numWolves + 1)
    sheepActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                        (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 12
    sheepIndividualActionSpace = list(map(tuple, np.array(sheepActionSpace) * preyPowerRatio))
    numSheepActionSpace = len(sheepIndividualActionSpace)
    regularizationFactor = 1e-4
    generateSheepModel = GenerateModel(numSheepPolicyStateSpace, numSheepActionSpace,
                                       regularizationFactor)
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    sheepNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initSheepModel = generateSheepModel(sharedWidths * sheepNNDepth, actionLayerWidths,
                                        valueLayerWidths, resBlockSize, initializationMethod,
                                        dropoutRate)
    sheepModelPath = os.path.join('..', '..', 'data', 'preTrainModel',
                                  'agentId=0.' + str(numWolves) + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=110_trainSteps=50000')
    sheepNNModel = restoreVariables(initSheepModel, sheepModelPath)
    sheepPolicy = ApproximatePolicy(sheepNNModel, sheepIndividualActionSpace)

    # Sheep Generate Action
    softParameterInPlanningForSheep = 2.5
    softPolicyInPlanningForSheep = SoftDistribution(softParameterInPlanningForSheep)
    # NOTE(review): softenSheepPolicy and sheepSampleActions are built but the
    # trial loop below never samples sheep actions; sheepSampleActions also
    # passes the unsoftened sheepPolicy -- confirm both are intentional.
    softenSheepPolicy = lambda relativeAgentsStatesForSheepPolicy: softPolicyInPlanningForSheep(
        sheepPolicy(relativeAgentsStatesForSheepPolicy))
    sheepChooseActionMethod = sampleFromDistribution
    sheepSampleActions = [SampleActionOnFixedIntention(selfId, possibleWolvesIds, sheepPolicy,
                                                       sheepChooseActionMethod)
                          for selfId in possibleSheepIds]

    # Wolves Part
    # Policy Likelihood function: Wolf Centrol Control NN Policy Given Intention
    # One state/action space per possible "we" size (2..numWolves wolves).
    numWolvesStateSpaces = [2 * (numInWe + 1) for numInWe in range(2, numWolves + 1)]
    actionSpace = [(10, 0), (0, 10), (-10, 0), (0, -10)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(map(tuple, np.array(actionSpace) * predatorPowerRatio))
    wolvesCentralControlActionSpaces = [list(it.product(wolfIndividualActionSpace, repeat=numInWe))
                                        for numInWe in range(2, numWolves + 1)]
    numWolvesCentralControlActionSpaces = [len(wolvesCentralControlActionSpace)
                                           for wolvesCentralControlActionSpace in
                                           wolvesCentralControlActionSpaces]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [GenerateModel(numStateSpace, numActionSpace,
                                                        regularizationFactor)
                                          for numStateSpace, numActionSpace in
                                          zip(numWolvesStateSpaces,
                                              numWolvesCentralControlActionSpaces)]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
                                                                        actionLayerWidths,
                                                                        valueLayerWidths,
                                                                        resBlockSize,
                                                                        initializationMethod,
                                                                        dropoutRate)
                                      for generateWolvesCentralControlModel in
                                      generateWolvesCentralControlModels]
    NNNumSimulations = 250
    # The checkpoint agentId encodes the "we" membership, e.g. 4 * (1 + 10 + 100 + ...).
    wolvesModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
                                     'agentId=' + str(len(actionSpace) * np.sum([10 ** _ for _ in range(numInWe)])) + '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
                        for numInWe in range(2, numWolves + 1)]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
                                    for initWolvesCentralControlModel, wolvesModelPath in
                                    zip(initWolvesCentralControlModels, wolvesModelPaths)]
    wolvesCentralControlPolicies = [ApproximatePolicy(NNModel, actionSpace)
                                    for NNModel, actionSpace in
                                    zip(wolvesCentralControlNNModels,
                                        wolvesCentralControlActionSpaces)]
    # 0 for two agents in We, 1 for three agents...
    centralControlPolicyListBasedOnNumAgentsInWe = wolvesCentralControlPolicies

    softParameterInInference = 1
    softPolicyInInference = SoftDistribution(softParameterInInference)
    # NOTE(review): getStateThirdPersonPerspective is not defined in this
    # method -- presumably a module-level helper; these two inference objects
    # are also unused in the trial loop below. Confirm before relying on them.
    policyForCommittedAgentsInInference = PolicyForCommittedAgent(
        centralControlPolicyListBasedOnNumAgentsInWe, softPolicyInInference,
        getStateThirdPersonPerspective)
    calCommittedAgentsPolicyLikelihood = CalCommittedAgentsPolicyLikelihood(
        policyForCommittedAgentsInInference)

    # Level-2 (hierarchical) individual wolf policy: no stay action.
    wolfLevel2ActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                             (0, -10), (7, -7)]
    wolfLevel2IndividualActionSpace = list(map(tuple,
                                               np.array(wolfLevel2ActionSpace) * predatorPowerRatio))
    # product of a single iterable: a space of 1-tuples (single wolf control).
    wolfLevel2CentralControlActionSpace = list(it.product(wolfLevel2IndividualActionSpace))
    numWolfLevel2ActionSpace = len(wolfLevel2CentralControlActionSpace)
    regularizationFactor = 1e-4
    generatewolfLevel2Models = [GenerateModel(numStateSpace, numWolfLevel2ActionSpace,
                                              regularizationFactor)
                                for numStateSpace in numWolvesStateSpaces]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfLevel2NNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initwolfLevel2Models = [generatewolfLevel2Model(sharedWidths * wolfLevel2NNDepth,
                                                    actionLayerWidths, valueLayerWidths,
                                                    resBlockSize, initializationMethod, dropoutRate)
                            for generatewolfLevel2Model in generatewolfLevel2Models]
    wolfLevel2ModelPaths = [os.path.join('..', '..', 'data', 'preTrainModel',
                                         'agentId=1.' + str(numInWe) + '_depth=9_hierarchy=2_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=' + str(NNNumSimulations) + '_trainSteps=50000')
                            for numInWe in range(2, numWolves + 1)]
    wolfLevel2NNModels = [restoreVariables(initwolfLevel2Model, wolfLevel2ModelPath)
                          for initwolfLevel2Model, wolfLevel2ModelPath in
                          zip(initwolfLevel2Models, wolfLevel2ModelPaths)]
    wolfLevel2Policies = [ApproximatePolicy(wolfLevel2NNModel, wolfLevel2CentralControlActionSpace)
                          for wolfLevel2NNModel in wolfLevel2NNModels]
    # 0 for two agents in We, 1 for three agents...
    level2PolicyListBasedOnNumAgentsInWe = wolfLevel2Policies

    softPolicy = SoftDistribution(2.5)
    totalInSmallRangeFlags = []
    for trial in range(self.numTrajectories):
        # Draw a random non-terminal start state.
        state = reset()
        while isTerminal(state):
            state = reset()
        # Joint action of all wolves from the central-control NN for this "we" size.
        jointActions = sampleFromDistribution(softPolicy(
            wolvesCentralControlPolicies[numWolves - 2](state)))
        hierarchyActions = []
        weIds = [list(range(numSheep, numWolves + numSheep)) for _ in range(numWolves)]
        for index in range(numWolves):
            # Rotate the acting wolf's own id to the front, then prepend the sheep.
            weId = weIds[index].copy()
            weId.insert(0, weId.pop(index))
            relativeId = [0] + weId
            action = sampleFromDistribution(softPolicy(
                wolfLevel2Policies[numWolves - 2](state[relativeId])))
            hierarchyActions.append(action)
        # Flag = 1 when the two actions differ by at most 8 * predatorPowerRatio;
        # zero (stay) actions are excluded from the comparison.
        reasonableActionRange = [
            int(np.linalg.norm(np.array(jointAction) - np.array(hierarchyAction)) <= 8 * predatorPowerRatio)
            for jointAction, hierarchyAction in zip(jointActions, hierarchyActions)
            if jointAction != (0, 0) and hierarchyAction != (0, 0)]
        totalInSmallRangeFlags = totalInSmallRangeFlags + reasonableActionRange
    inSmallRangeRateMean = np.mean(totalInSmallRangeFlags)
    return inSmallRangeRateMean
def __call__(self, parameters):
    """Sample chasing trajectories with online intention inference and save them.

    Builds a wolves-vs-sheep MDP, loads pre-trained NN policies for the sheep
    and the wolf central controller, then for each trajectory infers each
    wolf's intention (cooperate vs. compete) step by step and records the
    intention posteriors alongside the (s, a, s', r) tuples.

    Parameters (dict keys read): 'numWolves', 'numSheep', 'inferenceSoft',
    'wolfPolicySoft', 'otherCompeteRate', 'competeDetectionRate'.
    Side effects: reads model files from ../../data/preTrainModel, calls
    self.saveTrajectoryByParameters(...), prints progress.
    """
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    softParameterInInference = parameters['inferenceSoft']
    softParameterInPlanning = parameters['wolfPolicySoft']
    otherCompeteRate = parameters['otherCompeteRate']
    competeDetectionRate = parameters['competeDetectionRate']

    ## MDP Env
    # state is all multi agent state; action is all multi agent action
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    numOfAgent = numWolves + numSheep
    reset = Reset(xBoundary, yBoundary, numOfAgent)

    # Sheep occupy the low agent ids, wolves the following ids.
    possibleSheepIds = list(range(numSheep))
    possibleWolvesIds = list(range(numSheep, numSheep + numWolves))
    getSheepStatesFromAll = lambda state: np.array(state)[possibleSheepIds]
    getWolvesStatesFromAll = lambda state: np.array(state)[
        possibleWolvesIds]
    killzoneRadius = 50
    isTerminal = IsTerminal(killzoneRadius, getSheepStatesFromAll,
                            getWolvesStatesFromAll)

    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
        xBoundary, yBoundary)
    interpolateOneFrame = InterpolateOneFrame(
        stayInBoundaryByReflectVelocity)
    numFramesToInterpolate = 3
    transit = TransitWithTerminalCheckOfInterpolation(
        numFramesToInterpolate, interpolateOneFrame, isTerminal)

    maxRunningSteps = 61
    timeCost = 1 / maxRunningSteps
    terminalBonus = 1
    rewardFunction = RewardFunctionByTerminal(timeCost, terminalBonus,
                                              isTerminal)

    forwardOneStep = ForwardOneStep(transit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset,
                                        forwardOneStep)

    ## MDP Policy
    # Sheep Part
    # Sheep Policy Function: state space is sheep itself + all wolves (2D each).
    numSheepPolicyStateSpace = 2 * (numWolves + 1)
    sheepActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0),
                        (-7, -7), (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 12
    sheepIndividualActionSpace = list(
        map(tuple, np.array(sheepActionSpace) * preyPowerRatio))
    numSheepActionSpace = len(sheepIndividualActionSpace)
    regularizationFactor = 1e-4
    generateSheepModel = GenerateModel(numSheepPolicyStateSpace,
                                       numSheepActionSpace,
                                       regularizationFactor)
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    sheepNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initSheepModel = generateSheepModel(sharedWidths * sheepNNDepth,
                                        actionLayerWidths, valueLayerWidths,
                                        resBlockSize, initializationMethod,
                                        dropoutRate)
    sheepModelPath = os.path.join(
        '..', '..', 'data', 'preTrainModel',
        'agentId=0.' + str(numWolves) +
        '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=110_trainSteps=50000'
    )
    sheepNNModel = restoreVariables(initSheepModel, sheepModelPath)
    sheepPolicy = ApproximatePolicy(sheepNNModel,
                                    sheepIndividualActionSpace)

    # Sheep Generate Action (soften the NN distribution, then sample)
    softParameterInPlanningForSheep = 2.0
    softPolicyInPlanningForSheep = SoftDistribution(
        softParameterInPlanningForSheep)
    softenSheepPolicy = lambda relativeAgentsStatesForSheepPolicy: softPolicyInPlanningForSheep(
        sheepPolicy(relativeAgentsStatesForSheepPolicy))
    sheepChooseActionMethod = sampleFromDistribution
    sheepSampleActions = [
        SampleActionOnFixedIntention(selfId, possibleWolvesIds,
                                     softenSheepPolicy,
                                     sheepChooseActionMethod)
        for selfId in possibleSheepIds
    ]

    # Wolves Part
    # Percept Action For Inference: identity (wolves observe true actions).
    perceptAction = lambda action: action

    # Policy Likelihood function: Wolf Centrol Control NN Policy Given Intention
    # One model per coalition ("We") size, from 2 wolves up to numWolves.
    numWolvesStateSpaces = [
        2 * (numInWe + 1) for numInWe in range(2, numWolves + 1)
    ]
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(
        map(tuple, np.array(actionSpace) * predatorPowerRatio))
    wolvesCentralControlActionSpaces = [
        list(it.product(wolfIndividualActionSpace, repeat=numInWe))
        for numInWe in range(2, numWolves + 1)
    ]
    numWolvesCentralControlActionSpaces = [
        len(wolvesCentralControlActionSpace)
        for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces
    ]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [
        GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
        for numStateSpace, numActionSpace in zip(
            numWolvesStateSpaces, numWolvesCentralControlActionSpaces)
    ]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [
        generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
                                          actionLayerWidths,
                                          valueLayerWidths, resBlockSize,
                                          initializationMethod, dropoutRate)
        for generateWolvesCentralControlModel in
        generateWolvesCentralControlModels
    ]
    NNNumSimulations = 250
    # agentId encodes the coalition as digits: 8, 88, 888, ... for 1/2/3 wolves.
    wolvesModelPaths = [
        os.path.join(
            '..', '..', 'data', 'preTrainModel',
            'agentId=' + str(8 * np.sum([10**_ for _ in range(numInWe)])) +
            '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations='
            + str(NNNumSimulations) + '_trainSteps=50000')
        for numInWe in range(2, numWolves + 1)
    ]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [
        restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
        for initWolvesCentralControlModel, wolvesModelPath in zip(
            initWolvesCentralControlModels, wolvesModelPaths)
    ]
    # NOTE: the comprehension target `actionSpace` shadows the outer list of
    # the same name; harmless here since the outer one is not used afterwards.
    wolvesCentralControlPolicies = [
        ApproximatePolicy(NNModel, actionSpace)
        for NNModel, actionSpace in zip(wolvesCentralControlNNModels,
                                        wolvesCentralControlActionSpaces)
    ]

    centralControlPolicyListBasedOnNumAgentsInWe = wolvesCentralControlPolicies  # 0 for two agents in We, 1 for three agents...
    softPolicyInInference = SoftDistribution(softParameterInInference)
    policyForCommittedAgentsInInference = PolicyForCommittedAgent(
        centralControlPolicyListBasedOnNumAgentsInWe, softPolicyInInference,
        getStateOrActionThirdPersonPerspective)
    concernedAgentsIds = [2]
    calCommittedAgentsPolicyLikelihood = CalCommittedAgentsPolicyLikelihood(
        concernedAgentsIds, policyForCommittedAgentsInInference)

    # Heat-seeking policy models an uncommitted (competing) wolf.
    getGoalStateForIndividualHeatseeking = lambda statesRelative: np.array(
        statesRelative)[0]
    getSelfStateForIndividualHeatseeking = lambda statesRelative: np.array(
        statesRelative)[1]
    heatseekingPrecesion = 1.83
    heatSeekingDiscreteStochasticPolicy = HeatSeekingDiscreteStochasticPolicy(
        heatseekingPrecesion, wolfIndividualActionSpace,
        getSelfStateForIndividualHeatseeking,
        getGoalStateForIndividualHeatseeking)
    policyForUncommittedAgentsInInference = PolicyForUncommittedAgent(
        possibleWolvesIds, heatSeekingDiscreteStochasticPolicy,
        softPolicyInInference, getStateOrActionFirstPersonPerspective)
    calUncommittedAgentsPolicyLikelihood = CalUncommittedAgentsPolicyLikelihood(
        possibleWolvesIds, concernedAgentsIds,
        policyForUncommittedAgentsInInference)

    # Joint Likelihood = committed likelihood * uncommitted likelihood
    calJointLikelihood = lambda intention, state, perceivedAction: calCommittedAgentsPolicyLikelihood(intention, state, perceivedAction) * \
        calUncommittedAgentsPolicyLikelihood(intention, state, perceivedAction)

    # Value-based adjustment of the intention prior (soft-max over NN values).
    wolvesValueListBasedOnNumAgentsInWe = [
        ApproximateValue(NNModel) for NNModel in wolvesCentralControlNNModels
    ]
    calIntentionValueGivenState = CalIntentionValueGivenState(
        wolvesValueListBasedOnNumAgentsInWe)
    softParamterForValue = 0.01
    softValueToBuildDistribution = SoftMax(softParamterForValue)
    adjustIntentionPriorGivenValueOfState = AdjustIntentionPriorGivenValueOfState(
        calIntentionValueGivenState, softValueToBuildDistribution)

    # Sample and Save Trajectory
    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        # Intention Prior For inference: an intention is (goalId, coalitionIds);
        # (0, (1, 2)) = cooperate on sheep 0, (0, ()) = compete alone.
        otherWolfPossibleIntentionSpaces = {0: [(0, (1, 2))], 1: [(0, ())]}
        otherIntentionType = np.random.choice(
            [1, 0], p=[otherCompeteRate, 1 - otherCompeteRate])
        otherWolfIntentionSpace = otherWolfPossibleIntentionSpaces[
            otherIntentionType]
        selfPossibleIntentionSpaces = {
            0: [(0, (1, 2))],
            0.5: [(0, (1, 2)), (0, ())],
            1: [(0, ())]
        }
        selfWolfIntentionSpace = selfPossibleIntentionSpaces[
            competeDetectionRate]
        intentionSpacesForAllWolves = [
            selfWolfIntentionSpace, otherWolfIntentionSpace
        ]
        # Uniform prior over each wolf's own intention space.
        wolvesIntentionPriors = [{
            tuple(intention): 1 / len(allPossibleIntentionsOneWolf)
            for intention in allPossibleIntentionsOneWolf
        } for allPossibleIntentionsOneWolf in intentionSpacesForAllWolves]
        # Infer and update Intention
        variablesForAllWolves = [[
            intentionSpace
        ] for intentionSpace in intentionSpacesForAllWolves]
        jointHypothesisSpaces = [
            pd.MultiIndex.from_product(variables, names=['intention'])
            for variables in variablesForAllWolves
        ]
        concernedHypothesisVariable = ['intention']
        priorDecayRate = 1  # 1 = no forgetting of past evidence
        softPrior = SoftDistribution(priorDecayRate)
        inferIntentionOneStepList = [
            InferOneStep(jointHypothesisSpace, concernedHypothesisVariable,
                         calJointLikelihood, softPrior)
            for jointHypothesisSpace in jointHypothesisSpaces
        ]

        chooseIntention = sampleFromDistribution
        valuePriorEndTime = -100  # negative: value-based prior adjustment always active
        updateIntentions = [
            UpdateIntention(intentionPrior, valuePriorEndTime,
                            adjustIntentionPriorGivenValueOfState,
                            perceptAction, inferIntentionOneStep,
                            chooseIntention)
            for intentionPrior, inferIntentionOneStep in zip(
                wolvesIntentionPriors, inferIntentionOneStepList)
        ]

        # reset intention and adjust intention-prior attributes: tools for
        # reusing the updater objects across multiple trajectories
        intentionResetAttributes = [
            'timeStep', 'lastState', 'lastAction', 'intentionPrior',
            'formerIntentionPriors'
        ]
        intentionResetAttributeValues = [
            dict(
                zip(intentionResetAttributes,
                    [0, None, None, intentionPrior, [intentionPrior]]))
            for intentionPrior in wolvesIntentionPriors
        ]
        resetIntentions = ResetObjects(intentionResetAttributeValues,
                                       updateIntentions)
        returnAttributes = ['formerIntentionPriors']
        getIntentionDistributions = GetObjectsValuesOfAttributes(
            returnAttributes, updateIntentions)
        attributesToRecord = ['lastAction']
        recordActionForUpdateIntention = RecordValuesForObjects(
            attributesToRecord, updateIntentions)

        # Wolves Generate Action
        softPolicyInPlanning = SoftDistribution(softParameterInPlanning)
        policyForCommittedAgentInPlanning = PolicyForCommittedAgent(
            centralControlPolicyListBasedOnNumAgentsInWe,
            softPolicyInPlanning, getStateOrActionThirdPersonPerspective)
        policyForUncommittedAgentInPlanning = PolicyForUncommittedAgent(
            possibleWolvesIds, heatSeekingDiscreteStochasticPolicy,
            softPolicyInPlanning, getStateOrActionFirstPersonPerspective)
        wolfChooseActionMethod = sampleFromDistribution
        getSelfActionThirdPersonPerspective = lambda weIds, selfId: list(
            weIds).index(selfId)
        chooseCommittedAction = GetActionFromJointActionDistribution(
            wolfChooseActionMethod, getSelfActionThirdPersonPerspective)
        chooseUncommittedAction = sampleFromDistribution
        wolvesSampleIndividualActionGivenIntentionList = [
            SampleIndividualActionGivenIntention(
                selfId, policyForCommittedAgentInPlanning,
                policyForUncommittedAgentInPlanning, chooseCommittedAction,
                chooseUncommittedAction) for selfId in possibleWolvesIds
        ]

        wolvesSampleActions = [
            SampleActionOnChangableIntention(
                updateIntention, wolvesSampleIndividualActionGivenIntention)
            for updateIntention, wolvesSampleIndividualActionGivenIntention
            in zip(updateIntentions,
                   wolvesSampleIndividualActionGivenIntentionList)
        ]
        allIndividualSampleActions = sheepSampleActions + wolvesSampleActions
        sampleActionMultiAgent = SampleActionMultiagent(
            allIndividualSampleActions, recordActionForUpdateIntention)
        trajectory = sampleTrajectory(sampleActionMultiAgent)
        # Attach the per-step intention posteriors to each (s, a, s', r) tuple.
        intentionDistributions = getIntentionDistributions()
        trajectoryWithIntentionDists = [
            tuple(list(SASRPair) + list(intentionDist))
            for SASRPair, intentionDist in zip(trajectory,
                                               intentionDistributions)
        ]
        trajectoriesWithIntentionDists.append(
            tuple(trajectoryWithIntentionDists))
        resetIntentions()
        #print(intentionDistributions[-1], otherCompeteRate)
    trajectoryFixedParameters = {
        'sheepPolicySoft': softParameterInPlanningForSheep,
        'wolfPolicySoft': softParameterInPlanning,
        'maxRunningSteps': maxRunningSteps,
        'competePolicy': 'heatseeking',
        'NNNumSimulations': NNNumSimulations,
        'heatseekingPrecesion': heatseekingPrecesion
    }
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists,
                                    trajectoryFixedParameters, parameters)
    print(np.mean([len(tra) for tra in trajectoriesWithIntentionDists]))
def iterateTrainOneCondition(parameterOneCondition):
    """Run iterative self-play training for one hyper-parameter condition.

    Sets up a 2-wolves/1-sheep MADDPG-style chasing environment, restores
    pre-trained sheep and central-control wolf networks, then alternates
    between (a) spawning parallel trajectory-sampling subprocesses and
    (b) training each agent's network from a shared replay buffer, saving
    and garbage-collecting model checkpoints along the way.

    Parameters (dict keys read): 'numTrainStepEachIteration',
    'numTrajectoriesPerIteration'.
    Side effects: creates data directories, reads/writes pickle trajectory
    files and TF checkpoints, deletes stale checkpoints, prints progress.

    Fix vs. original: the `multiAgentNNmodel` list (and its depth/config
    locals) was built twice with identical code; the redundant second
    construction has been removed — nothing read the list in between, and the
    rebuilt models were immediately overwritten by checkpoint restores anyway.
    """
    numTrainStepEachIteration = int(
        parameterOneCondition['numTrainStepEachIteration'])
    numTrajectoriesPerIteration = int(
        parameterOneCondition['numTrajectoriesPerIteration'])
    dirName = os.path.dirname(__file__)

    numOfAgent = 2
    agentIds = list(range(numOfAgent))
    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 50
    fixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }

    # env MDP: one sheep (id 0), two wolves (ids 1-2), no obstacle blocks
    sheepsID = [0]
    wolvesID = [1, 2]
    blocksID = []
    numSheeps = len(sheepsID)
    numWolves = len(wolvesID)
    numBlocks = len(blocksID)
    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2
    sheepMaxSpeed = 1.3 * 1
    wolfMaxSpeed = 1.0 * 1
    blockMaxSpeed = None
    entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [
        blockSize
    ] * numBlocks
    entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [
        wolfMaxSpeed
    ] * numWolves + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # The two wolves act through one central-control (joint) action slot.
    centralControlId = 1
    centerControlIndexList = [centralControlId]
    reshapeAction = UnpackCenterControlAction(centerControlIndexList)
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList,
                                          getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList,
                                    massList, entityMaxSpeedList,
                                    getVelFromAgentState,
                                    getPosFromAgentState)
    interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction,
                                                applyActionForce,
                                                applyEnvironForce,
                                                integrateState)

    numFramesToInterpolate = 1

    def transit(state, action):
        # Apply the action on the first frame only; subsequent interpolation
        # frames coast with zero action.
        for frameIndex in range(numFramesToInterpolate):
            nextState = interpolateState(state, action)
            action = np.array([(0, 0)] * numAgents)
            state = nextState
        return nextState

    isTerminal = lambda state: False  # fixed-length episodes; no early stop
    isCollision = IsCollision(getPosFromAgentState)
    collisonRewardWolf = 1
    punishForOutOfBound = PunishForOutOfBound()
    rewardWolf = RewardCentralControlPunishBond(
        wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBound, collisonRewardWolf)
    collisonRewardSheep = -1
    rewardSheep = RewardCentralControlPunishBond(
        sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBound, collisonRewardSheep)
    resetState = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # policy action spaces (joint wolf space = cartesian product)
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7), (0, 0)]
    wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 0.5
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))
    predatorPowerRatio = 0.5
    wolfActionOneSpace = list(
        map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
    wolfActionTwoSpace = list(
        map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
    wolvesActionSpace = list(it.product(wolfActionOneSpace,
                                        wolfActionTwoSpace))
    actionSpaceList = [sheepActionSpace, wolvesActionSpace]

    # neural network init
    numStateSpace = 4 * numEntities  # (pos, vel) 2D each per entity
    numSheepActionSpace = len(sheepActionSpace)
    numWolvesActionSpace = len(wolvesActionSpace)
    regularizationFactor = 1e-4
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                       regularizationFactor)
    generateWolvesModel = GenerateModel(numStateSpace, numWolvesActionSpace,
                                        regularizationFactor)
    generateModelList = [generateSheepModel, generateWolvesModel]
    sheepDepth = 9
    wolfDepth = 9
    depthList = [sheepDepth, wolfDepth]
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    multiAgentNNmodel = [
        generateModel(sharedWidths * depth, actionLayerWidths,
                      valueLayerWidths, resBlockSize, initializationMethod,
                      dropoutRate)
        for depth, generateModel in zip(depthList, generateModelList)
    ]

    # replay buffer
    bufferSize = 20000
    saveToBuffer = SaveToBuffer(bufferSize)

    def getUniformSamplingProbabilities(buffer):
        # Uniform sampling over whatever is currently in the buffer.
        return [(1 / len(buffer)) for _ in buffer]

    miniBatchSize = 512
    sampleBatchFromBuffer = SampleBatchFromBuffer(
        miniBatchSize, getUniformSamplingProbabilities)

    # pre-process the trajectory for replayBuffer
    rewardMultiAgents = [rewardSheep, rewardWolf]
    decay = 1
    accumulateMultiAgentRewards = AccumulateMultiAgentRewards(decay)
    addMultiAgentValuesToTrajectory = AddValuesToTrajectory(
        accumulateMultiAgentRewards)
    actionIndex = 1

    def getTerminalActionFromTrajectory(trajectory):
        # The action element of the final (state, action, ...) tuple.
        return trajectory[-1][actionIndex]

    removeTerminalTupleFromTrajectory = RemoveTerminalTupleFromTrajectory(
        getTerminalActionFromTrajectory)

    # pre-process the trajectory for NN training
    sheepActionToOneHot = ActionToOneHot(sheepActionSpace)
    wolvesActionToOneHot = ActionToOneHot(wolvesActionSpace)
    actionToOneHotList = [sheepActionToOneHot, wolvesActionToOneHot]
    processTrajectoryForPolicyValueNets = [
        ProcessTrajectoryForPolicyValueNetMultiAgentReward(
            actionToOneHotList[agentId], agentId) for agentId in agentIds
    ]

    # function to train NN model
    terminalThreshold = 1e-6
    lossHistorySize = 10
    initActionCoeff = 1
    initValueCoeff = 1
    initCoeff = (initActionCoeff, initValueCoeff)
    afterActionCoeff = 1
    afterValueCoeff = 1
    afterCoeff = (afterActionCoeff, afterValueCoeff)
    terminalController = TrainTerminalController(lossHistorySize,
                                                 terminalThreshold)
    coefficientController = CoefficientCotroller(initCoeff, afterCoeff)
    reportInterval = 10000
    trainStepsIntervel = 1  # 10000
    trainReporter = TrainReporter(numTrainStepEachIteration, reportInterval)
    learningRateDecay = 1
    learningRateDecayStep = 1
    learningRate = 0.0001
    learningRateModifier = LearningRateModifier(learningRate,
                                                learningRateDecay,
                                                learningRateDecayStep)
    # NOTE(review): `sampleData` is not defined anywhere in this file's
    # visible scope — presumably a module-level import; verify before running.
    trainNN = Train(numTrainStepEachIteration, miniBatchSize, sampleData,
                    learningRateModifier, terminalController,
                    coefficientController, trainReporter)

    # load/save dirs
    trajectorySaveExtension = '.pickle'
    NNModelSaveExtension = ''
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', 'data', 'iterTrain2wolves1sheepMADDPGEnv',
        'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)
    NNModelSaveDirectory = os.path.join(dirName, '..', '..', 'data',
                                        'iterTrain2wolves1sheepMADDPGEnv',
                                        'NNModelRes')
    if not os.path.exists(NNModelSaveDirectory):
        os.makedirs(NNModelSaveDirectory)
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)
    generateNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                          NNModelSaveExtension,
                                          fixedParameters)

    startTime = time.time()

    preprocessMultiAgentTrajectories = PreprocessTrajectoriesForBuffer(
        addMultiAgentValuesToTrajectory, removeTerminalTupleFromTrajectory)
    numTrajectoriesToStartTrain = 1024
    trainOneAgent = TrainOneAgent(numTrainStepEachIteration,
                                  numTrajectoriesToStartTrain,
                                  processTrajectoryForPolicyValueNets,
                                  sampleBatchFromBuffer, trainNN)

    # restore pretrained models and save them as iteration-0 checkpoints
    sheepPreTrainModelPath = os.path.join(
        dirName, '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainSheepWithPretrrainWolves', 'trainedResNNModels',
        'agentId=0_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000'
    )
    wolvesPreTrainModelPath = os.path.join(
        dirName, '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainWolvesTwoCenterControlAction', 'trainedResNNModels',
        'agentId=1_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000'
    )
    pretrainModelPathList = [sheepPreTrainModelPath, wolvesPreTrainModelPath]

    sheepId, wolvesId = [0, 1]
    trainableAgentIds = [sheepId, wolvesId]
    for agentId in trainableAgentIds:
        restoredNNModel = restoreVariables(multiAgentNNmodel[agentId],
                                           pretrainModelPathList[agentId])
        multiAgentNNmodel[agentId] = restoredNNModel
        NNModelPathParameters = {
            'iterationIndex': 0,
            'agentId': agentId,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        }
        NNModelSavePath = generateNNModelSavePath(NNModelPathParameters)
        saveVariables(multiAgentNNmodel[agentId], NNModelSavePath)

    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectoriesForParallel = LoadTrajectories(generateTrajectorySavePath,
                                                   loadFromPickle,
                                                   fuzzySearchParameterNames)
    loadTrajectoriesForTrainBreak = LoadTrajectories(
        generateTrajectorySavePath, loadFromPickle)

    # init replay buffer with iteration-0 trajectories
    replayBuffer = []
    trajectoryBeforeTrainIndex = 0
    trajectoryBeforeTrainPathParamters = {
        'iterationIndex': trajectoryBeforeTrainIndex
    }
    trajectoriesBeforeTrain = loadTrajectoriesForParallel(
        trajectoryBeforeTrainPathParamters)
    preProcessedTrajectoriesBeforeTrain = preprocessMultiAgentTrajectories(
        trajectoriesBeforeTrain)
    replayBuffer = saveToBuffer(replayBuffer,
                                preProcessedTrajectoriesBeforeTrain)

    # delete used model checkpoints to reclaim disk space
    fixedParametersForDelete = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius,
        'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
        'numTrainStepEachIteration': numTrainStepEachIteration
    }
    toDeleteNNModelExtensionList = ['.meta', '.index', '.data-00000-of-00001']
    generatetoDeleteNNModelPathList = [
        GetSavePath(NNModelSaveDirectory, toDeleteNNModelExtension,
                    fixedParametersForDelete)
        for toDeleteNNModelExtension in toDeleteNNModelExtensionList
    ]

    # restore model (restoredIteration = 0 means start from the pretrained save)
    restoredIteration = 0
    for agentId in trainableAgentIds:
        modelPathForRestore = generateNNModelSavePath({
            'iterationIndex': restoredIteration,
            'agentId': agentId,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        })
        restoredNNModel = restoreVariables(multiAgentNNmodel[agentId],
                                           modelPathForRestore)
        multiAgentNNmodel[agentId] = restoredNNModel

    # restore buffer (re-load trajectories from all completed iterations)
    bufferTrajectoryPathParameters = {
        'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
        'numTrainStepEachIteration': numTrainStepEachIteration
    }
    restoredIterationIndexRange = range(restoredIteration)
    restoredTrajectories = loadTrajectoriesForTrainBreak(
        parameters=bufferTrajectoryPathParameters,
        parametersWithSpecificValues={
            'iterationIndex': list(restoredIterationIndexRange)
        })
    preProcessedRestoredTrajectories = preprocessMultiAgentTrajectories(
        restoredTrajectories)
    print(len(preProcessedRestoredTrajectories))
    replayBuffer = saveToBuffer(replayBuffer,
                                preProcessedRestoredTrajectories)

    modelMemorySize = 5
    modelSaveFrequency = 50
    deleteUsedModel = DeleteUsedModel(modelMemorySize, modelSaveFrequency,
                                      generatetoDeleteNNModelPathList)
    numIterations = 10000
    for iterationIndex in range(restoredIteration + 1, numIterations):
        print('iterationIndex: ', iterationIndex)

        # sample new trajectories in parallel subprocesses
        numCpuToUseWhileTrain = int(16)
        numCmdList = min(numTrajectoriesPerIteration, numCpuToUseWhileTrain)
        sampleTrajectoryFileName = 'sampleMultiMCTSAgentCenterControlResNetTrajCondtion.py'
        generateTrajectoriesParallelWhileTrain = GenerateTrajectoriesParallel(
            sampleTrajectoryFileName, numTrajectoriesPerIteration, numCmdList)
        trajectoryPathParameters = {
            'iterationIndex': iterationIndex,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        }
        cmdList = generateTrajectoriesParallelWhileTrain(
            trajectoryPathParameters)
        trajectories = loadTrajectoriesForParallel(trajectoryPathParameters)
        trajectorySavePath = generateTrajectorySavePath(
            trajectoryPathParameters)
        saveToPickle(trajectories, trajectorySavePath)
        preProcessedTrajectories = preprocessMultiAgentTrajectories(
            trajectories)
        updatedReplayBuffer = saveToBuffer(replayBuffer,
                                           preProcessedTrajectories)

        # train each agent, checkpoint, and prune old checkpoints
        for agentId in trainableAgentIds:
            updatedAgentNNModel = trainOneAgent(agentId, multiAgentNNmodel,
                                                updatedReplayBuffer)
            NNModelPathParameters = {
                'iterationIndex': iterationIndex,
                'agentId': agentId,
                'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
                'numTrainStepEachIteration': numTrainStepEachIteration
            }
            NNModelSavePath = generateNNModelSavePath(NNModelPathParameters)
            saveVariables(updatedAgentNNModel, NNModelSavePath)
            multiAgentNNmodel[agentId] = updatedAgentNNModel
            replayBuffer = updatedReplayBuffer
            deleteUsedModel(iterationIndex, agentId)

    endTime = time.time()
    print("Time taken for {} iterations: {} seconds".format(
        numIterations, (endTime - startTime)))
def main():
    """Sample evaluation trajectories for one (trainSteps, depth, dataSize) condition.

    CLI: argv[1] = JSON dict of path parameters (must contain 'trainSteps',
    'depth', 'dataSize'), argv[2]/argv[3] = start/end sample indices.
    Loads a pre-trained sheep NN and a condition-specific central-control
    wolves NN, samples (endSampleIndex - startSampleIndex) trajectories with
    greedy action choice, and pickles them. Skips work entirely if the
    target pickle already exists.
    """
    parametersForTrajectoryPath = json.loads(sys.argv[1])
    startSampleIndex = int(sys.argv[2])
    endSampleIndex = int(sys.argv[3])
    # parametersForTrajectoryPath['sampleOneStepPerTraj']=1 #0
    # parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
    trainSteps = int(parametersForTrajectoryPath['trainSteps'])
    depth = int(parametersForTrajectoryPath['depth'])
    dataSize = int(parametersForTrajectoryPath['dataSize'])

    # parametersForTrajectoryPath = {}
    # depth = 5
    # dataSize = 5000
    # trainSteps = 50000
    # startSampleIndex = 0
    # endSampleIndex = 100

    killzoneRadius = 25
    numSimulations = 200
    maxRunningSteps = 100
    fixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }
    trajectorySaveExtension = '.pickle'
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', 'data', 'evaluateSupervisedLearning',
        'multiMCTSAgentResNetNoPhysicsCenterControl',
        'evaluateCenterControlTrajByCondition')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)

    # Only sample if this condition has not been produced yet.
    if not os.path.isfile(trajectorySavePath):
        # One sheep (id 0) pursued by two wolves (ids 1, 2); terminal when
        # either wolf is within killzoneRadius of the sheep.
        numOfAgent = 3
        sheepId = 0
        wolvesId = 1
        wolfOneId = 1
        wolfTwoId = 2
        xPosIndex = [0, 1]
        xBoundary = [0, 600]
        yBoundary = [0, 600]
        reset = Reset(xBoundary, yBoundary, numOfAgent)
        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)
        isTerminalOne = IsTerminal(getWolfOneXPos, getSheepXPos,
                                   killzoneRadius)
        isTerminalTwo = IsTerminal(getWolfTwoXPos, getSheepXPos,
                                   killzoneRadius)
        isTerminal = lambda state: isTerminalOne(state) or isTerminalTwo(state)
        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)
        transit = TransiteForNoPhysics(stayInBoundaryByReflectVelocity)

        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
        preyPowerRatio = 3
        sheepActionSpace = list(
            map(tuple, np.array(actionSpace) * preyPowerRatio))
        predatorPowerRatio = 2
        wolfActionOneSpace = list(
            map(tuple, np.array(actionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(
            map(tuple, np.array(actionSpace) * predatorPowerRatio))
        # Joint (central-control) wolf action = one action per wolf.
        wolvesActionSpace = list(
            it.product(wolfActionOneSpace, wolfActionTwoSpace))

        # neural network init
        numStateSpace = 6  # 3 agents x 2D position
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)
        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                           regularizationFactor)

        # load save dir (sheep model: fixed depth-5 pre-trained network)
        NNModelSaveExtension = ''
        NNModelSaveDirectory = os.path.join(
            dirName, '..', '..', '..', 'data',
            'evaluateEscapeMultiChasingNoPhysics',
            'trainedResNNModelsMultiStillAction')
        NNModelFixedParameters = {
            'agentId': 0,
            'maxRunningSteps': 150,
            'numSimulations': 200,
            'miniBatchSize': 256,
            'learningRate': 0.0001
        }
        getNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                         NNModelSaveExtension,
                                         NNModelFixedParameters)
        if not os.path.exists(NNModelSaveDirectory):
            os.makedirs(NNModelSaveDirectory)

        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        initSheepNNModel = generateSheepModel(sharedWidths * 5,
                                              actionLayerWidths,
                                              valueLayerWidths, resBlockSize,
                                              initializationMethod,
                                              dropoutRate)
        sheepTrainedModelPath = getNNModelSavePath({
            'trainSteps': 50000,
            'depth': 5
        })
        sheepTrainedModel = restoreVariables(initSheepNNModel,
                                             sheepTrainedModelPath)
        sheepPolicy = ApproximatePolicy(sheepTrainedModel, sheepActionSpace)

        # Wolves model: depth/trainSteps/dataSize come from the CLI condition.
        generateWolvesModel = GenerateModel(numStateSpace,
                                            numWolvesActionSpace,
                                            regularizationFactor)
        initWolvesNNModel = generateWolvesModel(sharedWidths * depth,
                                                actionLayerWidths,
                                                valueLayerWidths,
                                                resBlockSize,
                                                initializationMethod,
                                                dropoutRate)
        NNModelSaveDirectory = os.path.join(
            dirName, '..', '..', '..', 'data', 'evaluateSupervisedLearning',
            'multiMCTSAgentResNetNoPhysicsCenterControl',
            'trainedResNNModels')
        wolfId = 1
        NNModelFixedParametersWolves = {
            'agentId': wolfId,
            'maxRunningSteps': maxRunningSteps,
            'numSimulations': numSimulations,
            'miniBatchSize': 256,
            'learningRate': 0.0001,
        }
        getNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                         NNModelSaveExtension,
                                         NNModelFixedParametersWolves)
        wolvesTrainedModelPath = getNNModelSavePath({
            'trainSteps': trainSteps,
            'depth': depth,
            'dataSize': dataSize
        })
        wolvesTrainedModel = restoreVariables(initWolvesNNModel,
                                              wolvesTrainedModelPath)
        wolfPolicy = ApproximatePolicy(wolvesTrainedModel, wolvesActionSpace)

        # Optional pygame rendering (off by default; imports kept local so the
        # headless batch path never needs a display).
        from exec.evaluateNoPhysicsEnvWithRender import Render
        import pygame as pg
        from pygame.color import THECOLORS
        screenColor = THECOLORS['black']
        circleColorList = [
            THECOLORS['green'], THECOLORS['red'], THECOLORS['orange']
        ]
        circleSize = 10
        saveImage = False
        saveImageDir = os.path.join(dirName, '..', '..', '..', 'data',
                                    'demoImg')
        if not os.path.exists(saveImageDir):
            os.makedirs(saveImageDir)
        renderOn = False
        render = None
        if renderOn:
            screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
            render = Render(numOfAgent, xPosIndex, screen, screenColor,
                            circleColorList, circleSize, saveImage,
                            saveImageDir)

        chooseActionList = [chooseGreedyAction, chooseGreedyAction]
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps, transit,
                                                      isTerminal, reset,
                                                      chooseActionList,
                                                      render, renderOn)

        # All agents' policies
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        trajectories = [
            sampleTrajectory(policy)
            for sampleIndex in range(startSampleIndex, endSampleIndex)
        ]
        saveToPickle(trajectories, trajectorySavePath)
def main():
    """Sample trajectories of MCTS-controlled wolves (center control) chasing an
    NN-policy sheep in the no-physics 2-wolves/1-sheep environment, then pickle them.

    Reads run parameters either from hard-coded DEBUG values or from
    sys.argv (JSON parameter dict + start/end sample indices).
    """
    DEBUG = 0
    renderOn = 0
    if DEBUG:
        # Local-debug configuration: small fixed sample range.
        parametersForTrajectoryPath = {}
        startSampleIndex = 5
        endSampleIndex = 7
        agentId = 1
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
    else:
        # Batch mode: argv[1] is a JSON dict of parameters, argv[2:4] the sample range.
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['agentId'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)

    # check file exists or not
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', '..', 'data', 'NoPhysics2wolves1sheep',
        'trainWolvesTwoCenterControlAction88', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 150
    fixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)
    # Skip sampling entirely when this run's output file already exists.
    if not os.path.isfile(trajectorySavePath):
        numOfAgent = 3
        xBoundary = [0, 600]
        yBoundary = [0, 600]
        resetState = Reset(xBoundary, yBoundary, numOfAgent)
        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)
        interpolateOneFrame = InterpolateOneFrame(
            stayInBoundaryByReflectVelocity)
        # Take the last interpolated frame as the next state.
        chooseInterpolatedNextState = lambda interpolatedStates: interpolatedStates[
            -1]
        sheepId = 0
        wolvesId = 1
        # The two wolves act as one center-controlled agent at this index.
        centerControlIndexList = [wolvesId]
        unpackCenterControlAction = UnpackCenterControlAction(
            centerControlIndexList)
        numFramesToInterpolate = 0
        transit = TransitWithInterpolation(numFramesToInterpolate,
                                           interpolateOneFrame,
                                           chooseInterpolatedNextState,
                                           unpackCenterControlAction)

        # NNGuidedMCTS init
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
        # NOTE(review): wolf space has no (0, 0) "stay" action, unlike the sheep's.
        wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0),
                           (-7, -7), (0, -10), (7, -7)]
        preyPowerRatio = 10
        sheepActionSpace = list(
            map(tuple, np.array(actionSpace) * preyPowerRatio))
        predatorPowerRatio = 8
        wolfActionOneSpace = list(
            map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(
            map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        # Joint action space for the center-controlled wolf pair (cartesian product).
        wolvesActionSpace = list(
            product(wolfActionOneSpace, wolfActionTwoSpace))
        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 2 * numOfAgent
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)
        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                           regularizationFactor)

        # load save dir
        NNModelSaveExtension = ''
        sheepNNModelSaveDirectory = os.path.join(
            dirName, '..', '..', '..', '..', 'data', 'NoPhysics2wolves1sheep',
            'trainSheepWithTwoHeatSeekingWolves', 'trainedResNNModels')
        sheepNNModelFixedParameters = {
            'agentId': 0,
            'maxRunningSteps': 50,
            'numSimulations': 110,
            'miniBatchSize': 256,
            'learningRate': 0.0001,
        }
        getSheepNNModelSavePath = GetSavePath(sheepNNModelSaveDirectory,
                                              NNModelSaveExtension,
                                              sheepNNModelFixedParameters)

        depth = 9
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        initSheepNNModel = generateSheepModel(sharedWidths * depth,
                                              actionLayerWidths,
                                              valueLayerWidths, resBlockSize,
                                              initializationMethod,
                                              dropoutRate)
        sheepTrainedModelPath = getSheepNNModelSavePath({
            'trainSteps': 50000,
            'depth': depth
        })
        sheepTrainedModel = restoreVariables(initSheepNNModel,
                                             sheepTrainedModelPath)
        sheepPolicy = ApproximatePolicy(sheepTrainedModel, sheepActionSpace)

        wolfOneId = 1
        wolfTwoId = 2
        xPosIndex = [0, 1]
        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        speed = 120
        #sheepPolicy = HeatSeekingContinuesDeterministicPolicy(getWolfOneXPos, getSheepXPos, speed)

        # MCTS
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        # prior: uniform over the joint wolf action space
        getActionPrior = lambda state: {
            action: 1 / len(wolvesActionSpace)
            for action in wolvesActionSpace
        }

        # load chase nn policy
        chooseActionInMCTS = sampleFromDistribution

        def wolvesTransit(state, action):
            # Inside the wolves' MCTS the sheep moves by sampling its NN policy.
            return transit(state,
                           [chooseActionInMCTS(sheepPolicy(state)), action])

        # reward function
        wolfOneId = 1
        wolfTwoId = 2
        xPosIndex = [0, 1]
        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)
        isCollidedOne = IsTerminal(getWolfOneXPos, getSheepXPos,
                                   killzoneRadius)
        isCollidedTwo = IsTerminal(getWolfTwoXPos, getSheepXPos,
                                   killzoneRadius)
        calCollisionTimes = lambda state: np.sum([
            isCollidedOne(state), isCollidedTwo(state)
        ])  # collisionTimeByAddingCollisionInAllWolves
        #calCollisionTimes = lambda state: np.max([isCollidedOne(state), isCollidedTwo(state)]) # collisionTimeByBooleanCollisionForAnyWolf
        calTerminationSignals = calCollisionTimes
        chooseInterpolatedStateByEarlyTermination = ChooseInterpolatedStateByEarlyTermination(
            calTerminationSignals)
        numFramesToInterpolateInReward = 3
        interpolateStateInReward = TransitWithInterpolation(
            numFramesToInterpolateInReward, interpolateOneFrame,
            chooseInterpolatedStateByEarlyTermination,
            unpackCenterControlAction)
        aliveBonus = -1 / maxRunningSteps * 10
        deathPenalty = 1
        rewardFunction = RewardFunctionCompeteWithStateInterpolation(
            aliveBonus, deathPenalty, calCollisionTimes,
            interpolateStateInReward)

        # initialize children; expand
        initializeChildren = InitializeChildren(wolvesActionSpace,
                                                wolvesTransit, getActionPrior)
        # Episodes never terminate early; length is capped by maxRunningSteps.
        isTerminal = lambda state: False
        expand = Expand(isTerminal, initializeChildren)

        # random rollout policy
        def rolloutPolicy(state):
            # Sheep follows its NN policy; wolves pick a uniformly random joint action.
            return [
                sampleFromDistribution(sheepPolicy(state)),
                wolvesActionSpace[np.random.choice(
                    range(numWolvesActionSpace))]
            ]

        # rollout
        #rolloutHeuristicWeight = 0
        #minDistance = 400
        #rolloutHeuristic1 = HeuristicDistanceToTarget(
        #    rolloutHeuristicWeight, getWolfOneXPos, getSheepXPos, minDistance)
        #rolloutHeuristic2 = HeuristicDistanceToTarget(
        #    rolloutHeuristicWeight, getWolfTwoXPos, getSheepXPos, minDistance)
        #rolloutHeuristic = lambda state: (rolloutHeuristic1(state) + rolloutHeuristic2(state)) / 2
        rolloutHeuristic = lambda state: 0
        maxRolloutSteps = 15
        rollout = RollOut(rolloutPolicy, maxRolloutSteps, transit,
                          rewardFunction, isTerminal, rolloutHeuristic)

        wolfPolicy = MCTS(numSimulations, selectChild, expand, rollout,
                          backup, establishSoftmaxActionDist)

        # All agents' policies
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        chooseActionList = [maxFromDistribution, maxFromDistribution]

        def sampleAction(state):
            # Greedily pick each agent's action from its policy distribution.
            actionDists = [sheepPolicy(state), wolfPolicy(state)]
            action = [
                chooseAction(actionDist) for actionDist, chooseAction in zip(
                    actionDists, chooseActionList)
            ]
            return action

        render = None
        if renderOn:
            # pygame is only imported when rendering is requested.
            import pygame as pg
            from pygame.color import THECOLORS
            screenColor = THECOLORS['black']
            circleColorList = [
                THECOLORS['green'], THECOLORS['yellow'], THECOLORS['red']
            ]
            circleSize = 10
            saveImage = False
            saveImageDir = os.path.join(dirName, '..', '..', '..', '..',
                                        'data', 'demoImg')
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)
            screen = pg.display.set_mode([max(xBoundary), max(yBoundary)])
            render = Render(numOfAgent, xPosIndex, screen, screenColor,
                            circleColorList, circleSize, saveImage,
                            saveImageDir)

        forwardOneStep = ForwardOneStep(transit, rewardFunction)
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps,
                                                      isTerminal, resetState,
                                                      forwardOneStep, render,
                                                      renderOn)
        trajectories = [
            sampleTrajectory(sampleAction)
            for sampleIndex in range(startSampleIndex, endSampleIndex)
        ]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
def __call__(self, parameters):
    """Evaluate a perturbed-wolf MADDPG policy against pretrained teammates.

    Recovers pretrained sheep and wolf MADDPG models from checkpoints, replaces
    one wolf (``perturbedWolfID``) with a model trained to chase only one sheep
    (``perturbedWolfGoalID``), samples ``self.numTrajectories`` trajectories, and
    saves them via ``self.saveTrajectoryByParameters``.

    ``parameters`` is expected to carry keys 'numWolves', 'numSheep', 'wolfType',
    'perturbedWolfID', 'perturbedWolfGoalID'.
    """
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    numBlocks = 2
    # Selfishness flag: 1.0 = individual reward, 0.0 = shared reward.
    wolfSelfish = 1.0 if parameters[
        'wolfType'] == 'individualReward' else 0.0
    perturbedWolfID = parameters['perturbedWolfID']
    perturbedWolfGoalID = parameters['perturbedWolfGoalID']

    ## MDP Env
    numAgents = numWolves + numSheep
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numWolves + numSheep))
    blocksID = list(range(numAgents, numEntities))

    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks

    costActionRatio = 0.0
    sheepSpeedMultiplier = 1.0
    sheepMaxSpeed = 1.3 * sheepSpeedMultiplier
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    collisionReward = 1  # for evaluation, count # of bites
    isCollision = IsCollision(getPosFromAgentState)
    rewardAllWolves = RewardWolf(wolvesID, sheepsID, entitiesSizeList,
                                 isCollision, collisionReward, wolfSelfish)
    # Scalar evaluation reward: total over all wolves.
    rewardWolf = lambda state, action, nextState: np.sum(
        rewardAllWolves(state, action, nextState))

    reshapeActionInTransit = lambda action: action
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList,
                                    massList, entityMaxSpeedList,
                                    getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeActionInTransit,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    forwardOneStep = ForwardOneStep(transit, rewardWolf)
    reset = ResetMultiAgentChasingWithSeed(numAgents, numBlocks)
    # Fixed-length episodes; no early termination.
    isTerminal = lambda state: False
    maxRunningStepsToSample = 101
    sampleTrajectory = SampleTrajectory(maxRunningStepsToSample, isTerminal,
                                        reset, forwardOneStep)

    ## MDP Policy
    worldDim = 2
    actionDim = worldDim * 2 + 1
    layerWidth = [128, 128]
    maxTimeStep = 75
    maxEpisode = 60000
    dirName = os.path.dirname(__file__)

    # ------------ sheep recover variables ------------------------
    # Load one sheep model per wolf-reward type (shared / individual).
    numSheepToObserve = 1
    sheepModelListOfDiffWolfReward = []
    sheepTypeList = [0.0, 1.0]
    for sheepType in sheepTypeList:
        wolvesIDForSheepObserve = list(range(numWolves))
        sheepsIDForSheepObserve = list(
            range(numWolves, numSheepToObserve + numWolves))
        blocksIDForSheepObserve = list(
            range(numSheepToObserve + numWolves,
                  numSheepToObserve + numWolves + numBlocks))
        observeOneAgentForSheep = lambda agentID: Observe(
            agentID, wolvesIDForSheepObserve, sheepsIDForSheepObserve,
            blocksIDForSheepObserve, getPosFromAgentState,
            getVelFromAgentState)
        observeSheep = lambda state: [
            observeOneAgentForSheep(agentID)(state)
            for agentID in range(numWolves + numSheepToObserve)
        ]
        obsIDsForSheep = wolvesIDForSheepObserve + sheepsIDForSheepObserve + blocksIDForSheepObserve
        # Use one reset state only to discover each agent's observation shape.
        initObsForSheepParams = observeSheep(reset()[obsIDsForSheep])
        obsShapeSheep = [
            initObsForSheepParams[obsID].shape[0]
            for obsID in range(len(initObsForSheepParams))
        ]
        buildSheepModels = BuildMADDPGModels(actionDim,
                                             numWolves + numSheepToObserve,
                                             obsShapeSheep)
        sheepModelsList = [
            buildSheepModels(layerWidth, agentID)
            for agentID in range(numWolves, numWolves + numSheepToObserve)
        ]
        sheepFileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}WolfActCost{}individ{}_agent".format(
            numWolves, numSheepToObserve, numBlocks, maxEpisode, maxTimeStep,
            sheepSpeedMultiplier, costActionRatio, sheepType)
        sheepModelPaths = [
            os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                         sheepFileName + str(i))
            for i in range(numWolves, numWolves + numSheepToObserve)
        ]
        # Restore weights in place; the list comprehension is used for its side effect.
        [
            restoreVariables(model, path)
            for model, path in zip(sheepModelsList, sheepModelPaths)
        ]
        sheepModelListOfDiffWolfReward = sheepModelListOfDiffWolfReward + sheepModelsList

    # # actOneStep = ActOneStep(actByPolicyTrainNoisy) #TODO
    actOneStep = ActOneStep(actByPolicyTrainNoNoisy)
    numAllSheepModels = len(sheepModelListOfDiffWolfReward)

    # ------------ wolves recover variables ------------------------
    # ------------ Recover one perturbed wolf for comparison -------
    # The perturbed wolf observes only its assigned goal sheep.
    numSheepForPerturbedWolf = 1
    wolvesIDForPerturbedWolf = wolvesID
    sheepsIDForPerturbedWolf = [sheepsID[perturbedWolfGoalID]]
    blocksIDForPerturbedWolf = list(
        range(numWolves + numSheep, numEntities))  # skip the unattended sheep id
    observeOneAgentForPerturbedWolf = lambda agentID: Observe(
        agentID, wolvesIDForPerturbedWolf, sheepsIDForPerturbedWolf,
        blocksIDForPerturbedWolf, getPosFromAgentState, getVelFromAgentState)
    observePerturbedWolf = lambda state: [
        observeOneAgentForPerturbedWolf(agentID)(state)
        for agentID in wolvesIDForPerturbedWolf + sheepsIDForPerturbedWolf
    ]
    initObsForPerturbedWolfParams = observePerturbedWolf(reset())
    obsShapePerturbedWolf = [
        initObsForPerturbedWolfParams[obsID].shape[0]
        for obsID in range(len(initObsForPerturbedWolfParams))
    ]
    buildPerturbedWolfModels = BuildMADDPGModels(
        actionDim, numWolves + numSheepForPerturbedWolf, obsShapePerturbedWolf)
    layerWidthForWolf = [128, 128]
    perturbedWolfModel = buildPerturbedWolfModels(layerWidthForWolf,
                                                  perturbedWolfID)

    perturbedWolfFileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}WolfActCost{}individ{}_agent".format(
        numWolves, numSheepForPerturbedWolf, numBlocks, maxEpisode,
        maxTimeStep, sheepSpeedMultiplier, costActionRatio, wolfSelfish)
    perturbedWolfModelPath = os.path.join(
        dirName, '..', '..', 'data', 'preTrainModel',
        perturbedWolfFileName + str(perturbedWolfID))
    restoreVariables(perturbedWolfModel, perturbedWolfModelPath)

    # ------------ Recover other wolves trained with multiple goals -------
    wolvesIDForWolfObserve = wolvesID
    sheepsIDForWolfObserve = sheepsID
    blocksIDForWolfObserve = blocksID
    observeOneAgentForWolf = lambda agentID: Observe(
        agentID, wolvesIDForWolfObserve, sheepsIDForWolfObserve,
        blocksIDForWolfObserve, getPosFromAgentState, getVelFromAgentState)
    observeWolf = lambda state: [
        observeOneAgentForWolf(agentID)(state)
        for agentID in range(numWolves + numSheep)
    ]
    obsIDsForWolf = wolvesIDForWolfObserve + sheepsIDForWolfObserve + blocksIDForWolfObserve
    initObsForWolfParams = observeWolf(reset()[obsIDsForWolf])
    obsShapeWolf = [
        initObsForWolfParams[obsID].shape[0]
        for obsID in range(len(initObsForWolfParams))
    ]
    buildWolfModels = BuildMADDPGModels(actionDim, numWolves + numSheep,
                                        obsShapeWolf)
    layerWidthForWolf = [128, 128]
    wolfModelsList = [
        buildWolfModels(layerWidthForWolf, agentID)
        for agentID in range(numWolves)
    ]
    wolfFileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}WolfActCost{}individ{}_agent".format(
        numWolves, numSheep, numBlocks, maxEpisode, maxTimeStep,
        sheepSpeedMultiplier, costActionRatio, wolfSelfish)
    wolfModelPaths = [
        os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                     wolfFileName + str(i)) for i in range(numWolves)
    ]
    # Side-effect comprehension: restores each wolf model's weights.
    [
        restoreVariables(model, path)
        for model, path in zip(wolfModelsList, wolfModelPaths)
    ]

    # ------------ compose policy ---------------------
    actionDimReshaped = 2
    # Near-zero covariance makes the Gaussian effectively deterministic.
    cov = [0.00000000001**2 for _ in range(actionDimReshaped)]
    buildGaussian = BuildGaussianFixCov(cov)
    reshapeAction = ReshapeAction()

    # unperturbed policy
    composeWolfPolicy = lambda wolfModel: lambda state: sampleFromContinuousSpace(
        buildGaussian(
            tuple(reshapeAction(actOneStep(wolfModel, observeWolf(state)))) ))
    wolvesSampleActions = [
        composeWolfPolicy(wolfModel) for wolfModel in wolfModelsList
    ]
    # perturbed policy
    composePerturbedWolfPolicy = lambda perturbedModel: lambda state: sampleFromContinuousSpace(
        buildGaussian(
            tuple(
                reshapeAction(
                    actOneStep(perturbedModel, observePerturbedWolf(state))
                ))))
    wolvesSampleActionsPerturbed = wolvesSampleActions.copy()
    wolvesSampleActionsPerturbed[
        perturbedWolfID] = composePerturbedWolfPolicy(perturbedWolfModel)

    trajectories = []
    for trajectoryId in range(self.numTrajectories):
        # Each trajectory pairs every sheep with a randomly drawn sheep model.
        sheepModelsForPolicy = [
            sheepModelListOfDiffWolfReward[np.random.choice(
                numAllSheepModels)] for sheepId in sheepsID
        ]
        composeSheepPolicy = lambda sheepModel: lambda state: {
            tuple(
                reshapeAction(actOneStep(sheepModel, observeSheep(state)))):
            1
        }
        sheepChooseActionMethod = sampleFromDistribution
        sheepSampleActions = [
            SampleActionOnFixedIntention(selfId, wolvesID,
                                         composeSheepPolicy(sheepModel),
                                         sheepChooseActionMethod, blocksID)
            for selfId, sheepModel in zip(sheepsID, sheepModelsForPolicy)
        ]
        allIndividualSampleActionsPerturbed = wolvesSampleActionsPerturbed + sheepSampleActions
        sampleActionPerturbed = lambda state: [
            sampleIndividualAction(state) for sampleIndividualAction in
            allIndividualSampleActionsPerturbed
        ]
        trajectory = sampleTrajectory(sampleActionPerturbed)
        trajectories.append(trajectory)

    trajectoryFixedParameters = {
        'maxRunningStepsToSample': maxRunningStepsToSample
    }
    self.saveTrajectoryByParameters(trajectories, trajectoryFixedParameters,
                                    parameters)
def main():
    """Generate self-play trajectories for one iteration of the iterative
    MCTS+NN training loop (2 wolves with center control + 1 sheep, MADDPG-style
    physics env), restoring the previous iteration's models, and pickle them.
    """
    DEBUG = 1
    renderOn = 1
    if DEBUG:
        # Local-debug configuration.
        parametersForTrajectoryPath = {}
        startSampleIndex = 1
        endSampleIndex = 2
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
        iterationIndex = 2
        numTrainStepEachIteration = 1
        numTrajectoriesPerIteration = 1
    else:
        # Batch mode: argv[1] is a JSON dict of parameters, argv[2:4] the sample range.
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
        iterationIndex = int(parametersForTrajectoryPath['iterationIndex'])
        numTrainStepEachIteration = int(parametersForTrajectoryPath['numTrainStepEachIteration'])
        numTrajectoriesPerIteration = int(parametersForTrajectoryPath['numTrajectoriesPerIteration'])

    # check file exists or not
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(dirName, '..', '..', 'data',
                                             'iterTrain2wolves1sheepMADDPGEnv',
                                             'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 50
    numTree = 2
    fixedParameters = {'maxRunningSteps': maxRunningSteps, 'numSimulations': numSimulations, 'killzoneRadius': killzoneRadius}
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory, trajectorySaveExtension, fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(parametersForTrajectoryPath)

    # Skip sampling when this run's output already exists.
    if not os.path.isfile(trajectorySavePath):
        # env MDP
        sheepsID = [0]
        wolvesID = [1, 2]
        blocksID = []

        numSheeps = len(sheepsID)
        numWolves = len(wolvesID)
        numBlocks = len(blocksID)

        numAgents = numWolves + numSheeps
        numEntities = numAgents + numBlocks

        sheepSize = 0.05
        wolfSize = 0.075
        blockSize = 0.2

        sheepMaxSpeed = 1.3 * 1
        wolfMaxSpeed = 1.0 * 1
        blockMaxSpeed = None

        entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [blockSize] * numBlocks
        entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [wolfMaxSpeed] * numWolves + [blockMaxSpeed] * numBlocks
        entitiesMovableList = [True] * numAgents + [False] * numBlocks
        massList = [1.0] * numEntities

        # Index 1 of the action list carries the wolves' joint (center-control) action.
        centralControlId = 1
        centerControlIndexList = [centralControlId]
        reshapeAction = UnpackCenterControlAction(centerControlIndexList)

        getCollisionForce = GetCollisionForce()
        applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
        applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList, getCollisionForce, getPosFromAgentState)
        integrateState = IntegrateState(numEntities, entitiesMovableList, massList, entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
        interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)

        numFramesToInterpolate = 1

        def transit(state, action):
            # Step the physics numFramesToInterpolate times; after the first
            # frame agents coast with zero action.
            for frameIndex in range(numFramesToInterpolate):
                nextState = interpolateState(state, action)
                action = np.array([(0, 0)] * numAgents)
                state = nextState
            return nextState

        # Fixed-length episodes; no early termination.
        isTerminal = lambda state: False

        isCollision = IsCollision(getPosFromAgentState)
        collisonRewardWolf = 1
        punishForOutOfBound = PunishForOutOfBound()
        rewardWolf = RewardCentralControlPunishBond(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisonRewardWolf)
        collisonRewardSheep = -1
        rewardSheep = RewardCentralControlPunishBond(sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisonRewardSheep)
        terminalRewardList = [collisonRewardSheep, collisonRewardWolf]
        rewardMultiAgents = [rewardSheep, rewardWolf]

        resetState = ResetMultiAgentChasing(numAgents, numBlocks)

        observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
        observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

        # policy
        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]

        preyPowerRatio = 0.5
        sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))
        predatorPowerRatio = 0.5
        wolfActionOneSpace = list(map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        # Joint action space of the two wolves (cartesian product).
        wolvesActionSpace = list(product(wolfActionOneSpace, wolfActionTwoSpace))
        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 4 * numEntities
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)

        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace, regularizationFactor)
        generateWolvesModel = GenerateModel(numStateSpace, numWolvesActionSpace, regularizationFactor)
        generateModelList = [generateSheepModel, generateWolvesModel]

        sheepDepth = 9
        wolfDepth = 9
        depthList = [sheepDepth, wolfDepth]
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        sheepId, wolvesId = [0, 1]
        trainableAgentIds = [sheepId, wolvesId]

        multiAgentNNmodel = [generateModel(sharedWidths * depth, actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate) for depth, generateModel in zip(depthList, generateModelList)]

        otherAgentApproximatePolicy = [lambda NNmodel, : ApproximatePolicy(NNmodel, sheepActionSpace), lambda NNmodel, : ApproximatePolicy(NNmodel, wolvesActionSpace)]

        # NNGuidedMCTS init
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        getApproximatePolicy = [lambda NNmodel, : ApproximatePolicy(NNmodel, sheepActionSpace), lambda NNmodel, : ApproximatePolicy(NNmodel, wolvesActionSpace)]
        getApproximateValue = [lambda NNmodel: ApproximateValue(NNmodel), lambda NNmodel: ApproximateValue(NNmodel)]

        def getStateFromNode(node):
            # MCTS node ids map some key to the environment state; take the state.
            return list(node.id.values())[0]

        chooseActionInMCTS = sampleFromDistribution

        composeMultiAgentTransitInSingleAgentMCTS = ComposeMultiAgentTransitInSingleAgentMCTS(chooseActionInMCTS)
        composeSingleAgentGuidedMCTS = ComposeSingleAgentGuidedMCTS(numTree, numSimulations, actionSpaceList, terminalRewardList, selectChild, isTerminal, transit, getStateFromNode, getApproximatePolicy, getApproximateValue, composeMultiAgentTransitInSingleAgentMCTS)
        prepareMultiAgentPolicy = PrepareMultiAgentPolicy(composeSingleAgentGuidedMCTS, otherAgentApproximatePolicy, trainableAgentIds)

        # load model: restore each trainable agent's NN from the previous iteration.
        NNModelSaveExtension = ''
        NNModelSaveDirectory = os.path.join(dirName, '..', '..', 'data', 'iterTrain2wolves1sheepMADDPGEnv', 'NNModelRes')
        if not os.path.exists(NNModelSaveDirectory):
            os.makedirs(NNModelSaveDirectory)

        generateNNModelSavePath = GetSavePath(NNModelSaveDirectory, NNModelSaveExtension, fixedParameters)

        for agentId in trainableAgentIds:
            modelPath = generateNNModelSavePath({'iterationIndex': iterationIndex - 1, 'agentId': agentId, 'numTrajectoriesPerIteration': numTrajectoriesPerIteration, 'numTrainStepEachIteration': numTrainStepEachIteration})
            restoredNNModel = restoreVariables(multiAgentNNmodel[agentId], modelPath)
            multiAgentNNmodel[agentId] = restoredNNModel

        multiAgentPolicy = prepareMultiAgentPolicy(multiAgentNNmodel)
        chooseActionList = [maxFromDistribution, maxFromDistribution]

        def sampleAction(state):
            # Greedily pick each agent's action from its policy distribution.
            actionDists = multiAgentPolicy(state)
            action = [chooseAction(actionDist) for actionDist, chooseAction in zip(actionDists, chooseActionList)]
            return action

        render = lambda state: None
        forwardOneStep = ForwardMultiAgentsOneStep(transit, rewardMultiAgents)
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps, isTerminal, resetState, forwardOneStep, render, renderOn)

        trajectories = [sampleTrajectory(sampleAction) for sampleIndex in range(startSampleIndex, endSampleIndex)]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
def main():
    """Plot a heatmap of the wolves' central-control value function over a grid
    of hypothetical sheep positions, for a fixed wolves configuration.

    Restores pretrained central-control value networks (one per "we"-group
    size), evaluates the value at every grid cell via ``evaluateValue``, and
    saves the heatmap figure as 'valueMap2'.
    """
    numWolves = 2
    numSheep = 1
    # State dims per group size: 2D positions of the group members plus the sheep.
    numWolvesStateSpaces = [
        2 * (numInWe + 1) for numInWe in range(2, numWolves + 1)
    ]
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7)]
    #actionSpace = [(10, 0), (0, 10), (-10, 0), (0, -10)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(
        map(tuple, np.array(actionSpace) * predatorPowerRatio))
    # One joint action space per group size (cartesian power of the individual space).
    wolvesCentralControlActionSpaces = [
        list(it.product(wolfIndividualActionSpace, repeat=numInWe))
        for numInWe in range(2, numWolves + 1)
    ]
    numWolvesCentralControlActionSpaces = [
        len(wolvesCentralControlActionSpace)
        for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces
    ]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [
        GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
        for numStateSpace, numActionSpace in zip(
            numWolvesStateSpaces, numWolvesCentralControlActionSpaces)
    ]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [
        generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
                                          actionLayerWidths,
                                          valueLayerWidths, resBlockSize,
                                          initializationMethod, dropoutRate)
        for generateWolvesCentralControlModel in
        generateWolvesCentralControlModels
    ]
    NNNumSimulations = 250
    # Checkpoint names encode the composite agentId (e.g. '11', '111', ...) of
    # the center-controlled group; built from the action-space size times 10^k sums.
    wolvesModelPaths = [
        os.path.join(
            '..', '..', 'data', 'preTrainModel', 'agentId=' +
            str(len(actionSpace) * np.sum([10**_ for _ in range(numInWe)])) +
            '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations='
            + str(NNNumSimulations) + '_trainSteps=50000')
        for numInWe in range(2, numWolves + 1)
    ]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [
        restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
        for initWolvesCentralControlModel, wolvesModelPath in zip(
            initWolvesCentralControlModels, wolvesModelPaths)
    ]
    wolvesValueFunctionListBasedOnNumAgentsInWe = [
        ApproximateValue(NNModel)
        for NNModel in wolvesCentralControlNNModels
    ]
    # Pick the value function for the full wolf pack (index is group size - 2).
    valueFunction = wolvesValueFunctionListBasedOnNumAgentsInWe[numWolves - 2]

    xBoundary = [0, 600]
    yBoundary = [0, 600]
    reset = Reset(xBoundary, yBoundary, numWolves)

    # Grid of candidate sheep positions (cell centers).
    numGridX = 120
    numGridY = 120
    xInterval = (xBoundary[1] - xBoundary[0]) / numGridX
    yInterval = (yBoundary[1] - yBoundary[0]) / numGridY
    sheepXPosition = [(gridIndex + 0.5) * xInterval
                      for gridIndex in range(numGridX)]
    sheepYPosition = [(gridIndex + 0.5) * yInterval
                      for gridIndex in range(numGridY)]

    wolvesState = reset()
    # The random reset above is immediately overridden by a fixed configuration.
    wolvesState = np.array([[300, 350], [550, 400]])
    print(wolvesState)

    levelValues = [sheepXPosition, sheepYPosition]
    levelNames = ["sheepXPosition", "sheepYPosition"]
    modelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=modelIndex)

    # Evaluate the value function for each grid cell (one group per index value).
    evaluate = lambda df: evaluateValue(df, valueFunction, wolvesState)
    valueResultDf = toSplitFrame.groupby(levelNames).apply(evaluate)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    drawHeatmapPlot(valueResultDf, ax)
    fig.savefig('valueMap2', dpi=300)
    plt.show()
def __call__(self, parameters):
    """Sample wolf/sheep chase trajectories for one parameter setting and save them.

    parameters: dict providing 'numWolves', 'numSheep',
        'valuePriorSoftMaxBeta' and 'valuePriorEndTime'.
    Builds the chasing MDP, restores pre-trained NN policies for the sheep and
    for the centrally-controlled wolf team, samples self.numTrajectories
    trajectories and stores them via self.saveTrajectoryByParameters.
    """
    print(parameters)
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    # NOTE(review): these two parameters are read but never referenced again
    # in this method — confirm whether they were meant to configure anything.
    softParamterForValue = parameters['valuePriorSoftMaxBeta']
    valuePriorEndTime = parameters['valuePriorEndTime']

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    numOfAgent = numWolves + numSheep
    reset = Reset(xBoundary, yBoundary, numOfAgent)

    # Sheep occupy the first ids, wolves the following ones.
    possibleSheepIds = list(range(numSheep))
    possibleWolvesIds = list(range(numSheep, numSheep + numWolves))
    getSheepStatesFromAll = lambda state: np.array(state)[possibleSheepIds]
    getWolvesStatesFromAll = lambda state: np.array(state)[possibleWolvesIds]
    killzoneRadius = 25
    isTerminal = IsTerminal(killzoneRadius, getSheepStatesFromAll,
                            getWolvesStatesFromAll)

    # Transition: interpolated frames with a terminal check in between.
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
        xBoundary, yBoundary)
    interpolateOneFrame = InterpolateOneFrame(stayInBoundaryByReflectVelocity)
    numFramesToInterpolate = 5
    transit = TransitWithTerminalCheckOfInterpolation(
        numFramesToInterpolate, interpolateOneFrame, isTerminal)

    # Reward: small per-step cost, bonus on reaching the terminal state.
    maxRunningSteps = 52
    timeCost = 1 / maxRunningSteps
    terminalBonus = 1
    rewardFunction = RewardFunctionByTerminal(timeCost, terminalBonus,
                                              isTerminal)

    forwardOneStep = ForwardOneStep(transit, rewardFunction)
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset,
                                        forwardOneStep)

    ## MDP Policy
    # Sheep Part
    # Sheep Policy Function
    numSheepPolicyStateSpace = 2 * (numWolves + 1)
    sheepActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                        (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 12
    sheepIndividualActionSpace = list(
        map(tuple, np.array(sheepActionSpace) * preyPowerRatio))
    numSheepActionSpace = len(sheepIndividualActionSpace)
    regularizationFactor = 1e-4
    generateSheepModel = GenerateModel(numSheepPolicyStateSpace,
                                       numSheepActionSpace,
                                       regularizationFactor)
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    sheepNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initSheepModel = generateSheepModel(sharedWidths * sheepNNDepth,
                                        actionLayerWidths, valueLayerWidths,
                                        resBlockSize, initializationMethod,
                                        dropoutRate)
    # NOTE(review): the 'agentId=0.' fragment must match the naming of the
    # saved checkpoint exactly — verify against the training script.
    sheepModelPath = os.path.join(
        '..', '..', 'data', 'preTrainModel', 'agentId=0.' + str(numWolves) +
        '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=110_trainSteps=50000'
    )
    sheepNNModel = restoreVariables(initSheepModel, sheepModelPath)
    sheepPolicy = ApproximatePolicy(sheepNNModel, sheepIndividualActionSpace)

    # Sheep Generate Action: soften the NN distribution, then sample.
    softParameterInPlanningForSheep = 2.5
    softPolicyInPlanningForSheep = SoftDistribution(
        softParameterInPlanningForSheep)
    softenSheepPolicy = lambda relativeAgentsStatesForSheepPolicy: softPolicyInPlanningForSheep(
        sheepPolicy(relativeAgentsStatesForSheepPolicy))
    sheepChooseActionMethod = sampleFromDistribution
    sheepSampleActions = [
        SampleActionOnFixedIntention(selfId, possibleWolvesIds,
                                     softenSheepPolicy,
                                     sheepChooseActionMethod)
        for selfId in possibleSheepIds
    ]

    # Wolves Part
    # Policy Likelihood function: Wolf Central Control NN Policy Given Intention
    numWolvesStateSpaces = [
        2 * (numInWe + numSheep) for numInWe in range(2, numWolves + 1)
    ]
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7), (0, 0)]
    predatorPowerRatio = 8
    wolfIndividualActionSpace = list(
        map(tuple, np.array(actionSpace) * predatorPowerRatio))
    # Joint action space per "we" size: cartesian product of individual actions.
    wolvesCentralControlActionSpaces = [
        list(it.product(wolfIndividualActionSpace, repeat=numInWe))
        for numInWe in range(2, numWolves + 1)
    ]
    numWolvesCentralControlActionSpaces = [
        len(wolvesCentralControlActionSpace)
        for wolvesCentralControlActionSpace in wolvesCentralControlActionSpaces
    ]
    regularizationFactor = 1e-4
    generateWolvesCentralControlModels = [
        GenerateModel(numStateSpace, numActionSpace, regularizationFactor)
        for numStateSpace, numActionSpace in zip(
            numWolvesStateSpaces, numWolvesCentralControlActionSpaces)
    ]
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    wolfNNDepth = 9
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    initWolvesCentralControlModels = [
        generateWolvesCentralControlModel(sharedWidths * wolfNNDepth,
                                          actionLayerWidths, valueLayerWidths,
                                          resBlockSize, initializationMethod,
                                          dropoutRate)
        for generateWolvesCentralControlModel in
        generateWolvesCentralControlModels
    ]
    NNNumSimulations = 250
    # NOTE(review): 'agentId=.' (leading dot, no digit) differs from the
    # 'agentId=' form used elsewhere in this file — confirm it matches the
    # checkpoint files on disk.
    wolvesModelPaths = [
        os.path.join(
            '..', '..', 'data', 'preTrainModel', 'agentId=.' +
            str(len(actionSpace) * np.sum([10**_ for _ in range(numInWe)])) +
            '_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations='
            + str(NNNumSimulations) + '_trainSteps=50000')
        for numInWe in range(2, numWolves + 1)
    ]
    print(wolvesModelPaths)
    wolvesCentralControlNNModels = [
        restoreVariables(initWolvesCentralControlModel, wolvesModelPath)
        for initWolvesCentralControlModel, wolvesModelPath in zip(
            initWolvesCentralControlModels, wolvesModelPaths)
    ]
    wolvesCentralControlPolicies = [
        ApproximatePolicy(NNModel, actionSpace)
        for NNModel, actionSpace in zip(wolvesCentralControlNNModels,
                                        wolvesCentralControlActionSpaces)
    ]

    # Wolves Generate Action: soften the joint distribution, then sample.
    softParameterInPlanning = 2.5
    softPolicyInPlanning = SoftDistribution(softParameterInPlanning)
    wolvesPolicy = lambda state: wolvesCentralControlPolicies[numWolves - 2](
        state)
    wolfChooseActionMethod = sampleFromDistribution
    wolvesSampleAction = lambda state: wolfChooseActionMethod(
        softPolicyInPlanning(wolvesPolicy(state)))

    def sampleAction(state):
        # Joint action: the wolves' central-control tuple followed by one
        # action per sheep.
        action = list(wolvesSampleAction(state)) + [
            sheepSampleAction(state) for sheepSampleAction in sheepSampleActions
        ]
        return action

    # Sample and Save Trajectory
    trajectories = [
        sampleTrajectory(sampleAction) for _ in range(self.numTrajectories)
    ]
    wolfType = 'sharedReward'
    trajectoryFixedParameters = {
        'sheepPolicySoft': softParameterInPlanningForSheep,
        'wolfPolicySoft': softParameterInPlanning,
        'maxRunningSteps': maxRunningSteps,
        'hierarchy': 0,
        'NNNumSimulations': NNNumSimulations,
        'wolfType': wolfType
    }
    self.saveTrajectoryByParameters(trajectories, trajectoryFixedParameters,
                                    parameters)
    # Mean trajectory length is a quick sanity check on chase difficulty.
    print(np.mean([len(tra) for tra in trajectories]))
def __call__(self, parameters):
    """Sample trajectories where wolves infer and act on shared intentions.

    parameters: dict providing 'numWolves', 'numSheep',
        'valuePriorSoftMaxBeta', 'valuePriorEndTime', 'deviationFor2DAction',
        'rationalityBetaInInference', 'wolfType' and 'sheepConcern'.
    Builds the MADDPG chasing environment, restores pre-trained sheep and
    wolf models, wires up intention inference over perceived (noisy) actions,
    samples self.numTrajectories trajectories annotated with the per-step
    intention posteriors, and saves them via self.saveTrajectoryByParameters.

    Fixes vs. the previous revision (behavior unchanged for numWolves <= 3):
    - the closures built in the per-"we"-size loop now bind their loop
      variables via default arguments (Python closures are late-binding, so
      every stored observe function previously saw the LAST iteration's ids);
    - the single-sheep pass-through inference list is sized by numWolves
      instead of a hard-coded 3 (which only worked because zip truncates).
    """
    print(parameters)
    visualizeTraj = False
    numWolves = parameters['numWolves']
    numSheep = parameters['numSheep']
    # NOTE(review): these two are read but never used in this method.
    softParamterForValue = parameters['valuePriorSoftMaxBeta']
    valuePriorEndTime = parameters['valuePriorEndTime']
    deviationFor2DAction = parameters['deviationFor2DAction']
    rationalityBetaInInference = parameters['rationalityBetaInInference']
    wolfType = parameters['wolfType']
    sheepConcern = parameters['sheepConcern']
    print(rationalityBetaInInference)

    ## MDP Env
    # state is all multi agent state
    # action is all multi agent action
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numWolves + numSheep))
    possibleWolvesIds = wolvesID
    possibleSheepIds = sheepsID
    numAgents = numWolves + numSheep
    # The arena always holds 5 wolf-or-block entities: fewer wolves, more blocks.
    numBlocks = 5 - numWolves
    blocksID = list(range(numAgents, numAgents + numBlocks))
    numEntities = numAgents + numBlocks

    # Physical properties per entity class.
    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2
    sheepMaxSpeed = 1.3 * 1
    wolfMaxSpeed = 1.0 * 1
    blockMaxSpeed = None
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheep + [
        blockSize
    ] * numBlocks
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheep + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # Physics pipeline: action force -> collision force -> state integration.
    reshapeActionInTransit = lambda action: action
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeActionInTransit,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    isCollision = IsCollision(getPosFromAgentState)
    collisonRewardWolf = 1
    # Wolves are never punished for leaving the arena (constant-zero penalty).
    punishForOutOfBoundForWolf = lambda state: 0
    rewardWolf = RewardCentralControlPunishBond(
        wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBoundForWolf, collisonRewardWolf)
    collisonRewardSheep = -1
    punishForOutOfBoundForSheep = PunishForOutOfBound()
    # NOTE(review): rewardSheep is built but never used below — only the wolf
    # reward feeds the recorded trajectories.
    rewardSheep = RewardCentralControlPunishBond(
        sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBoundForSheep, collisonRewardSheep)
    forwardOneStep = ForwardOneStep(transit, rewardWolf)

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    # Episodes never terminate early; they always run maxRunningSteps.
    isTerminal = lambda state: False
    maxRunningSteps = 101
    sampleTrajectory = SampleTrajectory(maxRunningSteps, isTerminal, reset,
                                        forwardOneStep)

    ## MDP Policy
    worldDim = 2
    actionDim = worldDim * 2 + 1
    layerWidth = [64 * (numWolves - 1), 64 * (numWolves - 1)]

    # Sheep Part
    # ------------ model ------------------------
    # selfSheep: each sheep's model observes only one sheep (itself);
    # allSheep: models observe every sheep.
    if sheepConcern == 'selfSheep':
        sheepConcernSelfOnly = 1
    if sheepConcern == 'allSheep':
        sheepConcernSelfOnly = 0
    numSheepToObserveWhenSheepSameOrDiff = [numSheep, 1]
    numSheepToObserve = numSheepToObserveWhenSheepSameOrDiff[
        sheepConcernSelfOnly]

    print(numSheepToObserve)
    sheepModelListOfDiffWolfReward = []
    # 'mixed' loads sheep trained against both individual- and shared-reward
    # wolves; a trajectory later picks among them at random.
    sheepType = 'mixed'
    if sheepType == 'mixed':
        sheepPrefixList = ['maddpgIndividWolf', 'maddpg']
    else:
        sheepPrefixList = [sheepType]
    for sheepPrefix in sheepPrefixList:
        # These ids are loop-invariant, so the observeSheep closure reused
        # after this loop behaves identically for every prefix.
        wolvesIDForSheepObserve = list(range(numWolves))
        sheepsIDForSheepObserve = list(
            range(numWolves, numSheepToObserve + numWolves))
        blocksIDForSheepObserve = list(
            range(numSheepToObserve + numWolves,
                  numSheepToObserve + numWolves + numBlocks))
        observeOneAgentForSheep = lambda agentID: Observe(
            agentID, wolvesIDForSheepObserve, sheepsIDForSheepObserve,
            blocksIDForSheepObserve, getPosFromAgentState,
            getVelFromAgentState)
        observeSheep = lambda state: [
            observeOneAgentForSheep(agentID)(state)
            for agentID in range(numWolves + numSheepToObserve)
        ]
        obsIDsForSheep = wolvesIDForSheepObserve + sheepsIDForSheepObserve + blocksIDForSheepObserve
        # Probe observation on a reset state to size the model inputs.
        initObsForSheepParams = observeSheep(reset()[obsIDsForSheep])
        obsShapeSheep = [
            initObsForSheepParams[obsID].shape[0]
            for obsID in range(len(initObsForSheepParams))
        ]
        buildSheepModels = BuildMADDPGModels(actionDim,
                                             numWolves + numSheepToObserve,
                                             obsShapeSheep)
        sheepModelsList = [
            buildSheepModels(layerWidth, agentID)
            for agentID in range(numWolves, numWolves + numSheepToObserve)
        ]
        dirName = os.path.dirname(__file__)
        maxEpisode = 60000
        print(sheepPrefix)
        sheepFileName = "{}wolves{}sheep{}blocks{}eps_agent".format(
            numWolves, numSheepToObserve, numBlocks, maxEpisode)
        sheepModelPaths = [
            os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                         sheepPrefix + sheepFileName + str(i) + '60000eps')
            for i in range(numWolves, numWolves + numSheepToObserve)
        ]
        [
            restoreVariables(model, path)
            for model, path in zip(sheepModelsList, sheepModelPaths)
        ]
        sheepModelListOfDiffWolfReward = sheepModelListOfDiffWolfReward + sheepModelsList

    # Sheep Policy Function
    reshapeAction = ReshapeAction()
    actOneStepOneModelSheep = ActOneStep(actByPolicyTrainNoisy)

    # Sheep Generate Action
    numAllSheepModels = len(sheepModelListOfDiffWolfReward)

    # Wolves Part
    # Intention Prior For inference: every (sheep, whole-wolf-team) pairing,
    # uniformly weighted.
    intentionSpacesForAllWolves = [
        tuple(it.product(possibleSheepIds, [tuple(possibleWolvesIds)]))
        for wolfId in possibleWolvesIds
    ]
    print(intentionSpacesForAllWolves)
    wolvesIntentionPriors = [{
        tuple(intention): 1 / len(allPossibleIntentionsOneWolf)
        for intention in allPossibleIntentionsOneWolf
    } for allPossibleIntentionsOneWolf in intentionSpacesForAllWolves]

    # Percept Action For Inference: wolves observe each other's actions
    # through Gaussian noise of the same magnitude as their own.
    perceptSelfAction = SampleNoisyAction(deviationFor2DAction)
    perceptOtherAction = SampleNoisyAction(deviationFor2DAction)
    perceptAction = PerceptImaginedWeAction(possibleWolvesIds,
                                            perceptSelfAction,
                                            perceptOtherAction)

    # Policy Likelihood function: Wolf Central Control NN Policy Given Intention
    # ------------ model ------------------------
    weModelsListBaseOnNumInWe = []
    observeListBaseOnNumInWe = []
    for numAgentInWe in range(2, numWolves + 1):
        numBlocksForWe = 5 - numAgentInWe
        wolvesIDForWolfObserve = list(range(numAgentInWe))
        sheepsIDForWolfObserve = list(range(numAgentInWe, 1 + numAgentInWe))
        blocksIDForWolfObserve = list(
            range(1 + numAgentInWe, 1 + numAgentInWe + numBlocksForWe))
        # Bind the loop variables as lambda defaults. Python closures are
        # late-binding, so without this every closure stored in
        # observeListBaseOnNumInWe would use the LAST iteration's ids,
        # breaking runs with numWolves >= 3.
        observeOneAgentForWolf = lambda agentID, \
                wolvesIds=wolvesIDForWolfObserve, \
                sheepsIds=sheepsIDForWolfObserve, \
                blocksIds=blocksIDForWolfObserve: Observe(
            agentID, wolvesIds, sheepsIds, blocksIds, getPosFromAgentState,
            getVelFromAgentState)
        observeWolf = lambda state, \
                observeOneAgent=observeOneAgentForWolf, \
                numInWe=numAgentInWe: [
            observeOneAgent(agentID)(state)
            for agentID in range(numInWe + 1)
        ]
        observeListBaseOnNumInWe.append(observeWolf)

        obsIDsForWolf = wolvesIDForWolfObserve + sheepsIDForWolfObserve + blocksIDForWolfObserve
        # Probe observation on a reset state to size the model inputs.
        initObsForWolfParams = observeWolf(reset()[obsIDsForWolf])
        obsShapeWolf = [
            initObsForWolfParams[obsID].shape[0]
            for obsID in range(len(initObsForWolfParams))
        ]
        buildWolfModels = BuildMADDPGModels(actionDim, numAgentInWe + 1,
                                            obsShapeWolf)
        layerWidthForWolf = [
            64 * (numAgentInWe - 1), 64 * (numAgentInWe - 1)
        ]
        wolfModelsList = [
            buildWolfModels(layerWidthForWolf, agentID)
            for agentID in range(numAgentInWe)
        ]

        if wolfType == 'sharedAgencyByIndividualRewardWolf':
            wolfPrefix = 'maddpgIndividWolf'
        if wolfType == 'sharedAgencyBySharedRewardWolf':
            wolfPrefix = 'maddpg'
        # maxEpisode/dirName come from the sheep loop above (always >= 1 pass).
        wolfFileName = "{}wolves{}sheep{}blocks{}eps_agent".format(
            numAgentInWe, 1, numBlocksForWe, maxEpisode)
        wolfModelPaths = [
            os.path.join(dirName, '..', '..', 'data', 'preTrainModel',
                         wolfPrefix + wolfFileName + str(i) + '60000eps')
            for i in range(numAgentInWe)
        ]
        print(numAgentInWe, obsShapeWolf, wolfModelPaths)
        [
            restoreVariables(model, path)
            for model, path in zip(wolfModelsList, wolfModelPaths)
        ]
        weModelsListBaseOnNumInWe.append(wolfModelsList)

    # Committed-wolf likelihood: Gaussian around the deterministic NN action.
    actionDimReshaped = 2
    cov = [deviationFor2DAction**2 for _ in range(actionDimReshaped)]
    buildGaussian = BuildGaussianFixCov(cov)
    actOneStepOneModelWolf = ActOneStep(actByPolicyTrainNoNoisy)
    composeCentralControlPolicy = lambda observe: ComposeCentralControlPolicyByGaussianOnDeterministicAction(
        reshapeAction, observe, actOneStepOneModelWolf, buildGaussian)
    wolvesCentralControlPolicies = [
        composeCentralControlPolicy(
            observeListBaseOnNumInWe[numAgentsInWe - 2])(
                weModelsListBaseOnNumInWe[numAgentsInWe - 2], numAgentsInWe)
        for numAgentsInWe in range(2, numWolves + 1)
    ]

    centralControlPolicyListBasedOnNumAgentsInWe = wolvesCentralControlPolicies  # 0 for two agents in We, 1 for three agents...
    softPolicyInInference = lambda distribution: distribution
    getStateThirdPersonPerspective = lambda state, goalId, weIds: getStateOrActionThirdPersonPerspective(
        state, goalId, weIds, blocksID)
    policyForCommittedAgentsInInference = PolicyForCommittedAgent(
        centralControlPolicyListBasedOnNumAgentsInWe, softPolicyInInference,
        getStateThirdPersonPerspective)
    concernedAgentsIds = possibleWolvesIds
    calCommittedAgentsPolicyLikelihood = CalCommittedAgentsContinuousPolicyLikelihood(
        concernedAgentsIds, policyForCommittedAgentsInInference,
        rationalityBetaInInference)

    # Uncommitted wolves are modeled as acting uniformly at random.
    randomActionSpace = [(5, 0), (3.5, 3.5), (0, 5), (-3.5, 3.5), (-5, 0),
                         (-3.5, -3.5), (0, -5), (3.5, -3.5), (0, 0)]
    randomPolicy = RandomPolicy(randomActionSpace)
    getStateFirstPersonPerspective = lambda state, goalId, weIds, selfId: getStateOrActionFirstPersonPerspective(
        state, goalId, weIds, selfId, blocksID)
    policyForUncommittedAgentsInInference = PolicyForUncommittedAgent(
        possibleWolvesIds, randomPolicy, softPolicyInInference,
        getStateFirstPersonPerspective)
    calUncommittedAgentsPolicyLikelihood = CalUncommittedAgentsPolicyLikelihood(
        possibleWolvesIds, concernedAgentsIds,
        policyForUncommittedAgentsInInference)

    # Joint Likelihood
    calJointLikelihood = lambda intention, state, perceivedAction: \
        calCommittedAgentsPolicyLikelihood(intention, state, perceivedAction) * \
        calUncommittedAgentsPolicyLikelihood(intention, state, perceivedAction)

    # Infer and update Intention
    variablesForAllWolves = [
        [intentionSpace] for intentionSpace in intentionSpacesForAllWolves
    ]
    jointHypothesisSpaces = [
        pd.MultiIndex.from_product(variables, names=['intention'])
        for variables in variablesForAllWolves
    ]
    concernedHypothesisVariable = ['intention']
    priorDecayRate = 1
    softPrior = SoftDistribution(priorDecayRate)
    inferIntentionOneStepList = [
        InferOneStep(jointHypothesisSpace, concernedHypothesisVariable,
                     calJointLikelihood, softPrior)
        for jointHypothesisSpace in jointHypothesisSpaces
    ]

    if numSheep == 1:
        # With a single sheep the intention is fixed, so inference is a
        # pass-through; one per wolf (was a hard-coded "* 3", which only
        # worked because the zip below truncates).
        inferIntentionOneStepList = [lambda prior, state, action: prior
                                     ] * numWolves

    adjustIntentionPriorGivenValueOfState = lambda state: 1
    chooseIntention = sampleFromDistribution
    updateIntentions = [
        UpdateIntention(intentionPrior, valuePriorEndTime,
                        adjustIntentionPriorGivenValueOfState, perceptAction,
                        inferIntentionOneStep, chooseIntention)
        for intentionPrior, inferIntentionOneStep in zip(
            wolvesIntentionPriors, inferIntentionOneStepList)
    ]

    # reset intention and adjust intention prior attributes tools for multiple trajectory
    intentionResetAttributes = [
        'timeStep', 'lastState', 'lastAction', 'intentionPrior',
        'formerIntentionPriors'
    ]
    intentionResetAttributeValues = [
        dict(
            zip(intentionResetAttributes,
                [0, None, None, intentionPrior, [intentionPrior]]))
        for intentionPrior in wolvesIntentionPriors
    ]
    resetIntentions = ResetObjects(intentionResetAttributeValues,
                                   updateIntentions)
    returnAttributes = ['formerIntentionPriors']
    getIntentionDistributions = GetObjectsValuesOfAttributes(
        returnAttributes, updateIntentions)
    attributesToRecord = ['lastAction']
    recordActionForUpdateIntention = RecordValuesForObjects(
        attributesToRecord, updateIntentions)

    # Wolves Generate Action: tighter Gaussian for acting than for inference.
    covForPlanning = [0.03**2 for _ in range(actionDimReshaped)]
    buildGaussianForPlanning = BuildGaussianFixCov(covForPlanning)
    composeCentralControlPolicyForPlanning = lambda observe: ComposeCentralControlPolicyByGaussianOnDeterministicAction(
        reshapeAction, observe, actOneStepOneModelWolf,
        buildGaussianForPlanning)
    wolvesCentralControlPoliciesForPlanning = [
        composeCentralControlPolicyForPlanning(
            observeListBaseOnNumInWe[numAgentsInWe - 2])(
                weModelsListBaseOnNumInWe[numAgentsInWe - 2], numAgentsInWe)
        for numAgentsInWe in range(2, numWolves + 1)
    ]

    centralControlPolicyListBasedOnNumAgentsInWeForPlanning = wolvesCentralControlPoliciesForPlanning  # 0 for two agents in We, 1 for three agents...
    softPolicyInPlanning = lambda distribution: distribution
    policyForCommittedAgentInPlanning = PolicyForCommittedAgent(
        centralControlPolicyListBasedOnNumAgentsInWeForPlanning,
        softPolicyInPlanning, getStateThirdPersonPerspective)

    policyForUncommittedAgentInPlanning = PolicyForUncommittedAgent(
        possibleWolvesIds, randomPolicy, softPolicyInPlanning,
        getStateFirstPersonPerspective)

    def wolfChooseActionMethod(individualContinuousDistributions):
        # Sample each wolf's 2D action from its Gaussian, pack as a tuple.
        centralControlAction = tuple([
            tuple(sampleFromContinuousSpace(distribution))
            for distribution in individualContinuousDistributions
        ])
        return centralControlAction

    getSelfActionThirdPersonPerspective = lambda weIds, selfId: list(
        weIds).index(selfId)
    chooseCommittedAction = GetActionFromJointActionDistribution(
        wolfChooseActionMethod, getSelfActionThirdPersonPerspective)
    chooseUncommittedAction = sampleFromDistribution
    wolvesSampleIndividualActionGivenIntentionList = [
        SampleIndividualActionGivenIntention(
            selfId, policyForCommittedAgentInPlanning,
            policyForUncommittedAgentInPlanning, chooseCommittedAction,
            chooseUncommittedAction) for selfId in possibleWolvesIds
    ]

    # Sample and Save Trajectory
    trajectoriesWithIntentionDists = []
    for trajectoryId in range(self.numTrajectories):
        # Each sheep gets a model drawn at random from the mixed pool.
        sheepModelsForPolicy = [
            sheepModelListOfDiffWolfReward[np.random.choice(
                numAllSheepModels)] for sheepId in possibleSheepIds
        ]
        if sheepConcernSelfOnly:
            # Deterministic NN action wrapped as a point-mass distribution.
            composeSheepPolicy = lambda sheepModel: lambda state: {
                tuple(
                    reshapeAction(
                        actOneStepOneModelSheep(sheepModel, observeSheep(state)
                                                ))):
                1
            }
            sheepChooseActionMethod = sampleFromDistribution
            sheepSampleActions = [
                SampleActionOnFixedIntention(
                    selfId, possibleWolvesIds, composeSheepPolicy(sheepModel),
                    sheepChooseActionMethod, blocksID)
                for selfId, sheepModel in zip(possibleSheepIds,
                                              sheepModelsForPolicy)
            ]
        else:
            composeSheepPolicy = lambda sheepModel: lambda state: tuple(
                reshapeAction(
                    actOneStepOneModelSheep(sheepModel, observeSheep(state))))
            sheepSampleActions = [
                composeSheepPolicy(sheepModel)
                for sheepModel in sheepModelsForPolicy
            ]

        wolvesSampleActions = [
            SampleActionOnChangableIntention(
                updateIntention, wolvesSampleIndividualActionGivenIntention)
            for updateIntention, wolvesSampleIndividualActionGivenIntention in
            zip(updateIntentions,
                wolvesSampleIndividualActionGivenIntentionList)
        ]
        allIndividualSampleActions = wolvesSampleActions + sheepSampleActions
        sampleActionMultiAgent = SampleActionMultiagent(
            allIndividualSampleActions, recordActionForUpdateIntention)
        trajectory = sampleTrajectory(sampleActionMultiAgent)
        # Append the per-step intention posteriors to each transition record.
        intentionDistributions = getIntentionDistributions()
        trajectoryWithIntentionDists = [
            tuple(list(SASRPair) + list(intentionDist))
            for SASRPair, intentionDist in zip(trajectory,
                                               intentionDistributions)
        ]
        trajectoriesWithIntentionDists.append(
            tuple(trajectoryWithIntentionDists))
        # Restore intention-tracking state before the next trajectory.
        resetIntentions()
    trajectoryFixedParameters = {'maxRunningSteps': maxRunningSteps}
    self.saveTrajectoryByParameters(trajectoriesWithIntentionDists,
                                    trajectoryFixedParameters, parameters)
    print(np.mean([len(tra) for tra in trajectoriesWithIntentionDists]))

    # visualize
    if visualizeTraj:
        wolfColor = np.array([0.85, 0.35, 0.35])
        sheepColor = np.array([0.35, 0.85, 0.35])
        blockColor = np.array([0.25, 0.25, 0.25])
        entitiesColorList = [wolfColor] * numWolves + [
            sheepColor
        ] * numSheep + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents,
                        getPosFromAgentState)
        trajToRender = np.concatenate(trajectoriesWithIntentionDists)
        render(trajToRender)
def main():
    """Sample two-wolf-vs-sheep trajectories with multi-tree MCTS wolves.

    In DEBUG mode parameters are hard-coded; otherwise they come from
    sys.argv (JSON parameter dict, start index, end index).  Skips sampling
    if the target pickle already exists.
    """
    DEBUG = 0
    renderOn = 0
    if DEBUG:
        parametersForTrajectoryPath = {}
        startSampleIndex = 0
        endSampleIndex = 10
        agentId = 1
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)
    else:
        # Invoked by a driver: argv[1] is a JSON dict of parameters,
        # argv[2:4] give the sample-index range.
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['agentId'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex,
                                                      endSampleIndex)

    # check file exists or not
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', '..', 'data', '2wolves1sheep',
        'trainWolvesTwoCenterControlMultiTrees', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)

    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 500
    killzoneRadius = 50
    fixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(
        parametersForTrajectoryPath)

    if not os.path.isfile(trajectorySavePath):
        # Agents: sheep is 0; the two wolves (1, 2) act as one central-control
        # unit addressed by wolvesId.
        numOfAgent = 3
        sheepId = 0
        wolvesId = 1

        wolfOneId = 1
        wolfTwoId = 2
        xPosIndex = [0, 1]
        xBoundary = [0, 600]
        yBoundary = [0, 600]
        getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
        getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
        getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)

        reset = Reset(xBoundary, yBoundary, numOfAgent)

        # Episode ends when either wolf reaches the sheep.
        isTerminalOne = IsTerminal(getWolfOneXPos, getSheepXPos,
                                   killzoneRadius)
        isTerminalTwo = IsTerminal(getWolfTwoXPos, getSheepXPos,
                                   killzoneRadius)
        isTerminal = lambda state: isTerminalOne(state) or isTerminalTwo(state)

        stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
            xBoundary, yBoundary)

        centerControlIndexList = [wolvesId]
        unpackCenterControlAction = UnpackCenterControlAction(
            centerControlIndexList)
        transitionFunction = TransiteForNoPhysicsWithCenterControlAction(
            stayInBoundaryByReflectVelocity)
        numFramesToInterpolate = 3
        transit = TransitWithInterpolateStateWithCenterControlAction(
            numFramesToInterpolate, transitionFunction, isTerminal,
            unpackCenterControlAction)

        # NNGuidedMCTS init
        # NOTE(review): this cInit/cBase/ScoreChild/SelectChild setup is
        # repeated verbatim under "# MCTS" below — one copy is redundant.
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = actionSpace
        # wolfActionSpace = [(10, 0), (0, 10), (-10, 0), (0, -10), (0, 0)]

        preyPowerRatio = 12
        sheepActionSpace = list(
            map(tuple, np.array(actionSpace) * preyPowerRatio))

        predatorPowerRatio = 8
        wolfActionOneSpace = list(
            map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(
            map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        # Joint wolf action space: cartesian product of the two wolves' moves.
        wolvesActionSpace = list(
            product(wolfActionOneSpace, wolfActionTwoSpace))
        # NOTE(review): actionSpaceList is not referenced again in this
        # function — confirm whether it was meant to be passed somewhere.
        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 2 * numOfAgent
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)
        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                           regularizationFactor)

        # load save dir
        NNModelSaveExtension = ''
        sheepNNModelSaveDirectory = os.path.join(
            dirName, '..', '..', '..', '..', 'data', '2wolves1sheep',
            'trainSheepWithTwoHeatSeekingWolves', 'trainedResNNModels')
        sheepNNModelFixedParameters = {
            'agentId': 0,
            'maxRunningSteps': 50,
            'numSimulations': 110,
            'miniBatchSize': 256,
            'learningRate': 0.0001,
        }
        getSheepNNModelSavePath = GetSavePath(sheepNNModelSaveDirectory,
                                              NNModelSaveExtension,
                                              sheepNNModelFixedParameters)

        depth = 9
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        initSheepNNModel = generateSheepModel(sharedWidths * depth,
                                              actionLayerWidths,
                                              valueLayerWidths, resBlockSize,
                                              initializationMethod,
                                              dropoutRate)

        sheepTrainedModelPath = getSheepNNModelSavePath({
            'trainSteps': 50000,
            'depth': depth
        })
        sheepTrainedModel = restoreVariables(initSheepNNModel,
                                             sheepTrainedModelPath)
        sheepPolicy = ApproximatePolicy(sheepTrainedModel, sheepActionSpace)

        # MCTS
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        # prior: uniform over the joint wolf action space.
        getActionPrior = lambda state: {
            action: 1 / len(wolvesActionSpace)
            for action in wolvesActionSpace
        }

        # load chase nn policy
        temperatureInMCTS = 1
        chooseActionInMCTS = SampleAction(temperatureInMCTS)

        def wolvesTransit(state, action):
            # The sheep replies with its NN policy inside the wolves' search.
            return transit(state,
                           [chooseActionInMCTS(sheepPolicy(state)), action])

        # reward function
        aliveBonus = -1 / maxRunningSteps
        deathPenalty = 1
        rewardFunction = reward.RewardFunctionCompete(aliveBonus, deathPenalty,
                                                      isTerminal)

        # initialize children; expand
        initializeChildren = InitializeChildren(wolvesActionSpace,
                                                wolvesTransit, getActionPrior)
        expand = Expand(isTerminal, initializeChildren)

        # random rollout policy
        def rolloutPolicy(state):
            return wolvesActionSpace[np.random.choice(
                range(numWolvesActionSpace))]

        # rollout
        # NOTE(review): rolloutHeuristicWeight = 0 makes both heuristics
        # constant zero-weight — presumably intentional; confirm.
        rolloutHeuristicWeight = 0
        minDistance = 400
        rolloutHeuristic1 = reward.HeuristicDistanceToTarget(
            rolloutHeuristicWeight, getWolfOneXPos, getSheepXPos, minDistance)
        rolloutHeuristic2 = reward.HeuristicDistanceToTarget(
            rolloutHeuristicWeight, getWolfTwoXPos, getSheepXPos, minDistance)
        rolloutHeuristic = lambda state: (rolloutHeuristic1(state) +
                                          rolloutHeuristic2(state)) / 2

        maxRolloutSteps = 15
        rollout = RollOut(rolloutPolicy, maxRolloutSteps, wolvesTransit,
                          rewardFunction, isTerminal, rolloutHeuristic)

        # Wolves plan with several independent MCTS trees whose visit counts
        # are combined into one softmax action distribution.
        numTree = 4
        numSimulationsPerTree = int(numSimulations / numTree)
        wolfPolicy = StochasticMCTS(
            numTree, numSimulationsPerTree, selectChild, expand, rollout,
            backup, establishSoftmaxActionDistFromMultipleTrees)

        # All agents' policies
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        chooseActionList = [chooseGreedyAction, chooseGreedyAction]

        render = None
        if renderOn:
            import pygame as pg
            from pygame.color import THECOLORS
            screenColor = THECOLORS['black']
            circleColorList = [
                THECOLORS['green'], THECOLORS['red'], THECOLORS['red']
            ]
            circleSize = 10
            saveImage = False
            saveImageDir = os.path.join(dirName, '..', '..', '..', '..',
                                        'data', 'demoImg')
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)
            screen = pg.display.set_mode([xBoundary[1], yBoundary[1]])
            render = Render(numOfAgent, xPosIndex, screen, screenColor,
                            circleColorList, circleSize, saveImage,
                            saveImageDir)

        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps, transit,
                                                      isTerminal, reset,
                                                      chooseActionList, render,
                                                      renderOn)
        trajectories = [
            sampleTrajectory(policy)
            for sampleIndex in range(startSampleIndex, endSampleIndex)
        ]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
def main():
    """Sample sheep-vs-two-wolves trajectories and save them as a pickle.

    The sheep is controlled by MCTS (guided by a pretrained central-control
    wolf ResNN policy as the opponent model inside the tree), the two wolves
    by that same pretrained NN policy. Run parameters come either from
    hard-coded DEBUG values or from sys.argv (JSON dict + sample index range).
    Skips all work if the target trajectory file already exists.
    """
    startTime = time.time()
    DEBUG = 0      # 1 = use hard-coded parameters below instead of sys.argv
    renderOn = 0   # passed through to SampleTrajectoryWithRender
    if DEBUG:
        parametersForTrajectoryPath = {}
        startSampleIndex = 5
        endSampleIndex = 8
        agentId = 0
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)
    else:
        # argv[1] is a JSON dict of path parameters; argv[2:4] are the sample range.
        parametersForTrajectoryPath = json.loads(sys.argv[1])
        startSampleIndex = int(sys.argv[2])
        endSampleIndex = int(sys.argv[3])
        agentId = int(parametersForTrajectoryPath['agentId'])
        parametersForTrajectoryPath['sampleIndex'] = (startSampleIndex, endSampleIndex)

    # check file exists or not
    dirName = os.path.dirname(__file__)
    # NOTE(review): 'trainSheepWithPretrrainWolves' looks like a typo ("Pretrrain"),
    # but it is a runtime path that must match the on-disk data layout — do not "fix".
    trajectoriesSaveDirectory = os.path.join(dirName, '..', '..', '..', '..', 'data', 'MADDPG2wolves1sheep', 'trainSheepWithPretrrainWolves', 'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)
    trajectorySaveExtension = '.pickle'
    maxRunningSteps = 50
    numSimulations = 250
    fixedParameters = {'agentId': agentId, 'maxRunningSteps': maxRunningSteps, 'numSimulations': numSimulations}
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory, trajectorySaveExtension, fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(parametersForTrajectoryPath)

    # Only generate trajectories if the output file is not already present.
    if not os.path.isfile(trajectorySavePath):
        # env MDP: agent 0 is the sheep; agents 1,2 are the wolves; no blocks.
        sheepsID = [0]
        wolvesID = [1, 2]
        blocksID = []
        numSheeps = len(sheepsID)
        numWolves = len(wolvesID)
        numBlocks = len(blocksID)
        numAgents = numWolves + numSheeps
        numEntities = numAgents + numBlocks
        sheepSize = 0.05
        wolfSize = 0.075
        blockSize = 0.2
        sheepMaxSpeed = 1.3 * 1   # sheep is faster than the wolves
        wolfMaxSpeed = 1.0 * 1
        blockMaxSpeed = None      # blocks are immovable
        # Per-entity physics lists, ordered sheep(s) then wolves then blocks.
        entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [blockSize] * numBlocks
        entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [wolfMaxSpeed] * numWolves + [blockMaxSpeed] * numBlocks
        entitiesMovableList = [True] * numAgents + [False] * numBlocks
        massList = [1.0] * numEntities

        # The wolves' joint action arrives as one central-control tuple at
        # index 1 and is unpacked into per-wolf actions before the physics step.
        centralControlId = 1
        centerControlIndexList = [centralControlId]
        reshapeAction = UnpackCenterControlAction(centerControlIndexList)
        getCollisionForce = GetCollisionForce()
        applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
        applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList, getCollisionForce, getPosFromAgentState)
        integrateState = IntegrateState(numEntities, entitiesMovableList, massList, entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
        interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)
        numFramesToInterpolate = 1

        def transit(state, action):
            # Apply the chosen action on the first interpolation frame only;
            # subsequent frames (if numFramesToInterpolate > 1) coast with
            # zero action. With the current value of 1 this is a single step.
            for frameIndex in range(numFramesToInterpolate):
                nextState = interpolateState(state, action)
                action = np.array([(0, 0)] * numAgents)
                state = nextState
            return nextState

        # Episodes never terminate early; they run for maxRunningSteps.
        isTerminal = lambda state: False
        isCollision = IsCollision(getPosFromAgentState)
        collisonRewardWolf = 1
        punishForOutOfBound = PunishForOutOfBound()
        # NOTE(review): rewardWolf is constructed but not used below (only
        # rewardSheep feeds the rollout and ForwardOneStep).
        rewardWolf = RewardCentralControlPunishBond(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisonRewardWolf)
        collisonRewardSheep = -1
        rewardSheep = RewardCentralControlPunishBond(sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound, collisonRewardSheep)
        resetState = ResetMultiAgentChasing(numAgents, numBlocks)
        # NOTE(review): observe/observeOneAgent are defined but not used in the
        # visible code below — presumably kept for parity with sibling scripts.
        observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
        observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

        # policy: 8 compass directions plus "stay", scaled per species.
        actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
        wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7), (0, -10), (7, -7), (0, 0)]
        preyPowerRatio = 0.5
        sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))
        predatorPowerRatio = 0.5
        wolfActionOneSpace = list(map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        wolfActionTwoSpace = list(map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
        # Joint (central-control) wolf action space: 9 x 9 = 81 pairs.
        wolvesActionSpace = list(product(wolfActionOneSpace, wolfActionTwoSpace))
        # NOTE(review): actionSpaceList is unused below.
        actionSpaceList = [sheepActionSpace, wolvesActionSpace]

        # neural network init
        numStateSpace = 4 * numEntities   # assumes 2D pos + 2D vel per entity — TODO confirm
        numSheepActionSpace = len(sheepActionSpace)
        numWolvesActionSpace = len(wolvesActionSpace)
        regularizationFactor = 1e-4
        sharedWidths = [128]
        actionLayerWidths = [128]
        valueLayerWidths = [128]
        generateWolvesModel = GenerateModel(numStateSpace, numWolvesActionSpace, regularizationFactor)

        # wolf NN Policy: restore a pretrained central-control ResNN.
        NNModelSaveExtension = ''   # NOTE(review): unused below
        # The hyperparameters encoded in this filename must match the
        # depth/resBlockSize/etc. used to build initWolfNNModel below.
        wolfTrainedModelPath = os.path.join(dirName, '..', '..', '..', '..', 'data', 'MADDPG2wolves1sheep', 'trainWolvesTwoCenterControlAction', 'trainedResNNModels', 'agentId=1_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000')
        depth = 9
        resBlockSize = 2
        dropoutRate = 0.0
        initializationMethod = 'uniform'
        initWolfNNModel = generateWolvesModel(sharedWidths * depth, actionLayerWidths, valueLayerWidths, resBlockSize, initializationMethod, dropoutRate)
        wolfTrainedModel = restoreVariables(initWolfNNModel, wolfTrainedModelPath)
        wolfPolicy = ApproximatePolicy(wolfTrainedModel, wolvesActionSpace)

        # MCTS (for the sheep)
        cInit = 1
        cBase = 100
        calculateScore = ScoreChild(cInit, cBase)
        selectChild = SelectChild(calculateScore)

        # prior: uniform over the sheep's actions.
        getActionPrior = lambda state: {action: 1 / len(sheepActionSpace) for action in sheepActionSpace}

        # load chase nn policy: inside the tree the wolves' reply is sampled
        # from the pretrained NN's distribution.
        chooseActionInMCTS = sampleFromDistribution

        def sheepTransit(state, action):
            # One-step model used by MCTS: sheep action given, wolf action
            # sampled from the NN opponent policy.
            return transit( state, [action, chooseActionInMCTS(wolfPolicy(state))])

        # initialize children; expand
        initializeChildren = InitializeChildren( sheepActionSpace, sheepTransit, getActionPrior)
        # NOTE(review): isTerminal is re-defined here identically to above —
        # redundant but harmless.
        isTerminal = lambda state: False
        expand = Expand(isTerminal, initializeChildren)

        # random rollout policy: uniform-random sheep action paired with a
        # sampled wolf NN action.
        def rolloutPolicy( state):
            return [sheepActionSpace[np.random.choice(range(numSheepActionSpace))],sampleFromDistribution(wolfPolicy(state))]

        # rollout: no heuristic bootstrap, 15 simulated steps max.
        rolloutHeuristic = lambda state: 0
        maxRolloutSteps = 15
        rollout = RollOut(rolloutPolicy, maxRolloutSteps, transit, rewardSheep, isTerminal, rolloutHeuristic)
        sheepPolicy = MCTS(numSimulations, selectChild, expand, rollout, backup, establishSoftmaxActionDist)

        # All agents' policies
        # NOTE(review): policy is unused below — sampleAction re-derives the
        # same action distributions itself.
        policy = lambda state: [sheepPolicy(state), wolfPolicy(state)]
        # Greedy (argmax) action selection for both agents at execution time.
        chooseActionList = [maxFromDistribution, maxFromDistribution]

        def sampleAction(state):
            # Returns [sheepAction, wolvesJointAction] for the current state.
            actionDists = [sheepPolicy(state), wolfPolicy(state)]
            action = [chooseAction(actionDist) for actionDist, chooseAction in zip(actionDists, chooseActionList)]
            return action

        render = lambda state: None   # rendering disabled (renderOn = 0)
        forwardOneStep = ForwardOneStep(transit, rewardSheep)
        sampleTrajectory = SampleTrajectoryWithRender(maxRunningSteps, isTerminal, resetState, forwardOneStep, render, renderOn)
        trajectories = [sampleTrajectory(sampleAction) for sampleIndex in range(startSampleIndex, endSampleIndex)]
        print([len(traj) for traj in trajectories])
        saveToPickle(trajectories, trajectorySavePath)
    # NOTE(review): endTime is not used in the visible code — presumably an
    # elapsed-time report follows outside this view; verify before removing.
    endTime = time.time()