def setUp(self):
    """Build the shared fixtures used by the trajectory-processing tests."""
    self.sheepId = 0
    self.actionIndex = 1
    self.decay = 1
    # Eight fixed movement actions (roughly the compass directions).
    self.actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0),
                        (-7, -7), (0, -10), (7, -7)]

    def encodeActionAsOneHot(action):
        # 1 at the slot whose action matches element-wise, 0 elsewhere.
        return np.asarray([
            1 if (np.array(action) == np.array(candidate)).all() else 0
            for candidate in self.actionSpace
        ])

    self.actionToOneHot = encodeActionAsOneHot
    # Constant per-step rewards keep accumulated values easy to predict.
    self.rewardFunction = lambda s, a: 1
    self.anotherRewardFunction = lambda s, a: -1
    self.accumulateRewards = AccumulateRewards(self.decay,
                                               self.rewardFunction)
    self.accumulateMultipleAgentRewards = AccumulateMultiAgentRewards(
        self.decay, [self.rewardFunction, self.anotherRewardFunction])
    self.addValuesToTrajectory = AddValuesToTrajectory(
        self.accumulateRewards)
    # The terminal action lives at index 1 of the last transition tuple.
    self.getTerminalActionFromTrajectory = (
        lambda trajectory: trajectory[-1][1])
    self.removeTerminalTupleFromTrajectory = (
        RemoveTerminalTupleFromTrajectory(
            self.getTerminalActionFromTrajectory))
    self.processTrajectoryForPolicyValueNet = (
        ProcessTrajectoryForPolicyValueNet(self.actionToOneHot,
                                           self.sheepId))

    def tuplesMatch(left, right):
        # numpy-aware element equality, plus a length check so a shorter
        # tuple is never reported equal to a longer one.
        elementsEqual = all(
            np.array_equal(a, b) for a, b in zip(left, right))
        return elementsEqual and len(left) == len(right)

    self.compareTuples = tuplesMatch

    def trajectoriesMatch(left, right):
        pairsEqual = all(
            self.compareTuples(a, b) for a, b in zip(left, right))
        return pairsEqual and len(left) == len(right)

    self.compareTrajectories = trajectoriesMatch
def testAddValuesToTraj(self, traj, decay, groundTruthTrajWithValues):
    """Verify AddValuesToTrajectory annotates each transition with the
    accumulated reward implied by the given decay."""
    # Rebuild the accumulator/annotator pair around the decay under test.
    self.accumulateRewards = AccumulateRewards(decay, self.rewardFunction)
    self.addValuesToTrajectory = AddValuesToTrajectory(
        self.accumulateRewards)
    annotated = self.addValuesToTrajectory(traj)
    # Compare the first four slots of every transition against the
    # ground-truth trajectory, position by position.
    for actual, expected in zip(annotated, groundTruthTrajWithValues):
        self.assertEqual(actual[0:4], expected[0:4])
def testAddMultiAgentValuesToTraj(self, traj, decay,
                                  groundTruthTrajWithValues):
    """Verify AddValuesToTrajectory with a multi-agent accumulator: the
    per-agent value vector at slot 3 must match the ground truth."""
    accRewards = AccumulateMultiAgentRewards(
        decay, [self.rewardFunction, self.anotherRewardFunction])
    self.addValuesToTrajectory = AddValuesToTrajectory(accRewards)
    annotated = self.addValuesToTrajectory(traj)
    for actual, expected in zip(annotated, groundTruthTrajWithValues):
        # Slots 0-2 (state, action, dist) compare with plain equality;
        # slot 3 holds an array of per-agent values, so use np.all.
        self.assertEqual(actual[0:3], expected[0:3])
        self.assertTrue(np.all(actual[3] == expected[3]))
def main():
    """Evaluate center-control wolf policies: (optionally) generate
    trajectories per condition, compute accumulated-reward statistics and
    survival-length histograms, and plot both."""
    # manipulated variables
    manipulatedVariables = OrderedDict()
    manipulatedVariables['dataSize'] = [1000, 2000, 3000]
    manipulatedVariables['depth'] = [5, 9]
    # NOTE(review): this grid has 6 trainSteps values (0..50000 step 10000);
    # the bar-chart labels further down only cover 4 of them — verify.
    manipulatedVariables['trainSteps'] = list(range(0, 50001, 10000))
    levelNames = list(manipulatedVariables.keys())
    levelValues = list(manipulatedVariables.values())
    # One DataFrame row per condition in the full cartesian product.
    modelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=modelIndex)
    killzoneRadius = 25
    maxRunningSteps = 100
    numSimulations = 200
    # accumulate rewards for trajectories
    numOfAgent = 3
    sheepId = 0
    wolvesId = 1
    wolfOneId = 1
    wolfTwoId = 2
    xPosIndex = [0, 1]  # x/y position slots within an agent's state vector
    xBoundary = [0, 600]
    yBoundary = [0, 600]
    getSheepXPos = GetAgentPosFromState(sheepId, xPosIndex)
    getWolfOneXPos = GetAgentPosFromState(wolfOneId, xPosIndex)
    getWolfTwoXPos = GetAgentPosFromState(wolfTwoId, xPosIndex)
    isTerminalOne = IsTerminal(getWolfOneXPos, getSheepXPos, killzoneRadius)
    isTerminalTwo = IsTerminal(getWolfTwoXPos, getSheepXPos, killzoneRadius)
    # Episode ends when either wolf catches the sheep.
    playIsTerminal = lambda state: isTerminalOne(state) or isTerminalTwo(
        state)
    stayInBoundaryByReflectVelocity = StayInBoundaryByReflectVelocity(
        xBoundary, yBoundary)
    # NOTE(review): `transit` (and `numOfAgent`, `wolvesId`) are assigned
    # but never used in this function — possibly leftovers.
    transit = TransiteForNoPhysics(stayInBoundaryByReflectVelocity)
    playAliveBonus = -1 / maxRunningSteps  # small per-step penalty for sheep survival
    playDeathPenalty = 1
    playKillzoneRadius = killzoneRadius
    playReward = RewardFunctionCompete(playAliveBonus, playDeathPenalty,
                                       playIsTerminal)
    decay = 1  # no discounting
    accumulateRewards = AccumulateRewards(decay, playReward)
    addValuesToTrajectory = AddValuesToTrajectory(accumulateRewards)
    # generate trajectory parallel
    generateTrajectoriesCodeName = 'generateCenterControlTrajectoryByCondition.py'
    evalNumTrials = 500
    numCpuCores = os.cpu_count()
    numCpuToUse = int(0.75 * numCpuCores)  # leave headroom on the machine
    numCmdList = min(evalNumTrials, numCpuToUse)
    generateTrajectoriesParallel = GenerateTrajectoriesParallel(
        generateTrajectoriesCodeName, evalNumTrials, numCmdList)
    # run all trials and save trajectories
    generateTrajectoriesParallelFromDf = lambda df: generateTrajectoriesParallel(
        readParametersFromDf(df))
    # Generation is disabled here; this run only analyses existing data.
    # toSplitFrame.groupby(levelNames).apply(generateTrajectoriesParallelFromDf)

    # save evaluation trajectories
    dirName = os.path.dirname(__file__)
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', '..', 'data', 'evaluateSupervisedLearning',
        'multiMCTSAgentResNetNoPhysicsCenterControl',
        'evaluateCenterControlTrajByCondition')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)
    trajectoryExtension = '.pickle'
    trajectoryFixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }
    getTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                        trajectoryExtension,
                                        trajectoryFixedParameters)
    getTrajectorySavePathFromDf = lambda df: getTrajectorySavePath(
        readParametersFromDf(df))
    # compute statistics on the trajectories
    fuzzySearchParameterNames = []
    loadTrajectories = LoadTrajectories(getTrajectorySavePath,
                                        loadFromPickle,
                                        fuzzySearchParameterNames)
    loadTrajectoriesFromDf = lambda df: loadTrajectories(
        readParametersFromDf(df))
    # Measurement = accumulated reward of the whole episode (index 0 of
    # the per-step accumulated-reward sequence).
    measurementFunction = lambda trajectory: accumulateRewards(trajectory)[0]
    computeStatistics = ComputeStatistics(loadTrajectoriesFromDf,
                                          measurementFunction)
    statisticsDf = toSplitFrame.groupby(levelNames).apply(computeStatistics)

    def calculateSuriveRatio(trajectory):
        # Bucket the episode length into [0,50), [50,100), [100,inf).
        # NOTE(review): function/variable names contain typos
        # ("Surive", "lenght"); `and` works here only because the
        # operands are 0-d arrays with a defined truth value.
        lenght = np.array(len(trajectory))
        count = np.array(
            [lenght < 50, lenght >= 50 and lenght < 100, lenght >= 100])
        return count

    computeNumbers = ComputeStatistics(loadTrajectoriesFromDf,
                                       calculateSuriveRatio)
    df = toSplitFrame.groupby(levelNames).apply(computeNumbers)
    print(df)
    # --- bar chart of survival-length buckets -------------------------
    fig = plt.figure()
    numRows = 1
    numColumns = 1
    plotCounter = 1
    axForDraw = fig.add_subplot(numRows, numColumns, plotCounter)
    xlabel = ['0-50', '50-100', '100-150']
    x = np.arange(len(xlabel))
    numTrials = 500
    yMean = df['mean'].tolist()
    # NOTE(review): standard error divisor is sqrt(numTrials) - 1; the
    # usual formula would be sqrt(numTrials - 1) — confirm intent.
    yRrr = np.array(df['std'].tolist()) / (np.sqrt(numTrials) - 1)
    totalWidth, n = 0.6, 3
    width = totalWidth / n
    x = x - (totalWidth - width) / 2  # shift so grouped bars are centered
    # NOTE(review): labels assume rows 0-3 are trainSteps 0/10000/30000/
    # 50000, but the grid above steps by 10000 — rows 2 and 3 would be
    # 20000 and 30000. Verify which conditions df actually contains.
    plt.bar(x, yMean[0], yerr=yRrr[0], width=width, label='trainStep0')
    plt.bar(x + width,
            yMean[1],
            yerr=yRrr[1],
            width=width,
            label='trainStep10000')
    plt.bar(x + width * 2,
            yMean[2],
            yerr=yRrr[2],
            width=width,
            label='trainStep30000')
    plt.bar(x + width * 3,
            yMean[3],
            yerr=yRrr[3],
            width=width,
            label='trainStep50000')
    plt.suptitle('dataSize 3000')
    plt.xticks(x, xlabel)
    plt.ylim(0, 1)
    plt.xlabel('living steps')
    plt.legend(loc='best')
    # plt.show()

    # plot the results
    # --- per-condition accumulated-reward curves ----------------------
    fig = plt.figure()
    numRows = len(manipulatedVariables['depth'])
    numColumns = len(manipulatedVariables['dataSize'])
    plotCounter = 1
    print(statisticsDf)
    for depth, grp in statisticsDf.groupby('depth'):
        grp.index = grp.index.droplevel('depth')
        for dataSize, group in grp.groupby('dataSize'):
            group.index = group.index.droplevel('dataSize')
            axForDraw = fig.add_subplot(numRows, numColumns, plotCounter)
            # Label only the leftmost column and the top row.
            if plotCounter % numColumns == 1:
                axForDraw.set_ylabel('depth: {}'.format(depth))
            if plotCounter <= numColumns:
                axForDraw.set_title('dataSize: {}'.format(dataSize))
            axForDraw.set_ylim(-1, 1)
            # plt.ylabel('Accumulated rewards')
            maxTrainSteps = manipulatedVariables['trainSteps'][-1]
            # Horizontal baseline for the pure-MCTS opponent.
            # NOTE(review): '--m' sets a color in the fmt string AND
            # color= is passed — recent matplotlib raises on this; confirm
            # the installed version tolerates it.
            plt.plot([0, maxTrainSteps], [0.354] * 2,
                     '--m',
                     color="#1C2833",
                     label='pure MCTS')
            group.plot(ax=axForDraw,
                       y='mean',
                       yerr='std',
                       marker='o',
                       logx=False)
            plotCounter += 1
    plt.suptitle('center control wolves')
    plt.legend(loc='best')
    plt.show()
def trainOneCondition(manipulatedVariables):
    """Train a center-control wolves ResNet for one hyper-parameter
    condition: load pickled trajectories, annotate them with accumulated
    rewards, convert to supervised (state, action-one-hot, value) batches,
    then run TrainModelForConditions over the requested train-step
    intervals, saving checkpoints.

    manipulatedVariables: dict-like with at least 'depth' (network depth).
    Side effects: creates data/model directories, reads trajectory
    pickles, writes trained-model checkpoints, prints progress.
    """
    depth = int(manipulatedVariables['depth'])

    # Get dataset for training
    # FIX: this was `DIRNAME = os.path.dirname(__file__)` while every use
    # below referenced the undefined name `dirName`, which raises
    # NameError at runtime. Bind the name the code actually uses.
    dirName = os.path.dirname(__file__)
    dataSetDirectory = os.path.join(dirName, '..', '..', '..', '..', 'data',
                                    'NoPhysics2wolves1sheep',
                                    'trainWolvesTwoCenterControlAction88',
                                    'trajectories')
    if not os.path.exists(dataSetDirectory):
        os.makedirs(dataSetDirectory)
    dataSetExtension = '.pickle'
    dataSetMaxRunningSteps = 50
    dataSetNumSimulations = 250
    killzoneRadius = 150
    agentId = 1
    wolvesId = 1
    dataSetFixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': dataSetMaxRunningSteps,
        'numSimulations': dataSetNumSimulations,
        'killzoneRadius': killzoneRadius
    }
    getDataSetSavePath = GetSavePath(dataSetDirectory, dataSetExtension,
                                     dataSetFixedParameters)
    print("DATASET LOADED!")
    numOfAgent = 3

    # accumulate rewards for trajectories
    decay = 1  # no discounting
    # NOTE(review): elsewhere in this file AccumulateRewards is built with
    # (decay, rewardFunction); the single-argument call here relies on the
    # class providing a default reward — confirm.
    accumulateRewards = AccumulateRewards(decay)
    addValuesToTrajectory = AddValuesToTrajectory(accumulateRewards)

    # pre-process the trajectories
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 10
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))
    predatorPowerRatio = 8
    wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7)]
    wolfActionOneSpace = list(
        map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
    wolfActionTwoSpace = list(
        map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
    # Joint (wolf1, wolf2) action space: 8 x 8 = 64 combinations.
    wolvesActionSpace = list(it.product(wolfActionOneSpace,
                                        wolfActionTwoSpace))
    numActionSpace = len(wolvesActionSpace)
    actionIndex = 1  # action lives at slot 1 of each transition tuple
    actionToOneHot = ActionToOneHot(wolvesActionSpace)
    getTerminalActionFromTrajectory = lambda trajectory: trajectory[-1][
        actionIndex]
    removeTerminalTupleFromTrajectory = RemoveTerminalTupleFromTrajectory(
        getTerminalActionFromTrajectory)
    processTrajectoryForNN = ProcessTrajectoryForPolicyValueNet(
        actionToOneHot, wolvesId)
    preProcessTrajectories = PreProcessTrajectories(
        addValuesToTrajectory, removeTerminalTupleFromTrajectory,
        processTrajectoryForNN)

    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectories = LoadTrajectories(getDataSetSavePath, loadFromPickle,
                                        fuzzySearchParameterNames)
    loadedTrajectories = loadTrajectories(parameters={})
    print(loadedTrajectories[0])
    # Keep only the first numOfAgent agents' states in each time step.
    filterState = lambda timeStep: (timeStep[0][:numOfAgent], timeStep[1],
                                    timeStep[2], timeStep[3])
    trajectories = [[filterState(timeStep) for timeStep in trajectory]
                    for trajectory in loadedTrajectories]
    print(len(trajectories))
    preProcessedTrajectories = np.concatenate(
        preProcessTrajectories(trajectories))
    # Transpose per-step tuples into per-variable batches for training.
    trainData = [list(varBatch) for varBatch in zip(*preProcessedTrajectories)]
    valuedTrajectories = [addValuesToTrajectory(tra) for tra in trajectories]

    # neural network init and save path
    numStateSpace = 6
    regularizationFactor = 1e-4
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    generateModel = GenerateModel(numStateSpace, numActionSpace,
                                  regularizationFactor)
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    # `depth` repeats the shared 128-wide layer that many times.
    sheepNNModel = generateModel(sharedWidths * depth, actionLayerWidths,
                                 valueLayerWidths, resBlockSize,
                                 initializationMethod, dropoutRate)

    initTimeStep = 0
    valueIndex = 3
    # Sanity statistic: mean accumulated reward at the first time step.
    trainDataMeanAccumulatedReward = np.mean(
        [tra[initTimeStep][valueIndex] for tra in valuedTrajectories])
    print(trainDataMeanAccumulatedReward)

    # function to train NN model
    terminalThreshold = 1e-10
    lossHistorySize = 10
    initActionCoeff = 1
    initValueCoeff = 1
    initCoeff = (initActionCoeff, initValueCoeff)
    afterActionCoeff = 1
    afterValueCoeff = 1
    afterCoeff = (afterActionCoeff, afterValueCoeff)
    # Never stop early; always run the full step budget.
    terminalController = lambda evalDict, numSteps: False
    coefficientController = CoefficientCotroller(initCoeff, afterCoeff)
    reportInterval = 10000
    trainStepsIntervel = 10000
    trainReporter = TrainReporter(trainStepsIntervel, reportInterval)
    learningRateDecay = 1
    learningRateDecayStep = 1
    learningRateModifier = lambda learningRate: LearningRateModifier(
        learningRate, learningRateDecay, learningRateDecayStep)
    # NOTE(review): `sampleData` is not defined in this function; it is
    # presumably a module-level helper — verify it exists at import time.
    getTrainNN = lambda batchSize, learningRate: Train(
        trainStepsIntervel, batchSize, sampleData,
        learningRateModifier(learningRate), terminalController,
        coefficientController, trainReporter)

    # get path to save trained models
    NNModelFixedParameters = {
        'agentId': agentId,
        'maxRunningSteps': dataSetMaxRunningSteps,
        'numSimulations': dataSetNumSimulations
    }
    NNModelSaveDirectory = os.path.join(dirName, '..', '..', '..', '..',
                                        'data', 'NoPhysics2wolves1sheep',
                                        'trainWolvesTwoCenterControlAction88',
                                        'trainedResNNModels')
    if not os.path.exists(NNModelSaveDirectory):
        os.makedirs(NNModelSaveDirectory)
    NNModelSaveExtension = ''
    getNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                     NNModelSaveExtension,
                                     NNModelFixedParameters)

    # function to train models
    numOfTrainStepsIntervel = 6
    trainIntervelIndexes = list(range(numOfTrainStepsIntervel))
    trainModelForConditions = TrainModelForConditions(trainIntervelIndexes,
                                                      trainStepsIntervel,
                                                      trainData, sheepNNModel,
                                                      getTrainNN,
                                                      getNNModelSavePath)
    trainModelForConditions(manipulatedVariables)
def main():
    """Evaluate supervised wolf NN policies in the random-obstacle
    environment: generate trajectories for every hyper-parameter
    condition, compute accumulated-reward statistics, and plot a grid of
    performance curves (depth x learningRate)."""
    # important parameters
    # manipulated variables
    manipulatedVariables = OrderedDict()
    manipulatedVariables['miniBatchSize'] = [64, 256]
    manipulatedVariables['learningRate'] = [1e-3, 1e-4, 1e-5]
    manipulatedVariables['depth'] = [5, 9, 17]  #[4,8,16]#
    manipulatedVariables['trainSteps'] = [0, 5000, 10000, 20000, 50000]
    levelNames = list(manipulatedVariables.keys())
    levelValues = list(manipulatedVariables.values())
    # One DataFrame row per condition in the full cartesian product.
    modelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=modelIndex)

    # accumulate rewards for trajectories
    sheepId = 0
    wolfId = 1
    xPosIndex = [0, 1]  # x/y position slots within an agent's state vector
    getSheepPos = GetAgentPosFromState(sheepId, xPosIndex)
    getWolfPos = GetAgentPosFromState(wolfId, xPosIndex)
    killzoneRadius = 2
    numSimulations = 150
    maxRunningSteps = 30
    agentId = 1
    playAliveBonus = -1 / maxRunningSteps  # per-step penalty
    playDeathPenalty = 1
    playKillzoneRadius = killzoneRadius
    playIsTerminal = IsTerminal(playKillzoneRadius, getSheepPos, getWolfPos)
    playReward = RewardFunctionCompete(playAliveBonus, playDeathPenalty,
                                       playIsTerminal)
    decay = 1  # no discounting
    accumulateRewards = AccumulateRewards(decay, playReward)
    addValuesToTrajectory = AddValuesToTrajectory(accumulateRewards)

    # generate trajectory parallel
    # Alternative generators kept for reference:
    # generateTrajectoriesCodeName = 'generateWolfResNNEvaluationTrajectoryFixObstacle.py'
    # generateTrajectoriesCodeName = 'generateWolfNNEvaluationTrajectoryFixObstacle.py'
    # generateTrajectoriesCodeName = 'generateWolfResNNEvaluationTrajectoryMovedObstacle.py'
    generateTrajectoriesCodeName = 'generateWolfResNNEvaluationTrajectoryRandomObstacle.py'
    # generateTrajectoriesCodeName = 'generateWolfNNEvaluationTrajectoryRandomObstacle.py'
    evalNumTrials = 100
    numCpuCores = os.cpu_count()
    numCpuToUse = int(0.75 * numCpuCores)  # leave headroom on the machine
    numCmdList = min(evalNumTrials, numCpuToUse)
    generateTrajectoriesParallel = GenerateTrajectoriesParallel(
        generateTrajectoriesCodeName, evalNumTrials, numCmdList)

    # run all trials and save trajectories
    generateTrajectoriesParallelFromDf = lambda df: generateTrajectoriesParallel(
        readParametersFromDf(df))
    # This actually launches trajectory generation for every condition.
    toSplitFrame.groupby(levelNames).apply(generateTrajectoriesParallelFromDf)

    # save evaluation trajectories
    dirName = os.path.dirname(__file__)
    dataFolderName = os.path.join(dirName, '..', '..', '..', 'data',
                                  'multiAgentTrain', 'MCTSRandomObstacle')
    trajectoryDirectory = os.path.join(
        dataFolderName, 'evaluationTrajectoriesResNNWithObstacle')
    if not os.path.exists(trajectoryDirectory):
        os.makedirs(trajectoryDirectory)
    trajectoryExtension = '.pickle'
    trajectoryFixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius,
        'agentId': agentId
    }
    getTrajectorySavePath = GetSavePath(trajectoryDirectory,
                                        trajectoryExtension,
                                        trajectoryFixedParameters)
    getTrajectorySavePathFromDf = lambda df: getTrajectorySavePath(
        readParametersFromDf(df))

    # compute statistics on the trajectories
    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectories = LoadTrajectories(getTrajectorySavePath,
                                        loadFromPickle,
                                        fuzzySearchParameterNames)
    loadTrajectoriesFromDf = lambda df: loadTrajectories(
        readParametersFromDf(df))
    # Measurement = accumulated reward of the whole episode.
    measurementFunction = lambda trajectory: accumulateRewards(trajectory)[0]
    computeStatistics = ComputeStatistics(loadTrajectoriesFromDf,
                                          measurementFunction)
    statisticsDf = toSplitFrame.groupby(levelNames).apply(computeStatistics)
    print(statisticsDf)

    # Earlier grid kept for reference:
    # manipulatedVariables['miniBatchSize'] = [64, 128]
    # manipulatedVariables['learningRate'] = [ 1e-3,1e-4,1e-5]
    # manipulatedVariables['depth'] = [4,8,16]
    # manipulatedVariables['trainSteps']=[0,20000,40000,60000,100000,180000]

    # plot the results
    # Subplot grid: rows = depth values, columns = learning rates.
    # NOTE(review): each `group` below still carries the miniBatchSize and
    # trainSteps index levels; drawPerformanceLine presumably separates
    # them into individual curves — confirm.
    fig = plt.figure()
    numRows = len(manipulatedVariables['depth'])
    numColumns = len(manipulatedVariables['learningRate'])
    plotCounter = 1
    selfId = 0
    for depth, grp in statisticsDf.groupby('depth'):
        grp.index = grp.index.droplevel('depth')
        for learningRate, group in grp.groupby('learningRate'):
            group.index = group.index.droplevel('learningRate')
            axForDraw = fig.add_subplot(numRows, numColumns, plotCounter)
            # Label only the leftmost column and the top row.
            if (plotCounter % numColumns == 1) or numColumns == 1:
                axForDraw.set_ylabel('depth: {}'.format(depth))
            if plotCounter <= numColumns:
                axForDraw.set_title('learningRate: {}'.format(learningRate))
            axForDraw.set_ylim(-1, 1)
            drawPerformanceLine(group, axForDraw, selfId)
            plotCounter += 1
    plt.suptitle('SupervisedNNWolfwithRandomWallState')
    plt.legend(loc='best')
    plt.show()
def iterateTrainOneCondition(parameterOneCondition):
    """Run iterative self-play training for one condition in the
    2-wolves / 1-sheep MADDPG-style environment: restore pretrained
    sheep/wolves ResNets, seed a replay buffer from iteration-0
    trajectories, then alternate trajectory generation and per-agent NN
    training for up to numIterations, checkpointing and pruning old
    models along the way.

    parameterOneCondition: dict-like with 'numTrainStepEachIteration' and
    'numTrajectoriesPerIteration'.
    Side effects: creates directories, reads/writes pickles and model
    checkpoints, deletes stale checkpoints, prints progress.
    """
    numTrainStepEachIteration = int(
        parameterOneCondition['numTrainStepEachIteration'])
    numTrajectoriesPerIteration = int(
        parameterOneCondition['numTrajectoriesPerIteration'])
    dirName = os.path.dirname(__file__)
    numOfAgent = 2
    agentIds = list(range(numOfAgent))
    maxRunningSteps = 50
    numSimulations = 250
    killzoneRadius = 50
    fixedParameters = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius
    }

    # env MDP
    sheepsID = [0]
    wolvesID = [1, 2]
    blocksID = []  # no obstacle blocks in this configuration
    numSheeps = len(sheepsID)
    numWolves = len(wolvesID)
    numBlocks = len(blocksID)
    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    sheepSize = 0.05
    wolfSize = 0.075
    blockSize = 0.2
    sheepMaxSpeed = 1.3 * 1
    wolfMaxSpeed = 1.0 * 1
    blockMaxSpeed = None  # blocks never move
    entitiesSizeList = [sheepSize] * numSheeps + [wolfSize] * numWolves + [
        blockSize
    ] * numBlocks
    entityMaxSpeedList = [sheepMaxSpeed] * numSheeps + [
        wolfMaxSpeed
    ] * numWolves + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    # The two wolves are driven by one central controller at this index.
    centralControlId = 1
    centerControlIndexList = [centralControlId]
    reshapeAction = UnpackCenterControlAction(centerControlIndexList)
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList,
                                    massList, entityMaxSpeedList,
                                    getVelFromAgentState,
                                    getPosFromAgentState)
    interpolateState = TransitMultiAgentChasing(numEntities, reshapeAction,
                                                applyActionForce,
                                                applyEnvironForce,
                                                integrateState)
    numFramesToInterpolate = 1

    def transit(state, action):
        # Apply the action on the first frame, then coast with zero
        # actions for the remaining interpolation frames.
        # NOTE(review): `nextState` is unbound if numFramesToInterpolate
        # were ever set to 0; it is 1 here so the loop always runs.
        for frameIndex in range(numFramesToInterpolate):
            nextState = interpolateState(state, action)
            action = np.array([(0, 0)] * numAgents)
            state = nextState
        return nextState

    isTerminal = lambda state: False  # episodes end only by step limit
    isCollision = IsCollision(getPosFromAgentState)
    collisonRewardWolf = 1
    punishForOutOfBound = PunishForOutOfBound()
    rewardWolf = RewardCentralControlPunishBond(
        wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBound, collisonRewardWolf)
    collisonRewardSheep = -1
    rewardSheep = RewardCentralControlPunishBond(
        sheepsID, wolvesID, entitiesSizeList, getPosFromAgentState,
        isCollision, punishForOutOfBound, collisonRewardSheep)
    resetState = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # policy
    actionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                   (0, -10), (7, -7), (0, 0)]
    wolfActionSpace = [(10, 0), (7, 7), (0, 10), (-7, 7), (-10, 0), (-7, -7),
                       (0, -10), (7, -7), (0, 0)]
    preyPowerRatio = 0.5
    sheepActionSpace = list(map(tuple, np.array(actionSpace) * preyPowerRatio))
    predatorPowerRatio = 0.5
    wolfActionOneSpace = list(
        map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
    wolfActionTwoSpace = list(
        map(tuple, np.array(wolfActionSpace) * predatorPowerRatio))
    # Joint (wolf1, wolf2) action space: 9 x 9 = 81 combinations.
    wolvesActionSpace = list(it.product(wolfActionOneSpace,
                                        wolfActionTwoSpace))
    actionSpaceList = [sheepActionSpace, wolvesActionSpace]

    # neural network init
    numStateSpace = 4 * numEntities  # pos(2) + vel(2) per entity — TODO confirm
    numSheepActionSpace = len(sheepActionSpace)
    numWolvesActionSpace = len(wolvesActionSpace)
    regularizationFactor = 1e-4
    sharedWidths = [128]
    actionLayerWidths = [128]
    valueLayerWidths = [128]
    generateSheepModel = GenerateModel(numStateSpace, numSheepActionSpace,
                                       regularizationFactor)
    generateWolvesModel = GenerateModel(numStateSpace, numWolvesActionSpace,
                                        regularizationFactor)
    generateModelList = [generateSheepModel, generateWolvesModel]
    sheepDepth = 9
    wolfDepth = 9
    depthList = [sheepDepth, wolfDepth]
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    # NOTE(review): this exact model-list construction is repeated below
    # (after startTime); the first build is discarded — likely redundant.
    multiAgentNNmodel = [
        generateModel(sharedWidths * depth, actionLayerWidths,
                      valueLayerWidths, resBlockSize, initializationMethod,
                      dropoutRate)
        for depth, generateModel in zip(depthList, generateModelList)
    ]

    # replay buffer
    bufferSize = 20000
    saveToBuffer = SaveToBuffer(bufferSize)

    def getUniformSamplingProbabilities(buffer):
        # Every buffered sample is equally likely to be drawn.
        return [(1 / len(buffer)) for _ in buffer]

    miniBatchSize = 512
    sampleBatchFromBuffer = SampleBatchFromBuffer(
        miniBatchSize, getUniformSamplingProbabilities)

    # pre-process the trajectory for replayBuffer
    rewardMultiAgents = [rewardSheep, rewardWolf]
    decay = 1  # no discounting
    accumulateMultiAgentRewards = AccumulateMultiAgentRewards(decay)
    addMultiAgentValuesToTrajectory = AddValuesToTrajectory(
        accumulateMultiAgentRewards)
    actionIndex = 1  # action lives at slot 1 of each transition tuple

    def getTerminalActionFromTrajectory(trajectory):
        return trajectory[-1][actionIndex]

    removeTerminalTupleFromTrajectory = RemoveTerminalTupleFromTrajectory(
        getTerminalActionFromTrajectory)

    # pre-process the trajectory for NNTraining
    sheepActionToOneHot = ActionToOneHot(sheepActionSpace)
    wolvesActionToOneHot = ActionToOneHot(wolvesActionSpace)
    actionToOneHotList = [sheepActionToOneHot, wolvesActionToOneHot]
    processTrajectoryForPolicyValueNets = [
        ProcessTrajectoryForPolicyValueNetMultiAgentReward(
            actionToOneHotList[agentId], agentId) for agentId in agentIds
    ]

    # function to train NN model
    terminalThreshold = 1e-6
    lossHistorySize = 10
    initActionCoeff = 1
    initValueCoeff = 1
    initCoeff = (initActionCoeff, initValueCoeff)
    afterActionCoeff = 1
    afterValueCoeff = 1
    afterCoeff = (afterActionCoeff, afterValueCoeff)
    terminalController = TrainTerminalController(lossHistorySize,
                                                 terminalThreshold)
    coefficientController = CoefficientCotroller(initCoeff, afterCoeff)
    reportInterval = 10000
    trainStepsIntervel = 1  # 10000
    trainReporter = TrainReporter(numTrainStepEachIteration, reportInterval)
    learningRateDecay = 1
    learningRateDecayStep = 1
    learningRate = 0.0001
    learningRateModifier = LearningRateModifier(learningRate,
                                                learningRateDecay,
                                                learningRateDecayStep)
    # NOTE(review): `sampleData` is not defined in this function; it is
    # presumably a module-level helper — verify it exists at import time.
    trainNN = Train(numTrainStepEachIteration, miniBatchSize, sampleData,
                    learningRateModifier, terminalController,
                    coefficientController, trainReporter)

    # load save dir
    trajectorySaveExtension = '.pickle'
    NNModelSaveExtension = ''
    trajectoriesSaveDirectory = os.path.join(
        dirName, '..', '..', 'data', 'iterTrain2wolves1sheepMADDPGEnv',
        'trajectories')
    if not os.path.exists(trajectoriesSaveDirectory):
        os.makedirs(trajectoriesSaveDirectory)
    NNModelSaveDirectory = os.path.join(dirName, '..', '..', 'data',
                                        'iterTrain2wolves1sheepMADDPGEnv',
                                        'NNModelRes')
    if not os.path.exists(NNModelSaveDirectory):
        os.makedirs(NNModelSaveDirectory)
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                             trajectorySaveExtension,
                                             fixedParameters)
    generateNNModelSavePath = GetSavePath(NNModelSaveDirectory,
                                          NNModelSaveExtension,
                                          fixedParameters)

    startTime = time.time()
    # Second (duplicate) construction of the per-agent models; this is the
    # list actually used from here on.
    sheepDepth = 9
    wolfDepth = 9
    depthList = [sheepDepth, wolfDepth]
    resBlockSize = 2
    dropoutRate = 0.0
    initializationMethod = 'uniform'
    multiAgentNNmodel = [
        generateModel(sharedWidths * depth, actionLayerWidths,
                      valueLayerWidths, resBlockSize, initializationMethod,
                      dropoutRate)
        for depth, generateModel in zip(depthList, generateModelList)
    ]

    preprocessMultiAgentTrajectories = PreprocessTrajectoriesForBuffer(
        addMultiAgentValuesToTrajectory, removeTerminalTupleFromTrajectory)
    numTrajectoriesToStartTrain = 1024
    trainOneAgent = TrainOneAgent(numTrainStepEachIteration,
                                  numTrajectoriesToStartTrain,
                                  processTrajectoryForPolicyValueNets,
                                  sampleBatchFromBuffer, trainNN)

    # restorePretrainModel
    sheepPreTrainModelPath = os.path.join(
        dirName, '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainSheepWithPretrrainWolves', 'trainedResNNModels',
        'agentId=0_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000'
    )
    wolvesPreTrainModelPath = os.path.join(
        dirName, '..', '..', 'data', 'MADDPG2wolves1sheep',
        'trainWolvesTwoCenterControlAction', 'trainedResNNModels',
        'agentId=1_depth=9_learningRate=0.0001_maxRunningSteps=50_miniBatchSize=256_numSimulations=250_trainSteps=50000'
    )
    pretrainModelPathList = [sheepPreTrainModelPath, wolvesPreTrainModelPath]
    sheepId, wolvesId = [0, 1]
    trainableAgentIds = [sheepId, wolvesId]
    # Restore each agent from its pretrained checkpoint and re-save it as
    # the iteration-0 checkpoint of this run.
    for agentId in trainableAgentIds:
        restoredNNModel = restoreVariables(multiAgentNNmodel[agentId],
                                           pretrainModelPathList[agentId])
        multiAgentNNmodel[agentId] = restoredNNModel
        NNModelPathParameters = {
            'iterationIndex': 0,
            'agentId': agentId,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        }
        NNModelSavePath = generateNNModelSavePath(NNModelPathParameters)
        saveVariables(multiAgentNNmodel[agentId], NNModelSavePath)

    fuzzySearchParameterNames = ['sampleIndex']
    loadTrajectoriesForParallel = LoadTrajectories(generateTrajectorySavePath,
                                                   loadFromPickle,
                                                   fuzzySearchParameterNames)
    loadTrajectoriesForTrainBreak = LoadTrajectories(
        generateTrajectorySavePath, loadFromPickle)

    # initRreplayBuffer
    replayBuffer = []
    trajectoryBeforeTrainIndex = 0
    trajectoryBeforeTrainPathParamters = {
        'iterationIndex': trajectoryBeforeTrainIndex
    }
    trajectoriesBeforeTrain = loadTrajectoriesForParallel(
        trajectoryBeforeTrainPathParamters)
    preProcessedTrajectoriesBeforeTrain = preprocessMultiAgentTrajectories(
        trajectoriesBeforeTrain)
    replayBuffer = saveToBuffer(replayBuffer,
                                preProcessedTrajectoriesBeforeTrain)

    # delete used model for disk space
    fixedParametersForDelete = {
        'maxRunningSteps': maxRunningSteps,
        'numSimulations': numSimulations,
        'killzoneRadius': killzoneRadius,
        'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
        'numTrainStepEachIteration': numTrainStepEachIteration
    }
    toDeleteNNModelExtensionList = ['.meta', '.index', '.data-00000-of-00001']
    generatetoDeleteNNModelPathList = [
        GetSavePath(NNModelSaveDirectory, toDeleteNNModelExtension,
                    fixedParametersForDelete)
        for toDeleteNNModelExtension in toDeleteNNModelExtensionList
    ]

    # restore model
    # Supports resuming from a checkpointed iteration; with
    # restoredIteration = 0 this re-loads the models saved just above.
    restoredIteration = 0
    for agentId in trainableAgentIds:
        modelPathForRestore = generateNNModelSavePath({
            'iterationIndex': restoredIteration,
            'agentId': agentId,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        })
        restoredNNModel = restoreVariables(multiAgentNNmodel[agentId],
                                           modelPathForRestore)
        multiAgentNNmodel[agentId] = restoredNNModel

    # restore buffer
    # Reload trajectories from all iterations before the restore point
    # (empty range when restoredIteration == 0).
    bufferTrajectoryPathParameters = {
        'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
        'numTrainStepEachIteration': numTrainStepEachIteration
    }
    restoredIterationIndexRange = range(restoredIteration)
    restoredTrajectories = loadTrajectoriesForTrainBreak(
        parameters=bufferTrajectoryPathParameters,
        parametersWithSpecificValues={
            'iterationIndex': list(restoredIterationIndexRange)
        })
    preProcessedRestoredTrajectories = preprocessMultiAgentTrajectories(
        restoredTrajectories)
    print(len(preProcessedRestoredTrajectories))
    replayBuffer = saveToBuffer(replayBuffer,
                                preProcessedRestoredTrajectories)

    modelMemorySize = 5
    modelSaveFrequency = 50
    deleteUsedModel = DeleteUsedModel(modelMemorySize, modelSaveFrequency,
                                      generatetoDeleteNNModelPathList)
    numIterations = 10000
    # Main loop: generate trajectories with current models, grow the
    # buffer, train each agent, checkpoint, and prune old checkpoints.
    for iterationIndex in range(restoredIteration + 1, numIterations):
        print('iterationIndex: ', iterationIndex)
        numCpuToUseWhileTrain = int(16)
        numCmdList = min(numTrajectoriesPerIteration, numCpuToUseWhileTrain)
        sampleTrajectoryFileName = 'sampleMultiMCTSAgentCenterControlResNetTrajCondtion.py'
        generateTrajectoriesParallelWhileTrain = GenerateTrajectoriesParallel(
            sampleTrajectoryFileName, numTrajectoriesPerIteration, numCmdList)
        trajectoryPathParameters = {
            'iterationIndex': iterationIndex,
            'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
            'numTrainStepEachIteration': numTrainStepEachIteration
        }
        cmdList = generateTrajectoriesParallelWhileTrain(
            trajectoryPathParameters)
        trajectories = loadTrajectoriesForParallel(trajectoryPathParameters)
        trajectorySavePath = generateTrajectorySavePath(
            trajectoryPathParameters)
        saveToPickle(trajectories, trajectorySavePath)
        preProcessedTrajectories = preprocessMultiAgentTrajectories(
            trajectories)
        updatedReplayBuffer = saveToBuffer(replayBuffer,
                                           preProcessedTrajectories)
        for agentId in trainableAgentIds:
            updatedAgentNNModel = trainOneAgent(agentId, multiAgentNNmodel,
                                                updatedReplayBuffer)
            NNModelPathParameters = {
                'iterationIndex': iterationIndex,
                'agentId': agentId,
                'numTrajectoriesPerIteration': numTrajectoriesPerIteration,
                'numTrainStepEachIteration': numTrainStepEachIteration
            }
            NNModelSavePath = generateNNModelSavePath(NNModelPathParameters)
            saveVariables(updatedAgentNNModel, NNModelSavePath)
            multiAgentNNmodel[agentId] = updatedAgentNNModel
            replayBuffer = updatedReplayBuffer
            deleteUsedModel(iterationIndex, agentId)

    endTime = time.time()
    print("Time taken for {} iterations: {} seconds".format(
        numIterations, (endTime - startTime)))