def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum')
    getTrainedModel = lambda: trainModels.actorModel
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # demo & plot
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
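
# ---------------------------------------------------------------------------
# Hedged sketch (not taken from this repository): one plausible implementation
# of the GetExponentialDecayGaussNoise callable used above, assuming it returns
# zero-mean Gaussian exploration noise whose variance starts at
# noiseInitVariance and is multiplied by varianceDiscount on every step after
# runTime exceeds noiseDecayStartStep. The class name matches the script; the
# internals and the scalar noise shape are assumptions for illustration.
import numpy as np


class GetExponentialDecayGaussNoise:
    def __init__(self, noiseInitVariance, varianceDiscount, noiseDecayStartStep, minVar=0.0):
        self.noiseInitVariance = noiseInitVariance
        self.varianceDiscount = varianceDiscount
        self.noiseDecayStartStep = noiseDecayStartStep
        self.minVar = minVar

    def __call__(self, runTime):
        var = self.noiseInitVariance
        if runTime > self.noiseDecayStartStep:
            # decay the exploration variance exponentially once the buffer has warmed up
            decaySteps = runTime - self.noiseDecayStartStep
            var = max(self.noiseInitVariance * self.varianceDiscount ** decaySteps, self.minVar)
        return np.random.normal(0, var)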
def main():
    debug = 0
    if debug:
        numWolves = 3
        numSheeps = 1
        numBlocks = 2
        saveAllmodels = False
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        sampleMethod = '5'
        learningRateSheepCritic = 0.005
        learningRateSheepActor = 0.005
    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = 3
        numSheeps = 1
        numBlocks = 2
        saveAllmodels = False
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        sampleMethod = condition['sampleMethod']
        learningRateSheepCritic = condition['sheepLr']
        learningRateSheepActor = condition['sheepLr']

    print("maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, sheepSpeed: {}x, sampleMethod: {}".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, str(sampleMethod)))

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [blockSize] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheeps + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound)
    rewardWolfIndivid = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardWolfShared = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardFuncIndividWolf = lambda state, action, nextState: \
        list(rewardWolfIndivid(state, action, nextState)) + list(rewardSheep(state, action, nextState))
    rewardFuncSharedWolf = lambda state, action, nextState: \
        list(rewardWolfShared(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList, getCollisionForce, getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList, entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]
    worldDim = 2
    actionDim = worldDim * 2 + 1
    layerWidth = [128, 128]

    # ------------ models ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsListShared = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]
    sheepModel = [modelsListShared[sheepID] for sheepID in sheepsID]
    modelsListIndivid = [buildMADDPGModels(layerWidth, agentID) for agentID in wolvesID] + sheepModel

    trainCriticBySASRWolf = TrainCriticBySASR(actByPolicyTargetNoisyForNextState, learningRateWolfCritic, gamma)
    trainCriticWolf = TrainCritic(trainCriticBySASRWolf)
    trainCriticBySASRSheep = TrainCriticBySASR(actByPolicyTargetNoisyForNextState, learningRateSheepCritic, gamma)
    trainCriticSheep = TrainCritic(trainCriticBySASRSheep)

    trainActorFromSAWolf = TrainActorFromSA(learningRateWolfActor)
    trainActorWolf = TrainActor(trainActorFromSAWolf)
    trainActorFromSASheep = TrainActorFromSA(learningRateSheepActor)
    trainActorSheep = TrainActor(trainActorFromSASheep)

    trainActorList = [trainActorWolf] * numWolves + [trainActorSheep] * numSheeps
    trainCriticList = [trainCriticWolf] * numWolves + [trainCriticSheep] * numSheeps

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)
    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)

    trainMADDPGModelsIndivid = TrainMADDPGModelsWithIterSheep(updateParameters, trainActorList, trainCriticList, sampleBatchFromMemory, startLearn, modelsListIndivid)
    trainMADDPGModelsShared = TrainMADDPGModelsWithIterSheep(updateParameters, trainActorList, trainCriticList, sampleBatchFromMemory, startLearn, modelsListShared)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStepIndivid = lambda allAgentsStates, runTime: [actOneStepOneModel(model, allAgentsStates) for model in modelsListIndivid]
    actOneStepShared = lambda allAgentsStates, runTime: [actOneStepOneModel(model, allAgentsStates) for model in modelsListShared]

    sampleOneStepIndivid = SampleOneStep(transit, rewardFuncIndividWolf)
    sampleOneStepShared = SampleOneStep(transit, rewardFuncSharedWolf)
    runDDPGTimeStepIndivid = RunTimeStep(actOneStepIndivid, sampleOneStepIndivid, trainMADDPGModelsIndivid, observe=observe)
    runDDPGTimeStepShared = RunTimeStep(actOneStepShared, sampleOneStepShared, trainMADDPGModelsShared, observe=observe)

    runEpisodeIndivid = RunEpisode(reset, runDDPGTimeStepIndivid, maxTimeStep, isTerminal)
    runEpisodeShared = RunEpisode(reset, runDDPGTimeStepShared, maxTimeStep, isTerminal)

    getAgentModelIndivid = lambda agentId: lambda: trainMADDPGModelsIndivid.getTrainedModels()[agentId]
    getModelListIndivid = [getAgentModelIndivid(i) for i in range(numAgents)]
    modelSaveRate = 1000

    individStr = 'individ'
    fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}Lr{}SampleMethod{}{}_agent".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, learningRateSheepActor, sampleMethod, individStr)
    modelPath = os.path.join(dirName, '..', 'trainedModels', 'IterTrainSheep_evalSheeplrAndSampleMethod', fileName)
    saveModelsIndivid = [SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath + str(i), saveAllmodels)
                         for i, getTrainedModel in enumerate(getModelListIndivid)]

    getAgentModelShared = lambda agentId: lambda: trainMADDPGModelsShared.getTrainedModels()[agentId]
    getModelListShared = [getAgentModelShared(i) for i in range(numAgents)]

    individStr = 'shared'
    fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}Lr{}SampleMethod{}{}_agent".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, learningRateSheepActor, sampleMethod, individStr)
    modelPath = os.path.join(dirName, '..', 'trainedModels', 'IterTrainSheep_evalSheeplrAndSampleMethod', fileName)
    saveModelsShared = [SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath + str(i), saveAllmodels)
                        for i, getTrainedModel in enumerate(getModelListShared)]

    maddpgIterSheep = RunAlgorithmWithIterSheep(runEpisodeIndivid, runEpisodeShared, maxEpisode, saveModelsIndivid, saveModelsShared, sampleMethod, numAgents)
    replayBufferIndivid = getBuffer(bufferSize)
    replayBufferShared = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpgIterSheep(replayBufferShared, replayBufferIndivid)
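
# ---------------------------------------------------------------------------
# Hedged sketch (an assumption, not this repository's code): a minimal
# SampleFromMemory that draws a uniform random minibatch from the replay
# buffer, which is what the iterative wolf/sheep training above appears to
# rely on via sampleBatchFromMemory.
import random


class SampleFromMemory:
    def __init__(self, minibatchSize):
        self.minibatchSize = minibatchSize

    def __call__(self, memory):
        # uniform sampling without replacement; callers gate on buffer size first
        return random.sample(list(memory), self.minibatchSize)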
def main():
    debug = 1
    if debug:
        numWolves = 2
        numSheeps = 1
        numBlocks = 1
        saveAllmodels = True
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        individualRewardWolf = int(False)
    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheeps = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])
        maxTimeStep = int(condition['maxTimeStep'])
        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = int(condition['individualRewardWolf'])
        saveAllmodels = False

    print("maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, sheepSpeed: {}x, wolfIndividualReward: {}, save all models: {}".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individualRewardWolf, str(saveAllmodels)))

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [blockSize] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheeps + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound)
    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList, isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList, getCollisionForce, getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList, entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)
    isTerminal = lambda state: [False] * numAgents

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]
    worldDim = 2
    actionDim = worldDim * 2 + 1
    layerWidth = [128, 128]

    # ------------ models ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]

    trainCriticBySASR = TrainCriticBySASR(actByPolicyTargetNoisyForNextState, learningRateCritic, gamma)
    trainCritic = TrainCritic(trainCriticBySASR)
    trainActorFromSA = TrainActorFromSA(learningRateActor)
    trainActor = TrainActor(trainActorFromSA)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)
    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)
    trainMADDPGModels = TrainMADDPGModelsWithBuffer(updateParameters, trainActor, trainCritic, sampleBatchFromMemory, startLearn, modelsList)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStep = lambda allAgentsStates, runTime: [actOneStepOneModel(model, allAgentsStates) for model in modelsList]
    sampleOneStep = SampleOneStep(transit, rewardFunc)
    runDDPGTimeStep = RunTimeStep(actOneStep, sampleOneStep, trainMADDPGModels, observe=observe)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    getAgentModel = lambda agentId: lambda: trainMADDPGModels.getTrainedModels()[agentId]
    getModelList = [getAgentModel(i) for i in range(numAgents)]
    modelSaveRate = 1000
    individStr = 'individ' if individualRewardWolf else 'shared'
    fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)
    modelPath = os.path.join(dirName, '..', 'trainedModels', 'maddpg', fileName)
    saveModels = [SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath + str(i), saveAllmodels)
                  for i, getTrainedModel in enumerate(getModelList)]

    maddpg = RunAlgorithm(runEpisode, maxEpisode, saveModels, numAgents)
    replayBuffer = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpg(replayBuffer)
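
# ---------------------------------------------------------------------------
# Hedged sketch (assumption): StartLearn as used above plausibly gates
# learning so that updates begin only after learningStartBufferSize
# transitions have been collected and then recur every learnInterval steps.
# The (runTime) call signature is a guess for illustration.
class StartLearn:
    def __init__(self, learningStartBufferSize, learnInterval):
        self.learningStartBufferSize = learningStartBufferSize
        self.learnInterval = learnInterval

    def __call__(self, runTime):
        # learn only after the warm-up phase, and then once every learnInterval time steps
        return runTime >= self.learningStartBufferSize and runTime % self.learnInterval == 0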
def main():
    debug = 0
    if debug:
        damping = 0.0
        frictionloss = 0.4
        masterForce = 1.0
        numWolves = 1
        numSheeps = 1
        numMasters = 1
        saveAllmodels = True
        maxTimeStep = 25
        visualize = False
    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = 1
        numSheeps = 1
        numMasters = 1
        damping = float(condition['damping'])
        frictionloss = float(condition['frictionloss'])
        masterForce = float(condition['masterForce'])
        maxTimeStep = 25
        visualize = False
        saveAllmodels = True

    print("maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, save all models: {}".format(
        numWolves, numSheeps, numMasters, maxEpisode, maxTimeStep, str(saveAllmodels)))
    print(damping, frictionloss, masterForce)

    modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPGLeasedFixedEnv2',
                               'damping={}_frictionloss={}_masterForce={}'.format(damping, frictionloss, masterForce))
    if not os.path.exists(modelFolder):
        os.makedirs(modelFolder)

    numAgents = numWolves + numSheeps + numMasters
    numEntities = numAgents + numMasters
    wolvesID = [0]
    sheepsID = [1]
    masterID = [2]

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.075
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [blockSize] * numMasters
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound)
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardMaster = lambda state, action, nextState: [-reward for reward in rewardWolf(state, action, nextState)]
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState)) + list(rewardMaster(state, action, nextState))

    makePropertyList = MakePropertyList(transferNumberListToStr)

    geomIds = [1, 2, 3]
    keyNameList = [0, 1]
    valueList = [[damping, damping]] * len(geomIds)
    dampingParameter = makePropertyList(geomIds, keyNameList, valueList)
    changeJointDampingProperty = lambda envDict, geomPropertyDict: changeJointProperty(envDict, geomPropertyDict, '@damping')

    geomIds = [1, 2, 3]
    keyNameList = [0, 1]
    valueList = [[frictionloss, frictionloss]] * len(geomIds)
    frictionlossParameter = makePropertyList(geomIds, keyNameList, valueList)
    changeJointFrictionlossProperty = lambda envDict, geomPropertyDict: changeJointProperty(envDict, geomPropertyDict, '@frictionloss')

    physicsDynamicsPath = os.path.join(dirName, '..', '..', 'environment', 'mujocoEnv', 'rope', 'leasedNew.xml')
    with open(physicsDynamicsPath) as f:
        xml_string = f.read()

    envXmlDict = xmltodict.parse(xml_string.strip())
    envXmlPropertyDictList = [dampingParameter, frictionlossParameter]
    changeEnvXmlPropertyFunctionList = [changeJointDampingProperty, changeJointFrictionlossProperty]
    for propertyDict, changeXmlProperty in zip(envXmlPropertyDictList, changeEnvXmlPropertyFunctionList):
        envXmlDict = changeXmlProperty(envXmlDict, propertyDict)

    envXml = xmltodict.unparse(envXmlDict)
    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)

    qPosInit = (0, ) * 24
    qVelInit = (0, ) * 24
    qPosInitNoise = 0.6
    qVelInitNoise = 0
    numAgent = 3
    tiedAgentId = [0, 2]
    ropePartIndex = list(range(3, 12))
    maxRopePartLength = 0.06
    reset = ResetUniformWithoutXPosForLeashed(physicsSimulation, qPosInit, qVelInit, numAgent, tiedAgentId,
                                              ropePartIndex, maxRopePartLength, qPosInitNoise, qVelInitNoise)

    numSimulationFrames = 10
    isTerminal = lambda state: False
    reshapeActionList = [ReshapeAction(5), ReshapeAction(5), ReshapeAction(masterForce)]
    transit = TransitionFunctionWithoutXPos(physicsSimulation, numSimulationFrames, visualize, isTerminal, reshapeActionList)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, masterID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]
    worldDim = 2
    actionDim = worldDim * 2 + 1
    layerWidth = [128, 128]

    # ------------ models ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]

    trainCriticBySASR = TrainCriticBySASR(actByPolicyTargetNoisyForNextState, learningRateCritic, gamma)
    trainCritic = TrainCritic(trainCriticBySASR)
    trainActorFromSA = TrainActorFromSA(learningRateActor)
    trainActor = TrainActor(trainActorFromSA)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)
    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)
    trainMADDPGModels = TrainMADDPGModelsWithBuffer(updateParameters, trainActor, trainCritic, sampleBatchFromMemory, startLearn, modelsList)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStep = lambda allAgentsStates, runTime: [actOneStepOneModel(model, allAgentsStates) for model in modelsList]
    sampleOneStep = SampleOneStep(transit, rewardFunc)
    runDDPGTimeStep = RunTimeStep(actOneStep, sampleOneStep, trainMADDPGModels, observe=observe)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    getAgentModel = lambda agentId: lambda: trainMADDPGModels.getTrainedModels()[agentId]
    getModelList = [getAgentModel(i) for i in range(numAgents)]
    modelSaveRate = 1000
    fileName = "maddpg{}episodes{}step_agent".format(maxEpisode, maxTimeStep)
    modelPath = os.path.join(modelFolder, fileName)
    saveModels = [SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath + str(i), saveAllmodels)
                  for i, getTrainedModel in enumerate(getModelList)]

    maddpg = RunAlgorithm(runEpisode, maxEpisode, saveModels, numAgents)
    replayBuffer = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpg(replayBuffer)
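
# ---------------------------------------------------------------------------
# Hedged illustration of the xmltodict-based patching used above: attributes of
# a parsed MJCF document are exposed under '@'-prefixed keys, so setting
# '@damping' / '@frictionloss' before unparse() yields a modified XML string.
# The toy XML below is invented for the example; it is not leasedNew.xml.
import xmltodict

exampleXml = """
<mujoco>
  <worldbody>
    <body name="agent0">
      <joint name="agent0_joint" damping="0.0" frictionloss="0.0"/>
    </body>
  </worldbody>
</mujoco>
"""

exampleDict = xmltodict.parse(exampleXml.strip())
joint = exampleDict['mujoco']['worldbody']['body']['joint']
joint['@damping'] = '0.5'          # XML attributes live under '@'-prefixed keys
joint['@frictionloss'] = '0.4'
patchedXml = xmltodict.unparse(exampleDict)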
def main():
    debug = 1
    if debug:
        numWolves = 2
        numSheeps = 4
        numBlocks = 2
        hasWalls = 1.0
        dt = 0.02
        maxTimeStep = 25
        sheepSpeedMultiplier = 1.0
        individualRewardWolf = int(False)
        mujocoVisualize = False
        saveAllmodels = True
    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheeps = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])
        hasWalls = float(condition['hasWalls'])
        dt = float(condition['dt'])
        maxTimeStep = int(condition['maxTimeStep'])
        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = int(condition['individualRewardWolf'])
        saveAllmodels = True
        mujocoVisualize = False

    print("maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, sheepSpeed: {}x, wolfIndividualReward: {}, save all models: {}".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individualRewardWolf, str(saveAllmodels)))

    dataMainFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG')
    modelFolder = os.path.join(
        dataMainFolder, 'dt={}'.format(dt),
        'hasWalls={}_numBlocks={}_numSheeps={}_numWolves={}_individualRewardWolf={}_sheepSpeedMultiplier={}.xml'.format(
            hasWalls, numBlocks, numSheeps, numWolves, individualRewardWolf, sheepSpeedMultiplier))
    if not os.path.exists(modelFolder):
        os.makedirs(modelFolder)

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [blockSize] * numBlocks

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = lambda state: 0  # PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound)
    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList, isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    # ------------ mujocoEnv ------------------------
    physicsDynamicsPath = os.path.join(
        dirName, '..', '..', 'environment', 'mujocoEnv', 'dt={}'.format(dt),
        'hasWalls={}_numBlocks={}_numSheeps={}_numWolves={}.xml'.format(hasWalls, numBlocks, numSheeps, numWolves))
    with open(physicsDynamicsPath) as f:
        xml_string = f.read()

    envXmlDict = xmltodict.parse(xml_string.strip())
    envXml = xmltodict.unparse(envXmlDict)
    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)

    qPosInit = [0, 0] * numAgents
    qVelInit = [0, 0] * numAgents
    qVelInitNoise = 0 * hasWalls
    qPosInitNoise = 0.8 * hasWalls
    getBlockRandomPos = lambda: np.random.uniform(-0.7 * hasWalls, +0.7 * hasWalls, 2)
    getBlockSpeed = lambda: np.zeros(2)

    numQPos = len(physicsSimulation.data.qpos)
    numQVel = len(physicsSimulation.data.qvel)
    sampleAgentsQPos = lambda: np.asarray(qPosInit) + np.random.uniform(low=-qPosInitNoise, high=qPosInitNoise, size=numQPos)
    sampleAgentsQVel = lambda: np.asarray(qVelInit) + np.random.uniform(low=-qVelInitNoise, high=qVelInitNoise, size=numQVel)

    minDistance = 0.2 + 2 * blockSize  # > 2*wolfSize + 2*blockSize
    isOverlap = IsOverlap(minDistance)
    sampleBlockState = SampleBlockState(numBlocks, getBlockRandomPos, getBlockSpeed, isOverlap)
    reset = ResetUniformWithoutXPos(physicsSimulation, numAgents, numBlocks, sampleAgentsQPos, sampleAgentsQVel, sampleBlockState)

    transitTimePerStep = 0.1
    numSimulationFrames = int(transitTimePerStep / dt)
    isTerminal = lambda state: [False] * numAgents
    reshapeAction = ReshapeAction()
    transit = TransitionFunction(physicsSimulation, numAgents, numSimulationFrames, mujocoVisualize, isTerminal, reshapeAction)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]
    worldDim = 2
    actionDim = worldDim * 2 + 1
    layerWidth = [128, 128]

    # ------------ models ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]

    trainCriticBySASR = TrainCriticBySASR(actByPolicyTargetNoisyForNextState, learningRateCritic, gamma)
    trainCritic = TrainCritic(trainCriticBySASR)
    trainActorFromSA = TrainActorFromSA(learningRateActor)
    trainActor = TrainActor(trainActorFromSA)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)
    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)
    trainMADDPGModels = TrainMADDPGModelsWithBuffer(updateParameters, trainActor, trainCritic, sampleBatchFromMemory, startLearn, modelsList)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStep = lambda allAgentsStates, runTime: [actOneStepOneModel(model, allAgentsStates) for model in modelsList]
    sampleOneStep = SampleOneStep(transit, rewardFunc)
    runDDPGTimeStep = RunTimeStep(actOneStep, sampleOneStep, trainMADDPGModels, observe=observe)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    getAgentModel = lambda agentId: lambda: trainMADDPGModels.getTrainedModels()[agentId]
    getModelList = [getAgentModel(i) for i in range(numAgents)]
    modelSaveRate = 1000
    individStr = 'individ' if individualRewardWolf else 'shared'
    fileName = "maddpghasWalls={}{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
        hasWalls, numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)
    modelPath = os.path.join(modelFolder, fileName)
    saveModels = [SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath + str(i), saveAllmodels)
                  for i, getTrainedModel in enumerate(getModelList)]

    maddpg = RunAlgorithm(runEpisode, maxEpisode, saveModels, numAgents)
    replayBuffer = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpg(replayBuffer)
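
# ---------------------------------------------------------------------------
# Hedged sketch (assumption about the interface): an IsOverlap predicate that
# SampleBlockState could use to reject a proposed block position lying within
# minDistance of any already-placed block. The (proposedPos, placedPositions)
# signature is a guess for illustration only.
import numpy as np


class IsOverlap:
    def __init__(self, minDistance):
        self.minDistance = minDistance

    def __call__(self, proposedPos, placedPositions):
        # overlap if the proposed position is too close to any existing block
        return any(np.linalg.norm(np.asarray(proposedPos) - np.asarray(pos)) < self.minDistance
                   for pos in placedPositions)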
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    actorWeightInit = tf.random_uniform_initializer(0, 0.03)
    actorBiasInit = tf.constant_initializer(0.01)
    criticWeightInit = tf.random_uniform_initializer(0, 0.01)
    criticBiasInit = tf.constant_initializer(0.01)
    weightInitializerList = [actorWeightInit, actorBiasInit, criticWeightInit, criticBiasInit]

    buildModel = BuildDDPGModels(stateDim, actionDim, weightInitializerList, actionBound)
    layerWidths = [30]
    writer, model = buildModel(layerWidths)

    trainCriticBySASR = TrainCriticBySASR(learningRateCritic, gamma, writer)
    trainCritic = TrainCritic(reshapeBatchToGetSASR, trainCriticBySASR)
    trainActorFromState = TrainActorFromState(learningRateActor, writer)
    trainActor = TrainActor(reshapeBatchToGetSASR, trainActorFromState)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, model)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, model, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum_newddpg')
    getTrainedModel = lambda: trainModels.getTrainedModels()
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    # demo & plot
    showDemo = False
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
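
# ---------------------------------------------------------------------------
# Hedged sketch (assumption): the soft target-network update that
# UpdateParameters(paramUpdateInterval, tau) presumably performs in the
# scripts above, shown here with plain numpy parameter lists instead of the
# repository's TensorFlow graphs.
import numpy as np


def softUpdateTargetParams(targetParams, trainParams, tau):
    # Polyak averaging: target <- tau * train + (1 - tau) * target
    return [tau * np.asarray(train) + (1 - tau) * np.asarray(target)
            for train, target in zip(trainParams, targetParams)]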