Example #1
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                            isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum')
    getTrainedModel = lambda: trainModels.actorModel
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                          modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # demo & reward plot
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
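Example #1 relies on module-level names (env, the DDPG hyperparameters, seed, observe) that are defined outside the snippet; classes and helpers such as BuildActorModel, SampleFromMemory, angle_normalize and saveVariables come from the project's own modules and are not reproduced here. A minimal sketch of the missing setup, assuming the Gym Pendulum-v0 environment and illustrative hyperparameter values, might be:

# Hypothetical module-level setup assumed by Example #1; values are illustrative.
import os
from collections import deque

import gym
import matplotlib.pyplot as plt

env = gym.make('Pendulum-v0')   # continuous-control task used by the snippet
seed = 1

learningRateActor = 0.001       # actor learning rate
learningRateCritic = 0.001      # critic learning rate
gamma = 0.9                     # reward discount factor
tau = 0.01                      # soft target-network update rate
bufferSize = 10000              # replay-buffer capacity
minibatchSize = 128             # samples drawn per training step
maxTimeStep = 200               # steps per episode
maxEpisode = 200                # training episodes
observe = lambda state: state   # identity observation hook (interface assumed)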
Example #2
def main():
    debug = 0
    if debug:
        numWolves = 3
        numSheeps = 1
        numBlocks = 2
        saveAllmodels = False
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        sampleMethod = '5'
        learningRateSheepCritic = 0.005
        learningRateSheepActor = 0.005

    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = 3
        numSheeps = 1
        numBlocks = 2
        saveAllmodels = False
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        sampleMethod = condition['sampleMethod']
        learningRateSheepCritic = condition['sheepLr']
        learningRateSheepActor = condition['sheepLr']

    print(
        "maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, sheepSpeed: {}x,  sampleMethod: {}"
        .format(numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
                sheepSpeedMultiplier, str(sampleMethod)))

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier

    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheeps + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    rewardWolfIndivid = RewardWolfIndividual(wolvesID, sheepsID,
                                             entitiesSizeList, isCollision)
    rewardWolfShared = RewardWolf(wolvesID, sheepsID, entitiesSizeList,
                                  isCollision)

    rewardFuncIndividWolf = lambda state, action, nextState: \
        list(rewardWolfIndivid(state, action, nextState)) + list(rewardSheep(state, action, nextState))
    rewardFuncSharedWolf = lambda state, action, nextState: \
        list(rewardWolfShared(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState,
                                    getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    isTerminal = lambda state: [False] * numAgents
    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape[0]
        for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    #------------ models ------------------------

    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsListShared = [
        buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)
    ]
    sheepModel = [modelsListShared[sheepID] for sheepID in sheepsID]
    modelsListIndivid = [
        buildMADDPGModels(layerWidth, agentID) for agentID in wolvesID
    ] + sheepModel

    trainCriticBySASRWolf = TrainCriticBySASR(
        actByPolicyTargetNoisyForNextState, learningRateWolfCritic, gamma)
    trainCriticWolf = TrainCritic(trainCriticBySASRWolf)
    trainCriticBySASRSheep = TrainCriticBySASR(
        actByPolicyTargetNoisyForNextState, learningRateSheepCritic, gamma)
    trainCriticSheep = TrainCritic(trainCriticBySASRSheep)

    trainActorFromSAWolf = TrainActorFromSA(learningRateWolfActor)
    trainActorWolf = TrainActor(trainActorFromSAWolf)

    trainActorFromSASheep = TrainActorFromSA(learningRateSheepActor)
    trainActorSheep = TrainActor(trainActorFromSASheep)

    trainActorList = [trainActorWolf] * numWolves + [trainActorSheep] * numSheeps
    trainCriticList = [trainCriticWolf] * numWolves + [trainCriticSheep] * numSheeps

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)

    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)

    trainMADDPGModelsIndivid = TrainMADDPGModelsWithIterSheep(
        updateParameters, trainActorList, trainCriticList,
        sampleBatchFromMemory, startLearn, modelsListIndivid)
    trainMADDPGModelsShared = TrainMADDPGModelsWithIterSheep(
        updateParameters, trainActorList, trainCriticList,
        sampleBatchFromMemory, startLearn, modelsListShared)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStepIndivid = lambda allAgentsStates, runTime: [
        actOneStepOneModel(model, allAgentsStates)
        for model in modelsListIndivid
    ]
    actOneStepShared = lambda allAgentsStates, runTime: [
        actOneStepOneModel(model, allAgentsStates)
        for model in modelsListShared
    ]

    sampleOneStepIndivid = SampleOneStep(transit, rewardFuncIndividWolf)
    sampleOneStepShared = SampleOneStep(transit, rewardFuncSharedWolf)

    runDDPGTimeStepIndivid = RunTimeStep(actOneStepIndivid,
                                         sampleOneStepIndivid,
                                         trainMADDPGModelsIndivid,
                                         observe=observe)
    runDDPGTimeStepShared = RunTimeStep(actOneStepShared,
                                        sampleOneStepShared,
                                        trainMADDPGModelsShared,
                                        observe=observe)

    runEpisodeIndivid = RunEpisode(reset, runDDPGTimeStepIndivid, maxTimeStep,
                                   isTerminal)
    runEpisodeShared = RunEpisode(reset, runDDPGTimeStepShared, maxTimeStep,
                                  isTerminal)

    getAgentModelIndivid = lambda agentId: lambda: trainMADDPGModelsIndivid.getTrainedModels()[agentId]
    getModelListIndivid = [getAgentModelIndivid(i) for i in range(numAgents)]
    modelSaveRate = 1000
    individStr = 'individ'
    fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}Lr{}SampleMethod{}{}_agent".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
        sheepSpeedMultiplier, learningRateSheepActor, sampleMethod, individStr)
    modelPath = os.path.join(dirName, '..', 'trainedModels',
                             'IterTrainSheep_evalSheeplrAndSampleMethod',
                             fileName)
    saveModelsIndivid = [
        SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                  modelPath + str(i), saveAllmodels)
        for i, getTrainedModel in enumerate(getModelListIndivid)
    ]

    getAgentModelShared = lambda agentId: lambda: trainMADDPGModelsShared.getTrainedModels()[agentId]
    getModelListShared = [getAgentModelShared(i) for i in range(numAgents)]
    individStr = 'shared'
    fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}Lr{}SampleMethod{}{}_agent".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
        sheepSpeedMultiplier, learningRateSheepActor, sampleMethod, individStr)
    modelPath = os.path.join(dirName, '..', 'trainedModels',
                             'IterTrainSheep_evalSheeplrAndSampleMethod',
                             fileName)
    saveModelsShared = [
        SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                  modelPath + str(i), saveAllmodels)
        for i, getTrainedModel in enumerate(getModelListShared)
    ]

    maddpgIterSheep = RunAlgorithmWithIterSheep(runEpisodeIndivid,
                                                runEpisodeShared, maxEpisode,
                                                saveModelsIndivid,
                                                saveModelsShared, sampleMethod,
                                                numAgents)

    replayBufferIndivid = getBuffer(bufferSize)
    replayBufferShared = getBuffer(bufferSize)

    meanRewardList, trajectory = maddpgIterSheep(replayBufferShared,
                                                 replayBufferIndivid)
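When debug is 0, this main expects a JSON condition string as its first command-line argument; a hypothetical invocation (the script name is assumed, keys follow the condition lookups above) would be:

# python runMADDPGIterTrainSheep.py '{"sampleMethod": "5", "sheepLr": 0.005}'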
def main():
    debug = 1
    if debug:
        numWolves = 2
        numSheeps = 1
        numBlocks = 1
        saveAllmodels = True
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        individualRewardWolf = int(False)

    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheeps = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])

        maxTimeStep = int(condition['maxTimeStep'])
        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = int(condition['individualRewardWolf'])

        saveAllmodels = False

    print("maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, sheepSpeed: {}x, wolfIndividualReward: {}, save all models: {}".
          format(numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individualRewardWolf, str(saveAllmodels)))


    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [blockSize] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier

    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheeps + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList, isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList, getCollisionForce, getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList, entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)

    isTerminal = lambda state: [False] * numAgents
    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    #------------ models ------------------------

    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]

    trainCriticBySASR = TrainCriticBySASR(actByPolicyTargetNoisyForNextState, learningRateCritic, gamma)
    trainCritic = TrainCritic(trainCriticBySASR)
    trainActorFromSA = TrainActorFromSA(learningRateActor)
    trainActor = TrainActor(trainActorFromSA)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)

    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)

    trainMADDPGModels = TrainMADDPGModelsWithBuffer(updateParameters, trainActor, trainCritic, sampleBatchFromMemory, startLearn, modelsList)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStep = lambda allAgentsStates, runTime: [actOneStepOneModel(model, allAgentsStates) for model in modelsList]

    sampleOneStep = SampleOneStep(transit, rewardFunc)
    runDDPGTimeStep = RunTimeStep(actOneStep, sampleOneStep, trainMADDPGModels, observe=observe)

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    getAgentModel = lambda agentId: lambda: trainMADDPGModels.getTrainedModels()[agentId]
    getModelList = [getAgentModel(i) for i in range(numAgents)]
    modelSaveRate = 1000
    individStr = 'individ' if individualRewardWolf else 'shared'
    fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
        numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)

    modelPath = os.path.join(dirName, '..', 'trainedModels', 'maddpg', fileName)
    saveModels = [SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath + str(i), saveAllmodels) for i, getTrainedModel in enumerate(getModelList)]

    maddpg = RunAlgorithm(runEpisode, maxEpisode, saveModels, numAgents)
    replayBuffer = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpg(replayBuffer)
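The MADDPG mains call getBuffer(bufferSize), which is not defined in these snippets; a minimal sketch consistent with the deque-based replay buffer of Example #1 would be:

# Hypothetical helper assumed by the MADDPG mains.
from collections import deque
getBuffer = lambda bufferSize: deque(maxlen=int(bufferSize))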
def main():
    debug = 0
    if debug:

        damping = 0.0
        frictionloss = 0.4
        masterForce = 1.0

        numWolves = 1
        numSheeps = 1
        numMasters = 1
        saveAllmodels = True
        maxTimeStep = 25
        visualize = False

    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = 1
        numSheeps = 1
        numMasters = 1
        damping = float(condition['damping'])
        frictionloss = float(condition['frictionloss'])
        masterForce = float(condition['masterForce'])

        maxTimeStep = 25
        visualize = False
        saveAllmodels = True
    print(
        "maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps,  save all models: {}"
        .format(numWolves, numSheeps, numMasters, maxEpisode, maxTimeStep,
                str(saveAllmodels)))
    print(damping, frictionloss, masterForce)

    modelFolder = os.path.join(
        dirName, '..', 'trainedModels', 'mujocoMADDPGLeasedFixedEnv2',
        'damping={}_frictionloss={}_masterForce={}'.format(
            damping, frictionloss, masterForce))

    if not os.path.exists(modelFolder):
        os.makedirs(modelFolder)

    numAgents = numWolves + numSheeps + numMasters
    numEntities = numAgents + numMasters
    wolvesID = [0]
    sheepsID = [1]
    masterID = [2]

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.075
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numMasters

    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardMaster = lambda state, action, nextState: [
        -reward for reward in rewardWolf(state, action, nextState)
    ]
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))+list(rewardMaster(state, action, nextState))

    makePropertyList = MakePropertyList(transferNumberListToStr)

    geomIds = [1, 2, 3]
    keyNameList = [0, 1]
    valueList = [[damping, damping]] * len(geomIds)
    dampingParameter = makePropertyList(geomIds, keyNameList, valueList)

    changeJointDampingProperty = lambda envDict, geomPropertyDict: changeJointProperty(
        envDict, geomPropertyDict, '@damping')

    geomIds = [1, 2, 3]
    keyNameList = [0, 1]
    valueList = [[frictionloss, frictionloss]] * len(geomIds)
    frictionlossParameter = makePropertyList(geomIds, keyNameList, valueList)
    changeJointFrictionlossProperty = lambda envDict, geomPropertyDict: changeJointProperty(
        envDict, geomPropertyDict, '@frictionloss')

    physicsDynamicsPath = os.path.join(dirName, '..', '..', 'environment',
                                       'mujocoEnv', 'rope', 'leasedNew.xml')
    with open(physicsDynamicsPath) as f:
        xml_string = f.read()

    envXmlDict = xmltodict.parse(xml_string.strip())
    envXmlPropertyDictList = [dampingParameter, frictionlossParameter]
    changeEnvXmlPropertyFunctionList = [
        changeJointDampingProperty, changeJointFrictionlossProperty
    ]
    for propertyDict, changeXmlProperty in zip(
            envXmlPropertyDictList, changeEnvXmlPropertyFunctionList):
        envXmlDict = changeXmlProperty(envXmlDict, propertyDict)

    envXml = xmltodict.unparse(envXmlDict)
    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)

    qPosInit = (0, ) * 24
    qVelInit = (0, ) * 24
    qPosInitNoise = 0.6
    qVelInitNoise = 0
    numAgent = 3
    tiedAgentId = [0, 2]
    ropePartIndex = list(range(3, 12))
    maxRopePartLength = 0.06
    reset = ResetUniformWithoutXPosForLeashed(physicsSimulation, qPosInit,
                                              qVelInit, numAgent, tiedAgentId,
                                              ropePartIndex, maxRopePartLength,
                                              qPosInitNoise, qVelInitNoise)

    numSimulationFrames = 10
    isTerminal = lambda state: False
    reshapeActionList = [
        ReshapeAction(5),
        ReshapeAction(5),
        ReshapeAction(masterForce)
    ]
    transit = TransitionFunctionWithoutXPos(physicsSimulation,
                                            numSimulationFrames, visualize,
                                            isTerminal, reshapeActionList)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              masterID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]
    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape[0]
        for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    #------------ models ------------------------

    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [
        buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)
    ]

    trainCriticBySASR = TrainCriticBySASR(actByPolicyTargetNoisyForNextState,
                                          learningRateCritic, gamma)
    trainCritic = TrainCritic(trainCriticBySASR)
    trainActorFromSA = TrainActorFromSA(learningRateActor)
    trainActor = TrainActor(trainActorFromSA)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)

    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)

    trainMADDPGModels = TrainMADDPGModelsWithBuffer(updateParameters,
                                                    trainActor, trainCritic,
                                                    sampleBatchFromMemory,
                                                    startLearn, modelsList)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStep = lambda allAgentsStates, runTime: [
        actOneStepOneModel(model, allAgentsStates) for model in modelsList
    ]

    sampleOneStep = SampleOneStep(transit, rewardFunc)
    runDDPGTimeStep = RunTimeStep(actOneStep,
                                  sampleOneStep,
                                  trainMADDPGModels,
                                  observe=observe)

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    getAgentModel = lambda agentId: lambda: trainMADDPGModels.getTrainedModels()[agentId]
    getModelList = [getAgentModel(i) for i in range(numAgents)]
    modelSaveRate = 1000
    fileName = "maddpg{}episodes{}step_agent".format(maxEpisode, maxTimeStep)

    modelPath = os.path.join(modelFolder, fileName)

    saveModels = [
        SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                  modelPath + str(i), saveAllmodels)
        for i, getTrainedModel in enumerate(getModelList)
    ]

    maddpg = RunAlgorithm(runEpisode, maxEpisode, saveModels, numAgents)
    replayBuffer = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpg(replayBuffer)
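The non-debug branch of this main reads damping, frictionloss and masterForce from a JSON condition; a hypothetical invocation (script name assumed) would be:

# python runMADDPGLeashedMujoco.py '{"damping": 0.5, "frictionloss": 0.4, "masterForce": 1.0}'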
def main():
    debug = 1
    if debug:

        numWolves = 2
        numSheeps = 4
        numBlocks = 2
        hasWalls = 1.0

        dt = 0.02
        maxTimeStep = 25
        sheepSpeedMultiplier = 1.0
        individualRewardWolf = int(False)

        mujocoVisualize = False
        saveAllmodels = True

    else:

        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheeps = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])
        hasWalls = float(condition['hasWalls'])

        dt = float(condition['dt'])
        maxTimeStep = int(condition['maxTimeStep'])
        sheepSpeedMultiplier = float(condition['sheepSpeedMultiplier'])
        individualRewardWolf = int(condition['individualRewardWolf'])

        saveAllmodels = True
        mujocoVisualize = False

    print(
        "maddpg: {} wolves, {} sheep, {} blocks, {} episodes with {} steps each eps, sheepSpeed: {}x, wolfIndividualReward: {}, save all models: {}"
        .format(numWolves, numSheeps, numBlocks, maxEpisode,
                maxTimeStep, sheepSpeedMultiplier, individualRewardWolf,
                str(saveAllmodels)))

    dataMainFolder = os.path.join(dirName, '..', 'trainedModels',
                                  'mujocoMADDPG')
    modelFolder = os.path.join(
        dataMainFolder, 'dt={}'.format(dt),
        'hasWalls={}_numBlocks={}_numSheeps={}_numWolves={}_individualRewardWolf={}_sheepSpeedMultiplier={}.xml'
        .format(hasWalls, numBlocks, numSheeps, numWolves,
                individualRewardWolf, sheepSpeedMultiplier))

    if not os.path.exists(modelFolder):
        os.makedirs(modelFolder)

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numBlocks

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = lambda state: 0  # PunishForOutOfBound() replaced with a no-op: no out-of-bound penalty
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)
    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList,
                                          isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList,
                                isCollision)
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    #------------ mujocoEnv ------------------------

    physicsDynamicsPath = os.path.join(
        dirName, '..', '..', 'environment', 'mujocoEnv', 'dt={}'.format(dt),
        'hasWalls={}_numBlocks={}_numSheeps={}_numWolves={}.xml'.format(
            hasWalls, numBlocks, numSheeps, numWolves))

    with open(physicsDynamicsPath) as f:
        xml_string = f.read()
    envXmlDict = xmltodict.parse(xml_string.strip())
    envXml = xmltodict.unparse(envXmlDict)
    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)

    qPosInit = [0, 0] * numAgents
    qVelInit = [0, 0] * numAgents
    qVelInitNoise = 0 * hasWalls
    qPosInitNoise = 0.8 * hasWalls
    getBlockRandomPos = lambda: np.random.uniform(-0.7 * hasWalls, +0.7 *
                                                  hasWalls, 2)
    getBlockSpeed = lambda: np.zeros(2)

    numQPos = len(physicsSimulation.data.qpos)
    numQVel = len(physicsSimulation.data.qvel)

    sampleAgentsQPos = lambda: np.asarray(qPosInit) + np.random.uniform(
        low=-qPosInitNoise, high=qPosInitNoise, size=numQPos)
    sampleAgentsQVel = lambda: np.asarray(qVelInit) + np.random.uniform(
        low=-qVelInitNoise, high=qVelInitNoise, size=numQVel)

    minDistance = 0.2 + 2 * blockSize  #>2*wolfSize+2*blockSize
    isOverlap = IsOverlap(minDistance)
    sampleBlockState = SampleBlockState(numBlocks, getBlockRandomPos,
                                        getBlockSpeed, isOverlap)

    reset = ResetUniformWithoutXPos(physicsSimulation, numAgents, numBlocks,
                                    sampleAgentsQPos, sampleAgentsQVel,
                                    sampleBlockState)

    transitTimePerStep = 0.1
    numSimulationFrames = int(transitTimePerStep / dt)

    isTerminal = lambda state: [False] * numAgents
    reshapeAction = ReshapeAction()
    transit = TransitionFunction(physicsSimulation, numAgents,
                                 numSimulationFrames, mujocoVisualize,
                                 isTerminal, reshapeAction)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]
    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape[0]
        for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    #------------ models ------------------------

    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [
        buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)
    ]

    trainCriticBySASR = TrainCriticBySASR(actByPolicyTargetNoisyForNextState,
                                          learningRateCritic, gamma)
    trainCritic = TrainCritic(trainCriticBySASR)
    trainActorFromSA = TrainActorFromSA(learningRateActor)
    trainActor = TrainActor(trainActorFromSA)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    sampleBatchFromMemory = SampleFromMemory(minibatchSize)

    learnInterval = 100
    learningStartBufferSize = minibatchSize * maxTimeStep
    startLearn = StartLearn(learningStartBufferSize, learnInterval)

    trainMADDPGModels = TrainMADDPGModelsWithBuffer(updateParameters,
                                                    trainActor, trainCritic,
                                                    sampleBatchFromMemory,
                                                    startLearn, modelsList)

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    actOneStep = lambda allAgentsStates, runTime: [
        actOneStepOneModel(model, allAgentsStates) for model in modelsList
    ]

    sampleOneStep = SampleOneStep(transit, rewardFunc)
    runDDPGTimeStep = RunTimeStep(actOneStep,
                                  sampleOneStep,
                                  trainMADDPGModels,
                                  observe=observe)

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    getAgentModel = lambda agentId: lambda: trainMADDPGModels.getTrainedModels()[agentId]
    getModelList = [getAgentModel(i) for i in range(numAgents)]
    modelSaveRate = 1000
    individStr = 'individ' if individualRewardWolf else 'shared'
    fileName = "maddpghasWalls={}{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
        hasWalls, numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
        sheepSpeedMultiplier, individStr)

    modelPath = os.path.join(modelFolder, fileName)

    saveModels = [
        SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                  modelPath + str(i), saveAllmodels)
        for i, getTrainedModel in enumerate(getModelList)
    ]

    maddpg = RunAlgorithm(runEpisode, maxEpisode, saveModels, numAgents)
    replayBuffer = getBuffer(bufferSize)
    meanRewardList, trajectory = maddpg(replayBuffer)
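A hypothetical invocation of the non-debug branch of this main (script name assumed; keys follow the condition lookups above) would be:

# python runMADDPGMujocoWithWalls.py '{"numWolves": 2, "numSheeps": 4, "numBlocks": 2, "hasWalls": 1.0, "dt": 0.02, "maxTimeStep": 25, "sheepSpeedMultiplier": 1.0, "individualRewardWolf": 0}'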
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow)/2

    actorWeightInit = tf.random_uniform_initializer(0, 0.03)
    actorBiasInit = tf.constant_initializer(0.01)
    criticWeightInit = tf.random_uniform_initializer(0, 0.01)
    criticBiasInit = tf.constant_initializer(0.01)

    weightInitializerList = [actorWeightInit, actorBiasInit, criticWeightInit, criticBiasInit]
    buildModel = BuildDDPGModels(stateDim, actionDim, weightInitializerList, actionBound)
    layerWidths = [30]
    writer, model = buildModel(layerWidths)

    trainCriticBySASR = TrainCriticBySASR(learningRateCritic, gamma, writer)
    trainCritic = TrainCritic(reshapeBatchToGetSASR, trainCriticBySASR)

    trainActorFromState = TrainActorFromState(learningRateActor, writer)
    trainActor = TrainActor(reshapeBatchToGetSASR, trainActorFromState)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, model)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, model, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum_newddpg')
    getTrainedModel = lambda: trainModels.getTrainedModels()
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    # demo & reward plot
    showDemo = False
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
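Every example defines main() but omits the entry point; to run any of these scripts directly, the standard guard applies:

if __name__ == '__main__':
    main()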