Exemple #1
0
def main():
    independentVariables = OrderedDict()
    # independentVariables['bufferSize'] = [1000, 5000, 10000, 20000]
    # independentVariables['layerWidth'] = [[32], [32, 32], [64, 64, 64], [128, 128, 128, 128]]
    # independentVariables['varianceDiscount'] = [0.995, 0.9995, 0.99995]

    independentVariables['bufferSize'] = [1000, 5000]
    independentVariables['layerWidth'] = [(32, 32), (32, 32, 32)]
    independentVariables['varianceDiscount'] = [0.995, 0.9995]

    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension)

    evaluate = EvaluateNoiseAndMemorySize(getSavePath, saveModel=False)

    levelNames = list(independentVariables.keys())
    levelValues = list(independentVariables.values())
    levelIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
    toSplitFrame = pd.DataFrame(index=levelIndex)
    resultDF = toSplitFrame.groupby(levelNames).apply(evaluate)
    resultPath = os.path.join(
        dirName, '..', '..', 'plots',
        str(list(independentVariables.keys())) + '.pickle')
    saveToPickle(resultDF, resultPath)

    nCols = len(independentVariables['bufferSize'])
    nRows = len(independentVariables['layerWidth'])
    numplot = 1
    axs = []
    figure = plt.figure(dpi=200)
    figure.set_size_inches(7.5, 6)
    figure.suptitle('buffer: ' + str(independentVariables['bufferSize']) +
                    ', layers: ' + str(independentVariables['layerWidth']) +
                    ', varDiscount: ' +
                    str(independentVariables['varianceDiscount']),
                    fontsize=8)

    for layerWidth, outterSubDf in resultDF.groupby('layerWidth'):
        for bufferSize, innerSubDf in outterSubDf.groupby('bufferSize'):
            subplot = figure.add_subplot(nRows, nCols, numplot)
            axs.append(subplot)
            numplot += 1
            plt.ylim([-800, 400])
            innerSubDf.T.plot(ax=subplot)

            subplot.set_title('layerWidth = ' + str(layerWidth) +
                              ' bufferSize = ' + str(bufferSize),
                              fontsize=5)
            subplot.set_ylabel('MeanEpsReward', fontsize=5)
            subplot.set_xlabel('Episode', fontsize=5)
            subplot.tick_params(axis='both', which='major', labelsize=5)
            subplot.tick_params(axis='both', which='minor', labelsize=5)
            plt.legend(loc='best', prop={'size': 5})

    plotPath = os.path.join(dirName, '..', '..', 'plots')
    plt.savefig(os.path.join(plotPath, str(list(independentVariables.keys()))))
    plt.show()
Exemple #2
0
    def __call__(self, env):
        agent = Agent(
            alpha=self.hyperparamDict['actorLR'],
            beta=self.hyperparamDict['criticLR'],
            input_dims=[env.observation_space.shape[0]],
            tau=self.hyperparamDict['tau'],
            env=env_norm(env) if self.hyperparamDict['normalizeEnv'] else env,
            n_actions=env.action_space.shape[0],
            units1=self.hyperparamDict['actorHiddenLayersWidths'],
            units2=self.hyperparamDict['criticHiddenLayersWidths'],
            actoractivationfunction=self.hyperparamDict['actorActivFunction'],
            actorHiddenLayersWeightInit=self.
            hyperparamDict['actorHiddenLayersWeightInit'],
            actorHiddenLayersBiasInit=self.
            hyperparamDict['actorHiddenLayersBiasInit'],
            actorOutputWeightInit=self.hyperparamDict['actorOutputWeightInit'],
            actorOutputBiasInit=self.hyperparamDict['actorOutputBiasInit'],
            criticHiddenLayersWidths=self.
            hyperparamDict['criticHiddenLayersWidths'],
            criticActivFunction=self.hyperparamDict['criticActivFunction'],
            criticHiddenLayersBiasInit=self.
            hyperparamDict['criticHiddenLayersBiasInit'],
            criticHiddenLayersWeightInit=self.
            hyperparamDict['criticHiddenLayersWeightInit'],
            criticOutputWeightInit=self.
            hyperparamDict['criticOutputWeightInit'],
            criticOutputBiasInit=self.hyperparamDict['criticOutputBiasInit'],
            max_size=self.hyperparamDict['bufferSize'],
            gamma=self.hyperparamDict['gamma'],
            batch_size=self.hyperparamDict['minibatchSize'],
            initnoisevar=self.hyperparamDict['noiseInitVariance'],
            noiseDecay=self.hyperparamDict['varianceDiscount'],
            noiseDacayStep=self.hyperparamDict['noiseDecayStartStep'],
            minVar=self.hyperparamDict['minVar'],
            path=self.hyperparamDict['modelSavePathPhil'])

        episodeRewardList = []
        meanEpsRewardList = []

        for episode in range(1, self.hyperparamDict['maxEpisode'] + 1):
            obs = env.reset()
            epsReward = 0

            for i in range(self.hyperparamDict['maxTimeStep']):
                done = False

                #env.render()
                act = agent.choose_action(obs, env.action_space.low,
                                          env.action_space.high)
                new_state, reward, done, info = env.step(act)
                agent.remember(obs, act, reward, new_state, int(done))
                agent.learn()
                epsReward += reward
                obs = new_state

                agent.runtime += 1

                if done:
                    break

            episodeRewardList.append(epsReward)
            meanEpsRewardList.append(np.mean(episodeRewardList))
            last100EpsMeanReward = np.mean(episodeRewardList[-100:])

            if episode % 1 == 0:
                print(
                    'episode: {}, last 100eps mean reward: {}, last eps reward: {} with {} steps'
                    .format(episode, last100EpsMeanReward, epsReward,
                            agent.runtime))

        agent.save_models()
        saveToPickle(meanEpsRewardList,
                     self.hyperparamDict['rewardSavePathPhil'])
        return meanEpsRewardList
def simuliateOneParameter(parameterDict, evalNum, randomSeed, dt):
    mujocoVisualize = False
    demoVisualize = False

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])

    wolvesID = [0]
    sheepsID = [1]
    blocksID = [2]

    numWolves = len(wolvesID)
    numSheeps = len(sheepsID)
    numBlocks = len(blocksID)

    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None

    agentsMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed
                                                       ] * numSheeps

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks

    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numBlocks
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheeps + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    makePropertyList = MakePropertyList(transferNumberListToStr)

    #changeCollisionReleventParameter
    dmin = parameterDict['dmin']
    dmax = parameterDict['dmax']
    width = parameterDict['width']
    midpoint = parameterDict['midpoint']
    power = parameterDict['power']
    timeconst = parameterDict['timeconst']
    dampratio = parameterDict['dampratio']

    geomIds = [1, 2, 3]
    keyNameList = ['@solimp', '@solref']
    valueList = [[[dmin, dmax, width, midpoint, power], [timeconst, dampratio]]
                 ] * len(geomIds)
    collisionParameter = makePropertyList(geomIds, keyNameList, valueList)

    #changeSize
    # geomIds=[1,2]
    # keyNameList=['@size']
    # valueList=[[[0.075,0.075]],[[0.1,0.1]]]
    # sizeParameter=makePropertyList(geomIds,keyNameList,valueList)

    #load env xml and change some geoms' property
    physicsDynamicsPath = os.path.join(
        dirName, '..', '..', 'environment', 'mujocoEnv',
        'variousCollision_numBlocks=1_numSheeps=1_numWolves=1dt={}.xml'.format(
            dt))

    with open(physicsDynamicsPath) as f:
        xml_string = f.read()
    envXmlDict = xmltodict.parse(xml_string.strip())

    envXmlPropertyDictList = [collisionParameter]
    changeEnvXmlPropertFuntionyList = [changeGeomProperty]
    for propertyDict, changeXmlProperty in zip(
            envXmlPropertyDictList, changeEnvXmlPropertFuntionyList):
        envXmlDict = changeXmlProperty(envXmlDict, propertyDict)

    envXml = xmltodict.unparse(envXmlDict)
    # print(envXmlDict['mujoco']['worldbody']['body'][0])
    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)

    # MDP function
    qPosInit = [0, 0] * numAgents
    qVelInit = [0, 0] * numAgents

    qVelInitNoise = 0
    qPosInitNoise = 1

    # reset=ResetUniformWithoutXPos(physicsSimulation, qPosInit, qVelInit, numAgents, numBlocks,qPosInitNoise, qVelInitNoise)
    # fixReset=ResetFixWithoutXPos(physicsSimulation, qPosInit, qVelInit, numAgents,numBlocks)
    # blocksState=[[0,-0.8,0,0]]
    # reset=lambda :fixReset([-0.5,0.8,-0.5,0,0.5,0.8],[0,0,0,0,0,0],blocksState)

    isTerminal = lambda state: False
    numSimulationFrames = int(0.1 / dt)
    print(numSimulationFrames)
    # dt=0.01
    damping = 2.5
    reshapeAction = lambda action: action
    # transit = TransitionFunctionWithoutXPos(physicsSimulation,numAgents , numSimulationFrames,damping*dt/numSimulationFrames,agentsMaxSpeedList,mujocoVisualize,reshapeAction)

    maxRunningSteps = 10

    # sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, rewardFunc, reset)
    # observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
    # observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]
    # initObsForParams = observe(reset())
    # obsShape = [initObsForParams[obsID].shape for obsID in range(len(initObsForParams))]

    class ImpulsePolicy(object):
        """docstring for c"""
        def __init__(self, initAction):
            self.initAction = initAction

        def __call__(self, state, timeStep):
            action = [[0, 0], [0, 0]]
            if timeStep == 0:
                action = self.initAction
            return action

    worldDim = 2
    trajList = []
    startTime = time.time()
    for i in range(evalNum):

        np.random.seed(randomSeed + i)
        initSpeedDirection = np.random.uniform(-np.pi / 2, np.pi / 2, 1)[0]
        initSpeed = np.random.uniform(0, 1, 1)[0]
        initActionDirection = np.random.uniform(-np.pi / 2, np.pi / 2, 1)[0]
        initForce = np.random.uniform(0, 5, 1)[0]
        # print(initSpeedDirection,initSpeed,initActionDirection,initForce)

        fixReset = ResetFixWithoutXPos(physicsSimulation, qPosInit, qVelInit,
                                       numAgents, numBlocks)
        blocksState = [[0, 0, 0, 0]]  #posX posY velX velY
        reset = lambda: fixReset([-0.28, 0, 8, 8], [
            initSpeed * np.cos(initSpeedDirection), initSpeed * np.sin(
                initSpeedDirection), 0, 0
        ], blocksState)

        transit = TransitionFunctionWithoutXPos(
            physicsSimulation, numAgents, numSimulationFrames,
            damping * dt * numSimulationFrames, agentsMaxSpeedList,
            mujocoVisualize, isTerminal, reshapeAction)

        initAction = [[
            initForce * np.cos(initActionDirection),
            initForce * np.sin(initActionDirection)
        ], [0, 0]]
        impulsePolicy = ImpulsePolicy(initAction)

        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit,
                                            isTerminal, rewardFunc, reset)

        traj = sampleTrajectory(impulsePolicy)
        # print('traj',traj[0])
        trajList.append(traj)

    # saveTraj
    # print(trajList)
    saveTraj = True
    if saveTraj:
        # trajSavePath = os.path.join(dirName,'traj', 'evaluateCollision', 'CollisionMoveTransitDamplingCylinder.pickle')
        trajectorySaveExtension = '.pickle'
        fixedParameters = {
            'isMujoco': 1,
            'isCylinder': 1,
            'randomSeed': randomSeed,
            'evalNum': evalNum
        }
        trajectoriesSaveDirectory = trajSavePath = os.path.join(
            dirName, '..', 'trajectory', 'variousCollsiondt={}'.format(dt))
        generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory,
                                                 trajectorySaveExtension,
                                                 fixedParameters)
        trajectorySavePath = generateTrajectorySavePath(parameterDict)
        saveToPickle(trajList, trajectorySavePath)

    # visualize

    if demoVisualize:
        entitiesColorList = [wolfColor] * numWolves + [
            sheepColor
        ] * numSheeps + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents,
                        getPosFromAgentState)
        trajToRender = np.concatenate(trajList)
        render(trajToRender)
    endTime = time.time()
    print("Time taken {} seconds to generate {} trajectories".format(
        (endTime - startTime), evalNum))
def generateSingleCondition(condition):
    debug = 0
    if debug:


        damping=2.0
        frictionloss=0.0
        masterForce=1.0

        numWolves = 1
        numSheeps = 1
        numMasters = 1
        maxTimeStep = 25

        saveTraj=False
        saveImage=True
        visualizeMujoco=False
        visualizeTraj = True
        makeVideo=True
    else:

        # print(sys.argv)
        # condition = json.loads(sys.argv[1])
        damping = float(condition['damping'])
        frictionloss = float(condition['frictionloss'])
        masterForce = float(condition['masterForce'])

        numWolves = 1
        numSheeps = 1
        numMasters = 1
        maxTimeStep = 25

        saveTraj=False
        saveImage=True
        visualizeMujoco=False
        visualizeTraj = True
        makeVideo=False

    print("maddpg: , saveTraj: {}, visualize: {},damping; {},frictionloss: {}".format( str(saveTraj), str(visualizeMujoco),damping,frictionloss))


    numAgents = numWolves + numSheeps+numMasters
    numEntities = numAgents + numMasters
    wolvesID = [0]
    sheepsID = [1]
    masterID = [2]

    wolfSize = 0.075
    sheepSize = 0.05
    masterSize = 0.075
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [masterSize] * numMasters

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None


    entitiesMovableList = [True] * numAgents + [False] * numMasters
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision,punishForOutOfBound)


    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    rewardMaster= lambda state, action, nextState: [-reward  for reward in rewardWolf(state, action, nextState)]
    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))+list(rewardMaster(state, action, nextState))

    dirName = os.path.dirname(__file__)
    # physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','rope','leased.xml')

    makePropertyList=MakePropertyList(transferNumberListToStr)

    geomIds=[1,2,3]
    keyNameList=[0,1]
    valueList=[[damping,damping]]*len(geomIds)
    dampngParameter=makePropertyList(geomIds,keyNameList,valueList)

    changeJointDampingProperty=lambda envDict,geomPropertyDict:changeJointProperty(envDict,geomPropertyDict,'@damping')

    geomIds=[1,2,3]
    keyNameList=[0,1]
    valueList=[[frictionloss,frictionloss]]*len(geomIds)
    frictionlossParameter=makePropertyList(geomIds,keyNameList,valueList)
    changeJointFrictionlossProperty=lambda envDict,geomPropertyDict:changeJointProperty(envDict,geomPropertyDict,'@frictionloss')


    physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','rope','leasedNew.xml')
    # physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','rope','leased.xml')


    with open(physicsDynamicsPath) as f:
        xml_string = f.read()
    envXmlDict = xmltodict.parse(xml_string.strip())




    envXmlDict = xmltodict.parse(xml_string.strip())
    envXmlPropertyDictList=[dampngParameter,frictionlossParameter]
    changeEnvXmlPropertFuntionyList=[changeJointDampingProperty,changeJointFrictionlossProperty]
    for propertyDict,changeXmlProperty in zip(envXmlPropertyDictList,changeEnvXmlPropertFuntionyList):
        envXmlDict=changeXmlProperty(envXmlDict,propertyDict)





    envXml=xmltodict.unparse(envXmlDict)

    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)
    # print(physicsSimulation.model.body_pos)

    # print(dir(physicsSimulation.model))
    # print(dir(physicsSimulation.data),physicsSimulation.dataphysicsSimulation.data)

    # print(physicsSimulation.data.qpos,dir(physicsSimulation.data.qpos))
    # print(physicsSimulation.data.qpos,dir(physicsSimulation.data.qpos))
    qPosInit = (0, ) * 24
    qVelInit = (0, ) * 24
    qPosInitNoise = 0.4
    qVelInitNoise = 0
    numAgent = 2
    tiedAgentId = [0, 2]
    ropePartIndex = list(range(3, 12))
    maxRopePartLength = 0.06
    reset = ResetUniformWithoutXPosForLeashed(physicsSimulation, qPosInit, qVelInit, numAgent, tiedAgentId,ropePartIndex, maxRopePartLength, qPosInitNoise, qVelInitNoise)
    numSimulationFrames=10
    isTerminal= lambda state: False
    reshapeActionList = [ReshapeAction(5),ReshapeAction(5),ReshapeAction(masterForce)]
    transit=TransitionFunctionWithoutXPos(physicsSimulation, numSimulationFrames, visualizeMujoco,isTerminal, reshapeActionList)

    # damping=2.5
    # numSimulationFrames =int(0.1/dt)
    # agentsMaxSpeedList = [wolfMaxSpeed]* numWolves + [sheepMaxSpeed] * numSheeps
    # reshapeAction = ReshapeAction()
    # isTerminal = lambda state: False
    # transit = TransitionFunctionWithoutXPos(physicsSimulation,numAgents , numSimulationFrames,damping*dt*numSimulationFrames,agentsMaxSpeedList,visualizeMujoco,isTerminal,reshapeAction)


    maxRunningStepsToSample = 100
    sampleTrajectory = SampleTrajectory(maxRunningStepsToSample, transit, isTerminal, rewardFunc, reset)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, masterID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    # ------------ model ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]


    # individStr = 'individ' if individualRewardWolf else 'shared'
    # fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
    #     numWolves, numSheeps, numMasters, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)


    # wolvesModelPaths = [os.path.join(dirName, '..', 'trainedModels', '3wolvesMaddpg', fileName + str(i) + '60000eps') for i in wolvesID]
    # [restoreVariables(model, path) for model, path in zip(wolvesModel, wolvesModelPaths)]
    #
    # actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    # policy = lambda allAgentsStates: [actOneStepOneModel(model, observe(allAgentsStates)) for model in modelsList]

    # modelPaths = [os.path.join(dirName, '..', 'trainedModels', '3wolvesMaddpg_ExpEpsLengthAndSheepSpeed', fileName + str(i)) for i in
    #               range(numAgents)]
    # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG')
    # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG','timeconst='+str(timeconst)+'dampratio='+str(dampratio))

    modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPGLeasedFixedEnv2','damping={}_frictionloss={}_masterForce={}'.format(damping,frictionloss,masterForce))
    fileName = "maddpg{}episodes{}step_agent".format(maxEpisode, maxTimeStep)

    modelPaths = [os.path.join(modelFolder,  fileName + str(i) + '60000eps') for i in range(numAgents)]

    [restoreVariables(model, path) for model, path in zip(modelsList, modelPaths)]

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    policy = lambda allAgentsStates: [actOneStepOneModel(model, observe(allAgentsStates)) for model in modelsList]


    trajList = []
    numTrajToSample = 5
    for i in range(numTrajToSample):
        np.random.seed(i)
        traj = sampleTrajectory(policy)
        trajList.append(list(traj))

    # saveTraj
    if saveTraj:
        trajFileName = "maddpg{}wolves{}sheep{}blocks{}eps{}stepSheepSpeed{}{}Traj".format(numWolves, numSheeps, numMasters, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)
        trajSavePath = os.path.join(dirName, '..', 'trajectory', trajFileName)
        saveToPickle(trajList, trajSavePath)


    # visualize
    if visualizeTraj:
        demoFolder = os.path.join(dirName, '..', 'demos', 'mujocoMADDPGLeasedFixedEnv2','damping={}_frictionloss={}_masterForce={}'.format(damping,frictionloss,masterForce))

        if not os.path.exists(demoFolder):
            os.makedirs(demoFolder)
        entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheeps + [masterColor] * numMasters
        render = Render(entitiesSizeList, entitiesColorList, numAgents,demoFolder,saveImage, getPosFromAgentState)
        # print(trajList[0][0],'!!!3',trajList[0][1])
        trajToRender = np.concatenate(trajList)
        render(trajToRender)
def simuliateOneParameter(parameterDict):


    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2


    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])

    wolvesID = [0,1]
    sheepsID = [2]
    blocksID = [3]

    numWolves = len(wolvesID)
    numSheeps = len(sheepsID)
    numBlocks = len(blocksID)

    sheepMaxSpeed = 1.3
    wolfMaxSpeed =1.0
    blockMaxSpeed = None



    agentsMaxSpeedList = [wolfMaxSpeed]* numWolves + [sheepMaxSpeed] * numSheeps

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks

    entitiesSizeList = [wolfSize]* numWolves + [sheepSize] * numSheeps + [blockSize]* numBlocks
    entityMaxSpeedList = [wolfMaxSpeed]* numWolves + [sheepMaxSpeed] * numSheeps + [blockMaxSpeed]* numBlocks
    entitiesMovableList = [True]* numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision, punishForOutOfBound)

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    makePropertyList=MakePropertyList(transferNumberListToStr)

    #changeCollisionReleventParameter
    dmin=parameterDict['dmin']
    dmax=parameterDict['dmax']
    width=parameterDict['width']
    midpoint=parameterDict['midpoint']
    power=parameterDict['power']
    timeconst=parameterDict['timeconst']
    dampratio=parameterDict['dampratio']

    geomIds=[1,2]
    keyNameList=['@solimp','@solref']
    valueList=[[[dmin,dmax,width,midpoint,power],[timeconst,dampratio]]]*len(geomIds)
    collisionParameter=makePropertyList(geomIds,keyNameList,valueList)

#changeSize
    # geomIds=[1,2]
    # keyNameList=['@size']
    # valueList=[[[0.075,0.075]],[[0.1,0.1]]]
    # sizeParameter=makePropertyList(geomIds,keyNameList,valueList)

#load env xml and change some geoms' property
    # physicsDynamicsPath=os.path.join(dirName,'multiAgentcollision.xml')
    physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','multiAgentcollision.xml')
    with open(physicsDynamicsPath) as f:
        xml_string = f.read()
    envXmlDict = xmltodict.parse(xml_string.strip())
    print(envXmlDict)
    envXmlPropertyDictList=[collisionParameter]
    changeEnvXmlPropertFuntionyList=[changeGeomProperty]
    for propertyDict,changeXmlProperty in zip(envXmlPropertyDictList,changeEnvXmlPropertFuntionyList):
        envXmlDict=changeXmlProperty(envXmlDict,propertyDict)

    envXml=xmltodict.unparse(envXmlDict)

    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)


    # MDP function
    qPosInit = [0, 0]*numAgents
    qVelInit = [0, 0]*numAgents

    qVelInitNoise = 0
    qPosInitNoise = 1

    # reset=ResetUniformWithoutXPos(physicsSimulation, qPosInit, qVelInit, numAgents, numBlocks,qPosInitNoise, qVelInitNoise)

    fixReset=ResetFixWithoutXPos(physicsSimulation, qPosInit, qVelInit, numAgents,numBlocks)
    blocksState=[[0,0,0,0]]
    reset=lambda :fixReset([-0.5,0.8,-0.5,0,0.5,0.8],[0,0,0,0,0,0],blocksState)


    isTerminal = lambda state: False
    numSimulationFrames = 1
    visualize=False
    # physicsViewer=None
    dt=0.1
    damping=2.5
    transit = TransitionFunctionWithoutXPos(physicsSimulation,numAgents , numSimulationFrames,damping*dt/numSimulationFrames,agentsMaxSpeedList,visualize,isTerminal)


    maxRunningSteps = 100
    sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, rewardFunc, reset)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]
    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape for obsID in range(len(initObsForParams))]

    worldDim = 2
    evalNum=1

    trajectorySaveExtension = '.pickle'
    fixedParameters = {'isMujoco': 1,'isCylinder':1,'evalNum':evalNum}
    trajectoriesSaveDirectory=trajSavePath = os.path.join(dirName,'..','trajectory')
    generateTrajectorySavePath = GetSavePath(trajectoriesSaveDirectory, trajectorySaveExtension, fixedParameters)
    trajectorySavePath = generateTrajectorySavePath(parameterDict)


    if not os.path.isfile(trajectorySavePath):
        trajList =list()

        for i in range(evalNum):

            # policy =lambda state: [[-3,0] for agent in range(numAgents)]
            np.random.seed(i)
            # policy =lambda state: [np.random.uniform(-5,5,2) for agent in range(numAgents)]sss
            # policy =lambda state: [[0,1] for agent in range(numAgents)]
            policy =lambda state: [[1,0] ]
            # policy =lambda state: [[np.random.uniform(0,1,1),0] ,[np.random.uniform(-1,0,1),0] ]
            traj = sampleTrajectory(policy)
            # print(i,'traj',[state[1] for state in traj[:2]])
            # print(traj)
            trajList.append( traj)

        # saveTraj
        saveTraj = True
        if saveTraj:
            # trajSavePath = os.path.join(dirName,'traj', 'evaluateCollision', 'CollisionMoveTransitDamplingCylinder.pickle')

            saveToPickle(trajList, trajectorySavePath)

        # visualize

        # physicsViewer.render()
        visualize = True
        if visualize:
            entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheeps + [blockColor] * numBlocks
            render = Render(entitiesSizeList, entitiesColorList, numAgents, getPosFromAgentState)
            render(trajList)
Exemple #6
0
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                            isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum')
    getTrainedModel = lambda: trainModels.actorModel
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                          modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # plots& plot
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
Exemple #7
0
def main():
    statedim = env.observation_space.shape[0]
    actiondim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow)/2
    
    replaybuffer = deque(maxlen=buffersize)
    paramUpdateFrequency = 1
    totalrewards = []
    meanreward = []
    totalreward = []
    
    buildActorModel = BuildActorModel(statedim, actiondim, actionBound)
    actorWriter, actorModel = buildActorModel(actornumberlayers)
    
    buildCriticModel = BuildCriticModel(statedim, actiondim)
    criticWriter, criticModel = buildCriticModel(criticnumberlayers)

    trainCritic = TrainCritic(criticlearningRate, gamma, criticWriter)
    trainActor = TrainActor(actorlearningRate, actorWriter)
    updateParameters = UpdateParameters(tau,paramUpdateFrequency)

    actorModel= ReplaceParameters(actorModel)
    criticModel= ReplaceParameters(criticModel)
    
    trainddpgModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)
    
    getnoise = GetNoise(noiseDecay,minVar,noiseDacayStep)
    getnoiseaction = GetNoiseAction(actorModel,actionLow, actionHigh)
    learn = Learn(buffersize,batchsize,trainddpgModels,actiondim)
    runtime = 0
    trajectory = []
    noisevar = initnoisevar
    for episode in range(EPISODE):
        state  = env.reset()
        rewards = 0
        for i in range(maxTimeStep):
            env.render()
            noise,noisevar = getnoise(runtime,noisevar)
            noiseaction = getnoiseaction(state,noise)
            nextstate,reward,done,info = env.step(noiseaction)
            learn(replaybuffer,state, noiseaction, nextstate,reward)
            trajectory.append((state, noiseaction, nextstate,reward))
            rewards += reward
            state = nextstate
            runtime += 1
            print(actionHigh,actionLow)
        if i == maxTimeStep-1:
            totalrewards.append(rewards)
            totalreward.append(rewards)
            print('episode: ',episode,'reward:',rewards, 'noisevar',noisevar)
            
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ',episode,'meanreward:',np.mean(totalreward))
            totalreward = []
    plt.plot(range(EPISODE),totalrewards)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.show()
# save Model
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {'env': Env_name, 'Eps': EPISODE,  'batchsize': batchsize,'buffersize': buffersize,'maxTimeStep':maxTimeStep,
                  'gamma': gamma, 'actorlearningRate': actorlearningRate, 'criticlearningRate': criticlearningRate,
                  'tau': tau, 'noiseDecay': noiseDecay, 'minVar': minVar, 'initnoisevar': initnoisevar}

    modelSaveDirectory = "/path/to/logs/trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathDQN = getSavePath(parameters)

    with actorModel.as_default():
        saveVariables(actorModel, savePathDQN)
    with criticModel.as_default():
        saveVariables(criticModel, savePathDQN)
        
    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName,'trajectory', 'HopperTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)
Exemple #8
0
def main():
    debug = 1
    if debug:
        numWolves = 1
        numSheeps = 1
        numBlocks = 1
        saveTraj = False
        visualizeTraj = True
        maxTimeStep = 25
        sheepSpeedMultiplier = 1
        individualRewardWolf = 0

        timeconst=0.5
        dampratio=0.2

    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheeps = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])
        saveTraj = True
        visualizeTraj = False
        maxTimeStep = 50
        sheepSpeedMultiplier = 1.25
        individualRewardWolf = 1

    print("maddpg: {} wolves, {} sheep, {} blocks, saveTraj: {}, visualize: {}".format(numWolves, numSheeps, numBlocks, str(saveTraj), str(visualizeTraj)))


    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [blockSize] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed] * numSheeps + [blockMaxSpeed] * numBlocks

    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList, getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList, isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)

    rewardFunc = lambda state, action, nextState: list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID, blocksID, getPosFromAgentState, getVelFromAgentState)
    observe = lambda state: [observeOneAgent(agentID)(state) for agentID in range(numAgents)]

    reshapeAction = ReshapeAction()
    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID, entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList, entitiesSizeList,
                                          getCollisionForce, getPosFromAgentState)
    integrateState = IntegrateState(numEntities, entitiesMovableList, massList,
                                    entityMaxSpeedList, getVelFromAgentState, getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction, applyActionForce, applyEnvironForce, integrateState)

    isTerminal = lambda state: False
    maxRunningStepsToSample = 100
    sampleTrajectory = SampleTrajectory(maxRunningStepsToSample, transit, isTerminal, rewardFunc, reset)

    initObsForParams = observe(reset())
    obsShape = [initObsForParams[obsID].shape[0] for obsID in range(len(initObsForParams))]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    # ------------ model ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)]

    dirName = os.path.dirname(__file__)
    # individStr = 'individ' if individualRewardWolf else 'shared'
    # fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
    #     numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)
    fileName = "maddpg{}wolves{}sheep{}blocks60000episodes25stepSheepSpeed1shared_agent".format(numWolves, numSheeps, numBlocks)
    # wolvesModelPaths = [os.path.join(dirName, '..', 'trainedModels', '3wolvesMaddpg', fileName + str(i) + '60000eps') for i in wolvesID]
    # [restoreVariables(model, path) for model, path in zip(wolvesModel, wolvesModelPaths)]
    #
    # actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    # policy = lambda allAgentsStates: [actOneStepOneModel(model, observe(allAgentsStates)) for model in modelsList]

    # modelPaths = [os.path.join(dirName, '..', 'trainedModels', '3wolvesMaddpg_ExpEpsLengthAndSheepSpeed', fileName + str(i)) for i in
    #               range(numAgents)]
    modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG','timeconst='+str(timeconst)+'dampratio='+str(dampratio))
    modelPaths = [os.path.join(modelFolder, fileName + str(i) + '60000eps') for i in range(numAgents)]

    [restoreVariables(model, path) for model, path in zip(modelsList, modelPaths)]

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    policy = lambda allAgentsStates: [actOneStepOneModel(model, observe(allAgentsStates)) for model in modelsList]


    trajList = []
    numTrajToSample = 50
    for i in range(numTrajToSample):
        traj = sampleTrajectory(policy)
        trajList.append(list(traj))

    # saveTraj
    if saveTraj:
        trajFileName = "maddpg{}wolves{}sheep{}blocks{}eps{}stepSheepSpeed{}{}Traj".format(numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)
        trajSavePath = os.path.join(dirName, '..', 'trajectory', trajFileName)
        saveToPickle(trajList, trajSavePath)


    # visualize
    if visualizeTraj:
        entitiesColorList = [wolfColor] * numWolves + [sheepColor] * numSheeps + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents, getPosFromAgentState)
        trajToRender = np.concatenate(trajList)
        render(trajToRender)
 def __call__(self,env):
     env = env_norm(env) if self.fixedParameters['normalizeEnv'] else env
     actionHigh = env.action_space.high
     actionLow = env.action_space.low
     actionBound = (actionHigh - actionLow) / 2
     stateDim = env.observation_space.shape[0]
     actionDim = env.action_space.shape[0]
     meanreward = []
     trajectory = []
     totalreward = []
     totalrewards = []
     episodereward = []
     replaybuffer = deque(maxlen=int(self.fixedParameters['bufferSize']))
     buildActorModel = BuildActorModel(stateDim, actionDim,actionBound ,
                                       self.fixedParameters['actorHiddenLayersWeightInit'],self.fixedParameters['actorHiddenLayersBiasInit'],
                                       self.fixedParameters['actorOutputWeightInit'], self.fixedParameters['actorOutputBiasInit'],self.fixedParameters['actorActivFunction'],self.fixedParameters['gradNormClipValue'],self.fixedParameters['normalizeEnv'])
     actorModel = buildActorModel(self.fixedParameters['actorHiddenLayersWidths'])
     
     buildCriticModel = BuildCriticModel(stateDim, actionDim,
                                         self.fixedParameters['criticHiddenLayersWeightInit'],self.fixedParameters['criticHiddenLayersBiasInit'],
                                         self.fixedParameters['criticOutputWeightInit'], self.fixedParameters['criticOutputBiasInit'],
                                         self.fixedParameters['criticActivFunction'],self.fixedParameters['gradNormClipValue'],self.fixedParameters['normalizeEnv'])
     criticModel = buildCriticModel(self.fixedParameters['criticHiddenLayersWidths'])
     
     trainCritic = TrainCritic(self.fixedParameters['criticLR'], self.fixedParameters['gamma'])
     trainActor = TrainActor(self.fixedParameters['actorLR'])
     updateParameters = UpdateParameters(self.fixedParameters['tau'])
     trainddpgModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel,criticModel)
 
     getnoise = GetNoise(self.fixedParameters['varianceDiscount'],self.fixedParameters['minVar'],self.fixedParameters['noiseDecayStartStep'],self.fixedParameters['noiseInitVariance'])
     getnoiseaction = GetNoiseAction(actorModel,actionLow, actionHigh)
     learn = Learn(self.fixedParameters['bufferSize'],self.fixedParameters['minibatchSize'],trainddpgModels)
     actorModel= ReplaceParameters(actorModel)
     criticModel= ReplaceParameters(criticModel)
     state  = env.reset()
     replaybuffer = fillbuffer(3000,self.bufferfill,env,replaybuffer,state)
     
     for episode in range(1,self.fixedParameters['maxEpisode']+1):
         state  = env.reset()
         rewards = 0
         for j in range(self.fixedParameters['maxTimeStep']):
             env.render()
             noise = getnoise(self.runstep)
             noiseaction = getnoiseaction(state,noise)
             nextstate,reward,done,info = env.step(noiseaction)
             learn(replaybuffer,state, noiseaction, nextstate,reward)
             trajectory.append((state, noiseaction, nextstate,reward))
             rewards += reward
             state = nextstate
             self.runstep += 1
             if j == self.fixedParameters['maxTimeStep']-1:
                 totalrewards.append(rewards)
                 totalreward.append(rewards)
                 print('episode: ',episode,'reward:',rewards,'runstep',self.runstep)
         episodereward.append(np.mean(totalrewards))
         print('epireward',np.mean(totalrewards))
         if episode % 100 == 0:
             meanreward.append(np.mean(totalreward))
             print('episode: ',episode,'meanreward:',np.mean(totalreward))
             totalreward = []
     with actorModel.as_default():
         saveVariables(actorModel, self.fixedParameters['modelSavePathMartin'])
     with criticModel.as_default():
         saveVariables(criticModel, self.fixedParameters['modelSavePathMartin'])
     saveToPickle(meanreward, self.fixedParameters['rewardSavePathMartin'])
     
     return episodereward
def main():
    # wolvesID = [0, 1]
    # sheepsID = [2]
    # blocksID = [3]
    visualize = False
    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2

    sheepMaxSpeed = 1.3
    wolfMaxSpeed = 1.0
    blockMaxSpeed = None

    wolfColor = np.array([0.85, 0.35, 0.35])
    sheepColor = np.array([0.35, 0.85, 0.35])
    blockColor = np.array([0.25, 0.25, 0.25])

    wolvesID = [0]
    sheepsID = [1]
    blocksID = [2]

    numWolves = len(wolvesID)
    numSheeps = len(sheepsID)
    numBlocks = len(blocksID)

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks

    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numBlocks
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheeps + [blockMaxSpeed] * numBlocks
    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList, isCollision)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    rewardFunc = lambda state, action, nextState: \
        list(rewardWolf(state, action, nextState)) + list(rewardSheep(state, action, nextState))

    reset = ResetMultiAgentChasing(numAgents, numBlocks)

    # reset=lambda :np.array([[-0.5,0,0,0],[0.5,0,0,0]])
    reset = lambda: np.array([[-0.5, 0, 0, 0], [8, 8, 0, 0], [0, -8, 0, 0]])
    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    # reshapeAction = ReshapeAction()
    reshapeAction = lambda action: action

    getCollisionForce = GetCollisionForce()
    applyActionForce = ApplyActionForce(wolvesID, sheepsID,
                                        entitiesMovableList)
    applyEnvironForce = ApplyEnvironForce(numEntities, entitiesMovableList,
                                          entitiesSizeList, getCollisionForce,
                                          getPosFromAgentState)
    integrateState = FixedIntegrateState(numEntities, entitiesMovableList,
                                         massList, entityMaxSpeedList,
                                         getVelFromAgentState,
                                         getPosFromAgentState)
    transit = TransitMultiAgentChasing(numEntities, reshapeAction,
                                       applyActionForce, applyEnvironForce,
                                       integrateState)

    isTerminal = lambda state: False
    maxRunningSteps = 100
    sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal,
                                        rewardFunc, reset)

    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape for obsID in range(len(initObsForParams))
    ]

    worldDim = 2

    trajList = []
    evalNum = 1
    for i in range(evalNum):

        # policy =lambda state: [[-3,0] for agent in range(numAgents)]
        np.random.seed(i)
        # policy =lambda state: [np.random.uniform(-5,5,2) for agent in range(numAgents)]
        # policy =lambda state: [[1,1] for agent in range(numAgents)]
        # policy =lambda state: [[float(np.random.uniform(0,1,1)),0] ,[float(np.random.uniform(-1,0,1)),0] ]
        policy = lambda state: [[1, 0], [0, 0]]
        traj = sampleTrajectory(policy)
        # print(i,'traj',[state[1] for state in traj)
        # print('traj',[tr[0][0][0] for tr in traj])
        print('traj', traj[0])
        trajList.append(traj)

    # saveTraj
    saveTraj = True
    # print(trajList[0][0][0])
    if saveTraj:
        trajSavePath = os.path.join(dirName, '..', 'trajectory',
                                    'LineMoveFixedBaseLine.pickle')
        saveToPickle(trajList, trajSavePath)
    # print('traj',trajList[25])
    # visualize

    if visualize:
        entitiesColorList = [wolfColor] * numWolves + [
            sheepColor
        ] * numSheeps + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents,
                        getPosFromAgentState)
        trajToRender = np.concatenate(trajList)
        render(trajToRender)
Exemple #11
0
def main():
    debug = 1
    if debug:
        numWolves = 3
        numSheeps = 1
        numBlocks = 2
        timeconst = 0.5
        dampratio = 0.2
        saveTraj = False
        visualizeTraj = True
        maxTimeStep = 25
        sheepSpeedMultiplier = 1.0
        individualRewardWolf = 0
        hasWalls = 1.5
        dt = 0.05

        visualizeMujoco = True

    else:
        print(sys.argv)
        condition = json.loads(sys.argv[1])
        numWolves = int(condition['numWolves'])
        numSheeps = int(condition['numSheeps'])
        numBlocks = int(condition['numBlocks'])
        saveTraj = True
        visualizeTraj = False
        maxTimeStep = 50
        sheepSpeedMultiplier = 1.25
        individualRewardWolf = 1

    print(
        "maddpg: {} wolves, {} sheep, {} blocks, saveTraj: {}, visualize: {}".
        format(numWolves, numSheeps, numBlocks, str(saveTraj),
               str(visualizeTraj)))

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheeps + [blockMaxSpeed] * numBlocks

    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList,
                                          isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList,
                                isCollision)

    rewardFunc = lambda state, action, nextState: list(
        rewardWolf(state, action, nextState)) + list(
            rewardSheep(state, action, nextState))
    dirName = os.path.dirname(__file__)
    # physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','numBlocks=1_numSheeps=1_numWolves=1.xml')
    if hasWalls:
        # physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','hasWalls=1_numBlocks={}_numSheeps={}_numWolves={}timeconst={}dampratio={}.xml'.format(numBlocks,numSheeps,numWolves,timeconst,dampratio))
        physicsDynamicsPath = os.path.join(
            dirName, '..', '..', 'environment', 'mujocoEnv',
            'dt={}'.format(dt),
            'hasWalls={}_numBlocks={}_numSheeps={}_numWolves={}timeconst={}dampratio={}.xml'
            .format(hasWalls, numBlocks, numSheeps, numWolves, timeconst,
                    dampratio))

        # physicsDynamicsPath=os.path.join(dirName,'..','..','environment','mujocoEnv','crossTest3w1s2b.xml')

    else:
        physicsDynamicsPath = os.path.join(
            dirName, '..', '..', 'environment', 'mujocoEnv',
            'numBlocks={}_numSheeps={}_numWolves={}timeconst={}dampratio={}.xml'
            .format(numBlocks, numSheeps, numWolves, timeconst, dampratio))

    with open(physicsDynamicsPath) as f:
        xml_string = f.read()
    envXmlDict = xmltodict.parse(xml_string.strip())
    envXml = xmltodict.unparse(envXmlDict)

    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)
    print(physicsSimulation.model.body_pos)

    # print(dir(physicsSimulation.model))
    # print(dir(physicsSimulation.data),physicsSimulation.dataphysicsSimulation.data)

    # print(physicsSimulation.data.qpos,dir(physicsSimulation.data.qpos))
    # print(physicsSimulation.data.qpos,dir(physicsSimulation.data.qpos))
    qPosInit = [0, 0] * numAgents
    qVelInit = [0, 0] * numAgents
    qVelInitNoise = 0 * hasWalls
    qPosInitNoise = 0.8 * hasWalls
    getBlockRandomPos = lambda: np.random.uniform(-0.7 * hasWalls, +0.7 *
                                                  hasWalls, 2)
    getBlockSpeed = lambda: np.zeros(2)

    numQPos = len(physicsSimulation.data.qpos)
    numQVel = len(physicsSimulation.data.qvel)

    sampleAgentsQPos = lambda: np.asarray(qPosInit) + np.random.uniform(
        low=-qPosInitNoise, high=qPosInitNoise, size=numQPos)
    sampleAgentsQVel = lambda: np.asarray(qVelInit) + np.random.uniform(
        low=-qVelInitNoise, high=qVelInitNoise, size=numQVel)

    minDistance = 0.2 + 2 * blockSize
    isOverlap = IsOverlap(minDistance)

    sampleBlockState = SampleBlockState(numBlocks, getBlockRandomPos,
                                        getBlockSpeed, isOverlap)

    reset = ResetUniformWithoutXPos(physicsSimulation, numAgents, numBlocks,
                                    sampleAgentsQPos, sampleAgentsQVel,
                                    sampleBlockState)

    damping = 2.5
    numSimulationFrames = int(0.1 / dt)
    agentsMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed
                                                       ] * numSheeps
    reshapeAction = ReshapeAction()
    isTerminal = lambda state: False
    transit = TransitionFunctionWithoutXPos(physicsSimulation, numAgents,
                                            numSimulationFrames,
                                            damping * dt * numSimulationFrames,
                                            agentsMaxSpeedList,
                                            visualizeMujoco, isTerminal,
                                            reshapeAction)

    maxRunningStepsToSample = 100
    sampleTrajectory = SampleTrajectory(maxRunningStepsToSample, transit,
                                        isTerminal, rewardFunc, reset)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape[0]
        for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    # ------------ model ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [
        buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)
    ]

    # individStr = 'individ' if individualRewardWolf else 'shared'
    # fileName = "maddpg{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
    #     numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep, sheepSpeedMultiplier, individStr)

    # wolvesModelPaths = [os.path.join(dirName, '..', 'trainedModels', '3wolvesMaddpg', fileName + str(i) + '60000eps') for i in wolvesID]
    # [restoreVariables(model, path) for model, path in zip(wolvesModel, wolvesModelPaths)]
    #
    # actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    # policy = lambda allAgentsStates: [actOneStepOneModel(model, observe(allAgentsStates)) for model in modelsList]

    # modelPaths = [os.path.join(dirName, '..', 'trainedModels', '3wolvesMaddpg_ExpEpsLengthAndSheepSpeed', fileName + str(i)) for i in
    #               range(numAgents)]
    # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG')
    # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG','timeconst='+str(timeconst)+'dampratio='+str(dampratio))
    if hasWalls:
        # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG','hasWalls=1'+'numBlocks='+str(numBlocks)+'numSheeps='+str(numSheeps)+'numWolves='+str(numWolves)+'timeconst='+str(timeconst)+'dampratio='+str(dampratio))
        # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG','dt={}'.format(dt),'hasWalls='+str(hasWalls)+'numBlocks='+str(numBlocks)+'numSheeps='+str(numSheeps)+'numWolves='+str(numWolves)+'timeconst='+str(timeconst)+'dampratio='+str(dampratio)+'individualRewardWolf='+str(individualRewardWolf)+'sheepSpeedMultiplier='+str(sheepSpeedMultiplier))
        # modelFolder = os.path.join(dirName, '..', 'trainedModels', 'mujocoMADDPG','dt={}'.format(dt),'hasWalls='+str(hasWalls)+'numBlocks='+str(numBlocks)+'numSheeps='+str(numSheeps)+'numWolves='+str(numWolves)+'timeconst='+str(timeconst)+'dampratio='+str(dampratio)+'individualRewardWolf='+str(individualRewardWolf)+'sheepSpeedMultiplier='+str(sheepSpeedMultiplier))

        modelFolder = os.path.join(
            dirName, '..', 'trainedModels', 'mujocoMADDPGeavluateWall',
            'dt={}'.format(dt),
            'hasWalls=' + str(hasWalls) + 'numBlocks=' + str(numBlocks) +
            'numSheeps=' + str(numSheeps) + 'numWolves=' + str(numWolves) +
            'timeconst=' + str(timeconst) + 'dampratio=' + str(dampratio) +
            'individualRewardWolf=' + str(individualRewardWolf) +
            'sheepSpeedMultiplier=' + str(sheepSpeedMultiplier))
        individStr = 'individ' if individualRewardWolf else 'shared'
        fileName = "maddpghasWalls={}{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
            hasWalls, numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
            sheepSpeedMultiplier, individStr)

    else:
        modelFolder = os.path.join(
            dirName, '..', 'trainedModels', 'mujocoMADDPG',
            'numBlocks=' + str(numBlocks) + 'numSheeps=' + str(numSheeps) +
            'numWolves=' + str(numWolves) + 'timeconst=' + str(timeconst) +
            'dampratio=' + str(dampratio))
        fileName = "maddpg{}wolves{}sheep{}blocks60000episodes25stepSheepSpeed1.0shared_agent".format(
            numWolves, numSheeps, numBlocks)
    modelPaths = [
        os.path.join(modelFolder, fileName + str(i) + '60000eps')
        for i in range(numAgents)
    ]

    [
        restoreVariables(model, path)
        for model, path in zip(modelsList, modelPaths)
    ]

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    policy = lambda allAgentsStates: [
        actOneStepOneModel(model, observe(allAgentsStates))
        for model in modelsList
    ]

    trajList = []
    numTrajToSample = 50
    for i in range(numTrajToSample):
        np.random.seed(i)
        traj = sampleTrajectory(policy)
        trajList.append(list(traj))

    # saveTraj
    if saveTraj:
        trajFileName = "maddpg{}wolves{}sheep{}blocks{}eps{}stepSheepSpeed{}{}Traj".format(
            numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
            sheepSpeedMultiplier, individStr)
        trajSavePath = os.path.join(dirName, '..', 'trajectory', trajFileName)
        saveToPickle(trajList, trajSavePath)

    # visualize
    if visualizeTraj:
        entitiesColorList = [wolfColor] * numWolves + [
            sheepColor
        ] * numSheeps + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents,
                        getPosFromAgentState)
        print(trajList[0][0], '!!!3', trajList[0][1])
        trajToRender = np.concatenate(trajList)
        render(trajToRender)
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 1  # control exploration
    varianceDiscount = .99995
    noiseDecayStartStep = bufferSize
    minVar = .1
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep, minVar)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymMountCarContinuous()
    isTerminal = IsTerminalMountCarContin()
    getReward = RewardMountCarContin(isTerminal)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    resetLow = -1
    resetHigh = 0.4
    reset = ResetMountCarContin(seed=None)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save Model
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': maxEpisode,
        'timeStep': maxTimeStep,
        'batch': minibatchSize,
        'gam': gamma,
        'lrActor': learningRateActor,
        'lrCritic': learningRateCritic,
        'noiseVar': noiseInitVariance,
        'varDiscout': varianceDiscount,
        'resetLow': resetLow,
        'High': resetHigh
    }

    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'mountCarTrajectoryOriginalReset1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # plots& plot
    showDemo = False
    if showDemo:
        visualize = VisualizeMountCarContin()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
Exemple #13
0
def simuliateOneParameter(parameterOneCondiiton, evalNum, randomSeed):
    saveTraj = True
    visualizeTraj = False
    visualizeMujoco = False

    numWolves = parameterOneCondiiton['numWolves']
    numSheeps = parameterOneCondiiton['numSheeps']
    numBlocks = parameterOneCondiiton['numBlocks']
    timeconst = parameterOneCondiiton['timeconst']
    dampratio = parameterOneCondiiton['dampratio']
    maxTimeStep = parameterOneCondiiton['maxTimeStep']
    sheepSpeedMultiplier = parameterOneCondiiton['sheepSpeedMultiplier']
    individualRewardWolf = parameterOneCondiiton['individualRewardWolf']
    hasWalls = parameterOneCondiiton['hasWalls']
    dt = parameterOneCondiiton['dt']

    print(
        "maddpg: {} wolves, {} sheep, {} blocks, saveTraj: {}, visualize: {}".
        format(numWolves, numSheeps, numBlocks, str(saveTraj),
               str(visualizeTraj)))

    numAgents = numWolves + numSheeps
    numEntities = numAgents + numBlocks
    wolvesID = list(range(numWolves))
    sheepsID = list(range(numWolves, numAgents))
    blocksID = list(range(numAgents, numEntities))

    wolfSize = 0.075
    sheepSize = 0.05
    blockSize = 0.2
    entitiesSizeList = [wolfSize] * numWolves + [sheepSize] * numSheeps + [
        blockSize
    ] * numBlocks

    wolfMaxSpeed = 1.0
    blockMaxSpeed = None
    sheepMaxSpeedOriginal = 1.3
    sheepMaxSpeed = sheepMaxSpeedOriginal * sheepSpeedMultiplier
    entityMaxSpeedList = [wolfMaxSpeed] * numWolves + [
        sheepMaxSpeed
    ] * numSheeps + [blockMaxSpeed] * numBlocks

    entitiesMovableList = [True] * numAgents + [False] * numBlocks
    massList = [1.0] * numEntities

    isCollision = IsCollision(getPosFromAgentState)
    punishForOutOfBound = PunishForOutOfBound()
    rewardSheep = RewardSheep(wolvesID, sheepsID, entitiesSizeList,
                              getPosFromAgentState, isCollision,
                              punishForOutOfBound)

    if individualRewardWolf:
        rewardWolf = RewardWolfIndividual(wolvesID, sheepsID, entitiesSizeList,
                                          isCollision)
    else:
        rewardWolf = RewardWolf(wolvesID, sheepsID, entitiesSizeList,
                                isCollision)

    rewardFunc = lambda state, action, nextState: list(
        rewardWolf(state, action, nextState)) + list(
            rewardSheep(state, action, nextState))
    dirName = os.path.dirname(__file__)

    if hasWalls:

        physicsDynamicsPath = os.path.join(
            dirName, '..', '..', 'environment', 'mujocoEnv',
            'dt={}'.format(dt),
            'hasWalls={}_numBlocks={}_numSheeps={}_numWolves={}timeconst={}dampratio={}.xml'
            .format(hasWalls, numBlocks, numSheeps, numWolves, timeconst,
                    dampratio))

    else:
        physicsDynamicsPath = os.path.join(
            dirName, '..', '..', 'environment', 'mujocoEnv',
            'numBlocks={}_numSheeps={}_numWolves={}timeconst={}dampratio={}.xml'
            .format(numBlocks, numSheeps, numWolves, timeconst, dampratio))

    with open(physicsDynamicsPath) as f:
        xml_string = f.read()
    envXmlDict = xmltodict.parse(xml_string.strip())
    envXml = xmltodict.unparse(envXmlDict)

    physicsModel = mujoco.load_model_from_xml(envXml)
    physicsSimulation = mujoco.MjSim(physicsModel)
    # print(physicsSimulation.model.body_pos)

    qPosInit = [0, 0] * numAgents
    qVelInit = [0, 0] * numAgents
    qVelInitNoise = 0 * hasWalls
    qPosInitNoise = 0.8 * hasWalls
    getBlockRandomPos = lambda: np.random.uniform(-0.7 * hasWalls, +0.7 *
                                                  hasWalls, 2)
    getBlockSpeed = lambda: np.zeros(2)

    numQPos = len(physicsSimulation.data.qpos)
    numQVel = len(physicsSimulation.data.qvel)

    sampleAgentsQPos = lambda: np.asarray(qPosInit) + np.random.uniform(
        low=-qPosInitNoise, high=qPosInitNoise, size=numQPos)
    sampleAgentsQVel = lambda: np.asarray(qVelInit) + np.random.uniform(
        low=-qVelInitNoise, high=qVelInitNoise, size=numQVel)

    minDistance = 0.2 + 2 * blockSize
    isOverlap = IsOverlap(minDistance)

    sampleBlockState = SampleBlockState(numBlocks, getBlockRandomPos,
                                        getBlockSpeed, isOverlap)

    reset = ResetUniformWithoutXPos(physicsSimulation, numAgents, numBlocks,
                                    sampleAgentsQPos, sampleAgentsQVel,
                                    sampleBlockState)

    damping = 2.5
    numSimulationFrames = int(0.1 / dt)
    agentsMaxSpeedList = [wolfMaxSpeed] * numWolves + [sheepMaxSpeed
                                                       ] * numSheeps
    reshapeAction = ReshapeAction()
    isTerminal = lambda state: False
    transit = TransitionFunctionWithoutXPos(physicsSimulation, numAgents,
                                            numSimulationFrames,
                                            damping * dt * numSimulationFrames,
                                            agentsMaxSpeedList,
                                            visualizeMujoco, isTerminal,
                                            reshapeAction)

    maxRunningStepsToSample = 100
    sampleTrajectory = SampleTrajectory(maxRunningStepsToSample, transit,
                                        isTerminal, rewardFunc, reset)

    observeOneAgent = lambda agentID: Observe(agentID, wolvesID, sheepsID,
                                              blocksID, getPosFromAgentState,
                                              getVelFromAgentState)
    observe = lambda state: [
        observeOneAgent(agentID)(state) for agentID in range(numAgents)
    ]

    initObsForParams = observe(reset())
    obsShape = [
        initObsForParams[obsID].shape[0]
        for obsID in range(len(initObsForParams))
    ]

    worldDim = 2
    actionDim = worldDim * 2 + 1

    layerWidth = [128, 128]

    # ------------ model ------------------------
    buildMADDPGModels = BuildMADDPGModels(actionDim, numAgents, obsShape)
    modelsList = [
        buildMADDPGModels(layerWidth, agentID) for agentID in range(numAgents)
    ]

    if hasWalls:

        modelFolder = os.path.join(
            dirName, '..', 'trainedModels', 'mujocoMADDPGeavluateWall',
            'dt={}'.format(dt),
            'hasWalls=' + str(hasWalls) + 'numBlocks=' + str(numBlocks) +
            'numSheeps=' + str(numSheeps) + 'numWolves=' + str(numWolves) +
            'timeconst=' + str(timeconst) + 'dampratio=' + str(dampratio) +
            'individualRewardWolf=' + str(individualRewardWolf) +
            'sheepSpeedMultiplier=' + str(sheepSpeedMultiplier))
        individStr = 'individ' if individualRewardWolf else 'shared'
        fileName = "maddpghasWalls={}{}wolves{}sheep{}blocks{}episodes{}stepSheepSpeed{}{}_agent".format(
            hasWalls, numWolves, numSheeps, numBlocks, maxEpisode, maxTimeStep,
            sheepSpeedMultiplier, individStr)

    else:
        modelFolder = os.path.join(
            dirName, '..', 'trainedModels', 'mujocoMADDPG',
            'numBlocks=' + str(numBlocks) + 'numSheeps=' + str(numSheeps) +
            'numWolves=' + str(numWolves) + 'timeconst=' + str(timeconst) +
            'dampratio=' + str(dampratio))
        fileName = "maddpg{}wolves{}sheep{}blocks60000episodes25stepSheepSpeed1.0shared_agent".format(
            numWolves, numSheeps, numBlocks)
    modelPaths = [
        os.path.join(modelFolder, fileName + str(i) + '60000eps')
        for i in range(numAgents)
    ]

    [
        restoreVariables(model, path)
        for model, path in zip(modelsList, modelPaths)
    ]

    actOneStepOneModel = ActOneStep(actByPolicyTrainNoisy)
    policy = lambda allAgentsStates: [
        actOneStepOneModel(model, observe(allAgentsStates))
        for model in modelsList
    ]

    startTime = time.time()
    trajList = []
    for i in range(evalNum):
        np.random.seed(i)
        traj = sampleTrajectory(policy)
        trajList.append(list(traj))
    endTime = time.time()
    print("Time taken {} seconds to generate {} trajectories".format(
        (endTime - startTime), evalNum))

    # saveTraj
    if saveTraj:

        trajectoryFolder = os.path.join(dirName, '..', 'trajectory',
                                        'evluateWall')
        if not os.path.exists(trajectoryFolder):
            os.makedirs(trajectoryFolder)

        trajectorySaveExtension = '.pickle'
        fixedParameters = {'randomSeed': randomSeed, 'evalNum': evalNum}
        generateTrajectorySavePath = GetSavePath(trajectoryFolder,
                                                 trajectorySaveExtension,
                                                 fixedParameters)
        trajectorySavePath = generateTrajectorySavePath(parameterOneCondiiton)

        saveToPickle(trajList, trajectorySavePath)

    # visualize
    if visualizeTraj:
        entitiesColorList = [wolfColor] * numWolves + [
            sheepColor
        ] * numSheeps + [blockColor] * numBlocks
        render = Render(entitiesSizeList, entitiesColorList, numAgents,
                        getPosFromAgentState)
        print(trajList[0][0], '!!!3', trajList[0][1])
        trajToRender = np.concatenate(trajList)
        render(trajToRender)

    return endTime - startTime
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = 7
    buildModel = BuildModel(stateDim, actionDim)
    layersWidths = [30]
    writer, model = buildModel(layersWidths)

    learningRate = 0.001
    gamma = 0.99
    trainModelBySASRQ = TrainModelBySASRQ(learningRate, gamma, writer)

    paramUpdateInterval = 300
    updateParameters = UpdateParameters(paramUpdateInterval)
    model = resetTargetParamToTrainParam([model])[0]
    trainModels = TrainDQNModel(getTargetQValue, trainModelBySASRQ, updateParameters, model)

    epsilonMax = 0.9
    epsilonIncrease = 0.0001
    epsilonMin = 0
    bufferSize = 10000
    decayStartStep = bufferSize
    getEpsilon = GetEpsilon(epsilonMax, epsilonMin, epsilonIncrease, decayStartStep)

    actGreedyByModel = ActGreedyByModel(getTrainQValue, model)
    actRandom = ActRandom(actionDim)
    actByTrainNetEpsilonGreedy = ActByTrainNetEpsilonGreedy(getEpsilon, actGreedyByModel, actRandom)

    minibatchSize = 128
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    processAction = ProcessDiscretePendulumAction(actionDim)
    transit = TransitGymPendulum(processAction)
    getReward = RewardGymPendulum(angle_normalize, processAction)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDQNTimeStep = RunTimeStep(actByTrainNetEpsilonGreedy, sampleOneStep, learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    maxTimeStep = 200
    runEpisode = RunEpisode(reset, runDQNTimeStep, maxTimeStep, isTerminalGymPendulum)

    maxEpisode = 400
    dqn = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = dqn(replayBuffer)

    trainedModel = trainModels.getTrainedModels()

# save Model
    parameters = {'maxEpisode': maxEpisode, 'maxTimeStep': maxTimeStep, 'minibatchSize': minibatchSize, 'gamma': gamma,
                  'learningRate': learningRate, 'epsilonIncrease': epsilonIncrease , 'epsilonMin': epsilonMin}

    modelSaveDirectory = "../trainedDQNModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension)
    savePath = getSavePath(parameters)

    with trainedModel.as_default():
        saveVariables(trainedModel, savePath)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'pendulumDQNTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()

    showDemo = False
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)
def main():
    env = MtCarDiscreteEnvSetup()
    visualize = visualizeMtCarDiscrete()
    reset = resetMtCarDiscrete(1234)
    transition = MtCarDiscreteTransition()
    rewardMtCar = MtCarDiscreteReward()
    isterminal = MtCarDiscreteIsTerminal()

    statesdim = env.observation_space.shape[0]
    actiondim = env.action_space.n
    replaybuffer = deque(maxlen=buffersize)
    runepsilon = initepsilon
    totalrewards = []
    meanreward = []
    trajectory = []
    totalreward = []

    buildmodel = BuildModel(statesdim, actiondim)
    Writer, DQNmodel = buildmodel(numberlayers)
    replaceParameters = ReplaceParameters(replaceiter)
    trainModel = TrainModel(learningRate, gamma, Writer)
    trainDQNmodel = TrainDQNmodel(replaceParameters, trainModel, DQNmodel)
    learn = Learn(buffersize, batchsize, trainDQNmodel, actiondim)

    for episode in range(EPISODE):
        state = reset()
        rewards = 0
        runtime = 0
        while True:
            action = learn.Getaction(DQNmodel, runepsilon, state)
            nextstate = transition(state, action)
            done = isterminal(nextstate)
            reward = rewardMtCar(state, action, nextstate, done)
            learn.ReplayMemory(replaybuffer, state, action, reward, nextstate,
                               done)
            trajectory.append((state, action, reward, nextstate))
            rewards += reward
            state = nextstate
            runtime += 1
            if runtime == 200:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                runtime = 0
                print('episode: ', episode, 'reward:', rewards, 'epsilon:',
                      runepsilon)
                break
            if done:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                print('episode: ', episode, 'reward:', rewards, 'epsilon:',
                      runepsilon)
                break
        runepsilon = epsilonDec(runepsilon, minepsilon, epsilondec)
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ', episode, 'meanreward:', np.mean(totalreward))
            totalreward = []
    episode = 100 * (np.arange(len(meanreward)))
    plt.plot(episode, meanreward)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.ylim([-200, -50])
    plt.show()

    # save Model
    modelIndex = 0
    DQNFixedParam = {'DQNmodel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': EPISODE,
        'batch': batchsize,
        'buffersize': buffersize,
        'gam': gamma,
        'learningRate': learningRate,
        'replaceiter': replaceiter,
        'epsilondec': epsilondec,
        'minepsilon': minepsilon,
        'initepsilon': initepsilon
    }

    modelSaveDirectory = "/path/to/logs/trainedDQNModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                              DQNFixedParam)
    savePathDQN = getSavePath(parameters)

    with DQNmodel.as_default():
        saveVariables(DQNmodel, savePathDQN)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, 'trajectory',
                                  'mountCarTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)