Code example #1
    def testCriticImprovement(self, stateBatch, actionBatch, rewardBatch,
                              targetQValue):
        criticWriter, criticModel = self.buildCriticModel(
            self.criticLayerWidths)
        trainCriticBySASRQ = TrainCriticBySASRQ(self.learningRateCritic,
                                                self.gamma, criticWriter)
        lossWithTrain1, criticModel = trainCriticBySASRQ(
            criticModel, stateBatch, actionBatch, rewardBatch, targetQValue)
        lossWithTrain2, criticModel = trainCriticBySASRQ(
            criticModel, stateBatch, actionBatch, rewardBatch, targetQValue)

        self.assertTrue(lossWithTrain1 > lossWithTrain2)
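The assertion holds because two consecutive gradient steps on the same fixed batch lower a smooth loss when the learning rate is small enough. A minimal quadratic sketch of that property (toy values, not the project's critic):

def loss(w):
    return (w - 2.0) ** 2        # toy quadratic loss with minimum at w = 2

def grad(w):
    return 2.0 * (w - 2.0)       # gradient of the toy loss

w, lr = 5.0, 0.1
loss1 = loss(w)
w -= lr * grad(w)                # first training step
loss2 = loss(w)
w -= lr * grad(w)                # second training step
assert loss(w) < loss2 < loss1   # each step reduces the loss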
Code example #2
    def testActorTrainImprovement(self):
        stateBatch = [[2, 5, 10, 5, 2, 5, 10, 5], [1, 1, 1, 1, 1, 1, 1, 1]]
        actionBatch = [[0.1, 0.2, 0.3, 0.4, 0.5], [0.2, 0.2, 0.2, 0.2, 0.2]]
        rewardBatch = [[2], [0]]
        targetQValue = [[3], [1]]

        numStateSpace = len(stateBatch[0])
        actionDim = 5
        actionRange = 1

        buildActorModel = BuildActorModel(numStateSpace, actionDim,
                                          actionRange)
        actorLayerWidths = [64, 64]
        criticLayerWidths = [64, 64]
        buildCriticModel = BuildCriticModel(numStateSpace, actionDim)

        actorWriter, actorModel = buildActorModel(actorLayerWidths)
        criticWriter, criticModel = buildCriticModel(criticLayerWidths)

        trainCriticBySASRQ = TrainCriticBySASRQ(self.learningRateCritic,
                                                self.gamma, criticWriter)

        for i in range(100):
            lossWithTrain, criticModel = trainCriticBySASRQ(
                criticModel, stateBatch, actionBatch, rewardBatch,
                targetQValue)
            print(lossWithTrain)

        actionUntrained = actByPolicyTrain(actorModel, stateBatch)
        actionUntrainedQVal = evaluateCriticTrain(criticModel, stateBatch,
                                                  actionUntrained)

        trainActorFromGradients = TrainActorFromGradients(
            self.learningRateActor, actorWriter)
        trainOneStep = TrainActorOneStep(actByPolicyTrain,
                                         trainActorFromGradients,
                                         getActionGradients)

        actorModel = trainOneStep(actorModel, criticModel, stateBatch)
        actionTrained = actByPolicyTrain(actorModel, stateBatch)
        actionTrainedValue = evaluateCriticTrain(criticModel, stateBatch,
                                                 actionTrained)

        for trained, untrained in zip(actionTrainedValue, actionUntrainedQVal):
            self.assertTrue(trained > untrained)
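The update applied by TrainActorOneStep is the deterministic policy gradient: the critic's action gradient dQ/da is chained through the actor, so dQ/dW = dA/dW @ dQ/dA. A pure-NumPy sketch with a hypothetical linear actor, showing only the chaining:

import numpy as np

state = np.array([[2., 5.]])          # 1 x stateDim, illustrative
W = np.zeros((2, 1))                  # linear actor weights: action = state @ W
learningRate = 0.01

action = state @ W                    # actor forward pass
dQ_dA = np.array([[0.7]])             # assumed critic action gradient at (state, action)
dA_dW = state.T                       # 2 x 1: gradient of the action w.r.t. W
W += learningRate * dA_dW @ dQ_dA     # ascend Q(s, pi(s)) by updating the actor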
Code example #3
    def testCriticLossCalculation(self, stateBatch, actionBatch, rewardBatch,
                                  targetQValue):
        criticWriter, criticModel = self.buildCriticModel(
            self.criticLayerWidths)
        trainCriticBySASRQ = TrainCriticBySASRQ(self.learningRateCritic,
                                                self.gamma, criticWriter)

        criticGraph = criticModel.graph
        states_ = criticGraph.get_collection_ref("states_")[0]
        actionTarget_ = criticGraph.get_collection_ref("action_")[0]
        trainQ_ = criticGraph.get_collection_ref("trainQ_")[0]

        trainQVal = criticModel.run(trainQ_,
                                    feed_dict={
                                        states_: stateBatch,
                                        actionTarget_: actionBatch
                                    })
        calculatedLoss, criticModel = trainCriticBySASRQ(
            criticModel, stateBatch, actionBatch, rewardBatch, targetQValue)

        yi = np.array(rewardBatch) + self.gamma * np.array(targetQValue)
        trueLoss = np.mean(np.square(yi - trainQVal))
        self.assertAlmostEqual(trueLoss, calculatedLoss, places=3)
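For reference, the target reconstructed above is the Bellman backup y = r + gamma * Q'(s', a'), and the critic loss is the mean squared error against it. The same arithmetic in isolation (illustrative values):

import numpy as np

gamma = 0.9                                # assumed discount factor
rewardBatch = np.array([[2.], [0.]])       # illustrative rewards
targetQValue = np.array([[3.], [1.]])      # illustrative target-critic outputs
trainQVal = np.array([[2.5], [0.5]])       # hypothetical train-critic outputs

yi = rewardBatch + gamma * targetQValue        # Bellman target y
trueLoss = np.mean(np.square(yi - trainQVal))  # MSE critic loss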
Code example #4
    def testDDPGUpdateCriticParams(self):
        criticWriter, criticModel = self.buildCriticModel(
            self.criticLayerWidths)
        trainCriticBySASRQ = TrainCriticBySASRQ(self.learningRateCritic,
                                                self.gamma, criticWriter)
        stateBatch = [[1, 1, 1, 1]]
        actionBatch = [[2, 2]]
        rewardBatch = [[2]]
        targetQValue = [[2]]

        runTime = 20
        for i in range(runTime):
            calculatedLoss, criticModel = trainCriticBySASRQ(
                criticModel, stateBatch, actionBatch, rewardBatch,
                targetQValue)

        criticGraph = criticModel.graph
        trainParams_ = criticGraph.get_collection_ref("trainParams_")[0]
        targetParams_ = criticGraph.get_collection_ref("targetParams_")[0]
        trainParams, targetParams = criticModel.run(
            [trainParams_, targetParams_])

        updatedCriticModel = self.updateParameters(criticModel)

        updatedCriticGraph = updatedCriticModel.graph
        updatedTrainParams_ = updatedCriticGraph.get_collection_ref(
            "trainParams_")[0]
        updatedTargetParams_ = updatedCriticGraph.get_collection_ref(
            "targetParams_")[0]
        updatedTrainParams, updatedTargetParams = updatedCriticModel.run(
            [updatedTrainParams_, updatedTargetParams_])

        calUpdatedTargetParam = (1 - self.tau) * np.array(
            targetParams) + self.tau * np.array(updatedTrainParams)
        difference = np.array(updatedTargetParams) - calUpdatedTargetParam

        for paramDiff in difference:
            self.assertEqual(np.mean(paramDiff), 0)
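The quantity checked above is the DDPG soft ("Polyak") target update, theta_target <- (1 - tau) * theta_target + tau * theta_train. A standalone NumPy sketch with an assumed tau and illustrative shapes:

import numpy as np

tau = 0.01                                      # assumed soft-update rate
trainParams = [np.ones((4, 2)), np.ones(2)]     # illustrative layer parameters
targetParams = [np.zeros((4, 2)), np.zeros(2)]

updatedTargetParams = [(1 - tau) * target + tau * train
                       for train, target in zip(trainParams, targetParams)]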
Code example #5
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                            isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum')
    getTrainedModel = lambda: trainModels.actorModel
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel,
                          modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # demo & plot
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
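GetExponentialDecayGaussNoise is used here but not shown. A hedged sketch of the rule its arguments suggest (keep the variance fixed until the decay start step, then multiply by the discount each step; the project's exact rule may differ):

import numpy as np

class GetExponentialDecayGaussNoiseSketch:
    """Hypothetical reimplementation, interface assumed."""
    def __init__(self, initVariance, varianceDiscount, decayStartStep):
        self.initVariance = initVariance
        self.varianceDiscount = varianceDiscount
        self.decayStartStep = decayStartStep

    def __call__(self, runStep):
        variance = self.initVariance
        if runStep > self.decayStartStep:
            variance *= self.varianceDiscount ** (runStep - self.decayStartStep)
        return np.random.normal(0, variance)   # zero-mean Gaussian exploration noise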
Code example #6
    def __call__(self, df):
        varianceDiscount = df.index.get_level_values('varianceDiscount')[0]
        bufferSize = df.index.get_level_values('bufferSize')[0]
        layerWidth = df.index.get_level_values('layerWidth')[0]
        print('buffer: ', bufferSize, ', layers: ', layerWidth,
              ', varDiscount: ', varianceDiscount)

        buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
        actorWriter, actorModel = buildActorModel(layerWidth)

        buildCriticModel = BuildCriticModel(stateDim, actionDim)
        criticWriter, criticModel = buildCriticModel(layerWidth)

        trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                                criticWriter)
        trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                                  trainCriticBySASRQ)

        trainActorFromGradients = TrainActorFromGradients(
            learningRateActor, actorWriter)
        trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                              trainActorFromGradients,
                                              getActionGradients)
        trainActor = TrainActor(trainActorOneStep)

        paramUpdateInterval = 1
        updateParameters = UpdateParameters(paramUpdateInterval, tau)

        modelList = [actorModel, criticModel]
        actorModel, criticModel = resetTargetParamToTrainParam(modelList)
        trainModels = TrainDDPGModels(updateParameters, trainActor,
                                      trainCritic, actorModel, criticModel)

        noiseInitVariance = 1
        noiseDecayStartStep = bufferSize
        getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                                 varianceDiscount,
                                                 noiseDecayStartStep)
        actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                             actByPolicyTrain, actorModel,
                                             getNoise)

        learningStartBufferSize = minibatchSize
        sampleFromMemory = SampleFromMemory(minibatchSize)
        learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                          sampleFromMemory, trainModels)

        sheepId = 0
        wolfId = 1
        getSheepXPos = GetAgentPosFromState(sheepId)
        getWolfXPos = GetAgentPosFromState(wolfId)

        wolfSpeed = 2
        wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
            getWolfXPos, getSheepXPos, wolfSpeed)
        xBoundary = (0, 20)
        yBoundary = (0, 20)
        stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
        physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                                 stayWithinBoundary)
        transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

        sheepAliveBonus = 0 / maxTimeStep
        sheepTerminalPenalty = -20

        killzoneRadius = 1
        isTerminal = IsTerminal(getWolfXPos, getSheepXPos, killzoneRadius)
        getBoundaryPunishment = GetBoundaryPunishment(xBoundary,
                                                      yBoundary,
                                                      sheepIndex=0,
                                                      punishmentVal=10)
        rewardSheep = RewardFunctionCompete(sheepAliveBonus,
                                            sheepTerminalPenalty, isTerminal)
        getReward = RewardSheepWithBoundaryHeuristics(rewardSheep,
                                                      getIntendedNextState,
                                                      getBoundaryPunishment,
                                                      getSheepXPos)
        sampleOneStep = SampleOneStep(transit, getReward)

        runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                      learnFromBuffer)

        reset = Reset(xBoundary, yBoundary, numAgents)
        runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                                isTerminal)

        ddpg = RunAlgorithm(runEpisode, maxEpisode)

        replayBuffer = deque(maxlen=int(bufferSize))
        meanRewardList, trajectory = ddpg(replayBuffer)

        timeStep = list(range(len(meanRewardList)))
        resultSe = pd.Series(
            {time: reward
             for time, reward in zip(timeStep, meanRewardList)})

        return resultSe
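This __call__ reads each experimental condition off a pandas MultiIndex, which suggests the object is applied groupwise over a conditions DataFrame. A hypothetical driver (names and values are assumptions):

import pandas as pd

levelNames = ['varianceDiscount', 'bufferSize', 'layerWidth']
levelValues = [[.9995, .99995], [10000], [(64, 64)]]   # tuples keep index values hashable
conditions = pd.MultiIndex.from_product(levelValues, names=levelNames)
toSplitFrame = pd.DataFrame(index=conditions)

# evaluateModel would be an instance of the class above:
# resultDf = toSplitFrame.groupby(levelNames).apply(evaluateModel)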
Code example #7
def main():
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 1
    varianceDiscount = .9995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)

    wolfSpeed = 1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)
    # wolfPolicy = lambda state: (0, 0)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                             stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20

    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary,
                                                  yBoundary,
                                                  sheepIndex=0,
                                                  punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty,
                                        isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep,
                                                  getIntendedNextState,
                                                  getBoundaryPunishment,
                                                  getSheepPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    # reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([10, 3, 15, 8]) #all [-1, -1] action
    # reset = lambda: np.array([15, 8, 10, 3]) # all [1. 1.]
    # reset = lambda: np.array([15, 10, 10, 10])
    reset = lambda: np.array([10, 10, 15, 5])

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'dimension': actionDim,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'gamma': gamma,
        'learningRateActor': learningRateActor,
        'learningRateCritic': learningRateCritic
    }

    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with trainedActorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with trainedCriticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
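HeatSeekingContinuousDeterministicPolicy comes from the project; a minimal sketch of the behavior its name and constructor imply (move toward the prey at a fixed speed; the exact implementation is an assumption):

import numpy as np

def makeHeatSeekingPolicy(getPredatorPos, getPreyPos, speed):
    def policy(state):
        direction = np.asarray(getPreyPos(state)) - np.asarray(getPredatorPos(state))
        norm = np.linalg.norm(direction)
        if norm == 0:
            return np.zeros_like(direction, dtype=float)
        return speed * direction / norm   # unit vector toward the prey, scaled
    return policy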
Code example #8
    def __call__(self, df):
        noiseVariance = df.index.get_level_values('noiseInitVariance')[0]
        memorySize = df.index.get_level_values('memorySize')[0]

        buildActorModel = BuildActorModel(self.fixedParameters['stateDim'],
                                          self.fixedParameters['actionDim'],
                                          self.fixedParameters['actionBound'])
        actorWriter, actorModel = buildActorModel(
            self.fixedParameters['actorLayerWidths'])

        buildCriticModel = BuildCriticModel(self.fixedParameters['stateDim'],
                                            self.fixedParameters['actionDim'])
        criticWriter, criticModel = buildCriticModel(
            self.fixedParameters['criticLayerWidths'])

        trainCriticBySASRQ = TrainCriticBySASRQ(
            self.fixedParameters['learningRateCritic'],
            self.fixedParameters['gamma'], criticWriter)
        trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                                  trainCriticBySASRQ)

        trainActorFromGradients = TrainActorFromGradients(
            self.fixedParameters['learningRateActor'], actorWriter)
        trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                              trainActorFromGradients,
                                              getActionGradients)
        trainActor = TrainActor(trainActorOneStep)

        updateParameters = UpdateParameters(
            self.fixedParameters['paramUpdateInterval'],
            self.fixedParameters['tau'])

        modelList = [actorModel, criticModel]
        actorModel, criticModel = resetTargetParamToTrainParam(modelList)
        trainModels = TrainDDPGModels(updateParameters, trainActor,
                                      trainCritic, actorModel, criticModel)

        getNoise = GetExponentialDecayGaussNoise(
            noiseVariance, self.fixedParameters['varianceDiscount'],
            self.fixedParameters['noiseDecayStartStep'])
        actOneStepWithNoise = ActDDPGOneStep(
            self.fixedParameters['actionLow'],
            self.fixedParameters['actionHigh'], actByPolicyTrain, actorModel,
            getNoise)

        sampleFromMemory = SampleFromMemory(self.fixedParameters['batchSize'])
        learnFromBuffer = LearnFromBuffer(
            self.fixedParameters['learningStartStep'], sampleFromMemory,
            trainModels)

        transit = TransitGymPendulum()
        getReward = RewardGymPendulum(angle_normalize)
        sampleOneStep = SampleOneStep(transit, getReward)

        runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                      learnFromBuffer, observe)

        reset = ResetGymPendulum(seed)
        runEpisode = RunEpisode(reset, runDDPGTimeStep,
                                self.fixedParameters['maxRunSteps'],
                                isTerminalGymPendulum)

        ddpg = RunAlgorithm(runEpisode, self.fixedParameters['maxEpisode'])

        replayBuffer = deque(maxlen=int(memorySize))
        meanRewardList, trajectory = ddpg(replayBuffer)

        trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

        timeStep = list(range(len(meanRewardList)))
        resultSe = pd.Series(
            {time: reward
             for time, reward in zip(timeStep, meanRewardList)})

        if self.saveModel:
            actorParameters = {
                'ActorMemorySize': memorySize,
                'NoiseVariance': noiseVariance
            }
            criticParameters = {
                'CriticMemorySize': memorySize,
                'NoiseVariance': noiseVariance
            }
            actorPath = self.getSavePath(actorParameters)
            criticPath = self.getSavePath(criticParameters)
            with trainedActorModel.as_default():
                saveVariables(trainedActorModel, actorPath)
            with trainedCriticModel.as_default():
                saveVariables(trainedCriticModel, criticPath)

        return resultSe
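SampleFromMemory is imported from the project; a hedged sketch of the uniform replay sampling it presumably wraps (interface assumed):

import random

class SampleFromMemorySketch:
    """Hypothetical: uniform minibatch sampling from a replay deque."""
    def __init__(self, minibatchSize):
        self.minibatchSize = minibatchSize

    def __call__(self, memoryBuffer):
        return random.sample(memoryBuffer, self.minibatchSize)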
Code example #9
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    sampleOneStep = SampleOneStepUsingGym(env)
    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    reset = lambda: env.reset()
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep,
                            isTerminalGymPendulum)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    env.close()

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
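SampleOneStepUsingGym is the one piece here that differs from the earlier Pendulum script; a hedged sketch of what it presumably does (one env.step call per transition; the old gym return signature and the (reward, nextState) return order are assumptions):

class SampleOneStepUsingGymSketch:
    """Hypothetical wrapper around env.step; the project's version may differ."""
    def __init__(self, env):
        self.env = env

    def __call__(self, state, action):
        nextState, reward, done, info = self.env.step(action)
        return reward, nextState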
Code example #10
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 1  # control exploration
    varianceDiscount = .99995
    noiseDecayStartStep = bufferSize
    minVar = .1
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep, minVar)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymMountCarContinuous()
    isTerminal = IsTerminalMountCarContin()
    getReward = RewardMountCarContin(isTerminal)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    resetLow = -1
    resetHigh = 0.4
    reset = ResetMountCarContin(seed=None)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save models
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': maxEpisode,
        'timeStep': maxTimeStep,
        'batch': minibatchSize,
        'gam': gamma,
        'lrActor': learningRateActor,
        'lrCritic': learningRateCritic,
        'noiseVar': noiseInitVariance,
        'varDiscount': varianceDiscount,
        'resetLow': resetLow,
        'resetHigh': resetHigh
    }

    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with trainedActorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with trainedCriticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'mountCarTrajectoryOriginalReset1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # demo & plot
    showDemo = False
    if showDemo:
        visualize = VisualizeMountCarContin()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
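RewardMountCarContin is imported from the project; for orientation, a sketch matching gym's MountainCarContinuous-v0 convention (a +100 bonus at the goal and a small quadratic action cost; treating this as the project's exact reward is an assumption):

class RewardMountCarContinSketch:
    """Hypothetical reward in the MountainCarContinuous-v0 style."""
    def __init__(self, isTerminal):
        self.isTerminal = isTerminal

    def __call__(self, state, action, nextState):
        bonus = 100 if self.isTerminal(nextState) else 0
        return bonus - 0.1 * float(action[0]) ** 2   # action cost every step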