def __call__(self, df):
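        """Train a CartPole DQN for one (learningRate, buffersize) setting drawn
        from the hyperparameter DataFrame index and return a pandas Series of
        the running mean episode reward. Assumes numpy (np), pandas (pd),
        collections.deque and the project's CartPole/DQN helper classes are
        imported at module level."""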
        learningRate = df.index.get_level_values('learningRate')[0]
        buffersize = df.index.get_level_values('buffersize')[0]
        
            
        visualize = visualizeCartpole() 
        reset = resetCartpole()         
        transition = CartPoletransition()  
        rewardcart = CartPoleReward()        
        isterminal = isTerminal()
        replaybuffer = deque(maxlen=int(buffersize))
        trajectory = []
        totalrewards = []
        averagerewards = []
    
        buildmodel = BuildModel(self.fixedParameters['stateDim'],self.fixedParameters['actionDim'])
        Writer,DQNmodel = buildmodel(self.fixedParameters['numberlayers'])
        replaceParameters = ReplaceParameters(self.fixedParameters['replaceiter'])
        trainModel = TrainModel(learningRate, self.fixedParameters['gamma'],Writer)
        trainDQNmodel = TrainDQNmodel(replaceParameters, trainModel, DQNmodel)
        learn = Learn(buffersize,self.fixedParameters['batchsize'],trainDQNmodel,self.fixedParameters['actionDim'])
        runepsilon = self.fixedParameters['initepsilon']
        
        
        
        for episode in range(self.fixedParameters['maxEpisode']):
            state  = reset()
            rewards = 0
            while True:
                visualize(state)
                runepsilon = epsilonDec(runepsilon,self.fixedParameters['minepsilon'],self.fixedParameters['epsilondec'])
                action = learn.Getaction(DQNmodel,runepsilon,state)  
                nextstate=transition(state, action)
                done = isterminal(nextstate)
                reward = rewardcart(state,action,nextstate,done)
                trajectory.append((state, action, reward, nextstate))
                learn.ReplayMemory(replaybuffer,state, action, reward, nextstate,done)
                rewards += reward
                state = nextstate
                if done:
                    totalrewards.append(rewards)
                    print('episode: ',episode,'reward:',rewards,'epsilon:',runepsilon)
                    break
            averagerewards.append(np.mean(totalrewards))
            print('episode:',episode,'meanreward:',np.mean(totalrewards))



        timeStep = list(range(len(averagerewards)))
        resultSe = pd.Series({time: reward for time, reward in zip(timeStep, averagerewards)})

        if self.saveModel:
            Parameters = {'learningRate': learningRate, 'buffersize': buffersize }
            modelPath = self.getSavePath(Parameters)
            with DQNmodel.as_default():
                saveVariables(DQNmodel, modelPath)

        return resultSe
Example #2
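This main() depends on imports and hyperparameters defined at module level that the snippet does not show, and the DDPG helper classes (BuildActorModel, TrainDDPGModels, RunAlgorithm, and so on) are assumed to come from the author's own modules. A minimal sketch of that assumed setup follows; the names are taken from the code below, but the concrete values are illustrative placeholders, not the author's settings.

import numpy as np
import matplotlib.pyplot as plt
from collections import deque

# Hypothetical module-level configuration assumed by main() -- values are guesses.
maxEpisode = 200                  # episodes run by RunAlgorithm
maxTimeStep = 100                 # steps per episode (also scales sheepAliveBonus)
bufferSize = 10000                # replay buffer capacity
minibatchSize = 32                # SampleFromMemory batch size
learningStartBufferSize = minibatchSize
noiseDecayStartStep = bufferSize  # step at which exploration noise starts decaying
gamma = 0.95                      # discount factor
tau = 0.01                        # soft target-update rate
learningRateActor = 1e-4
learningRateCritic = 1e-3
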
def main():
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 1
    varianceDiscount = .9995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)

    wolfSpeed = 1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(
        getWolfPos, getSheepPos, wolfSpeed)
    # wolfPolicy = lambda state: (0, 0)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState,
                                             stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20

    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary,
                                                  yBoundary,
                                                  sheepIndex=0,
                                                  punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty,
                                        isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep,
                                                  getIntendedNextState,
                                                  getBoundaryPunishment,
                                                  getSheepPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    # reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([10, 3, 15, 8]) #all [-1, -1] action
    # reset = lambda: np.array([15, 8, 10, 3]) # all [1. 1.]
    # reset = lambda: np.array([15, 10, 10, 10])
    reset = lambda: np.array([10, 10, 15, 5])

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'dimension': actionDim,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'gamma': gamma,
        'learningRateActor': learningRateActor,
        'learningRateCritic': learningRateCritic
    }

    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
Example #3
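Like the previous example, this script assumes module-level imports and configuration that are not shown; the trajectory filename suggests a Hopper-style Gym environment. A hedged sketch of the assumed setup (the environment id and all numeric values are guesses, and the DDPG helper classes are assumed to come from the project's own modules):

import os
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

# Hypothetical module-level configuration assumed by main() -- values are guesses.
Env_name = 'Hopper-v2'            # any continuous-action Gym env with this API works
env = gym.make(Env_name)
EPISODE = 200                     # number of training episodes
maxTimeStep = 200                 # steps per episode
buffersize = 10000
batchsize = 32
gamma = 0.99
tau = 0.01
actorlearningRate = 1e-4
criticlearningRate = 1e-3
actornumberlayers = [64]          # hidden layer widths passed to the model builders
criticnumberlayers = [64]
initnoisevar = 1.0
noiseDecay = 0.9995
minVar = 0.1
noiseDacayStep = buffersize       # spelling kept to match the GetNoise call below
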
def main():
    statedim = env.observation_space.shape[0]
    actiondim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow)/2
    
    replaybuffer = deque(maxlen=buffersize)
    paramUpdateFrequency = 1
    totalrewards = []
    meanreward = []
    totalreward = []
    
    buildActorModel = BuildActorModel(statedim, actiondim, actionBound)
    actorWriter, actorModel = buildActorModel(actornumberlayers)
    
    buildCriticModel = BuildCriticModel(statedim, actiondim)
    criticWriter, criticModel = buildCriticModel(criticnumberlayers)

    trainCritic = TrainCritic(criticlearningRate, gamma, criticWriter)
    trainActor = TrainActor(actorlearningRate, actorWriter)
    updateParameters = UpdateParameters(tau,paramUpdateFrequency)

    actorModel= ReplaceParameters(actorModel)
    criticModel= ReplaceParameters(criticModel)
    
    trainddpgModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)
    
    getnoise = GetNoise(noiseDecay,minVar,noiseDacayStep)
    getnoiseaction = GetNoiseAction(actorModel,actionLow, actionHigh)
    learn = Learn(buffersize,batchsize,trainddpgModels,actiondim)
    runtime = 0
    trajectory = []
    noisevar = initnoisevar
    for episode in range(EPISODE):
        state  = env.reset()
        rewards = 0
        for i in range(maxTimeStep):
            env.render()
            noise,noisevar = getnoise(runtime,noisevar)
            noiseaction = getnoiseaction(state,noise)
            nextstate,reward,done,info = env.step(noiseaction)
            learn(replaybuffer,state, noiseaction, nextstate,reward)
            trajectory.append((state, noiseaction, nextstate,reward))
            rewards += reward
            state = nextstate
            runtime += 1
        if i == maxTimeStep-1:
            totalrewards.append(rewards)
            totalreward.append(rewards)
            print('episode: ',episode,'reward:',rewards, 'noisevar',noisevar)
            
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ',episode,'meanreward:',np.mean(totalreward))
            totalreward = []
    plt.plot(range(EPISODE),totalrewards)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.show()
    # save Model
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {'env': Env_name, 'Eps': EPISODE,  'batchsize': batchsize,'buffersize': buffersize,'maxTimeStep':maxTimeStep,
                  'gamma': gamma, 'actorlearningRate': actorlearningRate, 'criticlearningRate': criticlearningRate,
                  'tau': tau, 'noiseDecay': noiseDecay, 'minVar': minVar, 'initnoisevar': initnoisevar}

    modelSaveDirectory = "/path/to/logs/trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(actorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(criticModel, savePathCritic)
        
    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName,'trajectory', 'HopperTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)
Example #4
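    # Note: this example is a class method rather than a standalone script. It
    # takes the Gym env as an argument, reads every hyperparameter from
    # self.fixedParameters, and assumes project helpers such as env_norm,
    # fillbuffer, GetNoise, GetNoiseAction, Learn, ReplaceParameters,
    # saveVariables and saveToPickle are importable, with self.runstep and
    # self.bufferfill initialised elsewhere (presumably in __init__).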
    def __call__(self, env):
        env = env_norm(env) if self.fixedParameters['normalizeEnv'] else env
        actionHigh = env.action_space.high
        actionLow = env.action_space.low
        actionBound = (actionHigh - actionLow) / 2
        stateDim = env.observation_space.shape[0]
        actionDim = env.action_space.shape[0]
        meanreward = []
        trajectory = []
        totalreward = []
        totalrewards = []
        episodereward = []
        replaybuffer = deque(maxlen=int(self.fixedParameters['bufferSize']))
        buildActorModel = BuildActorModel(
            stateDim, actionDim, actionBound,
            self.fixedParameters['actorHiddenLayersWeightInit'], self.fixedParameters['actorHiddenLayersBiasInit'],
            self.fixedParameters['actorOutputWeightInit'], self.fixedParameters['actorOutputBiasInit'],
            self.fixedParameters['actorActivFunction'], self.fixedParameters['gradNormClipValue'],
            self.fixedParameters['normalizeEnv'])
        actorModel = buildActorModel(self.fixedParameters['actorHiddenLayersWidths'])

        buildCriticModel = BuildCriticModel(
            stateDim, actionDim,
            self.fixedParameters['criticHiddenLayersWeightInit'], self.fixedParameters['criticHiddenLayersBiasInit'],
            self.fixedParameters['criticOutputWeightInit'], self.fixedParameters['criticOutputBiasInit'],
            self.fixedParameters['criticActivFunction'], self.fixedParameters['gradNormClipValue'],
            self.fixedParameters['normalizeEnv'])
        criticModel = buildCriticModel(self.fixedParameters['criticHiddenLayersWidths'])

        trainCritic = TrainCritic(self.fixedParameters['criticLR'], self.fixedParameters['gamma'])
        trainActor = TrainActor(self.fixedParameters['actorLR'])
        updateParameters = UpdateParameters(self.fixedParameters['tau'])
        trainddpgModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

        getnoise = GetNoise(self.fixedParameters['varianceDiscount'], self.fixedParameters['minVar'],
                            self.fixedParameters['noiseDecayStartStep'], self.fixedParameters['noiseInitVariance'])
        getnoiseaction = GetNoiseAction(actorModel, actionLow, actionHigh)
        learn = Learn(self.fixedParameters['bufferSize'], self.fixedParameters['minibatchSize'], trainddpgModels)
        actorModel = ReplaceParameters(actorModel)
        criticModel = ReplaceParameters(criticModel)
        state = env.reset()
        replaybuffer = fillbuffer(3000, self.bufferfill, env, replaybuffer, state)

        for episode in range(1, self.fixedParameters['maxEpisode'] + 1):
            state = env.reset()
            rewards = 0
            for j in range(self.fixedParameters['maxTimeStep']):
                env.render()
                noise = getnoise(self.runstep)
                noiseaction = getnoiseaction(state, noise)
                nextstate, reward, done, info = env.step(noiseaction)
                learn(replaybuffer, state, noiseaction, nextstate, reward)
                trajectory.append((state, noiseaction, nextstate, reward))
                rewards += reward
                state = nextstate
                self.runstep += 1
                if j == self.fixedParameters['maxTimeStep'] - 1:
                    totalrewards.append(rewards)
                    totalreward.append(rewards)
                    print('episode: ', episode, 'reward:', rewards, 'runstep', self.runstep)
            episodereward.append(np.mean(totalrewards))
            print('epireward', np.mean(totalrewards))
            if episode % 100 == 0:
                meanreward.append(np.mean(totalreward))
                print('episode: ', episode, 'meanreward:', np.mean(totalreward))
                totalreward = []
        with actorModel.as_default():
            saveVariables(actorModel, self.fixedParameters['modelSavePathMartin'])
        with criticModel.as_default():
            saveVariables(criticModel, self.fixedParameters['modelSavePathMartin'])
        saveToPickle(meanreward, self.fixedParameters['rewardSavePathMartin'])

        return episodereward
Example #5
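    # Hyperparameter-sweep variant for Gym Pendulum: the DataFrame index supplies
    # noiseInitVariance and memorySize, the remaining settings come from
    # self.fixedParameters, and the method returns a pandas Series of mean episode
    # rewards (pd, deque, seed, observe and the DDPG helper functions are assumed
    # to be defined at module level).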
    def __call__(self, df):
        noiseVariance = df.index.get_level_values('noiseInitVariance')[0]
        memorySize = df.index.get_level_values('memorySize')[0]

        buildActorModel = BuildActorModel(self.fixedParameters['stateDim'],
                                          self.fixedParameters['actionDim'],
                                          self.fixedParameters['actionBound'])
        actorWriter, actorModel = buildActorModel(
            self.fixedParameters['actorLayerWidths'])

        buildCriticModel = BuildCriticModel(self.fixedParameters['stateDim'],
                                            self.fixedParameters['actionDim'])
        criticWriter, criticModel = buildCriticModel(
            self.fixedParameters['criticLayerWidths'])

        trainCriticBySASRQ = TrainCriticBySASRQ(
            self.fixedParameters['learningRateCritic'],
            self.fixedParameters['gamma'], criticWriter)
        trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                                  trainCriticBySASRQ)

        trainActorFromGradients = TrainActorFromGradients(
            self.fixedParameters['learningRateActor'], actorWriter)
        trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                              trainActorFromGradients,
                                              getActionGradients)
        trainActor = TrainActor(trainActorOneStep)

        updateParameters = UpdateParameters(
            self.fixedParameters['paramUpdateInterval'],
            self.fixedParameters['tau'])

        modelList = [actorModel, criticModel]
        actorModel, criticModel = resetTargetParamToTrainParam(modelList)
        trainModels = TrainDDPGModels(updateParameters, trainActor,
                                      trainCritic, actorModel, criticModel)

        getNoise = GetExponentialDecayGaussNoise(
            noiseVariance, self.fixedParameters['varianceDiscount'],
            self.fixedParameters['noiseDecayStartStep'])
        actOneStepWithNoise = ActDDPGOneStep(
            self.fixedParameters['actionLow'],
            self.fixedParameters['actionHigh'], actByPolicyTrain, actorModel,
            getNoise)

        sampleFromMemory = SampleFromMemory(self.fixedParameters['batchSize'])
        learnFromBuffer = LearnFromBuffer(
            self.fixedParameters['learningStartStep'], sampleFromMemory,
            trainModels)

        transit = TransitGymPendulum()
        getReward = RewardGymPendulum(angle_normalize)
        sampleOneStep = SampleOneStep(transit, getReward)

        runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                      learnFromBuffer, observe)

        reset = ResetGymPendulum(seed)
        runEpisode = RunEpisode(reset, runDDPGTimeStep,
                                self.fixedParameters['maxRunSteps'],
                                isTerminalGymPendulum)

        ddpg = RunAlgorithm(runEpisode, self.fixedParameters['maxEpisode'])

        replayBuffer = deque(maxlen=int(memorySize))
        meanRewardList, trajectory = ddpg(replayBuffer)

        trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

        timeStep = list(range(len(meanRewardList)))
        resultSe = pd.Series(
            {time: reward
             for time, reward in zip(timeStep, meanRewardList)})

        if self.saveModel:
            actorParameters = {
                'ActorMemorySize': memorySize,
                'NoiseVariance': noiseVariance
            }
            criticParameters = {
                'CriticMemorySize': memorySize,
                'NoiseVariance': noiseVariance
            }
            actorPath = self.getSavePath(actorParameters)
            criticPath = self.getSavePath(criticParameters)
            with trainedActorModel.as_default():
                saveVariables(trainedActorModel, actorPath)
            with trainedCriticModel.as_default():
                saveVariables(trainedCriticModel, criticPath)

        return resultSe
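
# The next main() is a separate standalone script that trains DDPG on a
# continuous-action Gym environment (MountainCarContinuous, judging by the
# transition/reward helpers and the trajectory filename). It assumes env,
# ENV_NAME, bufferSize, minibatchSize, maxTimeStep, maxEpisode, gamma, tau,
# learningRateActor and learningRateCritic are defined at module level, along
# with the usual os / matplotlib / deque imports.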
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma,
                                            criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget,
                              trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor,
                                                      actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain,
                                          trainActorFromGradients,
                                          getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)

    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic,
                                  actorModel, criticModel)

    noiseInitVariance = 1  # control exploration
    varianceDiscount = .99995
    noiseDecayStartStep = bufferSize
    minVar = .1
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance,
                                             varianceDiscount,
                                             noiseDecayStartStep, minVar)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh,
                                         actByPolicyTrain, actorModel,
                                         getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize,
                                      sampleFromMemory, trainModels)

    transit = TransitGymMountCarContinuous()
    isTerminal = IsTerminalMountCarContin()
    getReward = RewardMountCarContin(isTerminal)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep,
                                  learnFromBuffer)

    resetLow = -1
    resetHigh = 0.4
    reset = ResetMountCarContin(seed=None)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)

    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save Model
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': maxEpisode,
        'timeStep': maxTimeStep,
        'batch': minibatchSize,
        'gam': gamma,
        'lrActor': learningRateActor,
        'lrCritic': learningRateCritic,
        'noiseVar': noiseInitVariance,
        'varDiscount': varianceDiscount,
        'resetLow': resetLow,
        'High': resetHigh
    }

    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                   actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                                    criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory',
                                  'mountCarTrajectoryOriginalReset1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # demo & plot
    showDemo = False
    if showDemo:
        visualize = VisualizeMountCarContin()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
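
# The next main() trains a discrete-action DQN on Gym Pendulum by discretising
# the torque into 7 actions. Most hyperparameters are defined inside the
# function; only env, seed, observe, angle_normalize and the DQN helper classes
# are assumed to exist at module level, plus the os / matplotlib / deque imports.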
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = 7
    buildModel = BuildModel(stateDim, actionDim)
    layersWidths = [30]
    writer, model = buildModel(layersWidths)

    learningRate = 0.001
    gamma = 0.99
    trainModelBySASRQ = TrainModelBySASRQ(learningRate, gamma, writer)

    paramUpdateInterval = 300
    updateParameters = UpdateParameters(paramUpdateInterval)
    model = resetTargetParamToTrainParam([model])[0]
    trainModels = TrainDQNModel(getTargetQValue, trainModelBySASRQ, updateParameters, model)

    epsilonMax = 0.9
    epsilonIncrease = 0.0001
    epsilonMin = 0
    bufferSize = 10000
    decayStartStep = bufferSize
    getEpsilon = GetEpsilon(epsilonMax, epsilonMin, epsilonIncrease, decayStartStep)

    actGreedyByModel = ActGreedyByModel(getTrainQValue, model)
    actRandom = ActRandom(actionDim)
    actByTrainNetEpsilonGreedy = ActByTrainNetEpsilonGreedy(getEpsilon, actGreedyByModel, actRandom)

    minibatchSize = 128
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    processAction = ProcessDiscretePendulumAction(actionDim)
    transit = TransitGymPendulum(processAction)
    getReward = RewardGymPendulum(angle_normalize, processAction)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDQNTimeStep = RunTimeStep(actByTrainNetEpsilonGreedy, sampleOneStep, learnFromBuffer, observe)

    reset = ResetGymPendulum(seed)
    maxTimeStep = 200
    runEpisode = RunEpisode(reset, runDQNTimeStep, maxTimeStep, isTerminalGymPendulum)

    maxEpisode = 400
    dqn = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = dqn(replayBuffer)

    trainedModel = trainModels.getTrainedModels()

    # save Model
    parameters = {'maxEpisode': maxEpisode, 'maxTimeStep': maxTimeStep, 'minibatchSize': minibatchSize, 'gamma': gamma,
                  'learningRate': learningRate, 'epsilonIncrease': epsilonIncrease , 'epsilonMin': epsilonMin}

    modelSaveDirectory = "../trainedDQNModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension)
    savePath = getSavePath(parameters)

    with trainedModel.as_default():
        saveVariables(trainedModel, savePath)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'pendulumDQNTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()

    showDemo = False
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)
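
# The final main() trains a DQN on discrete-action MountainCar using the
# project's own environment wrappers. It assumes ENV_NAME, EPISODE, buffersize,
# batchsize, numberlayers, replaceiter, learningRate, gamma, initepsilon,
# minepsilon, epsilondec and the epsilonDec helper are defined at module level,
# plus the os / matplotlib / numpy / deque imports.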
def main():
    env = MtCarDiscreteEnvSetup()
    visualize = visualizeMtCarDiscrete()
    reset = resetMtCarDiscrete(1234)
    transition = MtCarDiscreteTransition()
    rewardMtCar = MtCarDiscreteReward()
    isterminal = MtCarDiscreteIsTerminal()

    statesdim = env.observation_space.shape[0]
    actiondim = env.action_space.n
    replaybuffer = deque(maxlen=buffersize)
    runepsilon = initepsilon
    totalrewards = []
    meanreward = []
    trajectory = []
    totalreward = []

    buildmodel = BuildModel(statesdim, actiondim)
    Writer, DQNmodel = buildmodel(numberlayers)
    replaceParameters = ReplaceParameters(replaceiter)
    trainModel = TrainModel(learningRate, gamma, Writer)
    trainDQNmodel = TrainDQNmodel(replaceParameters, trainModel, DQNmodel)
    learn = Learn(buffersize, batchsize, trainDQNmodel, actiondim)

    for episode in range(EPISODE):
        state = reset()
        rewards = 0
        runtime = 0
        while True:
            action = learn.Getaction(DQNmodel, runepsilon, state)
            nextstate = transition(state, action)
            done = isterminal(nextstate)
            reward = rewardMtCar(state, action, nextstate, done)
            learn.ReplayMemory(replaybuffer, state, action, reward, nextstate,
                               done)
            trajectory.append((state, action, reward, nextstate))
            rewards += reward
            state = nextstate
            runtime += 1
            if runtime == 200:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                runtime = 0
                print('episode: ', episode, 'reward:', rewards, 'epsilon:',
                      runepsilon)
                break
            if done:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                print('episode: ', episode, 'reward:', rewards, 'epsilon:',
                      runepsilon)
                break
        runepsilon = epsilonDec(runepsilon, minepsilon, epsilondec)
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ', episode, 'meanreward:', np.mean(totalreward))
            totalreward = []
    episode = 100 * (np.arange(len(meanreward)))
    plt.plot(episode, meanreward)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.ylim([-200, -50])
    plt.show()

    # save Model
    modelIndex = 0
    DQNFixedParam = {'DQNmodel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': EPISODE,
        'batch': batchsize,
        'buffersize': buffersize,
        'gam': gamma,
        'learningRate': learningRate,
        'replaceiter': replaceiter,
        'epsilondec': epsilondec,
        'minepsilon': minepsilon,
        'initepsilon': initepsilon
    }

    modelSaveDirectory = "/path/to/logs/trainedDQNModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension,
                              DQNFixedParam)
    savePathDQN = getSavePath(parameters)

    with DQNmodel.as_default():
        saveVariables(DQNmodel, savePathDQN)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, 'trajectory',
                                  'mountCarTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)