def __call__(self, df):
    learningRate = df.index.get_level_values('learningRate')[0]
    buffersize = df.index.get_level_values('buffersize')[0]

    visualize = visualizeCartpole()
    reset = resetCartpole()
    transition = CartPoletransition()
    rewardcart = CartPoleReward()
    isterminal = isTerminal()

    replaybuffer = deque(maxlen=int(buffersize))
    trajectory = []
    totalrewards = []
    averagerewards = []

    buildmodel = BuildModel(self.fixedParameters['stateDim'], self.fixedParameters['actionDim'])
    Writer, DQNmodel = buildmodel(self.fixedParameters['numberlayers'])
    replaceParameters = ReplaceParameters(self.fixedParameters['replaceiter'])
    trainModel = TrainModel(learningRate, self.fixedParameters['gamma'], Writer)
    trainDQNmodel = TrainDQNmodel(replaceParameters, trainModel, DQNmodel)
    learn = Learn(buffersize, self.fixedParameters['batchsize'], trainDQNmodel, self.fixedParameters['actionDim'])

    runepsilon = self.fixedParameters['initepsilon']
    for episode in range(self.fixedParameters['maxEpisode']):
        state = reset()
        rewards = 0
        while True:
            visualize(state)
            runepsilon = epsilonDec(runepsilon, self.fixedParameters['minepsilon'], self.fixedParameters['epsilondec'])
            action = learn.Getaction(DQNmodel, runepsilon, state)
            nextstate = transition(state, action)
            done = isterminal(nextstate)
            reward = rewardcart(state, action, nextstate, done)
            trajectory.append((state, action, reward, nextstate))
            learn.ReplayMemory(replaybuffer, state, action, reward, nextstate, done)
            rewards += reward
            state = nextstate
            if done:
                totalrewards.append(rewards)
                print('episode: ', episode, 'reward:', rewards, 'epsilon:', runepsilon)
                break
        averagerewards.append(np.mean(totalrewards))
        print('episode:', episode, 'meanreward:', np.mean(totalrewards))

    timeStep = list(range(len(averagerewards)))
    resultSe = pd.Series({time: reward for time, reward in zip(timeStep, averagerewards)})

    if self.saveModel:
        Parameters = {'learningRate': learningRate, 'buffersize': buffersize}
        modelPath = self.getSavePath(Parameters)
        with DQNmodel.as_default():
            saveVariables(DQNmodel, modelPath)

    return resultSe
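# Illustrative usage sketch (not part of the original module): the evaluator above reads
# its hyperparameters from the MultiIndex of the frame it receives, so a sweep can be
# driven with pandas groupby/apply. `evaluate`, `sweepHyperparameters`, and the value
# lists below are hypothetical names standing in for however the repo wires this up.
import itertools
import pandas as pd

def sweepHyperparameters(evaluate, learningRates, bufferSizes):
    # One index row per (learningRate, buffersize) combination.
    levelValues = list(itertools.product(learningRates, bufferSizes))
    index = pd.MultiIndex.from_tuples(levelValues, names=['learningRate', 'buffersize'])
    toSplitFrame = pd.DataFrame(index=index)
    # Each group is a one-row frame whose index carries the hyperparameters;
    # evaluate(df) returns a Series of mean rewards per episode.
    resultDF = toSplitFrame.groupby(level=['learningRate', 'buffersize']).apply(evaluate)
    return resultDF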
def main():
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 1
    varianceDiscount = .9995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(getWolfPos, getSheepPos, wolfSpeed)
    # wolfPolicy = lambda state: (0, 0)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary, sheepIndex=0, punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)

    # reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([10, 3, 15, 8])   # all [-1, -1] action
    # reset = lambda: np.array([15, 8, 10, 3])   # all [1, 1] action
    # reset = lambda: np.array([15, 10, 10, 10])
    reset = lambda: np.array([10, 10, 15, 5])

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {'wolfSpeed': wolfSpeed, 'dimension': actionDim, 'maxEpisode': maxEpisode,
                  'maxTimeStep': maxTimeStep, 'minibatchSize': minibatchSize, 'gamma': gamma,
                  'learningRateActor': learningRateActor, 'learningRateCritic': learningRateCritic}
    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)
    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
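# Minimal sketch (an assumption, not the repo's implementation) of what a heat-seeking
# deterministic wolf policy computes: a velocity of fixed magnitude pointing from the
# wolf toward the sheep. `getWolfPos` / `getSheepPos` stand for the state-slicing callables.
import numpy as np

def heatSeekingAction(state, getWolfPos, getSheepPos, speed):
    wolfPos = np.asarray(getWolfPos(state), dtype=float)
    sheepPos = np.asarray(getSheepPos(state), dtype=float)
    direction = sheepPos - wolfPos
    norm = np.linalg.norm(direction)
    if norm < 1e-8:
        return np.zeros_like(direction)  # wolf already on top of the sheep
    return speed * direction / norm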
def main():
    statedim = env.observation_space.shape[0]
    actiondim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    replaybuffer = deque(maxlen=buffersize)
    paramUpdateFrequency = 1
    totalrewards = []
    meanreward = []
    totalreward = []

    buildActorModel = BuildActorModel(statedim, actiondim, actionBound)
    actorWriter, actorModel = buildActorModel(actornumberlayers)
    buildCriticModel = BuildCriticModel(statedim, actiondim)
    criticWriter, criticModel = buildCriticModel(criticnumberlayers)

    trainCritic = TrainCritic(criticlearningRate, gamma, criticWriter)
    trainActor = TrainActor(actorlearningRate, actorWriter)
    updateParameters = UpdateParameters(tau, paramUpdateFrequency)
    actorModel = ReplaceParameters(actorModel)
    criticModel = ReplaceParameters(criticModel)
    trainddpgModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    getnoise = GetNoise(noiseDecay, minVar, noiseDacayStep)
    getnoiseaction = GetNoiseAction(actorModel, actionLow, actionHigh)
    learn = Learn(buffersize, batchsize, trainddpgModels, actiondim)

    runtime = 0
    trajectory = []
    noisevar = initnoisevar
    for episode in range(EPISODE):
        state = env.reset()
        rewards = 0
        for i in range(maxTimeStep):
            env.render()
            noise, noisevar = getnoise(runtime, noisevar)
            noiseaction = getnoiseaction(state, noise)
            nextstate, reward, done, info = env.step(noiseaction)
            learn(replaybuffer, state, noiseaction, nextstate, reward)
            trajectory.append((state, noiseaction, nextstate, reward))
            rewards += reward
            state = nextstate
            runtime += 1
            print(actionHigh, actionLow)
            if i == maxTimeStep - 1:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                print('episode: ', episode, 'reward:', rewards, 'noisevar', noisevar)
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ', episode, 'meanreward:', np.mean(totalreward))
            totalreward = []

    plt.plot(range(EPISODE), totalrewards)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.show()

    # save Model (actor and critic are written to separate checkpoint paths)
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {'env': Env_name, 'Eps': EPISODE, 'batchsize': batchsize, 'buffersize': buffersize,
                  'maxTimeStep': maxTimeStep, 'gamma': gamma, 'actorlearningRate': actorlearningRate,
                  'criticlearningRate': criticlearningRate, 'tau': tau, 'noiseDecay': noiseDecay,
                  'minVar': minVar, 'initnoisevar': initnoisevar}
    modelSaveDirectory = "/path/to/logs/trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)
    with actorModel.as_default():
        saveVariables(actorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(criticModel, savePathCritic)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, 'trajectory', 'HopperTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)
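# Sketch of the exploration scheme this loop relies on (an assumption about what
# GetNoise / GetNoiseAction do, not their actual code): Gaussian noise whose variance
# decays after a warm-up period, added to the deterministic action and clipped to bounds.
import numpy as np

def noisyAction(deterministicAction, noiseVar, actionLow, actionHigh):
    noise = np.random.normal(0.0, np.sqrt(noiseVar), size=np.shape(deterministicAction))
    return np.clip(deterministicAction + noise, actionLow, actionHigh)

def decayVariance(noiseVar, runtime, decayStartStep, noiseDecay, minVar):
    # Keep full exploration until decayStartStep, then shrink multiplicatively to a floor.
    if runtime > decayStartStep:
        noiseVar = max(noiseVar * noiseDecay, minVar)
    return noiseVar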
def __call__(self, env):
    env = env_norm(env) if self.fixedParameters['normalizeEnv'] else env
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]

    meanreward = []
    trajectory = []
    totalreward = []
    totalrewards = []
    episodereward = []
    replaybuffer = deque(maxlen=int(self.fixedParameters['bufferSize']))

    buildActorModel = BuildActorModel(
        stateDim, actionDim, actionBound,
        self.fixedParameters['actorHiddenLayersWeightInit'], self.fixedParameters['actorHiddenLayersBiasInit'],
        self.fixedParameters['actorOutputWeightInit'], self.fixedParameters['actorOutputBiasInit'],
        self.fixedParameters['actorActivFunction'], self.fixedParameters['gradNormClipValue'],
        self.fixedParameters['normalizeEnv'])
    actorModel = buildActorModel(self.fixedParameters['actorHiddenLayersWidths'])

    buildCriticModel = BuildCriticModel(
        stateDim, actionDim,
        self.fixedParameters['criticHiddenLayersWeightInit'], self.fixedParameters['criticHiddenLayersBiasInit'],
        self.fixedParameters['criticOutputWeightInit'], self.fixedParameters['criticOutputBiasInit'],
        self.fixedParameters['criticActivFunction'], self.fixedParameters['gradNormClipValue'],
        self.fixedParameters['normalizeEnv'])
    criticModel = buildCriticModel(self.fixedParameters['criticHiddenLayersWidths'])

    trainCritic = TrainCritic(self.fixedParameters['criticLR'], self.fixedParameters['gamma'])
    trainActor = TrainActor(self.fixedParameters['actorLR'])
    updateParameters = UpdateParameters(self.fixedParameters['tau'])
    trainddpgModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    getnoise = GetNoise(self.fixedParameters['varianceDiscount'], self.fixedParameters['minVar'],
                        self.fixedParameters['noiseDecayStartStep'], self.fixedParameters['noiseInitVariance'])
    getnoiseaction = GetNoiseAction(actorModel, actionLow, actionHigh)
    learn = Learn(self.fixedParameters['bufferSize'], self.fixedParameters['minibatchSize'], trainddpgModels)

    actorModel = ReplaceParameters(actorModel)
    criticModel = ReplaceParameters(criticModel)

    state = env.reset()
    replaybuffer = fillbuffer(3000, self.bufferfill, env, replaybuffer, state)

    for episode in range(1, self.fixedParameters['maxEpisode'] + 1):
        state = env.reset()
        rewards = 0
        for j in range(self.fixedParameters['maxTimeStep']):
            env.render()
            noise = getnoise(self.runstep)
            noiseaction = getnoiseaction(state, noise)
            nextstate, reward, done, info = env.step(noiseaction)
            learn(replaybuffer, state, noiseaction, nextstate, reward)
            trajectory.append((state, noiseaction, nextstate, reward))
            rewards += reward
            state = nextstate
            self.runstep += 1
            if j == self.fixedParameters['maxTimeStep'] - 1:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                print('episode: ', episode, 'reward:', rewards, 'runstep', self.runstep)
                episodereward.append(np.mean(totalrewards))
                print('epireward', np.mean(totalrewards))
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ', episode, 'meanreward:', np.mean(totalreward))
            totalreward = []

    with actorModel.as_default():
        saveVariables(actorModel, self.fixedParameters['modelSavePathMartin'])
    with criticModel.as_default():
        saveVariables(criticModel, self.fixedParameters['modelSavePathMartin'])
    saveToPickle(meanreward, self.fixedParameters['rewardSavePathMartin'])

    return episodereward
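# Sketch of the replay-buffer warm-up that fillbuffer appears to perform (an assumption
# about its role, not its actual signature): gather transitions under random actions
# before any gradient updates, so the first minibatches are not drawn from a nearly
# empty deque. `warmUpBuffer` is a hypothetical stand-in.
import numpy as np

def warmUpBuffer(env, replaybuffer, numSteps):
    state = env.reset()
    for _ in range(numSteps):
        action = env.action_space.sample()              # uniform random action
        nextstate, reward, done, info = env.step(action)
        replaybuffer.append((state, action, nextstate, reward))
        state = env.reset() if done else nextstate
    return replaybuffer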
def __call__(self, df):
    noiseVariance = df.index.get_level_values('noiseInitVariance')[0]
    memorySize = df.index.get_level_values('memorySize')[0]

    buildActorModel = BuildActorModel(self.fixedParameters['stateDim'], self.fixedParameters['actionDim'],
                                      self.fixedParameters['actionBound'])
    actorWriter, actorModel = buildActorModel(self.fixedParameters['actorLayerWidths'])

    buildCriticModel = BuildCriticModel(self.fixedParameters['stateDim'], self.fixedParameters['actionDim'])
    criticWriter, criticModel = buildCriticModel(self.fixedParameters['criticLayerWidths'])

    trainCriticBySASRQ = TrainCriticBySASRQ(self.fixedParameters['learningRateCritic'],
                                            self.fixedParameters['gamma'], criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(self.fixedParameters['learningRateActor'], actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    updateParameters = UpdateParameters(self.fixedParameters['paramUpdateInterval'], self.fixedParameters['tau'])
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    getNoise = GetExponentialDecayGaussNoise(noiseVariance, self.fixedParameters['varianceDiscount'],
                                             self.fixedParameters['noiseDecayStartStep'])
    actOneStepWithNoise = ActDDPGOneStep(self.fixedParameters['actionLow'], self.fixedParameters['actionHigh'],
                                         actByPolicyTrain, actorModel, getNoise)

    sampleFromMemory = SampleFromMemory(self.fixedParameters['batchSize'])
    learnFromBuffer = LearnFromBuffer(self.fixedParameters['learningStartStep'], sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)
    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, self.fixedParameters['maxRunSteps'], isTerminalGymPendulum)
    ddpg = RunAlgorithm(runEpisode, self.fixedParameters['maxEpisode'])

    replayBuffer = deque(maxlen=int(memorySize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    timeStep = list(range(len(meanRewardList)))
    resultSe = pd.Series({time: reward for time, reward in zip(timeStep, meanRewardList)})

    if self.saveModel:
        actorParameters = {'ActorMemorySize': memorySize, 'NoiseVariance': noiseVariance}
        criticParameters = {'CriticMemorySize': memorySize, 'NoiseVariance': noiseVariance}
        actorPath = self.getSavePath(actorParameters)
        criticPath = self.getSavePath(criticParameters)
        with trainedActorModel.as_default():
            saveVariables(trainedActorModel, actorPath)
        with trainedCriticModel.as_default():
            saveVariables(trainedCriticModel, criticPath)

    return resultSe
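# Sketch of the soft target update that UpdateParameters applies at rate tau in DDPG
# (the standard rule, shown with plain numpy arrays rather than the repo's TF variables):
#     theta_target <- tau * theta_train + (1 - tau) * theta_target
import numpy as np

def softUpdate(trainParams, targetParams, tau):
    return [tau * np.asarray(train) + (1.0 - tau) * np.asarray(target)
            for train, target in zip(trainParams, targetParams)]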
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 1  # controls exploration
    varianceDiscount = .99995
    noiseDecayStartStep = bufferSize
    minVar = .1
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep, minVar)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymMountCarContinuous()
    isTerminal = IsTerminalMountCarContin()
    getReward = RewardMountCarContin(isTerminal)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)
    resetLow = -1
    resetHigh = 0.4
    reset = ResetMountCarContin(seed=None)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save Model
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {'env': ENV_NAME, 'Eps': maxEpisode, 'timeStep': maxTimeStep, 'batch': minibatchSize,
                  'gam': gamma, 'lrActor': learningRateActor, 'lrCritic': learningRateCritic,
                  'noiseVar': noiseInitVariance, 'varDiscout': varianceDiscount,
                  'resetLow': resetLow, 'High': resetHigh}
    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)
    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'mountCarTrajectoryOriginalReset1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # demo & plot
    showDemo = False
    if showDemo:
        visualize = VisualizeMountCarContin()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
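# Sketch of the critic target this training loop optimizes toward (standard DDPG,
# written with numpy; actByPolicyTarget / evaluateCriticTarget here are stand-in
# callables, not the repo's TF ops):
#     y = r + gamma * Q_target(s', mu_target(s'))
import numpy as np

def criticTargets(rewards, nextStates, actByPolicyTarget, evaluateCriticTarget, gamma):
    nextActions = actByPolicyTarget(nextStates)
    nextQ = evaluateCriticTarget(nextStates, nextActions)
    return np.asarray(rewards) + gamma * np.asarray(nextQ)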
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = 7

    buildModel = BuildModel(stateDim, actionDim)
    layersWidths = [30]
    writer, model = buildModel(layersWidths)

    learningRate = 0.001
    gamma = 0.99
    trainModelBySASRQ = TrainModelBySASRQ(learningRate, gamma, writer)

    paramUpdateInterval = 300
    updateParameters = UpdateParameters(paramUpdateInterval)
    model = resetTargetParamToTrainParam([model])[0]
    trainModels = TrainDQNModel(getTargetQValue, trainModelBySASRQ, updateParameters, model)

    epsilonMax = 0.9
    epsilonIncrease = 0.0001
    epsilonMin = 0
    bufferSize = 10000
    decayStartStep = bufferSize
    getEpsilon = GetEpsilon(epsilonMax, epsilonMin, epsilonIncrease, decayStartStep)
    actGreedyByModel = ActGreedyByModel(getTrainQValue, model)
    actRandom = ActRandom(actionDim)
    actByTrainNetEpsilonGreedy = ActByTrainNetEpsilonGreedy(getEpsilon, actGreedyByModel, actRandom)

    minibatchSize = 128
    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    processAction = ProcessDiscretePendulumAction(actionDim)
    transit = TransitGymPendulum(processAction)
    getReward = RewardGymPendulum(angle_normalize, processAction)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDQNTimeStep = RunTimeStep(actByTrainNetEpsilonGreedy, sampleOneStep, learnFromBuffer, observe)
    reset = ResetGymPendulum(seed)
    maxTimeStep = 200
    runEpisode = RunEpisode(reset, runDQNTimeStep, maxTimeStep, isTerminalGymPendulum)
    maxEpisode = 400
    dqn = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = dqn(replayBuffer)
    trainedModel = trainModels.getTrainedModels()

    # save Model
    parameters = {'maxEpisode': maxEpisode, 'maxTimeStep': maxTimeStep, 'minibatchSize': minibatchSize,
                  'gamma': gamma, 'learningRate': learningRate, 'epsilonIncrease': epsilonIncrease,
                  'epsilonMin': epsilonMin}
    modelSaveDirectory = "../trainedDQNModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension)
    savePath = getSavePath(parameters)
    with trainedModel.as_default():
        saveVariables(trainedModel, savePath)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'pendulumDQNTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()

    showDemo = False
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)
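# Sketch of epsilon-greedy selection as it appears to be used with GetEpsilon above
# (an assumption: since epsilon is *increased* toward epsilonMax after the decay start
# step, epsilon is treated here as the probability of acting greedily rather than randomly):
import numpy as np

def epsilonGreedyAction(qValues, epsilon):
    if np.random.uniform() < epsilon:
        return int(np.argmax(qValues))            # exploit the current Q estimates
    return int(np.random.randint(len(qValues)))   # explore with a random action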
def main():
    env = MtCarDiscreteEnvSetup()
    visualize = visualizeMtCarDiscrete()
    reset = resetMtCarDiscrete(1234)
    transition = MtCarDiscreteTransition()
    rewardMtCar = MtCarDiscreteReward()
    isterminal = MtCarDiscreteIsTerminal()

    statesdim = env.observation_space.shape[0]
    actiondim = env.action_space.n

    replaybuffer = deque(maxlen=buffersize)
    runepsilon = initepsilon
    totalrewards = []
    meanreward = []
    trajectory = []
    totalreward = []

    buildmodel = BuildModel(statesdim, actiondim)
    Writer, DQNmodel = buildmodel(numberlayers)
    replaceParameters = ReplaceParameters(replaceiter)
    trainModel = TrainModel(learningRate, gamma, Writer)
    trainDQNmodel = TrainDQNmodel(replaceParameters, trainModel, DQNmodel)
    learn = Learn(buffersize, batchsize, trainDQNmodel, actiondim)

    for episode in range(EPISODE):
        state = reset()
        rewards = 0
        runtime = 0
        while True:
            action = learn.Getaction(DQNmodel, runepsilon, state)
            nextstate = transition(state, action)
            done = isterminal(nextstate)
            reward = rewardMtCar(state, action, nextstate, done)
            learn.ReplayMemory(replaybuffer, state, action, reward, nextstate, done)
            trajectory.append((state, action, reward, nextstate))
            rewards += reward
            state = nextstate
            runtime += 1
            if runtime == 200:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                runtime = 0
                print('episode: ', episode, 'reward:', rewards, 'epsilon:', runepsilon)
                break
            if done:
                totalrewards.append(rewards)
                totalreward.append(rewards)
                print('episode: ', episode, 'reward:', rewards, 'epsilon:', runepsilon)
                break
        runepsilon = epsilonDec(runepsilon, minepsilon, epsilondec)
        if episode % 100 == 0:
            meanreward.append(np.mean(totalreward))
            print('episode: ', episode, 'meanreward:', np.mean(totalreward))
            totalreward = []

    episode = 100 * (np.arange(len(meanreward)))
    plt.plot(episode, meanreward)
    plt.xlabel('episode')
    plt.ylabel('rewards')
    plt.ylim([-200, -50])
    plt.show()

    # save Model
    modelIndex = 0
    DQNFixedParam = {'DQNmodel': modelIndex}
    parameters = {'env': ENV_NAME, 'Eps': EPISODE, 'batch': batchsize, 'buffersize': buffersize,
                  'gam': gamma, 'learningRate': learningRate, 'replaceiter': replaceiter,
                  'epsilondec': epsilondec, 'minepsilon': minepsilon, 'initepsilon': initepsilon}
    modelSaveDirectory = "/path/to/logs/trainedDQNModels"
    modelSaveExtension = '.ckpt'
    getSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, DQNFixedParam)
    savePathDQN = getSavePath(parameters)
    with DQNmodel.as_default():
        saveVariables(DQNmodel, savePathDQN)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, 'trajectory', 'mountCarTrajectory.pickle')
    saveToPickle(trajectory, trajectoryPath)
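# Sketch of the one-step TD target a DQN trainer like the one above learns toward
# (the standard rule, shown with numpy; `done` masks out the bootstrap term at terminal states):
#     y = r + gamma * (1 - done) * max_a' Q_target(s', a')
import numpy as np

def dqnTargets(rewards, dones, targetQNextStates, gamma):
    # targetQNextStates: array of shape (batch, numActions) from the target network.
    maxNextQ = np.max(targetQNextStates, axis=1)
    return np.asarray(rewards) + gamma * (1.0 - np.asarray(dones, dtype=float)) * maxNextQ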