def testActorTrainImprovement(self):
    stateBatch = [[2, 5, 10, 5, 2, 5, 10, 5], [1, 1, 1, 1, 1, 1, 1, 1]]
    actionBatch = [[0.1, 0.2, 0.3, 0.4, 0.5], [0.2, 0.2, 0.2, 0.2, 0.2]]
    rewardBatch = [[2], [0]]
    targetQValue = [[3], [1]]

    numStateSpace = len(stateBatch[0])
    actionDim = 5
    actionRange = 1
    buildActorModel = BuildActorModel(numStateSpace, actionDim, actionRange)
    actorLayerWidths = [64, 64]
    criticLayerWidths = [64, 64]
    buildCriticModel = BuildCriticModel(numStateSpace, actionDim)
    actorWriter, actorModel = buildActorModel(actorLayerWidths)
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    # train the critic first so its Q-value estimates are informative
    trainCriticBySASRQ = TrainCriticBySASRQ(self.learningRateCritic, self.gamma, criticWriter)
    for i in range(100):
        lossWithTrain, criticModel = trainCriticBySASRQ(
            criticModel, stateBatch, actionBatch, rewardBatch, targetQValue)
        print(lossWithTrain)

    actionUntrained = actByPolicyTrain(actorModel, stateBatch)
    actionUntrainedQVal = evaluateCriticTrain(criticModel, stateBatch, actionUntrained)

    # one actor update along the critic's action gradients
    trainActorFromGradients = TrainActorFromGradients(self.learningRateActor, actorWriter)
    trainOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    actorModel = trainOneStep(actorModel, criticModel, stateBatch)

    actionTrained = actByPolicyTrain(actorModel, stateBatch)
    actionTrainedValue = evaluateCriticTrain(criticModel, stateBatch, actionTrained)

    # the critic should value the updated actor's actions more highly
    for trained, untrained in zip(actionTrainedValue, actionUntrainedQVal):
        self.assertTrue(trained > untrained)
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(
        dirName, '..', 'trainedDDPGModels',
        'Eps=300_High=0.4_actorModel=0_batch=128_env=MountainCarContinuous-v0_gam=0.9_lrActor=0.001_lrCritic=0.001_noiseVar=1_resetLow=-1_timeStep=2000_varDiscout=0.99995.ckpt')
    restoreVariables(actorModel, actorModelPath)
    policy = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise=None)

    isTerminal = IsTerminalMountCarContin()
    reset = ResetMountCarContin(seed=None, low=-1, high=0.4)
    transit = TransitGymMountCarContinuous()
    rewardFunc = RewardMountCarContin(isTerminal)

    for i in range(20):
        maxRunningSteps = 2000
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, rewardFunc, reset)
        trajectory = sampleTrajectory(policy)

        # visualize the sampled trajectory
        showDemo = True
        if showDemo:
            visualize = VisualizeMountCarContin()
            visualize(trajectory)
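# A minimal sketch of the module-level setup the demo above appears to assume
# (it is not shown here): the gym environment is only used to read state/action
# dimensions and bounds, and the project-specific classes (ActDDPGOneStep,
# SampleTrajectory, VisualizeMountCarContin, ...) come from the repo's own
# modules, whose import paths are omitted.
import os

import gym

env = gym.make('MountainCarContinuous-v0')

if __name__ == '__main__':
    main()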
def setUp(self):
    numAgents = 2
    numStateSpace = numAgents * 2
    actionLow = -np.pi
    actionHigh = np.pi
    actionRange = (actionHigh - actionLow) / 2.0
    actionDim = 1

    self.buildActorModel = BuildActorModel(numStateSpace, actionDim, actionRange)
    self.actorLayerWidths = [20, 20]

    self.tau = 0.01
    self.gamma = 0.95
    self.learningRateActor = 0.0001

    paramUpdateInterval = 1
    self.updateParameters = UpdateParameters(paramUpdateInterval, self.tau)

    self.buildCriticModel = BuildCriticModel(numStateSpace, actionDim)
    self.criticLayerWidths = [10, 10]
    self.learningRateCritic = 0.001
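# A minimal entry point for the test class the two methods above belong to,
# assuming the standard library unittest runner (the import is not shown above).
import unittest

if __name__ == '__main__':
    unittest.main()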
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(
        dirName, '..', 'trainedDDPGModels',
        'Eps=200_actorModel=0_batch=128_env=Pendulum-v0_gam=0.9_lrActor=0.001_lrCritic=0.001_noiseVar=3_timeStep=200_varDiscout=0.9995.ckpt')
    restoreVariables(actorModel, actorModelPath)

    actOneStep = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise=None)
    # map the raw pendulum state through observe before acting, matching how the actor was trained
    policy = lambda state: actOneStep(observe(state))

    isTerminal = isTerminalGymPendulum
    reset = ResetGymPendulum(seed)
    transit = TransitGymPendulum()
    rewardFunc = RewardGymPendulum(angle_normalize)

    for i in range(10):
        maxRunningSteps = 200
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, rewardFunc, reset)
        trajectory = sampleTrajectory(policy)

        # visualize the sampled trajectory
        showDemo = True
        if showDemo:
            visualize = VisualizeGymPendulum()
            visualize(trajectory)
def main():
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    dirName = os.path.dirname(__file__)
    actorModelPath = os.path.join(
        dirName, '..', 'trainedDDPGModels',
        'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.001_learningRateCritic=0.001_maxEpisode=2000_maxTimeStep=20_minibatchSize=32_wolfSpeed=0.5.ckpt')
    # alternative trained models:
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=400_maxTimeStep=100_minibatchSize=32_wolfSpeed=1.ckpt'
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=1000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.5.ckpt'
    # 'actorModel=0_dimension=2_gamma=0.95_learningRateActor=0.01_learningRateCritic=0.01_maxEpisode=5000_maxTimeStep=100_minibatchSize=32_wolfSpeed=0.5.ckpt'
    restoreVariables(actorModel, actorModelPath)
    sheepPolicy = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise=None)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 0.5
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(getWolfPos, getSheepPos, wolfSpeed)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    transit = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    # transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    maxTimeStep = 20
    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20
    killzoneRadius = 0
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary, sheepIndex=0, punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    rewardSheepWithBoundaryHeuristics = RewardSheepWithBoundaryHeuristics(
        rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepPos)
    getSheepAction = lambda actions: [actions[sheepId * actionDim], actions[sheepId * actionDim + 1]]
    getReward = lambda state, action, nextState: rewardSheepWithBoundaryHeuristics(
        state, getSheepAction(action), nextState)

    policy = lambda state: list(sheepPolicy(state)) + list(wolfPolicy(state))
    # reset = Reset(xBoundary, yBoundary, numAgents)
    reset = lambda: np.array([10, 10, 15, 5])

    for i in range(10):
        maxRunningSteps = 50
        sampleTrajectory = SampleTrajectory(maxRunningSteps, transit, isTerminal, getReward, reset)
        trajectory = sampleTrajectory(policy)

        # visualize the sampled trajectory
        showDemo = True
        if showDemo:
            observe = Observe(trajectory, numAgents)

            fullScreen = False
            screenWidth = 800
            screenHeight = 800
            screen = initializeScreen(fullScreen, screenWidth, screenHeight)

            leaveEdgeSpace = 200
            lineWidth = 3
            xBoundary = [leaveEdgeSpace, screenWidth - leaveEdgeSpace * 2]
            yBoundary = [leaveEdgeSpace, screenHeight - leaveEdgeSpace * 2]
            screenColor = THECOLORS['black']
            lineColor = THECOLORS['white']
            drawBackground = DrawBackground(screen, screenColor, xBoundary, yBoundary, lineColor, lineWidth)
            circleSize = 10
            positionIndex = [0, 1]
            drawState = DrawState(screen, circleSize, positionIndex, drawBackground)

            numberOfAgents = 2
            chasingColors = [THECOLORS['green'], THECOLORS['red']]
            colorSpace = chasingColors[:numberOfAgents]
            FPS = 60
            chaseTrial = ChaseTrialWithTraj(FPS, colorSpace, drawState, saveImage=True)

            rawXRange = [0, 20]
            rawYRange = [0, 20]
            scaledXRange = [210, 590]
            scaledYRange = [210, 590]
            scaleTrajectory = ScaleTrajectory(positionIndex, rawXRange, rawYRange, scaledXRange, scaledYRange)

            oldFPS = 5
            adjustFPS = AdjustDfFPStoTraj(oldFPS, FPS)
            getTrajectory = lambda rawTrajectory: scaleTrajectory(adjustFPS(rawTrajectory))
            positionList = [observe(index) for index in range(len(trajectory))]
            positionListToDraw = getTrajectory(positionList)

            currentDir = os.getcwd()
            parentDir = os.path.abspath(os.path.join(currentDir, os.pardir))
            imageFolderName = 'Demo'
            saveImageDir = os.path.join(os.path.join(parentDir, 'chasingDemo'), imageFolderName)
            if not os.path.exists(saveImageDir):
                os.makedirs(saveImageDir)

            chaseTrial(numberOfAgents, positionListToDraw, saveImageDir)
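# A sketch of the third-party imports the chasing demo above appears to rely on
# (not shown here). THECOLORS is pygame's named-color table; the drawing helpers
# (initializeScreen, DrawBackground, DrawState, ChaseTrialWithTraj, ScaleTrajectory,
# AdjustDfFPStoTraj, Observe, ...) come from the repo's own visualization modules,
# whose import paths are omitted.
import os

import numpy as np
from pygame.color import THECOLORS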
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)
    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminalGymPendulum)

    dirName = os.path.dirname(__file__)
    modelPath = os.path.join(dirName, '..', 'trainedDDPGModels', 'pendulum')
    getTrainedModel = lambda: trainModels.actorModel
    modelSaveRate = 50
    saveModel = SaveModel(modelSaveRate, saveVariables, getTrainedModel, modelPath)

    ddpg = RunAlgorithm(runEpisode, maxEpisode, [saveModel])
    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'pendulumTrajectory1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # visualize the final trajectory
    showDemo = True
    if showDemo:
        visualize = VisualizeGymPendulum()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
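# A sketch of the module-level constants the Pendulum training script above
# assumes. gamma, the learning rates, minibatch size, episode count, and episode
# length are taken from the checkpoint name restored by the demo script
# (gam=0.9, lrActor=0.001, lrCritic=0.001, batch=128, timeStep=200, Eps=200);
# tau, bufferSize, and seed are illustrative guesses, and observe /
# angle_normalize / isTerminalGymPendulum come from the repo's pendulum module.
import gym

env = gym.make('Pendulum-v0')
seed = None                 # illustrative
maxEpisode = 200
maxTimeStep = 200
gamma = 0.9
tau = 0.01                  # illustrative; matches the unit-test setUp
learningRateActor = 0.001
learningRateCritic = 0.001
minibatchSize = 128
bufferSize = 10000          # illustrative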
def __call__(self, df):
    varianceDiscount = df.index.get_level_values('varianceDiscount')[0]
    bufferSize = df.index.get_level_values('bufferSize')[0]
    layerWidth = df.index.get_level_values('layerWidth')[0]
    print('buffer: ', bufferSize, ', layers: ', layerWidth, ', varDiscount: ', varianceDiscount)

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorWriter, actorModel = buildActorModel(layerWidth)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticWriter, criticModel = buildCriticModel(layerWidth)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 1
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    sheepId = 0
    wolfId = 1
    getSheepXPos = GetAgentPosFromState(sheepId)
    getWolfXPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 2
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(getWolfXPos, getSheepXPos, wolfSpeed)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    sheepAliveBonus = 0 / maxTimeStep
    sheepTerminalPenalty = -20
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfXPos, getSheepXPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary, sheepIndex=0, punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepXPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)
    reset = Reset(xBoundary, yBoundary, numAgents)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)

    timeStep = list(range(len(meanRewardList)))
    resultSe = pd.Series({time: reward for time, reward in zip(timeStep, meanRewardList)})
    return resultSe
def main():
    numAgents = 2
    stateDim = numAgents * 2
    actionLow = -1
    actionHigh = 1
    actionBound = (actionHigh - actionLow) / 2
    actionDim = 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [64]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [64]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 1
    varianceDiscount = .9995
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    sheepId = 0
    wolfId = 1
    getSheepPos = GetAgentPosFromState(sheepId)
    getWolfPos = GetAgentPosFromState(wolfId)
    wolfSpeed = 1
    wolfPolicy = HeatSeekingContinuousDeterministicPolicy(getWolfPos, getSheepPos, wolfSpeed)
    # wolfPolicy = lambda state: (0, 0)

    xBoundary = (0, 20)
    yBoundary = (0, 20)
    stayWithinBoundary = StayWithinBoundary(xBoundary, yBoundary)
    physicalTransition = TransitForNoPhysics(getIntendedNextState, stayWithinBoundary)
    transit = TransitWithSingleWolf(physicalTransition, wolfPolicy)

    sheepAliveBonus = 1 / maxTimeStep
    sheepTerminalPenalty = 20
    killzoneRadius = 1
    isTerminal = IsTerminal(getWolfPos, getSheepPos, killzoneRadius)
    getBoundaryPunishment = GetBoundaryPunishment(xBoundary, yBoundary, sheepIndex=0, punishmentVal=10)
    rewardSheep = RewardFunctionCompete(sheepAliveBonus, sheepTerminalPenalty, isTerminal)
    getReward = RewardSheepWithBoundaryHeuristics(rewardSheep, getIntendedNextState, getBoundaryPunishment, getSheepPos)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)

    # alternative resets:
    # reset = Reset(xBoundary, yBoundary, numAgents)
    # reset = lambda: np.array([10, 3, 15, 8])    # all [-1, -1] actions
    # reset = lambda: np.array([15, 8, 10, 3])    # all [1, 1] actions
    # reset = lambda: np.array([15, 10, 10, 10])
    reset = lambda: np.array([10, 10, 15, 5])

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save the trained models
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'wolfSpeed': wolfSpeed,
        'dimension': actionDim,
        'maxEpisode': maxEpisode,
        'maxTimeStep': maxTimeStep,
        'minibatchSize': minibatchSize,
        'gamma': gamma,
        'learningRateActor': learningRateActor,
        'learningRateCritic': learningRateCritic
    }
    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
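# A sketch of the module-level constants the chasing training script above
# relies on. gamma, the learning rates, maxEpisode, maxTimeStep, and
# minibatchSize are illustrated with the values embedded in the saved-model
# name loaded by the chasing demo; tau, bufferSize, noiseDecayStartStep, and
# learningStartBufferSize are illustrative guesses that mirror the other
# training scripts in this section.
import numpy as np

maxTimeStep = 20
maxEpisode = 2000
gamma = 0.95
tau = 0.01                                 # illustrative
learningRateActor = 0.001
learningRateCritic = 0.001
minibatchSize = 32
bufferSize = 20000                         # illustrative
noiseDecayStartStep = bufferSize           # illustrative
learningStartBufferSize = minibatchSize    # illustrative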
def __call__(self, df):
    noiseVariance = df.index.get_level_values('noiseInitVariance')[0]
    memorySize = df.index.get_level_values('memorySize')[0]

    buildActorModel = BuildActorModel(self.fixedParameters['stateDim'],
                                      self.fixedParameters['actionDim'],
                                      self.fixedParameters['actionBound'])
    actorWriter, actorModel = buildActorModel(self.fixedParameters['actorLayerWidths'])

    buildCriticModel = BuildCriticModel(self.fixedParameters['stateDim'],
                                        self.fixedParameters['actionDim'])
    criticWriter, criticModel = buildCriticModel(self.fixedParameters['criticLayerWidths'])

    trainCriticBySASRQ = TrainCriticBySASRQ(self.fixedParameters['learningRateCritic'],
                                            self.fixedParameters['gamma'], criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(self.fixedParameters['learningRateActor'], actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    updateParameters = UpdateParameters(self.fixedParameters['paramUpdateInterval'],
                                        self.fixedParameters['tau'])
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    getNoise = GetExponentialDecayGaussNoise(noiseVariance,
                                             self.fixedParameters['varianceDiscount'],
                                             self.fixedParameters['noiseDecayStartStep'])
    actOneStepWithNoise = ActDDPGOneStep(self.fixedParameters['actionLow'],
                                         self.fixedParameters['actionHigh'],
                                         actByPolicyTrain, actorModel, getNoise)

    sampleFromMemory = SampleFromMemory(self.fixedParameters['batchSize'])
    learnFromBuffer = LearnFromBuffer(self.fixedParameters['learningStartStep'],
                                      sampleFromMemory, trainModels)

    transit = TransitGymPendulum()
    getReward = RewardGymPendulum(angle_normalize)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer, observe)
    reset = ResetGymPendulum(seed)
    runEpisode = RunEpisode(reset, runDDPGTimeStep, self.fixedParameters['maxRunSteps'], isTerminalGymPendulum)
    ddpg = RunAlgorithm(runEpisode, self.fixedParameters['maxEpisode'])

    replayBuffer = deque(maxlen=int(memorySize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    timeStep = list(range(len(meanRewardList)))
    resultSe = pd.Series({time: reward for time, reward in zip(timeStep, meanRewardList)})

    if self.saveModel:
        actorParameters = {'ActorMemorySize': memorySize, 'NoiseVariance': noiseVariance}
        criticParameters = {'CriticMemorySize': memorySize, 'NoiseVariance': noiseVariance}
        actorPath = self.getSavePath(actorParameters)
        criticPath = self.getSavePath(criticParameters)
        with trainedActorModel.as_default():
            saveVariables(trainedActorModel, actorPath)
        with trainedCriticModel.as_default():
            saveVariables(trainedCriticModel, criticPath)

    return resultSe
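# A hedged usage sketch for the sweep above: __call__ receives one sub-frame of
# a MultiIndex DataFrame per hyperparameter combination and reads the swept
# values from the index levels. The class name EvaluateNoiseAndMemorySize and
# its constructor signature are assumptions for illustration; only the attribute
# names it uses (fixedParameters, getSavePath, saveModel) appear in the code
# above, and every value below is illustrative.
import pandas as pd

fixedParameters = {
    'stateDim': 3, 'actionDim': 1, 'actionBound': 2, 'actionLow': -2, 'actionHigh': 2,
    'actorLayerWidths': [30], 'criticLayerWidths': [30],
    'learningRateActor': 0.001, 'learningRateCritic': 0.001,
    'gamma': 0.9, 'tau': 0.01, 'paramUpdateInterval': 1,
    'varianceDiscount': 0.9995, 'noiseDecayStartStep': 10000,
    'batchSize': 128, 'learningStartStep': 128,
    'maxRunSteps': 200, 'maxEpisode': 200
}
getSavePath = GetSavePath('../trainedDDPGModels', '.ckpt', {})
evaluate = EvaluateNoiseAndMemorySize(fixedParameters, getSavePath, saveModel=True)

levelNames = ['noiseInitVariance', 'memorySize']
levelValues = [[1, 2, 3], [5000, 10000, 20000]]
sweepIndex = pd.MultiIndex.from_product(levelValues, names=levelNames)
toSplitFrame = pd.DataFrame(index=sweepIndex)
resultDF = toSplitFrame.groupby(levelNames).apply(evaluate)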
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 3
    varianceDiscount = .9995
    noiseDecayStartStep = bufferSize
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    # step through the gym environment directly instead of the project's
    # transition/reward functions
    sampleOneStep = SampleOneStepUsingGym(env)
    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)

    reset = lambda: env.reset()
    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminalGymPendulum)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()
    env.close()

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
def main():
    stateDim = env.observation_space.shape[0]
    actionDim = env.action_space.shape[0]
    actionHigh = env.action_space.high
    actionLow = env.action_space.low
    actionBound = (actionHigh - actionLow) / 2

    buildActorModel = BuildActorModel(stateDim, actionDim, actionBound)
    actorLayerWidths = [30]
    actorWriter, actorModel = buildActorModel(actorLayerWidths)

    buildCriticModel = BuildCriticModel(stateDim, actionDim)
    criticLayerWidths = [30]
    criticWriter, criticModel = buildCriticModel(criticLayerWidths)

    trainCriticBySASRQ = TrainCriticBySASRQ(learningRateCritic, gamma, criticWriter)
    trainCritic = TrainCritic(actByPolicyTarget, evaluateCriticTarget, trainCriticBySASRQ)

    trainActorFromGradients = TrainActorFromGradients(learningRateActor, actorWriter)
    trainActorOneStep = TrainActorOneStep(actByPolicyTrain, trainActorFromGradients, getActionGradients)
    trainActor = TrainActor(trainActorOneStep)

    paramUpdateInterval = 1
    updateParameters = UpdateParameters(paramUpdateInterval, tau)
    modelList = [actorModel, criticModel]
    actorModel, criticModel = resetTargetParamToTrainParam(modelList)
    trainModels = TrainDDPGModels(updateParameters, trainActor, trainCritic, actorModel, criticModel)

    noiseInitVariance = 1  # controls exploration
    varianceDiscount = .99995
    noiseDecayStartStep = bufferSize
    minVar = .1
    getNoise = GetExponentialDecayGaussNoise(noiseInitVariance, varianceDiscount, noiseDecayStartStep, minVar)
    actOneStepWithNoise = ActDDPGOneStep(actionLow, actionHigh, actByPolicyTrain, actorModel, getNoise)

    learningStartBufferSize = minibatchSize
    sampleFromMemory = SampleFromMemory(minibatchSize)
    learnFromBuffer = LearnFromBuffer(learningStartBufferSize, sampleFromMemory, trainModels)

    transit = TransitGymMountCarContinuous()
    isTerminal = IsTerminalMountCarContin()
    getReward = RewardMountCarContin(isTerminal)
    sampleOneStep = SampleOneStep(transit, getReward)

    runDDPGTimeStep = RunTimeStep(actOneStepWithNoise, sampleOneStep, learnFromBuffer)

    # note: resetLow / resetHigh are recorded in the save-path parameters below
    # but are not passed to ResetMountCarContin here
    resetLow = -1
    resetHigh = 0.4
    reset = ResetMountCarContin(seed=None)

    runEpisode = RunEpisode(reset, runDDPGTimeStep, maxTimeStep, isTerminal)
    ddpg = RunAlgorithm(runEpisode, maxEpisode)

    replayBuffer = deque(maxlen=int(bufferSize))
    meanRewardList, trajectory = ddpg(replayBuffer)
    trainedActorModel, trainedCriticModel = trainModels.getTrainedModels()

    # save the trained models
    modelIndex = 0
    actorFixedParam = {'actorModel': modelIndex}
    criticFixedParam = {'criticModel': modelIndex}
    parameters = {
        'env': ENV_NAME,
        'Eps': maxEpisode,
        'timeStep': maxTimeStep,
        'batch': minibatchSize,
        'gam': gamma,
        'lrActor': learningRateActor,
        'lrCritic': learningRateCritic,
        'noiseVar': noiseInitVariance,
        'varDiscout': varianceDiscount,
        'resetLow': resetLow,
        'High': resetHigh
    }
    modelSaveDirectory = "../trainedDDPGModels"
    modelSaveExtension = '.ckpt'
    getActorSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, actorFixedParam)
    getCriticSavePath = GetSavePath(modelSaveDirectory, modelSaveExtension, criticFixedParam)
    savePathActor = getActorSavePath(parameters)
    savePathCritic = getCriticSavePath(parameters)

    with actorModel.as_default():
        saveVariables(trainedActorModel, savePathActor)
    with criticModel.as_default():
        saveVariables(trainedCriticModel, savePathCritic)

    dirName = os.path.dirname(__file__)
    trajectoryPath = os.path.join(dirName, '..', 'trajectory', 'mountCarTrajectoryOriginalReset1.pickle')
    saveToPickle(trajectory, trajectoryPath)

    # visualize the final trajectory
    showDemo = False
    if showDemo:
        visualize = VisualizeMountCarContin()
        visualize(trajectory)

    plotResult = True
    if plotResult:
        plt.plot(list(range(maxEpisode)), meanRewardList)
        plt.show()
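# A sketch of the module-level constants the MountainCar training script above
# assumes. ENV_NAME and most values are taken from the checkpoint name restored
# by the demo script (Eps=300, timeStep=2000, batch=128, gam=0.9, lrActor=0.001,
# lrCritic=0.001); tau and bufferSize are illustrative guesses.
import gym

ENV_NAME = 'MountainCarContinuous-v0'
env = gym.make(ENV_NAME)
maxEpisode = 300
maxTimeStep = 2000
minibatchSize = 128
gamma = 0.9
tau = 0.01               # illustrative
learningRateActor = 0.001
learningRateCritic = 0.001
bufferSize = 100000      # illustrative

if __name__ == '__main__':
    main()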