def __init__(self, actionList, inputHeight, inputWidth, batchSize, phiLength,
             nnFile, loadWeightsFlipped, updateFrequency, replayMemorySize,
             replayStartSize, networkType, updateRule, batchAccumulator,
             networkUpdateDelay, discountRate, learningRate, rmsRho,
             rmsEpsilon, momentum, epsilonStart, epsilonEnd, epsilonDecaySteps,
             evalEpsilon, useSARSAUpdate, kReturnLength):
    self.actionList = actionList
    self.numActions = len(self.actionList)
    self.inputHeight = inputHeight
    self.inputWidth = inputWidth
    self.batchSize = batchSize
    self.phiLength = phiLength
    self.nnFile = nnFile
    self.loadWeightsFlipped = loadWeightsFlipped
    self.updateFrequency = updateFrequency
    self.replayMemorySize = replayMemorySize
    self.replayStartSize = replayStartSize
    self.networkType = networkType
    self.updateRule = updateRule
    self.batchAccumulator = batchAccumulator
    self.networkUpdateDelay = networkUpdateDelay
    self.discountRate = discountRate
    self.learningRate = learningRate
    self.rmsRho = rmsRho
    self.rmsEpsilon = rmsEpsilon
    self.momentum = momentum
    self.epsilonStart = epsilonStart
    self.epsilonEnd = epsilonEnd
    self.epsilonDecaySteps = epsilonDecaySteps
    self.evalEpsilon = evalEpsilon
    self.kReturnLength = kReturnLength
    self.useSARSAUpdate = useSARSAUpdate

    self.trainingMemory = DQNAgentMemory.DQNAgentMemory(
        (self.inputHeight, self.inputWidth), self.phiLength,
        self.replayMemorySize, self.discountRate)
    self.evaluationMemory = DQNAgentMemory.DQNAgentMemory(
        (self.inputHeight, self.inputWidth), self.phiLength,
        self.phiLength * 2, self.discountRate)

    self.episodeCounter = 0
    self.stepCounter = 0
    self.batchCounter = 0
    self.lossAverages = []
    self.actionToTake = 0

    self.epsilon = self.epsilonStart
    if self.epsilonDecaySteps != 0:
        self.epsilonRate = ((self.epsilonStart - self.epsilonEnd) /
                            self.epsilonDecaySteps)
    else:
        self.epsilonRate = 0

    self.training = False

    self.network = DeepQNetwork.DeepQNetwork(
        self.batchSize, self.phiLength, self.inputHeight, self.inputWidth,
        self.numActions, self.discountRate, self.learningRate, self.rmsRho,
        self.rmsEpsilon, self.momentum, self.networkUpdateDelay,
        self.useSARSAUpdate, self.kReturnLength, self.networkType,
        self.updateRule, self.batchAccumulator)

    if self.nnFile is not None:
        # Load network
        DeepNetworks.loadNetworkParams(self.network.qValueNetwork,
                                       self.nnFile, self.loadWeightsFlipped)
        self.network.resetNextQValueNetwork()
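# Illustrative sketch only (not part of the agent): the linear exploration
# schedule implied by epsilonRate above. The standalone helper name and the
# default values are assumptions for illustration; in the agent, epsilon is
# presumably decremented by epsilonRate per training step elsewhere, and with
# epsilonDecaySteps == 0 it simply stays at epsilonStart.
def sketch_epsilon_at_step(step, epsilonStart=1.0, epsilonEnd=0.1,
                           epsilonDecaySteps=1000000):
    if epsilonDecaySteps == 0:
        return epsilonStart
    epsilonRate = (epsilonStart - epsilonEnd) / epsilonDecaySteps
    return max(epsilonEnd, epsilonStart - epsilonRate * step)

# e.g. sketch_epsilon_at_step(500000)  -> 0.55
#      sketch_epsilon_at_step(2000000) -> 0.1 (floored at epsilonEnd)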
def generateModel(Specs, dataObj):
    if Specs.Mode == "1":
        import BoxJenkins
        modelObj = BoxJenkins.BoxJenkins(dataObj, Specs)
    elif Specs.Mode == "2":
        import DeepNetworks
        modelObj = DeepNetworks.DeepNetworks(dataObj, Specs, 10)

    modelObj.modelling()

    if Specs.Mode == "1":
        attributeName = list(dataObj.data)[int(Specs.ForecastCol)]
        mdlPath = Specs.mdlpath + "/" + attributeName
        print("model under " + mdlPath)
        # if model exists, no parameter estimation is necessary
        modelObj.fitting(Specs.mdlName, mdlPath)
    if Specs.Mode == "2":
        modelObj.fitting()

    return modelObj
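# Illustrative sketch only: how the Box-Jenkins branch above derives the model
# path. list(dataObj.data) yields the column names and Specs.ForecastCol
# indexes into that list. The helper name and the plain list of column names
# standing in for the data object are assumptions for illustration.
def sketch_model_path(columnNames, forecastCol, mdlpath):
    attributeName = list(columnNames)[int(forecastCol)]
    return mdlpath + "/" + attributeName

# e.g. sketch_model_path(["timestamp", "demand"], "1", "./models")
# -> "./models/demand"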
def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions,
             discountRate, learningRate, rho, rms_epsilon, momentum,
             networkUpdateDelay, useSARSAUpdate, kReturnLength,
             networkType="conv", updateRule="deepmind_rmsprop",
             batchAccumulator="sum", clipDelta=1.0, inputScale=255.0):
    self.batchSize = batchSize
    self.numFrames = numFrames
    self.inputWidth = inputWidth
    self.inputHeight = inputHeight
    self.inputScale = inputScale
    self.numActions = numActions
    self.discountRate = discountRate
    self.learningRate = learningRate
    self.rho = rho
    self.rms_epsilon = rms_epsilon
    self.momentum = momentum
    self.networkUpdateDelay = networkUpdateDelay
    self.useSARSAUpdate = useSARSAUpdate
    self.kReturnLength = kReturnLength
    self.networkType = networkType
    self.updateRule = updateRule
    self.batchAccumulator = batchAccumulator
    self.clipDelta = clipDelta
    self.updateCounter = 0

    states = T.tensor4("states")
    nextStates = T.tensor4("nextStates")
    rewards = T.col("rewards")
    actions = T.icol("actions")
    nextActions = T.icol("nextActions")
    terminals = T.icol("terminals")

    self.statesShared = theano.shared(
        np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                  self.inputWidth), dtype=theano.config.floatX))
    self.nextStatesShared = theano.shared(
        np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                  self.inputWidth), dtype=theano.config.floatX))
    self.rewardsShared = theano.shared(
        np.zeros((self.batchSize, 1), dtype=theano.config.floatX),
        broadcastable=(False, True))
    self.actionsShared = theano.shared(
        np.zeros((self.batchSize, 1), dtype='int32'),
        broadcastable=(False, True))
    self.nextActionsShared = theano.shared(
        np.zeros((self.batchSize, 1), dtype='int32'),
        broadcastable=(False, True))
    self.terminalsShared = theano.shared(
        np.zeros((self.batchSize, 1), dtype='int32'),
        broadcastable=(False, True))

    self.qValueNetwork = DeepNetworks.buildDeepQNetwork(
        self.batchSize, self.numFrames, self.inputHeight, self.inputWidth,
        self.numActions, self.networkType)
    qValues = lasagne.layers.get_output(self.qValueNetwork,
                                        states / self.inputScale)

    if self.networkUpdateDelay > 0:
        self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth,
            self.numActions, self.networkType)
        self.resetNextQValueNetwork()
        nextQValues = lasagne.layers.get_output(self.nextQValueNetwork,
                                                nextStates / self.inputScale)
    else:
        nextQValues = lasagne.layers.get_output(self.qValueNetwork,
                                                nextStates / self.inputScale)
        nextQValues = theano.gradient.disconnected_grad(nextQValues)

    if self.useSARSAUpdate:
        target = rewards + terminals * (
            self.discountRate ** self.kReturnLength) * nextQValues[
                T.arange(self.batchSize),
                nextActions.reshape((-1,))].reshape((-1, 1))
    else:
        target = rewards + terminals * (
            self.discountRate ** self.kReturnLength) * T.max(
                nextQValues, axis=1, keepdims=True)

    targetDifference = target - qValues[
        T.arange(self.batchSize), actions.reshape((-1,))].reshape((-1, 1))

    quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
    linearPart = abs(targetDifference) - quadraticPart

    # if self.clipDelta > 0:
    #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

    if self.batchAccumulator == "sum":
        # loss = T.sum(targetDifference ** 2)
        loss = T.sum(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
    elif self.batchAccumulator == "mean":
        # loss = T.mean(targetDifference ** 2)
        loss = T.mean(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
    else:
        raise ValueError("Bad Network Accumulator. {sum, mean} expected")

    networkParameters = lasagne.layers.helper.get_all_params(
        self.qValueNetwork)

    if self.updateRule == "deepmind_rmsprop":
        updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters,
                                                self.learningRate, self.rho,
                                                self.rms_epsilon)
    elif self.updateRule == "rmsprop":
        updates = lasagne.updates.rmsprop(loss, networkParameters,
                                          self.learningRate, self.rho,
                                          self.rms_epsilon)
    elif self.updateRule == "sgd":
        updates = lasagne.updates.sgd(loss, networkParameters,
                                      self.learningRate)
    else:
        raise ValueError(
            "Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

    if self.momentum > 0:
        updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

    lossGivens = {
        states: self.statesShared,
        nextStates: self.nextStatesShared,
        rewards: self.rewardsShared,
        actions: self.actionsShared,
        nextActions: self.nextActionsShared,
        terminals: self.terminalsShared
    }
    self.__trainNetwork = theano.function([], [loss, qValues],
                                          updates=updates,
                                          givens=lossGivens,
                                          on_unused_input='warn')
    self.__computeQValues = theano.function(
        [], qValues, givens={states: self.statesShared})
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    # If nnFile is a directory, check for a previous experiment run in it and
    # start from there: load its parameters, append to its results file, and
    # open its highest-numbered network file. If nnFile is None, create an
    # experiment directory and put the results file, the saved parameters,
    # and the network files there.
    experimentDirectory = parameters.rom + "_" + time.strftime(
        "%d-%m-%Y-%H-%M") + "/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 1

    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        # Create the experiment directory and results file, save the parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)
        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl", 'wb', -1)
        cPickle.dump(parameters, parametersFile)
        parametersFile.close()

    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        # Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl", 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl",
                                  'wb', -1)
            cPickle.dump(parameters, parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            # Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            # Found a previous experiment's network files; find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(
                highestNNFile[highestNNFile.index("_") + 1:
                              highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber = int(
                    networkFile[networkFile.index("_") + 1:
                                networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            # Don't use full exploration; it's not a good way to fill the
            # replay memory when we already have a decent policy
            if startingEpoch > 1:
                parameters.epsilonStart = parameters.epsilonEnd
            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file: " + highestNNFile

    sys.setrecursionlimit(10000)
    ale = ALEInterface()
    Environment.initializeALEParameters(ale, parameters.seed,
                                        parameters.frameSkip,
                                        parameters.repeatActionProbability,
                                        parameters.displayScreen)
    ale.loadROM(parameters.fullRomPath)
    minimalActions = ale.getMinimalActionSet()

    agent = DQNAgent.DQNAgent(
        minimalActions, parameters.croppedHeight, parameters.croppedWidth,
        parameters.batchSize, parameters.phiLength, parameters.nnFile,
        parameters.loadWeightsFlipped, parameters.updateFrequency,
        parameters.replayMemorySize, parameters.replayStartSize,
        parameters.networkType, parameters.updateRule,
        parameters.batchAccumulator, parameters.networkUpdateDelay,
        parameters.discountRate, parameters.learningRate, parameters.rmsRho,
        parameters.rmsEpsilon, parameters.momentum, parameters.epsilonStart,
        parameters.epsilonEnd, parameters.epsilonDecaySteps,
        parameters.evalEpsilon, parameters.useSARSAUpdate,
        parameters.kReturnLength)

    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork,
                                       networkFileName)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgReward = runEvaluationEpoch(ale, agent, epoch,
                                           parameters.stepsPerTest)
            holdoutQVals = agent.computeHoldoutQValues(3200)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(str(epoch) + ",\t" + str(round(avgReward, 4)) +
                              ",\t\t" + str(round(holdoutQVals, 4)) + "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()
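# Illustrative sketch only (standalone helper, an assumption for illustration):
# the resume logic above picks the checkpoint with the highest epoch number
# from filenames of the form "network_<epoch>.pkl".
def sketch_highest_network_file(networkFiles):
    highestNNFile = networkFiles[0]
    highestEpoch = int(highestNNFile[highestNNFile.index("_") + 1:
                                     highestNNFile.index(".")])
    for networkFile in networkFiles:
        epoch = int(networkFile[networkFile.index("_") + 1:
                                networkFile.index(".")])
        if epoch > highestEpoch:
            highestNNFile, highestEpoch = networkFile, epoch
    return highestNNFile, highestEpoch

# e.g. sketch_highest_network_file(["network_2.pkl", "network_10.pkl"])
# -> ("network_10.pkl", 10)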
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    # If nnFile is a directory, check for a previous experiment run in it and
    # start from there: load its parameters, append to its results file, and
    # open its highest-numbered network file. If nnFile is None, create an
    # experiment directory and put the results file, the saved parameters,
    # and the network files there.
    experimentDirectory = parameters.rom + "_" + time.strftime(
        "%d-%m-%Y-%H-%M") + "/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 0

    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        # Create the experiment directory and results file, save the parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)
        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl", 'wb', -1)
        cPickle.dump(parameters, parametersFile)
        parametersFile.close()

    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        # Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl", 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl",
                                  'wb', -1)
            cPickle.dump(parameters, parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            # Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            # Found a previous experiment's network files; find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(
                highestNNFile[highestNNFile.index("_") + 1:
                              highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber = int(
                    networkFile[networkFile.index("_") + 1:
                                networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            # Don't use full exploration; it's not a good way to fill the
            # replay memory when we already have a decent policy
            if startingEpoch > 4:
                parameters.epsilonStart = parameters.epsilonEnd
            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file: " + highestNNFile

    sys.setrecursionlimit(10000)
    ale = ALEInterface()
    Environment.initializeALEParameters(ale, parameters.seed,
                                        parameters.frameSkip,
                                        parameters.repeatActionProbability,
                                        parameters.displayScreen)

    # ale.loadROM(parameters.fullRomPath)
    # minimalActions = ale.getMinimalActionSet()
    # difficulties = ale.getAvailableDifficulties()
    # modes = ale.getAvailableModes()
    # maxNumFlavors = len(difficulties) * len(modes)
    # difficulties = createFlavorList(parameters.difficultyString, len(difficulties))
    # modes = createFlavorList(parameters.modeString, len(modes))
    # transferTaskModule = TransferTaskModule.TransferTaskModule(difficulties, modes)

    transferTaskModule = TransferTaskModule.TransferTaskModule(
        ale, parameters.roms, parameters.difficultyString,
        parameters.modeString, parameters.taskBatchFlag)
    numActionsToUse = transferTaskModule.getNumTotalActions()

    print "Number of total tasks: " + str(transferTaskModule.getNumTasks()) + " across " + str(transferTaskModule.getNumGames()) + " games."
    print "Actions List: " + str(transferTaskModule.getTotalActionsList())
    # print "Num difficulties: " + str(len(difficulties)) + " num modes: " + str(len(modes)) + " numtasks: " + str(transferTaskModule.getNumTasks())
    # print "Modes: " + str(modes)
    # print "Difficulties: " + str(difficulties)

    numTransferTasks = transferTaskModule.getNumTasks()
    if parameters.reduceEpochLengthByNumFlavors:
        parameters.stepsPerEpoch = int(parameters.stepsPerEpoch /
                                       numTransferTasks)

    agent = DQTNAgent.DQTNAgent(
        transferTaskModule.getTotalActionsList(), parameters.croppedHeight,
        parameters.croppedWidth, parameters.batchSize, parameters.phiLength,
        parameters.nnFile, parameters.loadWeightsFlipped,
        parameters.updateFrequency, parameters.replayMemorySize,
        parameters.replayStartSize, parameters.networkType,
        parameters.updateRule, parameters.batchAccumulator,
        parameters.networkUpdateDelay, transferTaskModule,
        parameters.transferExperimentType, numTransferTasks,
        parameters.discountRate, parameters.learningRate, parameters.rmsRho,
        parameters.rmsEpsilon, parameters.momentum, parameters.epsilonStart,
        parameters.epsilonEnd, parameters.epsilonDecaySteps,
        parameters.evalEpsilon, parameters.useSARSAUpdate,
        parameters.kReturnLength, parameters.deathEndsEpisode)

    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch,
                         transferTaskModule, parameters.frameSkip,
                         parameters.maxNoActions)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork,
                                       networkFileName)

        print "Total number of samples seen per task: "
        print str(agent.trainingMemory.totalTaskSampleCount)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgRewardPerTask = runEvaluationEpoch(ale, agent, epoch,
                                                  parameters.stepsPerTest,
                                                  transferTaskModule,
                                                  parameters.frameSkip,
                                                  parameters.maxNoActions)
            holdoutQVals = agent.computeHoldoutQValues(
                parameters.numHoldoutQValues)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(str(epoch) + ",\t")
            resultsString = ""
            for avgReward in avgRewardPerTask:
                resultsString += str(round(avgReward, 4)) + ",\t"
            resultsFile.write(resultsString)
            resultsFile.write("\t" + str([round(x, 4) for x in holdoutQVals]) +
                              "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()