Example #1
File: DQNAgent.py  Project: Mog333/DeepRL
    def __init__(self, actionList, inputHeight, inputWidth, batchSize, phiLength, 
        nnFile, loadWeightsFlipped, updateFrequency, replayMemorySize, replayStartSize,
        networkType, updateRule, batchAccumulator, networkUpdateDelay,
        discountRate, learningRate, rmsRho, rmsEpsilon, momentum,
        epsilonStart, epsilonEnd, epsilonDecaySteps, evalEpsilon, useSARSAUpdate, kReturnLength):        
        self.actionList         = actionList
        self.numActions         = len(self.actionList)
        self.inputHeight        = inputHeight
        self.inputWidth         = inputWidth
        self.batchSize          = batchSize
        self.phiLength          = phiLength
        self.nnFile             = nnFile
        self.loadWeightsFlipped = loadWeightsFlipped
        self.updateFrequency    = updateFrequency
        self.replayMemorySize   = replayMemorySize
        self.replayStartSize    = replayStartSize
        self.networkType        = networkType
        self.updateRule         = updateRule
        self.batchAccumulator   = batchAccumulator
        self.networkUpdateDelay = networkUpdateDelay
        self.discountRate       = discountRate
        self.learningRate       = learningRate
        self.rmsRho             = rmsRho
        self.rmsEpsilon         = rmsEpsilon
        self.momentum           = momentum
        self.epsilonStart       = epsilonStart
        self.epsilonEnd         = epsilonEnd
        self.epsilonDecaySteps  = epsilonDecaySteps
        self.evalEpsilon        = evalEpsilon
        self.kReturnLength      = kReturnLength
        self.useSARSAUpdate     = useSARSAUpdate

        self.trainingMemory   = DQNAgentMemory.DQNAgentMemory((self.inputHeight, self.inputWidth), self.phiLength, self.replayMemorySize, self.discountRate)
        self.evaluationMemory = DQNAgentMemory.DQNAgentMemory((self.inputHeight, self.inputWidth), self.phiLength, self.phiLength * 2, self.discountRate)

        self.episodeCounter  = 0 
        self.stepCounter     = 0
        self.batchCounter    = 0
        self.lossAverages    = []
        self.actionToTake    = 0

        self.epsilon = self.epsilonStart
        if self.epsilonDecaySteps != 0:
            self.epsilonRate = ((self.epsilonStart - self.epsilonEnd) / self.epsilonDecaySteps)
        else:
            self.epsilonRate = 0

        self.training = False

        self.network = DeepQNetwork.DeepQNetwork(self.batchSize, self.phiLength, self.inputHeight, self.inputWidth, self.numActions,
            self.discountRate, self.learningRate, self.rmsRho, self.rmsEpsilon, self.momentum, self.networkUpdateDelay,
            self.useSARSAUpdate, self.kReturnLength,
            self.networkType, self.updateRule, self.batchAccumulator)

        if self.nnFile is not None:
            #Load network
            DeepNetworks.loadNetworkParams(self.network.qValueNetwork, self.nnFile, self.loadWeightsFlipped)
            self.network.resetNextQValueNetwork()
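
The constructor above only precomputes the linear epsilon-decay rate; the per-step update happens elsewhere in the agent. A minimal standalone sketch of the schedule that epsilonStart, epsilonEnd, and epsilonDecaySteps imply, with made-up values:

# Sketch of the linear epsilon schedule implied by the fields above.
# The values are hypothetical; the real ones come from the experiment parameters.
epsilonStart, epsilonEnd, epsilonDecaySteps = 1.0, 0.1, 1000000

epsilonRate = ((epsilonStart - epsilonEnd) / epsilonDecaySteps) if epsilonDecaySteps != 0 else 0

def epsilonAtStep(step):
    # Decay linearly for epsilonDecaySteps steps, then hold at epsilonEnd.
    return max(epsilonEnd, epsilonStart - epsilonRate * step)

print(epsilonAtStep(0))        # 1.0
print(epsilonAtStep(500000))   # 0.55
print(epsilonAtStep(2000000))  # 0.1
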
Example #2
def generateModel(Specs, dataObj):

    if (Specs.Mode == "1"):

        import BoxJenkins
        modelObj = BoxJenkins.BoxJenkins(dataObj, Specs)

    elif (Specs.Mode == "2"):

        import DeepNetworks
        modelObj = DeepNetworks.DeepNetworks(dataObj, Specs, 10)

    modelObj.modelling()

    if (Specs.Mode == "1"):
        attributeName = list(dataObj.data)[int(Specs.ForecastCol)]

        mdlPath = Specs.mdlpath + "/" + attributeName

        print("model under " + mdlPath)
        #if model exists, no parameter estimation is necessary
        modelObj.fitting(Specs.mdlName, mdlPath)

    if (Specs.Mode == "2"):
        modelObj.fitting()

    return modelObj
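
A rough sketch of how generateModel might be driven. Specs and dataObj below are hypothetical stand-ins that only carry the attributes the function reads (Mode, ForecastCol, mdlpath, mdlName, and dataObj.data, which the list(dataObj.data) column lookup suggests is a pandas DataFrame); the real classes, and the BoxJenkins/DeepNetworks modules, come from the surrounding project.

# Hypothetical driver for generateModel(); attribute names mirror the function body above.
import pandas as pd

class Specs(object):
    Mode = "1"                 # "1" -> BoxJenkins, "2" -> DeepNetworks
    ForecastCol = "0"          # index of the column to forecast
    mdlpath = "models"         # hypothetical model directory
    mdlName = "demand_model"   # hypothetical model name

class DataObj(object):
    data = pd.DataFrame({"demand": [10, 12, 13, 15]})

modelObj = generateModel(Specs(), DataObj())   # requires the project's BoxJenkins module on the path
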
Example #3
    def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, 
        discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength,
        networkType = "conv", updateRule = "deepmind_rmsprop", batchAccumulator = "sum", clipDelta = 1.0, inputScale = 255.0):
        
        self.batchSize          = batchSize
        self.numFrames          = numFrames
        self.inputWidth         = inputWidth
        self.inputHeight        = inputHeight
        self.inputScale         = inputScale
        self.numActions         = numActions
        self.discountRate       = discountRate
        self.learningRate       = learningRate
        self.rho                = rho
        self.rms_epsilon        = rms_epsilon
        self.momentum           = momentum
        self.networkUpdateDelay = networkUpdateDelay
        self.useSARSAUpdate     = useSARSAUpdate
        self.kReturnLength      = kReturnLength
        self.networkType        = networkType
        self.updateRule         = updateRule
        self.batchAccumulator   = batchAccumulator
        self.clipDelta          = clipDelta
        self.updateCounter      = 0

        states     = T.tensor4("states")
        nextStates = T.tensor4("nextStates")
        rewards    = T.col("rewards")
        actions    = T.icol("actions")
        nextActions = T.icol("nextActions")
        terminals  = T.icol("terminals")

        self.statesShared      = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX))
        self.nextStatesShared  = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX))
        self.rewardsShared     = theano.shared(np.zeros((self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True))
        self.actionsShared     = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))
        self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))
        self.terminalsShared   = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))

        self.qValueNetwork  = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType)

        qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale)

        if self.networkUpdateDelay > 0:
            self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
                self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType)
            self.resetNextQValueNetwork()
            nextQValues = lasagne.layers.get_output(self.nextQValueNetwork, nextStates / self.inputScale)

        else:
            nextQValues = lasagne.layers.get_output(self.qValueNetwork, nextStates / self.inputScale)
            nextQValues = theano.gradient.disconnected_grad(nextQValues)


        if self.useSARSAUpdate:
            target = rewards + terminals * (self.discountRate ** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape((-1,))].reshape((-1, 1))
        else:
            target = rewards + terminals * (self.discountRate ** self.kReturnLength) * T.max(nextQValues, axis = 1, keepdims = True)

        targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1,))].reshape((-1, 1))


        quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
        linearPart = abs(targetDifference) - quadraticPart

        # if self.clipDelta > 0:
        #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

        if self.batchAccumulator == "sum":
            # loss = T.sum(targetDifference ** 2)
            loss = T.sum(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
        elif self.batchAccumulator == "mean":
            # loss = T.mean(targetDifference ** 2)
            loss = T.mean(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
        else:
            raise ValueError("Bad Network Accumulator. {sum, mean} expected")


        networkParameters = lasagne.layers.helper.get_all_params(self.qValueNetwork)

        if self.updateRule == "deepmind_rmsprop":
            updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon)
        elif self.updateRule == "rmsprop":
            updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon)
        elif self.updateRule == "sgd":
            updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate)
        else:
            raise ValueError("Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        lossGivens = {
            states: self.statesShared,
            nextStates: self.nextStatesShared,
            rewards: self.rewardsShared,
            actions: self.actionsShared,
            nextActions: self.nextActionsShared,
            terminals: self.terminalsShared
        }

        self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn')
        self.__computeQValues = theano.function([], qValues, givens={states: self.statesShared})
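
The quadraticPart/linearPart split above is the usual Huber-style clipping of the TD error at clipDelta: the loss is quadratic for errors up to clipDelta and linear beyond it, which bounds the gradient magnitude. A small NumPy check of that identity, independent of Theano, with made-up error values:

# NumPy illustration of the clipped loss built above:
#   0.5 * min(|d|, delta)**2 + delta * (|d| - min(|d|, delta))
import numpy as np

def clippedLoss(targetDifference, clipDelta=1.0):
    quadraticPart = np.minimum(np.abs(targetDifference), clipDelta)
    linearPart = np.abs(targetDifference) - quadraticPart
    return 0.5 * quadraticPart ** 2 + clipDelta * linearPart

d = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(clippedLoss(d))   # [2.5  0.125  0.  0.125  2.5]
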
Example #4
    def __init__(self,
                 batchSize,
                 numFrames,
                 inputHeight,
                 inputWidth,
                 numActions,
                 discountRate,
                 learningRate,
                 rho,
                 rms_epsilon,
                 momentum,
                 networkUpdateDelay,
                 useSARSAUpdate,
                 kReturnLength,
                 networkType="conv",
                 updateRule="deepmind_rmsprop",
                 batchAccumulator="sum",
                 clipDelta=1.0,
                 inputScale=255.0):

        self.batchSize = batchSize
        self.numFrames = numFrames
        self.inputWidth = inputWidth
        self.inputHeight = inputHeight
        self.inputScale = inputScale
        self.numActions = numActions
        self.discountRate = discountRate
        self.learningRate = learningRate
        self.rho = rho
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.networkUpdateDelay = networkUpdateDelay
        self.useSARSAUpdate = useSARSAUpdate
        self.kReturnLength = kReturnLength
        self.networkType = networkType
        self.updateRule = updateRule
        self.batchAccumulator = batchAccumulator
        self.clipDelta = clipDelta
        self.updateCounter = 0

        states = T.tensor4("states")
        nextStates = T.tensor4("nextStates")
        rewards = T.col("rewards")
        actions = T.icol("actions")
        nextActions = T.icol("nextActions")
        terminals = T.icol("terminals")

        self.statesShared = theano.shared(
            np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                      self.inputWidth),
                     dtype=theano.config.floatX))
        self.nextStatesShared = theano.shared(
            np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                      self.inputWidth),
                     dtype=theano.config.floatX))
        self.rewardsShared = theano.shared(np.zeros(
            (self.batchSize, 1), dtype=theano.config.floatX),
                                           broadcastable=(False, True))
        self.actionsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                    dtype='int32'),
                                           broadcastable=(False, True))
        self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                        dtype='int32'),
                                               broadcastable=(False, True))
        self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                      dtype='int32'),
                                             broadcastable=(False, True))

        self.qValueNetwork = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth,
            self.numActions, self.networkType)

        qValues = lasagne.layers.get_output(self.qValueNetwork,
                                            states / self.inputScale)

        if self.networkUpdateDelay > 0:
            self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
                self.batchSize, self.numFrames, self.inputHeight,
                self.inputWidth, self.numActions, self.networkType)
            self.resetNextQValueNetwork()
            nextQValues = lasagne.layers.get_output(
                self.nextQValueNetwork, nextStates / self.inputScale)

        else:
            nextQValues = lasagne.layers.get_output(
                self.qValueNetwork, nextStates / self.inputScale)
            nextQValues = theano.gradient.disconnected_grad(nextQValues)

        if self.useSARSAUpdate:
            target = rewards + terminals * (
                self.discountRate**
                self.kReturnLength) * nextQValues[T.arange(self.batchSize),
                                                  nextActions.reshape(
                                                      (-1, ))].reshape((-1, 1))
        else:
            target = rewards + terminals * (
                self.discountRate**self.kReturnLength) * T.max(
                    nextQValues, axis=1, keepdims=True)

        targetDifference = target - qValues[T.arange(self.batchSize),
                                            actions.reshape((-1, ))].reshape(
                                                (-1, 1))

        quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
        linearPart = abs(targetDifference) - quadraticPart

        # if self.clipDelta > 0:
        #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

        if self.batchAccumulator == "sum":
            # loss = T.sum(targetDifference ** 2)
            loss = T.sum(0.5 * quadraticPart**2 + self.clipDelta * linearPart)
        elif self.batchAccumulator == "mean":
            # loss = T.mean(targetDifference ** 2)
            loss = T.mean(0.5 * quadraticPart**2 + self.clipDelta * linearPart)
        else:
            raise ValueError("Bad Network Accumulator. {sum, mean} expected")

        networkParameters = lasagne.layers.helper.get_all_params(
            self.qValueNetwork)

        if self.updateRule == "deepmind_rmsprop":
            updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters,
                                                    self.learningRate,
                                                    self.rho, self.rms_epsilon)
        elif self.updateRule == "rmsprop":
            updates = lasagne.updates.rmsprop(loss, networkParameters,
                                              self.learningRate, self.rho,
                                              self.rms_epsilon)
        elif self.updateRule == "sgd":
            updates = lasagne.updates.sgd(loss, networkParameters,
                                          self.learningRate)
        else:
            raise ValueError(
                "Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        lossGivens = {
            states: self.statesShared,
            nextStates: self.nextStatesShared,
            rewards: self.rewardsShared,
            actions: self.actionsShared,
            nextActions: self.nextActionsShared,
            terminals: self.terminalsShared
        }

        self.__trainNetwork = theano.function([], [loss, qValues],
                                              updates=updates,
                                              givens=lossGivens,
                                              on_unused_input='warn')
        self.__computeQValues = theano.function(
            [], qValues, givens={states: self.statesShared})
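
Examples #3 and #4 build the same k-step bootstrapped target: rewards plus discountRate ** kReturnLength times either the Q-value of the next action actually taken (SARSA) or the max next Q-value (Q-learning), with terminals multiplied in as a continuation mask (the replay memory presumably stores 0 there for transitions whose k-step return hit a terminal). A NumPy sketch of both targets on a made-up batch of three:

# NumPy sketch of the k-step targets built above (hypothetical batch of 3 states, 2 actions).
import numpy as np

rewards     = np.array([[1.0], [0.0], [2.0]])
terminals   = np.array([[1], [1], [0]])        # continuation mask: 0 where the return was cut off by a terminal
nextQValues = np.array([[0.5, 1.5], [2.0, 0.1], [3.0, 4.0]])
nextActions = np.array([0, 1, 1])
discountRate, kReturnLength = 0.99, 1

# Q-learning target: bootstrap from the greedy next action.
qTarget = rewards + terminals * (discountRate ** kReturnLength) * np.max(nextQValues, axis=1, keepdims=True)

# SARSA target: bootstrap from the next action actually taken.
nextQTaken  = nextQValues[np.arange(3), nextActions].reshape(-1, 1)
sarsaTarget = rewards + terminals * (discountRate ** kReturnLength) * nextQTaken

print(qTarget.ravel())      # approx. [2.485, 1.98, 2.0]
print(sarsaTarget.ravel())  # approx. [1.495, 0.099, 2.0]
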
Example #5
    def __init__(self, actionList, inputHeight, inputWidth, batchSize,
                 phiLength, nnFile, loadWeightsFlipped, updateFrequency,
                 replayMemorySize, replayStartSize, networkType, updateRule,
                 batchAccumulator, networkUpdateDelay, discountRate,
                 learningRate, rmsRho, rmsEpsilon, momentum, epsilonStart,
                 epsilonEnd, epsilonDecaySteps, evalEpsilon, useSARSAUpdate,
                 kReturnLength):
        self.actionList = actionList
        self.numActions = len(self.actionList)
        self.inputHeight = inputHeight
        self.inputWidth = inputWidth
        self.batchSize = batchSize
        self.phiLength = phiLength
        self.nnFile = nnFile
        self.loadWeightsFlipped = loadWeightsFlipped
        self.updateFrequency = updateFrequency
        self.replayMemorySize = replayMemorySize
        self.replayStartSize = replayStartSize
        self.networkType = networkType
        self.updateRule = updateRule
        self.batchAccumulator = batchAccumulator
        self.networkUpdateDelay = networkUpdateDelay
        self.discountRate = discountRate
        self.learningRate = learningRate
        self.rmsRho = rmsRho
        self.rmsEpsilon = rmsEpsilon
        self.momentum = momentum
        self.epsilonStart = epsilonStart
        self.epsilonEnd = epsilonEnd
        self.epsilonDecaySteps = epsilonDecaySteps
        self.evalEpsilon = evalEpsilon
        self.kReturnLength = kReturnLength
        self.useSARSAUpdate = useSARSAUpdate

        self.trainingMemory = DQNAgentMemory.DQNAgentMemory(
            (self.inputHeight, self.inputWidth), self.phiLength,
            self.replayMemorySize, self.discountRate)
        self.evaluationMemory = DQNAgentMemory.DQNAgentMemory(
            (self.inputHeight, self.inputWidth), self.phiLength,
            self.phiLength * 2, self.discountRate)

        self.episodeCounter = 0
        self.stepCounter = 0
        self.batchCounter = 0
        self.lossAverages = []
        self.actionToTake = 0

        self.epsilon = self.epsilonStart
        if self.epsilonDecaySteps != 0:
            self.epsilonRate = ((self.epsilonStart - self.epsilonEnd) /
                                self.epsilonDecaySteps)
        else:
            self.epsilonRate = 0

        self.training = False

        self.network = DeepQNetwork.DeepQNetwork(
            self.batchSize, self.phiLength, self.inputHeight, self.inputWidth,
            self.numActions, self.discountRate, self.learningRate, self.rmsRho,
            self.rmsEpsilon, self.momentum, self.networkUpdateDelay,
            self.useSARSAUpdate, self.kReturnLength, self.networkType,
            self.updateRule, self.batchAccumulator)

        if self.nnFile is not None:
            #Load network
            DeepNetworks.loadNetworkParams(self.network.qValueNetwork,
                                           self.nnFile,
                                           self.loadWeightsFlipped)
            self.network.resetNextQValueNetwork()
Example #6
File: run_dqn.py  Project: Mog333/DeepRL
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    #If the nnFile is a directory, check for a previous experiment run in it and resume from there:
    #load its parameters, append to its results file, and open its highest-numbered network file.
    #If it is None, create an experiment directory with a results file, the saved parameters, and the network files.

    experimentDirectory = parameters.rom + "_" + time.strftime(
        "%d-%m-%Y-%H-%M") + "/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 1
    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        #Create your experiment directory, results file, save parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)

        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl", 'wb', -1)
        cPickle.dump(parameters, parametersFile)
        parametersFile.close()

    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        #Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl", 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl", 'wb',
                                  -1)
            cPickle.dump(parameters, parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            #Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            #Found a previous experiment's network files; now find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(
                highestNNFile[highestNNFile.index("_") +
                              1:highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber = int(networkFile[networkFile.index("_") +
                                                     1:networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            #don't use full exploration; it's not a good way to fill the replay memory when we already have a decent policy
            if startingEpoch > 1:
                parameters.epsilonStart = parameters.epsilonEnd

            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file:" + highestNNFile

    sys.setrecursionlimit(10000)
    ale = ALEInterface()

    Environment.initializeALEParameters(ale, parameters.seed,
                                        parameters.frameSkip,
                                        parameters.repeatActionProbability,
                                        parameters.displayScreen)
    ale.loadROM(parameters.fullRomPath)
    minimalActions = ale.getMinimalActionSet()

    agent = DQNAgent.DQNAgent(
        minimalActions, parameters.croppedHeight, parameters.croppedWidth,
        parameters.batchSize, parameters.phiLength, parameters.nnFile,
        parameters.loadWeightsFlipped, parameters.updateFrequency,
        parameters.replayMemorySize, parameters.replayStartSize,
        parameters.networkType, parameters.updateRule,
        parameters.batchAccumulator, parameters.networkUpdateDelay,
        parameters.discountRate, parameters.learningRate, parameters.rmsRho,
        parameters.rmsEpsilon, parameters.momentum, parameters.epsilonStart,
        parameters.epsilonEnd, parameters.epsilonDecaySteps,
        parameters.evalEpsilon, parameters.useSARSAUpdate,
        parameters.kReturnLength)

    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(
            epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork,
                                       networkFileName)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgReward = runEvaluationEpoch(ale, agent, epoch,
                                           parameters.stepsPerTest)
            holdoutQVals = agent.computeHoldoutQValues(3200)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(
                str(epoch) + ",\t" + str(round(avgReward, 4)) + ",\t\t" +
                str(round(holdoutQVals, 4)) + "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()
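
The resume branch above scans the experiment directory for network_<epoch>.pkl files and restarts from the highest epoch it finds. A compact sketch of just that selection step, equivalent to the loop above (the directory path is hypothetical):

# Sketch of the resume logic: pick the highest-epoch network_<epoch>.pkl file.
import os

def findLatestNetworkFile(experimentDirectory):
    networkFiles = [f for f in os.listdir(experimentDirectory)
                    if f.startswith("network") and f.endswith(".pkl")]
    if not networkFiles:
        return None, 1                          # premature experiment: start from epoch 1
    epochOf = lambda f: int(f[f.index("_") + 1:f.index(".")])
    latest = max(networkFiles, key=epochOf)
    return os.path.join(experimentDirectory, latest), epochOf(latest) + 1

With network_1.pkl through network_7.pkl on disk this returns the path to network_7.pkl and a starting epoch of 8, after which the code above also collapses epsilonStart to epsilonEnd so the resumed run does not refill the replay memory with fully random actions.
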
Example #7
File: run_dqtn.py  Project: Mog333/DeepRL
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    #If the nnFile is a directory, check for a previous experiment run in it and resume from there:
    #load its parameters, append to its results file, and open its highest-numbered network file.
    #If it is None, create an experiment directory with a results file, the saved parameters, and the network files.

    experimentDirectory = parameters.rom + "_" + time.strftime("%d-%m-%Y-%H-%M") +"/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 0
    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        #Create your experiment directory, results file, save parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)

        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl" , 'wb', -1)
        cPickle.dump(parameters,parametersFile)
        parametersFile.close()


    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        #Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl" , 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl" , 'wb', -1)
            cPickle.dump(parameters,parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            #Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            #Found a previous experiment's network files; now find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(highestNNFile[highestNNFile.index("_") + 1 : highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber =  int(networkFile[networkFile.index("_") + 1 : networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            #don't use full exploration; it's not a good way to fill the replay memory when we already have a decent policy
            if startingEpoch > 4:
                parameters.epsilonStart = parameters.epsilonEnd

            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file:" + highestNNFile

    
    sys.setrecursionlimit(10000)
    ale = ALEInterface()

    Environment.initializeALEParameters(ale, parameters.seed, parameters.frameSkip, parameters.repeatActionProbability, parameters.displayScreen)

    # ale.loadROM(parameters.fullRomPath)

    # minimalActions = ale.getMinimalActionSet()

    # difficulties = ale.getAvailableDifficulties()
    # modes = ale.getAvailableModes()

    # maxNumFlavors = len(difficulties) * len(modes)

    # difficulties = createFlavorList(parameters.difficultyString, len(difficulties))
    # modes = createFlavorList(parameters.modeString, len(modes))

    # transferTaskModule = TransferTaskModule.TransferTaskModule(difficulties, modes)


    transferTaskModule = TransferTaskModule.TransferTaskModule(ale, parameters.roms, parameters.difficultyString, parameters.modeString, parameters.taskBatchFlag)
    numActionsToUse = transferTaskModule.getNumTotalActions()
    print "Number of total tasks:" + str(transferTaskModule.getNumTasks()) + " across " + str(transferTaskModule.getNumGames()) + " games."
    print "Actions List:" + str(transferTaskModule.getTotalActionsList())
    # print "Num difficulties: " + str(len(difficulties)) + " num modes: " + str(len(modes)) + " numtasks: " + str(transferTaskModule.getNumTasks())
    # print "Modes: " + str(modes)
    # print "Difficulties: " + str(difficulties)

    numTransferTasks = transferTaskModule.getNumTasks()

    if (parameters.reduceEpochLengthByNumFlavors):
        parameters.stepsPerEpoch = int(parameters.stepsPerEpoch / numTransferTasks)

    agent = DQTNAgent.DQTNAgent(transferTaskModule.getTotalActionsList(), parameters.croppedHeight, parameters.croppedWidth, 
                parameters.batchSize, 
                parameters.phiLength,
                parameters.nnFile, 
                parameters.loadWeightsFlipped, 
                parameters.updateFrequency, 
                parameters.replayMemorySize, 
                parameters.replayStartSize,
                parameters.networkType, 
                parameters.updateRule, 
                parameters.batchAccumulator, 
                parameters.networkUpdateDelay,
                transferTaskModule,
                parameters.transferExperimentType,
                numTransferTasks,
                parameters.discountRate, 
                parameters.learningRate, 
                parameters.rmsRho, 
                parameters.rmsEpsilon, 
                parameters.momentum,
                parameters.epsilonStart, 
                parameters.epsilonEnd, 
                parameters.epsilonDecaySteps,
                parameters.evalEpsilon,
                parameters.useSARSAUpdate,
                parameters.kReturnLength,
                parameters.deathEndsEpisode)



    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch, transferTaskModule, parameters.frameSkip, parameters.maxNoActions)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork, networkFileName)

        print "Total number of samples seen per task: "
        print str(agent.trainingMemory.totalTaskSampleCount)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgRewardPerTask = runEvaluationEpoch(ale, agent, epoch, parameters.stepsPerTest, transferTaskModule, parameters.frameSkip, parameters.maxNoActions)
            holdoutQVals = agent.computeHoldoutQValues(parameters.numHoldoutQValues)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(str(epoch) + ",\t")
            resultsString = ""

            for avgReward in avgRewardPerTask:
                resultsString += str(round(avgReward, 4)) + ",\t"

            resultsFile.write(resultsString)
            resultsFile.write("\t" + str([round(x, 4) for x in holdoutQVals]) + "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()
Example #8
File: run_dqn.py  Project: Mog333/DeepRL
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    #If the nnFile is a directory, check for a previous experiment run in it and resume from there:
    #load its parameters, append to its results file, and open its highest-numbered network file.
    #If it is None, create an experiment directory with a results file, the saved parameters, and the network files.

    experimentDirectory = parameters.rom + "_" + time.strftime("%d-%m-%Y-%H-%M") +"/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 1
    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        #Create your experiment directory, results file, save parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)

        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl" , 'wb', -1)
        cPickle.dump(parameters,parametersFile)
        parametersFile.close()


    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        #Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl" , 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl" , 'wb', -1)
            cPickle.dump(parameters,parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            #Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            #Found a previous experiment's network files; now find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(highestNNFile[highestNNFile.index("_") + 1 : highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber =  int(networkFile[networkFile.index("_") + 1 : networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            #don't use full exploration; it's not a good way to fill the replay memory when we already have a decent policy
            if startingEpoch > 1:
                parameters.epsilonStart = parameters.epsilonEnd

            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file:" + highestNNFile


    sys.setrecursionlimit(10000)
    ale = ALEInterface()

    Environment.initializeALEParameters(ale, parameters.seed, parameters.frameSkip, parameters.repeatActionProbability, parameters.displayScreen)
    ale.loadROM(parameters.fullRomPath)
    minimalActions = ale.getMinimalActionSet()


    agent = DQNAgent.DQNAgent(minimalActions, parameters.croppedHeight, parameters.croppedWidth, 
                parameters.batchSize, 
                parameters.phiLength,
                parameters.nnFile, 
                parameters.loadWeightsFlipped, 
                parameters.updateFrequency, 
                parameters.replayMemorySize, 
                parameters.replayStartSize,
                parameters.networkType, 
                parameters.updateRule, 
                parameters.batchAccumulator, 
                parameters.networkUpdateDelay,
                parameters.discountRate, 
                parameters.learningRate, 
                parameters.rmsRho, 
                parameters.rmsEpsilon, 
                parameters.momentum,
                parameters.epsilonStart, 
                parameters.epsilonEnd, 
                parameters.epsilonDecaySteps,
                parameters.evalEpsilon,
                parameters.useSARSAUpdate,
                parameters.kReturnLength)



    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork, networkFileName)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgReward = runEvaluationEpoch(ale, agent, epoch, parameters.stepsPerTest)
            holdoutQVals = agent.computeHoldoutQValues(3200)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(str(epoch) + ",\t" + str(round(avgReward, 4)) + ",\t\t" + str(round(holdoutQVals, 4)) + "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()
Example #9
File: run_dqtn.py  Project: Mog333/DeepRL
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    #If the nnFile is a directory, check for a previous experiment run in it and resume from there:
    #load its parameters, append to its results file, and open its highest-numbered network file.
    #If it is None, create an experiment directory with a results file, the saved parameters, and the network files.

    experimentDirectory = parameters.rom + "_" + time.strftime(
        "%d-%m-%Y-%H-%M") + "/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 0
    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        #Create your experiment directory, results file, save parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)

        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl", 'wb', -1)
        cPickle.dump(parameters, parametersFile)
        parametersFile.close()

    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        #Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl", 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl", 'wb',
                                  -1)
            cPickle.dump(parameters, parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            #Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            #Found a previous experiment's network files; now find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(
                highestNNFile[highestNNFile.index("_") +
                              1:highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber = int(networkFile[networkFile.index("_") +
                                                     1:networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            #don't use full exploration; it's not a good way to fill the replay memory when we already have a decent policy
            if startingEpoch > 4:
                parameters.epsilonStart = parameters.epsilonEnd

            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file:" + highestNNFile

    sys.setrecursionlimit(10000)
    ale = ALEInterface()

    Environment.initializeALEParameters(ale, parameters.seed,
                                        parameters.frameSkip,
                                        parameters.repeatActionProbability,
                                        parameters.displayScreen)

    # ale.loadROM(parameters.fullRomPath)

    # minimalActions = ale.getMinimalActionSet()

    # difficulties = ale.getAvailableDifficulties()
    # modes = ale.getAvailableModes()

    # maxNumFlavors = len(difficulties) * len(modes)

    # difficulties = createFlavorList(parameters.difficultyString, len(difficulties))
    # modes = createFlavorList(parameters.modeString, len(modes))

    # transferTaskModule = TransferTaskModule.TransferTaskModule(difficulties, modes)

    transferTaskModule = TransferTaskModule.TransferTaskModule(
        ale, parameters.roms, parameters.difficultyString,
        parameters.modeString, parameters.taskBatchFlag)
    numActionsToUse = transferTaskModule.getNumTotalActions()
    print "Number of total tasks:" + str(
        transferTaskModule.getNumTasks()) + " across " + str(
            transferTaskModule.getNumGames()) + " games."
    print "Actions List:" + str(transferTaskModule.getTotalActionsList())
    # print "Num difficulties: " + str(len(difficulties)) + " num modes: " + str(len(modes)) + " numtasks: " + str(transferTaskModule.getNumTasks())
    # print "Modes: " + str(modes)
    # print "Difficulties: " + str(difficulties)

    numTransferTasks = transferTaskModule.getNumTasks()

    if (parameters.reduceEpochLengthByNumFlavors):
        parameters.stepsPerEpoch = int(parameters.stepsPerEpoch /
                                       numTransferTasks)

    agent = DQTNAgent.DQTNAgent(
        transferTaskModule.getTotalActionsList(), parameters.croppedHeight,
        parameters.croppedWidth, parameters.batchSize, parameters.phiLength,
        parameters.nnFile, parameters.loadWeightsFlipped,
        parameters.updateFrequency, parameters.replayMemorySize,
        parameters.replayStartSize, parameters.networkType,
        parameters.updateRule, parameters.batchAccumulator,
        parameters.networkUpdateDelay, transferTaskModule,
        parameters.transferExperimentType, numTransferTasks,
        parameters.discountRate, parameters.learningRate, parameters.rmsRho,
        parameters.rmsEpsilon, parameters.momentum, parameters.epsilonStart,
        parameters.epsilonEnd, parameters.epsilonDecaySteps,
        parameters.evalEpsilon, parameters.useSARSAUpdate,
        parameters.kReturnLength, parameters.deathEndsEpisode)

    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch,
                         transferTaskModule, parameters.frameSkip,
                         parameters.maxNoActions)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(
            epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork,
                                       networkFileName)

        print "Total number of samples seen per task: "
        print str(agent.trainingMemory.totalTaskSampleCount)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgRewardPerTask = runEvaluationEpoch(ale, agent, epoch,
                                                  parameters.stepsPerTest,
                                                  transferTaskModule,
                                                  parameters.frameSkip,
                                                  parameters.maxNoActions)
            holdoutQVals = agent.computeHoldoutQValues(
                parameters.numHoldoutQValues)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(str(epoch) + ",\t")
            resultsString = ""

            for avgReward in avgRewardPerTask:
                resultsString += str(round(avgReward, 4)) + ",\t"

            resultsFile.write(resultsString)
            resultsFile.write("\t" + str([round(x, 4)
                                          for x in holdoutQVals]) + "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()