Example #1
def ingestImagery(filepath, shared=True, holdoutPercentage=.05, minTest=5,
                  batchSize=1, log=None) :
    '''Load the labeled dataset into memory. The directory structure supplies
       the labels: all imagery within a directory is assigned that directory's
       label, and all images in a directory are required to have the same
       dimensions.

       filepath          : This can be a pickle file or a path to the top of
                           the directory structure.
       shared            : Load data into shared variables for training --
                           NOTE: this is only a user suggestion. The size of
                                 the data will ultimately determine how it is
                                 loaded.
       holdoutPercentage : Percentage of the data to hold out for testing
       minTest           : Hard minimum on the holdout if the percentage is low
       batchSize         : Size of a mini-batch
       log               : Logger for tracking the progress
       return            : (trainData, testData)
    '''
    import numpy as np
    import theano.tensor as t
    from dataset.ingest.preprocHDF5 import reuseableIngest, \
                                           checkAvailableMemory
    from dataset.shared import toShared

    # Load the dataset to memory
    train, test, labels = reuseableIngest(filepath=filepath,
                                          holdoutPercentage=holdoutPercentage,
                                          minTest=minTest,
                                          batchSize=batchSize,
                                          saveLabels=False,
                                          log=log)
    train, test = train[0], test[0]

    # calculate the memory needed by this dataset
    floatsize = float(np.dtype(t.config.floatX).itemsize)
    dataMemoryConsumption = \
        np.prod(np.asarray(train.shape, dtype=np.float32)) * floatsize + \
        np.prod(np.asarray(test.shape,  dtype=np.float32)) * floatsize

    # check physical memory constraints
    shared = checkAvailableMemory(dataMemoryConsumption, shared, log)

    # load each into shared variables -- 
    # this avoids having to copy the data to the GPU between each call
    if shared is True :
        if log is not None :
            log.debug('Transfer the memory into shared variables')
        try :
            tr = toShared(train, log=log)
            te = toShared(test, log=log)
            return tr, te
        except Exception :
            # fall back to returning host memory if the device copy fails
            pass
    return train, test
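
A minimal usage sketch -- the directory path, logger name, and batch size below are assumptions; each subdirectory name becomes a class label:

import logging
log = logging.getLogger('ingest')
train, test = ingestImagery('/data/labeled', shared=True,
                            holdoutPercentage=.05, minTest=5,
                            batchSize=100, log=log)
# when shared=True succeeds, train and test are theano shared variables
# resident on the device; otherwise they remain numpy arrays in host memory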
Example #2
    def __setstate__(self, state) :
        '''Load layer pickle'''
        import theano
        import numpy as np
        from dataset.shared import toShared
        self.__dict__.update(state)
        self._learningRate = theano.shared(np.float32(self._learningRate))
        self._momentumRate = theano.shared(np.float32(self._momentumRate))
        # NOTE: this saves to a secondary variable to allow
        #       borrowing the memory.
        initialWeights = self._weights
        self._weights = toShared(initialWeights)
        initialThresholds = self._thresholds
        self._thresholds = toShared(initialThresholds)
        # convert back to a theano operation
        self._activation = convertActivation(self._activation)
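
The pickling side implied by this __setstate__ would pull the values back out of the shared variables before serialization. A sketch of that counterpart, not the library's actual code:

    def __getstate__(self) :
        '''Save layer pickle'''
        state = self.__dict__.copy()
        # reduce the theano shared variables to plain picklable values
        state['_learningRate'] = self._learningRate.get_value()
        state['_momentumRate'] = self._momentumRate.get_value()
        state['_weights'] = self._weights.get_value(borrow=True)
        state['_thresholds'] = self._thresholds.get_value(borrow=True)
        # _activation would likewise be reduced to a picklable token here;
        # the inverse of convertActivation is not shown in these examples
        return state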
Example #3
    def checkReconstructionLoss(self, layerIndex) :
        '''Check the reconstruction cost of the layer/network against the test
           set. This runs against the entire test set in a single call and
           returns the current loss of the layer/network [0:inf].
        '''
        from dataset.shared import toShared, isShared
        self._startProfile('Checking Reconstruction Loss', 'debug')
        if len(self._checkGreedy) == 0 :
            inp = toShared(self._testData[0], borrow=True) \
                  if not isShared(self._testData) else self._testData[0]
            self.finalizeNetwork(inp[:])

        # WARNING: there is something strange going on in the interaction
        #          between theano and its usage with a list of lambdas. In
        #          normal cases it would be better not to build this lambda
        #          JIT, however this bug forces my hand.
        #          At least we can still get around the if/else tight inner loop
        if isShared(self._testData) :
            checkGreedy = lambda l, x: self._checkGreedy[l](x)
        else :
            checkGreedy = lambda l, x: self._checkGreedy[l](self._testData[x])

        # check the reconstruction error --
        # the user decides whether this will be a greedy or network check
        # by passing in a layer index. If the index does not have an associated
        # layer, it automatically chooses network-wide training.
        loss = 0.0
        for ii in range(self._numTestBatches) :
            loss += float(checkGreedy(layerIndex, ii))

        self._endProfile()
        return loss
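
Hypothetical usage -- the network object and its layer count are assumptions. Per the comment above, an index with no associated layer selects the network-wide check:

loss0 = network.checkReconstructionLoss(0)      # greedy: first layer only
lossAll = network.checkReconstructionLoss(999)  # no layer 999, so the
                                                # network-wide loss is checked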
Example #4
def ingestImagery(filepaths, shared=True, batchSize=1,
                  log=None, chipFunc=None, **kwargs) :
    '''Load the unlabeled dataset into memory. This reads and chips any
       imagery found within the filepaths according to the options sent to
       the function.

       filepaths: This can be an HDF5 file, a path to a directory structure,
                  or a list of directories containing files.
       shared   : Load data into shared variables for training
       batchSize: Size of a mini-batch
       log      : Logger for tracking the progress
       chipFunc : Chipping utility to use on each image
       kwargs   : Parameters specific to the chipping function
       return   : (trainingData, pixelRegion={None})
    '''
    import os
    import numpy as np
    import theano.tensor as t
    from dataset.pickle import readPickleZip
    from dataset.ingest.labeled import checkAvailableMemory

    if not isinstance(filepaths, list) :
        filepaths = [filepaths]
    filepaths = [os.path.abspath(d) for d in filepaths]

    # verify all paths exist
    for filepath in filepaths :
        if not os.path.exists(filepath) :
            raise ValueError('The path specified does not exist.')

    # read the directory structure and chip it --
    # the chipper's parameters arrive as a dict under the 'kwargs' key
    if os.path.isdir(filepaths[0]) :
        filepath = hdf5Dataset(filepaths, batchSize=batchSize, log=log,
                               chipFunc=chipFunc, **kwargs['kwargs'])
    else :
        filepath = filepaths[0]

    # Load the dataset to memory
    train = readPickleZip(filepath, log)

    # calculate the memory needed by this dataset
    dt = 4. if t.config.floatX == 'float32' else 8.
    dataMemoryConsumption = np.prod(np.asarray(
        train.shape, dtype=np.float32)) * dt

    # check physical memory constraints
    shared = checkAvailableMemory(dataMemoryConsumption, shared, log)

    # load each into shared variables -- 
    # this avoids having to copy the data to the GPU between each call
    if shared is True :
        from dataset.shared import toShared
        if log is not None :
            log.debug('Transfer the memory into shared variables')
        try :
            return toShared(train)
        except Exception :
            # fall back to returning host memory if the device copy fails
            pass
    return train
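
A hedged sketch of a chipping function and call -- regularGrid and its chipSize parameter are assumptions standing in for whatever chipping utility the codebase actually provides. Note that the chipper's parameters are passed as a dict under the keyword 'kwargs':

def regularGrid(image, chipSize=(50, 50)) :
    '''Return non-overlapping chipSize tiles from a (channels, rows, cols)
       image, discarding partial tiles at the borders.'''
    rows, cols = image.shape[-2:]
    return [image[..., r:r + chipSize[0], c:c + chipSize[1]]
            for r in range(0, rows - chipSize[0] + 1, chipSize[0])
            for c in range(0, cols - chipSize[1] + 1, chipSize[1])]

train = ingestImagery(['/data/unlabeled'], shared=True, batchSize=100,
                      chipFunc=regularGrid, kwargs={'chipSize': (50, 50)})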
Example #5
    def softTarget(self, inputs):
        '''Output the soft target from the network.'''
        self._startProfile('Classifying the Inputs', 'debug')
        if not hasattr(self, '_softTarget'):
            from dataset.shared import toShared, isShared
            inp = toShared(inputs, borrow=True) \
                  if not isShared(inputs) else inputs
            self.finalizeNetwork(inp[:])

        # activating the last layer triggers all previous
        # layers due to dependencies we've enforced
        softTarget = self._softTarget(inputs)
        self._endProfile()
        return softTarget
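
Hypothetical call -- the network object and input shape are assumptions; the first layer defines the real dimensions. The same lazy pattern (finalize on first use via hasattr) recurs in the classify, encode, and classifyAndSoftmax examples that follow:

import numpy as np
batch = np.random.rand(100, 1, 28, 28).astype(np.float32)
soft = network.softTarget(batch)   # one soft-label vector per input row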
Example #6
    def encode(self, inputs) :
        '''Encode the given inputs. The input is assumed to be a
           numpy.ndarray with dimensions specified by the first layer of the
           network. The output is the encoded representation of the input.
        '''
        self._startProfile('Encoding the Inputs', 'debug')
        if not hasattr(self, '_encode') :
            from dataset.shared import toShared, isShared
            inp = toShared(inputs, borrow=True) \
                  if not isShared(inputs) else inputs
            self.finalizeNetwork(inp[:])

        # activating the last layer triggers all previous 
        # layers due to dependencies we've enforced
        enc = self._encode(inputs)
        self._endProfile()
        return enc
Example #7
    def classify(self, inputs):
        '''Classify the given inputs. The input is assumed to be a
           numpy.ndarray with dimensions specified by the first layer of the 
           network. The output is the index of the softmax classification.
        '''
        self._startProfile('Classifying the Inputs', 'debug')
        if not hasattr(self, '_classify'):
            from dataset.shared import toShared, isShared
            inp = toShared(inputs, borrow=True) \
                  if not isShared(inputs) else inputs
            self.finalizeNetwork(inp[:])

        # activating the last layer triggers all previous
        # layers due to dependencies we've enforced
        classIndex = self._classify(inputs)
        self._endProfile()
        return classIndex
Example #8
    def checkAccuracy(self) :
        '''Check the accuracy against the pre-compiled test set.
           This runs against the entire test set in a single call and returns
           the current accuracy of the network [0%:100%].
        '''
        self._startProfile('Checking Accuracy', 'debug')
        if not hasattr(self, '_checkAccuracy') :
            from dataset.shared import toShared, isShared
            inp = toShared(self._trainData[0], borrow=True) \
                  if not isShared(self._trainData) else self._trainData[0]
            self.finalizeNetwork(inp[:])

        # return the sum of all correctly classified targets
        acc = 0.0
        for ii in range(self._numTestBatches) :
            acc += float(self._checkAccuracy(ii))

        self._endProfile()
        return acc / float(self._numTestSize) * 100.
Example #9
    def classifyAndSoftmax(self, inputs) :
        '''Classify the given inputs. The input is assumed to be a
           numpy.ndarray with dimensions specified by the first layer of the
           network.

           return : (classification index, softmax vector)
        '''
        self._startProfile('Classifying the Inputs', 'debug')
        if not hasattr(self, '_classifyAndSoftmax') :
            from dataset.shared import toShared, isShared
            inp = toShared(inputs, borrow=True) \
                  if not isShared(inputs) else inputs
            self.finalizeNetwork(inp[:])

        # activating the last layer triggers all previous
        # layers due to dependencies we've enforced
        classIndex, softmax = self._classifyAndSoftmax(inputs)
        self._endProfile()
        return classIndex, softmax
Example #10
    def train(self, index) :
        '''Train the network against the pre-loaded inputs. This accepts
           a batch index into the pre-compiled input and expectedOutput sets.

           NOTE: Class labels for expectedOutput are assumed to be [0,1]
        '''
        self._startProfile('Training Batch [' + str(index) +
                           '/' + str(self._numTrainBatches) + ']', 'debug')
        if not hasattr(self, '_trainNetwork') :
            from dataset.shared import toShared, isShared
            inp = toShared(self._trainData[0], borrow=True) \
                  if not isShared(self._trainData) else self._trainData[0]
            self.finalizeNetwork(inp[:])
        if not isinstance(index, int) :
            raise ValueError('Variable index must be an integer value')
        if index >= self._numTrainBatches :
            raise IndexError('Variable index out of range for numBatches')

        # train the input --
        # the user decides if this is online or batch training
        self._trainNetwork(index)
        self._endProfile()
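
A sketch of how train() and checkAccuracy() combine into an epoch loop -- the network object and epoch count are assumptions, and _numTrainBatches is the private counter used above (a public accessor may exist):

best = 0.
for epoch in range(100) :
    # one pass over every pre-loaded mini-batch
    for ii in range(network._numTrainBatches) :
        network.train(ii)
    # evaluate on the held-out set and track the best epoch
    accuracy = network.checkAccuracy()
    best = max(best, accuracy)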
Example #11
    def closeness(self, inputs, cosineVector=None) :
        '''This is a form of classification for SAE networks. The network has
           been provided a target input, which we now use to determine the
           similarity of this input against that target set.

           inputs      : Example imagery to test for closeness.
                         (batchSize, numChannels, rows, cols)
           cosineVector: Pre-initialized vector. Use this when the input needs
                         to be biased, or if you are normalizing the responses
                         from several networks.

           return      : The calculation returns a value between [0., 1.] for
                         each input. If the user specifies a cosineVector, the
                         responses from this network are added to the previous
                         vector. If cosineVector is None, the network's raw
                         responses are returned.

           NOTE: A response of 1.0 indicates equality. Lower numbers indicate
                 less overlap between features.
        '''

        self._startProfile('Determining Closeness of Inputs', 'debug')
        if not hasattr(self, '_closeness') :
            from dataset.shared import toShared, isShared
            inp = toShared(inputs, borrow=True) \
                  if not isShared(inputs) else inputs
            self.finalizeNetwork(inp[:])
        if not hasattr(self, '_targetEncodings') :
            raise ValueError('User must load the feature matrix before ' +
                             'attempting to test for closeness.')

        # test how similar this input is to the targets
        if cosineVector is not None :
            cosineVector += self._closeness(inputs, self._numTargets)
        else :
            cosineVector = self._closeness(inputs, self._numTargets)
        self._endProfile()
        return cosineVector
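
Normalizing responses across several networks via cosineVector, per the docstring -- the ensemble list and input batch are assumptions:

# accumulate each network's response into one vector, then average
cosineVector = None
for net in ensemble :
    cosineVector = net.closeness(batch, cosineVector)
cosineVector /= float(len(ensemble))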
Example #12
    def closeness(self, inputs, cosineVector=None):
        '''This is a form of classification for SAE networks. The network has
           been provided a target input, which we now use to determine the
           similarity of this input against that target set.

           inputs      : Example imagery to test for closeness.
                         (batchSize, numChannels, rows, cols)
           cosineVector: Pre-initialized vector. Use this when the input needs
                         to be biased, or if you are normalizing the responses
                         from several networks.

           return      : The calculation returns a value between [0., 1.] for
                         each input. If the user specifies a cosineVector, the
                         responses from this network are added to the previous
                         vector. If cosineVector is None, the network's raw
                         responses are returned.

           NOTE: A response of 1.0 indicates equality. Lower numbers indicate
                 less overlap between features.
        '''
        from dataset.shared import toShared, isShared
        self._startProfile('Determining Closeness of Inputs', 'debug')
        if not hasattr(self, '_closeness'):
            inp = toShared(inputs, borrow=True) \
                  if not isShared(inputs) else inputs
            self.finalizeNetwork(inp[:])
        if not hasattr(self, '_targetEncodings'):
            raise ValueError('User must load the feature matrix before ' +
                             'attempting to test for closeness.')

        # test how similar this input is to the targets
        if cosineVector is not None:
            cosineVector += self._closeness(inputs)
        else:
            cosineVector = self._closeness(inputs)
        self._endProfile()
        return cosineVector
Example #13
    def finalizeNetwork(self, networkInput):
        '''Setup the network based on the current network configuration.
           This creates several network-wide functions so they will be
           pre-compiled and optimized when we need them.
        '''
        from theano import dot, function
        import theano.tensor as t
        from dataset.shared import toShared
        import numpy as np

        if len(self._layers) == 0:
            raise IndexError('Network must have at least one layer ' +
                             'to call getNetworkInput().')

        self._startProfile('Finalizing Network', 'info')

        # disable the profiler temporarily so we don't get a second entry
        tmp = self._profiler
        self._profiler = None
        SAENetwork.finalizeNetwork(self, networkInput)
        self._profiler = tmp

        # ensure targetData is at least one batchSize, otherwise enlarge
        batchSize = networkInput.shape.eval()[0]
        numTargets = self._targetData.shape[0]
        if numTargets < batchSize:
            # add rows of zeros to fill out the rest of the batch
            self._targetData = np.resize(
                np.append(
                    np.zeros([batchSize - numTargets] +
                             list(self._targetData.shape[1:]), np.float32),
                    self._targetData),
                [batchSize] + list(self._targetData.shape[1:]))

        # produce the encoded feature matrix --
        # this matrix will be used for all closeness calculations
        #
        # classify the inputs one batch at a time
        enc = []
        for ii in range(int(numTargets / batchSize)):
            enc.extend(
                self.encode(self._targetData[ii * batchSize:(ii + 1) *
                                             batchSize]))

        # run one last batch and collect the remainder --
        # this is also used if there is less than one batch worth of targets
        remainder = numTargets % batchSize
        if remainder > 0:
            enc.extend(self.encode(self._targetData[-batchSize:])[-remainder:])

        # reduce the encodings to only check against unique vectors --
        # this is an optimization as many examples could be encoded to
        # the same example vector.
        def uniqueRows(a):
            a = np.ascontiguousarray(a)
            unique_a = np.unique(a.view([('', a.dtype)] * a.shape[1]))
            return unique_a.view(a.dtype).reshape(
                (unique_a.shape[0], a.shape[1]))

        enc = uniqueRows(enc)

        # NOTE: this is the transpose to orient for matrix multiplication
        self._targetEncodings = toShared(enc, borrow=True).T

        # TODO: Check if this should be the raw logit from the output layer or
        #       the softmax return of the output layer.
        # TODO: This needs to be updated to handle matrix vs matrix cosine
        #       similarities between all pairs of vectors
        # setup the closeness execution graph based on target information
        targets = t.fmatrix('targets')
        outClass = self.getNetworkOutput()[0]
        cosineSimilarity = dot(outClass, targets) / \
            (t.sqrt(t.sum(outClass**2)) * (t.sqrt(t.sum(targets**2))))
        self._closeness = function([self.getNetworkInput()[0]],
                                   t.mean(cosineSimilarity, axis=1),
                                   givens={targets: self._targetEncodings})
        self._endProfile()
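
The theano graph above computes a cosine similarity between the batch outputs and the stored target encodings, then averages over targets. A plain numpy rendering of the same arithmetic, including the global (per-matrix rather than per-row) normalization the TODOs flag as approximate:

import numpy as np

def closenessSketch(outClass, targetEncodings) :
    '''outClass: (batchSize, encodeSize); targetEncodings: (encodeSize,
       numTargets). Mirrors the graph: one norm per matrix, not per row.'''
    cosine = np.dot(outClass, targetEncodings) / \
             (np.sqrt(np.sum(outClass ** 2)) *
              np.sqrt(np.sum(targetEncodings ** 2)))
    return np.mean(cosine, axis=1)   # (batchSize,) mean response per input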
Example #14
    def __init__(self, maxTargets, filepath=None, prof=None, debug=False) :
        import theano
        import numpy as np
        from dataset.shared import toShared
        SAENetwork.__init__(self, filepath, prof, debug)
        self._numTargets = 0
        # preallocate the encoding matrix at its maximum size --
        # one column per target, one row per element of the encoding
        self._targetEncodings = toShared(np.zeros(
            tuple([np.prod(self.getNetworkOutputSize()[1:]), maxTargets]),
            dtype=theano.config.floatX), borrow=False)