def ingestImagery(filepath, shared=True, holdoutPercentage=.05, minTest=5,
                  batchSize=1, log=None) :
    '''Load the labeled dataset into memory. This is formatted such that the
       directory structure becomes the labels, and all imagery within the
       directory will be assigned this label. All images in any directory
       are required to have the same dimensions.

       filepath          : This can be a cPickle file or a path to the
                           directory structure.
       shared            : Load data into shared variables for training --
                           NOTE: this is only a user suggestion. However the
                                 size of the data will ultimately determine
                                 how it's loaded.
       holdoutPercentage : Percentage of the data to holdout for testing
       minTest           : Hard minimum on holdout if percentage is low
       batchSize         : Size of a mini-batch
       log               : Logger for tracking the progress
       return : (trainData, testData)
    '''
    import numpy as np
    import theano.tensor as t
    from dataset.ingest.preprocHDF5 import reuseableIngest, \
                                           checkAvailableMemory
    from dataset.shared import toShared

    # Load the dataset to memory
    train, test, labels = reuseableIngest(filepath=filepath,
                                          holdoutPercentage=holdoutPercentage,
                                          minTest=minTest,
                                          batchSize=batchSize,
                                          saveLabels=False,
                                          log=log)
    train, test = train[0], test[0]

    # calculate the memory needed by this dataset
    floatsize = float(np.dtype(t.config.floatX).itemsize)
    dataMemoryConsumption = \
        np.prod(np.asarray(train.shape, dtype=np.float32)) * floatsize + \
        np.prod(np.asarray(test.shape, dtype=np.float32)) * floatsize

    # check physical memory constraints
    shared = checkAvailableMemory(dataMemoryConsumption, shared, log)

    # load each into shared variables --
    # this avoids having to copy the data to the GPU between each call
    if shared is True :
        if log is not None :
            log.debug('Transfer the memory into shared variables')
        try :
            tr = toShared(train, log=log)
            te = toShared(test, log=log)
            return tr, te
        except Exception :
            # fall back to returning host memory if the transfer fails
            pass

    return train, test
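# A minimal, illustrative sketch of the holdout semantics described above:
# holdoutPercentage reserves a fraction of the imagery for testing, while
# minTest enforces a hard floor when the percentage rounds down too far. The
# helper below and the 'data/labeled/' path are hypothetical; the exact
# rounding inside reuseableIngest may differ.
def holdoutCount(numExamples, holdoutPercentage=.05, minTest=5) :
    '''Illustrative only: how many examples the ingest would hold out.'''
    return max(minTest, int(numExamples * holdoutPercentage))

assert holdoutCount(1000) == 50   # the percentage dominates on large sets
assert holdoutCount(20) == 5      # minTest dominates on small sets

# train, test = ingestImagery('data/labeled/', shared=True,
#                             holdoutPercentage=.05, minTest=5, batchSize=100)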
def __setstate__(self, state) :
    '''Load layer pickle'''
    import numpy as np
    import theano
    from dataset.shared import toShared
    self.__dict__.update(state)
    self._learningRate = theano.shared(np.float32(self._learningRate))
    self._momentumRate = theano.shared(np.float32(self._momentumRate))

    # NOTE: this is saving to a secondary variable to allow
    #       borrowing the memory.
    initialWeights = self._weights
    self._weights = toShared(initialWeights)
    initialThresholds = self._thresholds
    self._thresholds = toShared(initialThresholds)

    # convert back to a theano operation --
    # convertActivation is assumed to be available at module scope
    self._activation = convertActivation(self._activation)
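# The counterpart __getstate__ typically strips the GPU-backed shared
# variables down to raw numpy arrays before pickling, and __setstate__
# rebuilds them (via toShared above). A minimal sketch of that round-trip
# pattern, with plain numpy standing in for theano.shared; the class and
# attribute names here are illustrative, not the library's own:
import pickle
import numpy as np

class ToyLayer(object) :
    def __init__(self) :
        # stands in for a theano shared variable
        self._weights = np.ones((2, 2), dtype=np.float32)
    def __getstate__(self) :
        state = self.__dict__.copy()
        state['_weights'] = np.asarray(state['_weights'])  # shared -> ndarray
        return state
    def __setstate__(self, state) :
        self.__dict__.update(state)
        # the real code would call toShared(...) here to move back to the GPU

layer = pickle.loads(pickle.dumps(ToyLayer()))
assert layer._weights.dtype == np.float32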
def checkReconstructionLoss(self, layerIndex) :
    '''Check the reconstruction cost of the layer/network against the test
       set. This runs against the entire test set in a single call and
       returns the current loss of the layer/network [0:inf].
    '''
    from dataset.shared import toShared, isShared
    self._startProfile('Checking Reconstruction Loss', 'debug')
    if len(self._checkGreedy) == 0 :
        inp = toShared(self._testData[0], borrow=True) \
              if not isShared(self._testData) else self._testData[0]
        self.finalizeNetwork(inp[:])

    # WARNING: there is something strange going on in the interaction
    #          between theano and its usage with a list of lambdas. In
    #          normal cases it would be better not to build this lambda
    #          JIT, however this bug forces my hand.
    #          At least we can still get around the if/else tight inner loop
    checkGreedy = (lambda l, x: self._checkGreedy[l](x)) \
                  if isShared(self._testData) else \
                  (lambda l, x: self._checkGreedy[l](self._testData[x]))

    # check the reconstruction error --
    # the user decides whether this will be a greedy or network check by
    # passing in a layer index. If the index does not have an associated
    # layer, it automatically chooses network-wide training.
    loss = 0.0
    for ii in range(self._numTestBatches) :
        loss += float(checkGreedy(layerIndex, ii))

    self._endProfile()
    return loss
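# Why the parentheses around each lambda above matter: without them, the
# conditional expression binds inside the first lambda's body, so the
# else-branch returns the second lambda instead of a loss value. A small
# standalone demonstration of the precedence pitfall:
flag = False
buggy = lambda x: x + 1 if flag else lambda x: x - 1
fixed = (lambda x: x + 1) if flag else (lambda x: x - 1)
assert callable(buggy(0))   # returns the inner lambda, not a number
assert fixed(0) == -1       # selects the correct function up front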
def ingestImagery(filepaths, shared=True, batchSize=1, log=None,
                  chipFunc=None, **kwargs) :
    '''Load the unlabeled dataset into memory. This reads and chips any
       imagery found within the filepaths according to the options sent to
       the function.

       filepaths : This can be an hdf5 file, a path to a directory
                   structure, or a list of directories containing files.
       shared    : Load data into shared variables for training
       batchSize : Size of a mini-batch
       log       : Logger for tracking the progress
       chipFunc  : Chipping utility to use on each image
       kwargs    : Parameters specific to the chipping function
       return    : (trainingData, pixelRegion={None})
    '''
    import os
    import numpy as np
    import theano.tensor as t
    from dataset.pickle import readPickleZip
    from dataset.ingest.labeled import checkAvailableMemory

    if not isinstance(filepaths, list) :
        filepaths = [filepaths]
    filepaths = [os.path.abspath(d) for d in filepaths]

    # verify all paths exist
    for filepath in filepaths :
        if not os.path.exists(filepath) :
            raise ValueError('The path specified does not exist.')

    # read the directory structure and chip it --
    # hdf5Dataset is assumed to be imported at module scope, and the
    # chipping parameters are nested under kwargs['kwargs']
    if os.path.isdir(filepaths[0]) :
        filepath = hdf5Dataset(filepaths, batchSize=batchSize, log=log,
                               chipFunc=chipFunc, **kwargs['kwargs'])
    else :
        filepath = filepaths[0]

    # Load the dataset to memory
    train = readPickleZip(filepath, log)

    # calculate the memory needed by this dataset
    dt = 4. if t.config.floatX == 'float32' else 8.
    dataMemoryConsumption = np.prod(np.asarray(
        train.shape, dtype=np.float32)) * dt

    # check physical memory constraints
    shared = checkAvailableMemory(dataMemoryConsumption, shared, log)

    # load each into shared variables --
    # this avoids having to copy the data to the GPU between each call
    if shared is True :
        from dataset.shared import toShared
        if log is not None :
            log.debug('Transfer the memory into shared variables')
        try :
            return toShared(train)
        except Exception :
            # fall back to returning host memory if the transfer fails
            pass
    return train
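# The memory estimate above is just element count times bytes per element.
# A standalone check of that arithmetic for a hypothetical chip stack of
# 100,000 chips of 3x64x64, comparing the float32 and float64 footprints:
import numpy as np
shape = (100000, 3, 64, 64)
float32Bytes = np.prod(np.asarray(shape, dtype=np.float32)) * 4.
float64Bytes = np.prod(np.asarray(shape, dtype=np.float32)) * 8.
assert float64Bytes == 2 * float32Bytes
print('float32: %.1f GiB' % (float32Bytes / 2 ** 30))  # ~4.6 GiB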
def softTarget(self, inputs):
    '''Output the soft target from the network.
    '''
    self._startProfile('Classifying the Inputs', 'debug')
    if not hasattr(self, '_softTarget'):
        from dataset.shared import toShared, isShared
        inp = toShared(inputs, borrow=True) \
              if not isShared(inputs) else inputs
        self.finalizeNetwork(inp[:])

    # activating the last layer triggers all previous
    # layers due to dependencies we've enforced
    softTarget = self._softTarget(inputs)
    self._endProfile()
    return softTarget
def encode(self, inputs) :
    '''Encode the given inputs. The input is assumed to be
       numpy.ndarray with dimensions specified by the first layer of the
       network. The output is the encoding produced at the deepest layer
       of the network.
    '''
    from dataset.shared import toShared, isShared
    self._startProfile('Encoding the Inputs', 'debug')
    if not hasattr(self, '_encode') :
        inp = toShared(inputs, borrow=True) \
              if not isShared(inputs) else inputs
        self.finalizeNetwork(inp[:])

    # activating the last layer triggers all previous
    # layers due to dependencies we've enforced
    enc = self._encode(inputs)
    self._endProfile()
    return enc
def classify(self, inputs):
    '''Classify the given inputs. The input is assumed to be
       numpy.ndarray with dimensions specified by the first layer of the
       network. The output is the index of the softmax classification.
    '''
    self._startProfile('Classifying the Inputs', 'debug')
    if not hasattr(self, '_classify'):
        from dataset.shared import toShared, isShared
        inp = toShared(inputs, borrow=True) \
              if not isShared(inputs) else inputs
        self.finalizeNetwork(inp[:])

    # activating the last layer triggers all previous
    # layers due to dependencies we've enforced
    classIndex = self._classify(inputs)
    self._endProfile()
    return classIndex
def checkAccuracy(self) :
    '''Check the accuracy against the pre-compiled test set. This runs
       against the entire test set in a single call and returns the
       current accuracy of the network [0%:100%].
    '''
    self._startProfile('Checking Accuracy', 'debug')
    if not hasattr(self, '_checkAccuracy') :
        from dataset.shared import toShared, isShared
        inp = toShared(self._trainData[0], borrow=True) \
              if not isShared(self._trainData) else self._trainData[0]
        self.finalizeNetwork(inp[:])

    # return the sum of all correctly classified targets
    acc = 0.0
    for ii in range(self._numTestBatches) :
        acc += float(self._checkAccuracy(ii))

    self._endProfile()
    return acc / float(self._numTestSize) * 100.
def classifyAndSoftmax(self, inputs) :
    '''Classify the given inputs. The input is assumed to be
       numpy.ndarray with dimensions specified by the first layer of the
       network.

       return : (classification index, softmax vector)
    '''
    self._startProfile('Classifying the Inputs', 'debug')
    if not hasattr(self, '_classifyAndSoftmax') :
        from dataset.shared import toShared, isShared
        inp = toShared(inputs, borrow=True) \
              if not isShared(inputs) else inputs
        self.finalizeNetwork(inp[:])

    # activating the last layer triggers all previous
    # layers due to dependencies we've enforced
    classIndex, softmax = self._classifyAndSoftmax(inputs)
    self._endProfile()
    return classIndex, softmax
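# classifyAndSoftmax returns two views of the same output: the class index
# is the argmax of the softmax vector. A standalone numpy illustration of
# that relationship (the logit values here are made up):
import numpy as np
logits = np.array([1.0, 3.0, 0.5], dtype=np.float32)
softmax = np.exp(logits - logits.max())
softmax /= softmax.sum()
assert np.argmax(softmax) == np.argmax(logits) == 1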
def train(self, index) :
    '''Train the network against the pre-loaded inputs. This accepts
       a batch index into the pre-compiled input and expectedOutput sets.

       NOTE: Class labels for expectedOutput are assumed to be [0,1]
    '''
    self._startProfile('Training Batch [' + str(index) +
                       '/' + str(self._numTrainBatches) + ']', 'debug')
    if not hasattr(self, '_trainNetwork') :
        from dataset.shared import toShared, isShared
        inp = toShared(self._trainData[0], borrow=True) \
              if not isShared(self._trainData) else self._trainData[0]
        self.finalizeNetwork(inp[:])
    if not isinstance(index, int) :
        raise TypeError('Variable index must be an integer value')
    if index >= self._numTrainBatches :
        raise IndexError('Variable index out of range for numBatches')

    # train the input --
    # the user decides if this is online or batch training
    self._trainNetwork(index)
    self._endProfile()
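# The index-based train() above implies an epoch loop like the following.
# StubTrainer is purely illustrative (the real object is the finalized
# network); it shows the contract: one call per mini-batch index, in range.
class StubTrainer(object) :
    _numTrainBatches = 10
    def train(self, index) :
        assert isinstance(index, int) and index < self._numTrainBatches

trainer = StubTrainer()
for epoch in range(5) :
    for ii in range(trainer._numTrainBatches) :
        trainer.train(ii)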
def closeness(self, inputs, cosineVector=None) :
    '''This is a form of classification for SAE networks. The network has
       been provided a target input, which we now use to determine the
       similarity of this input against that target set.

       inputs       : Example imagery to test for closeness.
                      (batchSize, numChannels, rows, cols)
       cosineVector : Pre-initialized vector. Use this when the input needs
                      to be biased, or if you are normalizing the responses
                      from several networks.

       return : The calculation returns a value between [0., 1.] for each
                input. If the user specifies a cosineVector, the responses
                from this network are added to the previous vector. If
                cosineVector is None, the network's raw responses are
                returned.

       NOTE: A response of 1.0 indicates equality. Lower numbers indicate
             less overlap between features.
    '''
    from dataset.shared import toShared, isShared
    self._startProfile('Determining Closeness of Inputs', 'debug')
    if not hasattr(self, '_closeness') :
        inp = toShared(inputs, borrow=True) \
              if not isShared(inputs) else inputs
        self.finalizeNetwork(inp[:])
    if not hasattr(self, '_targetEncodings') :
        raise ValueError('User must load the feature matrix before ' +
                         'attempting to test for closeness.')

    # test how similar this input is compared with the targets
    if cosineVector is not None :
        cosineVector += self._closeness(inputs, self._numTargets)
    else :
        cosineVector = self._closeness(inputs, self._numTargets)
    self._endProfile()
    return cosineVector
def closeness(self, inputs, cosineVector=None):
    '''This is a form of classification for SAE networks. The network has
       been provided a target input, which we now use to determine the
       similarity of this input against that target set.

       inputs       : Example imagery to test for closeness.
                      (batchSize, numChannels, rows, cols)
       cosineVector : Pre-initialized vector. Use this when the input needs
                      to be biased, or if you are normalizing the responses
                      from several networks.

       return : The calculation returns a value between [0., 1.] for each
                input. If the user specifies a cosineVector, the responses
                from this network are added to the previous vector. If
                cosineVector is None, the network's raw responses are
                returned.

       NOTE: A response of 1.0 indicates equality. Lower numbers indicate
             less overlap between features.
    '''
    from dataset.shared import toShared, isShared
    self._startProfile('Determining Closeness of Inputs', 'debug')
    if not hasattr(self, '_closeness'):
        inp = toShared(inputs, borrow=True) \
              if not isShared(inputs) else inputs
        self.finalizeNetwork(inp[:])
    if not hasattr(self, '_targetEncodings'):
        raise ValueError('User must load the feature matrix before ' +
                         'attempting to test for closeness.')

    # test how similar this input is compared with the targets
    if cosineVector is not None:
        cosineVector += self._closeness(inputs)
    else:
        cosineVector = self._closeness(inputs)
    self._endProfile()
    return cosineVector
def finalizeNetwork(self, networkInput):
    '''Setup the network based on the current network configuration.
       This creates several network-wide functions so they will be
       pre-compiled and optimized when we need them.
    '''
    import numpy as np
    import theano.tensor as t
    from theano import dot, function
    from dataset.shared import toShared

    if len(self._layers) == 0:
        raise IndexError('Network must have at least one layer ' +
                         'to call getNetworkInput().')

    self._startProfile('Finalizing Network', 'info')

    # disable the profiler temporarily so we don't get a second entry
    tmp = self._profiler
    self._profiler = None
    SAENetwork.finalizeNetwork(self, networkInput)
    self._profiler = tmp

    # ensure targetData is at least one batchSize, otherwise enlarge
    batchSize = networkInput.shape.eval()[0]
    numTargets = self._targetData.shape[0]
    if numTargets < batchSize:
        # add rows of zeros to fill out the rest of the batch
        self._targetData = np.resize(
            np.append(np.zeros([batchSize - numTargets] +
                               list(self._targetData.shape[1:]), np.float32),
                      self._targetData),
            [batchSize] + list(self._targetData.shape[1:]))

    # produce the encoded feature matrix --
    # this matrix will be used for all closeness calculations
    #
    # classify the inputs one batch at a time
    enc = []
    for ii in range(int(numTargets / batchSize)):
        enc.extend(self.encode(
            self._targetData[ii * batchSize:(ii + 1) * batchSize]))

    # run one last batch and collect the remainder --
    # this is also used if there is less than one batch worth of targets
    remainder = numTargets % batchSize
    if remainder > 0:
        enc.extend(self.encode(self._targetData[-batchSize:])[-remainder:])

    # reduce the encodings to only check against unique vectors --
    # this is an optimization as many examples could be encoded to
    # the same example vector.
    def uniqueRows(a):
        a = np.ascontiguousarray(a)
        unique_a = np.unique(a.view([('', a.dtype)] * a.shape[1]))
        return unique_a.view(a.dtype).reshape(
            (unique_a.shape[0], a.shape[1]))
    enc = uniqueRows(enc)

    # NOTE: this is the transpose to orient for matrix multiplication
    self._targetEncodings = toShared(enc, borrow=True).T

    # TODO: Check if this should be the raw logit from the output layer or
    #       the softmax return of the output layer.
    # TODO: This needs to be updated to handle matrix vs matrix cosine
    #       similarities between all pairs of vectors

    # setup the closeness execution graph based on target information
    targets = t.fmatrix('targets')
    outClass = self.getNetworkOutput()[0]
    cosineSimilarity = dot(outClass, targets) / \
        (t.sqrt(t.sum(outClass ** 2)) * (t.sqrt(t.sum(targets ** 2))))
    self._closeness = function([self.getNetworkInput()[0]],
                               t.mean(cosineSimilarity, axis=1),
                               givens={targets: self._targetEncodings})
    self._endProfile()
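# A standalone numpy sketch of the similarity graph compiled above: each
# input encoding is dotted against every column of the target matrix, scaled
# by the product of the whole-matrix L2 norms, and the result is averaged
# across targets. (This mirrors the TODO note: it normalizes by matrix-wide
# norms rather than per-pair vector norms.) Shapes here are made up.
import numpy as np
outClass = np.random.rand(4, 8).astype(np.float32)   # (batchSize, features)
targets = np.random.rand(8, 3).astype(np.float32)    # (features, numTargets)
cosine = np.dot(outClass, targets) / \
         (np.sqrt(np.sum(outClass ** 2)) * np.sqrt(np.sum(targets ** 2)))
closeness = np.mean(cosine, axis=1)                  # one response per input
assert closeness.shape == (4,)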
def __init__(self, maxTargets, filepath=None, prof=None, debug=False) :
    SAENetwork.__init__(self, filepath, prof, debug)

    # preallocate a fixed-width encoding matrix --
    # one column per target, sized to the network's output dimensions
    self._numTargets = 0
    self._targetEncodings = toShared(np.zeros(
        tuple([np.prod(self.getNetworkOutputSize()[1:]), maxTargets]),
        dtype=theano.config.floatX), borrow=False)