class KNNClassifier(object):
  """k Nearest Neighbor Classifier"""

  def __init__(self,
               k=1,
               exact=False,
               distanceNorm=2.0,
               distanceMethod="norm",
               distThreshold=0,
               doBinarization=False,
               binarizationThreshold=0.5,
               useSparseMemory=True,
               sparseThreshold=0.1,
               relativeThreshold=False,
               numWinners=0,
               numSVDSamples=None,
               numSVDDims=None,
               fractionOfMax=None,
               verbosity=0,
               maxStoredPatterns=-1,
               replaceDuplicates=False,
               cellsPerCol=0):
    """Constructor for the kNN classifier.

    @param k (int) The number of nearest neighbors used in the classification
        of patterns. Must be odd.

    @param exact (boolean) If true, patterns must match exactly when assigning
        class labels.

    @param distanceNorm (int) When distance method is "norm", this specifies
        the p value of the Lp-norm.

    @param distanceMethod (string) The method used to compute distance between
        input patterns and prototype patterns. The possible options are:
        "norm": When distanceNorm is 2, this is the euclidean distance,
                When distanceNorm is 1, this is the manhattan distance.
                In general: sum(abs(x-proto) ^ distanceNorm) ^ (1/distanceNorm)
                The distances are normalized such that the farthest prototype
                from a given input is 1.0.
        "rawOverlap": Only appropriate when inputs are binary. This computes:
                (width of the input) -
                (# bits of overlap between input and prototype).
        "pctOverlapOfInput": Only appropriate for binary inputs. This computes
                1.0 - (# bits overlap between input and prototype) /
                      (# ON bits in input)
        "pctOverlapOfProto": Only appropriate for binary inputs. This computes
                1.0 - (# bits overlap between input and prototype) /
                      (# ON bits in prototype)
        "pctOverlapOfLarger": Only appropriate for binary inputs. This computes
                1.0 - (# bits overlap between input and prototype) /
                      max(# ON bits in input, # ON bits in prototype)

    @param distThreshold (float) A threshold on the distance between learned
        patterns and a new pattern proposed to be learned. The distance must
        be greater than this threshold in order for the new pattern to be
        added to the classifier's memory.

    @param doBinarization (boolean) If True, then scalar inputs will be
        binarized.

    @param binarizationThreshold (float) If doBinarization is True, this
        specifies the threshold for the binarization of inputs.

    @param useSparseMemory (boolean) If True, the classifier will use a sparse
        memory matrix.

    @param sparseThreshold (float) If useSparseMemory is True, input variables
        whose absolute values are less than this threshold will be stored as
        zero.

    @param relativeThreshold (boolean) Flag specifying whether to multiply
        sparseThreshold by the max value in the input.

    @param numWinners (int) Number of elements of the input that are stored.
        If 0, all elements are stored.

    @param numSVDSamples (int) Number of samples that must occur before an SVD
        (Singular Value Decomposition) transformation will be performed. If 0,
        the transformation will never be performed.

    @param numSVDDims (string) Controls the number of dimensions kept after
        the SVD transformation. If "adaptive", the number is chosen
        automatically.

    @param fractionOfMax (float) If numSVDDims is "adaptive", this controls
        the smallest singular value that is retained as a fraction of the
        largest singular value.

    @param verbosity (int) Console verbosity level where 0 is no output and
        larger integers provide increasing levels of verbosity.

    @param maxStoredPatterns (int) Limits the maximum number of the training
        patterns stored. When KNN learns in a fixed capacity mode, the unused
        patterns are deleted once the number of stored patterns is greater
        than maxStoredPatterns. A value of -1 is no limit.

    @param replaceDuplicates (bool) A boolean flag that determines whether,
        during learning, the classifier replaces duplicates that match
        exactly, even if distThreshold is 0. Should be True for online
        learning.

    @param cellsPerCol (int) If >= 1, the input is assumed to be organized
        into columns, in the same manner as the temporal pooler, AND whenever
        a new prototype is stored, only the start cell (first cell) is stored
        in any bursting column.
    """
    self.version = KNNCLASSIFIER_VERSION

    self.k = k
    self.exact = exact
    self.distanceNorm = distanceNorm
    assert (distanceMethod in ("norm", "rawOverlap", "pctOverlapOfLarger",
                               "pctOverlapOfProto", "pctOverlapOfInput"))
    self.distanceMethod = distanceMethod
    self.distThreshold = distThreshold
    self.doBinarization = doBinarization
    self.binarizationThreshold = binarizationThreshold
    self.useSparseMemory = useSparseMemory
    self.sparseThreshold = sparseThreshold
    self.relativeThreshold = relativeThreshold
    self.numWinners = numWinners
    self.numSVDSamples = numSVDSamples
    self.numSVDDims = numSVDDims
    self.fractionOfMax = fractionOfMax
    if self.numSVDDims == "adaptive":
      self._adaptiveSVDDims = True
    else:
      self._adaptiveSVDDims = False
    self.verbosity = verbosity
    self.replaceDuplicates = replaceDuplicates
    self.cellsPerCol = cellsPerCol
    self.maxStoredPatterns = maxStoredPatterns
    self.clear()


  def clear(self):
    """Clears the state of the KNNClassifier."""
    self._Memory = None
    self._numPatterns = 0
    self._M = None
    self._categoryList = []
    self._partitionIdList = []
    self._partitionIdArray = None
    self._finishedLearning = False
    self._iterationIdx = -1

    # Fixed capacity KNN
    if self.maxStoredPatterns > 0:
      assert self.useSparseMemory, ("Fixed capacity KNN is implemented only "
                                    "in the sparse memory mode")
      self.fixedCapacity = True
      self._categoryRecencyList = []
    else:
      self.fixedCapacity = False

    # Cached value of the stored prototype sizes
    self._protoSizes = None

    # Used by PCA
    self._s = None
    self._vt = None
    self._nc = None
    self._mean = None

    # Used by Network Builder
    self._specificIndexTraining = False
    self._nextTrainingIndices = None


  def _doubleMemoryNumRows(self):
    m = 2 * self._Memory.shape[0]
    n = self._Memory.shape[1]
    self._Memory = numpy.resize(self._Memory, (m, n))
    self._M = self._Memory[:self._numPatterns]


  def _sparsifyVector(self, inputPattern, doWinners=False):
    # Do sparsification, using a relative or absolute threshold
    if not self.relativeThreshold:
      inputPattern = inputPattern * (abs(inputPattern) > self.sparseThreshold)
    elif self.sparseThreshold > 0:
      inputPattern = inputPattern * \
          (abs(inputPattern) > (self.sparseThreshold *
                                abs(inputPattern).max()))

    # Do winner-take-all
    if doWinners:
      if (self.numWinners > 0) and \
          (self.numWinners < (inputPattern > 0).sum()):
        sparseInput = numpy.zeros(inputPattern.shape)
        # Don't consider strongly negative numbers as winners.
        sorted = inputPattern.argsort()[0:self.numWinners]
        sparseInput[sorted] += inputPattern[sorted]
        inputPattern = sparseInput

    # Do binarization
    if self.doBinarization:
      # Don't binarize negative numbers to positive 1.
      inputPattern = (inputPattern > self.binarizationThreshold).astype(float)

    return inputPattern
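
  # A minimal sketch of what _sparsifyVector does to a dense input, assuming
  # the default sparseThreshold=0.1 and numWinners=0 (illustration only, not
  # part of the class API):
  #
  #   >>> clf = KNNClassifier(sparseThreshold=0.1)
  #   >>> v = numpy.array([0.05, 0.5, 0.02, 0.3])
  #   >>> clf._sparsifyVector(v)
  #   array([ 0. ,  0.5,  0. ,  0.3])   # entries with |x| <= 0.1 are zeroed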

  def prototypeSetCategory(self, idToRelabel, newCategory):
    if idToRelabel not in self._categoryRecencyList:
      return

    recordIndex = self._categoryRecencyList.index(idToRelabel)
    self._categoryList[recordIndex] = newCategory


  def removeIds(self, idsToRemove):
    # Form a list of the row indices of all patterns to remove
    rowsToRemove = [k for k, rowID in enumerate(self._categoryRecencyList)
                    if rowID in idsToRemove]

    # Remove rows from the classifier
    self._removeRows(rowsToRemove)


  def removeCategory(self, categoryToRemove):
    removedRows = 0
    if self._Memory is None:
      return removedRows

    # The internal category indices are stored in float
    # format, so we should compare with a float
    catToRemove = float(categoryToRemove)

    # Form a list of the row indices of all patterns in this category
    rowsToRemove = [k for k, catID in enumerate(self._categoryList)
                    if catID == catToRemove]

    # Remove rows from the classifier
    self._removeRows(rowsToRemove)
    assert catToRemove not in self._categoryList


  def _removeRows(self, rowsToRemove):
    # Form a numpy array of row indices to be removed
    removalArray = numpy.array(rowsToRemove)

    # Remove categories
    self._categoryList = numpy.delete(numpy.array(self._categoryList),
                                      removalArray).tolist()

    # The recency list only exists in fixed capacity mode
    if self.fixedCapacity:
      self._categoryRecencyList = numpy.delete(
          numpy.array(self._categoryRecencyList), removalArray).tolist()

    # Remove the partition ID, if any
    if self._partitionIdArray is not None:
      self._partitionIdArray = numpy.delete(self._partitionIdArray,
                                            removalArray)

    # Remove actual patterns
    if self.useSparseMemory:
      # Delete backwards so earlier row indices stay valid
      for rowIndex in rowsToRemove[::-1]:
        self._Memory.deleteRow(rowIndex)
    else:
      self._M = numpy.delete(self._M, removalArray, 0)

    numRemoved = len(rowsToRemove)

    # Sanity checks
    numRowsExpected = self._numPatterns - numRemoved
    if self.useSparseMemory:
      if self._Memory is not None:
        assert self._Memory.nRows() == numRowsExpected
    else:
      assert self._M.shape[0] == numRowsExpected
    assert len(self._categoryList) == numRowsExpected
    assert self._partitionIdArray is None or \
        self._partitionIdArray.shape[0] == numRowsExpected

    self._numPatterns -= numRemoved
    return numRemoved


  def doIteration(self):
    """Utility method to increment the iteration index. Intended for models
    that don't learn each timestep.
    """
    self._iterationIdx += 1

  def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0,
            rowID=None):
    """Train the classifier to associate the specified input pattern with a
    particular category.

    @param inputPattern (list) The pattern to be assigned a category. If
        isSparse is 0, this should be a dense array (both ON and OFF bits
        present). Otherwise, if isSparse > 0, this should be a list of the
        indices of the non-zero bits in sorted order.

    @param inputCategory (int) The category to be associated with the
        training pattern.

    @param partitionId (int) partitionID allows you to partition the data set
        by associating unique IDs with sets of vectors. One use case is to
        ignore a specific set of vectors during inference for k-fold cross
        validation (see the description of infer() for further details). This
        is an optional parameter.

    @param isSparse (int) If 0, the input pattern is a dense representation.
        If isSparse > 0, the input pattern is a list of non-zero indices and
        isSparse is the length of the dense representation.

    @param rowID (int) UNKNOWN

    @return The number of patterns currently stored in the classifier
    """
    if self.verbosity >= 1:
      print "%s learn:" % g_debugPrefix
      print "  category:", int(inputCategory)
      print "  active inputs:", _labeledInput(inputPattern,
                                              cellsPerCol=self.cellsPerCol)

    if rowID is None:
      rowID = self._iterationIdx

    # Dense vectors
    if not self.useSparseMemory:

      # Not supported
      assert self.cellsPerCol == 0, "not implemented for dense vectors"

      # If the input was given in sparse form, convert it to dense
      if isSparse > 0:
        denseInput = numpy.zeros(isSparse)
        denseInput[inputPattern] = 1.0
        inputPattern = denseInput

      if self._specificIndexTraining and not self._nextTrainingIndices:
        # Specific index mode without any index provided - skip training
        return self._numPatterns

      if self._Memory is None:
        # Initialize memory with 100 rows and numPatterns = 0
        inputWidth = len(inputPattern)
        self._Memory = numpy.zeros((100, inputWidth))
        self._numPatterns = 0
        self._M = self._Memory[:self._numPatterns]

      addRow = True

      if self._vt is not None:
        # Compute projection
        inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

      if self.distThreshold > 0:
        # Check if the input is too close to an existing input to be accepted
        dist = self._calcDistance(inputPattern)
        minDist = dist.min()
        addRow = (minDist >= self.distThreshold)

      if addRow:
        self._protoSizes = None     # need to re-compute
        if self._numPatterns == self._Memory.shape[0]:
          # Double the size of the memory
          self._doubleMemoryNumRows()

        if not self._specificIndexTraining:
          # Normal learning - append the new input vector
          self._Memory[self._numPatterns] = inputPattern
          self._numPatterns += 1
          self._categoryList.append(int(inputCategory))
        else:
          # Specific index training mode - insert the vector in the
          # specified slot
          vectorIndex = self._nextTrainingIndices.pop(0)
          while vectorIndex >= self._Memory.shape[0]:
            self._doubleMemoryNumRows()
          self._Memory[vectorIndex] = inputPattern
          self._numPatterns = max(self._numPatterns, vectorIndex + 1)
          if vectorIndex >= len(self._categoryList):
            self._categoryList += [-1] * (vectorIndex -
                                          len(self._categoryList) + 1)
          self._categoryList[vectorIndex] = int(inputCategory)

        # Set _M to the "active" part of _Memory
        self._M = self._Memory[0:self._numPatterns]

        if partitionId is not None:
          self._partitionIdList.append(partitionId)

    # Sparse vectors
    else:

      # If the input was given in sparse form, convert it to dense if
      # necessary
      if isSparse > 0 and (self._vt is not None or self.distThreshold > 0 or
                           self.numSVDDims is not None or
                           self.numSVDSamples is not None or
                           self.numWinners > 0):
        denseInput = numpy.zeros(isSparse)
        denseInput[inputPattern] = 1.0
        inputPattern = denseInput
        isSparse = 0

      # Get the input width
      if isSparse > 0:
        inputWidth = isSparse
      else:
        inputWidth = len(inputPattern)

      # Allocate storage if this is the first training vector
      if self._Memory is None:
        self._Memory = NearestNeighbor(0, inputWidth)

      # Support SVD if it is on
      if self._vt is not None:
        inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

      # Threshold the input, zeroing out entries that are too close to 0.
      # This is only done if we are given a dense input.
      if isSparse == 0:
        thresholdedInput = self._sparsifyVector(inputPattern, True)
      addRow = True

      # If given the layout of the cells, then turn on the logic that stores
      # only the start cell for bursting columns.
      if self.cellsPerCol >= 1:
        burstingCols = thresholdedInput.reshape(
            -1, self.cellsPerCol).min(axis=1).nonzero()[0]
        for col in burstingCols:
          thresholdedInput[(col * self.cellsPerCol) + 1:
                           (col * self.cellsPerCol) + self.cellsPerCol] = 0

      # Don't learn entries that are too close to existing entries.
      if self._Memory.nRows() > 0:
        dist = None
        # If this vector is a perfect match for one we already learned, then
        # replace the category - it may have changed with online learning on.
        if self.replaceDuplicates:
          dist = self._calcDistance(thresholdedInput, distanceNorm=1)
          if dist.min() == 0:
            rowIdx = dist.argmin()
            self._categoryList[rowIdx] = int(inputCategory)
            if self.fixedCapacity:
              self._categoryRecencyList[rowIdx] = rowID
            addRow = False

        # Don't add this vector if it matches closely with another we already
        # added
        if self.distThreshold > 0:
          if dist is None or self.distanceNorm != 1:
            dist = self._calcDistance(thresholdedInput)
          minDist = dist.min()
          addRow = (minDist >= self.distThreshold)
          if not addRow:
            if self.fixedCapacity:
              rowIdx = dist.argmin()
              self._categoryRecencyList[rowIdx] = rowID

      # Add the new vector to our storage
      if addRow:
        self._protoSizes = None     # need to re-compute
        if isSparse == 0:
          self._Memory.addRow(thresholdedInput)
        else:
          self._Memory.addRowNZ(inputPattern, [1] * len(inputPattern))
        self._numPatterns += 1
        self._categoryList.append(int(inputCategory))
        if partitionId is not None:
          self._partitionIdList.append(partitionId)
        if self.fixedCapacity:
          self._categoryRecencyList.append(rowID)
          if self._numPatterns > self.maxStoredPatterns and \
              self.maxStoredPatterns > 0:
            leastRecentlyUsedPattern = numpy.argmin(self._categoryRecencyList)
            self._Memory.deleteRow(leastRecentlyUsedPattern)
            self._categoryList.pop(leastRecentlyUsedPattern)
            self._categoryRecencyList.pop(leastRecentlyUsedPattern)
            self._numPatterns -= 1

    if self.numSVDDims is not None and self.numSVDSamples is not None \
        and self._numPatterns == self.numSVDSamples:
      self.computeSVD()

    return self._numPatterns
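
  # A minimal sketch of the two input forms learn() accepts, assuming the
  # default sparse memory mode (illustration only; the category values are
  # arbitrary). learn() returns the number of stored patterns:
  #
  #   >>> clf = KNNClassifier(k=1)
  #   >>> clf.learn(numpy.array([1.0, 0.0, 1.0, 0.0]), 0)   # dense form
  #   1
  #   >>> clf.learn([1, 3], 1, isSparse=4)   # ON bits 1 and 3 of a width-4 row
  #   2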

  def getOverlaps(self, inputPattern):
    """Return the degree of overlap between an input pattern and each category
    stored in the classifier. The overlap is obtained by computing:

      logical_and(inputPattern != 0, trainingPattern != 0).sum()

    @param inputPattern pattern to check overlap of

    @return (overlaps, categories) Two numpy arrays of the same length:
        overlaps: an integer overlap amount for each category
        categories: category index for each element of overlaps
    """
    assert self.useSparseMemory, "Not implemented yet for dense storage"

    overlaps = self._Memory.rightVecSumAtNZ(inputPattern)
    return (overlaps, self._categoryList)


  def getDistances(self, inputPattern):
    """Return the distances between the input pattern and all other
    stored patterns.

    @param inputPattern pattern to check distance with

    @return (distances, categories) Two numpy arrays of the same length:
        distances: the distance from the input pattern to each stored pattern
        categories: category index for each element of distances
    """
    dist = self._getDistances(inputPattern)
    return (dist, self._categoryList)

  def infer(self, inputPattern, computeScores=True, overCategories=True,
            partitionId=None):
    """Finds the category that best matches the input pattern. Returns the
    winning category index as well as a distribution over all categories.

    @param inputPattern (list) A pattern to be classified

    @param computeScores NO EFFECT

    @param overCategories NO EFFECT

    @param partitionId (int) If provided, all training vectors with
        partitionId equal to that of the input pattern are ignored. For
        example, this may be used to perform k-fold cross validation without
        repopulating the classifier. First partition all the data into k equal
        partitions numbered 0, 1, 2, ... and then call learn() for each vector
        passing in its partitionId. Then, during inference, by passing in the
        partition ID in the call to infer(), all other vectors with the same
        partitionId are ignored, simulating the effect of repopulating the
        classifier while omitting the training vectors in the same partition.

    This method returns a 4-tuple: (winner, inferenceResult, dist,
    categoryDist)
      winner: The category with the greatest number of nearest neighbors
          within the kth nearest neighbors. If the inferenceResult contains
          no neighbors, the value of winner is None; this applies to the case
          of exact matching.
      inferenceResult: A list of length numCategories, each entry contains
          the number of neighbors within the top k neighbors that are in that
          category.
      dist: A list of length numPrototypes. Each entry is the distance from
          the unknown to that prototype. All distances are between 0.0 and
          1.0.
      categoryDist: A list of length numCategories. Each entry is the
          distance from the unknown to the nearest prototype of that
          category. All distances are between 0 and 1.0.
    """
    if len(self._categoryList) == 0:
      # No categories learned yet; i.e. first inference with online learning.
      winner = 0
      inferenceResult = numpy.zeros(1)
      dist = numpy.ones(1)
      categoryDist = numpy.ones(1)
    else:
      maxCategoryIdx = max(self._categoryList)
      inferenceResult = numpy.zeros(maxCategoryIdx + 1)
      dist = self._getDistances(inputPattern, partitionId=partitionId)
      validVectorCount = len(self._categoryList) - \
          self._categoryList.count(-1)

      # Loop through the indices of the nearest neighbors.
      if self.exact:
        # Is there an exact match in the distances?
        exactMatches = numpy.where(dist < 0.00001)[0]
        if len(exactMatches) > 0:
          for i in exactMatches[:min(self.k, validVectorCount)]:
            inferenceResult[self._categoryList[i]] += 1.0
      else:
        sorted = dist.argsort()
        for j in sorted[:min(self.k, validVectorCount)]:
          inferenceResult[self._categoryList[j]] += 1.0

      # Prepare inference results.
      if inferenceResult.any():
        winner = inferenceResult.argmax()
        inferenceResult /= inferenceResult.sum()
      else:
        winner = None
      categoryDist = min_score_per_category(maxCategoryIdx,
                                            self._categoryList, dist)
      categoryDist.clip(0, 1.0, categoryDist)

    if self.verbosity >= 1:
      print "%s infer:" % (g_debugPrefix)
      print "  active inputs:", _labeledInput(inputPattern,
                                              cellsPerCol=self.cellsPerCol)
      print "  winner category:", winner
      print "  pct neighbors of each category:", inferenceResult
      print "  dist of each prototype:", dist
      print "  dist of each category:", categoryDist

    result = (winner, inferenceResult, dist, categoryDist)
    return result
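
  # A minimal sketch of consuming infer()'s 4-tuple (illustration only;
  # "clf" and "pattern" are assumed to exist):
  #
  #   >>> winner, inferenceResult, dist, categoryDist = clf.infer(pattern)
  #   >>> winner                  # most common category among the k neighbors
  #   >>> categoryDist.argmin()   # category owning the closest prototype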
""" inferenceResult = numpy.zeros(max(self._categoryList) + 1) dist = self._getDistances(inputPattern) sorted = dist.argsort() validVectorCount = len( self._categoryList) - self._categoryList.count(-1) for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 winner = inferenceResult.argmax() topNCats = [] for i in range(topKCategories): topNCats.append((self._categoryList[sorted[i]], dist[sorted[i]])) return winner, dist, topNCats def closestTrainingPattern(self, inputPattern, cat): """Returns the closest training pattern to inputPattern that belongs to category "cat". @param inputPattern The pattern whose closest neighbor is sought @param cat The required category of closest neighbor @return A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern belongs to desired category, return it if patternCat == cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def closestOtherTrainingPattern(self, inputPattern, cat): """Return the closest training pattern that is *not* of the given category "cat". @param inputPattern The pattern whose closest neighbor is sought @param cat Training patterns of this category will be ignored no matter their distance to inputPattern @return A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern does not belong to specified category, return it if patternCat != cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def getPattern(self, idx, sparseBinaryForm=False, cat=None): """Gets a training pattern either by index or category number. @param idx Index of the training pattern @param sparseBinaryForm If true, returns a list of the indices of the non-zero bits in the training pattern @param cat If not None, get the first pattern belonging to category cat. If this is specified, idx must be None. @return The training pattern with specified index """ if cat is not None: assert idx is None idx = self._categoryList.index(cat) if not self.useSparseMemory: pattern = self._Memory[idx] if sparseBinaryForm: pattern = pattern.nonzero()[0] else: (nz, values) = self._Memory.rowNonZeros(idx) if not sparseBinaryForm: pattern = numpy.zeros(self._Memory.nCols()) numpy.put(pattern, nz, 1) else: pattern = nz return pattern def _calcDistance(self, inputPattern, distanceNorm=None): """Calculate the distances from inputPattern to all stored patterns. 

  def _calcDistance(self, inputPattern, distanceNorm=None):
    """Calculate the distances from inputPattern to all stored patterns. All
    distances are between 0.0 and 1.0.

    @param inputPattern The pattern from which distances to all other
        patterns are calculated

    @param distanceNorm Degree of the distance norm
    """
    if distanceNorm is None:
      distanceNorm = self.distanceNorm

    # Sparse memory
    if self.useSparseMemory:
      if self._protoSizes is None:
        self._protoSizes = self._Memory.rowSums()
      overlapsWithProtos = self._Memory.rightVecSumAtNZ(inputPattern)
      inputPatternSum = inputPattern.sum()

      if self.distanceMethod == "rawOverlap":
        dist = inputPatternSum - overlapsWithProtos
      elif self.distanceMethod == "pctOverlapOfInput":
        dist = inputPatternSum - overlapsWithProtos
        if inputPatternSum > 0:
          dist /= inputPatternSum
      elif self.distanceMethod == "pctOverlapOfProto":
        overlapsWithProtos /= self._protoSizes
        dist = 1.0 - overlapsWithProtos
      elif self.distanceMethod == "pctOverlapOfLarger":
        maxVal = numpy.maximum(self._protoSizes, inputPatternSum)
        if maxVal.all():
          overlapsWithProtos /= maxVal
        dist = 1.0 - overlapsWithProtos
      elif self.distanceMethod == "norm":
        dist = self._Memory.vecLpDist(self.distanceNorm, inputPattern)
        distMax = dist.max()
        if distMax > 0:
          dist /= distMax
      else:
        raise RuntimeError("Unimplemented distance method %s" %
                           self.distanceMethod)

    # Dense memory
    else:
      if self.distanceMethod == "norm":
        dist = numpy.power(numpy.abs(self._M - inputPattern),
                           self.distanceNorm)
        dist = dist.sum(1)
        dist = numpy.power(dist, 1.0 / self.distanceNorm)
        dist /= dist.max()
      else:
        raise RuntimeError("Not implemented yet for dense storage....")

    return dist


  def _getDistances(self, inputPattern, partitionId=None):
    """Return the distances from inputPattern to all stored patterns.

    @param inputPattern The pattern from which distances to all other
        patterns are returned

    @param partitionId If provided, ignore all training vectors with this
        partitionId.
    """
    if not self._finishedLearning:
      self.finishLearning()
      self._finishedLearning = True

    if self._vt is not None and len(self._vt) > 0:
      inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

    sparseInput = self._sparsifyVector(inputPattern)

    # Compute distances
    dist = self._calcDistance(sparseInput)

    # Invalidate results where the category is -1
    if self._specificIndexTraining:
      dist[numpy.array(self._categoryList) == -1] = numpy.inf

    # Ignore vectors with the same partition id
    if self._partitionIdArray is not None:
      dist[self._partitionIdArray == partitionId] = numpy.inf

    return dist


  def finishLearning(self):
    if self.numSVDDims is not None and self._vt is None:
      self.computeSVD()

    # Check if our partition ID list is non-trivial
    # (i.e., whether it contains at least two different
    # partition IDs)
    if self._partitionIdList:
      partitions = set(self._partitionIdList)
      if len(partitions) > 1:
        # Compile into a numpy array
        self._partitionIdArray = numpy.array(self._partitionIdList)
      else:
        # Trivial partitions; ignore
        self._partitionIdArray = None
      # Either way, we don't need the original list
      self._partitionIdList = []
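
  # A worked example of the overlap-based distances on binary vectors
  # (illustration only). For input = [1, 1, 0, 1] (3 ON bits) and a stored
  # prototype = [1, 0, 0, 1] (2 ON bits), the overlap is 2 bits, so:
  #
  #   rawOverlap:         3 - 2                  = 1
  #   pctOverlapOfInput:  (3 - 2) / 3            = 0.333...
  #   pctOverlapOfProto:  1.0 - 2 / 2            = 0.0
  #   pctOverlapOfLarger: 1.0 - 2 / max(3, 2)    = 0.333...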
""" # We need to convert the partition ID array back into a list if hasattr(self, "_partitionIdArray"): # In the case of trivial partitions, we need to regenerate # the "null" partition ID if self._partitionIdArray is None: self._partitionIdList = [0] * self._numPatterns else: self._partitionIdList = self._partitionIdArray.tolist() def computeSVD(self, numSVDSamples=None, finalize=True): if numSVDSamples is None: numSVDSamples = self._numPatterns if not self.useSparseMemory: self._a = self._Memory[:self._numPatterns] else: self._a = self._Memory.toDense()[:self._numPatterns] self._mean = numpy.mean(self._a, axis=0) self._a -= self._mean u, self._s, self._vt = numpy.linalg.svd(self._a[:numSVDSamples]) if finalize: self.finalizeSVD() return self._s def getAdaptiveSVDDims(self, singularValues, fractionOfMax=0.001): v = singularValues / singularValues[0] idx = numpy.where(v < fractionOfMax)[0] if len(idx): print "Number of PCA dimensions chosen: ", idx[0], "out of ", len( v) return idx[0] else: print "Number of PCA dimensions chosen: ", len( v) - 1, "out of ", len(v) return len(v) - 1 def finalizeSVD(self, numSVDDims=None): if numSVDDims is not None: self.numSVDDims = numSVDDims if self.numSVDDims == "adaptive": if self.fractionOfMax is not None: self.numSVDDims = self.getAdaptiveSVDDims( self._s, self.fractionOfMax) else: self.numSVDDims = self.getAdaptiveSVDDims(self._s) if self._vt.shape[0] < self.numSVDDims: print "******************************************************************" print( "Warning: The requested number of PCA dimensions is more than " "the number of pattern dimensions.") print "Setting numSVDDims = ", self._vt.shape[0] print "******************************************************************" self.numSVDDims = self._vt.shape[0] self._vt = self._vt[:self.numSVDDims] # Added when svd is not able to decompose vectors - uses raw spare vectors if len(self._vt) == 0: return self._Memory = numpy.zeros((self._numPatterns, self.numSVDDims)) self._M = self._Memory self.useSparseMemory = False for i in range(self._numPatterns): self._Memory[i] = numpy.dot(self._vt, self._a[i]) self._a = None def leaveOneOutTest(self): """Run leave-one-out testing. Returns the total number of samples and the number correctly classified. Ignores invalid vectors (those with a category of -1). Uses partitionIdList, if non-empty, to avoid matching a vector against other vectors that came from the same training sequence. """ if self.useSparseMemory: raise Exception( "leaveOneOutTest only works with dense memory right now") # The basic test is simple, but we need to prepare some data structures to # handle _specificIndexTraining and _partitionIdList categoryListArray = numpy.array(self._categoryList[:self._M.shape[0]]) if self._specificIndexTraining: # Find valid and invalid vectors using the category list validIndices = (categoryListArray != -1) invalidIndices = (categoryListArray == -1) # Convert list of partitions to numpy array if we haven't # already done so. 
partitionIdArray = None if hasattr(self, "_partitionIdArray") and \ self._partitionIdArray is not None: partitionIdArray = self._partitionIdArray elif self._partitionIdList: # Use the partition id list partitionIdArray = numpy.array(self._partitionIdList) # Find the winning vector for each cache vector, excluding itself, # excluding invalid vectors, and excluding other vectors with the # same partition id winners = numpy.zeros(self._M.shape[0], numpy.int32) for i in xrange(self._M.shape[0]): if self._specificIndexTraining \ and categoryListArray[i] == -1: # This is an invalid vector continue # Calculate distance between this vector and all others distances = numpy.power(numpy.abs(self._M - self._M[i, :]), self.distanceNorm) distances = distances.sum(1) # Invalidate certain vectors by setting their distance to infinity if self._specificIndexTraining: distances[invalidIndices] = numpy.inf # Ignore invalid vectors if partitionIdArray is not None: # Ignore vectors with same partition id distances[partitionIdArray == partitionIdArray[i]] = numpy.inf else: distances[i] = numpy.inf # Don't match vector with itself if self.k == 1: # Take the closest vector as the winner (k=1) winners[i] = distances.argmin() else: # Have the top k winners vote on the category categoryScores = numpy.zeros(categoryListArray.max() + 1) for j in xrange(self.k): winner = distances.argmin() distances[winner] = numpy.inf categoryScores[categoryListArray[winner]] += 1 winners[i] = categoryScores.argmax() if self.k == 1: # Convert the winners (vector IDs) to their category indices # For k > 1, the winners are already category indices winners = categoryListArray[winners] if self._specificIndexTraining: # Count the number of correct categories, ignoring invalid vectors matches = ( winners[validIndices] == categoryListArray[validIndices]) else: # Count the number of correct categories matches = (winners == categoryListArray) # number of samples, number correct return float(matches.shape[0]), matches.sum() def remapCategories(self, mapping): """Change the category indices. Used by the Network Builder to keep the category indices in sync with the ImageSensor categoryInfo when the user renames or removes categories. @param mapping List of new category indices. For example, mapping=[2,0,1] would change all vectors of category 0 to be category 2, category 1 to 0, and category 2 to 1 """ categoryArray = numpy.array(self._categoryList) newCategoryArray = numpy.zeros(categoryArray.shape[0]) newCategoryArray.fill(-1) for i in xrange(len(mapping)): newCategoryArray[categoryArray == i] = mapping[i] self._categoryList = list(newCategoryArray) def setCategoryOfVectors(self, vectorIndices, categoryIndices): """Change the category associated with this vector(s). Used by the Network Builder to move vectors between categories, to enable categories, and to invalidate vectors by setting the category to -1. @param vectorIndices Single index or list of indices @param categoryIndices Single index or list of indices. 
Can also be a single index when vectorIndices is a list, in which case the same category will be used for all vectors """ if not hasattr(vectorIndices, "__iter__"): vectorIndices = [vectorIndices] categoryIndices = [categoryIndices] elif not hasattr(categoryIndices, "__iter__"): categoryIndices = [categoryIndices] * len(vectorIndices) for i in xrange(len(vectorIndices)): vectorIndex = vectorIndices[i] categoryIndex = categoryIndices[i] # Out-of-bounds is not an error, because the KNN may not have seen the # vector yet if vectorIndex < len(self._categoryList): self._categoryList[vectorIndex] = categoryIndex def __getstate__(self): """Return serializable state. This function will return a version of the __dict__. """ state = self.__dict__.copy() return state def __setstate__(self, state): """Set the state of this object from a serialized state.""" if "version" not in state: pass elif state["version"] == 1: pass elif state["version"] == 2: raise RuntimeError( "Invalid deserialization of invalid KNNClassifier" "Verison") self.__dict__.update(state) # Set to new version self.version = KNNCLASSIFIER_VERSION
class KNNClassifier(object): """k nearest neighbor classifier""" def __init__(self, k=1, # The K in KNN exact=False, # Specifies exact matching for inferring class # labels. distanceNorm=2.0, # By default, we use L2 norm as distance metric distanceMethod='norm', # The method used to compute distance. See # below for options distThreshold=0, # Distance threshold for entering patterns doBinarization=False, # Inputs are binarized. binarizationThreshold=0.5, # Threshold for binarization of inputs. useSparseMemory=True, # Use sparse memory matrix sparseThreshold=0.1, # Anything below this threshold is considered # zero. relativeThreshold=False, # Multiply the threshold by the max input value numWinners=0, # Only numWinners elements of input are stored numSVDSamples=None, # Number of samples to do SVD after numSVDDims=None, # % of the dims to keep after SVD fractionOfMax=None, # The cut-off fraction in relation to the # largest singular value when adaptive # dimension selection is used. verbosity=0, # verbosity level (0: none, increasing integers # providing increasing levels of verbosity maxStoredPatterns=-1, # Limits the maximum number of the training # patterns stored. When KNN learns in a fixed # capacity mode, the unused patterns are # deleted once the number of stored patterns # is greater than maxStoredPatterns replaceDuplicates=False, # if true, during learning, replace existing # entries that match exactly, even if # distThreshold is 0. cellsPerCol=0, # if >=1, then only store the start cell in # any columns which are bursting. ): """ Constructor for the kNN classifier. distanceMethod -- method used to compute distance. Possible options are: 'norm': When distanceNorm is 2, this is the euclidean distance, When distanceNorm is 1, this is the manhattan distance In general: sum(abs(x-proto) ^ distanceNorm) ^ (1/distanceNorm) 'rawOverlap': Only appropriate when inputs are binary. This computes: (width of the input) - (# bits of overlap between input and prototype). 'pctOverlapOfLarger': Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / max(# bits in input, # bits in prototype) 'pctOverlapOfProto': Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / (# bits in prototype) distThreshold -- Distance Threshold. If a pattern that is less than distThreshold apart from the input pattern already exists in the kNN's memory, then the input pattern is not added to kNN's memory. 
""" self.version = KNNCLASSIFIER_VERSION self.k = k self.exact = exact self.distanceNorm = distanceNorm assert (distanceMethod in ('norm', 'rawOverlap', 'pctOverlapOfLarger', 'pctOverlapOfProto')) self.distanceMethod = distanceMethod self.distThreshold = distThreshold self.doBinarization = doBinarization self.binarizationThreshold = binarizationThreshold self.useSparseMemory = useSparseMemory self.sparseThreshold = sparseThreshold self.relativeThreshold = relativeThreshold self.numWinners = numWinners self.numSVDSamples = numSVDSamples self.numSVDDims = numSVDDims self.fractionOfMax = fractionOfMax if self.numSVDDims=='adaptive': self._adaptiveSVDDims = True else: self._adaptiveSVDDims = False self.verbosity = verbosity self.replaceDuplicates = replaceDuplicates self.cellsPerCol = cellsPerCol self.maxStoredPatterns = maxStoredPatterns self.clear() ########################################################################## def clear(self): self._Memory = None self._numPatterns = 0 self._M = None self._categoryList = [] self._partitionIdList = [] self._partitionIdArray = None self._finishedLearning = False self._iterationIdx = -1 # Fixed capacity KNN if self.maxStoredPatterns > 0: assert self.useSparseMemory, "Fixed capacity KNN is implemented only in" \ " the sparse memory mode" self.fixedCapacity = True self._categoryRecencyList = [] else: self.fixedCapacity = False # Cached value of the store prototype sizes self._protoSizes = None # Used by PCA self._s = None self._vt = None self._nc = None self._mean = None # Used by Network Builder self._specificIndexTraining = False self._nextTrainingIndices = None ########################################################################## def _doubleMemoryNumRows(self): m = 2*self._Memory.shape[0] n = self._Memory.shape[1] self._Memory = numpy.resize(self._Memory,(m,n)) self._M = self._Memory[:self._numPatterns] ########################################################################## def _sparsifyVector(self, inputPattern, doWinners=False): # Do sparsification, using a relative or absolute threshold if not self.relativeThreshold: inputPattern = inputPattern*(abs(inputPattern) > self.sparseThreshold) elif self.sparseThreshold > 0: inputPattern = inputPattern * \ (abs(inputPattern) > (self.sparseThreshold * abs(inputPattern).max())) # Do winner-take-all if doWinners: if (self.numWinners>0) and (self.numWinners < (inputPattern > 0).sum()): sparseInput = numpy.zeros(inputPattern.shape) # Don't consider strongly negative numbers as winners. sorted = inputPattern.argsort()[0:self.numWinners] sparseInput[sorted] += inputPattern[sorted] inputPattern = sparseInput # Do binarization if self.doBinarization: # Don't binarize negative numbers to positive 1. 
inputPattern = (inputPattern > self.binarizationThreshold).astype(float) return inputPattern def prototypeSetCategory(self, idToRelabel, newCategory): if idToRelabel not in self._categoryRecencyList: return recordIndex = self._categoryRecencyList.index(idToRelabel) self._categoryList[recordIndex] = newCategory def removeIds(self, idsToRemove): # Form a list of all rows to remove rowsToRemove = [k for k, rowID in enumerate(self._categoryRecencyList) \ if rowID in idsToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) ########################################################################## def removeCategory(self, categoryToRemove): removedRows = 0 if self._Memory is None: return removedRows # The internal category indices are stored in float # format, so we should compare with a float catToRemove = float(categoryToRemove) # Form a list of all rows matching the category to remove rowsToRemove = [k for k, catID in enumerate(self._categoryList) \ if catID == catToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) assert catToRemove not in self._categoryList def _removeRows(self, rowsToRemove): # Form a numpy array of row indices to be removed removalArray = numpy.array(rowsToRemove) # Remove categories self._categoryList = numpy.delete(numpy.array(self._categoryList), removalArray).tolist() # The recency list only exists in fixed-capacity mode if self.fixedCapacity: self._categoryRecencyList = numpy.delete(numpy.array(self._categoryRecencyList), removalArray).tolist() # Remove the partition ID, if any if self._partitionIdArray is not None: self._partitionIdArray = numpy.delete(self._partitionIdArray, removalArray) # Remove actual patterns if self.useSparseMemory: # Delete backwards for rowIndex in rowsToRemove[::-1]: self._Memory.deleteRow(rowIndex) else: self._M = numpy.delete(self._M, removalArray, 0) numRemoved = len(rowsToRemove) # Sanity checks numRowsExpected = self._numPatterns - numRemoved if self.useSparseMemory: if self._Memory is not None: assert self._Memory.nRows() == numRowsExpected else: assert self._M.shape[0] == numRowsExpected assert len(self._categoryList) == numRowsExpected assert self._partitionIdArray is None or \ self._partitionIdArray.shape[0] == numRowsExpected self._numPatterns -= numRemoved return numRemoved # Used to increment iteration for models that don't learn each timestep def doIteration(self): self._iterationIdx += 1 ########################################################################## def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0, rowID=None): """ Learn a new training presentation Parameters: ------------------------------------------------------------------------ inputPattern: training pattern to learn. This should be a dense array if isSparse==0 or a list of non-zero indices if isSparse>0 inputCategory: category index of the training pattern. partitionID: ?? isSparse: If >0, the input pattern is a list of non-zero indices and isSparse is the length of the dense representation. """ if self.verbosity >= 1: print "%s learn:" % (g_debugPrefix) print " category:", int(inputCategory) print " active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol) if rowID is None: rowID = self._iterationIdx assert partitionId is None, \ "No documentation is available for partitionId, not sure how it works."
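# Usage sketch (editor's illustration; assumes an already-constructed
# classifier instance named knn):
#   knn.learn(numpy.array([1., 0., 1., 0.]), inputCategory=0)  # dense form
#   knn.learn([0, 2], inputCategory=0, isSparse=4)             # same pattern, sparse form
# With isSparse > 0, inputPattern lists the indices of the ON bits and
# isSparse gives the width of the dense representation.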
#--------------------------------------------------------------------------------- # Dense vectors if not self.useSparseMemory: # Not supported assert self.cellsPerCol == 0, "not implemented for dense vectors" # If the input was given in sparse form, convert it to dense if isSparse > 0: denseInput = numpy.zeros(isSparse) denseInput[inputPattern] = 1.0 inputPattern = denseInput if self._specificIndexTraining and not self._nextTrainingIndices: # Specific index mode without any index provided - skip training return self._numPatterns if self._Memory is None: # Initialize memory with 100 rows and numPatterns = 0 inputWidth = len(inputPattern) self._Memory = numpy.zeros((100,inputWidth)) self._numPatterns = 0 self._M = self._Memory[:self._numPatterns] addRow = True if self._vt is not None: # Compute projection inputPattern = numpy.dot(self._vt, inputPattern - self._mean) if self.distThreshold > 0: # Check if input is too close to an existing input to be accepted dist = self._calcDistance(inputPattern) minDist = dist.min() addRow = (minDist >= self.distThreshold) if addRow: self._protoSizes = None # need to re-compute if self._numPatterns == self._Memory.shape[0]: # Double the size of the memory self._doubleMemoryNumRows() if not self._specificIndexTraining: # Normal learning - append the new input vector self._Memory[self._numPatterns] = inputPattern self._numPatterns += 1 self._categoryList.append(int(inputCategory)) else: # Specific index training mode - insert vector in specified slot vectorIndex = self._nextTrainingIndices.pop(0) while vectorIndex >= self._Memory.shape[0]: self._doubleMemoryNumRows() self._Memory[vectorIndex] = inputPattern self._numPatterns = max(self._numPatterns, vectorIndex + 1) if vectorIndex >= len(self._categoryList): self._categoryList += [-1] * (vectorIndex - len(self._categoryList) + 1) self._categoryList[vectorIndex] = int(inputCategory) # Set _M to the "active" part of _Memory self._M = self._Memory[0:self._numPatterns] if partitionId is not None: self._partitionIdList.append(partitionId) #--------------------------------------------------------------------------------- # Sparse vectors else: # If the input was given in sparse form, convert it to dense if necessary if isSparse > 0 and (self._vt is not None or self.distThreshold > 0 \ or self.numSVDDims is not None or self.numSVDSamples is not None \ or self.numWinners > 0): denseInput = numpy.zeros(isSparse) denseInput[inputPattern] = 1.0 inputPattern = denseInput isSparse = 0 # Get the input width if isSparse > 0: inputWidth = isSparse else: inputWidth = len(inputPattern) # Allocate storage if this is the first training vector if self._Memory is None: self._Memory = NearestNeighbor(0, inputWidth) # Support SVD if it is on if self._vt is not None: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) # Threshold the input, zeroing out entries that are too close to 0. # This is only done if we are given a dense input. if isSparse == 0: thresholdedInput = self._sparsifyVector(inputPattern, True) addRow = True # If given the layout of the cells, then turn on the logic that stores # only the start cell for bursting columns. if self.cellsPerCol >= 1: numCols = thresholdedInput.size / self.cellsPerCol burstingCols = thresholdedInput.reshape(-1, self.cellsPerCol).min(axis=1).nonzero()[0] for col in burstingCols: thresholdedInput[(col * self.cellsPerCol) + 1 : (col * self.cellsPerCol) + self.cellsPerCol] = 0 # Don't learn entries that are too close to existing entries. 
if self._Memory.nRows() > 0: dist = None # if this vector is a perfect match for one we already learned, then # replace the category - it may have changed with online learning on. if self.replaceDuplicates: dist = self._calcDistance(thresholdedInput, distanceNorm=1) if dist.min() == 0: rowIdx = dist.argmin() self._categoryList[rowIdx] = int(inputCategory) if self.fixedCapacity: self._categoryRecencyList[rowIdx] = rowID addRow = False # Don't add this vector if it matches closely with another we already # added if self.distThreshold > 0: if dist is None or self.distanceNorm != 1: dist = self._calcDistance(thresholdedInput) minDist = dist.min() addRow = (minDist >= self.distThreshold) if not addRow: if self.fixedCapacity: rowIdx = dist.argmin() self._categoryRecencyList[rowIdx] = rowID # Add the new vector to our storage if addRow: self._protoSizes = None # need to re-compute if isSparse == 0: self._Memory.addRow(thresholdedInput) else: self._Memory.addRowNZ(inputPattern, [1]*len(inputPattern)) self._numPatterns += 1 self._categoryList.append(int(inputCategory)) if partitionId is not None: self._partitionIdList.append(partitionId) if self.fixedCapacity: self._categoryRecencyList.append(rowID) if self._numPatterns > self.maxStoredPatterns and \ self.maxStoredPatterns > 0: leastRecentlyUsedPattern = numpy.argmin(self._categoryRecencyList) self._Memory.deleteRow(leastRecentlyUsedPattern) self._categoryList.pop(leastRecentlyUsedPattern) self._categoryRecencyList.pop(leastRecentlyUsedPattern) self._numPatterns -= 1 if self.numSVDDims is not None and self.numSVDSamples is not None \ and self._numPatterns == self.numSVDSamples: self.computeSVD() return self._numPatterns ########################################################################## def getOverlaps(self, inputPattern): """Return the overlap amount of the input pattern with each stored pattern. This returns 2 numpy arrays of the same length, the overlaps and the category numbers. The overlap is computed by computing: logical_and(inputPattern != 0, trainingPattern != 0).sum() Parameters: ------------------------------------------------------------------- inputPattern: pattern to check overlap of retval: (overlaps, categories) overlaps: an integer overlap amount for each stored pattern categories: category index for each element of overlaps """ assert self.useSparseMemory, "Not implemented yet for dense storage" overlaps = self._Memory.rightVecSumAtNZ(inputPattern) return (overlaps, self._categoryList) ########################################################################## def getDistances(self, inputPattern): """Return the distance between the input pattern and all other stored patterns. This returns 2 numpy arrays of the same length, the distances and the category numbers. Parameters: ------------------------------------------------------------------- inputPattern: pattern to check distance with retval: (distances, categories) distances: the distance from the input pattern to each stored pattern categories: category index for each element of distances """ dist = self._getDistances(inputPattern) return (dist, self._categoryList) ########################################################################## def infer(self, inputPattern, computeScores=True, overCategories=True, partitionId=None): """Find the category that best matches the input pattern. Returns the winning category index plus a distribution over all categories.
This method returns a 4 item tuple: (winner, inferenceResult, dist, categoryDist) winner: The category with the greatest number of nearest neighbors within the kth nearest neighbors. If the inferenceResult contains no neighbors, the value of winner is None; this applies to the case of exact matching. inferenceResult: A list of length numCategories, each entry contains the number of neighbors within the top k neighbors that are in that category dist: A list of length numPrototypes. Each entry is the distance from the unknown to that prototype. All distances are between 0 and 1.0 categoryDist: A list of length numCategories. Each entry is the distance from the unknown to the nearest prototype of that category. All distances are between 0 and 1.0. """ if len(self._categoryList) == 0: # No categories learned yet; i.e. first inference w/ online learning. winner = 0 inferenceResult = numpy.zeros(1) dist = numpy.ones(1) categoryDist = numpy.ones(1) else: maxCategoryIdx = max(self._categoryList) inferenceResult = numpy.zeros(maxCategoryIdx+1) dist = self._getDistances(inputPattern, partitionId=partitionId) validVectorCount = len(self._categoryList) - self._categoryList.count(-1) # Loop through the indices of the nearest neighbors. if self.exact: # Is there an exact match in the distances? exactMatches = numpy.where(dist<0.00001)[0] if len(exactMatches) > 0: for i in exactMatches[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[i]] += 1.0 else: sorted = dist.argsort() for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 # Prepare inference results. if inferenceResult.any(): winner = inferenceResult.argmax() inferenceResult /= inferenceResult.sum() else: winner = None categoryDist = min_score_per_category(maxCategoryIdx, self._categoryList, dist) categoryDist.clip(0, 1.0, categoryDist) if self.verbosity >= 1: print "%s infer:" % (g_debugPrefix) print " active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol) print " winner category:", winner print " pct neighbors of each category:", inferenceResult print " dist of each prototype:", dist print " dist of each category:", categoryDist result = (winner, inferenceResult, dist, categoryDist) return result ########################################################################## def getClosest(self, inputPattern, topKCategories = 3): """Return index to the pattern that is closest to inputPattern as well as indices to the topKCategories closest categories.""" inferenceResult = numpy.zeros(max(self._categoryList)+1) dist = self._getDistances(inputPattern) sorted = dist.argsort() validVectorCount = len(self._categoryList) - self._categoryList.count(-1) for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 winner = inferenceResult.argmax() topNCats = [] for i in range(topKCategories): topNCats.append((self._categoryList[sorted[i]], dist[sorted[i]] )) return winner, dist, topNCats ########################################################################## def closestTrainingPattern(self, inputPattern, cat): """ Return the training pattern belonging to the given category 'cat', that matches inputPattern the closest. 
inputPattern: the pattern to compare with cat: the category to consider retval: dense version of training pattern, None if no patterns found """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern belongs to desired category, return it if patternCat == cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None ########################################################################## def closestOtherTrainingPattern(self, inputPattern, cat): """ Return the closest training pattern that is *not* in the given category 'cat'. inputPattern: the pattern to compare with cat: the category to avoid retval: dense version of training pattern, None if no patterns found """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern does not belong to specified category, return it if patternCat != cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None ########################################################################## def getPattern(self, idx, sparseBinaryForm=False, cat=None): """Return a training pattern either by index or category number Parameters: ------------------------------------------------------------------------ idx: Index of the training pattern sparseBinaryForm: If true, return only a list of the non-zeros in the training pattern cat: If not None, get the first pattern belonging to category cat. If this is specified, idx must be None """ if cat is not None: assert idx is None idx = self._categoryList.index(cat) if not self.useSparseMemory: pattern = self._Memory[idx] if sparseBinaryForm: pattern = pattern.nonzero()[0] else: (nz, values) = self._Memory.rowNonZeros(idx) if not sparseBinaryForm: pattern = numpy.zeros(self._Memory.nCols()) numpy.put(pattern, nz, 1) else: pattern = nz return pattern ########################################################################## def _calcDistance(self, inputPattern, distanceNorm=None): """Calculate the distances from inputPattern to all stored patterns. 
The distances are all between 0 and 1.0""" if distanceNorm is None: distanceNorm = self.distanceNorm # Sparse memory if self.useSparseMemory: if self.distanceMethod == 'pctOverlapOfLarger': if self._protoSizes is None: self._protoSizes = self._Memory.rowSums() dist = self._Memory.rightVecSumAtNZ(inputPattern) maxVal = numpy.maximum(self._protoSizes, inputPattern.sum()) if maxVal.all() > 0: dist /= maxVal dist = 1.0 - dist elif self.distanceMethod == 'rawOverlap': if self._protoSizes is None: self._protoSizes = self._Memory.rowSums() inputPatternSum = inputPattern.sum() dist = (inputPatternSum - self._Memory.rightVecSumAtNZ(inputPattern)) if inputPatternSum > 0: dist /= inputPatternSum elif self.distanceMethod == 'pctOverlapOfProto': if self._protoSizes is None: self._protoSizes = self._Memory.rowSums() dist = self._Memory.rightVecSumAtNZ(inputPattern) dist /= self._protoSizes dist = 1.0 - dist elif self.distanceMethod == 'norm': dist = self._Memory.vecLpDist(self.distanceNorm, inputPattern) distMax = dist.max() if distMax > 0: dist /= distMax else: raise RuntimeError("Unimplemented distance method %s" % \ (self.distanceMethod)) # Dense memory else: if self.distanceMethod == 'norm': dist = numpy.power(numpy.abs(self._M - inputPattern), self.distanceNorm) dist = dist.sum(1) dist = numpy.power(dist, 1.0/self.distanceNorm) dist /= dist.max() else: raise RuntimeError("Not implemented yet for dense storage....") return dist ########################################################################## def _getDistances(self, inputPattern, partitionId=None): """Return distances from inputPattern to all stored patterns.""" if not self._finishedLearning: self.finishLearning() self._finishedLearning = True if self._vt is not None and len(self._vt) > 0: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) sparseInput = self._sparsifyVector(inputPattern) # Compute distances dist = self._calcDistance(sparseInput) # Invalidate results where category is -1 if self._specificIndexTraining: dist[numpy.array(self._categoryList) == -1] = numpy.inf # Ignore vectors with same partition id if self._partitionIdArray is not None: dist[self._partitionIdArray == partitionId] = numpy.inf return dist ########################################################################## def finishLearning(self): if self.numSVDDims is not None and self._vt is None: self.computeSVD() # Check if our partition ID list is non-trivial # (i.e., whether it contains at least two different # partition IDs) if self._partitionIdList: partitions = set(self._partitionIdList) if len(partitions) > 1: # Compile into a numpy array self._partitionIdArray = numpy.array(self._partitionIdList) else: # Trivial partitions; ignore self._partitionIdArray = None # Either way, we don't need the original list self._partitionIdList = [] ########################################################################## def restartLearning(self): """ This is only invoked if we have already called finishLearning() but now want to go back and provide more samples.
""" # We need to convert the partition ID array back into a list if hasattr(self, '_partitionIdArray'): # In the case of trivial partitions, we need to regenerate # the 'null' partition ID if self._partitionIdArray is None: self._partitionIdList = [0] * self._numPatterns else: self._partitionIdList = self._partitionIdArray.tolist() ########################################################################## def computeSVD(self, numSVDSamples=None, finalize=True): if numSVDSamples is None: numSVDSamples = self._numPatterns if not self.useSparseMemory: self._a = self._Memory[:self._numPatterns] else: self._a = self._Memory.toDense()[:self._numPatterns] self._mean = numpy.mean(self._a, axis=0) self._a -= self._mean u,self._s,self._vt = numpy.linalg.svd(self._a[:numSVDSamples]) if finalize: self.finalizeSVD() return self._s ########################################################################## def getAdaptiveSVDDims(self, singularValues, fractionOfMax=0.001): v = singularValues/singularValues[0] idx = numpy.where(v<fractionOfMax)[0] if len(idx): print "Number of PCA dimensions chosen: ", idx[0], "out of ", len(v) return idx[0] else: print "Number of PCA dimensions chosen: ", len(v)-1, "out of ", len(v) return len(v)-1 ########################################################################## def finalizeSVD(self, numSVDDims=None): if numSVDDims is not None: self.numSVDDims = numSVDDims if self.numSVDDims=='adaptive': if self.fractionOfMax is not None: self.numSVDDims = self.getAdaptiveSVDDims(self._s, self.fractionOfMax) else: self.numSVDDims = self.getAdaptiveSVDDims(self._s) if self._vt.shape[0] < self.numSVDDims: print "******************************************************************************" print "Warning: The requested number of PCA dimensions is more than the number of pattern dimensions." print "Setting numSVDDims = ", self._vt.shape[0] print "******************************************************************************" self.numSVDDims = self._vt.shape[0] self._vt = self._vt[:self.numSVDDims] # Added when svd is not able to decompose vectors - uses raw spare vectors if len(self._vt) == 0: return self._Memory = numpy.zeros((self._numPatterns,self.numSVDDims)) self._M = self._Memory self.useSparseMemory = False for i in range(self._numPatterns): self._Memory[i] = numpy.dot(self._vt, self._a[i]) self._a = None ########################################################################## def leaveOneOutTest(self): """ Run leave-one-out testing. Returns the total number of samples and the number correctly classified. Ignores invalid vectors (those with a category of -1). Uses partitionIdList, if non-empty, to avoid matching a vector against other vectors that came from the same training sequence. """ if self.useSparseMemory: raise Exception("leaveOneOutTest only works with dense memory right now") # The basic test is simple, but we need to prepare some data structures to # handle _specificIndexTraining and _partitionIdList categoryListArray = numpy.array(self._categoryList[:self._M.shape[0]]) if self._specificIndexTraining: # Find valid and invalid vectors using the category list validIndices = (categoryListArray != -1) invalidIndices = (categoryListArray == -1) # Convert list of partitions to numpy array if we haven't # already done so. 
partitionIdArray = None if hasattr(self, '_partitionIdArray') and \ self._partitionIdArray is not None: partitionIdArray = self._partitionIdArray elif self._partitionIdList: # Use the partition id list partitionIdArray = numpy.array(self._partitionIdList) # Find the winning vector for each cache vector, excluding itself, # excluding invalid vectors, and excluding other vectors with the # same partition id winners = numpy.zeros(self._M.shape[0], numpy.int32) for i in xrange(self._M.shape[0]): if self._specificIndexTraining \ and categoryListArray[i] == -1: # This is an invalid vector continue # Calculate distance between this vector and all others distances = numpy.power(numpy.abs(self._M - self._M[i,:]), self.distanceNorm) distances = distances.sum(1) # Invalidate certain vectors by setting their distance to infinity if self._specificIndexTraining: distances[invalidIndices] = numpy.inf # Ignore invalid vectors if partitionIdArray is not None: # Ignore vectors with same partition id distances[partitionIdArray == partitionIdArray[i]] = numpy.inf else: distances[i] = numpy.inf # Don't match vector with itself if self.k == 1: # Take the closest vector as the winner (k=1) winners[i] = distances.argmin() else: # Have the top k winners vote on the category categoryScores = numpy.zeros(categoryListArray.max() + 1) for j in xrange(self.k): winner = distances.argmin() distances[winner] = numpy.inf categoryScores[categoryListArray[winner]] += 1 winners[i] = categoryScores.argmax() if self.k == 1: # Convert the winners (vector IDs) to their category indices # For k > 1, the winners are already category indices winners = categoryListArray[winners] if self._specificIndexTraining: # Count the number of correct categories, ignoring invalid vectors matches = (winners[validIndices] == categoryListArray[validIndices]) else: # Count the number of correct categories matches = (winners == categoryListArray) # number of samples, number correct return float(matches.shape[0]), matches.sum() ########################################################################## def remapCategories(self, mapping): """ Change the category indices. mapping -- List of new category indices. For example, mapping=[2,0,1] would change all vectors of category 0 to be category 2, category 1 to 0, and category 2 to 1. Used by the Network Builder to keep the category indices in sync with the ImageSensor categoryInfo when the user renames or removes categories. """ categoryArray = numpy.array(self._categoryList) newCategoryArray = numpy.zeros(categoryArray.shape[0]) newCategoryArray.fill(-1) for i in xrange(len(mapping)): newCategoryArray[categoryArray==i] = mapping[i] self._categoryList = list(newCategoryArray) ########################################################################## def setCategoryOfVectors(self, vectorIndices, categoryIndices): """ Change the category associated with this vector(s). vectorIndices -- Single index or list of indices. categoryIndices -- Single index or list of indices. Can also be a single index when vectorIndices is a list, in which case the same category will be used for all vectors. Used by the Network Builder to move vectors between categories, to enable categories, and to invalidate vectors by setting the category to -1. 
""" if not hasattr(vectorIndices, '__iter__'): vectorIndices = [vectorIndices] categoryIndices = [categoryIndices] elif not hasattr(categoryIndices, '__iter__'): categoryIndices = [categoryIndices] * len(vectorIndices) for i in xrange(len(vectorIndices)): vectorIndex = vectorIndices[i] categoryIndex = categoryIndices[i] # Out-of-bounds is not an error, because the KNN may not have seen the # vector yet if vectorIndex < len(self._categoryList): self._categoryList[vectorIndex] = categoryIndex def __getstate__(self): """ Return serializable state. This function will return a version of the __dict__. """ state = self.__dict__.copy() return state def __setstate__(self, state): """ Set the state of ourself from a serialized state. """ if 'version' not in state: pass elif state['version'] == 1: pass elif state['version'] == 2: raise RuntimeError("Invalid deserialization of invalid KNNClassifier" "Verison") self.__dict__.update(state) # Set to new version self.version = KNNCLASSIFIER_VERSION
class KNNClassifier(Serializable): """ This class implements NuPIC's k Nearest Neighbor Classifier. KNN is very useful as a basic classifier for many situations. This implementation contains many enhancements that are useful for HTM experiments. These enhancements include an optimized C++ class for sparse vectors, support for continuous online learning, support for various distance methods (including Lp-norm and raw overlap), support for performing SVD on the input vectors (very useful for large vectors), support for a fixed-size KNN, and a mechanism to store custom IDs for each vector. :param k: (int) The number of nearest neighbors used in the classification of patterns. Must be odd. :param exact: (boolean) If true, patterns must match exactly when assigning class labels :param distanceNorm: (float) When distance method is "norm", this specifies the p value of the Lp-norm :param distanceMethod: (string) The method used to compute distance between input patterns and prototype patterns. The possible options are: - ``norm``: When distanceNorm is 2, this is the euclidean distance, When distanceNorm is 1, this is the manhattan distance In general: sum(abs(x-proto) ^ distanceNorm) ^ (1/distanceNorm) The distances are normalized such that the farthest prototype from a given input is 1.0. - ``rawOverlap``: Only appropriate when inputs are binary. This computes: (width of the input) - (# bits of overlap between input and prototype). - ``pctOverlapOfInput``: Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / (# ON bits in input) - ``pctOverlapOfProto``: Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / (# ON bits in prototype) - ``pctOverlapOfLarger``: Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / max(# ON bits in input, # ON bits in prototype) :param distThreshold: (float) A threshold on the distance between learned patterns and a new pattern proposed to be learned. The distance must be greater than this threshold in order for the new pattern to be added to the classifier's memory. :param doBinarization: (boolean) If True, then scalar inputs will be binarized. :param binarizationThreshold: (float) If doBinarization is True, this specifies the threshold for the binarization of inputs :param useSparseMemory: (boolean) If True, classifier will use a sparse memory matrix :param sparseThreshold: (float) If useSparseMemory is True, input variables whose absolute values are less than this threshold will be stored as zero :param relativeThreshold: (boolean) Flag specifying whether to multiply sparseThreshold by max value in input :param numWinners: (int) Number of elements of the input that are stored. If 0, all elements are stored :param numSVDSamples: (int) Number of samples that must occur before an SVD (Singular Value Decomposition) transformation will be performed. If 0, the transformation will never be performed :param numSVDDims: (string) Controls dimensions kept after SVD transformation. If "adaptive", the number is chosen automatically :param fractionOfMax: (float) If numSVDDims is "adaptive", this controls the smallest singular value that is retained as a fraction of the largest singular value :param verbosity: (int) Console verbosity level where 0 is no output and larger integers provide increasing levels of verbosity :param maxStoredPatterns: (int) Limits the maximum number of the training patterns stored.
When KNN learns in a fixed capacity mode, the unused patterns are deleted once the number of stored patterns is greater than maxStoredPatterns. A value of -1 is no limit :param replaceDuplicates: (bool) A boolean flag that determines whether, during learning, the classifier replaces duplicates that match exactly, even if distThreshold is 0. Should be True for online learning :param cellsPerCol: (int) If >= 1, input is assumed to be organized into columns, in the same manner as the temporal memory AND whenever a new prototype is stored, only the start cell (first cell) is stored in any bursting column :param minSparsity: (float) If useSparseMemory is set, only vectors with sparsity >= minSparsity will be stored during learning. A value of 0.0 implies all vectors will be stored. A value of 0.1 implies only vectors with at least 10% sparsity will be stored """ def __init__(self, k=1, exact=False, distanceNorm=2.0, distanceMethod="norm", distThreshold=0, doBinarization=False, binarizationThreshold=0.5, useSparseMemory=True, sparseThreshold=0.1, relativeThreshold=False, numWinners=0, numSVDSamples=None, numSVDDims=None, fractionOfMax=None, verbosity=0, maxStoredPatterns=-1, replaceDuplicates=False, cellsPerCol=0, minSparsity=0.0): self.version = KNNCLASSIFIER_VERSION self.k = k self.exact = exact self.distanceNorm = distanceNorm assert (distanceMethod in ("norm", "rawOverlap", "pctOverlapOfLarger", "pctOverlapOfProto", "pctOverlapOfInput")) self.distanceMethod = distanceMethod self.distThreshold = distThreshold self.doBinarization = doBinarization self.binarizationThreshold = binarizationThreshold self.useSparseMemory = useSparseMemory self.sparseThreshold = sparseThreshold self.relativeThreshold = relativeThreshold self.numWinners = numWinners self.numSVDSamples = numSVDSamples self.numSVDDims = numSVDDims self.fractionOfMax = fractionOfMax if self.numSVDDims=="adaptive": self._adaptiveSVDDims = True else: self._adaptiveSVDDims = False self.verbosity = verbosity self.replaceDuplicates = replaceDuplicates self.cellsPerCol = cellsPerCol self.maxStoredPatterns = maxStoredPatterns self.minSparsity = minSparsity self.clear() def clear(self): """Clears the state of the KNNClassifier.""" self._Memory = None self._numPatterns = 0 self._M = None self._categoryList = [] self._partitionIdList = [] self._partitionIdMap = {} self._finishedLearning = False self._iterationIdx = -1 # Fixed capacity KNN if self.maxStoredPatterns > 0: assert self.useSparseMemory, ("Fixed capacity KNN is implemented only " "in the sparse memory mode") self.fixedCapacity = True self._categoryRecencyList = [] else: self.fixedCapacity = False # Cached value of the store prototype sizes self._protoSizes = None # Used by PCA self._s = None self._vt = None self._nc = None self._mean = None # Used by Network Builder self._specificIndexTraining = False self._nextTrainingIndices = None def _doubleMemoryNumRows(self): m = 2 * self._Memory.shape[0] n = self._Memory.shape[1] self._Memory = numpy.resize(self._Memory,(m,n)) self._M = self._Memory[:self._numPatterns] def _sparsifyVector(self, inputPattern, doWinners=False): # Do sparsification, using a relative or absolute threshold if not self.relativeThreshold: inputPattern = inputPattern*(abs(inputPattern) > self.sparseThreshold) elif self.sparseThreshold > 0: inputPattern = inputPattern * \ (abs(inputPattern) > (self.sparseThreshold * abs(inputPattern).max())) # Do winner-take-all if doWinners: if (self.numWinners>0) and (self.numWinners < (inputPattern > 0).sum()): sparseInput = 
numpy.zeros(inputPattern.shape) # Don't consider strongly negative numbers as winners. sorted = inputPattern.argsort()[0:self.numWinners] sparseInput[sorted] += inputPattern[sorted] inputPattern = sparseInput # Do binarization if self.doBinarization: # Don't binarize negative numbers to positive 1. inputPattern = (inputPattern > self.binarizationThreshold).astype(float) return inputPattern def prototypeSetCategory(self, idToCategorize, newCategory): """ Allows ids to be assigned a category and subsequently enables users to use: - :meth:`~.KNNClassifier.KNNClassifier.removeCategory` - :meth:`~.KNNClassifier.KNNClassifier.closestTrainingPattern` - :meth:`~.KNNClassifier.KNNClassifier.closestOtherTrainingPattern` """ if idToCategorize not in self._categoryRecencyList: return recordIndex = self._categoryRecencyList.index(idToCategorize) self._categoryList[recordIndex] = newCategory def removeIds(self, idsToRemove): """ There are two caveats. First, this is a potentially slow operation. Second, pattern indices will shift if patterns before them are removed. :param idsToRemove: A list of row indices to remove. """ # Form a list of all categories to remove rowsToRemove = [k for k, rowID in enumerate(self._categoryRecencyList) \ if rowID in idsToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) def removeCategory(self, categoryToRemove): """ There are two caveats. First, this is a potentially slow operation. Second, pattern indices will shift if patterns before them are removed. :param categoryToRemove: Category label to remove """ removedRows = 0 if self._Memory is None: return removedRows # The internal category indices are stored in float # format, so we should compare with a float catToRemove = float(categoryToRemove) # Form a list of all categories to remove rowsToRemove = [k for k, catID in enumerate(self._categoryList) \ if catID == catToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) assert catToRemove not in self._categoryList def _removeRows(self, rowsToRemove): """ A list of row indices to remove. There are two caveats. First, this is a potentially slow operation. Second, pattern indices will shift if patterns before them are removed. """ # Form a numpy array of row indices to be removed removalArray = numpy.array(rowsToRemove) # Remove categories self._categoryList = numpy.delete(numpy.array(self._categoryList), removalArray).tolist() if self.fixedCapacity: self._categoryRecencyList = numpy.delete( numpy.array(self._categoryRecencyList), removalArray).tolist() # Remove the partition ID, if any for these rows and rebuild the id map. for row in reversed(rowsToRemove): # Go backwards # Remove these patterns from partitionList self._partitionIdList.pop(row) self._rebuildPartitionIdMap(self._partitionIdList) # Remove actual patterns if self.useSparseMemory: # Delete backwards for rowIndex in rowsToRemove[::-1]: self._Memory.deleteRow(rowIndex) else: self._M = numpy.delete(self._M, removalArray, 0) numRemoved = len(rowsToRemove) # Sanity checks numRowsExpected = self._numPatterns - numRemoved if self.useSparseMemory: if self._Memory is not None: assert self._Memory.nRows() == numRowsExpected else: assert self._M.shape[0] == numRowsExpected assert len(self._categoryList) == numRowsExpected self._numPatterns -= numRemoved return numRemoved def doIteration(self): """ Utility method to increment the iteration index. Intended for models that don't learn each timestep. 
""" self._iterationIdx += 1 def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0, rowID=None): """ Train the classifier to associate specified input pattern with a particular category. :param inputPattern: (list) The pattern to be assigned a category. If isSparse is 0, this should be a dense array (both ON and OFF bits present). Otherwise, if isSparse > 0, this should be a list of the indices of the non-zero bits in sorted order :param inputCategory: (int) The category to be associated to the training pattern :param partitionId: (int) partitionID allows you to associate an id with each input vector. It can be used to associate input patterns stored in the classifier with an external id. This can be useful for debugging or visualizing. Another use case is to ignore vectors with a specific id during inference (see description of infer() for details). There can be at most one partitionId per stored pattern (i.e. if two patterns are within distThreshold, only the first partitionId will be stored). This is an optional parameter. :param isSparse: (int) If 0, the input pattern is a dense representation. If isSparse > 0, the input pattern is a list of non-zero indices and isSparse is the length of the dense representation :param rowID: (int) UNKNOWN :returns: The number of patterns currently stored in the classifier """ if self.verbosity >= 1: print("%s learn:" % g_debugPrefix) print(" category:", int(inputCategory)) print(" active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol)) if isSparse > 0: assert all(inputPattern[i] <= inputPattern[i+1] for i in range(len(inputPattern)-1)), \ "Sparse inputPattern must be sorted." assert all(bit < isSparse for bit in inputPattern), \ ("Sparse inputPattern must not index outside the dense " "representation's bounds.") if rowID is None: rowID = self._iterationIdx # Dense vectors if not self.useSparseMemory: # Not supported assert self.cellsPerCol == 0, "not implemented for dense vectors" # If the input was given in sparse form, convert it to dense if isSparse > 0: denseInput = numpy.zeros(isSparse) denseInput[inputPattern] = 1.0 inputPattern = denseInput if self._specificIndexTraining and not self._nextTrainingIndices: # Specific index mode without any index provided - skip training return self._numPatterns if self._Memory is None: # Initialize memory with 100 rows and numPatterns = 0 inputWidth = len(inputPattern) self._Memory = numpy.zeros((100,inputWidth)) self._numPatterns = 0 self._M = self._Memory[:self._numPatterns] addRow = True if self._vt is not None: # Compute projection inputPattern = numpy.dot(self._vt, inputPattern - self._mean) if self.distThreshold > 0: # Check if input is too close to an existing input to be accepted dist = self._calcDistance(inputPattern) minDist = dist.min() addRow = (minDist >= self.distThreshold) if addRow: self._protoSizes = None # need to re-compute if self._numPatterns == self._Memory.shape[0]: # Double the size of the memory self._doubleMemoryNumRows() if not self._specificIndexTraining: # Normal learning - append the new input vector self._Memory[self._numPatterns] = inputPattern self._numPatterns += 1 self._categoryList.append(int(inputCategory)) else: # Specific index training mode - insert vector in specified slot vectorIndex = self._nextTrainingIndices.pop(0) while vectorIndex >= self._Memory.shape[0]: self._doubleMemoryNumRows() self._Memory[vectorIndex] = inputPattern self._numPatterns = max(self._numPatterns, vectorIndex + 1) if vectorIndex >= len(self._categoryList): 
self._categoryList += [-1] * (vectorIndex - len(self._categoryList) + 1) self._categoryList[vectorIndex] = int(inputCategory) # Set _M to the "active" part of _Memory self._M = self._Memory[0:self._numPatterns] self._addPartitionId(self._numPatterns-1, partitionId) # Sparse vectors else: # If the input was given in sparse form, convert it to dense if necessary if isSparse > 0 and (self._vt is not None or self.distThreshold > 0 \ or self.numSVDDims is not None or self.numSVDSamples is not None \ or self.numWinners > 0): denseInput = numpy.zeros(isSparse) denseInput[inputPattern] = 1.0 inputPattern = denseInput isSparse = 0 # Get the input width if isSparse > 0: inputWidth = isSparse else: inputWidth = len(inputPattern) # Allocate storage if this is the first training vector if self._Memory is None: self._Memory = NearestNeighbor(0, inputWidth) # Support SVD if it is on if self._vt is not None: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) # Threshold the input, zeroing out entries that are too close to 0. # This is only done if we are given a dense input. if isSparse == 0: thresholdedInput = self._sparsifyVector(inputPattern, True) addRow = True # If given the layout of the cells, then turn on the logic that stores # only the start cell for bursting columns. if self.cellsPerCol >= 1: burstingCols = thresholdedInput.reshape(-1, self.cellsPerCol).min(axis=1).nonzero()[0] for col in burstingCols: thresholdedInput[(col * self.cellsPerCol) + 1 : (col * self.cellsPerCol) + self.cellsPerCol] = 0 # Don't learn entries that are too close to existing entries. if self._Memory.nRows() > 0: dist = None # if this vector is a perfect match for one we already learned, then # replace the category - it may have changed with online learning on. if self.replaceDuplicates: dist = self._calcDistance(thresholdedInput, distanceNorm=1) if dist.min() == 0: rowIdx = dist.argmin() self._categoryList[rowIdx] = int(inputCategory) if self.fixedCapacity: self._categoryRecencyList[rowIdx] = rowID addRow = False # Don't add this vector if it matches closely with another we already # added if self.distThreshold > 0: if dist is None or self.distanceNorm != 1: dist = self._calcDistance(thresholdedInput) minDist = dist.min() addRow = (minDist >= self.distThreshold) if not addRow: if self.fixedCapacity: rowIdx = dist.argmin() self._categoryRecencyList[rowIdx] = rowID # If sparsity is too low, we do not want to add this vector if addRow and self.minSparsity > 0.0: if isSparse==0: sparsity = ( float(len(thresholdedInput.nonzero()[0])) / len(thresholdedInput) ) else: sparsity = float(len(inputPattern)) / isSparse if sparsity < self.minSparsity: addRow = False # Add the new sparse vector to our storage if addRow: self._protoSizes = None # need to re-compute if isSparse == 0: self._Memory.addRow(thresholdedInput) else: self._Memory.addRowNZ(inputPattern, [1]*len(inputPattern)) self._numPatterns += 1 self._categoryList.append(int(inputCategory)) self._addPartitionId(self._numPatterns-1, partitionId) if self.fixedCapacity: self._categoryRecencyList.append(rowID) if self._numPatterns > self.maxStoredPatterns and \ self.maxStoredPatterns > 0: leastRecentlyUsedPattern = numpy.argmin(self._categoryRecencyList) self._Memory.deleteRow(leastRecentlyUsedPattern) self._categoryList.pop(leastRecentlyUsedPattern) self._categoryRecencyList.pop(leastRecentlyUsedPattern) self._numPatterns -= 1 if self.numSVDDims is not None and self.numSVDSamples is not None \ and self._numPatterns == self.numSVDSamples: self.computeSVD() return 
self._numPatterns def getOverlaps(self, inputPattern): """ Return the degree of overlap between an input pattern and each pattern stored in the classifier. The overlap is computed by computing: .. code-block:: python logical_and(inputPattern != 0, trainingPattern != 0).sum() :param inputPattern: pattern to check overlap of :returns: (overlaps, categories) Two numpy arrays of the same length, where: * overlaps: an integer overlap amount for each stored pattern * categories: category index for each element of overlaps """ assert self.useSparseMemory, "Not implemented yet for dense storage" overlaps = self._Memory.rightVecSumAtNZ(inputPattern) return (overlaps, self._categoryList) def getDistances(self, inputPattern): """Return the distances between the input pattern and all other stored patterns. :param inputPattern: pattern to check distance with :returns: (distances, categories) numpy arrays of the same length. - distances: the distance from the input pattern to each stored pattern - categories: category index for each element of distances """ dist = self._getDistances(inputPattern) return (dist, self._categoryList) def infer(self, inputPattern, computeScores=True, overCategories=True, partitionId=None): """Finds the category that best matches the input pattern. Returns the winning category index as well as a distribution over all categories. :param inputPattern: (list) A pattern to be classified :param computeScores: NO EFFECT :param overCategories: NO EFFECT :param partitionId: (int) If provided, all training vectors with partitionId equal to that of the input pattern are ignored. For example, this may be used to perform k-fold cross validation without repopulating the classifier. First partition all the data into k equal partitions numbered 0, 1, 2, ... and then call learn() for each vector passing in its partitionId. Then, during inference, by passing in the partition ID in the call to infer(), all other vectors with the same partitionId are ignored, simulating the effect of repopulating the classifier while omitting the training vectors in the same partition. :returns: 4-tuple containing: - ``winner``: The category with the greatest number of nearest neighbors within the kth nearest neighbors. If the inferenceResult contains no neighbors, the value of winner is None. This can happen, for example, in cases of exact matching, if there are no stored vectors, or if minSparsity is not met. - ``inferenceResult``: A list of length numCategories, each entry contains the number of neighbors within the top k neighbors that are in that category. - ``dist``: A list of length numPrototypes. Each entry is the distance from the unknown to that prototype. All distances are between 0.0 and 1.0. - ``categoryDist``: A list of length numCategories. Each entry is the distance from the unknown to the nearest prototype of that category. All distances are between 0.0 and 1.0. """ # Calculate sparsity. If sparsity is too low, we do not want to run # inference with this vector sparsity = 0.0 if self.minSparsity > 0.0: sparsity = ( float(len(inputPattern.nonzero()[0])) / len(inputPattern) ) if len(self._categoryList) == 0 or sparsity < self.minSparsity: # No categories learned yet; i.e.
first inference w/ online learning or # insufficient sparsity winner = None inferenceResult = numpy.zeros(1) dist = numpy.ones(1) categoryDist = numpy.ones(1) else: maxCategoryIdx = max(self._categoryList) inferenceResult = numpy.zeros(maxCategoryIdx+1) dist = self._getDistances(inputPattern, partitionId=partitionId) validVectorCount = len(self._categoryList) - self._categoryList.count(-1) # Loop through the indices of the nearest neighbors. if self.exact: # Is there an exact match in the distances? exactMatches = numpy.where(dist<0.00001)[0] if len(exactMatches) > 0: for i in exactMatches[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[i]] += 1.0 else: sorted = dist.argsort() for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 # Prepare inference results. if inferenceResult.any(): winner = inferenceResult.argmax() inferenceResult /= inferenceResult.sum() else: winner = None categoryDist = min_score_per_category(maxCategoryIdx, self._categoryList, dist) categoryDist.clip(0, 1.0, categoryDist) if self.verbosity >= 1: print("%s infer:" % (g_debugPrefix)) print(" active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol)) print(" winner category:", winner) print(" pct neighbors of each category:", inferenceResult) print(" dist of each prototype:", dist) print(" dist of each category:", categoryDist) result = (winner, inferenceResult, dist, categoryDist) return result def getClosest(self, inputPattern, topKCategories=3): """Returns the index of the pattern that is closest to inputPattern, the distances of all patterns to inputPattern, and the indices of the k closest categories. """ inferenceResult = numpy.zeros(max(self._categoryList)+1) dist = self._getDistances(inputPattern) sorted = dist.argsort() validVectorCount = len(self._categoryList) - self._categoryList.count(-1) for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 winner = inferenceResult.argmax() topNCats = [] for i in range(topKCategories): topNCats.append((self._categoryList[sorted[i]], dist[sorted[i]] )) return winner, dist, topNCats def closestTrainingPattern(self, inputPattern, cat): """Returns the closest training pattern to inputPattern that belongs to category "cat". :param inputPattern: The pattern whose closest neighbor is sought :param cat: The required category of closest neighbor :returns: A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern belongs to desired category, return it if patternCat == cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def closestOtherTrainingPattern(self, inputPattern, cat): """Return the closest training pattern that is *not* of the given category "cat". 
:param inputPattern: The pattern whose closest neighbor is sought :param cat: Training patterns of this category will be ignored no matter their distance to inputPattern :returns: A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern does not belong to specified category, return it if patternCat != cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def getPattern(self, idx, sparseBinaryForm=False, cat=None): """Gets a training pattern either by index or category number. :param idx: Index of the training pattern :param sparseBinaryForm: If true, returns a list of the indices of the non-zero bits in the training pattern :param cat: If not None, get the first pattern belonging to category cat. If this is specified, idx must be None. :returns: The training pattern with specified index """ if cat is not None: assert idx is None idx = self._categoryList.index(cat) if not self.useSparseMemory: pattern = self._Memory[idx] if sparseBinaryForm: pattern = pattern.nonzero()[0] else: (nz, values) = self._Memory.rowNonZeros(idx) if not sparseBinaryForm: pattern = numpy.zeros(self._Memory.nCols()) numpy.put(pattern, nz, 1) else: pattern = nz return pattern def getPartitionId(self, i): """ Gets the partition id given an index. :param i: index of partition :returns: the partition id associated with pattern i. Returns None if no id is associated with it. """ if (i < 0) or (i >= self._numPatterns): raise RuntimeError("index out of bounds") partitionId = self._partitionIdList[i] if partitionId == numpy.inf: return None else: return partitionId def getPartitionIdList(self): """ :returns: a list of complete partition id objects """ return self._partitionIdList def getNumPartitionIds(self): """ :returns: the number of unique partition Ids stored. """ return len(self._partitionIdMap) def getPartitionIdKeys(self): """ :returns: a list containing unique (non-None) partition Ids (just the keys) """ return list(self._partitionIdMap.keys()) def getPatternIndicesWithPartitionId(self, partitionId): """ :returns: a list of pattern indices corresponding to this partitionId. Return an empty list if there are none. """ return self._partitionIdMap.get(partitionId, []) def _addPartitionId(self, index, partitionId=None): """ Adds partition id for pattern index """ if partitionId is None: self._partitionIdList.append(numpy.inf) else: self._partitionIdList.append(partitionId) indices = self._partitionIdMap.get(partitionId, []) indices.append(index) self._partitionIdMap[partitionId] = indices def _rebuildPartitionIdMap(self, partitionIdList): """ Rebuilds the partition Id map using the given partitionIdList """ self._partitionIdMap = {} for row, partitionId in enumerate(partitionIdList): indices = self._partitionIdMap.get(partitionId, []) indices.append(row) self._partitionIdMap[partitionId] = indices def _calcDistance(self, inputPattern, distanceNorm=None): """Calculate the distances from inputPattern to all stored patterns. 
All distances are between 0.0 and 1.0 :param inputPattern The pattern from which distances to all other patterns are calculated :param distanceNorm Degree of the distance norm """ if distanceNorm is None: distanceNorm = self.distanceNorm # Sparse memory if self.useSparseMemory: if self._protoSizes is None: self._protoSizes = self._Memory.rowSums() overlapsWithProtos = self._Memory.rightVecSumAtNZ(inputPattern) inputPatternSum = inputPattern.sum() if self.distanceMethod == "rawOverlap": dist = inputPattern.sum() - overlapsWithProtos elif self.distanceMethod == "pctOverlapOfInput": dist = inputPatternSum - overlapsWithProtos if inputPatternSum > 0: dist /= inputPatternSum elif self.distanceMethod == "pctOverlapOfProto": overlapsWithProtos /= self._protoSizes dist = 1.0 - overlapsWithProtos elif self.distanceMethod == "pctOverlapOfLarger": maxVal = numpy.maximum(self._protoSizes, inputPatternSum) if maxVal.all() > 0: overlapsWithProtos /= maxVal dist = 1.0 - overlapsWithProtos elif self.distanceMethod == "norm": dist = self._Memory.vecLpDist(self.distanceNorm, inputPattern) distMax = dist.max() if distMax > 0: dist /= distMax else: raise RuntimeError("Unimplemented distance method %s" % self.distanceMethod) # Dense memory else: if self.distanceMethod == "norm": dist = numpy.power(numpy.abs(self._M - inputPattern), self.distanceNorm) dist = dist.sum(1) dist = numpy.power(dist, 1.0/self.distanceNorm) dist /= dist.max() else: raise RuntimeError ("Not implemented yet for dense storage....") return dist def _getDistances(self, inputPattern, partitionId=None): """Return the distances from inputPattern to all stored patterns. :param inputPattern The pattern from which distances to all other patterns are returned :param partitionId If provided, ignore all training vectors with this partitionId. """ if not self._finishedLearning: self.finishLearning() self._finishedLearning = True if self._vt is not None and len(self._vt) > 0: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) sparseInput = self._sparsifyVector(inputPattern) # Compute distances dist = self._calcDistance(sparseInput) # Invalidate results where category is -1 if self._specificIndexTraining: dist[numpy.array(self._categoryList) == -1] = numpy.inf # Ignore vectors with this partition id by setting their distances to inf if partitionId is not None: dist[self._partitionIdMap.get(partitionId, [])] = numpy.inf return dist def finishLearning(self): """ Used for batch scenarios. This method needs to be called between learning and inference. """ if self.numSVDDims is not None and self._vt is None: self.computeSVD() def computeSVD(self, numSVDSamples=None, finalize=True): """ Compute the singular value decomposition (SVD). The SVD is a factorization of a real or complex matrix. It factors the matrix `a` as `u * np.diag(s) * v`, where `u` and `v` are unitary and `s` is a 1-d array of `a`'s singular values. **Reason for computing the SVD:** There are cases where you want to feed a lot of vectors to the KNNClassifier. However, this can be slow. You can speed up training by (1) computing the SVD of the input patterns which will give you the eigenvectors, (2) only keeping a fraction of the eigenvectors, and (3) projecting the input patterns onto the remaining eigenvectors. Note that all input patterns are projected onto the eigenvectors in the same fashion. Keeping only the highest eigenvectors increases training performance since it reduces the dimensionality of the input. 
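
    A rough workflow sketch (hedged: ``trainingData`` and ``classifier`` are
    illustrative names, not part of this API):

    .. code-block:: python

      # Feed many high-dimensional vectors, then compress the stored patterns.
      for vector, category in trainingData:
        classifier.learn(vector, category)
      singularValues = classifier.computeSVD()  # finalize=True also projects
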
    :param numSVDSamples: (int) the number of samples to use for the SVD
           computation.

    :param finalize: (bool) whether to apply SVD to the input patterns.

    :returns: (array) the singular values of the pattern matrix, sorted in
              descending order.
    """
    if numSVDSamples is None:
      numSVDSamples = self._numPatterns

    if not self.useSparseMemory:
      self._a = self._Memory[:self._numPatterns]
    else:
      self._a = self._Memory.toDense()[:self._numPatterns]

    self._mean = numpy.mean(self._a, axis=0)
    self._a -= self._mean

    u, self._s, self._vt = numpy.linalg.svd(self._a[:numSVDSamples])

    if finalize:
      self._finalizeSVD()

    return self._s


  def getAdaptiveSVDDims(self, singularValues, fractionOfMax=0.001):
    """
    Compute the number of eigenvectors (singularValues) to keep.

    :param singularValues: (array) the singular values, sorted in descending
           order
    :param fractionOfMax: (float) the smallest fraction of the largest
           singular value that a dimension must have in order to be kept
    :return: (int) the number of dimensions to keep
    """
    v = singularValues / singularValues[0]
    idx = numpy.where(v < fractionOfMax)[0]
    if len(idx):
      print("Number of PCA dimensions chosen: ", idx[0], "out of ", len(v))
      return idx[0]
    else:
      print("Number of PCA dimensions chosen: ", len(v) - 1, "out of ", len(v))
      return len(v) - 1


  def _finalizeSVD(self, numSVDDims=None):
    """
    Called by computeSVD() when ``finalize`` is True (and, indirectly, by
    finishLearning()). This will project all the patterns onto the SVD
    eigenvectors.

    :param numSVDDims: (int) number of eigenvectors used for projection.
    """
    if numSVDDims is not None:
      self.numSVDDims = numSVDDims

    if self.numSVDDims == "adaptive":
      if self.fractionOfMax is not None:
        self.numSVDDims = self.getAdaptiveSVDDims(self._s, self.fractionOfMax)
      else:
        self.numSVDDims = self.getAdaptiveSVDDims(self._s)

    if self._vt.shape[0] < self.numSVDDims:
      print("******************************************************************")
      print("Warning: The requested number of PCA dimensions is more than "
            "the number of pattern dimensions.")
      print("Setting numSVDDims = ", self._vt.shape[0])
      print("******************************************************************")
      self.numSVDDims = self._vt.shape[0]

    self._vt = self._vt[:self.numSVDDims]

    # Added when svd is not able to decompose vectors - uses raw sparse vectors
    if len(self._vt) == 0:
      return

    self._Memory = numpy.zeros((self._numPatterns, self.numSVDDims))
    self._M = self._Memory
    self.useSparseMemory = False

    for i in range(self._numPatterns):
      self._Memory[i] = numpy.dot(self._vt, self._a[i])

    self._a = None


  def remapCategories(self, mapping):
    """Change the category indices.

    Used by the Network Builder to keep the category indices in sync with the
    ImageSensor categoryInfo when the user renames or removes categories.

    :param mapping: List of new category indices. For example, mapping=[2,0,1]
           would change all vectors of category 0 to be category 2, category 1
           to 0, and category 2 to 1
    """
    categoryArray = numpy.array(self._categoryList)
    newCategoryArray = numpy.zeros(categoryArray.shape[0])
    newCategoryArray.fill(-1)
    for i in range(len(mapping)):
      newCategoryArray[categoryArray == i] = mapping[i]
    self._categoryList = list(newCategoryArray)


  def setCategoryOfVectors(self, vectorIndices, categoryIndices):
    """Change the category associated with the given vector(s).

    Used by the Network Builder to move vectors between categories, to enable
    categories, and to invalidate vectors by setting the category to -1.

    :param vectorIndices: Single index or list of indices

    :param categoryIndices: Single index or list of indices.
Can also be a single index when vectorIndices is a list, in which case the same category will be used for all vectors """ if not hasattr(vectorIndices, "__iter__"): vectorIndices = [vectorIndices] categoryIndices = [categoryIndices] elif not hasattr(categoryIndices, "__iter__"): categoryIndices = [categoryIndices] * len(vectorIndices) for i in range(len(vectorIndices)): vectorIndex = vectorIndices[i] categoryIndex = categoryIndices[i] # Out-of-bounds is not an error, because the KNN may not have seen the # vector yet if vectorIndex < len(self._categoryList): self._categoryList[vectorIndex] = categoryIndex @staticmethod def getSchema(): return KNNClassifierProto @classmethod def read(cls, proto): if proto.version != KNNCLASSIFIER_VERSION: raise RuntimeError("Invalid KNNClassifier Version") knn = object.__new__(cls) knn.version = proto.version knn.k = proto.k knn.exact = proto.exact knn.distanceNorm = proto.distanceNorm knn.distanceMethod = proto.distanceMethod knn.distThreshold = proto.distThreshold knn.doBinarization = proto.doBinarization knn.binarizationThreshold = proto.binarizationThreshold knn.useSparseMemory = proto.useSparseMemory knn.sparseThreshold = proto.sparseThreshold knn.relativeThreshold = proto.relativeThreshold knn.numWinners = proto.numWinners knn.numSVDSamples = proto.numSVDSamples knn.numSVDDims = proto.numSVDDims knn.fractionOfMax = proto.fractionOfMax knn.verbosity = proto.verbosity knn.maxStoredPatterns = proto.maxStoredPatterns knn.replaceDuplicates = proto.replaceDuplicates knn.cellsPerCol = proto.cellsPerCol knn.minSparsity = proto.minSparsity if knn.numSVDDims == "adaptive": knn._adaptiveSVDDims = True else: knn._adaptiveSVDDims = False # Read private state knn.clear() if proto.memory is not None: which = proto.memory.which() if which == "ndarray": knn._Memory = numpy.array(proto.memory.ndarray, dtype=numpy.float64) elif which == "nearestNeighbor": knn._Memory = NearestNeighbor() knn._Memory.read(proto.memory.nearestNeighbor) knn._numPatterns = proto.numPatterns if proto.m is not None: knn._M = numpy.array(proto.m, dtype=numpy.float64) if proto.categoryList is not None: knn._categoryList = list(proto.categoryList) if proto.partitionIdList is not None: knn._partitionIdList = list(proto.partitionIdList) knn._rebuildPartitionIdMap(knn._partitionIdList) knn._iterationIdx = proto.iterationIdx knn._finishedLearning = proto.finishedLearning if proto.s is not None: knn._s = numpy.array(proto.s, dtype=numpy.float32) if proto.vt is not None: knn._vt = numpy.array(proto.vt, dtype=numpy.float32) if proto.mean is not None: knn._mean = numpy.array(proto.mean, dtype=numpy.float32) return knn def write(self, proto): proto.version = self.version proto.k = self.k proto.exact = bool(self.exact) proto.distanceNorm = self.distanceNorm proto.distanceMethod = self.distanceMethod proto.distThreshold = self.distThreshold proto.doBinarization = bool(self.doBinarization) proto.binarizationThreshold = self.binarizationThreshold proto.useSparseMemory = bool(self.useSparseMemory) proto.sparseThreshold = self.sparseThreshold proto.relativeThreshold = bool(self.relativeThreshold) proto.numWinners = self.numWinners proto.verbosity = self.verbosity proto.maxStoredPatterns = self.maxStoredPatterns proto.replaceDuplicates = bool(self.replaceDuplicates) proto.cellsPerCol = self.cellsPerCol proto.minSparsity = self.minSparsity # Write private state if self._Memory is not None: if isinstance(self._Memory, numpy.ndarray): proto.memory.ndarray = self._Memory.tolist() else: 
        proto.memory.init("nearestNeighbor")
        self._Memory.write(proto.memory.nearestNeighbor)
    proto.numPatterns = self._numPatterns
    if self._M is not None:
      proto.m = self._M.tolist()
    if self._categoryList is not None:
      proto.categoryList = self._categoryList
    if self._partitionIdList is not None:
      proto.partitionIdList = self._partitionIdList
    proto.finishedLearning = bool(self._finishedLearning)
    proto.iterationIdx = self._iterationIdx
    if self._s is not None:
      proto.s = self._s.tolist()
    if self._vt is not None:
      proto.vt = self._vt.tolist()
    if self._mean is not None:
      proto.mean = self._mean.tolist()


  def __getstate__(self):
    """Return serializable state.

    This function will return a version of the __dict__.
    """
    state = self.__dict__.copy()
    return state


  def __setstate__(self, state):
    """Set the state of this object from a serialized state."""
    if "version" not in state:
      pass
    elif state["version"] == 1:
      pass
    elif state["version"] == 2:
      raise RuntimeError("Invalid deserialization of invalid KNNClassifier "
                         "version")

    # Backward compatibility
    if "_partitionIdArray" in state:
      state.pop("_partitionIdArray")

    if "minSparsity" not in state:
      state["minSparsity"] = 0.0

    self.__dict__.update(state)

    # Backward compatibility
    if "_partitionIdMap" not in state:
      self._rebuildPartitionIdMap(self._partitionIdList)

    # Set to new version
    self.version = KNNCLASSIFIER_VERSION
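##########################################################################
# Illustrative usage sketch for the class above (hedged: the vectors below
# are made-up toy data; `numpy` is assumed imported as elsewhere in this
# file):
#
#   classifier = KNNClassifier(k=1)
#   classifier.learn(numpy.array([1.0, 0.0, 1.0]), 0)
#   classifier.learn(numpy.array([0.0, 1.0, 1.0]), 1)
#   winner, inferenceResult, dist, categoryDist = classifier.infer(
#       numpy.array([1.0, 0.0, 0.9]))
##########################################################################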
class KNNClassifier(object): """k Nearest Neighbor Classifier""" def __init__(self, k=1, exact=False, distanceNorm=2.0, distanceMethod="norm", distThreshold=0, doBinarization=False, binarizationThreshold=0.5, useSparseMemory=True, sparseThreshold=0.1, relativeThreshold=False, numWinners=0, numSVDSamples=None, numSVDDims=None, fractionOfMax=None, verbosity=0, maxStoredPatterns=-1, replaceDuplicates=False, cellsPerCol=0): """Constructor for the kNN classifier. @param k (int) The number of nearest neighbors used in the classification of patterns. Must be odd @param exact (boolean) If true, patterns must match exactly when assigning class labels @param distanceNorm (int) When distance method is "norm", this specifies the p value of the Lp-norm @param distanceMethod (string) The method used to compute distance between input patterns and prototype patterns. The possible options are: "norm": When distanceNorm is 2, this is the euclidean distance, When distanceNorm is 1, this is the manhattan distance In general: sum(abs(x-proto) ^ distanceNorm) ^ (1/distanceNorm) The distances are normalized such that farthest prototype from a given input is 1.0. "rawOverlap": Only appropriate when inputs are binary. This computes: (width of the input) - (# bits of overlap between input and prototype). "pctOverlapOfInput": Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / (# ON bits in input) "pctOverlapOfProto": Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / (# ON bits in prototype) "pctOverlapOfLarger": Only appropriate for binary inputs. This computes 1.0 - (# bits overlap between input and prototype) / max(# ON bits in input, # ON bits in prototype) @param distThreshold (float) A threshold on the distance between learned patterns and a new pattern proposed to be learned. The distance must be greater than this threshold in order for the new pattern to be added to the classifier's memory @param doBinarization (boolean) If True, then scalar inputs will be binarized. @param binarizationThreshold (float) If doBinarization is True, this specifies the threshold for the binarization of inputs @param useSparseMemory (boolean) If True, classifier will use a sparse memory matrix @param sparseThreshold (float) If useSparseMemory is True, input variables whose absolute values are less than this threshold will be stored as zero @param relativeThreshold (boolean) Flag specifying whether to multiply sparseThreshold by max value in input @param numWinners (int) Number of elements of the input that are stored. If 0, all elements are stored @param numSVDSamples (int) Number of samples the must occur before a SVD (Singular Value Decomposition) transformation will be performed. If 0, the transformation will never be performed @param numSVDDims (string) Controls dimensions kept after SVD transformation. If "adaptive", the number is chosen automatically @param fractionOfMax (float) If numSVDDims is "adaptive", this controls the smallest singular value that is retained as a fraction of the largest singular value @param verbosity (int) Console verbosity level where 0 is no output and larger integers provide increasing levels of verbosity @param maxStoredPatterns (int) Limits the maximum number of the training patterns stored. When KNN learns in a fixed capacity mode, the unused patterns are deleted once the number of stored patterns is greater than maxStoredPatterns. 
A value of -1 is no limit @param replaceDuplicates (bool) A boolean flag that determines whether, during learning, the classifier replaces duplicates that match exactly, even if distThreshold is 0. Should be True for online learning @param cellsPerCol (int) If >= 1, input is assumed to be organized into columns, in the same manner as the temporal pooler AND whenever a new prototype is stored, only the start cell (first cell) is stored in any bursting column """ self.version = KNNCLASSIFIER_VERSION self.k = k self.exact = exact self.distanceNorm = distanceNorm assert (distanceMethod in ("norm", "rawOverlap", "pctOverlapOfLarger", "pctOverlapOfProto", "pctOverlapOfInput")) self.distanceMethod = distanceMethod self.distThreshold = distThreshold self.doBinarization = doBinarization self.binarizationThreshold = binarizationThreshold self.useSparseMemory = useSparseMemory self.sparseThreshold = sparseThreshold self.relativeThreshold = relativeThreshold self.numWinners = numWinners self.numSVDSamples = numSVDSamples self.numSVDDims = numSVDDims self.fractionOfMax = fractionOfMax if self.numSVDDims=="adaptive": self._adaptiveSVDDims = True else: self._adaptiveSVDDims = False self.verbosity = verbosity self.replaceDuplicates = replaceDuplicates self.cellsPerCol = cellsPerCol self.maxStoredPatterns = maxStoredPatterns self.clear() def clear(self): """Clears the state of the KNNClassifier.""" self._Memory = None self._numPatterns = 0 self._M = None self._categoryList = [] self._partitionIdList = [] self._partitionIdArray = None self._finishedLearning = False self._iterationIdx = -1 # Fixed capacity KNN if self.maxStoredPatterns > 0: assert self.useSparseMemory, ("Fixed capacity KNN is implemented only " "in the sparse memory mode") self.fixedCapacity = True self._categoryRecencyList = [] else: self.fixedCapacity = False # Cached value of the store prototype sizes self._protoSizes = None # Used by PCA self._s = None self._vt = None self._nc = None self._mean = None # Used by Network Builder self._specificIndexTraining = False self._nextTrainingIndices = None def _doubleMemoryNumRows(self): m = 2 * self._Memory.shape[0] n = self._Memory.shape[1] self._Memory = numpy.resize(self._Memory,(m,n)) self._M = self._Memory[:self._numPatterns] def _sparsifyVector(self, inputPattern, doWinners=False): # Do sparsification, using a relative or absolute threshold if not self.relativeThreshold: inputPattern = inputPattern*(abs(inputPattern) > self.sparseThreshold) elif self.sparseThreshold > 0: inputPattern = inputPattern * \ (abs(inputPattern) > (self.sparseThreshold * abs(inputPattern).max())) # Do winner-take-all if doWinners: if (self.numWinners>0) and (self.numWinners < (inputPattern > 0).sum()): sparseInput = numpy.zeros(inputPattern.shape) # Don't consider strongly negative numbers as winners. sorted = inputPattern.argsort()[0:self.numWinners] sparseInput[sorted] += inputPattern[sorted] inputPattern = sparseInput # Do binarization if self.doBinarization: # Don't binarize negative numbers to positive 1. 
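      # (Strictly-greater comparison: values above binarizationThreshold map
      # to 1.0; everything else, including negative values, maps to 0.0.)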
inputPattern = (inputPattern > self.binarizationThreshold).astype(float) return inputPattern def prototypeSetCategory(self, idToRelabel, newCategory): if idToRelabel not in self._categoryRecencyList: return recordIndex = self._categoryRecencyList.index(idToRelabel) self._categoryList[recordIndex] = newCategory def removeIds(self, idsToRemove): # Form a list of all categories to remove rowsToRemove = [k for k, rowID in enumerate(self._categoryRecencyList) \ if rowID in idsToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) def removeCategory(self, categoryToRemove): removedRows = 0 if self._Memory is None: return removedRows # The internal category indices are stored in float # format, so we should compare with a float catToRemove = float(categoryToRemove) # Form a list of all categories to remove rowsToRemove = [k for k, catID in enumerate(self._categoryList) \ if catID == catToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) assert catToRemove not in self._categoryList def _removeRows(self, rowsToRemove): # Form a numpy array of row indices to be removed removalArray = numpy.array(rowsToRemove) # Remove categories self._categoryList = numpy.delete(numpy.array(self._categoryList), removalArray).tolist() self._categoryRecencyList = numpy.delete( numpy.array(self._categoryRecencyList), removalArray).tolist() # Remove the partition ID, if any if self._partitionIdArray is not None: self._partitionIdArray = numpy.delete(self._partitionIdArray, removalArray) # Remove actual patterns if self.useSparseMemory: # Delete backwards for rowIndex in rowsToRemove[::-1]: self._Memory.deleteRow(rowIndex) else: self._M = numpy.delete(self._M, removalArray, 0) numRemoved = len(rowsToRemove) # Sanity checks numRowsExpected = self._numPatterns - numRemoved if self.useSparseMemory: if self._Memory is not None: assert self._Memory.nRows() == numRowsExpected else: assert self._M.shape[0] == numRowsExpected assert len(self._categoryList) == numRowsExpected assert self._partitionIdArray is None or \ self._partitionIdArray.shape[0] == numRowsExpected self._numPatterns -= numRemoved return numRemoved def doIteration(self): """Utility method to increment the iteration index. Intended for models that don't learn each timestep. """ self._iterationIdx += 1 def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0, rowID=None): """Train the classifier to associate specified input pattern with a particular category. @param inputPattern (list) The pattern to be assigned a category. If isSparse is 0, this should be a dense array (both ON and OFF bits present). Otherwise, if isSparse > 0, this should be a list of the indices of the non-zero bits in sorted order @param inputCategory (int) The category to be associated to the training pattern @param partitionId (int) partitionID allows you to partition the data set by associating unique IDs with sets of vectors. One use case is to ignore a specific set of vectors during inference for k-fold cross validation (see description of infer() for further details). This is an optional parameter. @param isSparse (int) If 0, the input pattern is a dense representation. 
If isSparse > 0, the input pattern is a list of non-zero indices and isSparse is the length of the dense representation @param rowID (int) UNKNOWN @return The number of patterns currently stored in the classifier """ if self.verbosity >= 1: print "%s learn:" % g_debugPrefix print " category:", int(inputCategory) print " active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol) if rowID is None: rowID = self._iterationIdx # Dense vectors if not self.useSparseMemory: # Not supported assert self.cellsPerCol == 0, "not implemented for dense vectors" # If the input was given in sparse form, convert it to dense if isSparse > 0: denseInput = numpy.zeros(isSparse) denseInput[inputPattern] = 1.0 inputPattern = denseInput if self._specificIndexTraining and not self._nextTrainingIndices: # Specific index mode without any index provided - skip training return self._numPatterns if self._Memory is None: # Initialize memory with 100 rows and numPatterns = 0 inputWidth = len(inputPattern) self._Memory = numpy.zeros((100,inputWidth)) self._numPatterns = 0 self._M = self._Memory[:self._numPatterns] addRow = True if self._vt is not None: # Compute projection inputPattern = numpy.dot(self._vt, inputPattern - self._mean) if self.distThreshold > 0: # Check if input is too close to an existing input to be accepted dist = self._calcDistance(inputPattern) minDist = dist.min() addRow = (minDist >= self.distThreshold) if addRow: self._protoSizes = None # need to re-compute if self._numPatterns == self._Memory.shape[0]: # Double the size of the memory self._doubleMemoryNumRows() if not self._specificIndexTraining: # Normal learning - append the new input vector self._Memory[self._numPatterns] = inputPattern self._numPatterns += 1 self._categoryList.append(int(inputCategory)) else: # Specific index training mode - insert vector in specified slot vectorIndex = self._nextTrainingIndices.pop(0) while vectorIndex >= self._Memory.shape[0]: self._doubleMemoryNumRows() self._Memory[vectorIndex] = inputPattern self._numPatterns = max(self._numPatterns, vectorIndex + 1) if vectorIndex >= len(self._categoryList): self._categoryList += [-1] * (vectorIndex - len(self._categoryList) + 1) self._categoryList[vectorIndex] = int(inputCategory) # Set _M to the "active" part of _Memory self._M = self._Memory[0:self._numPatterns] if partitionId is not None: self._partitionIdList.append(partitionId) # Sparse vectors else: # If the input was given in sparse form, convert it to dense if necessary if isSparse > 0 and (self._vt is not None or self.distThreshold > 0 \ or self.numSVDDims is not None or self.numSVDSamples is not None \ or self.numWinners > 0): denseInput = numpy.zeros(isSparse) denseInput[inputPattern] = 1.0 inputPattern = denseInput isSparse = 0 # Get the input width if isSparse > 0: inputWidth = isSparse else: inputWidth = len(inputPattern) # Allocate storage if this is the first training vector if self._Memory is None: self._Memory = NearestNeighbor(0, inputWidth) # Support SVD if it is on if self._vt is not None: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) # Threshold the input, zeroing out entries that are too close to 0. # This is only done if we are given a dense input. if isSparse == 0: thresholdedInput = self._sparsifyVector(inputPattern, True) addRow = True # If given the layout of the cells, then turn on the logic that stores # only the start cell for bursting columns. 
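      # (A column counts as "bursting" when all of its cellsPerCol cells are
      # active, i.e. the column-wise minimum is non-zero; every cell after the
      # first is then zeroed so only the start cell is stored.)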
if self.cellsPerCol >= 1: numCols = thresholdedInput.size / self.cellsPerCol burstingCols = thresholdedInput.reshape(-1, self.cellsPerCol).min(axis=1).nonzero()[0] for col in burstingCols: thresholdedInput[(col * self.cellsPerCol) + 1 : (col * self.cellsPerCol) + self.cellsPerCol] = 0 # Don't learn entries that are too close to existing entries. if self._Memory.nRows() > 0: dist = None # if this vector is a perfect match for one we already learned, then # replace the category - it may have changed with online learning on. if self.replaceDuplicates: dist = self._calcDistance(thresholdedInput, distanceNorm=1) if dist.min() == 0: rowIdx = dist.argmin() self._categoryList[rowIdx] = int(inputCategory) if self.fixedCapacity: self._categoryRecencyList[rowIdx] = rowID addRow = False # Don't add this vector if it matches closely with another we already # added if self.distThreshold > 0: if dist is None or self.distanceNorm != 1: dist = self._calcDistance(thresholdedInput) minDist = dist.min() addRow = (minDist >= self.distThreshold) if not addRow: if self.fixedCapacity: rowIdx = dist.argmin() self._categoryRecencyList[rowIdx] = rowID # Add the new vector to our storage if addRow: self._protoSizes = None # need to re-compute if isSparse == 0: self._Memory.addRow(thresholdedInput) else: self._Memory.addRowNZ(inputPattern, [1]*len(inputPattern)) self._numPatterns += 1 self._categoryList.append(int(inputCategory)) if partitionId is not None: self._partitionIdList.append(partitionId) if self.fixedCapacity: self._categoryRecencyList.append(rowID) if self._numPatterns > self.maxStoredPatterns and \ self.maxStoredPatterns > 0: leastRecentlyUsedPattern = numpy.argmin(self._categoryRecencyList) self._Memory.deleteRow(leastRecentlyUsedPattern) self._categoryList.pop(leastRecentlyUsedPattern) self._categoryRecencyList.pop(leastRecentlyUsedPattern) self._numPatterns -= 1 if self.numSVDDims is not None and self.numSVDSamples is not None \ and self._numPatterns == self.numSVDSamples: self.computeSVD() return self._numPatterns def getOverlaps(self, inputPattern): """Return the degree of overlap between an input pattern and each category stored in the classifier. The overlap is computed by compuing: logical_and(inputPattern != 0, trainingPattern != 0).sum() @param inputPattern pattern to check overlap of @return (overlaps, categories) Two numpy arrays of the same length: overlaps: an integer overlap amount for each category categories: category index for each element of overlaps """ assert self.useSparseMemory, "Not implemented yet for dense storage" overlaps = self._Memory.rightVecSumAtNZ(inputPattern) return (overlaps, self._categoryList) def getDistances(self, inputPattern): """Return the distances between the input pattern and all other stored patterns. @param inputPattern pattern to check distance with @return (distances, categories) numpy arrays of the same length: overlaps: an integer overlap amount for each category categories: category index for each element of distances """ dist = self._getDistances(inputPattern) return (dist, self._categoryList) def infer(self, inputPattern, computeScores=True, overCategories=True, partitionId=None): """Finds the category that best matches the input pattern. Returns the winning category index as well as a distribution over all categories. 
@param inputPattern (list) A pattern to be classified @param computeScores NO EFFECT @param overCategories NO EFFECT @param partitionId (int) If provided, all training vectors with partitionId equal to that of the input pattern are ignored. For example, this may be used to perform k-fold cross validation without repopulating the classifier. First partition all the data into k equal partitions numbered 0, 1, 2, ... and then call learn() for each vector passing in its partitionId. Then, during inference, by passing in the partition ID in the call to infer(), all other vectors with the same partitionId are ignored simulating the effect of repopulating the classifier while ommitting the training vectors in the same partition. This method returns a 4-tuple: (winner, inferenceResult, dist, categoryDist) winner: The category with the greatest number of nearest neighbors within the kth nearest neighbors. If the inferenceResult contains no neighbors, the value of winner is None; this applies to the case of exact matching. inferenceResult: A list of length numCategories, each entry contains the number of neighbors within the top k neighbors that are in that category. dist: A list of length numPrototypes. Each entry is the distance from the unknown to that prototype. All distances are between 0.0 and 1.0 categoryDist: A list of length numCategories. Each entry is the distance from the unknown to the nearest prototype of that category. All distances are between 0 and 1.0. """ if len(self._categoryList) == 0: # No categories learned yet; i.e. first inference w/ online learning. winner = 0 inferenceResult = numpy.zeros(1) dist = numpy.ones(1) categoryDist = numpy.ones(1) else: maxCategoryIdx = max(self._categoryList) inferenceResult = numpy.zeros(maxCategoryIdx+1) dist = self._getDistances(inputPattern, partitionId=partitionId) validVectorCount = len(self._categoryList) - self._categoryList.count(-1) # Loop through the indices of the nearest neighbors. if self.exact: # Is there an exact match in the distances? exactMatches = numpy.where(dist<0.00001)[0] if len(exactMatches) > 0: for i in exactMatches[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[i]] += 1.0 else: sorted = dist.argsort() for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 # Prepare inference results. if inferenceResult.any(): winner = inferenceResult.argmax() inferenceResult /= inferenceResult.sum() else: winner = None categoryDist = min_score_per_category(maxCategoryIdx, self._categoryList, dist) categoryDist.clip(0, 1.0, categoryDist) if self.verbosity >= 1: print "%s infer:" % (g_debugPrefix) print " active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol) print " winner category:", winner print " pct neighbors of each category:", inferenceResult print " dist of each prototype:", dist print " dist of each category:", categoryDist result = (winner, inferenceResult, dist, categoryDist) return result def getClosest(self, inputPattern, topKCategories=3): """Returns the index of the pattern that is closest to inputPattern, the distances of all patterns to inputPattern, and the indices of the k closest categories. 
""" inferenceResult = numpy.zeros(max(self._categoryList)+1) dist = self._getDistances(inputPattern) sorted = dist.argsort() validVectorCount = len(self._categoryList) - self._categoryList.count(-1) for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 winner = inferenceResult.argmax() topNCats = [] for i in range(topKCategories): topNCats.append((self._categoryList[sorted[i]], dist[sorted[i]] )) return winner, dist, topNCats def closestTrainingPattern(self, inputPattern, cat): """Returns the closest training pattern to inputPattern that belongs to category "cat". @param inputPattern The pattern whose closest neighbor is sought @param cat The required category of closest neighbor @return A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern belongs to desired category, return it if patternCat == cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def closestOtherTrainingPattern(self, inputPattern, cat): """Return the closest training pattern that is *not* of the given category "cat". @param inputPattern The pattern whose closest neighbor is sought @param cat Training patterns of this category will be ignored no matter their distance to inputPattern @return A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern does not belong to specified category, return it if patternCat != cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def getPattern(self, idx, sparseBinaryForm=False, cat=None): """Gets a training pattern either by index or category number. @param idx Index of the training pattern @param sparseBinaryForm If true, returns a list of the indices of the non-zero bits in the training pattern @param cat If not None, get the first pattern belonging to category cat. If this is specified, idx must be None. @return The training pattern with specified index """ if cat is not None: assert idx is None idx = self._categoryList.index(cat) if not self.useSparseMemory: pattern = self._Memory[idx] if sparseBinaryForm: pattern = pattern.nonzero()[0] else: (nz, values) = self._Memory.rowNonZeros(idx) if not sparseBinaryForm: pattern = numpy.zeros(self._Memory.nCols()) numpy.put(pattern, nz, 1) else: pattern = nz return pattern def _calcDistance(self, inputPattern, distanceNorm=None): """Calculate the distances from inputPattern to all stored patterns. 
All distances are between 0.0 and 1.0 @param inputPattern The pattern from which distances to all other patterns are calculated @param distanceNorm Degree of the distance norm """ if distanceNorm is None: distanceNorm = self.distanceNorm # Sparse memory if self.useSparseMemory: if self._protoSizes is None: self._protoSizes = self._Memory.rowSums() overlapsWithProtos = self._Memory.rightVecSumAtNZ(inputPattern) inputPatternSum = inputPattern.sum() if self.distanceMethod == "rawOverlap": dist = inputPattern.sum() - overlapsWithProtos elif self.distanceMethod == "pctOverlapOfInput": dist = inputPatternSum - overlapsWithProtos if inputPatternSum > 0: dist /= inputPatternSum elif self.distanceMethod == "pctOverlapOfProto": overlapsWithProtos /= self._protoSizes dist = 1.0 - overlapsWithProtos elif self.distanceMethod == "pctOverlapOfLarger": maxVal = numpy.maximum(self._protoSizes, inputPatternSum) if maxVal.all() > 0: overlapsWithProtos /= maxVal dist = 1.0 - overlapsWithProtos elif self.distanceMethod == "norm": dist = self._Memory.vecLpDist(self.distanceNorm, inputPattern) distMax = dist.max() if distMax > 0: dist /= distMax else: raise RuntimeError("Unimplemented distance method %s" % self.distanceMethod) # Dense memory else: if self.distanceMethod == "norm": dist = numpy.power(numpy.abs(self._M - inputPattern), self.distanceNorm) dist = dist.sum(1) dist = numpy.power(dist, 1.0/self.distanceNorm) dist /= dist.max() else: raise RuntimeError ("Not implemented yet for dense storage....") return dist def _getDistances(self, inputPattern, partitionId=None): """Return the distances from inputPattern to all stored patterns. @param inputPattern The pattern from which distances to all other patterns are returned @param partitionId If provided, ignore all training vectors with this partitionId. """ if not self._finishedLearning: self.finishLearning() self._finishedLearning = True if self._vt is not None and len(self._vt) > 0: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) sparseInput = self._sparsifyVector(inputPattern) # Compute distances dist = self._calcDistance(sparseInput) # Invalidate results where category is -1 if self._specificIndexTraining: dist[numpy.array(self._categoryList) == -1] = numpy.inf # Ignore vectors with same partition id if self._partitionIdArray is not None: dist[self._partitionIdArray == partitionId] = numpy.inf return dist def finishLearning(self): if self.numSVDDims is not None and self._vt is None: self.computeSVD() # Check if our partition ID list is non-trivial # (i.e., whether it contains at least two different # partition IDs) if self._partitionIdList: partitions = set(self._partitionIdList) if len(partitions) > 1: # Compile into a numpy array self._partitionIdArray = numpy.array(self._partitionIdList) else: # Trivial partitions; ignore self._partitionIdArray = None # Either way, we don't need the original list self._partitionIdList = [] def restartLearning(self): """This is only invoked if we have already called finishLearning() but now want to go back and provide more samples. 
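
    (Sketch of the intended call order, inferred from the code: learn(),
    finishLearning(), restartLearning(), more learn() calls, and then
    finishLearning() again before inference.)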
""" # We need to convert the partition ID array back into a list if hasattr(self, "_partitionIdArray"): # In the case of trivial partitions, we need to regenerate # the "null" partition ID if self._partitionIdArray is None: self._partitionIdList = [0] * self._numPatterns else: self._partitionIdList = self._partitionIdArray.tolist() def computeSVD(self, numSVDSamples=None, finalize=True): if numSVDSamples is None: numSVDSamples = self._numPatterns if not self.useSparseMemory: self._a = self._Memory[:self._numPatterns] else: self._a = self._Memory.toDense()[:self._numPatterns] self._mean = numpy.mean(self._a, axis=0) self._a -= self._mean u,self._s,self._vt = numpy.linalg.svd(self._a[:numSVDSamples]) if finalize: self.finalizeSVD() return self._s def getAdaptiveSVDDims(self, singularValues, fractionOfMax=0.001): v = singularValues/singularValues[0] idx = numpy.where(v<fractionOfMax)[0] if len(idx): print "Number of PCA dimensions chosen: ", idx[0], "out of ", len(v) return idx[0] else: print "Number of PCA dimensions chosen: ", len(v)-1, "out of ", len(v) return len(v)-1 def finalizeSVD(self, numSVDDims=None): if numSVDDims is not None: self.numSVDDims = numSVDDims if self.numSVDDims=="adaptive": if self.fractionOfMax is not None: self.numSVDDims = self.getAdaptiveSVDDims(self._s, self.fractionOfMax) else: self.numSVDDims = self.getAdaptiveSVDDims(self._s) if self._vt.shape[0] < self.numSVDDims: print "******************************************************************" print ("Warning: The requested number of PCA dimensions is more than " "the number of pattern dimensions.") print "Setting numSVDDims = ", self._vt.shape[0] print "******************************************************************" self.numSVDDims = self._vt.shape[0] self._vt = self._vt[:self.numSVDDims] # Added when svd is not able to decompose vectors - uses raw spare vectors if len(self._vt) == 0: return self._Memory = numpy.zeros((self._numPatterns,self.numSVDDims)) self._M = self._Memory self.useSparseMemory = False for i in range(self._numPatterns): self._Memory[i] = numpy.dot(self._vt, self._a[i]) self._a = None def leaveOneOutTest(self): """Run leave-one-out testing. Returns the total number of samples and the number correctly classified. Ignores invalid vectors (those with a category of -1). Uses partitionIdList, if non-empty, to avoid matching a vector against other vectors that came from the same training sequence. """ if self.useSparseMemory: raise Exception("leaveOneOutTest only works with dense memory right now") # The basic test is simple, but we need to prepare some data structures to # handle _specificIndexTraining and _partitionIdList categoryListArray = numpy.array(self._categoryList[:self._M.shape[0]]) if self._specificIndexTraining: # Find valid and invalid vectors using the category list validIndices = (categoryListArray != -1) invalidIndices = (categoryListArray == -1) # Convert list of partitions to numpy array if we haven't # already done so. 
partitionIdArray = None if hasattr(self, "_partitionIdArray") and \ self._partitionIdArray is not None: partitionIdArray = self._partitionIdArray elif self._partitionIdList: # Use the partition id list partitionIdArray = numpy.array(self._partitionIdList) # Find the winning vector for each cache vector, excluding itself, # excluding invalid vectors, and excluding other vectors with the # same partition id winners = numpy.zeros(self._M.shape[0], numpy.int32) for i in xrange(self._M.shape[0]): if self._specificIndexTraining \ and categoryListArray[i] == -1: # This is an invalid vector continue # Calculate distance between this vector and all others distances = numpy.power(numpy.abs(self._M - self._M[i,:]), self.distanceNorm) distances = distances.sum(1) # Invalidate certain vectors by setting their distance to infinity if self._specificIndexTraining: distances[invalidIndices] = numpy.inf # Ignore invalid vectors if partitionIdArray is not None: # Ignore vectors with same partition id distances[partitionIdArray == partitionIdArray[i]] = numpy.inf else: distances[i] = numpy.inf # Don't match vector with itself if self.k == 1: # Take the closest vector as the winner (k=1) winners[i] = distances.argmin() else: # Have the top k winners vote on the category categoryScores = numpy.zeros(categoryListArray.max() + 1) for j in xrange(self.k): winner = distances.argmin() distances[winner] = numpy.inf categoryScores[categoryListArray[winner]] += 1 winners[i] = categoryScores.argmax() if self.k == 1: # Convert the winners (vector IDs) to their category indices # For k > 1, the winners are already category indices winners = categoryListArray[winners] if self._specificIndexTraining: # Count the number of correct categories, ignoring invalid vectors matches = (winners[validIndices] == categoryListArray[validIndices]) else: # Count the number of correct categories matches = (winners == categoryListArray) # number of samples, number correct return float(matches.shape[0]), matches.sum() def remapCategories(self, mapping): """Change the category indices. Used by the Network Builder to keep the category indices in sync with the ImageSensor categoryInfo when the user renames or removes categories. @param mapping List of new category indices. For example, mapping=[2,0,1] would change all vectors of category 0 to be category 2, category 1 to 0, and category 2 to 1 """ categoryArray = numpy.array(self._categoryList) newCategoryArray = numpy.zeros(categoryArray.shape[0]) newCategoryArray.fill(-1) for i in xrange(len(mapping)): newCategoryArray[categoryArray==i] = mapping[i] self._categoryList = list(newCategoryArray) def setCategoryOfVectors(self, vectorIndices, categoryIndices): """Change the category associated with this vector(s). Used by the Network Builder to move vectors between categories, to enable categories, and to invalidate vectors by setting the category to -1. @param vectorIndices Single index or list of indices @param categoryIndices Single index or list of indices. 
    Can also be a single index when vectorIndices is a list, in which case the
    same category will be used for all vectors
    """
    if not hasattr(vectorIndices, "__iter__"):
      vectorIndices = [vectorIndices]
      categoryIndices = [categoryIndices]
    elif not hasattr(categoryIndices, "__iter__"):
      categoryIndices = [categoryIndices] * len(vectorIndices)

    for i in xrange(len(vectorIndices)):
      vectorIndex = vectorIndices[i]
      categoryIndex = categoryIndices[i]
      # Out-of-bounds is not an error, because the KNN may not have seen the
      # vector yet
      if vectorIndex < len(self._categoryList):
        self._categoryList[vectorIndex] = categoryIndex


  def __getstate__(self):
    """Return serializable state.

    This function will return a version of the __dict__.
    """
    state = self.__dict__.copy()
    return state


  def __setstate__(self, state):
    """Set the state of this object from a serialized state."""
    if "version" not in state:
      pass
    elif state["version"] == 1:
      pass
    elif state["version"] == 2:
      raise RuntimeError("Invalid deserialization of invalid KNNClassifier "
                         "version")

    self.__dict__.update(state)

    # Set to new version
    self.version = KNNCLASSIFIER_VERSION
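##########################################################################
# Partition-id sketch for the class above (hedged, illustrative): the
# k-fold cross validation workflow described in infer(). `vectors`,
# `labels`, `numFolds`, `testVector` and `testFold` are hypothetical
# stand-ins for a real data set.
#
#   for i, (vector, label) in enumerate(zip(vectors, labels)):
#     classifier.learn(vector, label, partitionId=i % numFolds)
#   classifier.finishLearning()
#   winner, _, _, _ = classifier.infer(testVector, partitionId=testFold)
##########################################################################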
class KNNClassifier(Serializable):
  """
  This class implements NuPIC's k Nearest Neighbor Classifier. KNN is very
  useful as a basic classifier for many situations. This implementation
  contains many enhancements that are useful for HTM experiments. These
  enhancements include an optimized C++ class for sparse vectors, support for
  continuous online learning, support for various distance methods (including
  Lp-norm and raw overlap), support for performing SVD on the input vectors
  (very useful for large vectors), support for a fixed-size KNN, and a
  mechanism to store custom IDs for each vector.

  :param k: (int) The number of nearest neighbors used in the classification
      of patterns. Must be odd.

  :param exact: (boolean) If true, patterns must match exactly when assigning
      class labels

  :param distanceNorm: (float) When distance method is "norm", this specifies
      the p value of the Lp-norm

  :param distanceMethod: (string) The method used to compute distance between
      input patterns and prototype patterns. The possible options are:

      - ``norm``: When distanceNorm is 2, this is the Euclidean distance;
        when distanceNorm is 1, this is the Manhattan distance. In general:
        sum(abs(x-proto) ^ distanceNorm) ^ (1/distanceNorm). The distances
        are normalized such that the farthest prototype from a given input
        is 1.0.
      - ``rawOverlap``: Only appropriate when inputs are binary. This
        computes: (width of the input) - (# bits of overlap between input
        and prototype).
      - ``pctOverlapOfInput``: Only appropriate for binary inputs. This
        computes 1.0 - (# bits overlap between input and prototype) /
        (# ON bits in input)
      - ``pctOverlapOfProto``: Only appropriate for binary inputs. This
        computes 1.0 - (# bits overlap between input and prototype) /
        (# ON bits in prototype)
      - ``pctOverlapOfLarger``: Only appropriate for binary inputs. This
        computes 1.0 - (# bits overlap between input and prototype) /
        max(# ON bits in input, # ON bits in prototype)

  :param distThreshold: (float) A threshold on the distance between learned
      patterns and a new pattern proposed to be learned. The distance must be
      greater than this threshold in order for the new pattern to be added to
      the classifier's memory.

  :param doBinarization: (boolean) If True, then scalar inputs will be
      binarized.

  :param binarizationThreshold: (float) If doBinarization is True, this
      specifies the threshold for the binarization of inputs

  :param useSparseMemory: (boolean) If True, classifier will use a sparse
      memory matrix

  :param sparseThreshold: (float) If useSparseMemory is True, input variables
      whose absolute values are less than this threshold will be stored as
      zero

  :param relativeThreshold: (boolean) Flag specifying whether to multiply
      sparseThreshold by max value in input

  :param numWinners: (int) Number of elements of the input that are stored.
      If 0, all elements are stored

  :param numSVDSamples: (int) Number of samples that must occur before an SVD
      (Singular Value Decomposition) transformation will be performed. If 0,
      the transformation will never be performed

  :param numSVDDims: (string) Controls dimensions kept after SVD
      transformation. If "adaptive", the number is chosen automatically

  :param fractionOfMax: (float) If numSVDDims is "adaptive", this controls
      the smallest singular value that is retained as a fraction of the
      largest singular value

  :param verbosity: (int) Console verbosity level where 0 is no output and
      larger integers provide increasing levels of verbosity

  :param maxStoredPatterns: (int) Limits the maximum number of the training
      patterns stored.
When KNN learns in a fixed capacity mode, the unused patterns are deleted once the number of stored patterns is greater than maxStoredPatterns. A value of -1 is no limit :param replaceDuplicates: (bool) A boolean flag that determines whether, during learning, the classifier replaces duplicates that match exactly, even if distThreshold is 0. Should be True for online learning :param cellsPerCol: (int) If >= 1, input is assumed to be organized into columns, in the same manner as the temporal memory AND whenever a new prototype is stored, only the start cell (first cell) is stored in any bursting column :param minSparsity: (float) If useSparseMemory is set, only vectors with sparsity >= minSparsity will be stored during learning. A value of 0.0 implies all vectors will be stored. A value of 0.1 implies only vectors with at least 10% sparsity will be stored """ def __init__(self, k=1, exact=False, distanceNorm=2.0, distanceMethod="norm", distThreshold=0, doBinarization=False, binarizationThreshold=0.5, useSparseMemory=True, sparseThreshold=0.1, relativeThreshold=False, numWinners=0, numSVDSamples=None, numSVDDims=None, fractionOfMax=None, verbosity=0, maxStoredPatterns=-1, replaceDuplicates=False, cellsPerCol=0, minSparsity=0.0): self.version = KNNCLASSIFIER_VERSION self.k = k self.exact = exact self.distanceNorm = distanceNorm assert (distanceMethod in ("norm", "rawOverlap", "pctOverlapOfLarger", "pctOverlapOfProto", "pctOverlapOfInput")) self.distanceMethod = distanceMethod self.distThreshold = distThreshold self.doBinarization = doBinarization self.binarizationThreshold = binarizationThreshold self.useSparseMemory = useSparseMemory self.sparseThreshold = sparseThreshold self.relativeThreshold = relativeThreshold self.numWinners = numWinners self.numSVDSamples = numSVDSamples self.numSVDDims = numSVDDims self.fractionOfMax = fractionOfMax if self.numSVDDims=="adaptive": self._adaptiveSVDDims = True else: self._adaptiveSVDDims = False self.verbosity = verbosity self.replaceDuplicates = replaceDuplicates self.cellsPerCol = cellsPerCol self.maxStoredPatterns = maxStoredPatterns self.minSparsity = minSparsity self.clear() def clear(self): """Clears the state of the KNNClassifier.""" self._Memory = None self._numPatterns = 0 self._M = None self._categoryList = [] self._partitionIdList = [] self._partitionIdMap = {} self._finishedLearning = False self._iterationIdx = -1 # Fixed capacity KNN if self.maxStoredPatterns > 0: assert self.useSparseMemory, ("Fixed capacity KNN is implemented only " "in the sparse memory mode") self.fixedCapacity = True self._categoryRecencyList = [] else: self.fixedCapacity = False # Cached value of the store prototype sizes self._protoSizes = None # Used by PCA self._s = None self._vt = None self._nc = None self._mean = None # Used by Network Builder self._specificIndexTraining = False self._nextTrainingIndices = None def _doubleMemoryNumRows(self): m = 2 * self._Memory.shape[0] n = self._Memory.shape[1] self._Memory = numpy.resize(self._Memory,(m,n)) self._M = self._Memory[:self._numPatterns] def _sparsifyVector(self, inputPattern, doWinners=False): # Do sparsification, using a relative or absolute threshold if not self.relativeThreshold: inputPattern = inputPattern*(abs(inputPattern) > self.sparseThreshold) elif self.sparseThreshold > 0: inputPattern = inputPattern * \ (abs(inputPattern) > (self.sparseThreshold * abs(inputPattern).max())) # Do winner-take-all if doWinners: if (self.numWinners>0) and (self.numWinners < (inputPattern > 0).sum()): sparseInput = 
numpy.zeros(inputPattern.shape) # Don't consider strongly negative numbers as winners. sorted = inputPattern.argsort()[0:self.numWinners] sparseInput[sorted] += inputPattern[sorted] inputPattern = sparseInput # Do binarization if self.doBinarization: # Don't binarize negative numbers to positive 1. inputPattern = (inputPattern > self.binarizationThreshold).astype(float) return inputPattern def prototypeSetCategory(self, idToCategorize, newCategory): """ Allows ids to be assigned a category and subsequently enables users to use: - :meth:`~.KNNClassifier.KNNClassifier.removeCategory` - :meth:`~.KNNClassifier.KNNClassifier.closestTrainingPattern` - :meth:`~.KNNClassifier.KNNClassifier.closestOtherTrainingPattern` """ if idToCategorize not in self._categoryRecencyList: return recordIndex = self._categoryRecencyList.index(idToCategorize) self._categoryList[recordIndex] = newCategory def removeIds(self, idsToRemove): """ There are two caveats. First, this is a potentially slow operation. Second, pattern indices will shift if patterns before them are removed. :param idsToRemove: A list of row indices to remove. """ # Form a list of all categories to remove rowsToRemove = [k for k, rowID in enumerate(self._categoryRecencyList) \ if rowID in idsToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) def removeCategory(self, categoryToRemove): """ There are two caveats. First, this is a potentially slow operation. Second, pattern indices will shift if patterns before them are removed. :param categoryToRemove: Category label to remove """ removedRows = 0 if self._Memory is None: return removedRows # The internal category indices are stored in float # format, so we should compare with a float catToRemove = float(categoryToRemove) # Form a list of all categories to remove rowsToRemove = [k for k, catID in enumerate(self._categoryList) \ if catID == catToRemove] # Remove rows from the classifier self._removeRows(rowsToRemove) assert catToRemove not in self._categoryList def _removeRows(self, rowsToRemove): """ A list of row indices to remove. There are two caveats. First, this is a potentially slow operation. Second, pattern indices will shift if patterns before them are removed. """ # Form a numpy array of row indices to be removed removalArray = numpy.array(rowsToRemove) # Remove categories self._categoryList = numpy.delete(numpy.array(self._categoryList), removalArray).tolist() if self.fixedCapacity: self._categoryRecencyList = numpy.delete( numpy.array(self._categoryRecencyList), removalArray).tolist() # Remove the partition ID, if any for these rows and rebuild the id map. for row in reversed(rowsToRemove): # Go backwards # Remove these patterns from partitionList self._partitionIdList.pop(row) self._rebuildPartitionIdMap(self._partitionIdList) # Remove actual patterns if self.useSparseMemory: # Delete backwards for rowIndex in rowsToRemove[::-1]: self._Memory.deleteRow(rowIndex) else: self._M = numpy.delete(self._M, removalArray, 0) numRemoved = len(rowsToRemove) # Sanity checks numRowsExpected = self._numPatterns - numRemoved if self.useSparseMemory: if self._Memory is not None: assert self._Memory.nRows() == numRowsExpected else: assert self._M.shape[0] == numRowsExpected assert len(self._categoryList) == numRowsExpected self._numPatterns -= numRemoved return numRemoved def doIteration(self): """ Utility method to increment the iteration index. Intended for models that don't learn each timestep. 
""" self._iterationIdx += 1 def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0, rowID=None): """ Train the classifier to associate specified input pattern with a particular category. :param inputPattern: (list) The pattern to be assigned a category. If isSparse is 0, this should be a dense array (both ON and OFF bits present). Otherwise, if isSparse > 0, this should be a list of the indices of the non-zero bits in sorted order :param inputCategory: (int) The category to be associated to the training pattern :param partitionId: (int) partitionID allows you to associate an id with each input vector. It can be used to associate input patterns stored in the classifier with an external id. This can be useful for debugging or visualizing. Another use case is to ignore vectors with a specific id during inference (see description of infer() for details). There can be at most one partitionId per stored pattern (i.e. if two patterns are within distThreshold, only the first partitionId will be stored). This is an optional parameter. :param isSparse: (int) 0 if the input pattern is a dense representation. When the input pattern is a list of non-zero indices, then isSparse is the number of total bits (n). E.g. for the dense array [0, 1, 1, 0, 0, 1], isSparse should be `0`. For the equivalent sparse representation [1, 2, 5] (which specifies the indices of active bits), isSparse should be `6`, which is the total number of bits in the input space. :param rowID: (int) UNKNOWN :returns: The number of patterns currently stored in the classifier """ if self.verbosity >= 1: print "%s learn:" % g_debugPrefix print " category:", int(inputCategory) print " active inputs:", _labeledInput(inputPattern, cellsPerCol=self.cellsPerCol) if isSparse > 0: assert all(inputPattern[i] <= inputPattern[i+1] for i in xrange(len(inputPattern)-1)), \ "Sparse inputPattern must be sorted." 
  def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0,
            rowID=None):
    """
    Train the classifier to associate the specified input pattern with a
    particular category.

    :param inputPattern: (list) The pattern to be assigned a category. If
        isSparse is 0, this should be a dense array (both ON and OFF bits
        present). Otherwise, if isSparse > 0, this should be a list of the
        indices of the non-zero bits in sorted order.

    :param inputCategory: (int) The category to be associated with the
        training pattern.

    :param partitionId: (int) partitionId allows you to associate an id with
        each input vector. It can be used to associate input patterns stored
        in the classifier with an external id. This can be useful for
        debugging or visualizing. Another use case is to ignore vectors with
        a specific id during inference (see the description of infer() for
        details). There can be at most one partitionId per stored pattern
        (i.e. if two patterns are within distThreshold, only the first
        partitionId will be stored). This is an optional parameter.

    :param isSparse: (int) 0 if the input pattern is a dense representation.
        When the input pattern is a list of non-zero indices, then isSparse
        is the total number of bits (n). E.g. for the dense array
        [0, 1, 1, 0, 0, 1], isSparse should be `0`. For the equivalent sparse
        representation [1, 2, 5] (which specifies the indices of active bits),
        isSparse should be `6`, which is the total number of bits in the input
        space.

    :param rowID: (int) An optional row identifier. It defaults to the current
        iteration index and is used by the fixed-capacity mode to track how
        recently each stored pattern was matched.

    :returns: The number of patterns currently stored in the classifier.
    """
    if self.verbosity >= 1:
      print "%s learn:" % g_debugPrefix
      print "  category:", int(inputCategory)
      print "  active inputs:", _labeledInput(inputPattern,
                                              cellsPerCol=self.cellsPerCol)

    if isSparse > 0:
      assert all(inputPattern[i] <= inputPattern[i + 1]
                 for i in xrange(len(inputPattern) - 1)), \
          "Sparse inputPattern must be sorted."
      assert all(bit < isSparse for bit in inputPattern), \
          ("Sparse inputPattern must not index outside the dense "
           "representation's bounds.")

    if rowID is None:
      rowID = self._iterationIdx

    # Dense vectors
    if not self.useSparseMemory:

      # Not supported
      assert self.cellsPerCol == 0, "not implemented for dense vectors"

      # If the input was given in sparse form, convert it to dense
      if isSparse > 0:
        denseInput = numpy.zeros(isSparse)
        denseInput[inputPattern] = 1.0
        inputPattern = denseInput

      if self._specificIndexTraining and not self._nextTrainingIndices:
        # Specific index mode without any index provided - skip training
        return self._numPatterns

      if self._Memory is None:
        # Initialize memory with 100 rows and numPatterns = 0
        inputWidth = len(inputPattern)
        self._Memory = numpy.zeros((100, inputWidth))
        self._numPatterns = 0
        self._M = self._Memory[:self._numPatterns]

      addRow = True

      if self._vt is not None:
        # Compute projection
        inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

      if self.distThreshold > 0:
        # Check if the input is too close to an existing input to be accepted
        dist = self._calcDistance(inputPattern)
        minDist = dist.min()
        addRow = (minDist >= self.distThreshold)

      if addRow:
        self._protoSizes = None  # need to re-compute
        if self._numPatterns == self._Memory.shape[0]:
          # Double the size of the memory
          self._doubleMemoryNumRows()

        if not self._specificIndexTraining:
          # Normal learning - append the new input vector
          self._Memory[self._numPatterns] = inputPattern
          self._numPatterns += 1
          self._categoryList.append(int(inputCategory))
        else:
          # Specific index training mode - insert the vector in the specified
          # slot
          vectorIndex = self._nextTrainingIndices.pop(0)
          while vectorIndex >= self._Memory.shape[0]:
            self._doubleMemoryNumRows()
          self._Memory[vectorIndex] = inputPattern
          self._numPatterns = max(self._numPatterns, vectorIndex + 1)
          if vectorIndex >= len(self._categoryList):
            self._categoryList += [-1] * (vectorIndex -
                                          len(self._categoryList) + 1)
          self._categoryList[vectorIndex] = int(inputCategory)

        # Set _M to the "active" part of _Memory
        self._M = self._Memory[0:self._numPatterns]

        self._addPartitionId(self._numPatterns - 1, partitionId)

    # Sparse vectors
    else:

      # If the input was given in sparse form, convert it to dense if necessary
      if isSparse > 0 and (self._vt is not None or self.distThreshold > 0
                           or self.numSVDDims is not None
                           or self.numSVDSamples is not None
                           or self.numWinners > 0):
        denseInput = numpy.zeros(isSparse)
        denseInput[inputPattern] = 1.0
        inputPattern = denseInput
        isSparse = 0

      # Get the input width
      if isSparse > 0:
        inputWidth = isSparse
      else:
        inputWidth = len(inputPattern)

      # Allocate storage if this is the first training vector
      if self._Memory is None:
        self._Memory = NearestNeighbor(0, inputWidth)

      # Support SVD if it is on
      if self._vt is not None:
        inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

      # Threshold the input, zeroing out entries that are too close to 0.
      # This is only done if we are given a dense input.
      if isSparse == 0:
        thresholdedInput = self._sparsifyVector(inputPattern, True)

      addRow = True

      # If given the layout of the cells, then turn on the logic that stores
      # only the start cell for bursting columns.
      if self.cellsPerCol >= 1:
        burstingCols = thresholdedInput.reshape(
            -1, self.cellsPerCol).min(axis=1).nonzero()[0]
        for col in burstingCols:
          thresholdedInput[(col * self.cellsPerCol) + 1:
                           (col * self.cellsPerCol) + self.cellsPerCol] = 0

      # Don't learn entries that are too close to existing entries.
      if self._Memory.nRows() > 0:
        dist = None
        # If this vector is a perfect match for one we already learned, then
        # replace the category - it may have changed with online learning on.
        if self.replaceDuplicates:
          dist = self._calcDistance(thresholdedInput, distanceNorm=1)
          if dist.min() == 0:
            rowIdx = dist.argmin()
            self._categoryList[rowIdx] = int(inputCategory)
            if self.fixedCapacity:
              self._categoryRecencyList[rowIdx] = rowID
            addRow = False

        # Don't add this vector if it matches closely with another we already
        # added
        if self.distThreshold > 0:
          if dist is None or self.distanceNorm != 1:
            dist = self._calcDistance(thresholdedInput)
          minDist = dist.min()
          addRow = (minDist >= self.distThreshold)
          if not addRow:
            if self.fixedCapacity:
              rowIdx = dist.argmin()
              self._categoryRecencyList[rowIdx] = rowID

      # If sparsity is too low, we do not want to add this vector
      if addRow and self.minSparsity > 0.0:
        if isSparse == 0:
          sparsity = (float(len(thresholdedInput.nonzero()[0])) /
                      len(thresholdedInput))
        else:
          sparsity = float(len(inputPattern)) / isSparse
        if sparsity < self.minSparsity:
          addRow = False

      # Add the new vector to our storage
      if addRow:
        self._protoSizes = None  # need to re-compute
        if isSparse == 0:
          self._Memory.addRow(thresholdedInput)
        else:
          self._Memory.addRowNZ(inputPattern, [1] * len(inputPattern))
        self._numPatterns += 1
        self._categoryList.append(int(inputCategory))
        self._addPartitionId(self._numPatterns - 1, partitionId)
        if self.fixedCapacity:
          self._categoryRecencyList.append(rowID)
          if (self._numPatterns > self.maxStoredPatterns and
              self.maxStoredPatterns > 0):
            leastRecentlyUsedPattern = numpy.argmin(self._categoryRecencyList)
            self._Memory.deleteRow(leastRecentlyUsedPattern)
            self._categoryList.pop(leastRecentlyUsedPattern)
            self._categoryRecencyList.pop(leastRecentlyUsedPattern)
            self._numPatterns -= 1

    if (self.numSVDDims is not None and self.numSVDSamples is not None
        and self._numPatterns == self.numSVDSamples):
      self.computeSVD()

    return self._numPatterns
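  # Usage sketch for learn() (hypothetical data): the two calls below store
  # equivalent patterns, first as a dense numpy array and then in sparse form,
  # where isSparse gives the total number of bits in the input space.
  #
  #   knn.learn(numpy.array([0, 1, 1, 0, 0, 1]), inputCategory=0)  # dense
  #   knn.learn([1, 2, 5], inputCategory=0, isSparse=6)            # sparse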
  def getOverlaps(self, inputPattern):
    """
    Return the degree of overlap between an input pattern and each prototype
    stored in the classifier. The overlap is computed by computing:

    .. code-block:: python

      logical_and(inputPattern != 0, trainingPattern != 0).sum()

    :param inputPattern: pattern to check overlap of

    :returns: (overlaps, categories) Two numpy arrays of the same length,
        where:

        * overlaps: an integer overlap amount for each stored pattern
        * categories: category index for each element of overlaps
    """
    assert self.useSparseMemory, "Not implemented yet for dense storage"

    overlaps = self._Memory.rightVecSumAtNZ(inputPattern)
    return (overlaps, self._categoryList)

  def getDistances(self, inputPattern):
    """
    Return the distances between the input pattern and all other stored
    patterns.

    :param inputPattern: pattern to check distance with

    :returns: (distances, categories) numpy arrays of the same length, where:

        * distances: the distance from the input to each stored pattern
        * categories: category index for each element of distances
    """
    dist = self._getDistances(inputPattern)
    return (dist, self._categoryList)
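  # Worked example for getOverlaps() (hypothetical patterns): with a stored
  # prototype [0, 1, 1, 0] and input [0, 1, 0, 1], the only shared ON bit is
  # at index 1, so logical_and(input != 0, proto != 0).sum() == 1 and the
  # returned overlap for that prototype is 1.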
  def infer(self, inputPattern, computeScores=True, overCategories=True,
            partitionId=None):
    """
    Finds the category that best matches the input pattern. Returns the
    winning category index as well as a distribution over all categories.

    :param inputPattern: (list or array) The pattern to be classified. This
        must be a dense representation of the array (e.g. [0, 0, 1, 1, 0, 1]).

    :param computeScores: NO EFFECT

    :param overCategories: NO EFFECT

    :param partitionId: (int) If provided, all training vectors with
        partitionId equal to that of the input pattern are ignored. For
        example, this may be used to perform k-fold cross validation without
        repopulating the classifier. First partition all the data into k equal
        partitions numbered 0, 1, 2, ... and then call learn() for each vector
        passing in its partitionId. Then, during inference, by passing in the
        partition ID in the call to infer(), all other vectors with the same
        partitionId are ignored, simulating the effect of repopulating the
        classifier while omitting the training vectors in the same partition.

    :returns: 4-tuple with these fields:

        - ``winner``: The category with the greatest number of nearest
          neighbors within the kth nearest neighbors. If the inferenceResult
          contains no neighbors, the value of winner is None. This can happen,
          for example, in cases of exact matching, if there are no stored
          vectors, or if minSparsity is not met.
        - ``inferenceResult``: A list of length numCategories, each entry
          contains the number of neighbors within the top k neighbors that
          are in that category.
        - ``dist``: A list of length numPrototypes. Each entry is the distance
          from the unknown to that prototype. All distances are between 0.0
          and 1.0.
        - ``categoryDist``: A list of length numCategories. Each entry is the
          distance from the unknown to the nearest prototype of that category.
          All distances are between 0.0 and 1.0.
    """
    # Calculate sparsity. If sparsity is too low, we do not want to run
    # inference with this vector.
    sparsity = 0.0
    if self.minSparsity > 0.0:
      sparsity = float(len(inputPattern.nonzero()[0])) / len(inputPattern)

    if len(self._categoryList) == 0 or sparsity < self.minSparsity:
      # No categories learned yet (i.e. first inference with online learning)
      # or insufficient sparsity
      winner = None
      inferenceResult = numpy.zeros(1)
      dist = numpy.ones(1)
      categoryDist = numpy.ones(1)
    else:
      maxCategoryIdx = max(self._categoryList)
      inferenceResult = numpy.zeros(maxCategoryIdx + 1)
      dist = self._getDistances(inputPattern, partitionId=partitionId)
      validVectorCount = len(self._categoryList) - self._categoryList.count(-1)

      # Loop through the indices of the nearest neighbors.
      if self.exact:
        # Is there an exact match in the distances?
        exactMatches = numpy.where(dist < 0.00001)[0]
        if len(exactMatches) > 0:
          for i in exactMatches[:min(self.k, validVectorCount)]:
            inferenceResult[self._categoryList[i]] += 1.0
      else:
        sortedIndices = dist.argsort()
        for j in sortedIndices[:min(self.k, validVectorCount)]:
          inferenceResult[self._categoryList[j]] += 1.0

      # Prepare inference results.
      if inferenceResult.any():
        winner = inferenceResult.argmax()
        inferenceResult /= inferenceResult.sum()
      else:
        winner = None
      categoryDist = min_score_per_category(maxCategoryIdx,
                                            self._categoryList, dist)
      categoryDist.clip(0, 1.0, categoryDist)

    if self.verbosity >= 1:
      print "%s infer:" % (g_debugPrefix)
      print "  active inputs:", _labeledInput(inputPattern,
                                              cellsPerCol=self.cellsPerCol)
      print "  winner category:", winner
      print "  pct neighbors of each category:", inferenceResult
      print "  dist of each prototype:", dist
      print "  dist of each category:", categoryDist

    result = (winner, inferenceResult, dist, categoryDist)
    return result
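  # Cross-validation sketch using partitionId, per the infer() docstring
  # (trainingData, numFolds, testPattern, and testFold are hypothetical):
  #
  #   for i, (pattern, category) in enumerate(trainingData):
  #     knn.learn(pattern, category, partitionId=(i % numFolds))
  #   # During inference, vectors learned in fold testFold are ignored:
  #   winner, _, _, _ = knn.infer(testPattern, partitionId=testFold)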
""" inferenceResult = numpy.zeros(max(self._categoryList)+1) dist = self._getDistances(inputPattern) sorted = dist.argsort() validVectorCount = len(self._categoryList) - self._categoryList.count(-1) for j in sorted[:min(self.k, validVectorCount)]: inferenceResult[self._categoryList[j]] += 1.0 winner = inferenceResult.argmax() topNCats = [] for i in range(topKCategories): topNCats.append((self._categoryList[sorted[i]], dist[sorted[i]] )) return winner, dist, topNCats def closestTrainingPattern(self, inputPattern, cat): """Returns the closest training pattern to inputPattern that belongs to category "cat". :param inputPattern: The pattern whose closest neighbor is sought :param cat: The required category of closest neighbor :returns: A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern belongs to desired category, return it if patternCat == cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def closestOtherTrainingPattern(self, inputPattern, cat): """Return the closest training pattern that is *not* of the given category "cat". :param inputPattern: The pattern whose closest neighbor is sought :param cat: Training patterns of this category will be ignored no matter their distance to inputPattern :returns: A dense version of the closest training pattern, or None if no such patterns exist """ dist = self._getDistances(inputPattern) sorted = dist.argsort() for patIdx in sorted: patternCat = self._categoryList[patIdx] # If closest pattern does not belong to specified category, return it if patternCat != cat: if self.useSparseMemory: closestPattern = self._Memory.getRow(int(patIdx)) else: closestPattern = self._M[patIdx] return closestPattern # No patterns were found! return None def getPattern(self, idx, sparseBinaryForm=False, cat=None): """Gets a training pattern either by index or category number. :param idx: Index of the training pattern :param sparseBinaryForm: If true, returns a list of the indices of the non-zero bits in the training pattern :param cat: If not None, get the first pattern belonging to category cat. If this is specified, idx must be None. :returns: The training pattern with specified index """ if cat is not None: assert idx is None idx = self._categoryList.index(cat) if not self.useSparseMemory: pattern = self._Memory[idx] if sparseBinaryForm: pattern = pattern.nonzero()[0] else: (nz, values) = self._Memory.rowNonZeros(idx) if not sparseBinaryForm: pattern = numpy.zeros(self._Memory.nCols()) numpy.put(pattern, nz, 1) else: pattern = nz return pattern def getPartitionId(self, i): """ Gets the partition id given an index. :param i: index of partition :returns: the partition id associated with pattern i. Returns None if no id is associated with it. """ if (i < 0) or (i >= self._numPatterns): raise RuntimeError("index out of bounds") partitionId = self._partitionIdList[i] if partitionId == numpy.inf: return None else: return partitionId def getPartitionIdList(self): """ :returns: a list of complete partition id objects """ return self._partitionIdList def getNumPartitionIds(self): """ :returns: the number of unique partition Ids stored. 
""" return len(self._partitionIdMap) def getPartitionIdKeys(self): """ :returns: a list containing unique (non-None) partition Ids (just the keys) """ return self._partitionIdMap.keys() def getPatternIndicesWithPartitionId(self, partitionId): """ :returns: a list of pattern indices corresponding to this partitionId. Return an empty list if there are none. """ return self._partitionIdMap.get(partitionId, []) def _addPartitionId(self, index, partitionId=None): """ Adds partition id for pattern index """ if partitionId is None: self._partitionIdList.append(numpy.inf) else: self._partitionIdList.append(partitionId) indices = self._partitionIdMap.get(partitionId, []) indices.append(index) self._partitionIdMap[partitionId] = indices def _rebuildPartitionIdMap(self, partitionIdList): """ Rebuilds the partition Id map using the given partitionIdList """ self._partitionIdMap = {} for row, partitionId in enumerate(partitionIdList): indices = self._partitionIdMap.get(partitionId, []) indices.append(row) self._partitionIdMap[partitionId] = indices def _calcDistance(self, inputPattern, distanceNorm=None): """Calculate the distances from inputPattern to all stored patterns. All distances are between 0.0 and 1.0 :param inputPattern The pattern from which distances to all other patterns are calculated :param distanceNorm Degree of the distance norm """ if distanceNorm is None: distanceNorm = self.distanceNorm # Sparse memory if self.useSparseMemory: if self._protoSizes is None: self._protoSizes = self._Memory.rowSums() overlapsWithProtos = self._Memory.rightVecSumAtNZ(inputPattern) inputPatternSum = inputPattern.sum() if self.distanceMethod == "rawOverlap": dist = inputPattern.sum() - overlapsWithProtos elif self.distanceMethod == "pctOverlapOfInput": dist = inputPatternSum - overlapsWithProtos if inputPatternSum > 0: dist /= inputPatternSum elif self.distanceMethod == "pctOverlapOfProto": overlapsWithProtos /= self._protoSizes dist = 1.0 - overlapsWithProtos elif self.distanceMethod == "pctOverlapOfLarger": maxVal = numpy.maximum(self._protoSizes, inputPatternSum) if maxVal.all() > 0: overlapsWithProtos /= maxVal dist = 1.0 - overlapsWithProtos elif self.distanceMethod == "norm": dist = self._Memory.vecLpDist(self.distanceNorm, inputPattern) distMax = dist.max() if distMax > 0: dist /= distMax else: raise RuntimeError("Unimplemented distance method %s" % self.distanceMethod) # Dense memory else: if self.distanceMethod == "norm": dist = numpy.power(numpy.abs(self._M - inputPattern), self.distanceNorm) dist = dist.sum(1) dist = numpy.power(dist, 1.0/self.distanceNorm) dist /= dist.max() else: raise RuntimeError ("Not implemented yet for dense storage....") return dist def _getDistances(self, inputPattern, partitionId=None): """Return the distances from inputPattern to all stored patterns. :param inputPattern The pattern from which distances to all other patterns are returned :param partitionId If provided, ignore all training vectors with this partitionId. 
""" if not self._finishedLearning: self.finishLearning() self._finishedLearning = True if self._vt is not None and len(self._vt) > 0: inputPattern = numpy.dot(self._vt, inputPattern - self._mean) sparseInput = self._sparsifyVector(inputPattern) # Compute distances dist = self._calcDistance(sparseInput) # Invalidate results where category is -1 if self._specificIndexTraining: dist[numpy.array(self._categoryList) == -1] = numpy.inf # Ignore vectors with this partition id by setting their distances to inf if partitionId is not None: dist[self._partitionIdMap.get(partitionId, [])] = numpy.inf return dist def finishLearning(self): """ Used for batch scenarios. This method needs to be called between learning and inference. """ if self.numSVDDims is not None and self._vt is None: self.computeSVD() def computeSVD(self, numSVDSamples=None, finalize=True): """ Compute the singular value decomposition (SVD). The SVD is a factorization of a real or complex matrix. It factors the matrix `a` as `u * np.diag(s) * v`, where `u` and `v` are unitary and `s` is a 1-d array of `a`'s singular values. **Reason for computing the SVD:** There are cases where you want to feed a lot of vectors to the KNNClassifier. However, this can be slow. You can speed up training by (1) computing the SVD of the input patterns which will give you the eigenvectors, (2) only keeping a fraction of the eigenvectors, and (3) projecting the input patterns onto the remaining eigenvectors. Note that all input patterns are projected onto the eigenvectors in the same fashion. Keeping only the highest eigenvectors increases training performance since it reduces the dimensionality of the input. :param numSVDSamples: (int) the number of samples to use for the SVD computation. :param finalize: (bool) whether to apply SVD to the input patterns. :returns: (array) The singular values for every matrix, sorted in descending order. """ if numSVDSamples is None: numSVDSamples = self._numPatterns if not self.useSparseMemory: self._a = self._Memory[:self._numPatterns] else: self._a = self._Memory.toDense()[:self._numPatterns] self._mean = numpy.mean(self._a, axis=0) self._a -= self._mean u,self._s,self._vt = numpy.linalg.svd(self._a[:numSVDSamples]) if finalize: self._finalizeSVD() return self._s def getAdaptiveSVDDims(self, singularValues, fractionOfMax=0.001): """ Compute the number of eigenvectors (singularValues) to keep. :param singularValues: :param fractionOfMax: :return: """ v = singularValues/singularValues[0] idx = numpy.where(v<fractionOfMax)[0] if len(idx): print "Number of PCA dimensions chosen: ", idx[0], "out of ", len(v) return idx[0] else: print "Number of PCA dimensions chosen: ", len(v)-1, "out of ", len(v) return len(v)-1 def _finalizeSVD(self, numSVDDims=None): """ Called by finalizeLearning(). This will project all the patterns onto the SVD eigenvectors. :param numSVDDims: (int) number of egeinvectors used for projection. 
  def _finalizeSVD(self, numSVDDims=None):
    """
    Called by computeSVD(). This will project all the stored patterns onto
    the SVD eigenvectors.

    :param numSVDDims: (int) number of eigenvectors used for the projection.
    """
    if numSVDDims is not None:
      self.numSVDDims = numSVDDims

    if self.numSVDDims == "adaptive":
      if self.fractionOfMax is not None:
        self.numSVDDims = self.getAdaptiveSVDDims(self._s, self.fractionOfMax)
      else:
        self.numSVDDims = self.getAdaptiveSVDDims(self._s)

    if self._vt.shape[0] < self.numSVDDims:
      print "******************************************************************"
      print ("Warning: The requested number of PCA dimensions is more than "
             "the number of pattern dimensions.")
      print "Setting numSVDDims = ", self._vt.shape[0]
      print "******************************************************************"
      self.numSVDDims = self._vt.shape[0]

    self._vt = self._vt[:self.numSVDDims]

    # Added when the SVD is not able to decompose vectors - uses raw sparse
    # vectors
    if len(self._vt) == 0:
      return

    self._Memory = numpy.zeros((self._numPatterns, self.numSVDDims))
    self._M = self._Memory
    self.useSparseMemory = False

    for i in range(self._numPatterns):
      self._Memory[i] = numpy.dot(self._vt, self._a[i])

    self._a = None

  def remapCategories(self, mapping):
    """
    Change the category indices. Used by the Network Builder to keep the
    category indices in sync with the ImageSensor categoryInfo when the user
    renames or removes categories.

    :param mapping: List of new category indices. For example,
        mapping=[2, 0, 1] would change all vectors of category 0 to be
        category 2, category 1 to 0, and category 2 to 1
    """
    categoryArray = numpy.array(self._categoryList)
    newCategoryArray = numpy.zeros(categoryArray.shape[0])
    newCategoryArray.fill(-1)
    for i in xrange(len(mapping)):
      newCategoryArray[categoryArray == i] = mapping[i]
    self._categoryList = list(newCategoryArray)
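  # Worked example for remapCategories() (hypothetical state): with stored
  # categories [0, 1, 2, 0] and mapping=[2, 0, 1], the stored list becomes
  # [2, 0, 1, 2] - every 0 becomes 2, every 1 becomes 0, every 2 becomes 1.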
  def setCategoryOfVectors(self, vectorIndices, categoryIndices):
    """
    Change the category associated with this vector(s). Used by the Network
    Builder to move vectors between categories, to enable categories, and to
    invalidate vectors by setting the category to -1.

    :param vectorIndices: Single index or list of indices

    :param categoryIndices: Single index or list of indices. Can also be a
        single index when vectorIndices is a list, in which case the same
        category will be used for all vectors
    """
    if not hasattr(vectorIndices, "__iter__"):
      vectorIndices = [vectorIndices]
      categoryIndices = [categoryIndices]
    elif not hasattr(categoryIndices, "__iter__"):
      categoryIndices = [categoryIndices] * len(vectorIndices)

    for i in xrange(len(vectorIndices)):
      vectorIndex = vectorIndices[i]
      categoryIndex = categoryIndices[i]

      # Out-of-bounds is not an error, because the KNN may not have seen the
      # vector yet
      if vectorIndex < len(self._categoryList):
        self._categoryList[vectorIndex] = categoryIndex

  @staticmethod
  def getSchema():
    return KNNClassifierProto

  @classmethod
  def read(cls, proto):
    if proto.version != KNNCLASSIFIER_VERSION:
      raise RuntimeError("Invalid KNNClassifier version")

    knn = object.__new__(cls)
    knn.version = proto.version
    knn.k = proto.k
    knn.exact = proto.exact
    knn.distanceNorm = proto.distanceNorm
    knn.distanceMethod = proto.distanceMethod
    knn.distThreshold = proto.distThreshold
    knn.doBinarization = proto.doBinarization
    knn.binarizationThreshold = proto.binarizationThreshold
    knn.useSparseMemory = proto.useSparseMemory
    knn.sparseThreshold = proto.sparseThreshold
    knn.relativeThreshold = proto.relativeThreshold
    knn.numWinners = proto.numWinners
    knn.numSVDSamples = proto.numSVDSamples
    knn.numSVDDims = proto.numSVDDims
    knn.fractionOfMax = proto.fractionOfMax
    knn.verbosity = proto.verbosity
    knn.maxStoredPatterns = proto.maxStoredPatterns
    knn.replaceDuplicates = proto.replaceDuplicates
    knn.cellsPerCol = proto.cellsPerCol
    knn.minSparsity = proto.minSparsity
    knn._adaptiveSVDDims = (knn.numSVDDims == "adaptive")

    # Read private state
    knn.clear()
    if proto.memory is not None:
      which = proto.memory.which()
      if which == "ndarray":
        knn._Memory = numpy.array(proto.memory.ndarray, dtype=numpy.float64)
      elif which == "nearestNeighbor":
        knn._Memory = NearestNeighbor()
        knn._Memory.read(proto.memory.nearestNeighbor)

    knn._numPatterns = proto.numPatterns

    if proto.m is not None:
      knn._M = numpy.array(proto.m, dtype=numpy.float64)

    if proto.categoryList is not None:
      knn._categoryList = list(proto.categoryList)

    if proto.partitionIdList is not None:
      knn._partitionIdList = list(proto.partitionIdList)
      knn._rebuildPartitionIdMap(knn._partitionIdList)

    knn._iterationIdx = proto.iterationIdx
    knn._finishedLearning = proto.finishedLearning

    if proto.s is not None:
      knn._s = numpy.array(proto.s, dtype=numpy.float32)

    if proto.vt is not None:
      knn._vt = numpy.array(proto.vt, dtype=numpy.float32)

    if proto.mean is not None:
      knn._mean = numpy.array(proto.mean, dtype=numpy.float32)

    return knn
  def write(self, proto):
    proto.version = self.version
    proto.k = self.k
    proto.exact = bool(self.exact)
    proto.distanceNorm = self.distanceNorm
    proto.distanceMethod = self.distanceMethod
    proto.distThreshold = self.distThreshold
    proto.doBinarization = bool(self.doBinarization)
    proto.binarizationThreshold = self.binarizationThreshold
    proto.useSparseMemory = bool(self.useSparseMemory)
    proto.sparseThreshold = self.sparseThreshold
    proto.relativeThreshold = bool(self.relativeThreshold)
    proto.numWinners = self.numWinners
    proto.verbosity = self.verbosity
    proto.maxStoredPatterns = self.maxStoredPatterns
    proto.replaceDuplicates = bool(self.replaceDuplicates)
    proto.cellsPerCol = self.cellsPerCol
    proto.minSparsity = self.minSparsity

    # Write private state
    if self._Memory is not None:
      if isinstance(self._Memory, numpy.ndarray):
        proto.memory.ndarray = self._Memory.tolist()
      else:
        proto.memory.init("nearestNeighbor")
        self._Memory.write(proto.memory.nearestNeighbor)

    proto.numPatterns = self._numPatterns

    if self._M is not None:
      proto.m = self._M.tolist()

    if self._categoryList is not None:
      proto.categoryList = self._categoryList

    if self._partitionIdList is not None:
      proto.partitionIdList = self._partitionIdList

    proto.finishedLearning = bool(self._finishedLearning)
    proto.iterationIdx = self._iterationIdx

    if self._s is not None:
      proto.s = self._s.tolist()

    if self._vt is not None:
      proto.vt = self._vt.tolist()

    if self._mean is not None:
      proto.mean = self._mean.tolist()

  def __getstate__(self):
    """
    Return serializable state. This function will return a version of the
    __dict__.
    """
    state = self.__dict__.copy()
    return state

  def __setstate__(self, state):
    """
    Set the state of this object from a serialized state.
    """
    if "version" not in state:
      pass
    elif state["version"] == 1:
      pass
    elif state["version"] == 2:
      raise RuntimeError("Invalid deserialization of invalid KNNClassifier "
                         "version")

    # Backward compatibility
    if "_partitionIdArray" in state:
      state.pop("_partitionIdArray")

    if "minSparsity" not in state:
      state["minSparsity"] = 0.0

    self.__dict__.update(state)

    # Backward compatibility
    if "_partitionIdMap" not in state:
      self._rebuildPartitionIdMap(self._partitionIdList)

    # Set to the new version
    self.version = KNNCLASSIFIER_VERSION
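# Serialization sketch (illustrative; assumes a trained classifier `knn` whose
# memory object is picklable): __getstate__/__setstate__ make the classifier
# compatible with pickle, which is a simple way to snapshot and restore state.
#
#   import cPickle as pickle
#   blob = pickle.dumps(knn)
#   restored = pickle.loads(blob)
#   assert restored.version == KNNCLASSIFIER_VERSION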