Example #1
0
  def learn(self, inputPattern, inputCategory, partitionId=None, isSparse=0,
            rowID=None):
    """
    Train the classifier to associate specified input pattern with a
    particular category.

    :param inputPattern: (list) The pattern to be assigned a category. If
        isSparse is 0, this should be a dense array (both ON and OFF bits
        present). Otherwise, if isSparse > 0, this should be a list of the
        indices of the non-zero bits in sorted order

    :param inputCategory: (int) The category to be associated to the training
        pattern

    :param partitionId: (int) partitionID allows you to associate an id with each
        input vector. It can be used to associate input patterns stored in the
        classifier with an external id. This can be useful for debugging or
        visualizing. Another use case is to ignore vectors with a specific id
        during inference (see description of infer() for details). There can be
        at most one partitionId per stored pattern (i.e. if two patterns are
        within distThreshold, only the first partitionId will be stored). This
        is an optional parameter.

    :param isSparse: (int) If 0, the input pattern is a dense representation. If
        isSparse > 0, the input pattern is a list of non-zero indices and
        isSparse is the length of the dense representation

    :param rowID: (int) Row id recorded in the category recency list when
        fixedCapacity is on (used to evict the least recently used pattern);
        defaults to the current iteration index.

    :returns: The number of patterns currently stored in the classifier
    """
    if self.verbosity >= 1:
      print("%s learn:" % g_debugPrefix)
      print("  category:", int(inputCategory))
      print("  active inputs:", _labeledInput(inputPattern,
                                              cellsPerCol=self.cellsPerCol))

    # Validate the sparse-form contract up front: indices sorted and in range.
    if isSparse > 0:
      assert all(inputPattern[i] <= inputPattern[i+1]
                 for i in range(len(inputPattern)-1)), \
                     "Sparse inputPattern must be sorted."
      assert all(bit < isSparse for bit in inputPattern), \
        ("Sparse inputPattern must not index outside the dense "
         "representation's bounds.")

    if rowID is None:
      rowID = self._iterationIdx

    # Dense vectors
    if not self.useSparseMemory:

      # Not supported
      assert self.cellsPerCol == 0, "not implemented for dense vectors"

      # If the input was given in sparse form, convert it to dense
      if isSparse > 0:
        denseInput = numpy.zeros(isSparse)
        denseInput[inputPattern] = 1.0
        inputPattern = denseInput

      if self._specificIndexTraining and not self._nextTrainingIndices:
        # Specific index mode without any index provided - skip training
        return self._numPatterns

      if self._Memory is None:
        # Initialize memory with 100 rows and numPatterns = 0
        inputWidth = len(inputPattern)
        self._Memory = numpy.zeros((100,inputWidth))
        self._numPatterns = 0
        self._M = self._Memory[:self._numPatterns]

      addRow = True

      if self._vt is not None:
        # Compute projection
        inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

      if self.distThreshold > 0:
        # Check if input is too close to an existing input to be accepted
        dist = self._calcDistance(inputPattern)
        minDist = dist.min()
        addRow = (minDist >= self.distThreshold)

      if addRow:
        self._protoSizes = None     # need to re-compute
        if self._numPatterns == self._Memory.shape[0]:
          # Double the size of the memory
          self._doubleMemoryNumRows()

        if not self._specificIndexTraining:
          # Normal learning - append the new input vector
          self._Memory[self._numPatterns] = inputPattern
          self._numPatterns += 1
          self._categoryList.append(int(inputCategory))
        else:
          # Specific index training mode - insert vector in specified slot
          vectorIndex = self._nextTrainingIndices.pop(0)
          while vectorIndex >= self._Memory.shape[0]:
            self._doubleMemoryNumRows()
          self._Memory[vectorIndex] = inputPattern
          self._numPatterns = max(self._numPatterns, vectorIndex + 1)
          if vectorIndex >= len(self._categoryList):
            self._categoryList += [-1] * (vectorIndex -
                                          len(self._categoryList) + 1)
          self._categoryList[vectorIndex] = int(inputCategory)

        # Set _M to the "active" part of _Memory
        self._M = self._Memory[0:self._numPatterns]

        # NOTE(review): in specific-index mode the new row lives at
        # vectorIndex, which may differ from _numPatterns-1 -- confirm the
        # partition id is really meant to attach to the last active row.
        self._addPartitionId(self._numPatterns-1, partitionId)

    # Sparse vectors
    else:

      # Convert a sparse input to dense when any downstream step needs a
      # dense vector. cellsPerCol and replaceDuplicates are included because
      # both branches below operate on the thresholded dense vector; without
      # densifying here a sparse input would hit an undefined
      # 'thresholdedInput' and raise NameError.
      if isSparse > 0 and (self._vt is not None
                           or self.distThreshold > 0
                           or self.numSVDDims is not None
                           or self.numSVDSamples is not None
                           or self.numWinners > 0
                           or self.cellsPerCol >= 1
                           or self.replaceDuplicates):
        denseInput = numpy.zeros(isSparse)
        denseInput[inputPattern] = 1.0
        inputPattern = denseInput
        isSparse = 0

      # Get the input width
      if isSparse > 0:
        inputWidth = isSparse
      else:
        inputWidth = len(inputPattern)

      # Allocate storage if this is the first training vector
      if self._Memory is None:
        self._Memory = NearestNeighbor(0, inputWidth)

      # Support SVD if it is on
      if self._vt is not None:
        inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

      # Threshold the input, zeroing out entries that are too close to 0.
      #  This is only done if we are given a dense input.
      if isSparse == 0:
        thresholdedInput = self._sparsifyVector(inputPattern, True)
      addRow = True

      # If given the layout of the cells, then turn on the logic that stores
      # only the start cell for bursting columns (columns where every cell
      # is active).
      if self.cellsPerCol >= 1:
        burstingCols = thresholdedInput.reshape(-1,
                                  self.cellsPerCol).min(axis=1).nonzero()[0]
        for col in burstingCols:
          thresholdedInput[(col * self.cellsPerCol) + 1 :
                           (col * self.cellsPerCol) + self.cellsPerCol] = 0


      # Don't learn entries that are too close to existing entries.
      if self._Memory.nRows() > 0:
        dist = None
        # if this vector is a perfect match for one we already learned, then
        #  replace the category - it may have changed with online learning on.
        if self.replaceDuplicates:
          dist = self._calcDistance(thresholdedInput, distanceNorm=1)
          if dist.min() == 0:
            rowIdx = dist.argmin()
            self._categoryList[rowIdx] = int(inputCategory)
            if self.fixedCapacity:
              self._categoryRecencyList[rowIdx] = rowID
            addRow = False

        # Don't add this vector if it matches closely with another we already
        #  added
        if self.distThreshold > 0:
          if dist is None or self.distanceNorm != 1:
            dist = self._calcDistance(thresholdedInput)
          minDist = dist.min()
          addRow = (minDist >= self.distThreshold)
          if not addRow:
            if self.fixedCapacity:
              # Refresh the matched row's recency even though we skip adding.
              rowIdx = dist.argmin()
              self._categoryRecencyList[rowIdx] = rowID


      # If sparsity is too low, we do not want to add this vector
      if addRow and self.minSparsity > 0.0:
        if isSparse==0:
          sparsity = ( float(len(thresholdedInput.nonzero()[0])) /
                       len(thresholdedInput) )
        else:
          sparsity = float(len(inputPattern)) / isSparse
        if sparsity < self.minSparsity:
          addRow = False

      # Add the new sparse vector to our storage
      if addRow:
        self._protoSizes = None     # need to re-compute
        if isSparse == 0:
          self._Memory.addRow(thresholdedInput)
        else:
          # Still-sparse input: store the non-zero indices with value 1.
          self._Memory.addRowNZ(inputPattern, [1]*len(inputPattern))
        self._numPatterns += 1
        self._categoryList.append(int(inputCategory))
        self._addPartitionId(self._numPatterns-1, partitionId)
        if self.fixedCapacity:
          self._categoryRecencyList.append(rowID)
          # Over capacity: evict the least recently used pattern.
          if self._numPatterns > self.maxStoredPatterns and \
            self.maxStoredPatterns > 0:
            leastRecentlyUsedPattern = numpy.argmin(self._categoryRecencyList)
            self._Memory.deleteRow(leastRecentlyUsedPattern)
            self._categoryList.pop(leastRecentlyUsedPattern)
            self._categoryRecencyList.pop(leastRecentlyUsedPattern)
            self._numPatterns -= 1


    # Once enough samples have been accumulated, compute the SVD projection.
    if self.numSVDDims is not None and self.numSVDSamples is not None \
          and self._numPatterns == self.numSVDSamples:
        self.computeSVD()

    return self._numPatterns
Example #2
0
    def learn(self,
              inputPattern,
              inputCategory,
              partitionId=None,
              isSparse=0,
              rowID=None):
        """
        Learn a new training presentation.

        NOTE: this is Python 2 code (print statements).

        Parameters:
        ------------------------------------------------------------------------
        inputPattern:  training pattern to learn. This should be a dense array
                       if isSparse==0 or a list of non-zero indices if
                       isSparse>0.
        inputCategory: category index of the training pattern.
        partitionId:   must be None in this implementation (asserted below);
                       presumably reserved for associating an external id with
                       each stored pattern.
        isSparse:      If >0, the input pattern is a list of non-zero indices
                       and isSparse is the length of the dense representation.
        rowID:         id recorded in the category recency list when
                       fixedCapacity is on (used for least-recently-used
                       eviction); defaults to the current iteration index.

        Returns: the number of patterns currently stored in the classifier.
        """
        if self.verbosity >= 1:
            print "%s learn:" % (g_debugPrefix)
            print "  category:", int(inputCategory)
            print "  active inputs:", _labeledInput(
                inputPattern, cellsPerCol=self.cellsPerCol)

        if rowID is None:
            rowID = self._iterationIdx

        assert partitionId is None, \
          "No documentation is available for partitionId, not sure how it works."

        #---------------------------------------------------------------------------------
        # Dense vectors
        if not self.useSparseMemory:

            # Not supported
            assert self.cellsPerCol == 0, "not implemented for dense vectors"

            # If the input was given in sparse form, convert it to dense
            if isSparse > 0:
                denseInput = numpy.zeros(isSparse)
                denseInput[inputPattern] = 1.0
                inputPattern = denseInput

            if self._specificIndexTraining and not self._nextTrainingIndices:
                # Specific index mode without any index provided - skip training
                return self._numPatterns

            if self._Memory is None:
                # Initialize memory with 100 rows and numPatterns = 0
                inputWidth = len(inputPattern)
                self._Memory = numpy.zeros((100, inputWidth))
                self._numPatterns = 0
                self._M = self._Memory[:self._numPatterns]

            addRow = True

            if self._vt is not None:
                # Compute projection
                inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

            if self.distThreshold > 0:
                # Check if input is too close to an existing input to be accepted
                dist = self._calcDistance(inputPattern)
                minDist = dist.min()
                addRow = (minDist >= self.distThreshold)

            if addRow:
                self._protoSizes = None  # need to re-compute
                if self._numPatterns == self._Memory.shape[0]:
                    # Double the size of the memory
                    self._doubleMemoryNumRows()

                if not self._specificIndexTraining:
                    # Normal learning - append the new input vector
                    self._Memory[self._numPatterns] = inputPattern
                    self._numPatterns += 1
                    self._categoryList.append(int(inputCategory))
                else:
                    # Specific index training mode - insert vector in specified slot
                    vectorIndex = self._nextTrainingIndices.pop(0)
                    while vectorIndex >= self._Memory.shape[0]:
                        self._doubleMemoryNumRows()
                    self._Memory[vectorIndex] = inputPattern
                    self._numPatterns = max(self._numPatterns, vectorIndex + 1)
                    if vectorIndex >= len(self._categoryList):
                        # Pad the category list with -1 up to vectorIndex.
                        self._categoryList += [-1] * (
                            vectorIndex - len(self._categoryList) + 1)
                    self._categoryList[vectorIndex] = int(inputCategory)

                # Set _M to the "active" part of _Memory
                self._M = self._Memory[0:self._numPatterns]

                if partitionId is not None:
                    # Unreachable given the assert above; kept for parity with
                    # other versions of this method.
                    self._partitionIdList.append(partitionId)

        #---------------------------------------------------------------------------------
        # Sparse vectors
        else:

            # If the input was given in sparse form, convert it to dense if necessary
            if isSparse > 0 and (self._vt is not None or self.distThreshold > 0 \
                    or self.numSVDDims is not None or self.numSVDSamples is not None \
                    or self.numWinners > 0):
                denseInput = numpy.zeros(isSparse)
                denseInput[inputPattern] = 1.0
                inputPattern = denseInput
                isSparse = 0

            # Get the input width
            if isSparse > 0:
                inputWidth = isSparse
            else:
                inputWidth = len(inputPattern)

            # Allocate storage if this is the first training vector
            if self._Memory is None:
                self._Memory = NearestNeighbor(0, inputWidth)

            # Support SVD if it is on
            if self._vt is not None:
                inputPattern = numpy.dot(self._vt, inputPattern - self._mean)

            # Threshold the input, zeroing out entries that are too close to 0.
            #  This is only done if we are given a dense input.
            # NOTE(review): if isSparse stays > 0 here (none of the densifying
            # conditions above applied), thresholdedInput is never assigned and
            # the cellsPerCol / replaceDuplicates branches below would raise
            # NameError - confirm callers never combine those options with a
            # sparse input.
            if isSparse == 0:
                thresholdedInput = self._sparsifyVector(inputPattern, True)
            addRow = True

            # If given the layout of the cells, then turn on the logic that stores
            # only the start cell for bursting columns.
            if self.cellsPerCol >= 1:
                # NOTE(review): numCols is never used below.
                numCols = thresholdedInput.size / self.cellsPerCol
                burstingCols = thresholdedInput.reshape(
                    -1, self.cellsPerCol).min(axis=1).nonzero()[0]
                for col in burstingCols:
                    # Keep only the first cell of each bursting column.
                    thresholdedInput[(col * self.cellsPerCol) +
                                     1:(col * self.cellsPerCol) +
                                     self.cellsPerCol] = 0

            # Don't learn entries that are too close to existing entries.
            if self._Memory.nRows() > 0:
                dist = None
                # if this vector is a perfect match for one we already learned, then
                #  replace the category - it may have changed with online learning on.
                if self.replaceDuplicates:
                    dist = self._calcDistance(thresholdedInput, distanceNorm=1)
                    if dist.min() == 0:
                        rowIdx = dist.argmin()
                        self._categoryList[rowIdx] = int(inputCategory)
                        if self.fixedCapacity:
                            self._categoryRecencyList[rowIdx] = rowID
                        addRow = False

                # Don't add this vector if it matches closely with another we already
                #  added
                if self.distThreshold > 0:
                    if dist is None or self.distanceNorm != 1:
                        dist = self._calcDistance(thresholdedInput)
                    minDist = dist.min()
                    addRow = (minDist >= self.distThreshold)
                    if not addRow:
                        if self.fixedCapacity:
                            # Refresh the matched row's recency even when skipping.
                            rowIdx = dist.argmin()
                            self._categoryRecencyList[rowIdx] = rowID

            # Add the new vector to our storage
            if addRow:
                self._protoSizes = None  # need to re-compute
                if isSparse == 0:
                    self._Memory.addRow(thresholdedInput)
                else:
                    # Still-sparse input: store the non-zero indices with value 1.
                    self._Memory.addRowNZ(inputPattern,
                                          [1] * len(inputPattern))
                self._numPatterns += 1
                self._categoryList.append(int(inputCategory))
                if partitionId is not None:
                    self._partitionIdList.append(partitionId)
                if self.fixedCapacity:
                    self._categoryRecencyList.append(rowID)
                    # Over capacity: evict the least recently used pattern.
                    if self._numPatterns > self.maxStoredPatterns and \
                      self.maxStoredPatterns > 0:
                        leastRecentlyUsedPattern = numpy.argmin(
                            self._categoryRecencyList)
                        self._Memory.deleteRow(leastRecentlyUsedPattern)
                        self._categoryList.pop(leastRecentlyUsedPattern)
                        self._categoryRecencyList.pop(leastRecentlyUsedPattern)
                        self._numPatterns -= 1


        # Once enough samples have been accumulated, compute the SVD projection.
        if self.numSVDDims is not None and self.numSVDSamples is not None \
              and self._numPatterns == self.numSVDSamples:
            self.computeSVD()

        return self._numPatterns
Example #3
0
  def read(cls, proto):
    """Deserialize a KNNClassifier from a proto reader.

    Builds the instance without running __init__, copies every public
    parameter straight off the proto, then restores the serialized private
    state (prototype memory, category list, SVD projection, etc.).
    Raises RuntimeError when the serialized version does not match.
    """
    if proto.version != KNNCLASSIFIER_VERSION:
      raise RuntimeError("Invalid KNNClassifier Version")

    knn = object.__new__(cls)

    # Public parameters share their names with the proto fields, so they can
    # be copied over mechanically.
    for field in ("version", "k", "exact", "distanceNorm", "distanceMethod",
                  "distThreshold", "doBinarization", "binarizationThreshold",
                  "useSparseMemory", "sparseThreshold", "relativeThreshold",
                  "numWinners", "numSVDSamples", "numSVDDims", "fractionOfMax",
                  "verbosity", "maxStoredPatterns", "replaceDuplicates",
                  "cellsPerCol", "minSparsity"):
      setattr(knn, field, getattr(proto, field))

    knn._adaptiveSVDDims = (knn.numSVDDims == "adaptive")

    # Reset private state, then restore whatever was serialized.
    knn.clear()

    if proto.memory is not None:
      memoryKind = proto.memory.which()
      if memoryKind == "nearestNeighbor":
        knn._Memory = NearestNeighbor()
        knn._Memory.read(proto.memory.nearestNeighbor)
      elif memoryKind == "ndarray":
        knn._Memory = numpy.array(proto.memory.ndarray, dtype=numpy.float64)

    knn._numPatterns = proto.numPatterns

    if proto.m is not None:
      knn._M = numpy.array(proto.m, dtype=numpy.float64)

    if proto.categoryList is not None:
      knn._categoryList = list(proto.categoryList)

    if proto.partitionIdList is not None:
      knn._partitionIdList = list(proto.partitionIdList)
      knn._rebuildPartitionIdMap(knn._partitionIdList)

    knn._iterationIdx = proto.iterationIdx
    knn._finishedLearning = proto.finishedLearning

    # SVD byproducts are restored as float32 arrays when present.
    for srcName, dstName in (("s", "_s"), ("vt", "_vt"), ("mean", "_mean")):
      serialized = getattr(proto, srcName)
      if serialized is not None:
        setattr(knn, dstName, numpy.array(serialized, dtype=numpy.float32))

    return knn