Code Example #1
File: nupic_random_test.py  Project: Aaron-Gao/nupic
  def testSample(self):
    r = Random(42)
    population = numpy.array([1, 2, 3, 4], dtype="uint32")
    choices = numpy.zeros([2], dtype="uint32")

    r.sample(population, choices)

    self.assertEqual(choices[0], 2)
    self.assertEqual(choices[1], 4)
Code Example #2
  def testSample(self):
    r = Random(42)
    population = numpy.array([1, 2, 3, 4], dtype="uint32")
    choices = numpy.zeros([2], dtype="uint32")

    r.sample(population, choices)

    self.assertEqual(choices[0], 2)
    self.assertEqual(choices[1], 1)
Code Example #3
File: nupic_random_test.py  Project: Aaron-Gao/nupic
  def testSampleNone(self):
    r = Random(42)
    population = numpy.array([1, 2, 3, 4], dtype="uint32")
    choices = numpy.zeros([0], dtype="uint32")

    # Just make sure there is no exception thrown.
    r.sample(population, choices)

    self.assertEqual(choices.size, 0)
Code Example #4
  def testSampleNone(self):
    r = Random(42)
    population = numpy.array([1, 2, 3, 4], dtype="uint32")
    choices = numpy.zeros([0], dtype="uint32")

    # Just make sure there is no exception thrown.
    r.sample(population, choices)

    self.assertEqual(choices.size, 0)
Code Example #5
    def testSampleNone(self):
        r = Random(42)
        population = numpy.array([1, 2, 3, 4], dtype="uint32")

        # Just make sure there is no exception thrown.
        choices = r.sample(population, 0)

        self.assertEqual(len(choices), 0)
Code Example #6
    def testSampleAll(self):
        r = Random(42)
        population = numpy.array([1, 2, 3, 4], dtype="uint32")

        choices = r.sample(population, 4)

        self.assertEqual(choices[0], 2)
        self.assertEqual(choices[1], 1)
        self.assertEqual(choices[2], 4)
        self.assertEqual(choices[3], 3)
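
Note: the tests above exercise two variants of the NuPIC Random.sample API. The older bindings fill a caller-allocated numpy buffer in place (Code Examples #1-#4), while the newer ones take a sample size and return the choices (Code Examples #5-#6). A minimal sketch contrasting the two styles, assuming the usual nupic.bindings.math import path:

import numpy
from nupic.bindings.math import Random  # import path assumed

r = Random(42)
population = numpy.array([1, 2, 3, 4], dtype="uint32")

# Older-style API: preallocate the output buffer; sample() fills it in place.
choices = numpy.zeros([2], dtype="uint32")
r.sample(population, choices)

# Newer-style API: pass the sample size and receive the choices back.
# choices = r.sample(population, 2)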
Code Example #7
File: sdr_category.py  Project: alfonsokim/nupic
class SDRCategoryEncoder(Encoder):
  """
  Encodes a list of discrete categories (described by strings) that aren't
  related to each other.

  Each encoding is an SDR in which w out of n bits are turned on.

  Unknown categories are encoded as a single value.

  Internally we use a :class:`.ScalarEncoder` with a radius of 1, but since we
  only encode integers, we never get mixture outputs.

  The :class:`.CategoryEncoder` uses a different method to encode categories.

  :param categoryList: list of discrete string categories, if ``None`` then
                       categories will automatically be added as they are
                       encountered
  :param forced: if True, skip checks for parameters' settings; see
                 :class:`.ScalarEncoder` for details. (default False)
  """


  def __init__(self, n, w, categoryList = None, name="category", verbosity=0,
               encoderSeed=1, forced=False):
    self.n = n
    self.w = w

    self._learningEnabled = True

    # initialize the random number generators
    self._seed(encoderSeed)

    if not forced:
      # -- this is just to catch bad parameter choices
      if (self.n/self.w) < 2: # w is 50% of total len
        raise ValueError("Number of ON bits in SDR (%d) must be much smaller than "
                           "the output width (%d)" % (self.w, self.n))

      # Another arbitrary cutoff to catch likely mistakes
      if self.w < 21:
        raise ValueError("Number of bits in the SDR (%d) must be greater than 2, and should be >= 21, pass forced=True to init() to override this check"
                           % self.w)

    self._initOverlap()

    self.verbosity = verbosity

    self.description = [(name, 0)]
    self.name = name

    self.categoryToIndex = dict()
    self.ncategories = 0
    self.categories = list()
    self.sdrs = None

    # Always include an 'unknown' category for
    # edge cases

    self._addCategory("<UNKNOWN>")
    if categoryList is None:
      self._learningEnabled = True
    else:
      self._learningEnabled = False
      for category in categoryList:
        self._addCategory(category)
      assert self.ncategories == len(categoryList) + 1

    # Not used by this class. Used for decoding (scalarsToStr())
    self.encoders = None

    # This matrix is used for the topDownCompute. We build it the first time
    #  topDownCompute is called
    self._topDownMappingM = None
    self._topDownValues = None


  def _initOverlap(self):
    # Calculate average overlap of SDRs for decoding
    # Density is fraction of bits on, and it is also the
    # probability that any individual bit is on.
    density = float(self.w) / self.n
    self.averageOverlap =  self.w * density
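    # (Why w * density? Each of the w ON bits of one random w-of-n SDR is
    # also ON in a second, independent SDR with probability w/n = density,
    # so the expected overlap of two such SDRs is w * density = w**2 / n.)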
    # We can do a better job of calculating the threshold. For now, just
    # something quick and dirty, which is the midway point between average
    # and full overlap. averageOverlap is always < w,  so the threshold
    # is always < w.
    self.thresholdOverlap =  int((self.averageOverlap + self.w)/2)
    #  1.25 -- too sensitive for decode test, so make it less sensitive
    if self.thresholdOverlap < self.w - 3:
      self.thresholdOverlap = self.w - 3


  def __setstate__(self, state):
    self.__dict__.update(state)

    # Initialize self.random as an instance of NupicRandom derived from the
    # previous numpy random state
    randomState = state["random"]
    if isinstance(randomState, numpy.random.mtrand.RandomState):
      self.random = NupicRandom(randomState.randint(sys.maxint))


  def _seed(self, seed=-1):
    """
    Initialize the random seed
    """
    if seed != -1:
      self.random = NupicRandom(seed)
    else:
      self.random = NupicRandom()


  def getDecoderOutputFieldTypes(self):
    """ [Encoder class virtual method override]
    """
    # TODO: change back to string meta-type after the decoding logic is fixed
    #       to output strings instead of internal index values.
    return (FieldMetaType.string,)
    #return (FieldMetaType.integer,)


  def _addCategory(self, category):
    if category in self.categories:
      raise RuntimeError("Attempt to add encoder category '%s' "
                         "that already exists" % category)

    if self.sdrs is None:
      assert self.ncategories == 0
      assert len(self.categoryToIndex) == 0
      # Initial allocation -- 16 rows
      self.sdrs = numpy.zeros((16, self.n), dtype='uint8')
    elif self.ncategories > self.sdrs.shape[0] - 2:
      # Preallocated sdrs are used up. Double our size
      currentMax = self.sdrs.shape[0]
      newsdrs = numpy.zeros((currentMax * 2, self.n), dtype='uint8')
      newsdrs[0:currentMax] = self.sdrs[0:currentMax]
      self.sdrs = newsdrs

    newrep = self._newRep()
    self.sdrs[self.ncategories] = newrep
    self.categories.append(category)
    self.categoryToIndex[category] = self.ncategories
    self.ncategories += 1
    self._topDownMappingM = None


  def _newRep(self):
    """Generate a new and unique representation. Returns a numpy array
    of shape (n,). """
    maxAttempts = 1000

    for _ in xrange(maxAttempts):
      foundUnique = True
      population = numpy.arange(self.n, dtype=numpy.uint32)
      choices = numpy.arange(self.w, dtype=numpy.uint32)
      oneBits = sorted(self.random.sample(population, choices))
      sdr =  numpy.zeros(self.n, dtype='uint8')
      sdr[oneBits] = 1
      for i in xrange(self.ncategories):
        if (sdr == self.sdrs[i]).all():
          foundUnique = False
          break
      if foundUnique:
        break
    if not foundUnique:
      raise RuntimeError("Error, could not find unique pattern %d after "
                         "%d attempts" % (self.ncategories, maxAttempts))
    return sdr


  def getWidth(self):
    return self.n


  def getDescription(self):
    return self.description


  def getScalars(self, input):
    """ See method description in base.py """
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
        return numpy.array([0])

    index = self.categoryToIndex.get(input, None)
    if index is None:
      if self._learningEnabled:
        self._addCategory(input)
        index = self.ncategories - 1
      else:
        # if not found, we encode category 0
        index = 0

    return numpy.array([index])


  def getBucketIndices(self, input):
    """ See method description in base.py """

    # For category encoder, the "scalar" we map to each category is the
    #  bucket index
    return self.getScalars(input)


  def encodeIntoArray(self, input, output):
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      output[0:self.n] = 0
      index = 0
    else:
      index = self.getBucketIndices(input)[0]
      output[0:self.n] = self.sdrs[index,:]

    if self.verbosity >= 2:
      print "input:", input, "index:", index, "output:", output
      print "decoded:", self.decodedToStr(self.decode(output))


  def decode(self, encoded, parentFieldName=''):
    """ See the function description in base.py
    """

    assert (encoded[0:self.n] <= 1.0).all()

    resultString =  ""
    resultRanges = []

    overlaps =  (self.sdrs * encoded[0:self.n]).sum(axis=1)

    if self.verbosity >= 2:
      print "Overlaps for decoding:"
      for i in xrange(0, self.ncategories):
        print "%d %s" % (overlaps[i], self.categories[i])

    matchingCategories =  (overlaps > self.thresholdOverlap).nonzero()[0]

    for index in matchingCategories:
      if resultString != "":
        resultString += " "
      resultString +=  str(self.categories[index])
      resultRanges.append([int(index),int(index)])

    if parentFieldName != '':
      fieldName = "%s.%s" % (parentFieldName, self.name)
    else:
      fieldName = self.name
    return ({fieldName: (resultRanges, resultString)}, [fieldName])


  def _getTopDownMapping(self):
    """ Return the interal _topDownMappingM matrix used for handling the
    bucketInfo() and topDownCompute() methods. This is a matrix, one row per
    category (bucket) where each row contains the encoded output for that
    category.
    """

    # -------------------------------------------------------------------------
    # Do we need to build up our reverse mapping table?
    if self._topDownMappingM is None:

      # Each row represents an encoded output pattern
      self._topDownMappingM = SM32(self.ncategories, self.n)

      outputSpace = numpy.zeros(self.n, dtype=GetNTAReal())
      for i in xrange(self.ncategories):
        self.encodeIntoArray(self.categories[i], outputSpace)
        self._topDownMappingM.setRowFromDense(i, outputSpace)

    return self._topDownMappingM


  def getBucketValues(self):
    """ See the function description in base.py """

    return self.categories


  def getBucketInfo(self, buckets):
    """ See the function description in base.py
    """

    if self.ncategories==0:
      return 0

    topDownMappingM = self._getTopDownMapping()

    categoryIndex = buckets[0]
    category = self.categories[categoryIndex]
    encoding = topDownMappingM.getRow(categoryIndex)

    return [EncoderResult(value=category, scalar=categoryIndex,
                          encoding=encoding)]


  def topDownCompute(self, encoded):
    """ See the function description in base.py
    """

    if self.ncategories==0:
      return 0

    topDownMappingM = self._getTopDownMapping()

    categoryIndex = topDownMappingM.rightVecProd(encoded).argmax()
    category = self.categories[categoryIndex]
    encoding = topDownMappingM.getRow(categoryIndex)

    return EncoderResult(value=category, scalar=categoryIndex, encoding=encoding)


  def closenessScores(self, expValues, actValues, fractional=True):
    """ See the function description in base.py

    kwargs will have the keyword "fractional", which is ignored by this encoder
    """

    expValue = expValues[0]
    actValue = actValues[0]

    if expValue == actValue:
      closeness = 1.0
    else:
      closeness = 0.0

    if not fractional:
      closeness = 1.0 - closeness

    return numpy.array([closeness])


  @classmethod
  def getSchema(cls):
    return SDRCategoryEncoderProto

  @classmethod
  def read(cls, proto):
    encoder = object.__new__(cls)

    encoder.n = proto.n
    encoder.w = proto.w
    encoder.random = NupicRandom()
    encoder.random.read(proto.random)
    encoder.verbosity = proto.verbosity
    encoder.name = proto.name
    encoder.description = [(proto.name, 0)]
    encoder.categories = list(proto.categories)
    encoder.sdrs = numpy.array(proto.sdrs, dtype=numpy.uint8)

    encoder.categoryToIndex = {category:index
                               for index, category
                               in enumerate(encoder.categories)}
    encoder.ncategories = len(encoder.categories)
    encoder._learningEnabled = proto.learningEnabled
    encoder._initOverlap()
    encoder._topDownMappingM = None
    encoder._topDownValues = None
    encoder.encoders = None

    return encoder


  def write(self, proto):
    proto.n = self.n
    proto.w = self.w
    self.random.write(proto.random)
    proto.verbosity = self.verbosity
    proto.name = self.name
    proto.categories = self.categories
    proto.sdrs = self.sdrs.tolist()
    proto.learningEnabled = self._learningEnabled
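
A minimal usage sketch for the encoder above, assuming the conventional nupic.encoders import path; the parameter values and the "cat" category are illustrative only:

import numpy
from nupic.encoders.sdr_category import SDRCategoryEncoder  # import path assumed

encoder = SDRCategoryEncoder(n=100, w=21)           # learning enabled, no preset categories
output = numpy.zeros(encoder.getWidth(), dtype=numpy.uint8)
encoder.encodeIntoArray("cat", output)              # first sighting adds the category
assert output.sum() == 21                           # exactly w bits are ON
assert encoder.getScalars("cat")[0] == 1            # index 0 is reserved for <UNKNOWN>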
Code Example #8
File: sdr_category.py  Project: pastorenick/nupic
class SDRCategoryEncoder(Encoder):
  """
  Encodes a list of discrete categories (described by strings) that aren't
  related to each other.

  Each encoding is an SDR in which w out of n bits are turned on.

  Unknown categories are encoded as a single value.

  Internally we use a :class:`.ScalarEncoder` with a radius of 1, but since we
  only encode integers, we never get mixture outputs.

  The :class:`.CategoryEncoder` uses a different method to encode categories.

  :param categoryList: list of discrete string categories, if ``None`` then
                       categories will automatically be added as they are
                       encountered
  :param forced: if True, skip checks for parameters' settings; see
                 :class:`.ScalarEncoder` for details. (default False)
  """


  def __init__(self, n, w, categoryList = None, name="category", verbosity=0,
               encoderSeed=1, forced=False):
    self.n = n
    self.w = w

    self._learningEnabled = True

    # initialize the random number generators
    self._seed(encoderSeed)

    if not forced:
      # -- this is just to catch bad parameter choices
      if (self.n/self.w) < 2: # w is 50% of total len
        raise ValueError("Number of ON bits in SDR (%d) must be much smaller than "
                           "the output width (%d)" % (self.w, self.n))

      # Another arbitrary cutoff to catch likely mistakes
      if self.w < 21:
        raise ValueError("Number of bits in the SDR (%d) must be greater than 2, and should be >= 21, pass forced=True to init() to override this check"
                           % self.w)

    self._initOverlap()

    self.verbosity = verbosity

    self.description = [(name, 0)]
    self.name = name

    self.categoryToIndex = dict()
    self.ncategories = 0
    self.categories = list()
    self.sdrs = None

    # Always include an 'unknown' category for
    # edge cases

    self._addCategory("<UNKNOWN>")
    if categoryList is None:
      self._learningEnabled = True
    else:
      self._learningEnabled = False
      for category in categoryList:
        self._addCategory(category)
      assert self.ncategories == len(categoryList) + 1

    # Not used by this class. Used for decoding (scalarsToStr())
    self.encoders = None

    # This matrix is used for the topDownCompute. We build it the first time
    #  topDownCompute is called
    self._topDownMappingM = None
    self._topDownValues = None


  def _initOverlap(self):
    # Calculate average overlap of SDRs for decoding
    # Density is fraction of bits on, and it is also the
    # probability that any individual bit is on.
    density = float(self.w) / self.n
    self.averageOverlap =  self.w * density
    # We can do a better job of calculating the threshold. For now, just
    # something quick and dirty, which is the midway point between average
    # and full overlap. averageOverlap is always < w,  so the threshold
    # is always < w.
    self.thresholdOverlap =  int((self.averageOverlap + self.w)/2)
    #  1.25 -- too sensitive for decode test, so make it less sensitive
    if self.thresholdOverlap < self.w - 3:
      self.thresholdOverlap = self.w - 3


  def __setstate__(self, state):
    self.__dict__.update(state)

    # Initialize self.random as an instance of NupicRandom derived from the
    # previous numpy random state
    randomState = state["random"]
    if isinstance(randomState, numpy.random.mtrand.RandomState):
      self.random = NupicRandom(randomState.randint(sys.maxsize))


  def _seed(self, seed=-1):
    """
    Initialize the random seed
    """
    if seed != -1:
      self.random = NupicRandom(seed)
    else:
      self.random = NupicRandom()


  def getDecoderOutputFieldTypes(self):
    """ [Encoder class virtual method override]
    """
    # TODO: change back to string meta-type after the decoding logic is fixed
    #       to output strings instead of internal index values.
    return (FieldMetaType.string,)
    #return (FieldMetaType.integer,)


  def _addCategory(self, category):
    if category in self.categories:
      raise RuntimeError("Attempt to add encoder category '%s' "
                         "that already exists" % category)

    if self.sdrs is None:
      assert self.ncategories == 0
      assert len(self.categoryToIndex) == 0
      # Initial allocation -- 16 rows
      self.sdrs = numpy.zeros((16, self.n), dtype='uint8')
    elif self.ncategories > self.sdrs.shape[0] - 2:
      # Preallocated sdrs are used up. Double our size
      currentMax = self.sdrs.shape[0]
      newsdrs = numpy.zeros((currentMax * 2, self.n), dtype='uint8')
      newsdrs[0:currentMax] = self.sdrs[0:currentMax]
      self.sdrs = newsdrs

    newrep = self._newRep()
    self.sdrs[self.ncategories] = newrep
    self.categories.append(category)
    self.categoryToIndex[category] = self.ncategories
    self.ncategories += 1
    self._topDownMappingM = None


  def _newRep(self):
    """Generate a new and unique representation. Returns a numpy array
    of shape (n,). """
    maxAttempts = 1000

    for _ in range(maxAttempts):
      foundUnique = True
      population = numpy.arange(self.n, dtype=numpy.uint32)
      choices = numpy.arange(self.w, dtype=numpy.uint32)
      oneBits = sorted(self.random.sample(population, choices))
      sdr =  numpy.zeros(self.n, dtype='uint8')
      sdr[oneBits] = 1
      for i in range(self.ncategories):
        if (sdr == self.sdrs[i]).all():
          foundUnique = False
          break
      if foundUnique:
        break
    if not foundUnique:
      raise RuntimeError("Error, could not find unique pattern %d after "
                         "%d attempts" % (self.ncategories, maxAttempts))
    return sdr


  def getWidth(self):
    return self.n


  def getDescription(self):
    return self.description


  def getScalars(self, input):
    """ See method description in base.py """
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
        return numpy.array([0])

    index = self.categoryToIndex.get(input, None)
    if index is None:
      if self._learningEnabled:
        self._addCategory(input)
        index = self.ncategories - 1
      else:
        # if not found, we encode category 0
        index = 0

    return numpy.array([index])


  def getBucketIndices(self, input):
    """ See method description in base.py """

    # For category encoder, the "scalar" we map to each category is the
    #  bucket index
    return self.getScalars(input)


  def encodeIntoArray(self, input, output):
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      output[0:self.n] = 0
      index = 0
    else:
      index = self.getBucketIndices(input)[0]
      output[0:self.n] = self.sdrs[index,:]

    if self.verbosity >= 2:
      print("input:", input, "index:", index, "output:", output)
      print("decoded:", self.decodedToStr(self.decode(output)))


  def decode(self, encoded, parentFieldName=''):
    """ See the function description in base.py
    """

    assert (encoded[0:self.n] <= 1.0).all()

    resultString =  ""
    resultRanges = []

    overlaps =  (self.sdrs * encoded[0:self.n]).sum(axis=1)

    if self.verbosity >= 2:
      print("Overlaps for decoding:")
      for i in range(0, self.ncategories):
        print("%d %s" % (overlaps[i], self.categories[i]))

    matchingCategories =  (overlaps > self.thresholdOverlap).nonzero()[0]

    for index in matchingCategories:
      if resultString != "":
        resultString += " "
      resultString +=  str(self.categories[index])
      resultRanges.append([int(index),int(index)])

    if parentFieldName != '':
      fieldName = "%s.%s" % (parentFieldName, self.name)
    else:
      fieldName = self.name
    return ({fieldName: (resultRanges, resultString)}, [fieldName])


  def _getTopDownMapping(self):
    """ Return the interal _topDownMappingM matrix used for handling the
    bucketInfo() and topDownCompute() methods. This is a matrix, one row per
    category (bucket) where each row contains the encoded output for that
    category.
    """

    # -------------------------------------------------------------------------
    # Do we need to build up our reverse mapping table?
    if self._topDownMappingM is None:

      # Each row represents an encoded output pattern
      self._topDownMappingM = SM32(self.ncategories, self.n)

      outputSpace = numpy.zeros(self.n, dtype=GetNTAReal())
      for i in range(self.ncategories):
        self.encodeIntoArray(self.categories[i], outputSpace)
        self._topDownMappingM.setRowFromDense(i, outputSpace)

    return self._topDownMappingM


  def getBucketValues(self):
    """ See the function description in base.py """

    return self.categories


  def getBucketInfo(self, buckets):
    """ See the function description in base.py
    """

    if self.ncategories==0:
      return 0

    topDownMappingM = self._getTopDownMapping()

    categoryIndex = buckets[0]
    category = self.categories[categoryIndex]
    encoding = topDownMappingM.getRow(categoryIndex)

    return [EncoderResult(value=category, scalar=categoryIndex,
                          encoding=encoding)]


  def topDownCompute(self, encoded):
    """ See the function description in base.py
    """

    if self.ncategories==0:
      return 0

    topDownMappingM = self._getTopDownMapping()

    categoryIndex = topDownMappingM.rightVecProd(encoded).argmax()
    category = self.categories[categoryIndex]
    encoding = topDownMappingM.getRow(categoryIndex)

    return EncoderResult(value=category, scalar=categoryIndex, encoding=encoding)


  def closenessScores(self, expValues, actValues, fractional=True):
    """ See the function description in base.py

    kwargs will have the keyword "fractional", which is ignored by this encoder
    """

    expValue = expValues[0]
    actValue = actValues[0]

    if expValue == actValue:
      closeness = 1.0
    else:
      closeness = 0.0

    if not fractional:
      closeness = 1.0 - closeness

    return numpy.array([closeness])


  @classmethod
  def read(cls, proto):
    encoder = object.__new__(cls)

    encoder.n = proto.n
    encoder.w = proto.w
    encoder.random = NupicRandom()
    encoder.random.read(proto.random)
    encoder.verbosity = proto.verbosity
    encoder.name = proto.name
    encoder.description = [(proto.name, 0)]
    encoder.categories = list(proto.categories)
    encoder.sdrs = numpy.array(proto.sdrs, dtype=numpy.uint8)

    encoder.categoryToIndex = {category:index
                               for index, category
                               in enumerate(encoder.categories)}
    encoder.ncategories = len(encoder.categories)
    encoder._learningEnabled = False
    encoder._initOverlap()

    return encoder


  def write(self, proto):
    proto.n = self.n
    proto.w = self.w
    self.random.write(proto.random)
    proto.verbosity = self.verbosity
    proto.name = self.name
    proto.categories = self.categories
    proto.sdrs = self.sdrs.tolist()

  def getSchema(self):
    raise NotImplementedError
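
A hedged round-trip sketch for the write()/read() pair above; the capnp schema import path is an assumption (it varies across nupic versions):

from nupic.encoders.sdr_category_capnp import SDRCategoryEncoderProto  # assumed path

proto = SDRCategoryEncoderProto.new_message()
encoder.write(proto)                        # serialize the encoder state
restored = SDRCategoryEncoder.read(proto)   # rebuild an encoder from the message
assert restored.categories == encoder.categories
assert (restored.n, restored.w) == (encoder.n, encoder.w)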
Code Example #9
class ColumnPooler(object):
    """,
  This class constitutes a temporary implementation for a cross-column pooler.
  The implementation goal of this class is to prove basic properties before
  creating a cleaner implementation.
  """
    def __init__(
            self,
            inputWidth,
            lateralInputWidths=(),
            cellCount=4096,
            sdrSize=40,
            onlineLearning=False,
            maxSdrSize=None,
            minSdrSize=None,

            # Proximal
            synPermProximalInc=0.1,
            synPermProximalDec=0.001,
            initialProximalPermanence=0.6,
            sampleSizeProximal=20,
            minThresholdProximal=10,
            connectedPermanenceProximal=0.50,
            predictedInhibitionThreshold=20,

            # Distal
            synPermDistalInc=0.1,
            synPermDistalDec=0.001,
            initialDistalPermanence=0.6,
            sampleSizeDistal=20,
            activationThresholdDistal=13,
            connectedPermanenceDistal=0.50,
            inertiaFactor=1.,
            seed=42):
        """
    Parameters:
    ----------------------------
    @param  inputWidth (int)
            The number of bits in the feedforward input

    @param  lateralInputWidths (list of ints)
            The number of bits in each lateral input

    @param  sdrSize (int)
            The number of active cells in an object SDR

    @param  onlineLearning (Bool)
            Whether or not the column pooler should learn in online mode.

    @param  maxSdrSize (int)
            The maximum SDR size for learning.  If the column pooler has more
            than this many cells active, it will refuse to learn.  This serves
            to stop the pooler from learning when it is uncertain of what object
            it is sensing.

    @param  minSdrSize (int)
            The minimum SDR size for learning.  If the column pooler has fewer
            than this many active cells, it will create a new representation
            and learn that instead.  This serves to create separate
            representations for different objects and sequences.

            If online learning is enabled, this parameter should be at least
            inertiaFactor*sdrSize.  Otherwise, two different objects may be
            incorrectly inferred to be the same, as SDRs may still be active
            enough to learn even after inertial decay.

    @param  synPermProximalInc (float)
            Permanence increment for proximal synapses

    @param  synPermProximalDec (float)
            Permanence decrement for proximal synapses

    @param  initialProximalPermanence (float)
            Initial permanence value for proximal synapses

    @param  sampleSizeProximal (int)
            Number of proximal synapses a cell should grow to each feedforward
            pattern, or -1 to connect to every active bit

    @param  minThresholdProximal (int)
            Number of active synapses required for a cell to have feedforward
            support

    @param  connectedPermanenceProximal (float)
            Permanence required for a proximal synapse to be connected

    @param  predictedInhibitionThreshold (int)
            How much predicted input must be present for inhibitory behavior
            to be triggered.  Only has effects if onlineLearning is true.

    @param  synPermDistalInc (float)
            Permanence increment for distal synapses

    @param  synPermDistalDec (float)
            Permanence decrement for distal synapses

    @param  sampleSizeDistal (int)
            Number of distal synapses a cell should grow to each lateral
            pattern, or -1 to connect to every active bit

    @param  initialDistalPermanence (float)
            Initial permanence value for distal synapses

    @param  activationThresholdDistal (int)
            Number of active synapses required to activate a distal segment

    @param  connectedPermanenceDistal (float)
            Permanence required for a distal synapse to be connected

    @param  inertiaFactor (float)
            The proportion of previously active cells that remain
            active in the next timestep due to inertia (in the absence of
            inhibition).  If onlineLearning is enabled, should be at most
            1 - learningTolerance, or representations may incorrectly become
            mixed.

    @param  seed (int)
            Random number generator seed
    """

        assert maxSdrSize is None or maxSdrSize >= sdrSize
        assert minSdrSize is None or minSdrSize <= sdrSize

        self.inputWidth = inputWidth
        self.cellCount = cellCount
        self.sdrSize = sdrSize
        self.onlineLearning = onlineLearning
        if maxSdrSize is None:
            self.maxSdrSize = sdrSize
        else:
            self.maxSdrSize = maxSdrSize
        if minSdrSize is None:
            self.minSdrSize = sdrSize
        else:
            self.minSdrSize = minSdrSize
        self.synPermProximalInc = synPermProximalInc
        self.synPermProximalDec = synPermProximalDec
        self.initialProximalPermanence = initialProximalPermanence
        self.connectedPermanenceProximal = connectedPermanenceProximal
        self.sampleSizeProximal = sampleSizeProximal
        self.minThresholdProximal = minThresholdProximal
        self.predictedInhibitionThreshold = predictedInhibitionThreshold
        self.synPermDistalInc = synPermDistalInc
        self.synPermDistalDec = synPermDistalDec
        self.initialDistalPermanence = initialDistalPermanence
        self.connectedPermanenceDistal = connectedPermanenceDistal
        self.sampleSizeDistal = sampleSizeDistal
        self.activationThresholdDistal = activationThresholdDistal
        self.inertiaFactor = inertiaFactor

        self.activeCells = numpy.empty(0, dtype="uint32")
        self._random = Random(seed)

        # These sparse matrices will hold the synapses for each segment.
        # Each row represents one segment on a cell, so each cell potentially has
        # 1 proximal segment and 1+len(lateralInputWidths) distal segments.
        self.proximalPermanences = SparseMatrix(cellCount, inputWidth)
        self.internalDistalPermanences = SparseMatrix(cellCount, cellCount)
        self.distalPermanences = tuple(
            SparseMatrix(cellCount, n) for n in lateralInputWidths)

        self.useInertia = True

    def compute(
        self,
        feedforwardInput=(),
        lateralInputs=(),
        feedforwardGrowthCandidates=None,
        learn=True,
        predictedInput=None,
    ):
        """
    Runs one time step of the column pooler algorithm.

    @param  feedforwardInput (sequence)
            Sorted indices of active feedforward input bits

    @param  lateralInputs (list of sequences)
            For each lateral layer, a list of sorted indices of active lateral
            input bits

    @param  feedforwardGrowthCandidates (sequence or None)
            Sorted indices of feedforward input bits that active cells may grow
            new synapses to. If None, the entire feedforwardInput is used.

    @param  learn (bool)
            If True, we are learning a new object

    @param predictedInput (sequence)
           Sorted indices of predicted cells in the TM layer.
    """

        if feedforwardGrowthCandidates is None:
            feedforwardGrowthCandidates = feedforwardInput

        # inference step
        if not learn:  # inference
            self._computeInferenceMode(feedforwardInput, lateralInputs)

        # learning step
        elif not self.onlineLearning:
            self._computeLearningMode(feedforwardInput, lateralInputs,
                                      feedforwardGrowthCandidates)
        # online learning step
        else:
            if (predictedInput is not None and
                    len(predictedInput) > self.predictedInhibitionThreshold):
                predictedActiveInput = numpy.intersect1d(
                    feedforwardInput, predictedInput)
                predictedGrowthCandidates = numpy.intersect1d(
                    feedforwardGrowthCandidates, predictedInput)
                self._computeInferenceMode(predictedActiveInput, lateralInputs)
                self._computeLearningMode(predictedActiveInput, lateralInputs,
                                          feedforwardGrowthCandidates)
            elif not (self.minSdrSize <= len(self.activeCells) <= self.maxSdrSize):
                # If the pooler doesn't have a single representation, try to infer one,
                # before actually attempting to learn.
                self._computeInferenceMode(feedforwardInput, lateralInputs)
                self._computeLearningMode(feedforwardInput, lateralInputs,
                                          feedforwardGrowthCandidates)
            else:
                # If there isn't predicted input and we have a single SDR,
                # we are extending that representation and should just learn.
                self._computeLearningMode(feedforwardInput, lateralInputs,
                                          feedforwardGrowthCandidates)

    def _computeLearningMode(self, feedforwardInput, lateralInputs,
                             feedforwardGrowthCandidates):
        """
    Learning mode: we are learning a new object in an online fashion. If there
    is no prior activity, we randomly activate 'sdrSize' cells and create
    connections to incoming input. If there was prior activity, we maintain it.
    If we have a union, we simply do not learn at all.

    These cells will represent the object and learn distal connections to each
    other and to lateral cortical columns.

    Parameters:
    ----------------------------
    @param  feedforwardInput (sequence)
            Sorted indices of active feedforward input bits

    @param  lateralInputs (list of sequences)
            For each lateral layer, a list of sorted indices of active lateral
            input bits

    @param  feedforwardGrowthCandidates (sequence or None)
            Sorted indices of feedforward input bits that the active cells may
            grow new synapses to.  This is assumed to be the predicted active
            cells of the input layer.
    """
        prevActiveCells = self.activeCells

        # If there are not enough previously active cells, then we are no longer on
        # a familiar object.  Either our representation decayed due to the passage
        # of time (i.e. we moved somewhere else) or we were mistaken.  Either way,
        # create a new SDR and learn on it.
        # This case is the only way different object representations are created.
        # enforce the active cells in the output layer
        if len(self.activeCells) < self.minSdrSize:
            self.activeCells = _sampleRange(self._random,
                                            0,
                                            self.numberOfCells(),
                                            step=1,
                                            k=self.sdrSize)
            self.activeCells.sort()

        # If we have a union of cells active, don't learn.  This primarily affects
        # online learning.
        if len(self.activeCells) > self.maxSdrSize:
            return

        # Finally, now that we have decided which cells we should be learning on, do
        # the actual learning.
        if len(feedforwardInput) > 0:
            self._learn(self.proximalPermanences, self._random,
                        self.activeCells, feedforwardInput,
                        feedforwardGrowthCandidates, self.sampleSizeProximal,
                        self.initialProximalPermanence,
                        self.synPermProximalInc, self.synPermProximalDec,
                        self.connectedPermanenceProximal)

            # External distal learning cross column, segments
            for i, lateralInput in enumerate(lateralInputs):
                self._learn(self.distalPermanences[i], self._random,
                            self.activeCells, lateralInput, lateralInput,
                            self.sampleSizeDistal,
                            self.initialDistalPermanence,
                            self.synPermDistalInc, self.synPermDistalDec,
                            self.connectedPermanenceDistal)

            # Internal distal learning within the same column
            self._learn(self.internalDistalPermanences, self._random,
                        self.activeCells, prevActiveCells, prevActiveCells,
                        self.sampleSizeDistal, self.initialDistalPermanence,
                        self.synPermDistalInc, self.synPermDistalDec,
                        self.connectedPermanenceDistal)

    def _computeInferenceMode(self, feedforwardInput, lateralInputs):
        """
    Inference mode: if there is some feedforward activity, perform
    spatial pooling on it to recognize previously known objects, then use
    lateral activity to activate a subset of the cells with feedforward
    support. If there is no feedforward activity, use lateral activity to
    activate a subset of the previous active cells.

    Parameters:
    ----------------------------
    @param  feedforwardInput (sequence)
            Sorted indices of active feedforward input bits

    @param  lateralInputs (list of sequences)
            For each lateral layer, a list of sorted indices of active lateral
            input bits
    """

        prevActiveCells = self.activeCells

        # Calculate the feedforward supported cells
        overlaps = self.proximalPermanences.rightVecSumAtNZGteThresholdSparse(
            feedforwardInput, self.connectedPermanenceProximal)
        feedforwardSupportedCells = numpy.where(
            overlaps >= self.minThresholdProximal)[0]

        # Calculate the number of active segments on each cell
        numActiveSegmentsByCell = numpy.zeros(self.cellCount, dtype="int")
        overlaps = self.internalDistalPermanences.rightVecSumAtNZGteThresholdSparse(
            prevActiveCells, self.connectedPermanenceDistal)
        numActiveSegmentsByCell[
            overlaps >= self.activationThresholdDistal] += 1
        for i, lateralInput in enumerate(lateralInputs):
            overlaps = self.distalPermanences[i].rightVecSumAtNZGteThresholdSparse(
                lateralInput, self.connectedPermanenceDistal)
            numActiveSegmentsByCell[
                overlaps >= self.activationThresholdDistal] += 1

        chosenCells = []

        # First, activate the FF-supported cells that have the highest number
        # of lateral active segments (as long as it's not 0).
        if len(feedforwardSupportedCells) == 0:
            pass
        else:
            numActiveSegsForFFSuppCells = numActiveSegmentsByCell[
                feedforwardSupportedCells]

            # This loop will select the FF-supported AND laterally-active cells, in
            # order of descending lateral activation, until we exceed the sdrSize
            # quorum - but will exclude cells with 0 lateral active segments.
            ttop = numpy.max(numActiveSegsForFFSuppCells)
            while ttop > 0 and len(chosenCells) < self.sdrSize:
                chosenCells = numpy.union1d(
                    chosenCells, feedforwardSupportedCells[
                        numActiveSegsForFFSuppCells >= ttop])
                ttop -= 1

        # If we haven't filled the sdrSize quorum, add in inertial cells.
        if len(chosenCells) < self.sdrSize:
            if self.useInertia:
                prevCells = numpy.setdiff1d(prevActiveCells, chosenCells)
                inertialCap = int(len(prevCells) * self.inertiaFactor)
                if inertialCap > 0:
                    numActiveSegsForPrevCells = numActiveSegmentsByCell[
                        prevCells]
                    # We sort the previously-active cells by number of active lateral
                    # segments (this really helps).  We then activate them in order of
                    # descending lateral activation.
                    sortIndices = numpy.argsort(
                        numActiveSegsForPrevCells)[::-1]
                    prevCells = prevCells[sortIndices]
                    numActiveSegsForPrevCells = numActiveSegsForPrevCells[
                        sortIndices]

                    # We use inertiaFactor to limit the number of previously-active cells
                    # which can become active, forcing decay even if we are below quota.
                    prevCells = prevCells[:inertialCap]
                    numActiveSegsForPrevCells = numActiveSegsForPrevCells[:inertialCap]

                    # Activate groups of previously active cells by order of their lateral
                    # support until we either meet quota or run out of cells.
                    ttop = numpy.max(numActiveSegsForPrevCells)
                    while ttop >= 0 and len(chosenCells) < self.sdrSize:
                        chosenCells = numpy.union1d(
                            chosenCells,
                            prevCells[numActiveSegsForPrevCells >= ttop])
                        ttop -= 1

        # If we haven't filled the sdrSize quorum, add cells that have feedforward
        # support and no lateral support.
        discrepancy = self.sdrSize - len(chosenCells)
        if discrepancy > 0:
            remFFcells = numpy.setdiff1d(feedforwardSupportedCells,
                                         chosenCells)
            if len(remFFcells) > discrepancy:
                # Inhibit cells proportionally to the number of cells that have already
                # been chosen. If ~0 have been chosen activate ~all of the feedforward
                # supported cells. If ~sdrSize have been chosen, activate very few of
                # the feedforward supported cells.
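                # (Worked example: with sdrSize=40, discrepancy=30 and
                # len(remFFcells)=100, n = min(max(30, 100 * 30 / 40),
                # 100) = 75, so 75 of the remaining FF cells are sampled.)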
                n = min(
                    max(discrepancy,
                        len(remFFcells) * discrepancy / self.sdrSize),
                    len(remFFcells))
                selected = numpy.empty(n, dtype="uint32")
                self._random.sample(numpy.asarray(remFFcells, dtype="uint32"),
                                    selected)
                chosenCells = numpy.append(chosenCells, selected)
            else:
                chosenCells = numpy.append(chosenCells, remFFcells)

        chosenCells.sort()
        self.activeCells = numpy.asarray(chosenCells, dtype="uint32")

    def numberOfInputs(self):
        """
    Returns the number of inputs into this layer
    """
        return self.inputWidth

    def numberOfCells(self):
        """
    Returns the number of cells in this layer.
    @return (int) Number of cells
    """
        return self.cellCount

    def getActiveCells(self):
        """
    Returns the indices of the active cells.
    @return (list) Indices of active cells.
    """
        return self.activeCells

    def numberOfConnectedProximalSynapses(self, cells=None):
        """
    Returns the number of proximal connected synapses on these cells.

    Parameters:
    ----------------------------
    @param  cells (iterable)
            Indices of the cells. If None return count for all cells.
    """
        if cells is None:
            cells = xrange(self.numberOfCells())

        return _countWhereGreaterEqualInRows(self.proximalPermanences, cells,
                                             self.connectedPermanenceProximal)

    def numberOfProximalSynapses(self, cells=None):
        """
    Returns the number of proximal synapses with permanence>0 on these cells.

    Parameters:
    ----------------------------
    @param  cells (iterable)
            Indices of the cells. If None return count for all cells.
    """
        if cells is None:
            cells = xrange(self.numberOfCells())

        n = 0
        for cell in cells:
            n += self.proximalPermanences.nNonZerosOnRow(cell)
        return n

    def numberOfDistalSegments(self, cells=None):
        """
    Returns the total number of distal segments for these cells.

    A segment "exists" if its row in the matrix has any permanence values > 0.

    Parameters:
    ----------------------------
    @param  cells (iterable)
            Indices of the cells
    """
        if cells is None:
            cells = xrange(self.numberOfCells())

        n = 0

        for cell in cells:
            if self.internalDistalPermanences.nNonZerosOnRow(cell) > 0:
                n += 1

            for permanences in self.distalPermanences:
                if permanences.nNonZerosOnRow(cell) > 0:
                    n += 1

        return n

    def numberOfConnectedDistalSynapses(self, cells=None):
        """
    Returns the number of connected distal synapses on these cells.

    Parameters:
    ----------------------------
    @param  cells (iterable)
            Indices of the cells. If None return count for all cells.
    """
        if cells is None:
            cells = xrange(self.numberOfCells())

        n = _countWhereGreaterEqualInRows(self.internalDistalPermanences,
                                          cells,
                                          self.connectedPermanenceDistal)

        for permanences in self.distalPermanences:
            n += _countWhereGreaterEqualInRows(permanences, cells,
                                               self.connectedPermanenceDistal)

        return n

    def numberOfDistalSynapses(self, cells=None):
        """
    Returns the total number of distal synapses for these cells.

    Parameters:
    ----------------------------
    @param  cells (iterable)
            Indices of the cells
    """
        if cells is None:
            cells = xrange(self.numberOfCells())
        n = 0
        for cell in cells:
            n += self.internalDistalPermanences.nNonZerosOnRow(cell)

            for permanences in self.distalPermanences:
                n += permanences.nNonZerosOnRow(cell)
        return n

    def reset(self):
        """
    Reset internal states. When learning this signifies we are to learn a
    unique new object.
    """
        self.activeCells = numpy.empty(0, dtype="uint32")

    def getUseInertia(self):
        """
    Get whether we actually use inertia  (i.e. a fraction of the
    previously active cells remain active at the next time step unless
    inhibited by cells with both feedforward and lateral support).
    @return (Bool) Whether inertia is used.
    """
        return self.useInertia

    def setUseInertia(self, useInertia):
        """
    Sets whether we actually use inertia (i.e. a fraction of the
    previously active cells remain active at the next time step unless
    inhibited by cells with both feedforward and lateral support).
    @param useInertia (Bool) Whether inertia is used.
    """
        self.useInertia = useInertia

    @staticmethod
    def _learn(  # mutated args
            permanences,
            rng,

            # activity
            activeCells,
            activeInput,
            growthCandidateInput,

            # configuration
            sampleSize,
            initialPermanence,
            permanenceIncrement,
            permanenceDecrement,
            connectedPermanence):
        """
    For each active cell, reinforce active synapses, punish inactive synapses,
    and grow new synapses to a subset of the active input bits that the cell
    isn't already connected to.

    Parameters:
    ----------------------------
    @param  permanences (SparseMatrix)
            Matrix of permanences, with cells as rows and inputs as columns

    @param  rng (Random)
            Random number generator

    @param  activeCells (sorted sequence)
            Sorted list of the cells that are learning

    @param  activeInput (sorted sequence)
            Sorted list of active bits in the input

    @param  growthCandidateInput (sorted sequence)
            Sorted list of active bits in the input that the activeCells may
            grow new synapses to

    For remaining parameters, see the __init__ docstring.
    """

        permanences.incrementNonZerosOnOuter(activeCells, activeInput,
                                             permanenceIncrement)
        permanences.incrementNonZerosOnRowsExcludingCols(
            activeCells, activeInput, -permanenceDecrement)
        permanences.clipRowsBelowAndAbove(activeCells, 0.0, 1.0)
        if sampleSize == -1:
            permanences.setZerosOnOuter(activeCells, activeInput,
                                        initialPermanence)
        else:
            existingSynapseCounts = permanences.nNonZerosPerRowOnCols(
                activeCells, activeInput)

            maxNewByCell = numpy.empty(len(activeCells), dtype="int32")
            numpy.subtract(sampleSize, existingSynapseCounts, out=maxNewByCell)

            permanences.setRandomZerosOnOuter(activeCells,
                                              growthCandidateInput,
                                              maxNewByCell, initialPermanence,
                                              rng)
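
A hypothetical driver sketch for the pooler above; parameter values are illustrative, and the module-level helpers it relies on (SparseMatrix, Random, _sampleRange, _countWhereGreaterEqualInRows) are assumed to be in scope as in the original module:

pooler = ColumnPooler(inputWidth=1024, cellCount=4096, sdrSize=40)

# Learn one "object": present the same feedforward SDR for a few timesteps.
feedforward = list(range(40))              # sorted indices of active input bits
for _ in range(3):
    pooler.compute(feedforwardInput=feedforward, learn=True)
objectSdr = set(pooler.getActiveCells())   # the 40-cell object representation

# Infer with learning off; the same input should reactivate that SDR.
pooler.reset()
pooler.compute(feedforwardInput=feedforward, learn=False)
assert set(pooler.getActiveCells()) == objectSdr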
Code Example #10
File: sdrcategory.py  Project: trung-duc/mac-nupic
class SDRCategoryEncoder(Encoder):
  """Encodes a list of discrete categories (described by strings), that aren't
  related to each other.        # problem_with_this_approach: a lot of time categories, or generally,
                                # values are related to each other, which lays some common base for
                                # learning.

  Each encoding is an SDR in which w out of n bits are turned on.

  Unknown categories are encoded as a single value.

  Internally we use a ScalarEncoder with a radius of 1, but since we only encode
  integers, we never get mixture outputs.

  The SDRCategoryEncoder uses a different method to encode categories""" # problem_with_this_code: this is just a bad plain copy
                                        # problem_with_this_approach: this kind of encoding cannot capture the nuance of objects,
                                        # since the encoded representation is completely random and does not share any meaning
                                        # with the actual element (e.g. under this encoding scheme, a lion is as similar to a
                                        # snake as it is to a tiger), and each bit in this sdr category does not have any
                                        # meaning.


  ############################################################################
  def __init__(self, n, w, categoryList = None, name="category", verbosity=0,
               encoderSeed=1, forced=False):    # to_note: this encoder encodes elements depending on the order of elements in category list
    """
    n is  total bits in output
    w is the number of bits that are turned on for each rep
    categoryList is a list of strings that define the categories.
    If "none" then categories will automatically be added as they are encountered.
    forced (default False) : if True, skip checks for parameters' settings; see encoders/scalar.py for details
    """

    self.n = n
    self.w = w

    self._learningEnabled = True

    # initialize the random number generators
    self._seed(encoderSeed)

    if not forced:
      # -- this is just to catch bad parameter choices
      if (self.n/self.w) < 2: # w is 50% of total len
        raise ValueError("Number of ON bits in SDR (%d) must be much smaller than "
                           "the output width (%d)" % (self.w, self.n))

      # Another arbitrary cutoff to catch likely mistakes
      if self.w < 21:
        raise ValueError("Number of bits in the SDR (%d) must be greater than 2, and should be >= 21, pass forced=True to init() to override this check"
                           % self.w)

    self._initOverlap()

    self.verbosity = verbosity

    self.description = [(name, 0)]
    self.name = name

    self.categoryToIndex = dict()
    self.ncategories = 0
    self.categories = list()    # problem_with_this_code: might need a more efficient way than a python list to store categories. That efficient way must
                                # facilitate search. Otherwise, with a category of 10000 items, it would be extremely slow using a python list.
    self.sdrs = None          # to_note: matrix storing the representation of all categories

    # Always include an 'unknown' category for
    # edge cases

    self._addCategory("<UNKNOWN>")
    if categoryList is None:
      self._learningEnabled = True
    else:
      self._learningEnabled = False
      for category in categoryList:
        self._addCategory(category)
      assert self.ncategories == len(categoryList) + 1

    # Not used by this class. Used for decoding (scalarsToStr())
    self.encoders = None

    # This matrix is used for the topDownCompute. We build it the first time
    #  topDownCompute is called
    self._topDownMappingM = None
    self._topDownValues = None


  def _initOverlap(self):
    # Calculate average overlap of SDRs for decoding
    # Density is fraction of bits on, and it is also the
    # probability that any individual bit is on.
    density = float(self.w) / self.n
    self.averageOverlap =  self.w * density           # answer_needed: why is the average overlap equal to the probability that a bit
                                                      # will be ON multiplied by the number of ON bits?
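                                                      # (answer: each of the w ON bits of one random SDR is also ON in a
                                                      # second, independent SDR with probability w/n = density, so the
                                                      # expected overlap of two such SDRs is w * density = w**2 / n.)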
    # We can do a better job of calculating the threshold. For now, just
    # something quick and dirty, which is the midway point between average
    # and full overlap. averageOverlap is always < w,  so the threshold
    # is always < w.
    self.thresholdOverlap =  int((self.averageOverlap + self.w)/2)          # to_note: thresholdOverlap is used to eliminate representational similarities
                                                                            # that happen by chance
    #  1.25 -- too sensitive for decode test, so make it less sensitive
    if self.thresholdOverlap < self.w - 3:
      self.thresholdOverlap = self.w - 3


  def __setstate__(self, state):
    self.__dict__.update(state)

    # Initialize self.random as an instance of NupicRandom derived from the
    # previous numpy random state
    randomState = state["random"]
    if isinstance(randomState, numpy.random.mtrand.RandomState):
      self.random = NupicRandom(randomState.randint(sys.maxint))      # problem_with_this_code: sys is not imported


  def _seed(self, seed=-1):
    """
    Initialize the random seed
    """
    if seed != -1:
      self.random = NupicRandom(seed)
    else:
      self.random = NupicRandom()


  ############################################################################
  def getDecoderOutputFieldTypes(self):
    """ [Encoder class virtual method override]
    """
    # TODO: change back to string meta-type after the decoding logic is fixed
    #       to output strings instead of internal index values.
    return (FieldMetaType.string,)
    #return (FieldMetaType.integer,)


  ############################################################################
  def _addCategory(self, category):
    if category in self.categories:
      raise RuntimeError("Attempt to add encoder category '%s' "
                         "that already exists" % category)

    if self.sdrs is None:
      assert self.ncategories == 0
      assert len(self.categoryToIndex) == 0
      # Initial allocation -- 16 rows
      self.sdrs = numpy.zeros((16, self.n), dtype='uint8')
    elif self.ncategories > self.sdrs.shape[0] - 2:       # to_note: .shape returns (row x column)
      # Preallocated sdrs are used up. Double our size
      currentMax = self.sdrs.shape[0]
      newsdrs = numpy.zeros((currentMax * 2, self.n), dtype='uint8')
      newsdrs[0:currentMax] = self.sdrs[0:currentMax]
      self.sdrs = newsdrs           # problem_with_this_code: is there any way to optimize this elif clause (making it faster)

    newrep = self._newRep()
    self.sdrs[self.ncategories] = newrep
    self.categories.append(category)
    self.categoryToIndex[category] = self.ncategories
    self.ncategories += 1
    self._topDownMappingM = None


  ############################################################################
  def _newRep(self):          # problem_with_this_approach: the encoding scheme is completely random. To be more effective, there should be a feedback loop from
                              # learning to teach the encoder how it should encode information. For example, suppose that a machine learned something about 4-leg
                              # animals; it could then create a blueprint of 4-leg animals, so that whenever another unknown animal has a visual pattern similar to
                              # the 4-leg animals, the encoder would encode that new animal more meaningfully (sharing some bits with other known 4-leg animals).
                              # Having this ability would make learning more effective and might yield better results.
    """Generate a new and unique representation. Returns a numpy array
    of shape (n,). """
    maxAttempts = 1000

    for _ in xrange(maxAttempts):
      foundUnique = True
      population = numpy.arange(self.n, dtype=numpy.uint32)
      # Draw w distinct bit positions; sample() takes the number of choices
      # to draw, so pass self.w rather than an array of that length.
      oneBits = sorted(self.random.sample(population, self.w))
      sdr = numpy.zeros(self.n, dtype='uint8')
      sdr[oneBits] = 1
      for i in xrange(self.ncategories):
        if (sdr == self.sdrs[i]).all():
          foundUnique = False
          break
      if foundUnique:
        break
    if not foundUnique:
      raise RuntimeError("Error, could not find unique pattern %d after "
                         "%d attempts" % (self.ncategories, maxAttempts))
    return sdr
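
    # Note on the retry loop above: collisions are vanishingly rare at
    # realistic sizes. There are C(n, w) possible SDRs -- with the
    # illustrative values n=100, w=21 that is roughly 2e21 patterns -- so
    # the loop almost always succeeds on its first attempt. A quick sanity
    # check (assumes scipy is available; this module does not require it):
    #
    #   from scipy.special import comb
    #   comb(100, 21)   # ~2.04e21 distinct patterns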


  ############################################################################
  def getWidth(self):
    return self.n

  ############################################################################
  def getDescription(self):
    return self.description

  ############################################################################
  def getScalars(self, input):
    """ See method description in base.py """
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      return numpy.array([0])

    index = self.categoryToIndex.get(input, None)         # to_note: get the scalar index from the categoryToIndex dictionary
    if index is None:
      if self._learningEnabled:
        self._addCategory(input)
        index = self.ncategories - 1
      else:
        # if not found, we encode category 0
        index = 0

    return numpy.array([index])
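
    # Usage sketch (hypothetical; assumes a freshly constructed encoder with
    # learning enabled and index 0 reserved for unknown input, per the
    # fallback in the else branch above):
    #
    #   enc = SDRCategoryEncoder(n=100, w=21)
    #   enc.getScalars("cat")   # unseen -> added, returns array([1])
    #   enc.getScalars("dog")   # unseen -> added, returns array([2])
    #   enc.getScalars("cat")   # already known, returns array([1]) again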

  ############################################################################
  def getBucketIndices(self, input):
    """ See method description in base.py """

    # For category encoder, the "scalar" we map to each category is the
    #  bucket index
    return self.getScalars(input)       # to_note: this method performs the same function as getScalars

  ############################################################################
  def encodeIntoArray(self, input, output):
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      output[0:self.n] = 0
      index = 0
    else:
      index = self.getBucketIndices(input)[0]
      output[0:self.n] = self.sdrs[index,:]

    if self.verbosity >= 2:
      print "input:", input, "index:", index, "output:", output
      print "decoded:", self.decodedToStr(self.decode(output))


  ############################################################################
  def decode(self, encoded, parentFieldName=''):                    # to_note: decoding returns every category whose stored SDR
                                                                    # overlaps the encoded input above the threshold
    """ See the function description in base.py
    """

    assert (encoded[0:self.n] <= 1.0).all()

    resultString =  ""
    resultRanges = []

    overlaps =  (self.sdrs * encoded[0:self.n]).sum(axis=1)         # to_note: computes the overlap between encoded and every
                                                                    # stored representation in self.sdrs

    if self.verbosity >= 2:
      print "Overlaps for decoding:"
      for i in xrange(0, self.ncategories):
        print "%d %s" % (overlaps[i], self.categories[i])

    matchingCategories =  (overlaps > self.thresholdOverlap).nonzero()[0]     # to_note: (overlaps > self.thresholdOverlap) returns a boolean array
                                                                              # of len(overlaps), one element per stored category
                                                                              # to_note: .nonzero()[0] returns the indices of the True elements,
                                                                              # not their values
                                                                              # to_note: thresholdOverlap eliminates overlaps that have a high
                                                                              # chance of being random. The larger its value, the more selective
                                                                              # the decoding scheme, and the more susceptible it is to noise.
    for index in matchingCategories:
      if resultString != "":
        resultString += " "
      resultString +=  str(self.categories[index])
      resultRanges.append([int(index),int(index)])

    if parentFieldName != '':
      fieldName = "%s.%s" % (parentFieldName, self.name)
    else:
      fieldName = self.name
    return ({fieldName: (resultRanges, resultString)}, [fieldName])
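
    # Worked example of the overlap test above (illustrative values
    # n=8, w=4, thresholdOverlap=2, and two stored SDRs):
    #
    #   sdrs    = numpy.array([[1, 1, 1, 1, 0, 0, 0, 0],
    #                          [0, 0, 1, 1, 1, 1, 0, 0]], dtype='uint8')
    #   encoded = numpy.array( [1, 1, 1, 1, 0, 0, 0, 0])
    #   (sdrs * encoded).sum(axis=1)   # -> array([4, 2])
    #
    # Only row 0 exceeds the threshold of 2, so only category 0 is decoded.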


  ############################################################################
  def _getTopDownMapping(self):
    """ Return the interal _topDownMappingM matrix used for handling the
    bucketInfo() and topDownCompute() methods. This is a matrix, one row per
    category (bucket) where each row contains the encoded output for that
    category.
    """

    # -------------------------------------------------------------------------
    # Do we need to build up our reverse mapping table?
    if self._topDownMappingM is None:

      # Each row represents an encoded output pattern
      self._topDownMappingM = SM32(self.ncategories, self.n)

      outputSpace = numpy.zeros(self.n, dtype=GetNTAReal())
      for i in xrange(self.ncategories):
        self.encodeIntoArray(self.categories[i], outputSpace)
        self._topDownMappingM.setRowFromDense(i, outputSpace)

    return self._topDownMappingM


  ############################################################################
  def getBucketValues(self):
    """ See the function description in base.py """

    return self.categories


  ############################################################################
  def getBucketInfo(self, buckets):         # to_note: takes a list of bucket indices (row indices into the top-down
                                            # mapping) and returns the corresponding category information
    """ See the function description in base.py
    """
    if self.ncategories == 0:
      return 0

    topDownMappingM = self._getTopDownMapping()

    categoryIndex = buckets[0]
    category = self.categories[categoryIndex]
    encoding = topDownMappingM.getRow(categoryIndex)

    return [EncoderResult(value=category, scalar=categoryIndex,
                          encoding=encoding)]

  ############################################################################
  def topDownCompute(self, encoded):
    """ See the function description in base.py
    """

    if self.ncategories == 0:
      return 0

    topDownMappingM = self._getTopDownMapping()                                       # to_note: builds the matrix of stored representations
                                                                                      # to_note: matrix.rightVecProd(vector) computes the
                                                                                      # matrix-vector product M * v
    categoryIndex = topDownMappingM.rightVecProd(encoded).argmax()                    # to_note: argmax() returns the index of the largest element
    category = self.categories[categoryIndex]
    encoding = topDownMappingM.getRow(categoryIndex)

    return EncoderResult(value=category, scalar=categoryIndex, encoding=encoding)
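
    # The sparse-matrix call above is equivalent to this dense numpy sketch
    # (illustration only; SM32 is used for memory efficiency):
    #
    #   M = <dense top-down mapping, shape (ncategories, n)>
    #   categoryIndex = numpy.dot(M, encoded).argmax()
    #
    # i.e. pick the stored category whose SDR has the largest dot product
    # (overlap) with the encoded input.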

  ############################################################################
  def closenessScores(self, expValues, actValues, fractional=True):
    """ See the function description in base.py

    kwargs will have the keyword "fractional", which is ignored by this encoder
    """

    expValue = expValues[0]
    actValue = actValues[0]

    if expValue == actValue:
      closeness = 1.0
    else:
      closeness = 0.0

    if not fractional:
      closeness = 1.0 - closeness

    return numpy.array([closeness])
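
    # Example: closenessScores(["cat"], ["cat"]) -> array([1.0]);
    #          closenessScores(["cat"], ["dog"]) -> array([0.0]).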


  @classmethod
  def read(cls, proto):
    encoder = object.__new__(cls)

    encoder.n = proto.n
    encoder.w = proto.w
    encoder.random = NupicRandom()
    encoder.random.read(proto.random)
    encoder.verbosity = proto.verbosity
    encoder.name = proto.name
    encoder.description = [(proto.name, 0)]
    encoder.categories = list(proto.categories)
    encoder.sdrs = numpy.array(proto.sdrs, dtype=numpy.uint8)

    encoder.categoryToIndex = {category:index
                               for index, category
                               in enumerate(encoder.categories)}
    encoder.ncategories = len(encoder.categories)
    encoder._learningEnabled = False
    encoder._initOverlap()

    return encoder


  def write(self, proto):
    proto.n = self.n
    proto.w = self.w
    self.random.write(proto.random)
    proto.verbosity = self.verbosity
    proto.name = self.name
    proto.categories = self.categories
    proto.sdrs = self.sdrs.tolist()
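

# ----------------------------------------------------------------------------
# Minimal usage sketch (not part of the original file). It assumes the
# module-level imports above (numpy, NupicRandom, SM32, ...) and that the
# constructor reserves category index 0 for unknown input.
if __name__ == "__main__":
  encoder = SDRCategoryEncoder(n=100, w=21, name="animal")
  output = numpy.zeros(encoder.getWidth(), dtype='uint8')

  # First encounter adds "cat" and assigns it a fresh random SDR.
  encoder.encodeIntoArray("cat", output)
  assert output.sum() == 21   # exactly w bits are ON

  # Decoding the same SDR should recover the category name,
  # e.g. ({'animal': ([[1, 1]], 'cat')}, ['animal']).
  print encoder.decode(output)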