  def testWordFingerprint(self):
    """Test the Cortical.io term (word-level) encoding."""

    cio = CioEncoder(fingerprintType=EncoderTypes.word)
    response = cio.encode(self.text)

    self.assertFingerprintFields(response)

    encodingDict = getTestData("cio_encoding_word.json")

    self.assertEqual(encodingDict["fingerprint"]["positions"],
                     response["fingerprint"]["positions"],
                     "Cio bitmap is not as expected.")
  def testRetinaScaling(self):
    """Test the CioEncoder for retina dimension scaling."""

    cio = CioEncoder(
      retinaScaling = 1.0, fingerprintType=EncoderTypes.document)
    cioScaled = CioEncoder(
      retinaScaling = 0.5, fingerprintType=EncoderTypes.document)
    cioScaled2 = CioEncoder(
      retinaScaling = 0.71, fingerprintType=EncoderTypes.document)

    self.assertAlmostEqual(int(0.5*cio.width), cioScaled.width)
    self.assertAlmostEqual(int(0.5*cio.height), cioScaled.height)
    self.assertAlmostEqual(int(0.71*cio.height), cioScaled2.height)

    response = cio.encode(self.text)
    responseScaled = cioScaled.encode(self.text)
    responseScaled2 = cioScaled2.encode(self.text)

    # Each bit position should be scaled down by retinaScaling*retinaScaling
    self.assertLessEqual(responseScaled["fingerprint"]["positions"].sum(),
                         0.5*0.5*response["fingerprint"]["positions"].sum())

    self.assertLessEqual(responseScaled2["fingerprint"]["positions"].sum(),
                         0.71*0.71*response["fingerprint"]["positions"].sum())

    # The number of on bits in scaled retina should normally be slightly less
    # than the original, but can be equal in some cases
    self.assertLessEqual(len(responseScaled["fingerprint"]["positions"]),
                         len(response["fingerprint"]["positions"]))
    self.assertLessEqual(len(responseScaled["fingerprint"]["positions"]),
                         len(responseScaled2["fingerprint"]["positions"]))

    # Check that encodeIntoArray works even with weird scaling
    a = numpy.zeros(cioScaled2.width*cioScaled2.height)
    cioScaled2.encodeIntoArray(self.text, a)
    self.assertEqual(len(responseScaled2["fingerprint"]["positions"]),
                     a.sum())
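

# The assertions above treat each ON-bit index as shrinking roughly with
# retinaScaling**2: an index in a width x height retina decomposes into
# (row, col), both coordinates scale by retinaScaling, and the index is
# rebuilt against the scaled width. This is only a sketch of the assumed
# mapping; the real CioEncoder may round, clip, or deduplicate differently.
def scalePosition(position, width, retinaScaling):
  """Map a bit index from the full retina onto a scaled-down retina (sketch)."""
  row, col = divmod(position, width)
  scaledWidth = int(retinaScaling * width)
  return int(retinaScaling * row) * scaledWidth + int(retinaScaling * col)

# For example, in a 128x128 retina, position 10000 is (row 78, col 16); with
# retinaScaling=0.5 it maps to (row 39, col 8) in a 64x64 retina, i.e. index
# 2504, close to 0.25 * 10000.
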
  def testDocumentFingerprint(self):
    """Test the Cortical.io text (document-level) encoding."""

    cio = CioEncoder(fingerprintType=EncoderTypes.document)
    response = cio.encode(self.text)

    self.assertFingerprintFields(response)

    encodingDict = getTestData("cio_encoding_document.json")

    self.assertEqual(encodingDict["fingerprint"]["positions"],
                     response["fingerprint"]["positions"],
                     "Cio bitmap is not as expected.")
  def testRetinaScaling(self):
    """Test the CioEncoder for retina dimension scaling."""

    cio = CioEncoder(
      retinaScaling = 0.25, fingerprintType=EncoderTypes.document)
    response = cio.encode(self.text)

    encodingDict = getTestData("cio_encoding_scaled_retina.json")

    self.assertEqual(encodingDict["fingerprint"]["positions"],
      response["fingerprint"]["positions"], "Cio bitmap is not as expected.")

    fullRetinaEncodingDict = getTestData("cio_encoding_document.json")
    fullLength = len(fullRetinaEncodingDict["fingerprint"]["positions"])
    responseLength = len(response["fingerprint"]["positions"])

    self.assertTrue(responseLength <= fullLength,
      "Retina scaling did not decrease the fingerprint size.")
  def testMaxSparsity(self):
    """Test that CioEncoder's maxSparsity works."""

    # This text seems to generate bitmaps with about 8% sparsity
    text = ("Smoking harms nearly every organ in your body. Over 7000 chemicals"
            " have been identified in tobacco smoke. After reading all this"
            " James and Sue decided to abruptly quit cigarette smoking to"
            " improve their health but it clearly was not an easy decision.")

    # Encoders with maxSparsity of 100%, 10%, 5%, and 1%
    cio100 = CioEncoder(maxSparsity=1.0, fingerprintType=EncoderTypes.document)
    cio10 = CioEncoder(maxSparsity=0.1, fingerprintType=EncoderTypes.document)
    cio5 = CioEncoder(maxSparsity=0.05, fingerprintType=EncoderTypes.document)
    cio1 = CioEncoder(maxSparsity=0.01, fingerprintType=EncoderTypes.document)

    bitmapSize = cio100.width*cio100.height
    r100 = cio100.encode(text)
    r10 = cio10.encode(text)
    r5 = cio5.encode(text)
    r1 = cio1.encode(text)

    length100 = len(r100["fingerprint"]["positions"])
    length10 = len(r10["fingerprint"]["positions"])
    length5 = len(r5["fingerprint"]["positions"])
    length1 = len(r1["fingerprint"]["positions"])

    # Encodings must have no more than desired sparsity
    self.assertLessEqual(r100["sparsity"], 1.0)
    self.assertLessEqual(r10["sparsity"], 0.1)
    self.assertLessEqual(r5["sparsity"], 0.05)
    self.assertLessEqual(r1["sparsity"], 0.01)

    self.assertLessEqual(length100, bitmapSize)
    self.assertLessEqual(length10, 0.1*bitmapSize)
    self.assertLessEqual(length5, 0.05*bitmapSize)
    self.assertLessEqual(length1, 0.01*bitmapSize)

    # Encodings can't be zero
    self.assertGreater(length100, 0)
    self.assertGreater(length10, 0)
    self.assertGreater(length5, 0)
    self.assertGreater(length1, 0)

    # Encodings must have complete overlap with the next higher encoding
    s100 = set(r100["fingerprint"]["positions"])
    s10 = set(r10["fingerprint"]["positions"])
    s5 = set(r5["fingerprint"]["positions"])
    s1 = set(r1["fingerprint"]["positions"])
    self.assertEqual(len(s100 & s10), length10)
    self.assertEqual(len(s10 & s5), length5)
    self.assertEqual(len(s5 & s1), length1)

    # Test that if you encode a second time, you get the same bitmap
    r100_2 = cio100.encode(text)
    r10_2 = cio10.encode(text)
    r5_2 = cio5.encode(text)
    r1_2 = cio1.encode(text)

    self.assertEqual(hashlib.sha224(str(r100)).hexdigest(),
                      hashlib.sha224(str(r100_2)).hexdigest())
    self.assertEqual(hashlib.sha224(str(r10)).hexdigest(),
                      hashlib.sha224(str(r10_2)).hexdigest())
    self.assertEqual(hashlib.sha224(str(r5)).hexdigest(),
                      hashlib.sha224(str(r5_2)).hexdigest())
    self.assertEqual(hashlib.sha224(str(r1)).hexdigest(),
                      hashlib.sha224(str(r1_2)).hexdigest())
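

# The nesting and repeatability asserted above (the 1% bitmap is a subset of
# the 5% bitmap, and re-encoding the same text returns the same bits) would
# follow from a deterministic subsample that always keeps a prefix of the same
# ordering of the candidate bits. A minimal sketch of that idea, assuming a
# fixed seed; this is not necessarily how CioEncoder itself enforces
# maxSparsity.
import numpy


def subsampleBitmap(positions, bitmapSize, maxSparsity, seed=42):
  """Keep at most maxSparsity * bitmapSize bits, deterministically (sketch)."""
  maxBits = int(maxSparsity * bitmapSize)
  if len(positions) <= maxBits:
    return numpy.sort(positions)
  # A fixed seed yields the same permutation every time, so repeated calls
  # return the same bits, and a smaller maxBits selects a subset of a larger
  # one.
  shuffled = numpy.random.RandomState(seed).permutation(positions)
  return numpy.sort(shuffled[:maxBits])
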
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):

    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      fp = {"text":sample,
            "sparsity":float(self.encoder.w)/self.encoder.n,
            "bitmap":self.encodeRandomly(
              sample, self.encoder.n, self.encoder.w)}

    return fp


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.
    """
    bitmap = self.patterns[i]["pattern"]["bitmap"]
    count = 0
    if bitmap.any():
      for count, label in enumerate(self.patterns[i]["labels"]):
        self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
        self.sampleReference.append(self.patterns[i]["ID"])
      count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. The random seed is used in getWinningLabels().

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    (_, inferenceResult, _, _) = self.classifier.infer(self.sparsifyPattern(
      self.patterns[i]["pattern"]["bitmap"], self.encoder.n))
    return self.getWinningLabels(inferenceResult, seed)
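

# Hypothetical usage of the model above; in practice the experiment runner
# populates model.patterns from the survey data and drives these calls. The
# tokens and labels below are made-up example data, and a valid Cortical.io
# API key is assumed to be available to CioEncoder.
model = ClassificationModelFingerprint(numLabels=3,
                                       fingerprintType=EncoderTypes.document)
tokens = ["the", "service", "was", "excellent"]
pattern = model.encodeSample(tokens)            # dict with text, sparsity, bitmap
model.patterns = [{"pattern": pattern, "labels": [0], "ID": 0}]
model.trainModel(0)                             # learn a prototype for record 0
print(model.testModel(0))                       # numLabels most-frequent labels
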
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               k=1,
               classifierMetric="rawOverlap",
               cacheRoot=None,
               **kwargs):

    super(ClassificationModelFingerprint, self).__init__(**kwargs)

    self.classifier = KNNClassifier(k=k,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=self.verbosity-1)

    # Need a valid API key for the Cortical.io encoder (see CioEncoder
    # constructor for details).
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey,
                              cacheDir=cacheRoot)

    self.currentDocument = None


  def trainToken(self, token, labels, sampleId, reset=0):
    """
    Train the model with the given text token, associated labels, and
    sampleId.

    See base class for params and return type descriptions.
    """
    if self.currentDocument is None:
      # start of a new document
      self.currentDocument = [token]
    else:
      # accumulate text for this document
      self.currentDocument.append(token)

    if reset == 1:
      # all text accumulated, proceed w/ training on this document
      document = " ".join(self.currentDocument)
      bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

      if self.verbosity >= 2:
        print "CioFP model training with: '{}'".format(document)
        print "\tBitmap:", bitmap

      for label in labels:
        self.classifier.learn(
            bitmap, label, isSparse=self.encoder.n, partitionId=sampleId)

      self.currentDocument = None


  def inferToken(self, token, reset=0, returnDetailedResults=False,
                 sortResults=True):
    """
    Classify the token (i.e. run inference on the model with this document) and
    return classification results and (optionally) a list of sampleIds and
    distances.   Repeated sampleIds are NOT removed from the results.

    See base class for params and return type descriptions.
    """
    if self.currentDocument is None:
      # start of a new document
      self.currentDocument = [token]
    else:
      # accumulate text for this document
      self.currentDocument.append(token)

    if reset == 0:
      return numpy.zeros(self.numLabels), [], numpy.zeros(0)

    # With reset=1, all text accumulated, proceed w/ classifying this document
    document = " ".join(self.currentDocument)
    bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

    densePattern = self.encoder.densifyPattern(bitmap)

    (_, inferenceResult, dist, _) = self.classifier.infer(densePattern)

    if self.verbosity >= 2:
      print "CioFP model inference with: '{}'".format(document)
      print "\tBitmap:", bitmap
      print "\tInference result=", inferenceResult
      print "\tDistances=", dist

    self.currentDocument = None

    # Figure out format of returned results

    if not returnDetailedResults:
      # Return non-detailed results.
      return inferenceResult, None, None

    if not sortResults:
      idList = [self.classifier.getPartitionId(i) for i in xrange(len(dist))]
      return inferenceResult, idList, dist

    # Return sorted results
    sortedIndices = dist.argsort()
    idList = [self.classifier.getPartitionId(i) for i in sortedIndices]
    sortedDistances = dist[sortedIndices]
    return inferenceResult, idList, sortedDistances


  def getEncoder(self):
    """
    Returns the encoder instance for the model.
    """
    return self.encoder


  def getClassifier(self):
    """
    Returns the classifier instance for the model.
    """
    return self.classifier
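

# Hypothetical usage of the streaming API above: tokens are fed one at a time
# and the accumulated document is only encoded (and trained on or classified)
# when reset=1 arrives. Constructor keywords passed through **kwargs (e.g.
# numLabels, verbosity) are assumed to be handled by the ClassificationModel
# base class; the data below is made up.
model = ClassificationModelFingerprint(k=3, numLabels=3)
document = ["great", "customer", "support"]
for i, token in enumerate(document):
  reset = 1 if i == len(document) - 1 else 0
  model.trainToken(token, labels=[1], sampleId=0, reset=reset)
for i, token in enumerate(document):
  reset = 1 if i == len(document) - 1 else 0
  results = model.inferToken(token, reset=reset, returnDetailedResults=True)
# Once reset=1 is seen, results holds (inferenceResult, sampleIdList, distances).
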
class ClassificationModelEndpoint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text endpoint encodings and classification system.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelEndpoint",
               unionSparsity=0.20,
               cacheRoot=None):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.
    """
    super(ClassificationModelEndpoint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(cacheDir=os.path.join(cacheRoot, "CioCache"),
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity/100) * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param sample         (list)          Tokenized sample, where each item is
                                          a string
    @return fp            (dict)          The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      fp = {"text":sample,
            "sparsity":float(self.w)/self.n,
            "bitmap":self.encodeRandomly(sample, self.n, self.w)}

    return fp


  def resetModel(self):
    """Reset the model"""
    self.positives.clear()
    self.negatives.clear()
    self.categoryBitmaps.clear()


  def trainModel(self, i, negatives=None):
    # TODO: add batch training, where i is a list; note we should only add
    # negatives when training on one sample so we know which labels to use.
    """
    Train the classifier on the sample and labels for record i. Use
    Cortical.io's createClassification() to make a bitmap that represents the
    class. The list sampleReference is populated to correlate classifier
    prototypes to sample IDs.

    @param negatives  (list)            Each item is a dict containing the
                                        text, sparsity, and bitmap of a
                                        negative sample.
    """
    record = self.patterns[i]
    labelsToUpdateBitmaps = set()
    for label in record["labels"]:
      if record["pattern"]["text"] and record["pattern"]["bitmap"].any():
        self.positives[label].append(record["pattern"]["text"])
        if negatives:
          for neg in negatives:
            if neg["text"]:
              self.negatives[label].append(neg["text"])
        labelsToUpdateBitmaps.add(label)

    for label in labelsToUpdateBitmaps:
      self.categoryBitmaps[label] = self.encoder.createCategory(
        str(label), self.positives[label], self.negatives[label])["positions"]
      self.sampleReference.append(i)


  def testModel(self, i, _, metric="overlappingAll"):
    """
    Test on record i. The Cortical.io classifier returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param metric     (str)           Distance metric used by the classifier.
    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

    distances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.compareEncoder.compare(sampleBitmap, catBitmap)

    return self.getWinningLabels(distances, metric=metric)


  def getWinningLabels(self, distances, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
    metricValues = numpy.array([v[metric] for v in distances.values()])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return numpy.array(
      [distances.keys()[catIdx] for catIdx in sortedIdx[:self.numLabels]])


  def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
    """
    Return a dict where keys are categories and values are dicts of distances.

    @param sort      (bool)        Sort the inner dicts with compareCategories()
    @param save      (str)         Dump catDistances to a JSON in this dir.
    @return          (defaultdict)

    E.g. w/ categories 0 and 1:
      catDistances = {
          0: {
              0: {"cosineSimilarity": 1.0, ...},
              1: {"cosineSimilarity": 0.33, ...}
              },
          1: {
              0: {"cosineSimilarity": 0.33, ...},
              1: {"cosineSimilarity": 1.0, ...}
              }
    Note the inner-dicts of catDistances are OrderedDict objects.
    """
    catDistances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      catDistances[cat] = OrderedDict()
      for compareCat, compareBitmap in self.categoryBitmaps.iteritems():
        # List is in order of self.categoryBitmaps.keys()
        catDistances[cat][compareCat] = self.compareEncoder.compare(
          catBitmap, compareBitmap)

    if sort:
      # Order each inner dict of catDistances such that the ranking is most to
      # least similar.
      catDistances = self.compareCategories(catDistances)

    if save is not None:
      self.writeOutCategories(
        save, comparisons=catDistances, labelRefs=labelRefs)

    return catDistances


  @staticmethod
  def compareCategories(catDistances, metric="overlappingAll"):
    """
    Calculate category distances. Returns a defaultdict of category keys, where
    values are OrderedDicts sorted such that the most similar categories
    (according to the input metric) are listed first.
    """
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")

    categoryComparisons = defaultdict(list)
    for k, v in catDistances.iteritems():
      # Create a dict for this category
      metricDict = {compareCat: distances[metric]
                    for compareCat, distances in v.iteritems()}
      # Sort the dict by the metric
      reverse = metric in descendingOrder
      categoryComparisons[k] = OrderedDict(
        sorted(metricDict.items(), key=lambda item: item[1], reverse=reverse))

    return categoryComparisons


  @staticmethod
  def query():
    print "The Classification Endpoint model doesn't support this method."


  @staticmethod
  def infer():
    print "The Classification Endpoint model doesn't support this method."
class ClassificationModelContext(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text context encodings, ANDing the contexts to build category bitmaps.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    root = os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"))
    self.client = CorticalClient(self.encoder.apiKey)

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity / 100) * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels


  def encodePattern(self, pattern):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param pattern     (list)           Tokenized sample, where each item is a
                                        string
    @return            (dictionary)     Dictionary, containing text, sparsity,
                                        and bitmap
    Example return dict:
    {
      "text": "Example text",
      "sparsity": 0.0,
      "bitmap": numpy.zeros(0)
    }
    """
    text = " ".join(pattern)
    return {"text": text, "sparsity": 0.0, "bitmap": self._encodeText(text)}


  def _encodeText(self, text):
    fpInfo = self.encoder.encode(text)
    if fpInfo:
      if self.verbosity > 1:
        print "Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"])
      bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
    else:
      bitmap = self.encodeRandomly(text, self.n, self.w)

    return bitmap.astype(int)


  def resetModel(self):
    """Reset the model"""
    self.categoryBitmaps.clear()


  def trainModel(self, samples, labels):
    """
    Train the classifier on the input sample and label. Use Cortical.io's
    keyword extraction to get the most relevant terms then get the intersection
    of those bitmaps

    @param samples     (dictionary)      Dictionary, containing text, sparsity,
                                         and bitmap
    @param labels      (int)             Reference index for the classification
                                         of this sample.
    """
    for sample, sample_labels in zip(samples, labels):
      bitmaps = [sample["bitmap"].tolist()]
      context = self.client.getContextFromText(bitmaps, maxResults=5,
                                               getFingerprint=True)

      if len(context) != 0:
        union = numpy.zeros(0)
        for c in context:
          bitmap = c["fingerprint"]["positions"]
          union = numpy.union1d(bitmap, union).astype(int)

        for label in sample_labels:
          # Haven't seen the label before
          if label not in self.categoryBitmaps:
            self.categoryBitmaps[label] = union

          intersection = numpy.intersect1d(union, self.categoryBitmaps[label])
          if intersection.size == 0:
            # Don't want to lose all the old information
            union = numpy.union1d(union, self.categoryBitmaps[label]).astype(int)
            # Need to sample to stay sparse
            count = len(union)
            sampleIndices = random.sample(xrange(count), min(count, self.w))
            intersection = numpy.sort(union[sampleIndices])

          self.categoryBitmaps[label] = intersection


  def testModel(self, sample):
    """
    Test the intersection bitmap on the input sample. Returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param sample     (dictionary)      Dictionary, containing text, sparsity,
                                        and bitmap
    @return           (dictionary)      The distances between the sample and
                                        the classes
    Example return dict:
      {
        0: {
          "cosineSimilarity": 0.6666666666666666,
          "euclideanDistance": 0.3333333333333333,
          "jaccardDistance": 0.5,
          "overlappingAll": 6,
          "overlappingLeftRight": 0.6666666666666666,
          "overlappingRightLeft": 0.6666666666666666,
          "sizeLeft": 9,
          "sizeRight": 9,
          "weightedScoring": 0.4436476984102028
        }
      }
    """

    sampleBitmap = sample["bitmap"].tolist()

    distances = {}
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.client.compare(sampleBitmap, catBitmap.tolist())

    return self.winningLabels(distances, numberCats=self.numLabels,
                              metric="overlappingAll")


  @staticmethod
  def winningLabels(distances, numberCats, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
    metricValues = numpy.array([v[metric] for v in distances.values()])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = set(["overlappingAll", "overlappingLeftRight",
      "overlappingRightLeft", "cosineSimilarity", "weightedScoring"])
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return [distances.keys()[catIdx] for catIdx in sortedIdx[:numberCats]]
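

# A small worked example of the union/intersection logic in trainModel above,
# with made-up bitmaps: each training sample contributes the union of its
# context fingerprints, and the stored category bitmap narrows to the bits
# shared with every new sample. If the intersection would be empty, the code
# falls back to a subsampled union so the category bitmap is not wiped out.
import numpy

categoryBitmap = numpy.array([3, 7, 12, 20])    # category bits after one sample
newUnion = numpy.array([7, 12, 31, 40])         # union of contexts from a new sample
intersection = numpy.intersect1d(newUnion, categoryBitmap)
print(intersection)                             # [ 7 12] becomes the new category bitmap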