def testWindowEncodings(self):
    """Check the CioEncoder sliding-window encodings against stored data."""
    encoder = CioEncoder(fingerprintType=EncoderTypes.word)

    text = """
      I grok people. I am people, so now I can say it in people talk. I've found
      out why people laugh. They laugh because it hurts so much, because it's
      the only thing that'll make it stop hurting."""

    tokens = TextPreprocess().tokenize(text)
    windows = encoder.getWindowEncoding(tokens, minSparsity=0.19)

    # Only dense windows get encoded, so fewer windows than tokens come back.
    self.assertTrue(
        len(tokens) > len(windows),
        "Returned incorrect number of window encodings.")

    # The final window is compared field by field with the reference file.
    expected = getTestData("cio_encoding_window.json")
    lastWindow = windows[-1]
    self.assertEqual(
        expected["text"], lastWindow["text"],
        "Window encoding represents the wrong text.")
    self.assertTrue(
        lastWindow["sparsity"] <= encoder.unionSparsity,
        "Sparsity for large window is larger than the max.")
    self.assertSequenceEqual(
        expected["bitmap"], lastWindow["bitmap"].tolist(),
        "Window encoding's bitmap is not as expected.")
  def testWordFingerprint(self):
    """Check the Cortical.io term (word-level) encoding against stored data."""
    encoder = CioEncoder(fingerprintType=EncoderTypes.word)
    encoded = encoder.encode(self.text)

    # The response must carry the expected fingerprint fields.
    self.assertFingerprintFields(encoded)

    expected = getTestData("cio_encoding_word.json")
    expectedPositions = expected["fingerprint"]["positions"]
    actualPositions = encoded["fingerprint"]["positions"]
    self.assertEqual(
      expectedPositions, actualPositions, "Cio bitmap is not as expected.")
Example #3
0
  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key.

    @param verbosity  (int)   Forwarded to the superclass constructor.
    @param numLabels  (int)   Stored on the instance as ``numLabels``.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    # The encoder cache lives in a "CioCache" directory next to this file.
    root = os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"))
    self.client = CorticalClient(self.encoder.apiKey)

    self.n = self.encoder.n
    # targetSparsity is treated as a percentage of n. Dividing by 100.0 keeps
    # the ratio fractional even if targetSparsity is an int under Python 2,
    # where int / int would truncate to 0.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels
    def testDocumentFingerprint(self):
        """Check the Cortical.io text (document-level) encoding."""
        encoder = CioEncoder(fingerprintType=EncoderTypes.document)
        encoded = encoder.encode(self.text)

        # The response must carry the expected fingerprint fields.
        self.assertFingerprintFields(encoded)

        reference = getTestData("cio_encoding_document.json")
        wantedPositions = reference["fingerprint"]["positions"]
        gotPositions = encoded["fingerprint"]["positions"]

        self.assertEqual(wantedPositions, gotPositions,
                         "Cio bitmap is not as expected.")
  def __init__(self,
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               k=1,
               classifierMetric="rawOverlap",
               cacheRoot=None,
               **kwargs):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    @param fingerprintType  Either EncoderTypes.word or EncoderTypes.document.
    @param unionSparsity    Forwarded to CioEncoder.
    @param retinaScaling    Forwarded to CioEncoder.
    @param retina           Forwarded to CioEncoder.
    @param apiKey           Forwarded to CioEncoder.
    @param k                Forwarded to KNNClassifier as ``k``.
    @param classifierMetric Forwarded to KNNClassifier as ``distanceMethod``.
    @param cacheRoot        Forwarded to CioEncoder as ``cacheDir``.
    @param kwargs           Forwarded to the superclass constructor.

    @raise ValueError if fingerprintType is not one of the two supported
        EncoderTypes values.
    """
    super(ClassificationModelFingerprint, self).__init__(**kwargs)

    self.classifier = KNNClassifier(k=k,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=self.verbosity-1)

    # Need a valid API key for the Cortical.io encoder (see CioEncoder
    # constructor for details).
    # The previous check, `x is (not A or not B)`, compared the argument
    # against a bool and therefore never rejected anything; a membership test
    # is what was intended.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligble types.")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey,
                              cacheDir=cacheRoot)

    self.currentDocument = None
  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    @param verbosity        Passed to the superclass; the classifier receives
                            verbosity - 1.
    @param numLabels        Passed to the superclass and used as the
                            classifier's ``k``.
    @param modelDir         Passed to the superclass.
    @param fingerprintType  Either EncoderTypes.word or EncoderTypes.document.
    @param unionSparsity    Forwarded to CioEncoder.
    @param retinaScaling    Forwarded to CioEncoder.
    @param retina           Forwarded to CioEncoder.
    @param apiKey           Forwarded to CioEncoder.
    @param classifierMetric Forwarded to KNNClassifier as ``distanceMethod``.
    @param cacheRoot        Directory that holds the "CioCache" folder;
                            defaults to the directory of this file.

    @raise ValueError if fingerprintType is not one of the two supported
        EncoderTypes values.
    """
    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    # The previous check, `x is (not A or not B)`, compared the argument
    # against a bool and therefore never rejected anything; a membership test
    # is what was intended.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invaid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligble types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)
  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelWindow",
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Create the windowed model: a kNN classifier plus a word-level CioEncoder.
    A valid Cortical.io API key is needed (see CioEncoder init for details).
    """
    super(ClassificationModelWindows, self).__init__(
      modelDir=modelDir, numLabels=numLabels, verbosity=verbosity)

    # Window patterns whose sparsity is under this threshold are skipped.
    self.minSparsity = 0.9 * unionSparsity

    knnParams = dict(k=numLabels,
                     distanceMethod=classifierMetric,
                     exact=False,
                     verbosity=verbosity-1)
    self.classifier = KNNClassifier(**knnParams)

    # Fall back to this file's directory when no cache root is supplied.
    if not cacheRoot:
      cacheRoot = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(cacheRoot, "CioCache")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=cacheDir,
                              fingerprintType=EncoderTypes.word,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)
  def testRetinaScaling(self):
    """Check CioEncoder output when the retina dimensions are scaled down."""
    scaledEncoder = CioEncoder(
      fingerprintType=EncoderTypes.document, retinaScaling=0.25)
    encoded = scaledEncoder.encode(self.text)

    # The scaled fingerprint must match the stored scaled reference.
    scaledReference = getTestData("cio_encoding_scaled_retina.json")
    self.assertEqual(
      scaledReference["fingerprint"]["positions"],
      encoded["fingerprint"]["positions"],
      "Cio bitmap is not as expected.")

    # It must also be no longer than the full-retina reference fingerprint.
    fullReference = getTestData("cio_encoding_document.json")
    fullCount = len(fullReference["fingerprint"]["positions"])
    scaledCount = len(encoded["fingerprint"]["positions"])
    self.assertTrue(
      scaledCount <= fullCount,
      "Retina scaling did not decrease the fingerprint size.")
    def testRetinaScaling(self):
        """Exercise CioEncoder with a retina scaled to a quarter size."""
        encoderKwargs = dict(retinaScaling=0.25,
                             fingerprintType=EncoderTypes.document)
        result = CioEncoder(**encoderKwargs).encode(self.text)

        # Exact match against the reference encoding for the scaled retina.
        wanted = getTestData("cio_encoding_scaled_retina.json")
        self.assertEqual(
            wanted["fingerprint"]["positions"],
            result["fingerprint"]["positions"],
            "Cio bitmap is not as expected.")

        # The scaled fingerprint may not have more positions than the
        # unscaled document reference.
        unscaled = getTestData("cio_encoding_document.json")
        unscaledSize = len(unscaled["fingerprint"]["positions"])
        scaledSize = len(result["fingerprint"]["positions"])

        self.assertTrue(
            scaledSize <= unscaledSize,
            "Retina scaling did not decrease the fingerprint size.")
    def initModel(self):
        """
        Build the network around a freshly constructed CioEncoder.

        No record stream is supplied here; None is handed to configureNetwork
        in its place.

        @return   Whatever configureNetwork returns for this configuration.
        """
        encoderParams = dict(retinaScaling=self.retinaScaling,
                             retina=self.retina,
                             apiKey=self.apiKey,
                             maxSparsity=self.maxSparsity,
                             verbosity=self.verbosity - 1)
        sensorEncoder = CioEncoder(**encoderParams)

        # The encoder determines the LanguageSensor output width.
        return configureNetwork(None, self.networkConfig, sensorEncoder)
Example #11
0
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelEndpoint",
                 unionSparsity=0.20):
        """
        Initializes the encoder as CioEncoder; requires a valid API key.

        @param verbosity      Passed to the superclass constructor.
        @param numLabels      Passed to the superclass constructor.
        @param modelDir       Passed to the superclass constructor.
        @param unionSparsity  Forwarded to CioEncoder.
        """
        super(ClassificationModelEndpoint, self).__init__(verbosity=verbosity,
                                                          numLabels=numLabels,
                                                          modelDir=modelDir)

        # The encoder cache lives in a "CioCache" directory next to this file.
        root = os.path.dirname(os.path.realpath(__file__))
        self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"),
                                  unionSparsity=unionSparsity)
        self.compareEncoder = LanguageEncoder()

        self.n = self.encoder.n
        # targetSparsity is treated as a percentage of n. Dividing by 100.0
        # keeps the ratio fractional even if targetSparsity is an int under
        # Python 2, where int / int would truncate to 0.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)
Example #12
0
    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap",
                 cacheRoot=None):
        """
        Set up the kNN classifier and the Cortical.io encoder.

        @param verbosity        Passed to the superclass; the classifier
                                receives verbosity - 1.
        @param numLabels        Passed to the superclass and used as the
                                classifier's ``k``.
        @param modelDir         Passed to the superclass.
        @param fingerprintType  Either EncoderTypes.word or
                                EncoderTypes.document.
        @param unionSparsity    Forwarded to CioEncoder.
        @param retinaScaling    Forwarded to CioEncoder.
        @param retina           Forwarded to CioEncoder.
        @param apiKey           Forwarded to CioEncoder.
        @param classifierMetric Forwarded to KNNClassifier as
                                ``distanceMethod``.
        @param cacheRoot        Directory that holds the "CioCache" folder;
                                defaults to the directory of this file.

        @raise ValueError if fingerprintType is not one of the two supported
            EncoderTypes values.
        """
        super(ClassificationModelFingerprint,
              self).__init__(verbosity=verbosity,
                             numLabels=numLabels,
                             modelDir=modelDir)

        # Init kNN classifier and Cortical.io encoder; need valid API key (see
        # CioEncoder init for details).
        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=verbosity - 1)

        # The previous check, `x is (not A or not B)`, compared the argument
        # against a bool and therefore never rejected anything; a membership
        # test is what was intended.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invaid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligble types.")

        cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  cacheDir=os.path.join(cacheRoot, "CioCache"),
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey)
Example #13
0
  def _initModel(self, k):
    """
    Build the network from a document-level CioEncoder.

    The classifier region parameters of modelConfig (a name defined outside
    this method) are updated in place before the network is configured.

    @param k    Value stored as the classifier region's "k" parameter.
    """
    encoderParams = dict(retinaScaling=self.retinaScaling,
                         retina=self.retina,
                         fingerprintType=EncoderTypes.document,
                         apiKey=self.apiKey,
                         verbosity=self.verbosity-1)
    documentEncoder = CioEncoder(**encoderParams)

    regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
    regionParams["k"] = k
    regionParams["maxCategoryCount"] = self.numLabels

    self.networkConfig = modelConfig
    self.network = configureNetwork(
      None, self.networkConfig, documentEncoder)
Example #14
0
  def _initModel(self, k):
    """
    Build the network from a document-level CioEncoder whose cache sits in a
    "CioCache" directory next to this file.

    The classifier region parameters of modelConfig (a name defined outside
    this method) are updated in place before the network is configured.

    @param k    Value stored as the classifier region's "k" parameter.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(here, "CioCache")
    documentEncoder = CioEncoder(retinaScaling=self.retinaScaling,
                                 cacheDir=cacheDir,
                                 retina=self.retina,
                                 fingerprintType=EncoderTypes.document,
                                 apiKey=self.apiKey)

    regionParams = modelConfig["classifierRegionConfig"]["regionParams"]
    regionParams["k"] = k
    regionParams["maxCategoryCount"] = self.numLabels

    self.networkConfig = modelConfig
    self.network = configureNetwork(
      None, self.networkConfig, documentEncoder)
Example #15
0
    def initModel(self):
        """
        Build the network around a CioEncoder.

        When self.networkDataPath is not None, a FileRecordStream is opened on
        it and handed to configureNetwork; otherwise None is passed instead.

        @return   Whatever configureNetwork returns for this configuration.
        """
        dataPath = self.networkDataPath
        stream = (FileRecordStream(streamID=dataPath)
                  if dataPath is not None else None)

        here = os.path.dirname(os.path.realpath(__file__))
        sensorEncoder = CioEncoder(
            retinaScaling=self.retinaScaling,
            cacheDir=os.path.join(here, "CioCache"),
            retina=self.retina,
            apiKey=self.apiKey)

        # The encoder determines the LanguageSensor output width.
        return configureNetwork(stream, self.networkConfig, sensorEncoder)
  def testRetinaScaling(self):
    """Check CioEncoder dimensions and output under several retina scalings."""
    docType = EncoderTypes.document
    fullEncoder = CioEncoder(retinaScaling=1.0, fingerprintType=docType)
    halfEncoder = CioEncoder(retinaScaling=0.5, fingerprintType=docType)
    oddEncoder = CioEncoder(retinaScaling=0.71, fingerprintType=docType)

    # Scaled encoders must report the scaled (truncated) dimensions.
    self.assertAlmostEqual(int(0.5 * fullEncoder.width), halfEncoder.width)
    self.assertAlmostEqual(int(0.5 * fullEncoder.height), halfEncoder.height)
    self.assertAlmostEqual(int(0.71 * fullEncoder.height), oddEncoder.height)

    fullResponse = fullEncoder.encode(self.text)
    halfResponse = halfEncoder.encode(self.text)
    oddResponse = oddEncoder.encode(self.text)

    # Each bit position should be scaled down by retinaScaling*retinaScaling
    self.assertLessEqual(
      halfResponse["fingerprint"]["positions"].sum(),
      0.5 * 0.5 * fullResponse["fingerprint"]["positions"].sum())

    self.assertLessEqual(
      oddResponse["fingerprint"]["positions"].sum(),
      0.71 * 0.71 * fullResponse["fingerprint"]["positions"].sum())

    # The number of on bits in scaled retina should normally be slightly less
    # than the original, but can be equal in some cases
    halfCount = len(halfResponse["fingerprint"]["positions"])
    self.assertLessEqual(halfCount,
                         len(fullResponse["fingerprint"]["positions"]))
    self.assertLessEqual(halfCount,
                         len(oddResponse["fingerprint"]["positions"]))

    # Check that encodeIntoArray works even with weird scaling
    dense = numpy.zeros(oddEncoder.width * oddEncoder.height)
    oddEncoder.encodeIntoArray(self.text, dense)
    self.assertEqual(len(oddResponse["fingerprint"]["positions"]),
                     dense.sum())
  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelEndpoint",
               unionSparsity=0.20):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.

    @param verbosity      Passed to the superclass constructor.
    @param numLabels      Passed to the superclass constructor.
    @param modelDir       Passed to the superclass constructor.
    @param unionSparsity  Forwarded to CioEncoder.
    """
    super(ClassificationModelEndpoint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # The encoder cache lives in a "CioCache" directory next to this file.
    root = os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"),
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    # targetSparsity is treated as a percentage of n. Dividing by 100.0 keeps
    # the ratio fractional even if targetSparsity is an int under Python 2,
    # where int / int would truncate to 0.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
    def testMaxSparsity(self):
        """Test that CioEncoder's maxSparsity works.

        Four document-level encoders with caps of 100%, 10%, 5% and 1% encode
        the same text. Each result must respect its cap, be non-empty, be a
        subset of the next looser encoding, and be reproducible on a second
        call.
        """

        def digest(response):
            # hashlib requires bytes on Python 3, so the string form is
            # encoded explicitly; this is also valid on Python 2.
            return hashlib.sha224(str(response).encode("utf-8")).hexdigest()

        # This text seems to generate bitmaps with about 8% sparsity
        text = (
            "Smoking harms nearly every organ in your body. Over 7000 chemicals"
            " have been identified in tobacco smoke. After reading all this"
            " James and Sue decided to abruptly quit cigarette smoking to"
            " improve their health but it clearly was not an easy decision.")

        # Encoders with maxSparsity of 100%, 10%, 5%, and 1%, loosest first.
        limits = (1.0, 0.1, 0.05, 0.01)
        encoders = [
            CioEncoder(maxSparsity=limit,
                       fingerprintType=EncoderTypes.document)
            for limit in limits
        ]

        bitmapSize = encoders[0].width * encoders[0].height
        responses = [encoder.encode(text) for encoder in encoders]
        lengths = [len(r["fingerprint"]["positions"]) for r in responses]

        # Encodings must have no more than desired sparsity
        for limit, response in zip(limits, responses):
            self.assertLessEqual(response["sparsity"], limit)

        for limit, length in zip(limits, lengths):
            self.assertLessEqual(length, limit * bitmapSize)

        # Encodings can't be zero
        for length in lengths:
            self.assertGreater(length, 0)

        # Encodings must have complete overlap with the next higher encoding
        positionSets = [set(r["fingerprint"]["positions"]) for r in responses]
        for looser, tighter, tighterLength in zip(
                positionSets, positionSets[1:], lengths[1:]):
            self.assertEqual(len(looser & tighter), tighterLength)

        # Test that if you encode a second time, you get the same bitmap
        repeats = [encoder.encode(text) for encoder in encoders]
        for first, second in zip(responses, repeats):
            self.assertEqual(digest(first), digest(second))
Example #19
0
class ClassificationModelWindows(ClassificationModel):
    """
    Class to run classification tasks with a sliding window of Cortical.io
    word fingerprint encodings.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelWindow",
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap"):
        """
        Set up the kNN classifier and the word-level CioEncoder.

        @param verbosity        Passed to the superclass; the classifier
                                receives verbosity - 1.
        @param numLabels        Passed to the superclass and used as the
                                classifier's ``k``.
        @param modelDir         Passed to the superclass.
        @param unionSparsity    Forwarded to CioEncoder; 90% of it becomes
                                self.minSparsity.
        @param retinaScaling    Forwarded to CioEncoder.
        @param retina           Forwarded to CioEncoder.
        @param apiKey           Forwarded to CioEncoder.
        @param classifierMetric Forwarded to KNNClassifier as
                                ``distanceMethod``.
        """
        super(ClassificationModelWindows, self).__init__(verbosity=verbosity,
                                                         numLabels=numLabels,
                                                         modelDir=modelDir)

        # window patterns below minSparsity will be skipped over
        self.minSparsity = 0.9 * unionSparsity

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=verbosity - 1)

        # need valid API key (see CioEncoder init for details)
        root = os.path.dirname(os.path.realpath(__file__))
        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  cacheDir=os.path.join(root, "CioCache"),
                                  fingerprintType=EncoderTypes.word,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API for
        each word. The resulting bitmaps are unionized in a sliding window.

        @param sample     (list)    Tokenized sample, where each item is a str.
        @return           (list)    Pattern dicts for the windows, each with
                                    the sample text, sparsity, and bitmap.
        """
        return self.encoder.getWindowEncoding(sample, self.minSparsity)

    def writeOutEncodings(self):
        """
        Write the encoding dictionaries to "encoding_log.txt" inside
        self.modelDir; overrides the superclass implementation.

        @raise ValueError if self.modelDir is not an existing directory.
        """
        if not os.path.isdir(self.modelDir):
            raise ValueError("Invalid path to write file.")

        # Cast numpy arrays to list objects for serialization. A deep copy is
        # converted so the patterns held by the model keep their arrays.
        jsonPatterns = copy.deepcopy(self.patterns)
        for jp in jsonPatterns:
            for tokenPattern in jp["pattern"]:
                tokenPattern["bitmap"] = tokenPattern.get(
                    "bitmap", numpy.array([])).tolist()
            jp["labels"] = jp.get("labels", numpy.array([])).tolist()

        with open(os.path.join(self.modelDir, "encoding_log.txt"), "w") as f:
            f.write(json.dumps(jsonPatterns, indent=1))

    def trainModel(self, i):
        """
        Train the classifier on the sample and labels for record i. The list
        sampleReference is populated to correlate classifier prototypes to
        sample IDs. This model is unique in that a single sample contains
        multiple encoded patterns, of which, any that are too sparse are
        skipped over.

        @param i      Index of the record in self.patterns.
        @return       (int)     Number of patterns trained on; 0 when the
                                record has no window patterns.
        """
        # TODO: add batch training, where i is a list
        record = self.patterns[i]

        # With no windows (none were large enough for encoding) the loops do
        # not run and the count stays 0, matching the documented int return.
        count = 0
        for window in record["pattern"]:
            for label in record["labels"]:
                # The encoder width n is what gets passed as isSparse.
                self.classifier.learn(window["bitmap"],
                                      label,
                                      isSparse=self.encoder.n)
                self.sampleReference.append(record["ID"])
                count += 1

        return count

    def testModel(self, i, seed=42):
        """
        Test the model on record i. Returns the classifications most frequent
        amongst the classifications of the sample's individual tokens.
        We ignore the terms that are unclassified, picking the most frequent
        classifications among those that are detected; in getWinningLabels().

        @param i      Index of the record in self.patterns.
        @param seed   Passed through to getWinningLabels().
        @return       (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
        """
        # Stays None when the record has no usable patterns; that None is
        # what getWinningLabels() receives in that case.
        totalInferenceResult = None
        for pattern in self.patterns[i]["pattern"]:
            if not pattern:
                continue

            _, inferenceResult, _, _ = self.classifier.infer(
                self.sparsifyPattern(pattern["bitmap"], self.encoder.n))

            if totalInferenceResult is None:
                totalInferenceResult = inferenceResult
            else:
                totalInferenceResult += inferenceResult

        return self.getWinningLabels(totalInferenceResult, seed)

    def queryModel(self, query, preprocess=False):
        """
        Preprocesses the query, encodes it into a pattern, then queries the
        classifier to infer distances to trained-on samples.

        @param query        Text to look up.
        @param preprocess   (bool)  When True, tokenization also drops common
                                    words and "[identifier deleted]" strings
                                    and applies spelling correction.
        @return       (list)        Two-tuples of sample ID and distance,
                                    sorted closest to farthest from the query.
        @raise IndexError if the number of distances returned by infer()
            differs from the number of trained-on prototypes.
        """
        if preprocess:
            sample = TextPreprocess().tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(query)

        # Get window patterns for the query, but if the query is too small such
        # that the window encodings are too sparse, we default to a pure union.
        encodedQuery = self.encodeSample(sample)
        if len(encodedQuery) == 0:
            sample = " ".join(sample)
            fpInfo = self.encoder.getUnionEncoding(sample)
            encodedQuery = [{
                "text":
                fpInfo["text"],
                "sparsity":
                fpInfo["sparsity"],
                "bitmap":
                numpy.array(fpInfo["fingerprint"]["positions"])
            }]
        allDistances = self.infer(encodedQuery)

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of protoype distances must match number of "
                "samples trained on.")

        # Keep the smallest prototype distance per sample ID in one pass; the
        # earlier form rescanned the whole reference list for every entry.
        sampleDistances = {}
        for uniqueID, distance in zip(self.sampleReference, allDistances):
            if (uniqueID not in sampleDistances
                    or distance < sampleDistances[uniqueID]):
                sampleDistances[uniqueID] = distance

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))

    def infer(self, patterns):
        """
        Get the classifier output for the input patterns; assumes classifier
        has an infer() method (as specified in NuPIC kNN implementation). For
        this model we sum the distances across the patterns and divide by the
        number of patterns before returning.

        NOTE: there is no check here that the pattern sparsities are > the
        minimum.

        @param patterns   (list)    Pattern dicts, each with a "bitmap" entry;
                                    must contain at least one item.
        @return       (numpy.array)       Each entry is the mean distance from
            the input patterns to that prototype (pattern in the classifier).
            Distances are expected to lie between 0.0 and 1.0; that range is
            not enforced here.
        @raise ValueError if patterns is empty.
        """
        numPatterns = len(patterns)
        if numPatterns == 0:
            # Previously this path failed on an unbound loop variable.
            raise ValueError(
                "At least one pattern is required to infer distances.")

        distances = numpy.zeros((self.classifier._numPatterns))

        for p in patterns:
            (_, _, dist, _) = self.classifier.infer(
                self.sparsifyPattern(p["bitmap"], self.encoder.n))

            distances = distances + dist

        return distances / float(numPatterns)
class ClassificationModelWindows(ClassificationModel):
  """
  Class to run classification tasks with a sliding windwo of Coritcal.io word
  fingerprint encodings.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelWindow",
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):

    super(ClassificationModelWindows, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # window patterns below minSparsity will be skipped over
    self.minSparsity = 0.9 * unionSparsity

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    # need valid API key (see CioEncoder init for details)
    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=EncoderTypes.word,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API for each
    word. The resulting bitmaps are unionized in a sliding window.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return           (list)        Pattern dicts for the windows, each with the
                                    sample text, sparsity, and bitmap.
    """
    return self.encoder.getWindowEncoding(sample, self.minSparsity)


  def writeOutEncodings(self):
    """
    Write the encoding dictionaries to a txt file; overrides the superclass
    implementation.
    """
    if not os.path.isdir(self.modelDir):
      raise ValueError("Invalid path to write file.")

    # Cast numpy arrays to list objects for serialization.
    jsonPatterns = copy.deepcopy(self.patterns)
    for jp in jsonPatterns:
      for tokenPattern in jp["pattern"]:
        tokenPattern["bitmap"] = tokenPattern.get(
          "bitmap", numpy.array([])).tolist()
      jp["labels"] = jp.get("labels", numpy.array([])).tolist()

    with open(os.path.join(self.modelDir, "encoding_log.txt"), "w") as f:
      f.write(json.dumps(jsonPatterns, indent=1))


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs. This model is unique in that a single sample contains multiple encoded
    patterns, of which, any that are too sparse are skipped over.

    @return       (int)     Number of patterns trained on.
    """
    patternWindows = self.patterns[i]["pattern"]
    if len(patternWindows) == 0:
      # no patterns b/c no windows were large enough for encoding
      return
    count = 0
    for window in patternWindows:
      for label in self.patterns[i]["labels"]:
        self.classifier.learn(
          window["bitmap"], label, isSparse=self.encoder.n)
        self.sampleReference.append(self.patterns[i]["ID"])
        count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. Returns the classifications most frequent
    amongst the classifications of the sample's individual tokens.
    We ignore the terms that are unclassified, picking the most frequent
    classifications among those that are detected; in getWinningLabels().

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    totalInferenceResult = None
    for pattern in self.patterns[i]["pattern"]:
      if not pattern:
        continue

      _, inferenceResult, _, _ = self.classifier.infer(
        self.sparsifyPattern(pattern["bitmap"], self.encoder.n))

      if totalInferenceResult is None:
        totalInferenceResult = inferenceResult
      else:
        totalInferenceResult += inferenceResult

    return self.getWinningLabels(totalInferenceResult, seed)


  def queryModel(self, query, preprocess=False):
    """
    Tokenizes the query, encodes it into window patterns, then queries the
    classifier to infer distances to the trained-on samples. A sample that was
    learned as several prototypes is reported once, with the smallest of its
    prototype distances.

    @param query      (str)       Text to query with.
    @param preprocess (bool)      If True, tokenize with the extra cleanup
                                  options (common-word filtering, string
                                  removal, spell correction).
    @return           (list)      Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    @raise IndexError             If the classifier returns a different number
                                  of distances than there are entries in
                                  self.sampleReference.
    """
    if preprocess:
      sample = TextPreprocess().tokenize(query,
                                         ignoreCommon=100,
                                         removeStrings=["[identifier deleted]"],
                                         correctSpell=True)
    else:
      sample = TextPreprocess().tokenize(query)

    # Get window patterns for the query, but if the query is too small such that
    # the window encodings are too sparse, we default to a pure union.
    encodedQuery = self.encodeSample(sample)
    if len(encodedQuery) == 0:
      sample = " ".join(sample)
      fpInfo = self.encoder.getUnionEncoding(sample)
      encodedQuery = [{
        "text":fpInfo["text"],
        "sparsity":fpInfo["sparsity"],
        "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])
      }]
    allDistances = self.infer(encodedQuery)

    if len(allDistances) != len(self.sampleReference):
      raise IndexError("Number of protoype distances must match number of "
                       "samples trained on.")

    # One pass over the prototypes, keeping the smallest distance seen for each
    # sample ID (previously every ID triggered a full rescan of the list).
    sampleDistances = {}
    for uniqueID, distance in zip(self.sampleReference, allDistances):
      if (uniqueID not in sampleDistances or
          distance < sampleDistances[uniqueID]):
        sampleDistances[uniqueID] = distance

    return sorted(sampleDistances.items(), key=operator.itemgetter(1))


  def infer(self, patterns):
    """
    Get the classifier output for a sample made of several input patterns;
    assumes the classifier has an infer() method (as specified in NuPIC kNN
    implementation). For this model we sum the distances across the patterns
    and divide by the number of patterns before returning.

    NOTE: there is no check here that the pattern sparsities are > the minimum.

    @param patterns (iterable)        Dicts that each hold a "bitmap" entry.
    @return         (numpy.array)     Each entry is the mean distance from the
        input patterns to that prototype (pattern in the classifier).
    @raise ValueError                 If patterns is empty, since there is
        nothing to average.
    """
    distances = numpy.zeros((self.classifier._numPatterns))

    # Count explicitly so that any iterable works and an empty input is
    # detected instead of failing on an unbound loop variable.
    numPatterns = 0
    for p in patterns:
      (_, _, dist, _) = self.classifier.infer(
        self.sparsifyPattern(p["bitmap"], self.encoder.n))

      distances = distances + dist
      numPatterns += 1

    if numPatterns == 0:
      raise ValueError("Cannot infer distances without at least one pattern.")

    return distances / float(numPatterns)
  def testMaxSparsity(self):
    """Check that CioEncoder honors its maxSparsity setting."""

    # This text seems to generate bitmaps with about 8% sparsity
    text = ("Smoking harms nearly every organ in your body. Over 7000 chemicals"
            " have been identified in tobacco smoke. After reading all this"
            " James and Sue decided to abruptly quit cigarette smoking to"
            " improve their health but it clearly was not an easy decision.")

    # Sparsity caps of 100%, 10%, 5%, and 1%, ordered loosest to tightest
    caps = (1.0, 0.1, 0.05, 0.01)
    encoders = [
      CioEncoder(maxSparsity=cap, fingerprintType=EncoderTypes.document)
      for cap in caps]

    bitmapSize = encoders[0].width*encoders[0].height
    responses = [encoder.encode(text) for encoder in encoders]
    lengths = [len(response["fingerprint"]["positions"])
               for response in responses]

    # Encodings must have no more than desired sparsity
    for response, cap in zip(responses, caps):
      self.assertLessEqual(response["sparsity"], cap)

    maxBits = [bitmapSize, 0.1*bitmapSize, 0.05*bitmapSize, 0.01*bitmapSize]
    for length, limit in zip(lengths, maxBits):
      self.assertLessEqual(length, limit)

    # Encodings can't be zero
    for length in lengths:
      self.assertGreater(length, 0)

    # Each encoding must be fully contained in the next looser encoding
    bitSets = [set(response["fingerprint"]["positions"])
               for response in responses]
    for looser, tighter, tighterLength in zip(
        bitSets, bitSets[1:], lengths[1:]):
      self.assertEqual(len(looser & tighter), tighterLength)

    # Encoding the same text a second time must reproduce the same response
    repeats = [encoder.encode(text) for encoder in encoders]
    for first, second in zip(responses, repeats):
      self.assertEqual(hashlib.sha224(str(first)).hexdigest(),
                       hashlib.sha224(str(second)).hexdigest())
Example #22
0
class ClassificationModelContext(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text context: each training sample is expanded into the union of its context
  fingerprints, which is then ANDed (intersected) with the bitmap stored for
  each of the sample's categories.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key.

    @param verbosity   (int)    Forwarded to the base class; values above 1
                                 make _encodeText() print fingerprint sparsity.
    @param numLabels   (int)    Number of winning categories testModel()
                                 returns.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    root = os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"))
    self.client = CorticalClient(self.encoder.apiKey)

    self.n = self.encoder.n
    # Divide by a float: under Python 2 an integer targetSparsity would
    # otherwise be truncated to zero by integer division.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    # Maps each label to the bitmap (sorted array of bit positions) learned
    # for that category.
    self.categoryBitmaps = {}
    self.numLabels = numLabels


  def encodePattern(self, pattern):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param pattern     (list)           Tokenized sample, where each item is a
                                        string
    @return            (dictionary)     Dictionary, containing text, sparsity,
                                        and bitmap. The sparsity entry is
                                        always reported as 0.0.
    Example return dict:
    {
      "text": "Example text",
      "sparsity": 0.0,
      "bitmap": numpy.zeros(0)
    }
    """
    text = " ".join(pattern)
    return {"text": text, "sparsity": 0.0, "bitmap": self._encodeText(text)}


  def _encodeText(self, text):
    """
    Return the bitmap for the text as an integer numpy array. If the encoder
    returns nothing for the text, fall back to encodeRandomly() with the
    model's n and w.
    """
    fpInfo = self.encoder.encode(text)

    if fpInfo:
      # Only report sparsity when there is an encoding to read it from.
      if self.verbosity > 1:
        print("Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"]))
      bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
    else:
      bitmap = self.encodeRandomly(text, self.n, self.w)

    return bitmap.astype(int)


  def resetModel(self):
    """Reset the model by forgetting all learned category bitmaps."""
    self.categoryBitmaps.clear()


  def trainModel(self, samples, labels):
    """
    Train the category bitmaps on the input samples and labels. For each
    sample, Cortical.io's context lookup supplies up to five context
    fingerprints; their union is intersected with the bitmap stored for each
    of the sample's labels.

    @param samples     (list)     Dictionaries as returned by encodePattern();
                                  only the "bitmap" entry is used.
    @param labels      (list)     For each sample, the iterable of labels
                                  (reference indices) that classify it.
    """
    for sample, sample_labels in zip(samples, labels):
      bitmaps = [sample["bitmap"].tolist()]
      context = self.client.getContextFromText(bitmaps, maxResults=5,
                                               getFingerprint=True)

      if len(context) == 0:
        # Nothing to learn from a sample without context.
        continue

      union = numpy.zeros(0)
      for c in context:
        bitmap = c["fingerprint"]["positions"]
        union = numpy.union1d(bitmap, union).astype(int)

      for label in sample_labels:
        # Haven't seen the label before
        if label not in self.categoryBitmaps:
          self.categoryBitmaps[label] = union

        stored = self.categoryBitmaps[label]
        intersection = numpy.intersect1d(union, stored)
        if intersection.size == 0:
          # Don't want to lose all the old information. Merge into a separate
          # array so this label's bitmap does not leak into the next label.
          merged = numpy.union1d(union, stored).astype(int)
          # Need to sample to stay sparse
          count = len(merged)
          sampleIndices = random.sample(xrange(count), min(count, self.w))
          intersection = numpy.sort(merged[sampleIndices])

        self.categoryBitmaps[label] = intersection


  def testModel(self, sample):
    """
    Test the category bitmaps on the input sample. The Cortical.io client
    compares the sample with every category, giving a dictionary of distance
    metrics per category; the categories are then ranked by "overlappingAll".

    @param sample     (dictionary)      Dictionary, containing text, sparsity,
                                        and bitmap
    @return           (list)            The self.numLabels best categories.
    Example of the per-category comparison dict that is ranked:
      {
        0: {
          "cosineSimilarity": 0.6666666666666666,
          "euclideanDistance": 0.3333333333333333,
          "jaccardDistance": 0.5,
          "overlappingAll": 6,
          "overlappingLeftRight": 0.6666666666666666,
          "overlappingRightLeft": 0.6666666666666666,
          "sizeLeft": 9,
          "sizeRight": 9,
          "weightedScoring": 0.4436476984102028
        }
      }
    """

    sampleBitmap = sample["bitmap"].tolist()

    distances = {}
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.client.compare(sampleBitmap, catBitmap.tolist())

    return self.winningLabels(distances, numberCats=self.numLabels,
      metric="overlappingAll")


  @staticmethod
  def winningLabels(distances, numberCats, metric):
    """
    Return the winning categories, based off of the input metric.
    Overrides the base class implementation.

    @param distances   (dict)   Maps category to its dict of metric values.
    @param numberCats  (int)    Maximum number of categories to return.
    @param metric      (str)    Key of the metric to rank by.
    @return            (list)   Category keys, best first.
    """
    # Freeze one key order so metric values and labels stay index-aligned.
    labels = list(distances.keys())
    metricValues = numpy.array([distances[label][metric] for label in labels])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
      "overlappingRightLeft", "cosineSimilarity", "weightedScoring")
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return [labels[catIdx] for catIdx in sortedIdx[:numberCats]]
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    @param verbosity, numLabels, modelDir   Forwarded to the base class;
        numLabels is also used as k for the kNN classifier.
    @param fingerprintType    Must be EncoderTypes.document or
        EncoderTypes.word.
    @param unionSparsity, retinaScaling, retina, apiKey   Forwarded to
        CioEncoder.
    @param classifierMetric   (str)   Distance method for the kNN classifier.
    @param cacheRoot          (str)   Directory that holds the "CioCache"
        directory; defaults to the directory of this file.
    @raise ValueError         If fingerprintType is not an eligible type.
    """
    super(ClassificationModelFingerprint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Init kNN classifier and Cortical.io encoder; need valid API key (see
    # CioEncoder init for details).
    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity-1)

    # Membership test; the former "is (not A or not B)" compared the type
    # against a boolean and therefore never rejected anything.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invaid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligble types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    encoder returns nothing, we create a random SDR with the encoder's
    dimensions n and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      # The response carries the encoded string under "text" or "term".
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      fp = {"text":sample,
            "sparsity":float(self.encoder.w)/self.encoder.n,
            "bitmap":self.encodeRandomly(
              sample, self.encoder.n, self.encoder.w)}

    return fp


  def trainModel(self, i):
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs. Records whose bitmap has no nonzero entry are skipped.

    @param i      (int)     Index (key) of the record in self.patterns.
    @return       (int)     Number of prototypes learned, i.e. one per label,
                            or 0 if the record was skipped or has no labels.
    """
    # TODO: add batch training, where i is a list
    bitmap = self.patterns[i]["pattern"]["bitmap"]
    count = 0
    if bitmap.any():
      for label in self.patterns[i]["labels"]:
        self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
        self.sampleReference.append(self.patterns[i]["ID"])
        # Count per learned prototype so an empty label list reports 0.
        count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. The random seed is used in getWinningLabels().

    @param i          (int)           Index (key) of the record in
                                      self.patterns.
    @param seed       (int)           Passed through to getWinningLabels().
    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    (_, inferenceResult, _, _) = self.classifier.infer(self.sparsifyPattern(
      self.patterns[i]["pattern"]["bitmap"], self.encoder.n))
    return self.getWinningLabels(inferenceResult, seed)
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               k=1,
               classifierMetric="rawOverlap",
               cacheRoot=None,
               **kwargs):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    @param fingerprintType    Must be EncoderTypes.document or
        EncoderTypes.word.
    @param unionSparsity, retinaScaling, retina, apiKey   Forwarded to
        CioEncoder.
    @param k                  (int)   Number of neighbors for the kNN
        classifier.
    @param classifierMetric   (str)   Distance method for the kNN classifier.
    @param cacheRoot          Forwarded to CioEncoder as its cacheDir.
    @param kwargs             Forwarded to the base class constructor.
    @raise ValueError         If fingerprintType is not an eligible type.
    """
    super(ClassificationModelFingerprint, self).__init__(**kwargs)

    self.classifier = KNNClassifier(k=k,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=self.verbosity-1)

    # Need a valid API key for the Cortical.io encoder (see CioEncoder
    # constructor for details).
    # Membership test; the former "is (not A or not B)" compared the type
    # against a boolean and therefore never rejected anything.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligble types.")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey,
                              cacheDir=cacheRoot)

    # Tokens of the document currently being accumulated; None between
    # documents.
    self.currentDocument = None


  def trainToken(self, token, labels, sampleId, reset=0):
    """
    Train the model with the given text token, associated labels, and
    sampleId. Tokens are only accumulated until a call arrives with reset=1;
    at that point the whole document is encoded and learned once per label.

    See base class for params and return type descriptions.
    """
    if self.currentDocument is None:
      # start of a new document
      self.currentDocument = [token]
    else:
      # accumulate text for this document
      self.currentDocument.append(token)

    if reset == 1:
      # all text accumulated, proceed w/ training on this document
      document = " ".join(self.currentDocument)
      bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

      if self.verbosity >= 2:
        print("CioFP model training with: '{}'".format(document))
        print("\tBitmap: {}".format(bitmap))

      for label in labels:
        self.classifier.learn(
            bitmap, label, isSparse=self.encoder.n, partitionId=sampleId)

      self.currentDocument = None


  def inferToken(self, token, reset=0, returnDetailedResults=False,
                 sortResults=True):
    """
    Classify the token (i.e. run inference on the model with this document) and
    return classification results and (optionally) a list of sampleIds and
    distances.   Repeated sampleIds are NOT removed from the results.

    While reset is 0 the token is only accumulated and placeholder results
    (zeros, an empty id list, an empty distance array) are returned.

    See base class for params and return type descriptions.
    """
    if self.currentDocument is None:
      # start of a new document
      self.currentDocument = [token]
    else:
      # accumulate text for this document
      self.currentDocument.append(token)

    if reset == 0:
      return numpy.zeros(self.numLabels), [], numpy.zeros(0)

    # With reset=1, all text accumulated, proceed w/ classifying this document
    document = " ".join(self.currentDocument)
    bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

    densePattern = self.encoder.densifyPattern(bitmap)

    (_, inferenceResult, dist, _) = self.classifier.infer(densePattern)

    if self.verbosity >= 2:
      print("CioFP model inference with: '{}'".format(document))
      print("\tBitmap: {}".format(bitmap))
      print("\tInference result= {}".format(inferenceResult))
      print("\tDistances= {}".format(dist))

    self.currentDocument = None

    # Figure out format of returned results

    if not returnDetailedResults:
      # Return non-detailed results.
      return inferenceResult, None, None

    if not sortResults:
      idList = [self.classifier.getPartitionId(i) for i in xrange(len(dist))]
      return inferenceResult, idList, dist

    # Return sorted results, smallest distance first
    sortedIndices = dist.argsort()
    idList = [self.classifier.getPartitionId(i) for i in sortedIndices]
    sortedDistances = dist[sortedIndices]
    return inferenceResult, idList, sortedDistances


  def getEncoder(self):
    """
    Returns the encoder instance for the model.
    """
    return self.encoder


  def getClassifier(self):
    """
    Returns the classifier instance for the model.
    """
    return self.classifier
Example #25
0
class ClassificationModelEndpoint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text endpoint encodings and classification system.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelEndpoint",
               unionSparsity=0.20,
               cacheRoot=None):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.

    @param verbosity, numLabels, modelDir   Forwarded to the base class.
    @param unionSparsity      Forwarded to CioEncoder.
    @param cacheRoot  (str)   Directory that holds the "CioCache" directory;
                              defaults to the directory of this file.
    """
    super(ClassificationModelEndpoint, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(cacheDir=os.path.join(cacheRoot, "CioCache"),
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    # Divide by a float: under Python 2 an integer targetSparsity would
    # otherwise be truncated to zero by integer division.
    self.w = int((self.encoder.targetSparsity/100.0) * self.n)

    # Per-label state: the category bitmap plus the positive and negative
    # example texts it is built from.
    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    encoder returns nothing, a random SDR with the model's n and w is used.

    @param sample         (list)          Tokenized sample, where each item is
                                          a string
    @return fp            (dict)          The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      # The response carries the encoded string under "text" or "term".
      fp = {"text":fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity":fpInfo["sparsity"],
            "bitmap":numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      fp = {"text":sample,
            "sparsity":float(self.w)/self.n,
            "bitmap":self.encodeRandomly(sample, self.n, self.w)}

    return fp


  def resetModel(self):
    """Reset the model by clearing all examples and category bitmaps."""
    self.positives.clear()
    self.negatives.clear()
    self.categoryBitmaps.clear()


  def trainModel(self, i, negatives=None):
    """
    Train the classifier on the sample and labels for record i. Use
    Cortical.io's createCategory() to make a bitmap that represents the
    class. The record index is appended to sampleReference once for every
    category bitmap that gets rebuilt.

    @param i          (int)             Index (key) of the record in
                                        self.patterns.
    @param negatives  (list)            Each item is the dictionary containing
                                        text, sparsity and bitmap for the
                                        negative samples.
    """
    # TODO: add batch training, where i is a list; note we should only add
    # negatives when training on one sample so we know which labels to use.
    record = self.patterns[i]
    labelsToUpdateBitmaps = set()
    for label in record["labels"]:
      # Only samples with text and a nonzero bitmap contribute examples.
      if record["pattern"]["text"] and record["pattern"]["bitmap"].any():
        self.positives[label].append(record["pattern"]["text"])
        if negatives:
          for neg in negatives:
            if neg["text"]:
              self.negatives[label].append(neg["text"])
        labelsToUpdateBitmaps.add(label)

    for label in labelsToUpdateBitmaps:
      self.categoryBitmaps[label] = self.encoder.createCategory(
        str(label), self.positives[label], self.negatives[label])["positions"]
      self.sampleReference.append(i)


  def testModel(self, i, _, metric="overlappingAll"):
    """
    Test on record i. The comparison encoder returns a dictionary containing
    various distance metrics between the sample and each category.

    @param i          (int)           Index (key) of the record in
                                      self.patterns.
    @param _                          Unused.
    @param metric     (str)           Distance metric used to rank categories.
    @return           (numpy array)   The numLabels best-ranked categories;
                                      empty if there are no categories.
    """
    sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

    distances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.compareEncoder.compare(sampleBitmap, catBitmap)

    return self.getWinningLabels(distances, metric=metric)


  def getWinningLabels(self, distances, metric):
    """
    Return the winning categories, based off of the input metric.
    Overrides the base class implementation.

    @param distances   (dict)          Maps category to its dict of metric
                                       values.
    @param metric      (str)           Key of the metric to rank by.
    @return            (numpy array)   Up to self.numLabels category keys,
                                       best first.
    """
    # Freeze one key order so metric values and labels stay index-aligned.
    labels = list(distances.keys())
    metricValues = numpy.array([distances[label][metric] for label in labels])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return numpy.array(
      [labels[catIdx] for catIdx in sortedIdx[:self.numLabels]])


  def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
    """
    Return a dict where keys are categories and values are dicts of distances.

    @param sort      (bool)        Sort the inner dicts with compareCategories()
    @param save      (str)         Dump catDistances to a JSON in this dir.
    @param labelRefs               Passed through to writeOutCategories() when
                                   saving.
    @return          (defaultdict)

    E.g. w/ categories 0 and 1:
      catDistances = {
          0: {
              0: {"cosineSimilarity": 1.0, ...},
              1: {"cosineSimilarity": 0.33, ...}
              },
          1: {
              0: {"cosineSimilarity": 0.33, ...},
              1: {"cosineSimilarity": 1.0, ...}
              }
          }
    Note the inner-dicts of catDistances are OrderedDict objects. With
    sort=True the inner values are reduced to the single metric value that
    compareCategories() ranks by.
    """
    catDistances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      catDistances[cat] = OrderedDict()
      for compareCat, compareBitmap in self.categoryBitmaps.iteritems():
        # List is in order of self.categoryBitmaps.keys()
        catDistances[cat][compareCat] = self.compareEncoder.compare(
          catBitmap, compareBitmap)

    if sort:
      # Order each inner dict of catDistances such that the ranking is most to
      # least similar.
      catDistances = self.compareCategories(catDistances)

    if save is not None:
      self.writeOutCategories(
        save, comparisons=catDistances, labelRefs=labelRefs)

    return catDistances


  @staticmethod
  def compareCategories(catDistances, metric="overlappingAll"):
    """
    Calculate category distances. Returns a defaultdict of category keys, where
    values are OrderedDicts sorted such that the most similar categories
    (according to the input metric) are listed first. Each inner value is the
    single value of the chosen metric.
    """
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")
    # Similarity-type metrics rank high-to-low; distances rank low-to-high.
    reverse = metric in descendingOrder

    categoryComparisons = defaultdict(list)
    for k, v in catDistances.iteritems():
      # Create a dict for this category
      metricDict = {compareCat: distances[metric]
                    for compareCat, distances in v.iteritems()}
      # Sort the dict by the metric
      categoryComparisons[k] = OrderedDict(
        sorted(metricDict.items(), key=lambda item: item[1], reverse=reverse))

    return categoryComparisons


  @staticmethod
  def query():
    """Unsupported by this model; only prints a notice."""
    print("The Classification Endpoint model doesn't support this method.")


  @staticmethod
  def infer():
    """Unsupported by this model; only prints a notice."""
    print("The Classification Endpoint model doesn't support this method.")
class ClassificationModelFingerprint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    fingerprint encodings.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """
    def __init__(self,
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 k=1,
                 classifierMetric="rawOverlap",
                 cacheRoot=None,
                 **kwargs):
        """
        Set up the kNN classifier and the Cortical.io encoder.

        @param fingerprintType    Must be EncoderTypes.document or
            EncoderTypes.word.
        @param unionSparsity, retinaScaling, retina, apiKey   Forwarded to
            CioEncoder.
        @param k                  (int)   Number of neighbors for the kNN
            classifier.
        @param classifierMetric   (str)   Distance method for the kNN
            classifier.
        @param cacheRoot          Forwarded to CioEncoder as its cacheDir.
        @param kwargs             Forwarded to the base class constructor.
        @raise ValueError         If fingerprintType is not an eligible type.
        """
        super(ClassificationModelFingerprint, self).__init__(**kwargs)

        self.classifier = KNNClassifier(k=k,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=self.verbosity - 1)

        # Need a valid API key for the Cortical.io encoder (see CioEncoder
        # constructor for details).
        # Membership test; the former "is (not A or not B)" compared the type
        # against a boolean and therefore never rejected anything.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invalid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligble types.")

        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey,
                                  cacheDir=cacheRoot)

        # Tokens of the document currently being accumulated; None between
        # documents.
        self.currentDocument = None

    def trainToken(self, token, labels, sampleId, reset=0):
        """
        Train the model with the given text token, associated labels, and
        sampleId. Tokens are only accumulated until a call arrives with
        reset=1; at that point the whole document is encoded and learned once
        per label.

        See base class for params and return type descriptions.
        """
        if self.currentDocument is None:
            # start of a new document
            self.currentDocument = [token]
        else:
            # accumulate text for this document
            self.currentDocument.append(token)

        if reset == 1:
            # all text accumulated, proceed w/ training on this document
            document = " ".join(self.currentDocument)
            bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

            if self.verbosity >= 2:
                print("CioFP model training with: '{}'".format(document))
                print("\tBitmap: {}".format(bitmap))

            for label in labels:
                self.classifier.learn(bitmap,
                                      label,
                                      isSparse=self.encoder.n,
                                      partitionId=sampleId)

            self.currentDocument = None

    def inferToken(self,
                   token,
                   reset=0,
                   returnDetailedResults=False,
                   sortResults=True):
        """
        Classify the token (i.e. run inference on the model with this
        document) and return classification results and (optionally) a list of
        sampleIds and distances. Repeated sampleIds are NOT removed from the
        results.

        While reset is 0 the token is only accumulated and placeholder results
        (zeros, an empty id list, an empty distance array) are returned.

        See base class for params and return type descriptions.
        """
        if self.currentDocument is None:
            # start of a new document
            self.currentDocument = [token]
        else:
            # accumulate text for this document
            self.currentDocument.append(token)

        if reset == 0:
            return numpy.zeros(self.numLabels), [], numpy.zeros(0)

        # With reset=1, all text accumulated, proceed w/ classifying this document
        document = " ".join(self.currentDocument)
        bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

        densePattern = self.encoder.densifyPattern(bitmap)

        (_, inferenceResult, dist, _) = self.classifier.infer(densePattern)

        if self.verbosity >= 2:
            print("CioFP model inference with: '{}'".format(document))
            print("\tBitmap: {}".format(bitmap))
            print("\tInference result= {}".format(inferenceResult))
            print("\tDistances= {}".format(dist))

        self.currentDocument = None

        # Figure out format of returned results

        if not returnDetailedResults:
            # Return non-detailed results.
            return inferenceResult, None, None

        if not sortResults:
            idList = [
                self.classifier.getPartitionId(i) for i in xrange(len(dist))
            ]
            return inferenceResult, idList, dist

        # Return sorted results, smallest distance first
        sortedIndices = dist.argsort()
        idList = [self.classifier.getPartitionId(i) for i in sortedIndices]
        sortedDistances = dist[sortedIndices]
        return inferenceResult, idList, sortedDistances

    def getEncoder(self):
        """
        Returns the encoder instance for the model.
        """
        return self.encoder

    def getClassifier(self):
        """
        Returns the classifier instance for the model.
        """
        return self.classifier
    def testRetinaScaling(self):
        """Check that CioEncoder scales the retina dimensions and bitmaps."""

        docType = EncoderTypes.document
        cio = CioEncoder(retinaScaling=1.0, fingerprintType=docType)
        cioScaled = CioEncoder(retinaScaling=0.5, fingerprintType=docType)
        cioScaled2 = CioEncoder(retinaScaling=0.71, fingerprintType=docType)

        # Scaled dimensions are the truncated products of the full dimensions
        self.assertAlmostEqual(int(0.5 * cio.width), cioScaled.width)
        self.assertAlmostEqual(int(0.5 * cio.height), cioScaled.height)
        self.assertAlmostEqual(int(0.71 * cio.height), cioScaled2.height)

        fullBits = cio.encode(self.text)["fingerprint"]["positions"]
        halfBits = cioScaled.encode(self.text)["fingerprint"]["positions"]
        oddBits = cioScaled2.encode(self.text)["fingerprint"]["positions"]

        # Each bit position should be scaled down by retinaScaling*retinaScaling
        self.assertLessEqual(halfBits.sum(), 0.5 * 0.5 * fullBits.sum())
        self.assertLessEqual(oddBits.sum(), 0.71 * 0.71 * fullBits.sum())

        # The number of on bits in scaled retina should normally be slightly less
        # than the original, but can be equal in some cases
        self.assertLessEqual(len(halfBits), len(fullBits))
        self.assertLessEqual(len(halfBits), len(oddBits))

        # Check that encodeIntoArray works even with weird scaling
        target = numpy.zeros(cioScaled2.width * cioScaled2.height)
        cioScaled2.encodeIntoArray(self.text, target)
        self.assertEqual(len(oddBits), target.sum())
Example #28
0
class ClassificationModelFingerprint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    fingerprint encodings.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=0.20,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 classifierMetric="rawOverlap",
                 cacheRoot=None):
        """
        Initialize the kNN classifier and the Cortical.io encoder; a valid API
        key is needed (see CioEncoder init for details).

        @param verbosity        Forwarded to the base class; the classifier is
                                created with verbosity - 1.
        @param numLabels        Forwarded to the base class; also used as k for
                                the kNN classifier.
        @param modelDir         Forwarded to the base class.
        @param fingerprintType  EncoderTypes.document or EncoderTypes.word.
        @param unionSparsity    Forwarded to CioEncoder.
        @param retinaScaling    Forwarded to CioEncoder.
        @param retina           Forwarded to CioEncoder.
        @param apiKey           Forwarded to CioEncoder.
        @param classifierMetric Used as the classifier's distanceMethod.
        @param cacheRoot        Directory that holds the "CioCache" directory;
                                defaults to the directory of this file.

        @raise ValueError if fingerprintType is not an eligible EncoderTypes
               member.
        """
        # Reject unsupported encodings before anything is constructed. The
        # previous check compared fingerprintType by identity against a boolean
        # expression, so it could never detect an invalid value.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invaid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligble types.")

        super(ClassificationModelFingerprint,
              self).__init__(verbosity=verbosity,
                             numLabels=numLabels,
                             modelDir=modelDir)

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod=classifierMetric,
                                        exact=False,
                                        verbosity=verbosity - 1)

        # Without an explicit cache root, cache next to this source file.
        cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

        self.encoder = CioEncoder(retinaScaling=retinaScaling,
                                  cacheDir=os.path.join(cacheRoot, "CioCache"),
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity,
                                  retina=retina,
                                  apiKey=apiKey)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API. If
        the encoder returns nothing (None or any other falsy value), we create
        a random SDR with the encoder's dimensions n and w.

        @param sample     (list)        Tokenized sample, where each item is a
                                        str.
        @return fp        (dict)        The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)

        if not fpInfo:
            # No fingerprint came back: fall back to a random encoding.
            return {
                "text": sample,
                "sparsity": float(self.encoder.w) / self.encoder.n,
                "bitmap": self.encodeRandomly(
                    sample, self.encoder.n, self.encoder.w),
            }

        # The encoder response carries the text under "text" when present and
        # under "term" otherwise.
        return {
            "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity": fpInfo["sparsity"],
            "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
        }

    def trainModel(self, i):
        """
        Train the classifier on the sample and labels for record i. The list
        sampleReference is populated to correlate classifier prototypes to
        sample IDs.

        @param i          Index into self.patterns of the record to learn.
        @return           (int)   Number of (bitmap, label) pairs given to the
                                  classifier; 0 when the bitmap has no active
                                  bits or the record has no labels.
        """
        # TODO: add batch training, where i is a list
        record = self.patterns[i]
        bitmap = record["pattern"]["bitmap"]
        if not bitmap.any():
            return 0

        # Count the learn calls directly; deriving the count from the last
        # enumerate index reported 1 for a record with an empty label list.
        count = 0
        for label in record["labels"]:
            # The bitmap is passed as a sparse pattern, with isSparse set to
            # the encoder's n.
            self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
            self.sampleReference.append(record["ID"])
            count += 1

        return count

    def testModel(self, i, seed=42):
        """
        Test the model on record i. The random seed is used in
        getWinningLabels().

        @param i          Index into self.patterns of the record to classify.
        @param seed       Passed through to getWinningLabels().
        @return           (numpy array)   numLabels most-frequent
                                          classifications for the data samples;
                                          int or empty.
        """
        bitmap = self.patterns[i]["pattern"]["bitmap"]
        pattern = self.sparsifyPattern(bitmap, self.encoder.n)
        (_, inferenceResult, _, _) = self.classifier.infer(pattern)
        return self.getWinningLabels(inferenceResult, seed)