def testWordFingerprint(self):
  """Check the Cortical.io term (word-level) encoding against stored data."""
  encoder = CioEncoder(fingerprintType=EncoderTypes.word)
  encoded = encoder.encode(self.text)

  self.assertFingerprintFields(encoded)

  # Reference encoding loaded through the getTestData helper.
  expected = getTestData("cio_encoding_word.json")
  self.assertEqual(
      expected["fingerprint"]["positions"],
      encoded["fingerprint"]["positions"],
      "Cio bitmap is not as expected.")
def testRetinaScaling(self):
  """Check that retinaScaling shrinks the retina and its fingerprints."""
  docType = EncoderTypes.document
  fullEncoder = CioEncoder(retinaScaling=1.0, fingerprintType=docType)
  halfEncoder = CioEncoder(retinaScaling=0.5, fingerprintType=docType)
  oddEncoder = CioEncoder(retinaScaling=0.71, fingerprintType=docType)

  # Scaled dimensions are expected to be the truncated scaled originals.
  self.assertAlmostEqual(int(0.5 * fullEncoder.width), halfEncoder.width)
  self.assertAlmostEqual(int(0.5 * fullEncoder.height), halfEncoder.height)
  self.assertAlmostEqual(int(0.71 * fullEncoder.height), oddEncoder.height)

  fullResponse = fullEncoder.encode(self.text)
  halfResponse = halfEncoder.encode(self.text)
  oddResponse = oddEncoder.encode(self.text)

  fullBits = fullResponse["fingerprint"]["positions"]
  halfBits = halfResponse["fingerprint"]["positions"]
  oddBits = oddResponse["fingerprint"]["positions"]

  # Each bit position should be scaled down by retinaScaling*retinaScaling
  self.assertLessEqual(halfBits.sum(), 0.5 * 0.5 * fullBits.sum())
  self.assertLessEqual(oddBits.sum(), 0.71 * 0.71 * fullBits.sum())

  # The number of on bits in scaled retina should normally be slightly less
  # than the original, but can be equal in some cases
  self.assertLessEqual(len(halfBits), len(fullBits))
  self.assertLessEqual(len(halfBits), len(oddBits))

  # Check that encodeIntoArray works even with weird scaling
  dense = numpy.zeros(oddEncoder.width * oddEncoder.height)
  oddEncoder.encodeIntoArray(self.text, dense)
  self.assertEqual(len(oddBits), dense.sum())
def testRetinaScaling(self):
  """Exercise CioEncoder with several retinaScaling factors."""

  def makeEncoder(scale):
    # All encoders in this test use document-level fingerprints.
    return CioEncoder(retinaScaling=scale,
                      fingerprintType=EncoderTypes.document)

  def onBits(encoding):
    return encoding["fingerprint"]["positions"]

  base = makeEncoder(1.0)
  half = makeEncoder(0.5)
  odd = makeEncoder(0.71)

  # Width/height of the scaled retinas versus the truncated scaled original.
  self.assertAlmostEqual(int(0.5 * base.width), half.width)
  self.assertAlmostEqual(int(0.5 * base.height), half.height)
  self.assertAlmostEqual(int(0.71 * base.height), odd.height)

  baseEncoding = base.encode(self.text)
  halfEncoding = half.encode(self.text)
  oddEncoding = odd.encode(self.text)

  # Each bit position should be scaled down by retinaScaling*retinaScaling
  self.assertLessEqual(
      onBits(halfEncoding).sum(),
      0.5 * 0.5 * onBits(baseEncoding).sum())
  self.assertLessEqual(
      onBits(oddEncoding).sum(),
      0.71 * 0.71 * onBits(baseEncoding).sum())

  # The number of on bits in scaled retina should normally be slightly less
  # than the original, but can be equal in some cases
  self.assertLessEqual(len(onBits(halfEncoding)), len(onBits(baseEncoding)))
  self.assertLessEqual(len(onBits(halfEncoding)), len(onBits(oddEncoding)))

  # Check that encodeIntoArray works even with weird scaling
  output = numpy.zeros(odd.width * odd.height)
  odd.encodeIntoArray(self.text, output)
  self.assertEqual(len(onBits(oddEncoding)), output.sum())
def testDocumentFingerprint(self):
  """Check the Cortical.io text (document-level) encoding against stored data."""
  encoder = CioEncoder(fingerprintType=EncoderTypes.document)
  encoded = encoder.encode(self.text)

  self.assertFingerprintFields(encoded)

  # Reference encoding loaded through the getTestData helper.
  reference = getTestData("cio_encoding_document.json")
  self.assertEqual(reference["fingerprint"]["positions"],
                   encoded["fingerprint"]["positions"],
                   "Cio bitmap is not as expected.")
def testRetinaScaling(self):
  """Check the 0.25-scaled retina encoding against stored data."""
  encoder = CioEncoder(retinaScaling=0.25,
                       fingerprintType=EncoderTypes.document)
  encoded = encoder.encode(self.text)

  # The scaled fingerprint must match the stored reference exactly.
  scaledReference = getTestData("cio_encoding_scaled_retina.json")
  self.assertEqual(
      scaledReference["fingerprint"]["positions"],
      encoded["fingerprint"]["positions"],
      "Cio bitmap is not as expected.")

  # The scaled fingerprint may not have more on bits than the stored
  # full-retina document fingerprint.
  fullReference = getTestData("cio_encoding_document.json")
  fullLength = len(fullReference["fingerprint"]["positions"])
  scaledLength = len(encoded["fingerprint"]["positions"])
  self.assertTrue(
      scaledLength <= fullLength,
      "Retina scaling did not decrease the fingerprint size.")
def testRetinaScaling(self):
  """Compare a retinaScaling=0.25 encoding with the stored fingerprints."""

  def onBits(encoding):
    return encoding["fingerprint"]["positions"]

  scaledEncoder = CioEncoder(
      retinaScaling=0.25, fingerprintType=EncoderTypes.document)
  actual = scaledEncoder.encode(self.text)

  # Exact match against the stored scaled-retina fingerprint.
  expected = getTestData("cio_encoding_scaled_retina.json")
  self.assertEqual(onBits(expected), onBits(actual),
                   "Cio bitmap is not as expected.")

  # Size check against the stored full-retina document fingerprint.
  unscaled = getTestData("cio_encoding_document.json")
  unscaledCount = len(onBits(unscaled))
  scaledCount = len(onBits(actual))
  self.assertTrue(scaledCount <= unscaledCount,
                  "Retina scaling did not decrease the fingerprint size.")
def testMaxSparsity(self):
  """Verify that CioEncoder honours its maxSparsity cap."""
  # This text seems to generate bitmaps with about 8% sparsity
  text = ("Smoking harms nearly every organ in your body. Over 7000 chemicals"
          " have been identified in tobacco smoke. After reading all this"
          " James and Sue decided to abruptly quit cigarette smoking to"
          " improve their health but it clearly was not an easy decision.")

  # Caps of 100%, 10%, 5%, and 1%, ordered from loosest to tightest; every
  # list below keeps this order.
  caps = (1.0, 0.1, 0.05, 0.01)
  encoders = [
      CioEncoder(maxSparsity=cap, fingerprintType=EncoderTypes.document)
      for cap in caps
  ]
  bitmapSize = encoders[0].width * encoders[0].height

  firstPass = [encoder.encode(text) for encoder in encoders]
  sizes = [len(result["fingerprint"]["positions"]) for result in firstPass]

  # Encodings must have no more than desired sparsity
  for result, cap in zip(firstPass, caps):
    self.assertLessEqual(result["sparsity"], cap)
  sizeLimits = (bitmapSize, 0.1 * bitmapSize, 0.05 * bitmapSize,
                0.01 * bitmapSize)
  for size, limit in zip(sizes, sizeLimits):
    self.assertLessEqual(size, limit)

  # Encodings can't be zero
  for size in sizes:
    self.assertGreater(size, 0)

  # Encodings must have complete overlap with the next higher encoding
  bitSets = [set(result["fingerprint"]["positions"]) for result in firstPass]
  for looser, tighter, tighterSize in zip(bitSets, bitSets[1:], sizes[1:]):
    self.assertEqual(len(looser & tighter), tighterSize)

  # Test that if you encode a second time, you get the same bitmap
  secondPass = [encoder.encode(text) for encoder in encoders]
  for before, after in zip(firstPass, secondPass):
    self.assertEqual(hashlib.sha224(str(before)).hexdigest(),
                     hashlib.sha224(str(after)).hexdigest())
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    @param verbosity        (int) Passed to the base class; the classifier is
                            created with verbosity - 1.
    @param numLabels        (int) Passed to the base class and used as k for
                            the kNN classifier.
    @param modelDir         (str) Passed to the base class.
    @param fingerprintType  EncoderTypes.document or EncoderTypes.word.
    @param unionSparsity, retinaScaling, retina, apiKey
                            Forwarded unchanged to CioEncoder; a valid API key
                            is needed (see CioEncoder init for details).
    @param classifierMetric (str) distanceMethod for the kNN classifier.
    @param cacheRoot        (str) Directory in which the "CioCache" folder
                            lives; defaults to the directory of this file.

    @raise ValueError if fingerprintType is neither EncoderTypes.document nor
                      EncoderTypes.word.
    """
    super(ClassificationModelFingerprint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity - 1)

    # A membership test is required here: the previous
    # "is (not A or not B)" form compared against a bool and never raised.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)

    if fpInfo:
      # The encoder result carries either a "text" or a "term" key.
      text = fpInfo["text"] if "text" in fpInfo else fpInfo["term"]
      return {"text": text,
              "sparsity": fpInfo["sparsity"],
              "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])}

    # Falsy encoder result: fall back to the encodeRandomly() helper.
    return {"text": sample,
            "sparsity": float(self.encoder.w) / self.encoder.n,
            "bitmap": self.encodeRandomly(
                sample, self.encoder.n, self.encoder.w)}


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.

    @param i      Index of the record in self.patterns.
    @return       (int) Number of labels the classifier was trained on; 0 when
                  the bitmap has no nonzero entries or the record has no
                  labels.
    """
    record = self.patterns[i]
    bitmap = record["pattern"]["bitmap"]
    if not bitmap.any():
      return 0

    # Count learn() calls explicitly instead of reusing the enumerate index,
    # so an empty label list yields 0.
    count = 0
    for label in record["labels"]:
      self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
      self.sampleReference.append(record["ID"])
      count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. The random seed is used in getWinningLabels().

    @param i      Index of the record in self.patterns.
    @param seed   (int) Passed through to getWinningLabels().
    @return       (numpy array) numLabels most-frequent classifications for the
                  data samples; int or empty.
    """
    pattern = self.sparsifyPattern(self.patterns[i]["pattern"]["bitmap"],
                                   self.encoder.n)
    (_, inferenceResult, _, _) = self.classifier.infer(pattern)
    return self.getWinningLabels(inferenceResult, seed)
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               k=1,
               classifierMetric="rawOverlap",
               cacheRoot=None,
               **kwargs):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    @param fingerprintType  EncoderTypes.document or EncoderTypes.word.
    @param unionSparsity, retinaScaling, retina, apiKey
                            Forwarded unchanged to CioEncoder; a valid API key
                            is needed (see CioEncoder constructor for details).
    @param k                (int) k for the kNN classifier.
    @param classifierMetric (str) distanceMethod for the kNN classifier.
    @param cacheRoot        Passed to CioEncoder as cacheDir.
    @param kwargs           Forwarded to the base class constructor.

    @raise ValueError if fingerprintType is neither EncoderTypes.document nor
                      EncoderTypes.word.
    """
    super(ClassificationModelFingerprint, self).__init__(**kwargs)

    self.classifier = KNNClassifier(k=k,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=self.verbosity - 1)

    # A membership test is required here: the previous
    # "is (not A or not B)" form compared against a bool and never raised.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey,
                              cacheDir=cacheRoot)

    # Tokens of the document currently being accumulated; None between
    # documents.
    self.currentDocument = None


  def _accumulateToken(self, token):
    """Append token to the current document, starting one if needed."""
    if self.currentDocument is None:
      self.currentDocument = [token]
    else:
      self.currentDocument.append(token)


  def trainToken(self, token, labels, sampleId, reset=0):
    """
    Train the model with the given text token, associated labels, and
    sampleId. Tokens are buffered until reset == 1, at which point the whole
    document is encoded and learned once per label.

    See base class for params and return type descriptions.
    """
    self._accumulateToken(token)

    if reset == 1:
      # all text accumulated, proceed w/ training on this document
      document = " ".join(self.currentDocument)
      bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

      if self.verbosity >= 2:
        # Single-argument print calls: same output on Python 2 and 3.
        print("CioFP model training with: '{}'".format(document))
        print("\tBitmap: {}".format(bitmap))

      for label in labels:
        self.classifier.learn(
            bitmap, label, isSparse=self.encoder.n, partitionId=sampleId)

      self.currentDocument = None


  def inferToken(self, token, reset=0, returnDetailedResults=False,
                 sortResults=True):
    """
    Classify the token (i.e. run inference on the model with this document) and
    return classification results and (optionally) a list of sampleIds and
    distances. Repeated sampleIds are NOT removed from the results.

    While reset == 0 the token is only buffered and a placeholder result
    (zeros of length numLabels, empty id list, empty distances) is returned.

    See base class for params and return type descriptions.
    """
    self._accumulateToken(token)

    if reset == 0:
      return numpy.zeros(self.numLabels), [], numpy.zeros(0)

    # With reset=1, all text accumulated, proceed w/ classifying this document
    document = " ".join(self.currentDocument)
    bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

    densePattern = self.encoder.densifyPattern(bitmap)

    (_, inferenceResult, dist, _) = self.classifier.infer(densePattern)

    if self.verbosity >= 2:
      # Single-argument print calls: same output on Python 2 and 3.
      print("CioFP model inference with: '{}'".format(document))
      print("\tBitmap: {}".format(bitmap))
      print("\tInference result= {}".format(inferenceResult))
      print("\tDistances= {}".format(dist))

    self.currentDocument = None

    # Figure out format of returned results
    if not returnDetailedResults:
      # Return non-detailed results.
      return inferenceResult, None, None

    if not sortResults:
      idList = [self.classifier.getPartitionId(i) for i in range(len(dist))]
      return inferenceResult, idList, dist

    # Return sorted results
    sortedIndices = dist.argsort()
    idList = [self.classifier.getPartitionId(i) for i in sortedIndices]
    sortedDistances = dist[sortedIndices]
    return inferenceResult, idList, sortedDistances


  def getEncoder(self):
    """
    Returns the encoder instance for the model.
    """
    return self.encoder


  def getClassifier(self):
    """
    Returns the classifier instance for the model.
    """
    return self.classifier
class ClassificationModelEndpoint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text endpoint encodings and classification system.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  # Metrics for which a larger value means "more similar"; euclideanDistance
  # and jaccardDistance are ascending.
  _DESCENDING_METRICS = ("overlappingAll", "overlappingLeftRight",
                         "overlappingRightLeft", "cosineSimilarity",
                         "weightedScoring")

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelEndpoint",
               unionSparsity=0.20,
               cacheRoot=None):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.

    @param verbosity, numLabels, modelDir   Passed to the base class.
    @param unionSparsity    Forwarded to CioEncoder.
    @param cacheRoot        (str) Directory in which the "CioCache" folder
                            lives; defaults to the directory of this file.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(cacheRoot, "CioCache"),
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    # Divide by 100.0 so an integer targetSparsity is not truncated to zero by
    # Python 2 integer division.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param sample         (list)          Tokenized sample, where each item is
                                          a string
    @return fp            (dict)          The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)

    if fpInfo:
      # The encoder result carries either a "text" or a "term" key.
      text = fpInfo["text"] if "text" in fpInfo else fpInfo["term"]
      return {"text": text,
              "sparsity": fpInfo["sparsity"],
              "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])}

    # Falsy encoder result: fall back to the encodeRandomly() helper.
    return {"text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample, self.n, self.w)}


  def resetModel(self):
    """Reset the model"""
    self.positives.clear()
    self.negatives.clear()
    self.categoryBitmaps.clear()


  def trainModel(self, i, negatives=None):
    # TODO: add batch training, where i is a list; note we should only add
    # negatives when training on one sample so we know which labels to use.
    """
    Train the classifier on the sample and labels for record i. Use
    Cortical.io's createClassification() to make a bitmap that represents the
    class. The list sampleReference is populated to correlate classifier
    prototypes to sample IDs.

    @param i          Index of the record in self.patterns.
    @param negatives  (list)    Each item is the dictionary containing text,
                                sparsity and bitmap for the negative samples.
    """
    record = self.patterns[i]
    text = record["pattern"]["text"]
    labelsToUpdateBitmaps = set()

    for label in record["labels"]:
      # Only samples with text and a non-empty bitmap contribute.
      if text and record["pattern"]["bitmap"].any():
        self.positives[label].append(text)
        if negatives:
          for neg in negatives:
            if neg["text"]:
              self.negatives[label].append(neg["text"])
        labelsToUpdateBitmaps.add(label)

    # Rebuild the category bitmap of every label this sample touched.
    for label in labelsToUpdateBitmaps:
      self.categoryBitmaps[label] = self.encoder.createCategory(
          str(label), self.positives[label], self.negatives[label])["positions"]
      self.sampleReference.append(i)


  def testModel(self, i, _, metric="overlappingAll"):
    """
    Test on record i. The Cortical.io classifier returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param i        Index of the record in self.patterns.
    @param _        Unused.
    @param metric   (str)           Distance metric use by classifier.
    @return         (numpy array)   numLabels most-frequent classifications
                                    for the data samples; int or empty.
    """
    sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

    distances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.items():
      distances[cat] = self.compareEncoder.compare(sampleBitmap, catBitmap)

    return self.getWinningLabels(distances, metric=metric)


  def getWinningLabels(self, distances, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.

    @param distances  (dict) Maps category to a dict of metric values.
    @param metric     (str)  Key of the metric to rank by.
    @return           (numpy array) Up to numLabels categories, best first.
    """
    # Take the keys once and look values up by key, rather than relying on
    # keys() and values() staying index-aligned.
    categories = list(distances.keys())
    metricValues = numpy.array([distances[cat][metric] for cat in categories])
    sortedIdx = numpy.argsort(metricValues)

    if metric in self._DESCENDING_METRICS:
      sortedIdx = sortedIdx[::-1]

    return numpy.array(
        [categories[catIdx] for catIdx in sortedIdx[:self.numLabels]])


  def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
    """
    Return a dict where keys are categories and values are dicts of distances.

    @param sort      (bool)        Sort the inner dicts with compareCategories()
    @param save      (str)         Dump catDistances to a JSON in this dir.
    @param labelRefs               Passed to writeOutCategories() when saving.
    @return          (defaultdict)

    E.g. w/ categories 0 and 1 and sort=False:
      catDistances = {
          0: {
              0: {"cosineSimilarity": 1.0, ...},
              1: {"cosineSimilarity": 0.33, ...}
          },
          1: {
              0: {"cosineSimilarity": 0.33, ...},
              1: {"cosineSimilarity": 1.0, ...}
          }
      }
    Note the inner-dicts of catDistances are OrderedDict objects. With
    sort=True the result is that of compareCategories(), whose inner values
    are the single metric values it ranks by.
    """
    catDistances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.items():
      catDistances[cat] = OrderedDict()
      for compareCat, compareBitmap in self.categoryBitmaps.items():
        # List is in order of self.categoryBitmaps.keys()
        catDistances[cat][compareCat] = self.compareEncoder.compare(
            catBitmap, compareBitmap)

    if sort:
      # Order each inner dict of catDistances such that the ranking is most to
      # least similar.
      catDistances = self.compareCategories(catDistances)

    if save is not None:
      self.writeOutCategories(
          save, comparisons=catDistances, labelRefs=labelRefs)

    return catDistances


  @staticmethod
  def compareCategories(catDistances, metric="overlappingAll"):
    """
    Calculate category distances. Returns a defaultdict of category keys, where
    values are OrderedDicts sorted such that the most similar categories
    (according to the input metric) are listed first.

    @param catDistances   (dict) Maps category to a dict that maps each
                                 compared category to its dict of metrics.
    @param metric         (str)  Key of the metric to rank by.
    """
    # Similarity metrics rank largest-first; distance metrics smallest-first.
    reverse = metric in ClassificationModelEndpoint._DESCENDING_METRICS

    categoryComparisons = defaultdict(list)
    for cat, comparisons in catDistances.items():
      # Reduce each comparison to the single metric value being ranked.
      metricDict = {compareCat: distances[metric]
                    for compareCat, distances in comparisons.items()}
      categoryComparisons[cat] = OrderedDict(
          sorted(metricDict.items(), key=lambda item: item[1],
                 reverse=reverse))

    return categoryComparisons


  @staticmethod
  def query():
    """Unsupported for this model; only prints a notice."""
    print("The Classification Endpoint model doesn't support this method.")


  @staticmethod
  def infer():
    """Unsupported for this model; only prints a notice."""
    print("The Classification Endpoint model doesn't support this method.")
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelFingerprint",
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               classifierMetric="rawOverlap",
               cacheRoot=None):
    """
    Create the kNN classifier and the Cortical.io encoder for this model.

    @param verbosity        (int) Passed to the base class; the classifier is
                            created with verbosity - 1.
    @param numLabels        (int) Passed to the base class and used as k for
                            the kNN classifier.
    @param modelDir         (str) Passed to the base class.
    @param fingerprintType  EncoderTypes.document or EncoderTypes.word.
    @param unionSparsity, retinaScaling, retina, apiKey
                            Forwarded unchanged to CioEncoder; a valid API key
                            is needed (see CioEncoder init for details).
    @param classifierMetric (str) distanceMethod for the kNN classifier.
    @param cacheRoot        (str) Directory in which the "CioCache" folder
                            lives; defaults to the directory of this file.

    @raise ValueError if fingerprintType is neither EncoderTypes.document nor
                      EncoderTypes.word.
    """
    super(ClassificationModelFingerprint, self).__init__(verbosity=verbosity,
                                                         numLabels=numLabels,
                                                         modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=verbosity - 1)

    # The earlier "is (not A or not B)" test compared the type against a bool
    # and therefore never rejected anything; check membership instead.
    supportedTypes = (EncoderTypes.document, EncoderTypes.word)
    if fingerprintType not in supportedTypes:
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              cacheDir=os.path.join(cacheRoot, "CioCache"),
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API. If the
    client returns None, we create a random SDR with the model's dimensions n
    and w.

    @param sample     (list)        Tokenized sample, where each item is a str.
    @return fp        (dict)        The sample text, sparsity, and bitmap.
    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)

    if not fpInfo:
      # Falsy encoder result: fall back to the encodeRandomly() helper.
      return {
          "text": sample,
          "sparsity": float(self.encoder.w) / self.encoder.n,
          "bitmap": self.encodeRandomly(sample, self.encoder.n,
                                        self.encoder.w)
      }

    return {
        # The encoder result carries either a "text" or a "term" key.
        "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
        "sparsity": fpInfo["sparsity"],
        "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
    }


  def trainModel(self, i):
    # TODO: add batch training, where i is a list
    """
    Train the classifier on the sample and labels for record i. The list
    sampleReference is populated to correlate classifier prototypes to sample
    IDs.

    @param i      Index of the record in self.patterns.
    @return       (int) Number of labels the classifier was trained on; 0 when
                  the bitmap has no nonzero entries or the record has no
                  labels.
    """
    record = self.patterns[i]
    bitmap = record["pattern"]["bitmap"]

    # Tally learn() calls with a dedicated counter; reusing the enumerate
    # index made the result ambiguous for an empty label list.
    count = 0
    if bitmap.any():
      for label in record["labels"]:
        self.classifier.learn(bitmap, label, isSparse=self.encoder.n)
        self.sampleReference.append(record["ID"])
        count += 1

    return count


  def testModel(self, i, seed=42):
    """
    Test the model on record i. The random seed is used in getWinningLabels().

    @param i      Index of the record in self.patterns.
    @param seed   (int) Passed through to getWinningLabels().
    @return       (numpy array) numLabels most-frequent classifications for the
                  data samples; int or empty.
    """
    (_, inferenceResult, _, _) = self.classifier.infer(
        self.sparsifyPattern(self.patterns[i]["pattern"]["bitmap"],
                             self.encoder.n))
    return self.getWinningLabels(inferenceResult, seed)
class ClassificationModelFingerprint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  fingerprint encodings.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self,
               fingerprintType=EncoderTypes.word,
               unionSparsity=0.20,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               k=1,
               classifierMetric="rawOverlap",
               cacheRoot=None,
               **kwargs):
    """
    Create the kNN classifier and the Cortical.io encoder for this model.

    @param fingerprintType  EncoderTypes.document or EncoderTypes.word.
    @param unionSparsity, retinaScaling, retina, apiKey
                            Forwarded unchanged to CioEncoder; a valid API key
                            is needed (see CioEncoder constructor for details).
    @param k                (int) k for the kNN classifier.
    @param classifierMetric (str) distanceMethod for the kNN classifier.
    @param cacheRoot        Passed to CioEncoder as cacheDir.
    @param kwargs           Forwarded to the base class constructor.

    @raise ValueError if fingerprintType is neither EncoderTypes.document nor
                      EncoderTypes.word.
    """
    super(ClassificationModelFingerprint, self).__init__(**kwargs)

    self.classifier = KNNClassifier(k=k,
                                    distanceMethod=classifierMetric,
                                    exact=False,
                                    verbosity=self.verbosity - 1)

    # The earlier "is (not A or not B)" test compared the type against a bool
    # and therefore never rejected anything; check membership instead.
    supportedTypes = (EncoderTypes.document, EncoderTypes.word)
    if fingerprintType not in supportedTypes:
      raise ValueError("Invalid type of fingerprint encoding; see the "
                       "EncoderTypes class for eligible types.")

    self.encoder = CioEncoder(retinaScaling=retinaScaling,
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity,
                              retina=retina,
                              apiKey=apiKey,
                              cacheDir=cacheRoot)

    # Buffer of tokens for the document in progress; None between documents.
    self.currentDocument = None


  def _bufferToken(self, token):
    """Add token to the document in progress, opening a new one if needed."""
    if self.currentDocument is None:
      self.currentDocument = [token]
    else:
      self.currentDocument.append(token)


  def trainToken(self, token, labels, sampleId, reset=0):
    """
    Train the model with the given text token, associated labels, and
    sampleId. Tokens are buffered until reset == 1, at which point the whole
    document is encoded and learned once per label.

    See base class for params and return type descriptions.
    """
    self._bufferToken(token)

    if reset != 1:
      return

    # all text accumulated, proceed w/ training on this document
    document = " ".join(self.currentDocument)
    bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

    if self.verbosity >= 2:
      # Single-argument print calls: same output on Python 2 and 3.
      print("CioFP model training with: '{}'".format(document))
      print("\tBitmap: {}".format(bitmap))

    for label in labels:
      self.classifier.learn(bitmap,
                            label,
                            isSparse=self.encoder.n,
                            partitionId=sampleId)

    self.currentDocument = None


  def inferToken(self,
                 token,
                 reset=0,
                 returnDetailedResults=False,
                 sortResults=True):
    """
    Classify the token (i.e. run inference on the model with this document) and
    return classification results and (optionally) a list of sampleIds and
    distances. Repeated sampleIds are NOT removed from the results.

    While reset == 0 the token is only buffered and a placeholder result
    (zeros of length numLabels, empty id list, empty distances) is returned.

    See base class for params and return type descriptions.
    """
    self._bufferToken(token)

    if reset == 0:
      return numpy.zeros(self.numLabels), [], numpy.zeros(0)

    # With reset=1, all text accumulated, proceed w/ classifying this document
    document = " ".join(self.currentDocument)
    bitmap = self.encoder.encode(document)["fingerprint"]["positions"]

    densePattern = self.encoder.densifyPattern(bitmap)

    (_, inferenceResult, dist, _) = self.classifier.infer(densePattern)

    if self.verbosity >= 2:
      # Single-argument print calls: same output on Python 2 and 3.
      print("CioFP model inference with: '{}'".format(document))
      print("\tBitmap: {}".format(bitmap))
      print("\tInference result= {}".format(inferenceResult))
      print("\tDistances= {}".format(dist))

    self.currentDocument = None

    # Figure out format of returned results
    if not returnDetailedResults:
      # Return non-detailed results.
      return inferenceResult, None, None

    if not sortResults:
      idList = [
          self.classifier.getPartitionId(i) for i in range(len(dist))
      ]
      return inferenceResult, idList, dist

    # Return sorted results
    sortedIndices = dist.argsort()
    idList = [self.classifier.getPartitionId(i) for i in sortedIndices]
    sortedDistances = dist[sortedIndices]
    return inferenceResult, idList, sortedDistances


  def getEncoder(self):
    """
    Returns the encoder instance for the model.
    """
    return self.encoder


  def getClassifier(self):
    """
    Returns the classifier instance for the model.
    """
    return self.classifier
class ClassificationModelContext(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io
  text context: the context fingerprints of each sample are combined and then
  intersected (AND) with the bitmap stored for each of its labels.

  From the experiment runner, the methods expect to be fed one sample at a time.
  """

  def __init__(self, verbosity=1, numLabels=1):
    """
    Initialize the CorticalClient and CioEncoder. Requires a valid API key.

    @param verbosity  (int) Passed to the base class.
    @param numLabels  (int) Number of winning categories testModel() returns.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    root = os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"))
    self.client = CorticalClient(self.encoder.apiKey)

    self.n = self.encoder.n
    # Divide by 100.0 so an integer targetSparsity is not truncated to zero by
    # Python 2 integer division.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels


  def encodePattern(self, pattern):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param pattern     (list)           Tokenized sample, where each item is a
                                        string
    @return            (dictionary)     Dictionary, containing text, sparsity,
                                        and bitmap; sparsity is always 0.0.
    Example return dict:
    {
      "text": "Example text",
      "sparsity": 0.0,
      "bitmap": numpy.zeros(0)
    }
    """
    text = " ".join(pattern)
    return {"text": text, "sparsity": 0.0, "bitmap": self._encodeText(text)}


  def _encodeText(self, text):
    """
    Return the integer bitmap for text, falling back to encodeRandomly() when
    the encoder yields nothing.
    """
    fpInfo = self.encoder.encode(text)

    if fpInfo:
      # Report sparsity only when there is a result to read it from; the
      # lookup used to run before this check and failed on an empty result.
      if self.verbosity > 1:
        print("Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"]))
      bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
    else:
      bitmap = self.encodeRandomly(text, self.n, self.w)

    return bitmap.astype(int)


  def resetModel(self):
    """Reset the model"""
    self.categoryBitmaps.clear()


  def trainModel(self, samples, labels):
    """
    Train the classifier on the input sample and label. Use Cortical.io's
    keyword extraction to get the most relevant terms then get the intersection
    of those bitmaps

    @param samples     (dictionary)      Dictionary, containing text, sparsity,
                                         and bitmap
    @param labels      (int)             Reference index for the classification
                                         of this sample.
    """
    for sample, sample_labels in zip(samples, labels):
      bitmaps = [sample["bitmap"].tolist()]
      context = self.client.getContextFromText(bitmaps, maxResults=5,
                                               getFingerprint=True)

      if len(context) == 0:
        continue

      # Union of the fingerprints of all returned contexts.
      union = numpy.zeros(0)
      for c in context:
        bitmap = c["fingerprint"]["positions"]
        union = numpy.union1d(bitmap, union).astype(int)

      for label in sample_labels:
        # Haven't seen the label before
        if label not in self.categoryBitmaps:
          self.categoryBitmaps[label] = union

        intersection = numpy.intersect1d(union, self.categoryBitmaps[label])
        if intersection.size == 0:
          # Don't want to lose all the old information
          union = numpy.union1d(union, self.categoryBitmaps[label]).astype(int)
          # Need to sample to stay sparse
          count = len(union)
          sampleIndices = random.sample(range(count), min(count, self.w))
          intersection = numpy.sort(union[sampleIndices])

        self.categoryBitmaps[label] = intersection


  def testModel(self, sample):
    """
    Test the intersection bitmap on the input sample. The client returns, per
    class, a dictionary of distance metrics between the sample and the class:
      {
        0: {
          "cosineSimilarity": 0.6666666666666666,
          "euclideanDistance": 0.3333333333333333,
          "jaccardDistance": 0.5,
          "overlappingAll": 6,
          "overlappingLeftRight": 0.6666666666666666,
          "overlappingRightLeft": 0.6666666666666666,
          "sizeLeft": 9,
          "sizeRight": 9,
          "weightedScoring": 0.4436476984102028
        }
      }
    These are ranked by "overlappingAll".

    @param sample     (dictionary)      Dictionary, containing text, sparsity,
                                        and bitmap
    @return           (list)            Up to numLabels winning categories,
                                        best first (see winningLabels()).
    """
    sampleBitmap = sample["bitmap"].tolist()

    distances = {}
    for cat, catBitmap in self.categoryBitmaps.items():
      distances[cat] = self.client.compare(sampleBitmap, catBitmap.tolist())

    return self.winningLabels(distances, numberCats=self.numLabels,
                              metric="overlappingAll")


  @staticmethod
  def winningLabels(distances, numberCats, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.

    @param distances   (dict) Maps category to a dict of metric values.
    @param numberCats  (int)  Maximum number of categories to return.
    @param metric      (str)  Key of the metric to rank by.
    @return            (list) Up to numberCats categories, best first.
    """
    # Take the keys once and look values up by key, rather than relying on
    # keys() and values() staying index-aligned.
    categories = list(distances.keys())
    metricValues = numpy.array([distances[cat][metric] for cat in categories])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = set(["overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring"])
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return [categories[catIdx] for catIdx in sortedIdx[:numberCats]]
def testMaxSparsity(self):
  """Exercise CioEncoder's maxSparsity limit at four settings."""

  def onBits(encoding):
    return encoding["fingerprint"]["positions"]

  def digest(encoding):
    return hashlib.sha224(str(encoding)).hexdigest()

  # This text seems to generate bitmaps with about 8% sparsity
  text = ("Smoking harms nearly every organ in your body. Over 7000 chemicals"
          " have been identified in tobacco smoke. After reading all this"
          " James and Sue decided to abruptly quit cigarette smoking to"
          " improve their health but it clearly was not an easy decision.")

  # Encoders with maxSparsity of 100%, 10%, 5%, and 1%
  limits = (1.0, 0.1, 0.05, 0.01)
  encoders = []
  for limit in limits:
    encoders.append(
        CioEncoder(maxSparsity=limit, fingerprintType=EncoderTypes.document))
  loosest = encoders[0]
  bitmapSize = loosest.width * loosest.height

  encodings = [encoder.encode(text) for encoder in encoders]
  counts = [len(onBits(encoding)) for encoding in encodings]

  # Encodings must have no more than desired sparsity
  for index, limit in enumerate(limits):
    self.assertLessEqual(encodings[index]["sparsity"], limit)
  self.assertLessEqual(counts[0], bitmapSize)
  for index in (1, 2, 3):
    self.assertLessEqual(counts[index], limits[index] * bitmapSize)

  # Encodings can't be zero
  for count in counts:
    self.assertGreater(count, 0)

  # Encodings must have complete overlap with the next higher encoding
  bitSets = [set(onBits(encoding)) for encoding in encodings]
  for index in (1, 2, 3):
    shared = bitSets[index - 1] & bitSets[index]
    self.assertEqual(len(shared), counts[index])

  # Test that if you encode a second time, you get the same bitmap
  repeats = [encoder.encode(text) for encoder in encoders]
  for first, second in zip(encodings, repeats):
    self.assertEqual(digest(first), digest(second))