def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelFingerprint",
             fingerprintType=EncoderTypes.word,
             unionSparsity=20.0):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    The encoder needs a valid Cortical.io API key (see the CioEncoder init
    for details).

    @param verbosity       (int)   Verbosity level; the classifier is created
                                   with verbosity - 1.
    @param numLabels       (int)   Forwarded to the base class and used as k
                                   for the kNN classifier.
    @param modelDir        (str)   Forwarded to the base class.
    @param fingerprintType (EncoderTypes) EncoderTypes.document or
                                   EncoderTypes.word.
    @param unionSparsity   (float) Forwarded to CioEncoder.

    @raise ValueError when fingerprintType is neither document nor word.
    """
    # Validate first so that nothing is constructed for a bad argument. This
    # must be a membership test: comparing against `(not A or not B)` with
    # `is` only compares the argument to a boolean.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
        raise ValueError("Invaid type of fingerprint encoding; see the "
                         "EncoderTypes class for eligble types.")

    super(ClassificationModelFingerprint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod='rawOverlap',
                                    exact=False,
                                    verbosity=verbosity - 1)

    self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity)

    self.n = self.encoder.n
    # targetSparsity is divided by 100 (treated as a percentage); 100.0 keeps
    # the division floating-point under Python 2 even for an int sparsity.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)
def testWordFingerprint(self):
    """Check the Cortical.io term (word-level) encoding against stored data."""
    encoder = CioEncoder(fingerprintType=EncoderTypes.word)
    encoding = encoder.encode(self.text)

    self.assertFingerprintFields(encoding)

    # Compare the returned bit positions with the reference encoding on disk.
    reference = getTestData("cio_encoding_word.json")
    expectedPositions = reference["fingerprint"]["positions"]
    actualPositions = encoding["fingerprint"]["positions"]
    self.assertEqual(expectedPositions, actualPositions,
                     "Cio bitmap is not as expected.")
def __init__(self, verbosity=1, numLabels=1):
    """
    Create the CioEncoder and a CorticalClient built from the encoder's API
    key. Requires a valid API key.

    @param verbosity (int) Forwarded to the base class.
    @param numLabels (int) Stored on the instance as self.numLabels.
    """
    super(ClassificationModelContext, self).__init__(verbosity)

    cio = CioEncoder(cacheDir="./experiments/cache")
    self.encoder = cio
    self.client = CorticalClient(cio.apiKey)

    # n comes from the encoder; w is derived from the encoder's
    # targetSparsity, which is divided by 100 here.
    self.n = cio.n
    sparsity = cio.targetSparsity / 100
    self.w = int(sparsity * self.n)

    self.categoryBitmaps = {}
    self.numLabels = numLabels
def __init__(self, verbosity=1, numLabels=3):
    """
    Set up a CioEncoder for encoding and a LanguageEncoder for comparisons.
    The CioEncoder requires a valid API key.

    @param verbosity (int) Forwarded to the base class.
    @param numLabels (int) Forwarded to the base class.
    """
    super(ClassificationModelEndpoint, self).__init__(verbosity, numLabels)

    cio = CioEncoder(cacheDir="./experiments/cache")
    self.encoder = cio
    self.compareEncoder = LanguageEncoder()

    # n comes from the encoder; w is derived from the encoder's
    # targetSparsity, which is divided by 100 here.
    self.n = cio.n
    sparsity = cio.targetSparsity / 100
    self.w = int(sparsity * self.n)

    # Per-category state, all starting out empty.
    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelFingerprint",
             fingerprintType=EncoderTypes.word,
             unionSparsity=20.0):
    """
    Set up the kNN classifier and the Cortical.io encoder.

    The encoder needs a valid Cortical.io API key (see the CioEncoder init
    for details).

    @param verbosity       (int)   Verbosity level; the classifier is created
                                   with verbosity - 1.
    @param numLabels       (int)   Forwarded to the base class and used as k
                                   for the kNN classifier.
    @param modelDir        (str)   Forwarded to the base class.
    @param fingerprintType (EncoderTypes) EncoderTypes.document or
                                   EncoderTypes.word.
    @param unionSparsity   (float) Forwarded to CioEncoder.

    @raise ValueError when fingerprintType is neither document nor word.
    """
    # Validate first so that nothing is constructed for a bad argument. This
    # must be a membership test: comparing against `(not A or not B)` with
    # `is` only compares the argument to a boolean.
    if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
        raise ValueError("Invaid type of fingerprint encoding; see the "
                         "EncoderTypes class for eligble types.")

    super(ClassificationModelFingerprint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.classifier = KNNClassifier(k=numLabels,
                                    distanceMethod='rawOverlap',
                                    exact=False,
                                    verbosity=verbosity - 1)

    self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                              fingerprintType=fingerprintType,
                              unionSparsity=unionSparsity)

    self.n = self.encoder.n
    # targetSparsity is divided by 100 (treated as a percentage); 100.0 keeps
    # the division floating-point under Python 2 even for an int sparsity.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)
def __init__(self,
             inputFilePath,
             verbosity=1,
             numLabels=3,
             spTrainingSize=0,
             tmTrainingSize=0,
             clsTrainingSize=0,
             classifierType="KNN"):
    """
    @param inputFilePath    (str)  Path to data formatted for network API
    @param spTrainingSize   (int)  Number of samples the network has to be
                                   trained on before training the spatial
                                   pooler
    @param tmTrainingSize   (int)  Number of samples the network has to be
                                   trained on before training the temporal
                                   memory
    @param clsTrainingSize  (int)  Number of samples the network has to be
                                   trained on before training the classifier
    @param classifierType   (str)  Either "KNN" or "CLA"

    See ClassificationModel for remaining parameters
    """
    # The training sizes are stored before the base-class initializer runs.
    self.clsTrainingSize = clsTrainingSize
    self.tmTrainingSize = tmTrainingSize
    self.spTrainingSize = spTrainingSize

    super(ClassificationModelHTM, self).__init__(
        verbosity=verbosity, numLabels=numLabels)

    # Everything the network needs, followed by the network itself.
    self.classifierType = classifierType
    stream = FileRecordStream(streamID=inputFilePath)
    self.recordStream = stream
    cio = CioEncoder(cacheDir="./experiments/cache")
    self.encoder = cio

    self._initModel()
def initModel(self):
    """
    Build and return the network configured from self.networkConfig.

    self.networkDataPath must already be set; it is used as the stream ID of
    the record stream handed to configureNetwork().
    """
    stream = FileRecordStream(streamID=self.networkDataPath)
    cio = CioEncoder(cacheDir="./experiments/cache")
    network = configureNetwork(stream, self.networkConfig, cio)
    return network
def __init__(self, verbosity=1):
    """
    Create a k=1 kNN classifier and a Cortical.io encoder. The encoder needs a
    valid API key (see CioEncoder init for details).

    @param verbosity (int) Forwarded to the base class; the classifier gets
                           verbosity - 1.
    """
    super(ClassificationModelFingerprint, self).__init__(verbosity)

    classifierVerbosity = verbosity-1
    self.classifier = KNNClassifier(
        k=1, exact=False, verbosity=classifierVerbosity)

    cio = CioEncoder(cacheDir="./experiments/cache")
    self.encoder = cio

    # n comes from the encoder; w is derived from the encoder's
    # targetSparsity, which is divided by 100 here.
    self.n = cio.n
    sparsity = cio.targetSparsity/100
    self.w = int(sparsity*self.n)
def __init__(self,
             verbosity=1,
             numLabels=3,
             modelDir="ClassificationModelEndpoint"):
    """
    Set up a CioEncoder for encoding and a LanguageEncoder for comparisons.
    The CioEncoder requires a valid API key.

    @param verbosity (int) Forwarded to the base class.
    @param numLabels (int) Forwarded to the base class.
    @param modelDir  (str) Forwarded to the base class.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    cio = CioEncoder(cacheDir="./experiments/cache")
    self.encoder = cio
    self.compareEncoder = LanguageEncoder()

    # n comes from the encoder; w is derived from the encoder's
    # targetSparsity, which is divided by 100 here.
    self.n = cio.n
    sparsity = cio.targetSparsity/100
    self.w = int(sparsity * self.n)

    # Per-category state, all starting out empty.
    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
class ClassificationModelFingerprint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    fingerprint encodings.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=20.0):
        """
        Set up the kNN classifier and the Cortical.io encoder; the encoder
        needs a valid API key (see CioEncoder init for details).

        @param verbosity       (int)   Verbosity level; the classifier is
                                       created with verbosity - 1.
        @param numLabels       (int)   Forwarded to the base class and used
                                       as k for the kNN classifier.
        @param modelDir        (str)   Forwarded to the base class.
        @param fingerprintType (EncoderTypes) EncoderTypes.document or
                                       EncoderTypes.word.
        @param unionSparsity   (float) Forwarded to CioEncoder.

        @raise ValueError when fingerprintType is neither document nor word.
        """
        # Validate first so nothing is constructed for a bad argument. This
        # must be a membership test: comparing against `(not A or not B)`
        # with `is` only compares the argument to a boolean.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invaid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligble types.")

        super(ClassificationModelFingerprint, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod='rawOverlap',
                                        exact=False,
                                        verbosity=verbosity - 1)

        self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity)

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API. If
        the client returns None, we create a random SDR with the model's
        dimensions n and w.

        @param sample     (list)    Tokenized sample, where each item is a str.
        @return fp        (dict)    The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            # The encoder response carries its text under "text" or "term".
            return {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
            }

        # No fingerprint came back: fall back to a random encoding.
        return {
            "text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample),
        }

    def trainModel(self, i):
        # TODO: add batch training, where i is a list
        """
        Train the classifier on the sample and labels for record i.

        The list sampleReference is populated to correlate classifier
        prototypes to sample IDs: one ID is appended per learned
        (bitmap, label) pair.

        @param i    Key/index of the record in self.patterns.
        """
        record = self.patterns[i]
        bitmap = record["pattern"]["bitmap"]
        # Bitmaps with no non-zero entry are not learned.
        if bitmap.any():
            for label in record["labels"]:
                self.classifier.learn(bitmap, label, isSparse=self.n)
                self.sampleReference.append(record["ID"])

    def testModel(self, i, numLabels=3):
        """
        Test the model on record i.

        @param i            Key/index of the record in self.patterns.
        @param numLabels    (int)           Number of classification
                                            predictions.
        @return             (numpy array)   numLabels most-frequent
                                            classifications for the data
                                            samples; int or empty.
        """
        pattern = self.sparsifyPattern(
            self.patterns[i]["pattern"]["bitmap"], self.n)
        # Only the second element of the classifier's 4-tuple is used.
        (_, inferenceResult, _, _) = self.classifier.infer(pattern)
        return self.getWinningLabels(inferenceResult, numLabels)
class ClassificationModelFingerprint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    fingerprint encodings.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 fingerprintType=EncoderTypes.document):
        """
        Set up the kNN classifier and the Cortical.io encoder; the encoder
        needs a valid API key (see CioEncoder init for details).

        @param verbosity       (int)  Verbosity level; the classifier is
                                      created with verbosity - 1.
        @param numLabels       (int)  Forwarded to the base class and used as
                                      k for the kNN classifier.
        @param fingerprintType (EncoderTypes) EncoderTypes.document or
                                      EncoderTypes.word.

        @raise ValueError when fingerprintType is neither document nor word.
        """
        # Validate first so nothing is constructed for a bad argument. This
        # must be a membership test: comparing against `(not A or not B)`
        # with `is` only compares the argument to a boolean.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invaid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligble types.")

        super(ClassificationModelFingerprint, self).__init__(
            verbosity, numLabels)

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod='rawOverlap',
                                        exact=False,
                                        verbosity=verbosity - 1)

        self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                                  fingerprintType=fingerprintType)

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    def encodePattern(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API. If
        the client returns None, we create a random SDR with the model's
        dimensions n and w.

        @param sample     (list)    Tokenized sample, where each item is a
                                    string token.
        @return fp        (dict)    The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            # The encoder response carries its text under "text" or "term".
            return {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
            }

        # No fingerprint came back: fall back to a random encoding.
        return {
            "text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample),
        }

    def resetModel(self):
        """Reset the model by clearing the classifier."""
        self.classifier.clear()

    def trainModel(self, samples, labels):
        """
        Train the classifier on the input samples and labels.

        @param samples    (list)    List of dictionaries containing the
                                    sample text, sparsity, and bitmap.
        @param labels     (list)    List of numpy arrays containing the
                                    reference indices for the classifications
                                    of each sample.
        """
        for sample, sample_labels in zip(samples, labels):
            bitmap = sample["bitmap"]
            # Bitmaps with no non-zero entry are not learned.
            if not bitmap.any():
                continue
            for label in sample_labels:
                self.classifier.learn(bitmap, label, isSparse=self.n)

    def testModel(self, sample, numLabels=3):
        """
        Test the kNN classifier on the input sample.

        @param sample     (dict)          The sample text, sparsity, and
                                          bitmap.
        @param numLabels  (int)           Number of predicted classifications.
        @return           (numpy array)   The numLabels most-frequent
                                          classifications for the data
                                          samples; values are int or empty.
        """
        pattern = self._densifyPattern(sample["bitmap"])
        # Only the second element of the classifier's 4-tuple is used.
        (_, inferenceResult, _, _) = self.classifier.infer(pattern)
        return self.getWinningLabels(inferenceResult, numLabels)
class ClassificationModelContext(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    text context, then AND the context.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self, verbosity=1, numLabels=1):
        """
        Initialize the CorticalClient and CioEncoder. Requires a valid API key.

        @param verbosity (int) Forwarded to the base class.
        @param numLabels (int) Number of categories returned by testModel().
        """
        super(ClassificationModelContext, self).__init__(verbosity)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.client = CorticalClient(self.encoder.apiKey)

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

        self.categoryBitmaps = {}
        self.numLabels = numLabels

    def encodePattern(self, pattern):
        """
        Encode an SDR of the input string by querying the Cortical.io API.

        @param pattern     (list)           Tokenized sample, where each item
                                            is a string
        @return            (dictionary)     Dictionary, containing text,
                                            sparsity, and bitmap
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.0,
            "bitmap": numpy.zeros(0)
          }
        """
        text = " ".join(pattern)
        # The sparsity entry is always reported as 0.0 by this model.
        return {"text": text,
                "sparsity": 0.0,
                "bitmap": self._encodeText(text)}

    def _encodeText(self, text):
        """
        Return the bitmap for text as an int array.

        Uses the fingerprint positions from the encoder, or
        self.encodeRandomly(text) when the encoder returns nothing.
        """
        fpInfo = self.encoder.encode(text)
        if fpInfo:
            # Report sparsity only once we know a fingerprint exists;
            # indexing a None result would raise.
            if self.verbosity > 1:
                print("Fingerprint sparsity = {0}%.".format(
                    fpInfo["sparsity"]))
            bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
        else:
            bitmap = self.encodeRandomly(text)

        return bitmap.astype(int)

    def resetModel(self):
        """Reset the model by forgetting all category bitmaps."""
        self.categoryBitmaps.clear()

    def trainModel(self, samples, labels):
        """
        Train the classifier on the input samples and labels. Use
        Cortical.io's context lookup to get related fingerprints, take their
        union, and intersect it with the bitmap stored for each label.

        @param samples     (list)   Dictionaries containing text, sparsity,
                                    and bitmap, one per sample
        @param labels      (list)   For each sample, an iterable of reference
                                    indices for its classifications
        """
        for sample, sample_labels in zip(samples, labels):
            bitmaps = [sample["bitmap"].tolist()]
            context = self.client.getContextFromText(
                bitmaps, maxResults=5, getFingerprint=True)
            if len(context) == 0:
                # Nothing to learn from without a context.
                continue

            union = numpy.zeros(0)
            for c in context:
                bitmap = c["fingerprint"]["positions"]
                union = numpy.union1d(bitmap, union).astype(int)

            for label in sample_labels:
                # Haven't seen the label before
                if label not in self.categoryBitmaps:
                    self.categoryBitmaps[label] = union

                intersection = numpy.intersect1d(
                    union, self.categoryBitmaps[label])
                if intersection.size == 0:
                    # Don't want to lose all the old information.
                    # NOTE(review): this widened union is also what the
                    # remaining labels of this sample see -- confirm that is
                    # intended.
                    union = numpy.union1d(
                        union, self.categoryBitmaps[label]).astype(int)

                    # Need to sample to stay sparse
                    count = len(union)
                    sampleIndices = random.sample(
                        xrange(count), min(count, self.w))
                    intersection = numpy.sort(union[sampleIndices])

                self.categoryBitmaps[label] = intersection

    def testModel(self, sample):
        """
        Test the category bitmaps on the input sample.

        The client compares the sample with every stored category bitmap; the
        categories are then ranked on the "overlappingAll" metric.

        @param sample     (dictionary)      Dictionary, containing text,
                                            sparsity, and bitmap
        @return           (list)            Up to self.numLabels winning
                                            categories, best first
        """
        sampleBitmap = sample["bitmap"].tolist()

        distances = {}
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            distances[cat] = self.client.compare(
                sampleBitmap, catBitmap.tolist())

        return self.winningLabels(distances,
                                  numberCats=self.numLabels,
                                  metric="overlappingAll")

    @staticmethod
    def winningLabels(distances, numberCats, metric):
        """
        Return the winning categories, based off of the input metric.
        Overrides the base class implementation.

        @param distances   (dict)   Maps category to a dict of metric values.
        @param numberCats  (int)    Maximum number of categories returned.
        @param metric      (str)    Key of the metric to rank on.
        @return            (list)   Category keys, best first.
        """
        # Snapshot the keys once so that index i refers to the same category
        # in both the metric array and the returned list.
        categories = list(distances.keys())
        metricValues = numpy.array(
            [distances[cat][metric] for cat in categories])
        sortedIdx = numpy.argsort(metricValues)

        # euclideanDistance and jaccardDistance are ascending
        descendingOrder = set(["overlappingAll", "overlappingLeftRight",
                               "overlappingRightLeft", "cosineSimilarity",
                               "weightedScoring"])
        if metric in descendingOrder:
            sortedIdx = sortedIdx[::-1]

        return [categories[catIdx] for catIdx in sortedIdx[:numberCats]]
class ClassificationModelFingerprint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    fingerprint encodings.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self, verbosity=1):
        """
        Set up a k=1 kNN classifier and the Cortical.io encoder; the encoder
        needs a valid API key (see CioEncoder init for details).

        @param verbosity (int) Forwarded to the base class; the classifier is
                               created with verbosity - 1.
        """
        super(ClassificationModelFingerprint, self).__init__(verbosity)

        self.classifier = KNNClassifier(
            k=1, exact=False, verbosity=verbosity - 1)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    def encodePattern(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API.

        @param sample     (list)          Tokenized sample, where each item is
                                          a string token.
        @return           (numpy.array)   Bitmap of the encoding; an empty
                                          array when the encoder returns
                                          nothing.
        """
        fpInfo = self.encoder.encode(" ".join(sample))
        if not fpInfo:
            return numpy.empty(0)

        # Report sparsity only once we know a fingerprint exists; indexing a
        # None result would raise.
        if self.verbosity > 1:
            print("Fingerprint sparsity = {0}%.".format(fpInfo["sparsity"]))

        return numpy.array(fpInfo["fingerprint"]["positions"], dtype="uint32")

    def resetModel(self):
        """Reset the model by clearing the classifier."""
        self.classifier.clear()

    def trainModel(self, sample, label):
        """
        Train the classifier on the input sample and label.

        @param sample     (numpy.array)   Bitmap encoding of the sample.
        @param label      (int)           Reference index for the
                                          classification of this sample.
        """
        # Bitmaps with no non-zero entry are not learned.
        if sample.any():
            self.classifier.learn(sample, label, isSparse=self.n)

    def testModel(self, sample):
        """
        Test the kNN classifier on the input sample.

        @param sample     (numpy.array)   Bitmap encoding of the sample.
        @return           (list)          Single-item list holding the
                                          classifier's winning
                                          classification; for more, see the
                                          KNNClassifier.infer() documentation.
        Note: to return multiple winner classifications, modify the return
        statement accordingly.
        """
        (tokenLabel, _, _, _) = self.classifier.infer(
            self._densifyPattern(sample))
        ## TODO: get list of closest classifications, not just the winner
        return [tokenLabel]
class ClassificationModelFingerprint(ClassificationModel):
    """
    Survey response classification model built on Cortical.io fingerprint
    encodings and a kNN classifier.

    The experiment runner is expected to feed the methods one sample at a
    time.
    """

    def __init__(self, verbosity=1, numLabels=3):
        """
        Create the kNN classifier and the Cortical.io encoder; the encoder
        needs a valid API key (see CioEncoder init for details).

        @param verbosity (int) Forwarded to the base class; the classifier is
                               created with verbosity - 1.
        @param numLabels (int) Forwarded to the base class and used as k for
                               the kNN classifier.
        """
        super(ClassificationModelFingerprint, self).__init__(
            verbosity, numLabels)

        knn = KNNClassifier(k=numLabels,
                            distanceMethod='rawOverlap',
                            exact=False,
                            verbosity=verbosity-1)
        self.classifier = knn

        cio = CioEncoder(cacheDir="./experiments/cache")
        self.encoder = cio

        # n comes from the encoder; w is derived from the encoder's
        # targetSparsity, which is divided by 100 here.
        self.n = cio.n
        sparsity = cio.targetSparsity/100
        self.w = int(sparsity*self.n)

    def encodePattern(self, sample):
        """
        Query the Cortical.io API for an SDR of the input tokens. When the
        encoder returns nothing, a random SDR is produced via
        self.encodeRandomly().

        @param sample     (list)    Tokenized sample, where each item is a
                                    string token.
        @return fp        (dict)    The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        text = " ".join(sample)
        info = self.encoder.encode(text)

        if not info:
            # Nothing came back: fall back to a random encoding.
            return {"text": text,
                    "sparsity": float(self.w)/self.n,
                    "bitmap": self.encodeRandomly(text)}

        # The encoder response carries its text under "text" or "term".
        if "text" in info:
            encodedText = info["text"]
        else:
            encodedText = info["term"]
        return {"text": encodedText,
                "sparsity": info["sparsity"],
                "bitmap": numpy.array(info["fingerprint"]["positions"])}

    def resetModel(self):
        """Clear the classifier, which resets the model."""
        self.classifier.clear()

    def trainModel(self, sample, labels):
        """
        Teach the classifier the sample's bitmap once for every label.

        @param sample     (dict)          The sample text, sparsity, and
                                          bitmap.
        @param labels     (numpy array)   Reference indices for the
                                          classifications of this sample.
        """
        # Bitmaps with no non-zero entry are not learned.
        if not sample["bitmap"].any():
            return
        for label in labels:
            self.classifier.learn(sample["bitmap"], label, isSparse=self.n)

    def testModel(self, sample, numLabels=3):
        """
        Run the kNN classifier on the input sample and pick the winners.

        @param sample     (dict)          The sample text, sparsity, and
                                          bitmap.
        @param numLabels  (int)           Number of predicted classifications.
        @return           (numpy array)   The numLabels most-frequent
                                          classifications for the data
                                          samples; values are int or empty.
        """
        densePattern = self._densifyPattern(sample["bitmap"])
        inference = self.classifier.infer(densePattern)
        # Only the second element of the classifier's 4-tuple is used.
        (_, inferenceResult, _, _) = inference
        return self.getWinningLabels(inferenceResult, numLabels)
class ClassificationModelEndpoint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    text endpoint encodings and classification system.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self, verbosity=1, numLabels=3):
        """
        Initialize the encoder as CioEncoder; requires a valid API key.

        @param verbosity (int) Forwarded to the base class.
        @param numLabels (int) Forwarded to the base class.
        """
        super(ClassificationModelEndpoint, self).__init__(verbosity, numLabels)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.compareEncoder = LanguageEncoder()

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)

    def encodePattern(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API.
        When the encoder returns nothing, a random SDR is produced via
        self.encodeRandomly().

        @param sample         (list)    Tokenized sample, where each item is
                                        a string
        @return fp            (dict)    The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            # The encoder response carries its text under "text" or "term".
            return {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
            }

        return {
            "text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample),
        }

    def resetModel(self):
        """Reset the model by clearing positives, negatives, and bitmaps."""
        self.positives.clear()
        self.negatives.clear()
        self.categoryBitmaps.clear()

    def trainModel(self, samples, labels, negatives=None):
        """
        Train the classifier on the input samples and labels. Use the
        encoder's createCategory() to make a bitmap that represents each
        class.

        @param samples    (list)    List of dictionaries containing the
                                    sample text, sparsity, and bitmap.
        @param labels     (list)    List of numpy arrays containing the
                                    reference indices for the classifications
                                    of each sample.
        @param negatives  (list)    Each item is the dictionary containing
                                    text, sparsity and bitmap for the negative
                                    samples.
        """
        # Only add negatives when training on one sample so we know which
        # labels to use
        useNegatives = bool(negatives) and len(samples) == 1

        labelsToUpdateBitmaps = set()
        for sample, sampleLabels in zip(samples, labels):
            text = sample["text"]
            # Whether the text is usable does not depend on the label, so it
            # is decided once per sample instead of once per label.
            if not text or len(sampleLabels) == 0:
                continue
            if not self.encoder.encode(text):
                continue

            for label in sampleLabels:
                self.positives[label].append(text)

                if useNegatives:
                    for neg in negatives:
                        if neg["text"]:
                            self.negatives[label].append(neg["text"])

                labelsToUpdateBitmaps.add(label)

        # Rebuild the bitmap of every category that received new text.
        for label in labelsToUpdateBitmaps:
            self.categoryBitmaps[label] = self.encoder.createCategory(
                str(label),
                self.positives[label],
                self.negatives[label])["positions"]

    def testModel(self, sample, numLabels=3, metric="overlappingAll"):
        """
        Test the Cortical.io classifier on the input sample. The sample is
        compared with every category bitmap, and the categories are ranked on
        the requested metric.

        @param sample     (dict)      The sample text, sparsity, and bitmap.
        @param numLabels  (int)       Maximum number of classifications
                                      returned.
        @param metric     (str)       Distance metric used for the ranking.
        @return           (list)      Winning classifications based on the
                                      specified metric. The number of items
                                      returned will be <= numLabels.
        """
        sampleBitmap = sample["bitmap"].tolist()

        distances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            distances[cat] = self.compareEncoder.compare(
                sampleBitmap, catBitmap)

        return self.getWinningLabels(
            distances, numLabels=numLabels, metric=metric)

    @staticmethod
    def compareCategories(catDistances, metric="overlappingAll"):
        """
        Calculate category distances. Returns a defaultdict of category keys,
        where values are OrderedDicts sorted such that the most similar
        categories (according to the input metric) are listed first.

        @param catDistances (dict)  Maps category -> {other category ->
                                    dict of metric values}.
        @param metric       (str)   Key of the metric to sort on.
        """
        descendingOrder = ("overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring")
        # The sort direction is the same for every category.
        reverse = metric in descendingOrder

        categoryComparisons = defaultdict(list)
        for cat, comparisons in catDistances.iteritems():
            # Create a dict for this category
            metricDict = {compareCat: distances[metric]
                          for compareCat, distances in comparisons.iteritems()}
            # Sort the dict by the metric
            categoryComparisons[cat] = OrderedDict(
                sorted(metricDict.items(),
                       key=lambda item: item[1],
                       reverse=reverse))

        return categoryComparisons

    def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
        """
        Return a dict where keys are categories and values are dicts of
        distances.

        @param sort      (bool)        Sort the inner dicts with
                                       compareCategories()
        @param save      (str)         Dump catDistances to a JSON in this
                                       dir.
        @param labelRefs               Forwarded to writeOutCategories() when
                                       saving.
        @return          (defaultdict)

        E.g. w/ categories 0 and 1:
          catDistances = {
              0: {
                  0: {"cosineSimilarity": 1.0, ...},
                  1: {"cosineSimilarity": 0.33, ...}
                  },
              1: {
                  0: {"cosineSimilarity": 0.33, ...},
                  1: {"cosineSimilarity": 1.0, ...}
                  }
              }
        Note the inner-dicts of catDistances are OrderedDict objects.
        """
        catDistances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            catDistances[cat] = OrderedDict()
            for compareCat, compareBitmap in self.categoryBitmaps.iteritems():
                # List is in order of self.categoryBitmaps.keys()
                catDistances[cat][compareCat] = self.compareEncoder.compare(
                    catBitmap, compareBitmap)

        if sort:
            # Order each inner dict of catDistances such that the ranking is
            # most to least similar.
            catDistances = self.compareCategories(catDistances)

        if save is not None:
            self.writeOutCategories(
                save, comparisons=catDistances, labelRefs=labelRefs)

        return catDistances

    @staticmethod
    def getWinningLabels(distances, numLabels, metric):
        """
        Return the winning categories, based off of the input metric.
        Overrides the base class implementation.

        @param distances  (dict)         Maps category to a dict of metric
                                         values.
        @param numLabels  (int)          Maximum number of categories
                                         returned.
        @param metric     (str)          Key of the metric to rank on.
        @return           (numpy array)  Category keys, best first.
        """
        # Snapshot the keys once so that index i refers to the same category
        # in both the metric array and the returned array.
        categories = list(distances.keys())
        metricValues = numpy.array(
            [distances[cat][metric] for cat in categories])
        sortedIdx = numpy.argsort(metricValues)

        # euclideanDistance and jaccardDistance are ascending
        descendingOrder = ("overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring")
        if metric in descendingOrder:
            sortedIdx = sortedIdx[::-1]

        return numpy.array(
            [categories[catIdx] for catIdx in sortedIdx[:numLabels]])
class ClassificationModelFingerprint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    fingerprint encodings.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelFingerprint",
                 fingerprintType=EncoderTypes.word,
                 unionSparsity=20.0):
        """
        Set up the kNN classifier and the Cortical.io encoder; the encoder
        needs a valid API key (see CioEncoder init for details).

        @param verbosity       (int)   Verbosity level; the classifier is
                                       created with verbosity - 1.
        @param numLabels       (int)   Forwarded to the base class and used
                                       as k for the kNN classifier.
        @param modelDir        (str)   Forwarded to the base class.
        @param fingerprintType (EncoderTypes) EncoderTypes.document or
                                       EncoderTypes.word.
        @param unionSparsity   (float) Forwarded to CioEncoder.

        @raise ValueError when fingerprintType is neither document nor word.
        """
        # Validate first so nothing is constructed for a bad argument. This
        # must be a membership test: comparing against `(not A or not B)`
        # with `is` only compares the argument to a boolean.
        if fingerprintType not in (EncoderTypes.document, EncoderTypes.word):
            raise ValueError("Invaid type of fingerprint encoding; see the "
                             "EncoderTypes class for eligble types.")

        super(ClassificationModelFingerprint, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.classifier = KNNClassifier(k=numLabels,
                                        distanceMethod='rawOverlap',
                                        exact=False,
                                        verbosity=verbosity - 1)

        self.encoder = CioEncoder(cacheDir="./fluent/experiments/cioCache",
                                  fingerprintType=fingerprintType,
                                  unionSparsity=unionSparsity)

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API. If
        the client returns None, we create a random SDR with the model's
        dimensions n and w.

        @param sample     (list)    Tokenized sample, where each item is a str.
        @return fp        (dict)    The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            # The encoder response carries its text under "text" or "term".
            return {
                "text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
            }

        # No fingerprint came back: fall back to a random encoding.
        return {
            "text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample),
        }

    def trainModel(self, i):
        # TODO: add batch training, where i is a list
        """
        Train the classifier on the sample and labels for record i.

        The list sampleReference is populated to correlate classifier
        prototypes to sample IDs: one ID is appended per learned
        (bitmap, label) pair.

        @param i    Key/index of the record in self.patterns.
        """
        record = self.patterns[i]
        bitmap = record["pattern"]["bitmap"]
        # Bitmaps with no non-zero entry are not learned.
        if bitmap.any():
            for label in record["labels"]:
                self.classifier.learn(bitmap, label, isSparse=self.n)
                self.sampleReference.append(record["ID"])

    def testModel(self, i, numLabels=3):
        """
        Test the model on record i.

        @param i            Key/index of the record in self.patterns.
        @param numLabels    (int)           Number of classification
                                            predictions.
        @return             (numpy array)   numLabels most-frequent
                                            classifications for the data
                                            samples; int or empty.
        """
        pattern = self.sparsifyPattern(
            self.patterns[i]["pattern"]["bitmap"], self.n)
        # Only the second element of the classifier's 4-tuple is used.
        (_, inferenceResult, _, _) = self.classifier.infer(pattern)
        return self.getWinningLabels(inferenceResult, numLabels)
class ClassificationModelContext(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    text context, then AND the context.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    def __init__(self, verbosity=1, numLabels=1):
        """
        Initialize the CorticalClient and CioEncoder. Requires a valid API key.

        @param verbosity (int) Forwarded to the base class.
        @param numLabels (int) Number of categories returned by testModel().
        """
        super(ClassificationModelContext, self).__init__(verbosity)

        self.encoder = CioEncoder(cacheDir="./experiments/cache")
        self.client = CorticalClient(self.encoder.apiKey)

        self.n = self.encoder.n
        # targetSparsity is divided by 100 (treated as a percentage); 100.0
        # keeps the division floating-point under Python 2 even for an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

        self.categoryBitmaps = {}
        self.numLabels = numLabels

    def encodePattern(self, pattern):
        """
        Encode an SDR of the input string by querying the Cortical.io API.

        @param pattern     (list)           Tokenized sample, where each item
                                            is a string
        @return            (dictionary)     Dictionary, containing text,
                                            sparsity, and bitmap
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.0,
            "bitmap": numpy.zeros(0)
          }
        """
        text = " ".join(pattern)
        # The sparsity entry is always reported as 0.0 by this model.
        return {"text": text,
                "sparsity": 0.0,
                "bitmap": self._encodeText(text)}

    def _encodeText(self, text):
        """
        Return the bitmap for text as an int array.

        Uses the fingerprint positions from the encoder, or
        self.encodeRandomly(text) when the encoder returns nothing.
        """
        fpInfo = self.encoder.encode(text)
        if fpInfo:
            # Report sparsity only once we know a fingerprint exists;
            # indexing a None result would raise.
            if self.verbosity > 1:
                print("Fingerprint sparsity = {0}%.".format(
                    fpInfo["sparsity"]))
            bitmap = numpy.array(fpInfo["fingerprint"]["positions"])
        else:
            bitmap = self.encodeRandomly(text)

        return bitmap.astype(int)

    def resetModel(self):
        """Reset the model by forgetting all category bitmaps."""
        self.categoryBitmaps.clear()

    def trainModel(self, samples, labels):
        """
        Train the classifier on the input samples and labels. Use
        Cortical.io's context lookup to get related fingerprints, take their
        union, and intersect it with the bitmap stored for each label.

        @param samples     (list)   Dictionaries containing text, sparsity,
                                    and bitmap, one per sample
        @param labels      (list)   For each sample, an iterable of reference
                                    indices for its classifications
        """
        for sample, sample_labels in zip(samples, labels):
            bitmaps = [sample["bitmap"].tolist()]
            context = self.client.getContextFromText(
                bitmaps, maxResults=5, getFingerprint=True)
            if len(context) == 0:
                # Nothing to learn from without a context.
                continue

            union = numpy.zeros(0)
            for c in context:
                bitmap = c["fingerprint"]["positions"]
                union = numpy.union1d(bitmap, union).astype(int)

            for label in sample_labels:
                # Haven't seen the label before
                if label not in self.categoryBitmaps:
                    self.categoryBitmaps[label] = union

                intersection = numpy.intersect1d(
                    union, self.categoryBitmaps[label])
                if intersection.size == 0:
                    # Don't want to lose all the old information.
                    # NOTE(review): this widened union is also what the
                    # remaining labels of this sample see -- confirm that is
                    # intended.
                    union = numpy.union1d(
                        union, self.categoryBitmaps[label]).astype(int)

                    # Need to sample to stay sparse
                    count = len(union)
                    sampleIndices = random.sample(
                        xrange(count), min(count, self.w))
                    intersection = numpy.sort(union[sampleIndices])

                self.categoryBitmaps[label] = intersection

    def testModel(self, sample):
        """
        Test the category bitmaps on the input sample.

        The client compares the sample with every stored category bitmap; the
        categories are then ranked on the "overlappingAll" metric.

        @param sample     (dictionary)      Dictionary, containing text,
                                            sparsity, and bitmap
        @return           (list)            Up to self.numLabels winning
                                            categories, best first
        """
        sampleBitmap = sample["bitmap"].tolist()

        distances = {}
        for cat, catBitmap in self.categoryBitmaps.iteritems():
            distances[cat] = self.client.compare(
                sampleBitmap, catBitmap.tolist())

        return self.winningLabels(distances,
                                  numberCats=self.numLabels,
                                  metric="overlappingAll")

    @staticmethod
    def winningLabels(distances, numberCats, metric):
        """
        Return the winning categories, based off of the input metric.
        Overrides the base class implementation.

        @param distances   (dict)   Maps category to a dict of metric values.
        @param numberCats  (int)    Maximum number of categories returned.
        @param metric      (str)    Key of the metric to rank on.
        @return            (list)   Category keys, best first.
        """
        # Snapshot the keys once so that index i refers to the same category
        # in both the metric array and the returned list.
        categories = list(distances.keys())
        metricValues = numpy.array(
            [distances[cat][metric] for cat in categories])
        sortedIdx = numpy.argsort(metricValues)

        # euclideanDistance and jaccardDistance are ascending
        descendingOrder = set([
            "overlappingAll", "overlappingLeftRight", "overlappingRightLeft",
            "cosineSimilarity", "weightedScoring"
        ])
        if metric in descendingOrder:
            sortedIdx = sortedIdx[::-1]

        return [categories[catIdx] for catIdx in sortedIdx[:numberCats]]
class ClassificationModelEndpoint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    text endpoint encodings and classification system.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    # Metrics for which a larger value means a closer match. Every other
    # metric (e.g. euclideanDistance, jaccardDistance) is ranked ascending.
    _DESCENDING_METRICS = ("overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring")

    def __init__(self,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelEndpoint",
                 unionSparsity=20.0):
        """
        Initializes the encoder as CioEncoder; requires a valid API key.

        @param verbosity     (int)      Forwarded to the base class.
        @param numLabels     (int)      Forwarded to the base class.
        @param modelDir      (str)      Forwarded to the base class.
        @param unionSparsity (float)    Forwarded to the CioEncoder.
        """
        super(ClassificationModelEndpoint, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.encoder = CioEncoder(cacheDir="./experiments/cache",
                                  unionSparsity=unionSparsity)
        self.compareEncoder = LanguageEncoder()

        self.n = self.encoder.n
        # targetSparsity is used as a percentage of n. Divide by a float so
        # an integer value is not floored to zero by Python 2 int division.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API.

        If the encoder returns nothing for the text, a random encoding from
        self.encodeRandomly() is used instead, with sparsity w/n.

        @param sample     (list)        Tokenized sample, where each item is
                                        a string
        @return fp        (dict)        The sample text, sparsity, and bitmap.
        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            # The encoder response carries either a "text" or a "term" key.
            text = fpInfo["text"] if "text" in fpInfo else fpInfo["term"]
            return {
                "text": text,
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"]),
            }

        return {
            "text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample),
        }

    def resetModel(self):
        """Reset the model by discarding all training state."""
        self.positives.clear()
        self.negatives.clear()
        self.categoryBitmaps.clear()

    def trainModel(self, i, negatives=None):
        """
        Train the classifier on the sample and labels for record i.

        For every label of the record the sample text is stored as a positive
        example (only when the sample has text and a non-empty bitmap), then
        the category bitmap of each touched label is rebuilt with the
        encoder's createCategory(). The list sampleReference is populated to
        correlate classifier prototypes to sample IDs; i is appended once per
        rebuilt label.

        @param i          (int)         Index of the record in self.patterns.
        @param negatives  (list)        Each item is the dictionary containing
                                        text, sparsity and bitmap for the
                                        negative samples.
        """
        # TODO: add batch training, where i is a list; note we should only add
        # negatives when training on one sample so we know which labels to
        # use.
        record = self.patterns[i]
        labelsToUpdateBitmaps = set()
        for label in record["labels"]:
            if not (record["pattern"]["text"]
                    and record["pattern"]["bitmap"].any()):
                continue
            self.positives[label].append(record["pattern"]["text"])
            if negatives:
                # Negatives without text contribute nothing to the category.
                self.negatives[label].extend(
                    neg["text"] for neg in negatives if neg["text"])
            labelsToUpdateBitmaps.add(label)

        for label in labelsToUpdateBitmaps:
            self.categoryBitmaps[label] = self.encoder.createCategory(
                str(label),
                self.positives[label],
                self.negatives[label])["positions"]
            self.sampleReference.append(i)

    def testModel(self, i, numLabels=3, metric="overlappingAll"):
        """
        Test on record i. The sample bitmap is compared with every category
        bitmap; each comparison yields a dictionary of distance metrics
        between the sample and the class.

        @param i          (int)         Index of the record in self.patterns.
        @param numLabels  (int)         Number of classification predictions.
        @param metric     (str)         Distance metric use by classifier.
        @return           (numpy array) numLabels most-frequent
                                        classifications for the data samples;
                                        int or empty.
        """
        sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

        distances = {}
        for cat, catBitmap in self.categoryBitmaps.items():
            distances[cat] = self.compareEncoder.compare(
                sampleBitmap, catBitmap)

        return self.getWinningLabels(
            distances, numLabels=numLabels, metric=metric)

    @staticmethod
    def compareCategories(catDistances, metric="overlappingAll"):
        """
        Calculate category distances. Returns a defaultdict of category keys,
        where values are OrderedDicts sorted such that the most similar
        categories (according to the input metric) are listed first.

        @param catDistances (dict)      Maps category -> {other category ->
                                        dict of metric values}.
        @param metric       (str)       Metric name to rank by.
        @return             (defaultdict) category -> OrderedDict of
                                        {other category: metric value}.
        """
        # Similarity metrics rank high-to-low, distance metrics low-to-high.
        reverse = metric in ClassificationModelEndpoint._DESCENDING_METRICS

        categoryComparisons = defaultdict(list)
        for cat, comparisons in catDistances.items():
            # Reduce each comparison to the single metric of interest.
            metricDict = {compareCat: distances[metric]
                          for compareCat, distances in comparisons.items()}
            categoryComparisons[cat] = OrderedDict(
                sorted(metricDict.items(),
                       key=lambda item: item[1],
                       reverse=reverse))

        return categoryComparisons

    def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
        """
        Return a dict where keys are categories and values are dicts of
        distances.

        @param sort       (bool)        Sort the inner dicts with
                                        compareCategories()
        @param save       (str)         Dump catDistances to a JSON in this
                                        dir.
        @param labelRefs                Passed through to
                                        writeOutCategories() when saving.
        @return           (defaultdict)

        E.g. w/ categories 0 and 1:
          catDistances = {
              0: {
                  0: {"cosineSimilarity": 1.0, ...},
                  1: {"cosineSimilarity": 0.33, ...}
              },
              1: {
                  0: {"cosineSimilarity": 0.33, ...},
                  1: {"cosineSimilarity": 1.0, ...}
              }
          }
        Note the inner-dicts of catDistances are OrderedDict objects. When
        sort is True the inner values are reduced to the single metric used
        by compareCategories().
        """
        catDistances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.items():
            catDistances[cat] = OrderedDict()
            for compareCat, compareBitmap in self.categoryBitmaps.items():
                # Inner order follows the iteration order of
                # self.categoryBitmaps.
                catDistances[cat][compareCat] = self.compareEncoder.compare(
                    catBitmap, compareBitmap)

        if sort:
            # Order each inner dict of catDistances such that the ranking is
            # most to least similar.
            catDistances = self.compareCategories(catDistances)

        if save is not None:
            self.writeOutCategories(
                save, comparisons=catDistances, labelRefs=labelRefs)

        return catDistances

    @staticmethod
    def getWinningLabels(distances, numLabels, metric):
        """
        Return indices of winning categories, based off of the input metric.
        Overrides the base class implementation.

        @param distances  (dict)        Maps category -> dict of metric
                                        values.
        @param numLabels  (int)         Maximum number of winners returned.
        @param metric     (str)         Metric name to rank by.
        @return           (numpy array) Category keys, best match first.
        """
        # Materialize the keys once so keys and values are guaranteed to be
        # aligned and indexing works on both Python 2 and Python 3.
        labels = list(distances.keys())
        metricValues = numpy.array(
            [distances[label][metric] for label in labels])
        sortedIdx = numpy.argsort(metricValues)

        # argsort is ascending; similarity metrics need the largest first.
        if metric in ClassificationModelEndpoint._DESCENDING_METRICS:
            sortedIdx = sortedIdx[::-1]

        return numpy.array([labels[idx] for idx in sortedIdx[:numLabels]])

    @staticmethod
    def query():
        """Unsupported by this model; only prints a notice."""
        print("The Classification Endpoint model doesn't support this method.")

    @staticmethod
    def infer():
        """Unsupported by this model; only prints a notice."""
        print("The Classification Endpoint model doesn't support this method.")