class ClassificationModelEndpoint(ClassificationModel):
  """
  Class to run the survey response classification task with Cortical.io text
  endpoint encodings and classification system.

  From the experiment runner, the methods expect to be fed one sample at a
  time.
  """

  def __init__(self, verbosity=1, numLabels=3,
               modelDir="ClassificationModelEndpoint", unionSparsity=20.0):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.encoder = CioEncoder(cacheDir="./experiments/cache",
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    self.w = int((self.encoder.targetSparsity / 100) * self.n)

    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)


  def encodeSample(self, sample):
    """
    Encode an SDR of the input string by querying the Cortical.io API.

    @param sample     (list)          Tokenized sample, where each item is a
                                      string.
    @return fp        (dict)          The sample text, sparsity, and bitmap.

    Example return dict:
      {
        "text": "Example text",
        "sparsity": 0.03,
        "bitmap": numpy.array([])
      }
    """
    sample = " ".join(sample)
    fpInfo = self.encoder.encode(sample)
    if fpInfo:
      fp = {"text": fpInfo["text"] if "text" in fpInfo else fpInfo["term"],
            "sparsity": fpInfo["sparsity"],
            "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])}
    else:
      # Fall back to a random encoding when the API returns no fingerprint.
      fp = {"text": sample,
            "sparsity": float(self.w) / self.n,
            "bitmap": self.encodeRandomly(sample)}

    return fp


  def resetModel(self):
    """Reset the model."""
    self.positives.clear()
    self.negatives.clear()
    self.categoryBitmaps.clear()


  def trainModel(self, i, negatives=None):
    # TODO: add batch training, where i is a list; note we should only add
    # negatives when training on one sample so we know which labels to use.
    """
    Train the classifier on the sample and labels for record i. Use
    Cortical.io's createClassification() to make a bitmap that represents the
    class. The list sampleReference is populated to correlate classifier
    prototypes to sample IDs.

    @param negatives  (list)          Each item is a dictionary containing the
                                      text, sparsity, and bitmap for a
                                      negative sample.
    """
    record = self.patterns[i]
    labelsToUpdateBitmaps = set()
    for label in record["labels"]:
      if record["pattern"]["text"] and record["pattern"]["bitmap"].any():
        self.positives[label].append(record["pattern"]["text"])
        if negatives:
          for neg in negatives:
            if neg["text"]:
              self.negatives[label].append(neg["text"])
        labelsToUpdateBitmaps.add(label)

    for label in labelsToUpdateBitmaps:
      self.categoryBitmaps[label] = self.encoder.createCategory(
          str(label), self.positives[label], self.negatives[label])["positions"]

    self.sampleReference.append(i)


  def testModel(self, i, numLabels=3, metric="overlappingAll"):
    """
    Test on record i. The Cortical.io classifier returns a dictionary
    containing various distance metrics between the sample and the classes.

    @param numLabels  (int)           Number of classification predictions.
    @param metric     (str)           Distance metric used by the classifier.
    @return           (numpy array)   The numLabels winning classifications
                                      for the data sample; int or empty.
    """
    sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

    distances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      distances[cat] = self.compareEncoder.compare(sampleBitmap, catBitmap)

    return self.getWinningLabels(distances, numLabels=numLabels, metric=metric)


  @staticmethod
  def compareCategories(catDistances, metric="overlappingAll"):
    """
    Calculate category distances.
    Returns a defaultdict of category keys, where values are OrderedDicts
    sorted such that the most similar categories (according to the input
    metric) are listed first.
    """
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")
    categoryComparisons = defaultdict(list)
    for k, v in catDistances.iteritems():
      # Create a dict for this category
      metricDict = {compareCat: distances[metric]
                    for compareCat, distances in v.iteritems()}
      # Sort the dict by the metric
      reverse = metric in descendingOrder
      categoryComparisons[k] = OrderedDict(
          sorted(metricDict.items(), key=lambda item: item[1], reverse=reverse))

    return categoryComparisons


  def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
    """
    Return a dict where keys are categories and values are dicts of distances.

    @param sort       (bool)          Sort the inner dicts with
                                      compareCategories().
    @param save       (str)           Dump catDistances to a JSON in this dir.
    @return           (defaultdict)   E.g. w/ categories 0 and 1:
      catDistances = {
        0: {
          0: {"cosineSimilarity": 1.0, ...},
          1: {"cosineSimilarity": 0.33, ...}
        },
        1: {
          0: {"cosineSimilarity": 0.33, ...},
          1: {"cosineSimilarity": 1.0, ...}
        }
      }
    Note the inner-dicts of catDistances are OrderedDict objects.
    """
    catDistances = defaultdict(list)
    for cat, catBitmap in self.categoryBitmaps.iteritems():
      catDistances[cat] = OrderedDict()
      for compareCat, compareBitmap in self.categoryBitmaps.iteritems():
        # List is in order of self.categoryBitmaps.keys()
        catDistances[cat][compareCat] = self.compareEncoder.compare(
            catBitmap, compareBitmap)

    if sort:
      # Order each inner dict of catDistances such that the ranking is most to
      # least similar.
      catDistances = self.compareCategories(catDistances)

    if save is not None:
      self.writeOutCategories(
          save, comparisons=catDistances, labelRefs=labelRefs)

    return catDistances


  @staticmethod
  def getWinningLabels(distances, numLabels, metric):
    """
    Return indices of winning categories, based off of the input metric.
    Overrides the base class implementation.
    """
    metricValues = numpy.array([v[metric] for v in distances.values()])
    sortedIdx = numpy.argsort(metricValues)

    # euclideanDistance and jaccardDistance are ascending
    descendingOrder = ("overlappingAll", "overlappingLeftRight",
                       "overlappingRightLeft", "cosineSimilarity",
                       "weightedScoring")
    if metric in descendingOrder:
      sortedIdx = sortedIdx[::-1]

    return numpy.array(
        [distances.keys()[catIdx] for catIdx in sortedIdx[:numLabels]])


  @staticmethod
  def query():
    print "The Classification Endpoint model doesn't support this method."


  @staticmethod
  def infer():
    print "The Classification Endpoint model doesn't support this method."
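# ----------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module: roughly how the
# experiment runner might drive this model one sample at a time. The sample
# text, the label index, and the direct assignments to model.patterns and
# model.sampleReference are hypothetical stand-ins for the bookkeeping the
# runner/base class normally performs, and running this requires a valid
# Cortical.io API key.
# ----------------------------------------------------------------------------
def _exampleEndpointUsage():
  model = ClassificationModelEndpoint(verbosity=0, numLabels=2)

  # Encode a tokenized sample and store it in the record format that
  # trainModel()/testModel() expect: {"pattern": <fp dict>, "labels": [...]}.
  fp = model.encodeSample(["the", "item", "arrived", "broken"])
  model.patterns = [{"pattern": fp, "labels": [0]}]
  model.sampleReference = []

  # Train on record 0 (builds the category bitmap via createCategory), then
  # rank the categories for that same record.
  model.trainModel(0)
  print model.testModel(0, numLabels=2, metric="overlappingAll")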
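# ----------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: getWinningLabels() on
# a hand-built distances dict with the shape returned by
# LanguageEncoder.compare() (a dict of metric values per category). The metric
# values here are made up, and no API access is needed because
# getWinningLabels() is a staticmethod.
# ----------------------------------------------------------------------------
def _exampleWinningLabels():
  distances = {
      0: {"overlappingAll": 12, "euclideanDistance": 0.9},
      1: {"overlappingAll": 40, "euclideanDistance": 0.2},
      2: {"overlappingAll": 25, "euclideanDistance": 0.5},
  }

  # "overlappingAll" ranks descending, so the categories with the most overlap
  # win: 1, then 2.
  print ClassificationModelEndpoint.getWinningLabels(
      distances, numLabels=2, metric="overlappingAll")

  # "euclideanDistance" ranks ascending, so the smallest distances win:
  # 1, then 2.
  print ClassificationModelEndpoint.getWinningLabels(
      distances, numLabels=2, metric="euclideanDistance")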