def __init__(self, verbosity=1, numLabels=3,
             modelDir="ClassificationModelEndpoint", unionSparsity=0.20):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.

    @param verbosity      (int)    Forwarded to the base class constructor.
    @param numLabels      (int)    Forwarded to the base class constructor.
    @param modelDir       (str)    Forwarded to the base class constructor.
    @param unionSparsity  (float)  Forwarded to CioEncoder.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # The encoder cache lives in a "CioCache" folder next to this file.
    root = os.path.dirname(os.path.realpath(__file__))
    self.encoder = CioEncoder(cacheDir=os.path.join(root, "CioCache"),
                              unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    # targetSparsity is treated as a percentage. Dividing by 100.0 keeps the
    # result correct under Python 2 even if targetSparsity is an int (plain
    # "/ 100" would truncate to 0 there).
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    # Per-label state: category bitmaps plus the positive / negative example
    # texts collected for each label.
    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
def __init__(self, verbosity=1, numLabels=3,
             modelDir="ClassificationModelEndpoint", unionSparsity=0.20):
    """
    Initializes the encoder as CioEncoder; requires a valid API key.

    @param verbosity      (int)    Passed through to the parent constructor.
    @param numLabels      (int)    Passed through to the parent constructor.
    @param modelDir       (str)    Passed through to the parent constructor.
    @param unionSparsity  (float)  Passed through to CioEncoder.
    """
    super(ClassificationModelEndpoint, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    # Cached encodings are stored in "CioCache" beside this source file.
    root = os.path.dirname(os.path.realpath(__file__))
    cacheDir = os.path.join(root, "CioCache")
    self.encoder = CioEncoder(cacheDir=cacheDir, unionSparsity=unionSparsity)
    self.compareEncoder = LanguageEncoder()

    self.n = self.encoder.n
    # Use a float divisor so the percentage-to-fraction conversion cannot be
    # truncated by Python 2 integer division when targetSparsity is an int.
    self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

    # Bookkeeping filled in elsewhere, keyed by label.
    self.categoryBitmaps = {}
    self.negatives = defaultdict(list)
    self.positives = defaultdict(list)
class ClassificationModelEndpoint(ClassificationModel):
    """
    Class to run the survey response classification task with Cortical.io
    text endpoint encodings and classification system.

    From the experiment runner, the methods expect to be fed one sample at a
    time.
    """

    # Metrics for which a larger value means "more similar". Any other metric
    # (e.g. euclideanDistance, jaccardDistance) is ranked in ascending order.
    _DESCENDING_METRICS = ("overlappingAll", "overlappingLeftRight",
                           "overlappingRightLeft", "cosineSimilarity",
                           "weightedScoring")

    def __init__(self, verbosity=1, numLabels=3,
                 modelDir="ClassificationModelEndpoint", unionSparsity=0.20,
                 cacheRoot=None):
        """
        Initializes the encoder as CioEncoder; requires a valid API key.

        @param verbosity      (int)    Forwarded to the base class.
        @param numLabels      (int)    Forwarded to the base class.
        @param modelDir       (str)    Forwarded to the base class.
        @param unionSparsity  (float)  Forwarded to CioEncoder.
        @param cacheRoot      (str)    Directory in which the "CioCache"
                                       folder is placed; when falsy, the
                                       directory containing this file is used.
        """
        super(ClassificationModelEndpoint, self).__init__(
            verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))
        self.encoder = CioEncoder(cacheDir=os.path.join(cacheRoot, "CioCache"),
                                  unionSparsity=unionSparsity)
        self.compareEncoder = LanguageEncoder()

        self.n = self.encoder.n
        # targetSparsity is treated as a percentage. The float divisor keeps
        # this correct under Python 2 even if targetSparsity is an int.
        self.w = int((self.encoder.targetSparsity / 100.0) * self.n)

        # Per-label state used by trainModel() / testModel().
        self.categoryBitmaps = {}
        self.negatives = defaultdict(list)
        self.positives = defaultdict(list)

    def encodeSample(self, sample):
        """
        Encode an SDR of the input string by querying the Cortical.io API.

        If the encoder returns nothing for the text, the sample is encoded
        with self.encodeRandomly() instead (not defined in this class;
        presumably inherited from ClassificationModel).

        @param sample     (list)    Tokenized sample, where each item is a
                                    string.
        @return fp        (dict)    The sample text, sparsity, and bitmap.

        Example return dict:
          {
            "text": "Example text",
            "sparsity": 0.03,
            "bitmap": numpy.array([])
          }
        """
        sample = " ".join(sample)
        fpInfo = self.encoder.encode(sample)
        if fpInfo:
            # Some encoder responses carry the text under "term" rather than
            # "text".
            text = fpInfo["text"] if "text" in fpInfo else fpInfo["term"]
            fp = {"text": text,
                  "sparsity": fpInfo["sparsity"],
                  "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])}
        else:
            fp = {"text": sample,
                  "sparsity": float(self.w) / self.n,
                  "bitmap": self.encodeRandomly(sample, self.n, self.w)}

        return fp

    def resetModel(self):
        """Reset the model by clearing all per-label training state."""
        self.positives.clear()
        self.negatives.clear()
        self.categoryBitmaps.clear()

    def trainModel(self, i, negatives=None):
        """
        Train the classifier on the sample and labels for record i. Use
        Cortical.io's createClassification() to make a bitmap that represents
        the class. The list sampleReference is populated to correlate
        classifier prototypes to sample IDs.

        @param i          (int)     Index of the record in self.patterns.
        @param negatives  (list)    Each item is the dictionary containing
                                    text, sparsity and bitmap for the negative
                                    samples.
        """
        # TODO: add batch training, where i is a list; note we should only add
        # negatives when training on one sample so we know which labels to
        # use.
        # NOTE(review): statement nesting below was reconstructed from a
        # whitespace-collapsed source -- confirm against the original layout.
        record = self.patterns[i]
        pattern = record["pattern"]
        labelsToUpdateBitmaps = set()
        for label in record["labels"]:
            # Skip records with no text or an all-zero / empty bitmap.
            if pattern["text"] and pattern["bitmap"].any():
                self.positives[label].append(pattern["text"])
                if negatives:
                    for neg in negatives:
                        if neg["text"]:
                            self.negatives[label].append(neg["text"])
                labelsToUpdateBitmaps.add(label)

        # Rebuild the category bitmap of every label that gained an example.
        for label in labelsToUpdateBitmaps:
            category = self.encoder.createCategory(
                str(label), self.positives[label], self.negatives[label])
            self.categoryBitmaps[label] = category["positions"]
            self.sampleReference.append(i)

    def testModel(self, i, _, metric="overlappingAll"):
        """
        Test on record i. The Cortical.io classifier returns a dictionary
        containing various distance metrics between the sample and the
        classes.

        @param i        (int)           Index of the record in self.patterns.
        @param _                        Unused.
        @param metric   (str)           Distance metric use by classifier.
        @return         (numpy array)   Up to numLabels winning category
                                        labels, best match first; empty if no
                                        categories have been trained.
        """
        sampleBitmap = self.patterns[i]["pattern"]["bitmap"].tolist()

        # Kept as a defaultdict so getWinningLabels() receives the same
        # container type as before.
        distances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.items():
            distances[cat] = self.compareEncoder.compare(sampleBitmap,
                                                         catBitmap)

        return self.getWinningLabels(distances, metric=metric)

    def getWinningLabels(self, distances, metric):
        """
        Return indices of winning categories, based off of the input metric.
        Overrides the base class implementation.

        @param distances  (dict)          Maps each category to a dict of
                                          metric name -> value.
        @param metric     (str)           Metric used to rank the categories.
        @return           (numpy array)   At most self.numLabels category
                                          keys, best match first.
        """
        # Materialize the keys once; this also works where dict.keys() is a
        # non-indexable view.
        labels = list(distances)
        metricValues = numpy.array(
            [distances[label][metric] for label in labels])
        sortedIdx = numpy.argsort(metricValues)

        # argsort is ascending, which suits distance-like metrics; similarity
        # metrics rank best-first when reversed.
        if metric in self._DESCENDING_METRICS:
            sortedIdx = sortedIdx[::-1]

        return numpy.array(
            [labels[catIdx] for catIdx in sortedIdx[:self.numLabels]])

    def getCategoryDistances(self, sort=True, save=None, labelRefs=None):
        """
        Return a dict where keys are categories and values are dicts of
        distances.

        @param sort       (bool)        Sort the inner dicts with
                                        compareCategories(). Note that the
                                        sorted inner dicts map each category
                                        to a single metric value rather than
                                        to a dict of all metrics.
        @param save       (str)         Dump catDistances to a JSON in this
                                        dir.
        @param labelRefs                Forwarded to writeOutCategories()
                                        when save is given.
        @return           (defaultdict)

        E.g. w/ categories 0 and 1 (unsorted form):
          catDistances = {
              0: {
                  0: {"cosineSimilarity": 1.0, ...},
                  1: {"cosineSimilarity": 0.33, ...}
                  },
              1: {
                  0: {"cosineSimilarity": 0.33, ...},
                  1: {"cosineSimilarity": 1.0, ...}
                  }
              }

        Note the inner-dicts of catDistances are OrderedDict objects.
        """
        catDistances = defaultdict(list)
        for cat, catBitmap in self.categoryBitmaps.items():
            catDistances[cat] = OrderedDict()
            for compareCat, compareBitmap in self.categoryBitmaps.items():
                # Inner order follows the iteration order of
                # self.categoryBitmaps.
                catDistances[cat][compareCat] = self.compareEncoder.compare(
                    catBitmap, compareBitmap)

        if sort:
            # Order each inner dict of catDistances such that the ranking is
            # most to least similar.
            catDistances = self.compareCategories(catDistances)

        if save is not None:
            self.writeOutCategories(
                save, comparisons=catDistances, labelRefs=labelRefs)

        return catDistances

    @staticmethod
    def compareCategories(catDistances, metric="overlappingAll"):
        """
        Calculate category distances. Returns a defaultdict of category keys,
        where values are OrderedDicts sorted such that the most similar
        categories (according to the input metric) are listed first.

        @param catDistances  (dict)   Maps each category to a dict of
                                      compared category -> metrics dict.
        @param metric        (str)    Metric to extract and sort by.
        @return (defaultdict)         category -> OrderedDict of compared
                                      category -> value of the metric.
        """
        # Similarity metrics sort high-to-low; distance metrics low-to-high.
        descending = metric in ClassificationModelEndpoint._DESCENDING_METRICS

        categoryComparisons = defaultdict(list)
        for cat, comparisons in catDistances.items():
            # Reduce each comparison to the single metric of interest.
            metricDict = {compareCat: distances[metric]
                          for compareCat, distances in comparisons.items()}
            categoryComparisons[cat] = OrderedDict(
                sorted(metricDict.items(), key=lambda item: item[1],
                       reverse=descending))

        return categoryComparisons

    @staticmethod
    def query():
        """Not supported by this model; only prints a notice."""
        print("The Classification Endpoint model doesn't support this method.")

    @staticmethod
    def infer():
        """Not supported by this model; only prints a notice."""
        print("The Classification Endpoint model doesn't support this method.")