def queryModel(self, query, preprocess):
    """
    Tokenizes the query, encodes it into a pattern, then queries the classifier
    to infer distances to trained-on samples.

    @param query        (str)   Query text handed to TextPreprocess.tokenize().
    @param preprocess   (bool)  If True, tokenize with ignoreCommon=100,
                                removeStrings=["[identifier deleted]"] and
                                correctSpell=True; otherwise use the
                                tokenizer's defaults.

    @return (list) Two-tuples of sample ID and distance, sorted closest to
                   farthest from the query.

    @raise IndexError if infer() does not return exactly one distance per
                      entry of self.sampleReference.
    """
    texter = TextPreprocess()
    if preprocess:
        sample = texter.tokenize(query,
                                 ignoreCommon=100,
                                 removeStrings=["[identifier deleted]"],
                                 correctSpell=True)
    else:
        sample = texter.tokenize(query)

    allDistances = self.infer(self.encodeSample(sample))
    if len(allDistances) != len(self.sampleReference):
        raise IndexError(
            "Number of protoype distances must match number of "
            "samples trained on.")

    # Model trains multiple times for multi-label samples, so the same ID can
    # show up more than once in sampleReference. Keep only the smallest
    # distance seen for each ID, in a single pass over the paired sequences.
    sampleDistances = {}
    for uniqueID, distance in zip(self.sampleReference, allDistances):
        if (uniqueID not in sampleDistances or
                distance < sampleDistances[uniqueID]):
            sampleDistances[uniqueID] = distance

    return sorted(sampleDistances.items(), key=operator.itemgetter(1))
def setupData(args):
    """
    Performs data preprocessing and setup given the user-specified args.

    @param args (Namespace) User-provided arguments via the cmd line. This
                            function reads args.dataPath, args.numLabels and
                            args.textPreprocess.

    @return (tuple) Two entries: the first is a list of (tokens, labels)
                    two-tuples, one per sample, where labels is an int8 numpy
                    array of indices into the second entry; the second is the
                    list of all distinct label strings found in the data.
    """
    dataDict = readCSV(args.dataPath, 2, args.numLabels)

    # Collect each possible label string into a list, where the indices will be
    # their references throughout the experiment.
    labelReference = list(set(
        itertools.chain.from_iterable(dataDict.values())))

    # labelReference holds distinct values, so a dict lookup yields the same
    # index as list.index() without rescanning the list for every label.
    labelIndex = {label: i for i, label in enumerate(labelReference)}

    # Only values of existing keys are replaced, so the dict does not change
    # size while it is being iterated.
    # NOTE(review): int8 cannot represent indices above 127 -- confirm the
    # number of distinct labels stays below that.
    for sample, labels in dataDict.iteritems():
        dataDict[sample] = numpy.array(
            [labelIndex[label] for label in labels], dtype="int8")

    texter = TextPreprocess()
    if args.textPreprocess:
        samples = [(texter.tokenize(sample,
                                    ignoreCommon=100,
                                    removeStrings=["[identifier deleted]"],
                                    correctSpell=True),
                    labels)
                   for sample, labels in dataDict.iteritems()]
    else:
        samples = [(texter.tokenize(sample), labels)
                   for sample, labels in dataDict.iteritems()]

    return samples, labelReference
def testFunctionsWithoutDataFiles(self):
    """
    Checks that a TextPreprocess object can be built and can tokenize even
    when there are no text data files (corpus text, abbreviations, and
    contractions).
    """
    expectedTokens = [
        "i", "can", "t", "work", "at", "identifier", "deleted", "if", "you",
        "don", "t", "allw", "me", "to", "wfh",
    ]
    sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"

    texter = TextPreprocess(
        corpusTxt="fake.txt", abbrCSV="not_here.csv", contrCSV="not_real.csv")

    self.assertSequenceEqual(texter.tokenize(sentence), expectedTokens)
def testReadExpansionFileWithSuffixes(self):
    """Checks readExpansionFile() output when a list of suffixes is given."""
    expected = {
        "wfh": "work from home",
        "wfhs": "work from homes",
        "wfh's": "work from home's",
    }

    texter = TextPreprocess()
    actual = texter.readExpansionFile("abbreviations.csv", ["", "s", "'s"])

    self.assertEqual(actual, expected)
def _preprocess(self, preprocess):
    """
    Tokenize the samples, with or without preprocessing.

    Populates self.samples as a dict mapping each category of self.dataDict
    to a list of (tokens, data[1], idx) three-tuples. If self.testDict is
    truthy, also populates self.testSamples as a flat list of the same
    three-tuples; otherwise self.testSamples is left untouched.

    @param preprocess (bool) If True, tokenize with ignoreCommon=100,
                             removeStrings=["[identifier deleted]"] and
                             correctSpell=True; otherwise use the tokenizer's
                             defaults.
    """
    texter = TextPreprocess()

    if preprocess:
        def tokenize(text):
            # The string to remove includes its opening bracket, matching
            # every other tokenize() caller; it was previously
            # "identifier deleted]".
            return texter.tokenize(text,
                                   ignoreCommon=100,
                                   removeStrings=["[identifier deleted]"],
                                   correctSpell=True)
    else:
        tokenize = texter.tokenize

    self.samples = {
        category: [(tokenize(data[0]), data[1], idx)
                   for idx, data in samples.iteritems()]
        for category, samples in self.dataDict.iteritems()
    }

    if self.testDict:
        self.testSamples = [(tokenize(data[0]), data[1], idx)
                            for idx, data in self.testDict.iteritems()]
def testTokenizeExpandAbbreviation(self):
    """Checks that tokenize(expandAbbr=True) expands abbreviations."""
    sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"
    expected = [
        "i", "can", "t", "work", "at", "identifier", "deleted", "if", "you",
        "don", "t", "allw", "me", "to", "work", "from", "home",
    ]

    actual = TextPreprocess().tokenize(sentence, expandAbbr=True)

    self.assertSequenceEqual(actual, expected)
def testTokenizeRemoveString(self):
    """Checks that a string passed via removeStrings is left out of the tokens."""
    sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"
    toRemove = ["[identifier deleted]"]
    expected = [
        "i", "can", "t", "work", "at", "if", "you", "don", "t", "allw", "me",
        "to", "wfh",
    ]

    actual = TextPreprocess().tokenize(sentence, removeStrings=toRemove)

    self.assertSequenceEqual(actual, expected)
def testTokenizeExpandContraction(self):
    """Checks that tokenize(expandContr=True) expands contractions."""
    sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"
    expected = [
        "i", "can", "not", "work", "at", "identifier", "deleted", "if", "you",
        "do", "not", "allw", "me", "to", "wfh",
    ]

    actual = TextPreprocess().tokenize(sentence, expandContr=True)

    self.assertSequenceEqual(actual, expected)
def testTokenizeNoPreprocess(self):
    """Checks tokenize() output when no preprocessing option is requested."""
    sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"
    expected = [
        "i", "can", "t", "work", "at", "identifier", "deleted", "if", "you",
        "don", "t", "allw", "me", "to", "wfh",
    ]

    actual = TextPreprocess().tokenize(sentence)

    self.assertSequenceEqual(actual, expected)
def split(self, filePath, numLabels, textPreprocess=False, abbrCSV="",
          contrCSV="", ignoreCommon=100, removeStrings="[identifier deleted]",
          correctSpell=True):
    """
    Split all the comments in a file into tokens. Preprocess if necessary.

    For every sample a list of per-token record dicts is appended to
    self.records. Each record carries the keys "_categories", "_sequenceID",
    "_token", "_reset" and "ID"; "_reset" is 1 on the first token of a sample
    and 0 on the rest. A sample that yields no tokens appends an empty list.

    @param filePath        (str)    Path to csv file
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    @param abbrCSV         (str)    Passed to TextPreprocess; abbreviations are
                                    expanded only when this is not "".
    @param contrCSV        (str)    Passed to TextPreprocess; contractions are
                                    expanded only when this is not "".
    @param removeStrings   (str or list) String(s) to strip from each comment;
                                    a single string is wrapped into a list.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.

    @raise Exception if readCSV() returns None.
    """
    dataDict = readCSV(filePath, numLabels=numLabels)
    if dataDict is None:
        raise Exception("Could not read CSV.")

    # Every other tokenize() caller hands removeStrings over as a list of
    # strings, so wrap a lone string (including the default) to match.
    # NOTE(review): presumably tokenize() iterates over removeStrings, in which
    # case a bare string would be consumed character by character -- confirm.
    # (str, type(u"")) covers str/unicode on Python 2 and str on Python 3.
    if isinstance(removeStrings, (str, type(u""))):
        removeStrings = [removeStrings]

    preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
    expandAbbr = (abbrCSV != "")
    expandContr = (contrCSV != "")

    for i, uniqueID in enumerate(dataDict):
        comment, categories = dataDict[uniqueID]

        # Convert the categories to a space-separated string of their IDs.
        categories = " ".join([str(self.categoryToId[c]) for c in categories])

        if textPreprocess:
            tokens = preprocessor.tokenize(comment, ignoreCommon, removeStrings,
                                           correctSpell, expandAbbr, expandContr)
        else:
            tokens = preprocessor.tokenize(comment)

        # Write the sequence of data records for this sample.
        record = {"_categories": categories, "_sequenceID": i}
        data = []
        reset = 1
        for t in tokens:
            tokenRecord = record.copy()
            tokenRecord["_token"] = t
            tokenRecord["_reset"] = reset
            tokenRecord["ID"] = uniqueID
            reset = 0
            data.append(tokenRecord)

        self.records.append(data)
def _preprocess(self, preprocess):
    """
    Tokenize the samples, with or without preprocessing.

    Stores in self.samples a list of (tokens, data[1]) two-tuples, one per
    entry of self.dataDict.
    """
    texter = TextPreprocess()
    tokenized = []
    for _, data in self.dataDict.iteritems():
        if preprocess:
            tokens = texter.tokenize(data[0],
                                     ignoreCommon=100,
                                     removeStrings=["[identifier deleted]"],
                                     correctSpell=True)
        else:
            tokens = texter.tokenize(data[0])
        tokenized.append((tokens, data[1]))

    self.samples = tokenized
def _preprocess(self, preprocess):
    """
    Tokenize the samples, with or without preprocessing.

    Sets self.samples to a list holding one (tokens, data[1]) two-tuple for
    every entry of self.dataDict.
    """
    texter = TextPreprocess()

    if preprocess:
        def tokenize(text):
            return texter.tokenize(text,
                                   ignoreCommon=100,
                                   removeStrings=["[identifier deleted]"],
                                   correctSpell=True)
    else:
        def tokenize(text):
            return texter.tokenize(text)

    # The dict keys are not needed here, only the stored data.
    self.samples = [(tokenize(data[0]), data[1])
                    for _, data in self.dataDict.iteritems()]
def testFunctionsWithoutDataFiles(self):
    """
    Verifies that TextPreprocess can be instantiated and can tokenize when
    there are no text data files (corpus text, abbreviations, and
    contractions).
    """
    fileArgs = {
        "corpusTxt": "fake.txt",
        "abbrCSV": "not_here.csv",
        "contrCSV": "not_real.csv",
    }
    sentence = "I can't work at [identifier deleted] if you don't allw me to wfh"

    actual = TextPreprocess(**fileArgs).tokenize(sentence)

    self.assertSequenceEqual(
        actual,
        ["i", "can", "t", "work", "at", "identifier", "deleted", "if", "you",
         "don", "t", "allw", "me", "to", "wfh"])
def prepText(text, preprocess=False):
    """
    Tokenizes the text and returns the resulting list of tokens.

    @param text         (str)   Text to tokenize.
    @param preprocess   (bool)  Whether or not to preprocess the text data.

    @return The value returned by TextPreprocess.tokenize().
    """
    texter = TextPreprocess()
    if not preprocess:
        return texter.tokenize(text)

    return texter.tokenize(text,
                           ignoreCommon=100,
                           removeStrings=["[identifier deleted]"],
                           correctSpell=True)
def getUnionEncoding(self, text):
    """
    Encode each token of the input text, take the union, and then sparsify.

    @param text (str) A non-tokenized sample of text.

    @return (dict) The bitmap encoding is at
                   encoding["fingerprint"]["positions"].
    """
    # Tally how often each ON bit appears across the encoded tokens.
    bitCounts = Counter()
    for token in TextPreprocess().tokenize(text):
        fingerprint = self.client.getBitmap(token)["fingerprint"]
        bitCounts.update(fingerprint["positions"])

    positions = self.sparseUnion(bitCounts)
    sparsity = len(positions) * 100 / float(self.n)

    # Populate encoding
    return {
        "text": text,
        "sparsity": sparsity,
        "df": 0.0,
        "height": self.h,
        "width": self.w,
        "score": 0.0,
        "fingerprint": {
            "positions": sorted(positions)
        },
        "pos_types": []
    }
def split(self, filePath, numLabels, textPreprocess=False, abbrCSV="",
          contrCSV="", ignoreCommon=100, removeStrings="[identifier deleted]",
          correctSpell=True):
    """
    Split all the comments in a file into tokens. Preprocess if necessary.

    One list of per-token record dicts is appended to self.records for each
    sample. A record has the keys "_categories", "_sequenceID", "_token",
    "_reset" and "ID"; "_reset" is 1 only on a sample's first token. A sample
    with no tokens contributes an empty list.

    @param filePath        (str)    Path to csv file
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    @param abbrCSV         (str)    Passed to TextPreprocess; abbreviation
                                    expansion is enabled when it is not "".
    @param contrCSV        (str)    Passed to TextPreprocess; contraction
                                    expansion is enabled when it is not "".
    @param removeStrings   (str or list) String(s) to strip from each comment;
                                    a single string is wrapped into a list.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.

    @raise Exception if readCSV() returns None.
    """
    dataDict = readCSV(filePath, numLabels=numLabels)
    if dataDict is None:
        raise Exception("Could not read CSV.")

    # The other tokenize() callers all pass removeStrings as a list of strings;
    # wrap a lone string (including the default) so this call agrees with them.
    # NOTE(review): presumably tokenize() loops over removeStrings, so a bare
    # string would be treated as individual characters -- confirm.
    # (str, type(u"")) covers str/unicode on Python 2 and str on Python 3.
    if isinstance(removeStrings, (str, type(u""))):
        removeStrings = [removeStrings]

    preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
    expandAbbr = (abbrCSV != "")
    expandContr = (contrCSV != "")

    for i, uniqueID in enumerate(dataDict):
        comment, categories = dataDict[uniqueID]

        # Convert the categories to a space-separated string of their IDs.
        categories = " ".join([str(self.categoryToId[c]) for c in categories])

        if textPreprocess:
            tokens = preprocessor.tokenize(
                comment, ignoreCommon, removeStrings, correctSpell, expandAbbr,
                expandContr)
        else:
            tokens = preprocessor.tokenize(comment)

        # Write the sequence of data records for this sample.
        record = {"_categories": categories, "_sequenceID": i}
        data = []
        reset = 1
        for t in tokens:
            tokenRecord = record.copy()
            tokenRecord["_token"] = t
            tokenRecord["_reset"] = reset
            tokenRecord["ID"] = uniqueID
            reset = 0
            data.append(tokenRecord)

        self.records.append(data)
def setupData(args):
    """
    Performs data preprocessing and setup given the user-specified args.

    @param args (Namespace) User-provided arguments via the cmd line. This
                            function reads args.dataPath, args.numLabels,
                            args.abbrCSV, args.contrCSV and args.textPreprocess.

    @return (tuple) Two entries: the first is a list of (tokens, labels)
                    two-tuples, one per sample, where labels is an int8 numpy
                    array of indices into the second entry; the second is the
                    list of all distinct label strings found in the data.
    """
    dataDict = readCSV(args.dataPath, 2, args.numLabels)

    # Collect each possible label string into a list, where the indices will be
    # their references throughout the experiment. Each dataDict value is a
    # (comment, labels) pair.
    labelReference = list(set(
        itertools.chain.from_iterable(
            data[1] for data in dataDict.values())))

    # labelReference holds distinct values, so a dict lookup yields the same
    # index as list.index() without rescanning the list for every label.
    labelIndex = {label: i for i, label in enumerate(labelReference)}

    # Only values of existing keys are replaced, so the dict does not change
    # size while it is being iterated.
    # NOTE(review): int8 cannot represent indices above 127 -- confirm the
    # number of distinct labels stays below that.
    for idx, data in dataDict.iteritems():
        comment, labels = data
        dataDict[idx] = (comment,
                         numpy.array([labelIndex[label] for label in labels],
                                     dtype="int8"))

    texter = TextPreprocess(abbrCSV=args.abbrCSV, contrCSV=args.contrCSV)
    expandAbbr = (args.abbrCSV != "")
    expandContr = (args.contrCSV != "")

    if args.textPreprocess:
        samples = [(texter.tokenize(data[0],
                                    ignoreCommon=100,
                                    removeStrings=["[identifier deleted]"],
                                    correctSpell=True,
                                    expandAbbr=expandAbbr,
                                    expandContr=expandContr),
                    data[1])
                   for _, data in dataDict.iteritems()]
    else:
        samples = [(texter.tokenize(data[0]), data[1])
                   for _, data in dataDict.iteritems()]

    return samples, labelReference
def _preprocess(self, preprocess):
    """
    Tokenize the samples, with or without preprocessing.

    Sets self.samples to a dict that maps every category in self.dataDict to
    a list of (tokens, data[1], idx) three-tuples. When self.testDict is
    truthy, self.testSamples is set to a flat list of the same three-tuples
    built from it; otherwise self.testSamples is not assigned.

    @param preprocess (bool) If True, tokenize with ignoreCommon=100,
                             removeStrings=["[identifier deleted]"] and
                             correctSpell=True; otherwise use the tokenizer's
                             defaults.
    """
    texter = TextPreprocess()

    if preprocess:
        def tokenize(text):
            # The removal string carries its opening bracket, as in every
            # other tokenize() caller; it used to read "identifier deleted]".
            return texter.tokenize(text,
                                   ignoreCommon=100,
                                   removeStrings=["[identifier deleted]"],
                                   correctSpell=True)
    else:
        tokenize = texter.tokenize

    self.samples = {
        category: [(tokenize(data[0]), data[1], idx)
                   for idx, data in samples.iteritems()]
        for category, samples in self.dataDict.iteritems()
    }

    if self.testDict:
        self.testSamples = [(tokenize(data[0]), data[1], idx)
                            for idx, data in self.testDict.iteritems()]
def generateSequence(self, text, preprocess=False):
    """
    Build the network-data-format sequence for a piece of text.

    The text is tokenized without preprocessing; the preprocess argument is
    currently ignored. self.sequenceCount is incremented on every call, and
    the sequence is formatted with category [-1] and the ID "q".

    @param text         (str)   Text to convert into a sequence.
    @param preprocess   (bool)  Unused for now (see TODO below).

    @return The value returned by self._formatSequence().
    """
    # TODO: enable text preprocessing; abstract out the logic in split() into a
    # common method.
    tokens = TextPreprocess().tokenize(text)

    self.sequenceCount += 1
    return self._formatSequence(tokens, [-1], self.sequenceCount, "q")
def run(args):
    """
    The experiment is configured to run on question response data.

    The runner sets up the data path to such that the experiment runs on a
    single data file located in the nupic.fluent/data directory.
    The data path MUST BE SPECIFIED at the cmd line, e.g. from the fluent dir:

    python experiments/random_baseline_runner.py data/sample_reviews/sample_reviews_data_training.csv

    To run k-folds cross validation, arguments must be: kFolds > 1, train = False,
    test = False. To run either training or testing, kFolds = 1.

    @param args (Namespace) Reads dataFile, resultsDir, expName, modelName,
                            kFolds, train, test, load, verbosity and
                            expectationDataPath.

    @raise ValueError if the data file does not exist, kFolds is not an int
                      >= 1, both train and test are set, or train/test is
                      combined with kFolds > 1.
    """
    start = time.time()

    # Setup directories.
    root = os.path.dirname(__file__)
    dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
    modelPath = os.path.abspath(
        os.path.join(root, args.resultsDir, args.expName, args.modelName))
    if not os.path.exists(modelPath):
        os.makedirs(modelPath)

    # Verify input params.
    if not os.path.isfile(dataPath):
        raise ValueError("Invalid data path.")
    if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
        raise ValueError("Invalid value for number of cross-validation folds.")
    if args.train and args.test:
        raise ValueError("Run training and testing independently.")
    if (args.train or args.test) and args.kFolds > 1:
        raise ValueError("Experiment runs either k-folds CV or training/testing, "
                         "not both.")

    # Load or init model.
    if args.load:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # model.pkl files that come from a trusted source.
        with open(os.path.join(modelPath, "model.pkl"), "rb") as f:
            model = pkl.load(f)
        print("Model loaded from \'{0}\'.".format(modelPath))
    else:
        model = ClassificationModelRandomSDR(verbosity=args.verbosity)

    # Get and prep data.
    texter = TextPreprocess()
    samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
    labelReference = list(set(labels))
    labels = numpy.array([labelReference.index(l) for l in labels], dtype=int)
    # Fold size, floored. NOTE(review): when len(samples) is not a multiple of
    # kFolds the trailing remainder never lands in an evaluation fold.
    split = len(samples) // args.kFolds
    samples = [texter.tokenize(sample,
                               ignoreCommon=100,
                               removeStrings=["[identifier deleted]"],
                               correctSpell=True)
               for sample in samples]
    if args.verbosity > 1:
        for i, s in enumerate(samples):
            # Single formatted argument: prints the same text under the
            # Python 2 print statement and the print() function.
            print("%s %s %s" % (i, s, labelReference[labels[i]]))
    patterns = [[model.encodePattern(t) for t in tokens] for tokens in samples]

    # Either we train on all the data, test on all the data, or run k-fold CV.
    if args.train:
        training(model, [(p, labels[i]) for i, p in enumerate(patterns)])
    elif args.test:
        # NOTE(review): these results are not evaluated or written anywhere.
        trialResults = testing(model,
                               [(p, labels[i]) for i, p in enumerate(patterns)])
    elif args.kFolds > 1:
        intermResults = []
        predictions = []
        for k in range(args.kFolds):
            # Train the model on a subset, and hold the evaluation subset.
            model.resetModel()
            evalIndices = range(k*split, (k+1)*split)
            # A set makes each membership test constant-time.
            heldOut = set(evalIndices)
            trainIndices = [i for i in range(len(samples)) if i not in heldOut]

            print("Training for CV fold {0}.".format(k))
            training(model, [(patterns[i], labels[i]) for i in trainIndices])

            print("Evaluating for trial {0}.".format(k))
            trialResults = testing(
                model, [(patterns[i], labels[i]) for i in evalIndices])

            if args.expectationDataPath:
                # Keep the predicted labels (top prediction only) for later.
                p = [l if l else [None] for l in trialResults[0]]
                predictions.append(
                    [labelReference[idx[0]] if idx[0] is not None else '(none)'
                     for idx in p])

            print("Calculating intermediate results for this fold.")
            result = model.evaluateTrialResults(
                trialResults, labelReference, evalIndices)
            intermResults.append(result)
            result[1].to_csv(os.path.join(
                modelPath, "evaluation_fold_" + str(k) + ".csv"))

        print("Calculating cumulative results for {0} trials.".format(
            args.kFolds))
        results = model.evaluateFinalResults(intermResults)
        results["total_cm"].to_csv(
            os.path.join(modelPath, "evaluation_totals.csv"))
        if args.expectationDataPath:
            computeExpectedAccuracy(
                list(itertools.chain.from_iterable(predictions)),
                os.path.abspath(
                    os.path.join(root, '../..', args.expectationDataPath)))

    print("Calculating random classifier results for comparison.")
    print(model.classifyRandomly(labels))

    print("Saving model to \'{0}\' directory.".format(modelPath))
    with open(os.path.join(modelPath, "model.pkl"), "wb") as f:
        pkl.dump(model, f)
    print("Experiment complete in {0:.2f} seconds.".format(time.time() - start))
def testReadExpansionFileNoSuffixes(self):
    """Checks readExpansionFile() output when no suffixes are given."""
    expected = {"wfh": "work from home"}

    actual = TextPreprocess().readExpansionFile("abbreviations.csv")

    self.assertEqual(actual, expected)
def run(args):
    """
    The experiment is configured to run on question response data.

    To run k-folds cross validation, arguments must be: kFolds > 1, train = False,
    test = False. To run either training or testing, kFolds = 1.

    @param args (Namespace) Reads dataFile, resultsDir, expName, modelName,
                            modelModuleName, kFolds, train, test, load,
                            verbosity and expectationDataPath.

    @raise ValueError   if the data file does not exist, kFolds is not an int
                        >= 1, both train and test are set, or train/test is
                        combined with kFolds > 1.
    @raise RuntimeError if an ImportError occurs while importing and
                        instantiating the model class.
    """
    start = time.time()

    # Setup directories.
    root = os.path.dirname(__file__)
    dataPath = os.path.abspath(os.path.join(root, '../..', args.dataFile))
    modelPath = os.path.abspath(
        os.path.join(root, args.resultsDir, args.expName, args.modelName))
    if not os.path.exists(modelPath):
        os.makedirs(modelPath)

    # Verify input params.
    if not os.path.isfile(dataPath):
        raise ValueError("Invalid data path.")
    if (not isinstance(args.kFolds, int)) or (args.kFolds < 1):
        raise ValueError("Invalid value for number of cross-validation folds.")
    if args.train and args.test:
        raise ValueError("Run training and testing independently.")
    if (args.train or args.test) and args.kFolds > 1:
        raise ValueError("Experiment runs either k-folds CV or training/testing, "
                         "not both.")

    # Load or init model.
    if args.load:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # model.pkl files that come from a trusted source.
        with open(os.path.join(modelPath, "model.pkl"), "rb") as f:
            model = pkl.load(f)
        print("Model loaded from \'{0}\'.".format(modelPath))
    else:
        try:
            module = __import__(args.modelModuleName, {}, {}, args.modelName)
            modelClass = getattr(module, args.modelName)
            model = modelClass(verbosity=args.verbosity)
        except ImportError:
            raise RuntimeError("Could not find model class \'%s\' to import."
                               % args.modelName)

    print("Reading in data and preprocessing.")
    texter = TextPreprocess()
    samples, labels = readCSV(dataPath, 2, [3])  # Y data, [3] -> range(3,6)
    labelReference = list(set(labels))
    labels = numpy.array([labelReference.index(l) for l in labels],
                         dtype="int8")
    samples = [texter.tokenize(sample,
                               ignoreCommon=100,
                               removeStrings=["[identifier deleted]"],
                               correctSpell=True)
               for sample in samples]
    if args.verbosity > 1:
        for i, s in enumerate(samples):
            # Single formatted argument: prints the same text under the
            # Python 2 print statement and the print() function.
            print("%s %s %s" % (i, s, labelReference[labels[i]]))
    patterns = [model.encodePattern(s) for s in samples]

    # Either we train on all the data, test on all the data, or run k-fold CV.
    if args.train:
        training(model, [(p, labels[i]) for i, p in enumerate(patterns)])
    elif args.test:
        results = testing(model,
                          [(p, labels[i]) for i, p in enumerate(patterns)])
        calculateTrialResults(model, results, labelReference,
                              xrange(len(samples)),
                              os.path.join(modelPath, "test_results.csv"))
    elif args.kFolds > 1:
        # Run k-folds cross validation -- train the model on a subset, and
        # evaluate on the remaining subset.
        partitions = KFolds(args.kFolds).split(xrange(len(samples)))
        intermResults = []
        predictions = []
        for k in xrange(args.kFolds):
            print("Training and testing for CV fold {0}.".format(k))
            trialResults = runExperiment(model, patterns, labels, partitions[k])

            if args.expectationDataPath:
                # Keep the predicted labels (top prediction only) for later.
                p = [l if l else [None] for l in trialResults[0]]
                predictions.append(
                    [labelReference[idx[0]] if idx[0] is not None else '(none)'
                     for idx in p])

            print("Calculating intermediate results for this fold. "
                  "Writing to CSV.")
            # partitions[k][1] is handed on as this fold's evaluation indices.
            intermResults.append(calculateTrialResults(
                model, trialResults, labelReference, partitions[k][1],
                os.path.join(modelPath,
                             "evaluation_fold_" + str(k) + ".csv")))

        print("Calculating cumulative results for {0} trials.".format(
            args.kFolds))
        results = model.evaluateFinalResults(intermResults)
        results["total_cm"].to_csv(
            os.path.join(modelPath, "evaluation_totals.csv"))
        if args.expectationDataPath:
            computeExpectedAccuracy(
                list(itertools.chain.from_iterable(predictions)),
                os.path.abspath(
                    os.path.join(root, '../..', args.expectationDataPath)))

    print("Calculating random classifier results for comparison.")
    print(model.classifyRandomly(labels))

    print("Saving model to \'{0}\' directory.".format(modelPath))
    with open(os.path.join(modelPath, "model.pkl"), "wb") as f:
        pkl.dump(model, f)
    print("Experiment complete in {0:.2f} seconds.".format(time.time() - start))