def queryModel(self, query, preprocess=False):
  """
  Preprocesses the query, encodes it into a pattern, then queries the
  classifier to infer distances to trained-on samples.

  @return (list) Two-tuples of sample ID and distance, sorted closest to
      farthest from the query.
  """
  if preprocess:
    sample = TextPreprocess().tokenize(query,
                                       ignoreCommon=100,
                                       removeStrings=["[identifier deleted]"],
                                       correctSpell=True)
  else:
    sample = TextPreprocess().tokenize(query)

  encodedQuery = self.encodeSample(sample)
  allDistances = self.infer(encodedQuery)

  if len(allDistances) != len(self.sampleReference):
    raise IndexError("Number of prototype distances must match number of "
                     "samples trained on.")

  # Each trained prototype maps to a sample ID; a sample's distance to the
  # query is the minimum over its prototypes.
  sampleDistances = {}
  for uniqueID in self.sampleReference:
    sampleDistances[uniqueID] = min(
        [allDistances[i] for i, x in enumerate(self.sampleReference)
         if x == uniqueID])

  return sorted(sampleDistances.items(), key=operator.itemgetter(1))
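# Standalone illustration of the distance-aggregation step above, with
# made-up distances and sample IDs. Each entry of sampleReference names the
# sample a trained prototype came from; the query's distance to a sample is
# the minimum over that sample's prototypes.
import operator

sampleReference = ["doc0", "doc0", "doc1", "doc2", "doc1"]
allDistances = [0.42, 0.17, 0.55, 0.30, 0.61]

sampleDistances = {}
for uniqueID in sampleReference:
  sampleDistances[uniqueID] = min(
      allDistances[i] for i, x in enumerate(sampleReference) if x == uniqueID)

print sorted(sampleDistances.items(), key=operator.itemgetter(1))
# -> [('doc0', 0.17), ('doc2', 0.3), ('doc1', 0.55)]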
def testTokenizeExpandContraction(self):
  """Tests contractions are expanded."""
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess()
  expected_tokens = ["i", "can", "not", "work", "at", "identifier", "deleted",
                     "if", "you", "do", "not", "allw", "me", "to", "wfh"]

  tokens = processor.tokenize(text, expandContr=True)

  self.assertSequenceEqual(tokens, expected_tokens)
def testTokenizeRemoveString(self):
  """Tests a provided string is ignored."""
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess()
  expected_tokens = ["i", "can", "t", "work", "at", "if", "you", "don", "t",
                     "allw", "me", "to", "wfh"]

  tokens = processor.tokenize(text, removeStrings=["[identifier deleted]"])

  self.assertSequenceEqual(tokens, expected_tokens)
def testTokenizeNoPreprocess(self):
  """Tests none of the preprocessing methods are used."""
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess()
  expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                     "if", "you", "don", "t", "allw", "me", "to", "wfh"]

  tokens = processor.tokenize(text)

  self.assertSequenceEqual(tokens, expected_tokens)
def testReadExpansionFileWithSuffixes(self):
  """Tests TextPreprocess reads csv files correctly and adds suffixes."""
  processor = TextPreprocess()
  suffixes = ["", "s", "'s"]

  abbreviations = processor.readExpansionFile("abbreviations.csv", suffixes)

  expectedAbbreviations = {"wfh": "work from home",
                           "wfhs": "work from homes",
                           "wfh's": "work from home's"}
  self.assertEqual(abbreviations, expectedAbbreviations)
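# Hedged sketch of the suffix behavior the test above implies: each suffix is
# appended to both the abbreviation and its expansion. The one-mapping-per-row
# CSV format (e.g. a row "wfh,work from home") is an assumption inferred from
# the expected dict, not confirmed by this excerpt.
base = {"wfh": "work from home"}
suffixes = ["", "s", "'s"]
expanded = dict((abbr + sfx, full + sfx)
                for abbr, full in base.items()
                for sfx in suffixes)
print expanded
# -> {"wfh": "work from home", "wfhs": "work from homes", "wfh's": "work from home's"}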
def split(self, filePath=None, numLabels=3, textPreprocess=False,
          dataDict=None, abbrCSV="", contrCSV="", ignoreCommon=100,
          removeStrings=("[identifier deleted]",), correctSpell=True):
  """
  Split all the comments in a file into tokens, with or without preprocessing.
  Specifying both filePath and dataDict will prefer filePath.

  @param filePath        (str)    Path to csv file.
  @param dataDict        (dict)   Data as returned by readCSV().
  @param numLabels       (int)    Number of columns of category labels.
  @param textPreprocess  (bool)   True will preprocess text while tokenizing.

  @return dataDict       (dict)   Data as read in from filePath.

  Please see TextPreprocess tokenize() for the other parameters; they are
  only used when textPreprocess is True.
  """
  if filePath:
    dataDict = readCSV(filePath, numLabels=numLabels)
  if dataDict is None:
    raise Exception("No data given, or could not read CSV.")

  preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
  expandAbbr = (abbrCSV != "")
  expandContr = (contrCSV != "")

  for recordNum, record in dataDict.iteritems():
    comment, categories, uniqueID = record
    # Convert the categories to a string of their IDs.
    categories = string.join(
        [str(self.categoryToId[c]) for c in categories])

    if textPreprocess:
      tokens, _ = preprocessor.tokenizeAndFilter(
          comment, ignoreCommon, removeStrings, correctSpell, expandAbbr,
          expandContr)
    else:
      tokens = preprocessor.tokenize(comment)

    data = self._formatSequence(tokens, categories, recordNum, uniqueID)

    self.records.append(data)
    self.sequenceCount += 1

  return dataDict
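# Hypothetical call of split(), assuming a generator-style owner object with
# categoryToId, records, and sequenceCount attributes (the class itself is
# not shown in this excerpt):
#
#   dataDict = generator.split(filePath="comments.csv", numLabels=3,
#                              textPreprocess=True,
#                              abbrCSV="abbreviations.csv",
#                              contrCSV="contractions.csv")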
def testFunctionsWithoutDataFiles(self):
  """
  Ensures a TextPreprocess object can be created and tokenize when there are
  no text data files (corpus text, abbreviations, and contractions).
  """
  text = "I can't work at [identifier deleted] if you don't allw me to wfh"
  processor = TextPreprocess(corpusTxt="fake.txt",
                             abbrCSV="not_here.csv",
                             contrCSV="not_real.csv")

  tokens = processor.tokenize(text)

  expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                     "if", "you", "don", "t", "allw", "me", "to", "wfh"]
  self.assertSequenceEqual(tokens, expected_tokens)
def trainModelWithText(model, trainingData):
  """
  Train the given model on trainingData. This is (essentially) the same
  training method as in the research repo's imbu_runner.py.
  """
  textPreprocessor = TextPreprocess()
  for seqId, (text, _, _) in enumerate(trainingData.values()):
    textTokens = textPreprocessor.tokenize(text)  # TODO: use model's tokenization method instead
    lastToken = len(textTokens) - 1
    for i, token in enumerate(textTokens):
      # Use the sequence's ID as the category label; reset the model's
      # sequence state after the last token.
      model.trainText(token,
                      [seqId],
                      sequenceId=seqId,
                      reset=int(i == lastToken))
def queryModel(self, query):
  """
  Preprocesses the query, encodes it into a pattern, then queries the
  classifier to infer distances to trained-on samples.

  @return (list) Two-tuples of sample ID and distance, sorted closest to
      farthest from the query.
  """
  sample = TextPreprocess().tokenize(query)
  encodedQuery = self.encodeSample(sample)
  # TODO: with new CioEncoder, switch to encode by token (below).
  # encodedQuery = [{"text": token, "bitmap": self.encodeToken(token)}
  #                 for token in sample]

  allDistances = self.infer(encodedQuery)

  if len(allDistances) != len(self.sampleReference):
    raise IndexError("Number of prototype distances must match number of "
                     "samples trained on.")

  sampleDistances = {}
  for uniqueID in self.sampleReference:
    sampleDistances[uniqueID] = min(
        [allDistances[i] for i, x in enumerate(self.sampleReference)
         if x == uniqueID])

  return sorted(sampleDistances.items(), key=operator.itemgetter(1))
def prepText(text, preprocess=False):
  """
  Returns a list of the text tokens.

  @param preprocess (bool) Whether or not to preprocess the text data.
  """
  if preprocess:
    sample = TextPreprocess().tokenize(text,
                                       ignoreCommon=100,
                                       removeStrings=["[identifier deleted]"],
                                       correctSpell=True)
  else:
    sample = TextPreprocess().tokenize(text)

  return sample
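# Example use of prepText(); the un-preprocessed tokenization matches the
# behavior shown in the tests above (lowercased, split on non-letters):
#
#   tokens = prepText("I can't wfh today")
#   # -> ["i", "can", "t", "wfh", "today"]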
def getUnionEncoding(self, text):
  """
  Encode each token of the input text, take the union, and then sparsify.

  @param text   (str)   A non-tokenized sample of text.
  @return       (dict)  The bitmap encoding is at
                        encoding["fingerprint"]["positions"].
  """
  tokens = TextPreprocess().tokenize(text)
  positions = self.getUnionEncodingFromTokens(tokens)

  # Populate encoding
  encoding = {
      "text": text,
      "sparsity": len(positions) / float(self.n),
      "df": 0.0,
      "height": self.height,
      "width": self.width,
      "score": 0.0,
      "fingerprint": {
          "positions": sorted(positions)
      },
      "pos_types": []
  }

  return encoding
def testIndexMapping(self):
  originalWords = self.testDocuments[2].split(" ")
  tokenList, mapping = TextPreprocess().tokenizeAndFilter(
      self.testDocuments[2],
      ignoreCommon=50,
      removeStrings=["[identifier deleted]"],
      correctSpell=True,
      expandAbbr=True,
      expandContr=True)

  self.assertEqual(len(tokenList), len(mapping),
                   "There should be one mapping entry for each token.")

  # Test filtering results
  self.assertEqual("therefore", tokenList[0], "Spelling not corrected.")
  self.assertEqual("discrete", tokenList[24], "Spelling not corrected.")
  self.assertSequenceEqual(["hierarchical", "temporal", "memory"],
                           tokenList[1:4],
                           "Abbreviation 'HTM' not expanded.")
  self.assertNotIn("but", tokenList, "Common word 'but' not removed.")
  self.assertNotIn("not", tokenList, "Common word 'not' not removed.")
  self.assertIn("does", tokenList, "Contraction not expanded to 'does not'.")

  # Test some token-to-word mappings
  mappedWords = [originalWords[i] for i in mapping]
  self.assertNotEqual(len(originalWords), len(mappedWords))
  for word in mappedWords[1:4]:
    self.assertEqual("HTM", word, "Tokens don't map to 'HTM' as expected.")
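# Illustration of the mapping semantics tested above: mapping[i] is the index
# of the original whitespace-split word that produced token i, so one word
# can yield several tokens. E.g. if originalWords[1] is "HTM", abbreviation
# expansion yields tokens ["hierarchical", "temporal", "memory"] whose
# mapping entries are [1, 1, 1].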
def getUnionEncoding(self, text):
  """
  Encode each token of the input text, take the union, and then sparsify.

  @param text   (str)   A non-tokenized sample of text.
  @return       (dict)  The bitmap encoding is at
                        encoding["fingerprint"]["positions"].
  """
  tokens = TextPreprocess().tokenize(text)

  # Count the ON bits represented in the encoded tokens.
  counts = Counter()
  for t in tokens:
    bitmap = self._getWordBitmap(t)
    counts.update(bitmap)

  positions = self.sparseUnion(counts)

  # Populate encoding
  encoding = {
      "text": text,
      "sparsity": len(positions) / float(self.n),
      "df": 0.0,
      "height": self.height,
      "width": self.width,
      "score": 0.0,
      "fingerprint": {
          "positions": sorted(positions)
      },
      "pos_types": []
  }

  return encoding
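# Hedged, standalone sketch of the union-then-sparsify idea above: count how
# often each bit is ON across token bitmaps, then keep the most frequent bits
# up to a target sparsity. Selecting by top counts is an assumption about
# what sparseUnion() does; the real method may differ.
from collections import Counter

n = 100                # total bits (self.n in the encoder)
maxSparsity = 0.05     # keep at most 5% of the bits
tokenBitmaps = [[3, 7, 42], [7, 42, 99], [7, 15]]

counts = Counter()
for bitmap in tokenBitmaps:
  counts.update(bitmap)

maxBits = int(n * maxSparsity)
positions = sorted(bit for bit, _ in counts.most_common(maxBits))
print positions  # the (at most) 5 most frequently ON bits, in sorted order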
def testWindowEncodings(self):
  """Test the CioEncoder for the sliding window encodings."""
  cio = CioEncoder(fingerprintType=EncoderTypes.word)

  text = """
    I grok people. I am people, so now I can say it in people talk. I've found
    out why people laugh. They laugh because it hurts so much, because it's
    the only thing that'll make it stop hurting."""

  tokens = TextPreprocess().tokenize(text)

  encodingDicts = cio.getWindowEncoding(tokens, minSparsity=0.19)

  # Test that only dense windows get encoded
  self.assertTrue(len(tokens) > len(encodingDicts),
                  "Returned incorrect number of window encodings.")

  # Test window
  windowEncoding = getTestData("cio_encoding_window.json")
  self.assertEqual(windowEncoding["text"], encodingDicts[-1]["text"],
                   "Window encoding represents the wrong text.")
  self.assertTrue(encodingDicts[-1]["sparsity"] <= cio.unionSparsity,
                  "Sparsity for large window is larger than the max.")
  self.assertSequenceEqual(windowEncoding["bitmap"],
                           encodingDicts[-1]["bitmap"].tolist(),
                           "Window encoding's bitmap is not as expected.")
def queryModel(self, query, preprocess=False):
  """
  Preprocesses the query, encodes it into a pattern, then queries the
  classifier to infer distances to trained-on samples.

  @return (list) Two-tuples of sample ID and distance, sorted closest to
      farthest from the query.
  """
  if preprocess:
    sample = TextPreprocess().tokenize(query,
                                       ignoreCommon=100,
                                       removeStrings=["[identifier deleted]"],
                                       correctSpell=True)
  else:
    sample = TextPreprocess().tokenize(query)

  # Get window patterns for the query, but if the query is so small that the
  # window encodings are too sparse, default to a pure union encoding of the
  # full sample.
  encodedQuery = self.encodeSample(sample)
  if len(encodedQuery) == 0:
    sample = " ".join(sample)
    fpInfo = self.encoder.getUnionEncoding(sample)
    encodedQuery = [{
        "text": fpInfo["text"],
        "sparsity": fpInfo["sparsity"],
        "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
    }]

  allDistances = self.infer(encodedQuery)

  if len(allDistances) != len(self.sampleReference):
    raise IndexError("Number of prototype distances must match number of "
                     "samples trained on.")

  sampleDistances = {}
  for uniqueID in self.sampleReference:
    sampleDistances[uniqueID] = min(
        [allDistances[i] for i, x in enumerate(self.sampleReference)
         if x == uniqueID])

  return sorted(sampleDistances.items(), key=operator.itemgetter(1))
def tokenize(self, inputText):
  """
  Given a bunch of text (could be several sentences), return a single list
  containing individual tokens. The text is filtered if self.filterText is
  set.

  @param inputText  (str)   A bunch of text.
  @return sample    (list)  A list of text tokens.
  @return mapping   (list)  Maps the original words to the sample tokens;
                            see the TextPreprocess method for details.
  """
  if self.filterText:
    sample, mapping = TextPreprocess().tokenizeAndFilter(
        inputText, **self.filterOptions)
  else:
    sample, mapping = TextPreprocess().tokenizeAndFilter(inputText)

  return sample, mapping
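# self.filterOptions is expected to be a kwargs dict for tokenizeAndFilter();
# the hardcoded variant of this method elsewhere in this codebase suggests
# values like:
#
#   self.filterOptions = dict(ignoreCommon=100,
#                             removeStrings=["[identifier deleted]"],
#                             correctSpell=True)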
def tokenize(self, inputText):
  """
  Given a bunch of text (could be several sentences), return a single list
  containing individual tokens. The text is filtered if self.filterText is
  set.

  @param inputText  (str)   A bunch of text.
  @return sample    (list)  A list of text tokens.
  @return mapping   (list)  Maps the original words to the sample tokens;
                            see the TextPreprocess method for details.
  """
  if self.filterText:
    sample, mapping = TextPreprocess().tokenizeAndFilter(
        inputText,
        ignoreCommon=100,
        removeStrings=["[identifier deleted]"],
        correctSpell=True)
  else:
    sample, mapping = TextPreprocess().tokenizeAndFilter(inputText)

  return sample, mapping
def generateSequence(self, text, preprocess=False):
  """
  Return a list of lists representing the text sequence in network data
  format. Does not preprocess the text; the preprocess argument is currently
  unused.
  """
  # TODO: enable text preprocessing; abstract out the logic in split() into
  # a common method.
  tokens = TextPreprocess().tokenize(text)
  cat = [-1]
  self.sequenceCount += 1
  uniqueID = "q"

  data = self._formatSequence(tokens, cat, self.sequenceCount - 1, uniqueID)

  return data
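# Hypothetical call, mirroring split() above: the query text becomes a single
# sequence in network data format, with category -1 and uniqueID "q":
#
#   data = generator.generateSequence("how do I work from home?")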
def testReadExpansionFileNoSuffixes(self):
  """Tests TextPreprocess reads csv files correctly."""
  processor = TextPreprocess()

  abbreviations = processor.readExpansionFile("abbreviations.csv")

  expectedAbbreviations = {"wfh": "work from home"}
  self.assertEqual(abbreviations, expectedAbbreviations)
def countBitFrequenciesForTerms(client, lines,
                                acceptanceProbability=0.1,
                                usePlaceholderEncoding=True,
                                percentSparsity=0.0102):
  # Accumulate counts by inplace-adding sparse matrices
  skippedWords = {}
  counts = SparseMatrix()
  width = RETINA_SIZES[client.retina]["width"]
  height = RETINA_SIZES[client.retina]["height"]
  counts.resize(1, width * height)

  # Pre-allocate buffer sparse matrix
  sparseBitmap = SparseMatrix()
  sparseBitmap.resize(1, width * height)

  # Accumulate counts for each bit for each word
  numWords = 0
  numLines = 0
  for line in lines:
    tokens = TextPreprocess().tokenize(line)
    for term in tokens:

      p = random.uniform(0, 1)
      if p <= acceptanceProbability:
        if usePlaceholderEncoding:
          random.seed(term)
          bitmap = random.sample(xrange(width * height),
                                 int(width * height * percentSparsity))
          bitmap.sort()
          random.seed(p)
        else:
          try:
            bitmap = client.getBitmap(term)["fingerprint"]["positions"]
          except Exception as err:
            print "Skipping '{}', reason: {}".format(term, str(err))
            continue

        if not bitmap:
          skippedWords[term] = skippedWords.get(term, 0) + 1
          # print "Skipping '{}', reason: empty".format(term)
          continue

        sparseBitmap.setRowFromSparse(0, bitmap, [1] * len(bitmap))
        counts += sparseBitmap
        numWords += 1

    numLines += 1
    if numLines % 1000 == 0:
      print "...processed=", numLines, "lines and", numWords, "words"

  # Compute normalized version of counts as a separate matrix
  frequencies = SparseMatrix()
  frequencies.resize(1, width * height)
  frequencies.copy(counts)
  frequencies.divide(float(numWords))

  # Wrap up by printing some statistics and then saving the normalized version
  print "Processed", numLines, "lines"
  printFrequencyStatistics(counts, frequencies, numWords, width * height)

  frequencyFilename = "bit_frequencies_" + client.retina + ".pkl"
  print "Saving frequency matrix in", frequencyFilename
  with open(frequencyFilename, "wb") as frequencyPickleFile:
    pickle.dump(frequencies, frequencyPickleFile)

  print "These words were skipped N times because of empty bitmap result"
  print skippedWords

  return counts
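# Hypothetical invocation: `client` is a Cortical.io retina client whose
# retina attribute keys into RETINA_SIZES. With usePlaceholderEncoding=True
# (the default) no API calls are made; each term gets a deterministic,
# seeded random bitmap instead:
#
#   with open("corpus.txt") as f:
#     counts = countBitFrequenciesForTerms(client, f.readlines())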