def queryModel(self, query, preprocess=False):
        """
    Preprocesses the query, encodes it into a pattern, then queries the
    classifier to infer distances to trained-on samples.
    @return       (list)          Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    """
        if preprocess:
            sample = TextPreprocess().tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(query)

        encodedQuery = self.encodeSample(sample)

        allDistances = self.infer(encodedQuery)

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of protoype distances must match number of "
                "samples trained on.")

        sampleDistances = defaultdict()
        for uniqueID in self.sampleReference:
            sampleDistances[uniqueID] = min([
                allDistances[i] for i, x in enumerate(self.sampleReference)
                if x == uniqueID
            ])

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))
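The reduction above from per-prototype distances to per-sample distances can be illustrated in isolation. This is a minimal sketch with made-up IDs and distances, not tied to any particular model class; it mirrors the min-per-uniqueID logic of queryModel():

from collections import defaultdict
import operator

# Hypothetical prototype-to-sample references and distances, standing in for
# self.sampleReference and the classifier's output.
sampleReference = ["a", "a", "b", "a"]
allDistances = [0.42, 0.10, 0.30, 0.55]

sampleDistances = defaultdict()
for uniqueID in sampleReference:
    # Keep the closest prototype for each trained-on sample.
    sampleDistances[uniqueID] = min(
        allDistances[i] for i, x in enumerate(sampleReference) if x == uniqueID)

ranked = sorted(sampleDistances.items(), key=operator.itemgetter(1))
# ranked == [("a", 0.10), ("b", 0.30)]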
  def testTokenizeExpandContraction(self):
    """Tests contractions are expanded."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "not", "work", "at", "identifier", "deleted",
                       "if", "you", "do", "not", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text, expandContr=True)
    self.assertSequenceEqual(tokens, expected_tokens)
  def testTokenizeRemoveString(self):
    """Tests a provided string is ignored."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "t", "work", "at", "if", "you", "don",
                       "t", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text, removeStrings=["[identifier deleted]"])
    self.assertSequenceEqual(tokens, expected_tokens)
  def testTokenizeNoPreprocess(self):
    """Tests none of the preprocessing methods are used."""
    text = "I can't work at [identifier deleted] if you don't allw me to wfh"
    processor = TextPreprocess()

    expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                       "if", "you", "don", "t", "allw", "me", "to", "wfh"]
    tokens = processor.tokenize(text)
    self.assertSequenceEqual(tokens, expected_tokens)
 def testReadExpansionFileWithSuffixes(self):
   """Tests TextPreprocess reads csv files correctly and adds suffixes."""
   processor = TextPreprocess()
   suffixes = ["", "s", "'s"]
   abbreviations = processor.readExpansionFile("abbreviations.csv", suffixes)
   expectedAbbreviations = {"wfh": "work from home",
                            "wfhs": "work from homes",
                            "wfh's": "work from home's"}
   self.assertEqual(abbreviations, expectedAbbreviations)
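The exact layout of abbreviations.csv is not shown here; based on the expected dictionaries in these tests, a single row mapping "wfh" to "work from home" would be enough, and the suffix handling presumably works along these lines (an assumption, not the actual readExpansionFile implementation):

# Assumed one-row contents of abbreviations.csv (inferred from the tests):
#
#   wfh,work from home
#
base = {"wfh": "work from home"}
suffixes = ["", "s", "'s"]
expanded = dict((abbr + s, full + s)
                for abbr, full in base.items()
                for s in suffixes)
# expanded == {"wfh": "work from home",
#              "wfhs": "work from homes",
#              "wfh's": "work from home's"}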
    def split(self,
              filePath=None,
              numLabels=3,
              textPreprocess=False,
              dataDict=None,
              abbrCSV="",
              contrCSV="",
              ignoreCommon=100,
              removeStrings="[identifier deleted]",
              correctSpell=True):
        """
    Split all the comments in a file into tokens, with or without preprocessing.
    If both filePath and dataDict are given, filePath takes precedence.

    @param filePath        (str)    Path to csv file
    @param dataDict        (dict)   Data as returned by readCSV()
    @param numLabels       (int)    Number of columns of category labels.
    @param textPreprocess  (bool)   True will preprocess text while tokenizing.
    
    @return dataDict       (dict)   Data as read in from filePath.

    Please see TextPreprocess tokenize() for the other parameters; they're only
    used when textPreprocess is True.
    """
        if filePath:
            dataDict = readCSV(filePath, numLabels=numLabels)

        if dataDict is None:
            raise Exception("No data given, or could not read CSV.")

        preprocessor = TextPreprocess(abbrCSV=abbrCSV, contrCSV=contrCSV)
        expandAbbr = (abbrCSV != "")
        expandContr = (contrCSV != "")

        for recordNum, record in dataDict.iteritems():
            comment, categories, uniqueID = record

            # Convert the categories to a string of their IDs
            categories = " ".join(
                [str(self.categoryToId[c]) for c in categories])

            if textPreprocess:
                tokens, _ = preprocessor.tokenizeAndFilter(
                    comment, ignoreCommon, removeStrings, correctSpell,
                    expandAbbr, expandContr)
            else:
                tokens = preprocessor.tokenize(comment)

            data = self._formatSequence(tokens, categories, recordNum,
                                        uniqueID)

            self.records.append(data)
            self.sequenceCount += 1

        return dataDict
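For reference, a hypothetical dataDict of the shape split() iterates over looks like this; the field names are inferred from the tuple unpacking above rather than from readCSV() itself:

# recordNum -> (comment, categories, uniqueID)
dataDict = {
    0: ("I can't work from home", ["hardware", "network"], "sample-0"),
    1: ("The VPN keeps dropping", ["network"], "sample-1"),
}
# split(dataDict=dataDict, textPreprocess=False) would tokenize each comment
# and append one formatted sequence per record to self.records.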
 def testFunctionsWithoutDataFiles(self):
   """
   Ensures a TextPreprocess object can be created and can tokenize text when
   the data files (corpus text, abbreviations, and contractions) are missing.
   """
   text = "I can't work at [identifier deleted] if you don't allw me to wfh"
   processor = TextPreprocess(corpusTxt="fake.txt",
                              abbrCSV="not_here.csv",
                              contrCSV="not_real.csv")
   
   tokens = processor.tokenize(text)
   expected_tokens = ["i", "can", "t", "work", "at", "identifier", "deleted",
                      "if", "you", "don", "t", "allw", "me", "to", "wfh"]
   
   self.assertSequenceEqual(tokens, expected_tokens)
Example #9
def trainModelWithText(model, trainingData):
    """ Train the given model on trainingData.
  This is (essentially) the same training method as in the research repo's
  imbu_runner.py.
  """
    textPreprocessor = TextPreprocess()
    for seqId, (text, _, _) in enumerate(trainingData.values()):
        textTokens = textPreprocessor.tokenize(
            text)  # TODO: use model's tokenization method instead
        lastToken = len(textTokens) - 1
        for i, token in enumerate(textTokens):
            # use the sequence's ID as the category label
            model.trainText(token, [seqId],
                            sequenceId=seqId,
                            reset=int(i == lastToken))
Example #10
def trainModelWithText(model, trainingData):
  """ Train the given model on trainingData.
  This is (essentially) the same training method as in the research repo's
  imbu_runner.py.
  """
  textPreprocessor = TextPreprocess()
  for seqId, (text, _, _) in enumerate(trainingData.values()):
    textTokens = textPreprocessor.tokenize(text)  # TODO: use model's tokenization method instead
    lastToken = len(textTokens) - 1
    for i, token in enumerate(textTokens):
      # use the sequence's ID as the category label
      model.trainText(token,
                      [seqId],
                      sequenceId=seqId,
                      reset=int(i==lastToken))
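The per-token training loop above fires reset only on the final token of each sequence. A small standalone sketch with a stub model (a hypothetical stand-in, not the real model class) shows the resulting call pattern:

class StubModel(object):
    """Hypothetical model that just records its trainText() calls."""
    def __init__(self):
        self.calls = []

    def trainText(self, token, labels, sequenceId=None, reset=0):
        self.calls.append((token, labels, sequenceId, reset))


model = StubModel()
textTokens = ["hello", "world", "again"]  # pretend output of tokenize()
seqId = 0
lastToken = len(textTokens) - 1
for i, token in enumerate(textTokens):
    model.trainText(token, [seqId], sequenceId=seqId, reset=int(i == lastToken))

# model.calls == [("hello", [0], 0, 0),
#                 ("world", [0], 0, 0),
#                 ("again", [0], 0, 1)]   # reset only on the last token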
Example #11
    def queryModel(self, query):
        """
    Preprocesses the query, encodes it into a pattern, then queries the
    classifier to infer distances to trained-on samples.
    @return       (list)          Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    """
        sample = TextPreprocess().tokenize(query)
        encodedQuery = self.encodeSample(sample)
        # TODO: with new CioEncoder, switch to encode by token (below).
        # encodedQuery = [{"text": token, "bitmap": self.encodeToken(token)}
        #                 for token in sample]

        allDistances = self.infer(encodedQuery)

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of protoype distances must match number of "
                "samples trained on.")

        sampleDistances = defaultdict()
        for uniqueID in self.sampleReference:
            sampleDistances[uniqueID] = min([
                allDistances[i] for i, x in enumerate(self.sampleReference)
                if x == uniqueID
            ])

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))
Example #12
    def prepText(text, preprocess=False):
        """
    Returns a list of the text tokens.

    @param preprocess   (bool)    Whether or not to preprocess the text data.
    """
        if preprocess:
            sample = TextPreprocess().tokenize(
                text,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(text)

        return sample
Example #13
    def getUnionEncoding(self, text):
        """
    Encode each token of the input text, take the union, and then sparsify.

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            The bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
    """
        tokens = TextPreprocess().tokenize(text)

        positions = self.getUnionEncodingFromTokens(tokens)

        # Populate encoding
        encoding = {
            "text": text,
            "sparsity": len(positions) / float(self.n),
            "df": 0.0,
            "height": self.height,
            "width": self.width,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding
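For a concrete sense of the "sparsity" field above, here is a small numeric illustration with hypothetical values for the retina size and the union bitmap:

# Hypothetical numbers, only to show how the "sparsity" field is computed.
n = 128 * 128                  # assumed retina size (width * height)
positions = list(range(328))   # stand-in for the union-encoded bit positions
sparsity = len(positions) / float(n)
# sparsity == 0.02001953125, i.e. about 2% of the bits are active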
    def testIndexMapping(self):
        originalWords = self.testDocuments[2].split(" ")

        tokenList, mapping = TextPreprocess().tokenizeAndFilter(
            self.testDocuments[2],
            ignoreCommon=50,
            removeStrings=["[identifier deleted]"],
            correctSpell=True,
            expandAbbr=True,
            expandContr=True)

        self.assertEqual(len(tokenList), len(mapping),
                         "There should be one mapping entry for each token.")

        # Test filtering results
        self.assertEqual("therefore", tokenList[0], "Spelling not corrected.")
        self.assertEqual("discrete", tokenList[24], "Spelling not corrected.")
        self.assertSequenceEqual(["hierarchical", "temporal", "memory"],
                                 tokenList[1:4],
                                 "Abbreviation 'HTM' not expanded.")
        self.assertNotIn("but", tokenList, "Common word 'but' not removed.")
        self.assertNotIn("not", tokenList, "Common word 'not' not removed.")
        self.assertIn("does", tokenList,
                      "Contraction not expanded to 'does not'.")

        # Test some token-to-word-mappings
        mappedWords = [originalWords[i] for i in mapping]
        self.assertNotEqual(len(originalWords), len(mappedWords))
        for word in mappedWords[1:4]:
            self.assertEqual("HTM", word,
                             "Tokens don't map to 'HTM' as expected.")
    def getUnionEncoding(self, text):
        """
    Encode each token of the input text, take the union, and then sparsify.

    @param  text    (str)             A non-tokenized sample of text.
    @return         (dict)            The bitmap encoding is at
                                      encoding["fingerprint"]["positions"].
    """
        tokens = TextPreprocess().tokenize(text)

        # Count the ON bits represented in the encoded tokens.
        counts = Counter()
        for t in tokens:
            bitmap = self._getWordBitmap(t)
            counts.update(bitmap)

        positions = self.sparseUnion(counts)

        # Populate encoding
        encoding = {
            "text": text,
            "sparsity": len(positions) / float(self.n),
            "df": 0.0,
            "height": self.height,
            "width": self.width,
            "score": 0.0,
            "fingerprint": {
                "positions": sorted(positions)
            },
            "pos_types": []
        }

        return encoding
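The counting step above can be exercised on its own with toy per-token bitmaps; keeping the most frequently hit bits is one plausible reading of what sparseUnion() does, so treat the last two lines as an assumption rather than the real implementation:

from collections import Counter

# Toy bitmaps for three tokens over a 16-bit space (purely illustrative).
tokenBitmaps = {
    "grok": [1, 4, 7, 9],
    "people": [4, 7, 12],
    "talk": [2, 7, 9, 15],
}

counts = Counter()
for bitmap in tokenBitmaps.values():
    counts.update(bitmap)

# Stand-in for sparseUnion(): keep the bits hit most often, up to a target count.
maxBits = 5
positions = sorted(bit for bit, _ in counts.most_common(maxBits))
# Bit 7 appears in all three bitmaps, bits 4 and 9 in two each.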
    def testWindowEncodings(self):
        """Test the CioEncoder for the sliding window encodings."""
        cio = CioEncoder(fingerprintType=EncoderTypes.word)

        text = """
      I grok people. I am people, so now I can say it in people talk. I've found
      out why people laugh. They laugh because it hurts so much, because it's
      the only thing that'll make it stop hurting."""

        tokens = TextPreprocess().tokenize(text)

        encodingDicts = cio.getWindowEncoding(tokens, minSparsity=0.19)

        # Test that only dense windows get encoded
        self.assertTrue(
            len(tokens) > len(encodingDicts),
            "Returned incorrect number of window encodings.")

        # Test window
        windowEncoding = getTestData("cio_encoding_window.json")
        self.assertEqual(windowEncoding["text"], encodingDicts[-1]["text"],
                         "Window encoding represents the wrong text.")
        self.assertTrue(encodingDicts[-1]["sparsity"] <= cio.unionSparsity,
                        "Sparsity for large window is larger than the max.")
        self.assertSequenceEqual(
            windowEncoding["bitmap"], encodingDicts[-1]["bitmap"].tolist(),
            "Window encoding's bitmap is not as expected.")
    def queryModel(self, query, preprocess=False):
        """
    Preprocesses the query, encodes it into a pattern, then queries the
    classifier to infer distances to trained-on samples.
    @return       (list)          Two-tuples of sample ID and distance, sorted
                                  closest to farthest from the query.
    """
        if preprocess:
            sample = TextPreprocess().tokenize(
                query,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample = TextPreprocess().tokenize(query)

        # Get window patterns for the query; if the query is too small and the
        # window encodings end up too sparse, fall back to a pure union encoding.
        encodedQuery = self.encodeSample(sample)
        if len(encodedQuery) == 0:
            sample = " ".join(sample)
            fpInfo = self.encoder.getUnionEncoding(sample)
            encodedQuery = [{
                "text": fpInfo["text"],
                "sparsity": fpInfo["sparsity"],
                "bitmap": numpy.array(fpInfo["fingerprint"]["positions"])
            }]
        allDistances = self.infer(encodedQuery)

        if len(allDistances) != len(self.sampleReference):
            raise IndexError(
                "Number of protoype distances must match number of "
                "samples trained on.")

        sampleDistances = defaultdict()
        for uniqueID in self.sampleReference:
            sampleDistances[uniqueID] = min([
                allDistances[i] for i, x in enumerate(self.sampleReference)
                if x == uniqueID
            ])

        return sorted(sampleDistances.items(), key=operator.itemgetter(1))
Example #18
    def tokenize(self, inputText):
        """
    Given a chunk of text (possibly several sentences), return a single list
    of individual tokens. The text is filtered when the instance's filterText
    option is set.

    @param inputText  (str)   A chunk of text.
    @return sample    (list)  A list of text tokens.
    @return mapping   (list)  Maps the original words to the sample tokens. See
                              TextPreprocess method for details.
    """
        if self.filterText:
            sample, mapping = TextPreprocess().tokenizeAndFilter(
                inputText, **self.filterOptions)
        else:
            sample, mapping = TextPreprocess().tokenizeAndFilter(inputText)

        return sample, mapping
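The filterOptions expanded above presumably carries the same keyword arguments that the variant below spells out explicitly; a hypothetical value might look like this:

# Hypothetical filterOptions matching the explicit keyword arguments below.
filterOptions = {
    "ignoreCommon": 100,
    "removeStrings": ["[identifier deleted]"],
    "correctSpell": True,
}
# TextPreprocess().tokenizeAndFilter(inputText, **filterOptions) is then
# equivalent to passing the keyword arguments one by one.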
    def tokenize(self, inputText):
        """
    Given a chunk of text (possibly several sentences), return a single list
    of individual tokens. The text is filtered when the instance's filterText
    option is set.

    @param inputText  (str)   A chunk of text.
    @return sample    (list)  A list of text tokens.
    @return mapping   (list)  Maps the original words to the sample tokens. See
                              TextPreprocess method for details.
    """
        if self.filterText:
            sample, mapping = TextPreprocess().tokenizeAndFilter(
                inputText,
                ignoreCommon=100,
                removeStrings=["[identifier deleted]"],
                correctSpell=True)
        else:
            sample, mapping = TextPreprocess().tokenizeAndFilter(inputText)

        return sample, mapping
Example #20
  def generateSequence(self, text, preprocess=False):
    """
    Return a list of lists representing the text sequence in network data
    format. The preprocess argument is currently ignored; the text is not
    preprocessed (see the TODO below).
    """
    # TODO: enable text preprocessing; abstract out the logic in split() into a common method.
    tokens = TextPreprocess().tokenize(text)
    cat = [-1]
    self.sequenceCount += 1
    uniqueID = "q"
    data = self._formatSequence(tokens, cat, self.sequenceCount-1, uniqueID)

    return data
 def testReadExpansionFileNoSuffixes(self):
   """Tests TextPreprocess reads csv files correctly."""
   processor = TextPreprocess()
   abbreviations = processor.readExpansionFile("abbreviations.csv")
   expectedAbbreviations = {"wfh": "work from home"}
   self.assertEqual(abbreviations, expectedAbbreviations)
Example #22
def countBitFrequenciesForTerms(client,
                                lines,
                                acceptanceProbability=0.1,
                                usePlaceholderEncoding=True,
                                percentSparsity=0.0102):
    # Accumulate counts by inplace-adding sparse matrices
    skippedWords = {}
    counts = SparseMatrix()
    width = RETINA_SIZES[client.retina]["width"]
    height = RETINA_SIZES[client.retina]["height"]
    counts.resize(1, width * height)

    # Pre-allocate buffer sparse matrix
    sparseBitmap = SparseMatrix()
    sparseBitmap.resize(1, width * height)

    # Accumulate counts for each bit for each word
    numWords = 0
    numLines = 0
    for line in lines:
        tokens = TextPreprocess().tokenize(line)
        for term in tokens:

            p = random.uniform(0, 1)
            if p <= acceptanceProbability:
                if usePlaceholderEncoding:
                    random.seed(term)
                    bitmap = random.sample(
                        xrange(width * height),
                        int(width * height * percentSparsity))
                    bitmap.sort()
                    random.seed(p)
                else:
                    try:
                        bitmap = client.getBitmap(
                            term)["fingerprint"]["positions"]
                    except Exception as err:
                        print "Skipping '{}', reason: {}".format(
                            term, str(err))
                        continue

                    if not bitmap:
                        skippedWords[term] = skippedWords.get(term, 0) + 1
                        # print "Skipping '{}', reason: empty".format(term)
                        continue

                sparseBitmap.setRowFromSparse(0, bitmap, [1] * len(bitmap))
                counts += sparseBitmap
                numWords += 1

        numLines += 1
        if numLines % 1000 == 0:
            print "...processed=", numLines, "lines and", numWords, "words"

    # Compute normalized version of counts as a separate matrix
    frequencies = SparseMatrix()
    frequencies.resize(1, width * height)
    frequencies.copy(counts)
    frequencies.divide(float(numWords))

    # Wrap up by printing some statistics and then saving the normalized version
    print "Processed", numLines, "lines"
    printFrequencyStatistics(counts, frequencies, numWords, width * height)

    frequencyFilename = "bit_frequencies_" + client.retina + ".pkl"
    print "Saving frequency matrix in", frequencyFilename
    with open(frequencyFilename, "wb") as frequencyPickleFile:
        pickle.dump(frequencies, frequencyPickleFile)

    print "These words were skipped N times because of empty bitmap result"
    print skippedWords

    return counts
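The placeholder-encoding branch above derives a reproducible pseudo-random bitmap by seeding the RNG with the term itself. Stripped down, and with the retina size passed in as an assumed default, the idea looks like this:

import random

def placeholderBitmap(term, width=128, height=128, percentSparsity=0.0102):
    """Deterministic pseudo-random bitmap for a term, as in the branch above."""
    n = width * height
    random.seed(term)
    bitmap = random.sample(range(n), int(n * percentSparsity))
    bitmap.sort()
    return bitmap

# The same term always yields the same bits.
assert placeholderBitmap("grok") == placeholderBitmap("grok")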