Example #1
    def test_split_with_s(self):
        result = split_possessive_markers(["Fred's", 'is', "Frank's", 'bar', '.'])
        self.assertEqual(7, len(result), str(result))
        self.assertEqual(result[0], "Fred", str(result))
        self.assertEqual(result[1], "'s", str(result))
        self.assertEqual(result[3], "Frank", str(result))
        self.assertEqual(result[4], "'s", str(result))
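
The assertions pin down the whole output: splitting both possessives turns the five input tokens into seven, and the positions the test leaves unchecked presumably hold the untouched tokens. Spelled out:

    split_possessive_markers(["Fred's", 'is', "Frank's", 'bar', '.'])
    # -> ['Fred', "'s", 'is', 'Frank', "'s", 'bar', '.']   (7 tokens)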
Example #2
    def word_tokenize(self, text):
        """Yield string tokens from an input string.

        Args:
            text: input string to tokenize
        Yields:
            str: non-whitespace tokens
        """
        for token in split_possessive_markers(split_contractions(_html_tokenize(text))):
            if self._max_characters_per_token is not None:
                # Break overlong tokens into fixed-size chunks.
                for token_chunk in funcy.chunks(self._max_characters_per_token, token):
                    yield token_chunk
            else:
                yield token
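
Here word_tokenize defers the actual splitting to the split_possessive_markers(split_contractions(_html_tokenize(text))) pipeline and only post-processes overlong tokens: funcy.chunks(n, seq) slices a sequence into consecutive pieces of length n, with a shorter final piece. A quick illustration:

    import funcy

    # chunks(3, ...) yields consecutive length-3 slices; the last may be shorter.
    print(list(funcy.chunks(3, "possessive")))  # ['pos', 'ses', 'siv', 'e']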
Example #3
    def test_split_unicode(self):
        # U+2032 (PRIME) also counts as a possessive apostrophe.
        stem, marker = split_possessive_markers(["a\u2032s"])
        self.assertEqual(stem, 'a')
        self.assertEqual(marker, "\u2032s")
Example #4
    def test_split_without_s(self):
        # A bare trailing apostrophe is split off as its own token.
        stem, marker = split_possessive_markers(["CHARLES'"])
        self.assertEqual(stem, "CHARLES")
        self.assertEqual(marker, "'")
Example #5
    def setUp(self):
        self.tokenizer = test_tokenizer_with_spans(
            self, lambda t: split_possessive_markers(space_tokenizer(t)))
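
Taken together, the tests pin down the contract: split_possessive_markers consumes a sequence of tokens and returns a flat list in which a trailing possessive marker, an apostrophe-like character optionally followed by s, is split off into its own token, while all other tokens pass through unchanged. These snippets appear to exercise a segtok-style tokenizer API; as a minimal sketch only, not the library's actual implementation, a function consistent with the behavior asserted above could look like this (the uppercase-S case is an extra assumption):

    import re

    # Apostrophe-like characters the tests exercise: ASCII "'" and U+2032 (PRIME).
    _APOSTROPHES = "'\u2032"

    # A possessive marker here is a trailing apostrophe, optionally followed by s,
    # e.g. "Fred's" -> ("Fred", "'s") and "CHARLES'" -> ("CHARLES", "'").
    _POSSESSIVE = re.compile("^(.+)([{}][sS]?)$".format(_APOSTROPHES))


    def split_possessive_markers(tokens):
        """Return the tokens with possessive markers split into their own tokens."""
        result = []
        for token in tokens:
            match = _POSSESSIVE.match(token)
            if match:
                result.extend(match.groups())  # stem first, then the marker
            else:
                result.append(token)
        return result


    print(split_possessive_markers(["Fred's", 'is', "CHARLES'", "a\u2032s"]))
    # -> ['Fred', "'s", 'is', 'CHARLES', "'", 'a', '\u2032s']

Note that a contraction such as "don't" does not match the pattern (the apostrophe is not in trailing position), which is consistent with contractions being handled separately by split_contractions in the word_tokenize pipeline above.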