Ejemplo n.º 1
0
 def test_split_with_s(self):
     """Check that "'s" possessives are split off into their own tokens.

     Five input tokens, two of which end in "'s", must come back as seven
     tokens, with each stem directly followed by its "'s" marker.
     """
     result = split_possessive_markers(["Fred's", 'is', "Frank's", 'bar', '.'])
     # The full result is attached to every failure message for diagnosis.
     shown = str(result)
     self.assertEqual(7, len(result), shown)
     # (position in result, expected token) for both stems and both markers.
     expectations = ((0, "Fred"), (1, "'s"), (3, "Frank"), (4, "'s"))
     for position, expected in expectations:
         self.assertEqual(result[position], expected, shown)
Ejemplo n.º 2
0
 def test_split_with_s(self):
     """Verify that tokens ending in "'s" are split into stem and marker.

     The two possessive inputs each yield two tokens, so five input tokens
     are expected to produce seven output tokens.
     """
     words = ["Fred's", 'is', "Frank's", 'bar', '.']
     tokens = split_possessive_markers(words)
     # Shown on failure so the whole token list is visible.
     failure_msg = str(tokens)
     self.assertEqual(7, len(tokens), failure_msg)
     # First possessive: stem at index 0, marker at index 1.
     self.assertEqual(tokens[0], "Fred", failure_msg)
     self.assertEqual(tokens[1], "'s", failure_msg)
     # Second possessive: stem at index 3, marker at index 4.
     self.assertEqual(tokens[3], "Frank", failure_msg)
     self.assertEqual(tokens[4], "'s", failure_msg)
Ejemplo n.º 3
0
    def word_tokenize(self, text):
        """Lazily tokenize a string into word tokens.

        The text goes through `_html_tokenize`, then `split_contractions`,
        then `split_possessive_markers`. If `self._max_characters_per_token`
        is not None, each token is further cut into pieces with
        `funcy.chunks`, using that value as the chunk size.

        Args:
            text: input string for tokenization
        Yields:
            token: str, either a whole token or one chunk of a token
        """
        token_stream = split_possessive_markers(
            split_contractions(_html_tokenize(text)))
        for token in token_stream:
            limit = self._max_characters_per_token
            if limit is None:
                # No length cap configured: pass the token through unchanged.
                yield token
                continue
            for piece in funcy.chunks(limit, token):
                yield piece
Ejemplo n.º 4
0
 def test_split_unicode(self):
     """A possessive written with U+2032 instead of "'" is still split."""
     token = "a\u2032s"
     # Unpacking also asserts that exactly two tokens come back.
     head, suffix = split_possessive_markers([token])
     self.assertEqual(head, 'a')
     self.assertEqual(suffix, "\u2032s")
Ejemplo n.º 5
0
 def test_split_without_s(self):
     """A trailing bare apostrophe (no "s") is split off as the marker."""
     token = "CHARLES'"
     # Unpacking also asserts that exactly two tokens come back.
     head, suffix = split_possessive_markers([token])
     self.assertEqual(head, "CHARLES")
     self.assertEqual(suffix, "'")
Ejemplo n.º 6
0
 def test_split_unicode(self):
     """Check stem/marker splitting when the apostrophe is U+2032."""
     expected_stem = 'a'
     expected_marker = "\u2032s"
     # The single input token must yield exactly two output tokens.
     stem, marker = split_possessive_markers(["a\u2032s"])
     self.assertEqual(stem, expected_stem)
     self.assertEqual(marker, expected_marker)
Ejemplo n.º 7
0
 def test_split_without_s(self):
     """Check splitting of a possessive that ends in a lone apostrophe."""
     expected_stem = "CHARLES"
     expected_marker = "'"
     # The single input token must yield exactly two output tokens.
     stem, marker = split_possessive_markers(["CHARLES'"])
     self.assertEqual(stem, expected_stem)
     self.assertEqual(marker, expected_marker)
Ejemplo n.º 8
0
 def setUp(self):
     """Build the tokenizer under test and store it on `self.tokenizer`.

     The tokenizer runs `space_tokenizer` and then applies
     `split_possessive_markers` to its output. It is wrapped by
     `test_tokenizer_with_spans` (presumably to add span checking; confirm
     against that helper's definition).
     """
     def tokenize(text):
         return split_possessive_markers(space_tokenizer(text))

     self.tokenizer = test_tokenizer_with_spans(self, tokenize)