def test_split_with_s(self):
    """Possessive "'s" is split off each stem, growing the token list."""
    tokens = split_possessive_markers(["Fred's", 'is', "Frank's", 'bar', '.'])
    # Two possessives each become two tokens: 5 inputs -> 7 outputs.
    self.assertEqual(7, len(tokens), str(tokens))
    for index, expected in [(0, "Fred"), (1, "'s"), (3, "Frank"), (4, "'s")]:
        self.assertEqual(tokens[index], expected, str(tokens))
def test_split_with_s_duplicate(self):
    """Duplicate of test_split_with_s, renamed so it is not shadowed.

    This method previously had the same name as the test above; Python
    keeps only the last definition in a class body, so one of the two
    identical tests was silently never run. Renaming makes both
    definitions collectable by the test runner.
    """
    result = split_possessive_markers(["Fred's", 'is', "Frank's", 'bar', '.'])
    # Two possessives each become two tokens: 5 inputs -> 7 outputs.
    self.assertEqual(7, len(result), str(result))
    self.assertEqual(result[0], "Fred", str(result))
    self.assertEqual(result[1], "'s", str(result))
    self.assertEqual(result[3], "Frank", str(result))
    self.assertEqual(result[4], "'s", str(result))
def word_tokenize(self, text):
    """Yield non-whitespace string tokens extracted from *text*.

    Tokenization pipeline: HTML-aware tokenize, then split
    contractions, then split possessive markers.

    Args:
        text: input string for tokenization.

    Yields:
        str: tokens; when ``self._max_characters_per_token`` is set,
        overlong tokens are emitted in chunks of at most that many
        characters.
    """
    limit = self._max_characters_per_token
    tokens = split_possessive_markers(split_contractions(_html_tokenize(text)))
    for token in tokens:
        if limit is None:
            yield token
        else:
            # Equivalent to funcy.chunks(limit, token): fixed-size
            # slices, with a shorter final chunk when needed.
            for start in range(0, len(token), limit):
                yield token[start:start + limit]
def test_split_unicode(self):
    """A Unicode prime (U+2032) marker is split off like an apostrophe."""
    stem, marker = split_possessive_markers(["a\u2032s"])
    self.assertEqual((stem, marker), ('a', "\u2032s"))
def test_split_without_s(self):
    """A trailing bare apostrophe is split off as its own token."""
    stem, marker = split_possessive_markers(["CHARLES'"])
    self.assertEqual((stem, marker), ("CHARLES", "'"))
def setUp(self):
    """Build a span-checked tokenizer: space-split, then split possessives."""
    def tokenize(text):
        return split_possessive_markers(space_tokenizer(text))

    self.tokenizer = test_tokenizer_with_spans(self, tokenize)