def __init__(self, lemmatizer=None, stemmer=None, url_parser=None,
             unicode_form='NFKC', nltk_stop_words="english",
             sentence_tokenizer=('nltk_data', 'tokenizers/punkt/english.pickle'),
             max_char_repeats=3, lru_cache_size=50000, translate_map_inv=None,
             replace_map=None, html_renderer='default',
             add_abbrev_types=None, del_sent_starters=None):
    self._unicode_normalize = partial(unicodedata.normalize, unicode_form)
    self._replace_inplace = InPlaceReplacer(replace_map).replace \
        if replace_map else lambda x: x
    self._tokenize = RegexpFeatureTokenizer().tokenize
    self._stopwords = frozenset(stopwords.words(nltk_stop_words))
    self._url_parser = url_parser
    self._sentence_tokenizer, self._sentence_tokenize = \
        self.load_sent_tokenizer(sentence_tokenizer, add_abbrev_types,
                                 del_sent_starters)
    self.sentence_tokenizer = None
    self._lemmatize = lru_wrap(lemmatizer.lemmatize, lru_cache_size) \
        if lemmatizer else None
    self._stem = stemmer.stem if stemmer else None
    self._pos_tag = pos_tag
    self._replace_char_repeats = \
        RepeatReplacer(max_repeats=max_char_repeats).replace \
        if max_char_repeats > 0 else self._identity

    # translation of Unicode characters
    translator = Translator(EXTRA_TRANSLATE_MAP, translated=True)
    translator.add_inverse_map(translate_map_inv, translated=False)
    self._replace_chars = translator.replace

    if html_renderer is None:
        self.strip_html = lambda x: x
    elif html_renderer == u'default':
        self.strip_html = HTMLCleaner().clean
    elif html_renderer == u'beautifulsoup':
        self.strip_html = strip_html_bs
    else:
        raise ValueError('Invalid parameter value given for `html_renderer`')

    # Tokenize a dummy string because the lemmatizer and other tools can
    # take a while to initialize, which would otherwise skew attempts to
    # measure tokenization performance.
    self.tokenize(u"dummy string")
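# Example construction (a sketch for illustration only): the name of the
# class that owns the ``__init__`` above is not shown in this section, so
# ``MLTokenizer`` below is a hypothetical stand-in; ``WordNetLemmatizer``
# is the standard NLTK implementation.
#
#     from nltk.stem.wordnet import WordNetLemmatizer
#
#     tokenizer = MLTokenizer(
#         lemmatizer=WordNetLemmatizer(),  # LRU-cached via lru_wrap
#         max_char_repeats=3,              # u"soooo" -> u"sooo"
#         html_renderer='beautifulsoup',   # strip HTML with strip_html_bs
#     )
#     tokens = tokenizer.tokenize(u"I loooove this <b>movie</b>!!!")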
class TestTwitterTokens(unittest.TestCase, SetComparisonMixin):

    maxDiff = 2000

    def setUp(self):
        TOKENIZER = tokenizer_builder(features=FEATURES)
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(features=FEATURES, debug=True)

    def test_preprocess(self):
        text = u"wow \u2014 such \u2013 doge"
        preprocessed = self.tokenizer.preprocess(text)
        self.assertEqual(u'wow --- such -- doge', preprocessed)

    def test_dashes(self):
        text = u"wow \u2014 such \u2013 doge -- and --- are dashes"
        counts = Counter(self.tokenize(text))
        self.assertEqual(2, counts[u'--'])
        self.assertEqual(2, counts[u'---'])

    def test_censored(self):
        text = u"she's a b*tch in a f***d world"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'she', u"'s", u'b*tch', u'f***d'], tokens)

    def test_sentence_split_ellipsis(self):
        """Make sure there is a sentence break after an ellipsis.

        Note: the sentence splitter we use does not treat an ellipsis as a
        sentence terminator if the word after it is not capitalized.
        """
        text = u"I had a feeling that after \"Submerged\", this one wouldn't " \
               u"be any better... I was right."
        sentences = self.sentence_tokenize(text)
        self.assertEqual(2, len(sentences))

    def test_sentence_split_br(self):
        """Make sure there is a sentence break before "O.K." """
        text = u'Memorable lines like: "You son-of-a-gun!", "You son-of-a-witch!",' \
               u' "Shoot!", and "Well, Forget You!"<br /><br />O.K. Bye.'
        sentences = self.sentence_tokenize(text)
        joint = u' | '.join([u' '.join(sentence) for sentence in sentences])
        self.assertIn(u' | o', joint)

    def test_western_emoticons_happy(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))

    def test_western_emoticons_sad(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O :@ D:"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))

    def test_western_emoticons_misc(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u":0 :l :s :x \o/ \m/"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u':0', u':l', u':s', u':x', u'\o/', u'\m/'], tokens)

    def test_hearts(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u"<3 full heart </3 heartbreak"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertSetContainsSubset(
            [u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'], tokens)
        self.assertEqual(len(tokens) - 3, count_prefix(u"EMOTIC", group_names))

    def test_no_emoticon(self):
"""No emoticon should be detected in this text """ text = u"(8) such is the game): - (7 or 8) and also (8 inches)" \ u" and spaces next to parentheses ( space ) ." group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]] self.assertEqual(0, count_prefix(u"EMOTIC", group_names)) def test_eastern_emoticons(self): text = u"*.* (^_^) *_* *-* +_+ ~_~ -.- -__- -___- t_t q_q ;_; t.t q.q ;.;" tokens = self.tokenize(text) reconstructed = u' '.join(token for token in tokens if not (token.startswith(u"<") and token.endswith(u">"))) self.assertEqual(text, reconstructed) group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]] self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names)) def test_russian_emoticons(self): text = u"haha! ))))) )) how sad ((" tokens = self.tokenize(text) reconstructed = u' '.join(tokens) self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed) group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]] self.assertEqual(len(tokens) - 4, count_prefix(u"EMOTIC", group_names)) def test_ascii_arrow(self): text = u"Look here -->> such doge <<<" tokens = self.tokenize(text) self.assertSetContainsSubset( {'<ASCIIARROW_R>', '<ASCIIARROW_L>'}, tokens) def test_abbrev(self): text = u"S&P index of X-men in the U.S." tokens = self.tokenize(text) self.assertListEqual( [u's&p', u'index', u'of', u'x-men', u'in', u'the', u'u.s.'], tokens) def test_url_email(self): text = u"a dummy comment with http://www.google.com/ and [email protected]" tokens = self.tokenize(text) self.assertListEqual( [u'a', u'dummy', u'comment', u'with', u'<URI>', u'and', u'<EMAIL>'], tokens) def test_contraction(self): text = u"Daniel's life isn't great" tokens = self.tokenize(text) self.assertSetContainsSubset([u'daniel', u"'s", u'be', u"n't"], tokens) def test_contraction_lookalike(self): text = u"abr'acad'a'bra" tokens = self.tokenize(text) self.assertEqual(text, u"'".join(tokens)) def test_special_3d(self): text = u"3-d (3D) effect" tokens = self.tokenize(text) self.assertListEqual([u"<3D>", u"<3D>", u"effect"], tokens) def test_grade_1(self): text = u"can save this boring, Grade B+ western." tokens = self.tokenize(text) self.assertSetContainsSubset([u'<GRADE_B+>'], tokens) def test_grade_2(self): text = u"can save this boring, Grade B western." tokens = self.tokenize(text) self.assertSetContainsSubset([u'<GRADE_B>'], tokens) def test_grade_3(self): text = u"My grade: F." 
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_F>'], tokens)

    def test_grade_4(self):
        text = u"mindless B-grade \"entertainment.\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_decade_1(self):
        text = u"Nice 1950s & 60s \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950s', u'60s', u'americana'], tokens)

    def test_decade_2(self):
        text = u"Nice 1950s & 60's \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950s', u'60s', u'americana'], tokens)

    def test_mention(self):
        text = u"@RayFranco is answering to @AnPel, this is a real '@username83' " \
               u"but this is [email protected], and this is a @probablyfaketwitterusername"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(4, token_counts['<MENTION>'])
        self.assertEqual(1, token_counts['<EMAIL>'])

    def test_emphasis_star(self):
        text = u"@hypnotic I know *cries*"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<EMPHASIS_B>', u'cry'], tokens)

    def test_emphasis_underscore(self):
        text = u"I _hate_ sunblock"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<EMPHASIS_U>', u'hate'], tokens)

    def test_unescape(self):
        text = u"@artmeanslove I <3 that book"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<3', u'<EMOTIC_HEART_HAPPY>'], tokens)

    def test_kisses(self):
        text = u"ohh lovely x hehe x count naked people hehe that's " \
               u"what you always tell me to do hehe x x x night night xxxxx"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(2, token_counts[u'<XX>'])

    def test_kisses_hugs_1(self):
        text = u"all right, xo xo vou mimi now xoxo"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(2, token_counts[u'<XOXO>'])

    def test_kisses_hugs_2(self):
        text = u"Night world xoxx, kisses xox"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(2, token_counts[u'<XOXO>'])

    def test_timeofday(self):
        text = u"its okay its only 10.00 for us perth kiddos"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(1, token_counts[u'<TIMEOFDAY>'])

    def test_timeofday_neg(self):
        text = u"but i couldnt find my visa. Shipping Sweden $22.22"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(0, token_counts[u'<TIMEOFDAY>'])
        self.assertEqual(1, token_counts[u'$'])
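# The emoticon tests above and below rely on a ``count_prefix`` helper that
# is defined elsewhere in the test module. A minimal sketch of its assumed
# behavior (count regex group names that start with a given prefix):

def count_prefix(prefix, group_names):
    """Return how many items in ``group_names`` start with ``prefix``.

    ``group_names`` may contain None (a match whose last group is unnamed),
    so filter those out before calling ``startswith``.
    """
    return sum(1 for name in group_names
               if name is not None and name.startswith(prefix))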
class TestFeatureTokens(unittest.TestCase, SetComparisonMixin):

    maxDiff = 2000

    def setUp(self):
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(debug=True)

    def test_preprocess(self):
        text = u"wow \u2014 such \u2013 doge"
        preprocessed = self.tokenizer.preprocess(text)
        self.assertEqual(u'wow --- such -- doge', preprocessed)

    def test_dashes(self):
        text = u"wow \u2014 such \u2013 doge -- and --- are dashes"
        counts = Counter(self.tokenize(text))
        self.assertEqual(2, counts[u'--'])
        self.assertEqual(2, counts[u'---'])

    def test_censored(self):
        text = u"she's a b*tch in a f***d world"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'she', u"'s", u'b*tch', u'f***d'], tokens)

    def test_sentence_split_ellipsis(self):
        """The sentence splitter we use does not treat an ellipsis as a
        sentence terminator if the word after it is not capitalized.
        """
        text = u"I had a feeling that after \"Submerged\", this one wouldn't " \
               u"be any better... I was right."
        self.assertEqual(2, len(self.sentence_tokenize(text)))

    def test_sentence_split_br(self):
        """Make sure the HTML <br> tag is recognized as a newline/break.

        In this case, make sure that there is a sentence break before "O.K."
        """
        text = u'Memorable lines like: "You son-of-a-gun!", "You son-of-a-witch!",' \
               u' "Shoot!", and "Well, Forget You!"<br /><br />O.K. Bye.'
        disjoint = self.sentence_tokenize(text)
        joint = u' | '.join([u' '.join(sent) for sent in disjoint])
        self.assertIn(u' | o', joint)

    def test_western_emoticons_happy(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(34, count_prefix(u"EMOTIC", group_names))

    def test_western_emoticons_sad(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(34, count_prefix(u"EMOTIC", group_names))

    def test_hearts(self):
        """With custom features removed, this text should be idempotent on tokenization."""
        text = u"<3 full heart </3 heartbreak"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertSetContainsSubset(
            [u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'], tokens)
        self.assertEqual(4, count_prefix(u"EMOTIC", group_names))

    def test_no_emoticon(self):
        """No emoticon should be detected in this text."""
        text = u"(8) such is the game): - (7 or 8) and also (8 inches)" \
               u" and spaces next to parentheses ( space ) ."
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(0, count_prefix(u"EMOTIC", group_names))

    def test_eastern_emoticons(self):
        text = u"*.* (^_^) *_* *-* +_+ ~_~"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens
                                  if not (token.startswith(u"<") and token.endswith(u">")))
        self.assertEqual(text, reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(6, count_prefix(u"EMOTIC", group_names))

    def test_russian_emoticons(self):
        text = u"haha! ))))) )) how sad (("
        tokens = self.tokenize(text)
        reconstructed = u' '.join(tokens)
        self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(3, count_prefix(u"EMOTIC", group_names))

    def test_ascii_arrow(self):
        text = u"Look here -->> such doge <<<"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            {'<ASCIIARROW_RIGHT>', '<ASCIIARROW_LEFT>'}, tokens)

    def test_abbrev(self):
        text = u"S&P index of X-men in the U.S."
        tokens = self.tokenize(text)
        self.assertListEqual(
            [u's&p', u'index', u'of', u'x-men', u'in', u'the', u'u.s.'], tokens)

    def test_url_email(self):
        text = u"a dummy comment with http://www.google.com/ and [email protected]"
        tokens = self.tokenize(text)
        self.assertListEqual(
            [u'a', u'dummy', u'comment', u'with', u'<URI>', u'and', u'<EMAIL>'],
            tokens)

    def test_contraction(self):
        text = u"Daniel's life isn't great"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'daniel', u"'s", u'be', u"n't"], tokens)

    def test_contraction_lookalike(self):
        text = u"abr'acad'a'bra"
        tokens = self.tokenize(text)
        self.assertEqual(text, u"'".join(tokens))

    def test_special_3d(self):
        text = u"3-d (3D) effect"
        tokens = self.tokenize(text)
        self.assertListEqual([u"<3D>", u"<3D>", u"effect"], tokens)

    def test_rating_false_0(self):
        text = u"I re-lived 1939/40 and my own evacuation from London"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'40'], tokens)

    def test_rating_false_1(self):
        text = u"Update: 9/4/07-I've now read Breaking Free"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<DATE>'], tokens)

    def test_rating_false_2(self):
        text = u"the humility of a 10 year old in cooking class"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'10'], tokens)

    def test_rating_0(self):
        text = u"My rating: 8.75/10----While most of this show is good"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<9/10>'], tokens)

    def test_rating_1(self):
        text = u"which deserves 11 out of 10,"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<11/10>'], tokens)

    def test_rating_2(self):
        text = u"I give this film 10 stars out of 10."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<10/10>'], tokens)

    def test_rating_3(self):
        text = u"A must-see for fans of Japanese horror.10 out of 10."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<10/10>'], tokens)

    def test_rating_4(self):
        text = u"a decent script.<br /><br />3/10"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<3/10>'], tokens)

    def test_rating_5(self):
        text = u"give it five stars out of ten"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<5/10>'], tokens)

    def test_rating_6(self):
        text = u"give it 3 1/2 stars out of five"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<7/10>'], tokens)

    def test_rating_7(self):
        text = u"give it ** 1/2 stars out of four"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<6/10>'], tokens)

    def test_rating_8(self):
        text = u"has been done so many times.. 7 of 10"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<7/10>'], tokens)

    def test_rating_9(self):
        text = u"has been done so many times.. 8 / 10"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<8/10>'], tokens)

    def test_rating_10(self):
        text = u"I give it a 7 star rating"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<7/10>'], tokens)

    def test_rating_11(self):
        text = u"Grade: * out of *****"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<2/10>'], tokens)

    def test_rating_12(self):
        text = u"Final Judgement: **/****"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<5/10>'], tokens)

    def test_rating_13(self):
        text = u'on March 18th, 2007.<br /><br />84/100 (***)'
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<8/10>'], tokens)

    def test_rating_14(self):
        text = u'I give it a full 10.'
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<10/10>'], tokens)

    def test_rating_15(self):
        text = u'I give it a -50 out of 10. MY GOD!!!!'
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<0/10>'], tokens)

    def test_rating_16(self):
        text = u"* * 1/2 / * * * *"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<6/10>'], tokens)

    def test_rating_17(self):
        text = u"i gave this movie a 2 for the actors"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<2/10>'], tokens)

    def test_grade_1(self):
        text = u"can save this boring, Grade B+ western."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B+>'], tokens)

    def test_grade_2(self):
        text = u"can save this boring, Grade B western."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_grade_3(self):
        text = u"My grade: F."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_F>'], tokens)

    def test_grade_4(self):
        text = u"mindless B-grade \"entertainment.\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_decade_1(self):
        text = u"Nice 1950s & 60s \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950', u'60', u'americana'], tokens)

    def test_decade_2(self):
        text = u"Nice 1950s & 60's \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950', u'60', u'americana'], tokens)
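# Both test classes mix in ``SetComparisonMixin`` for the
# ``assertSetContainsSubset`` assertion they use throughout. The mixin is
# defined elsewhere; a minimal sketch of the assumed behavior:

class SetComparisonMixin(object):

    def assertSetContainsSubset(self, expected, actual, msg=None):
        """Assert that every element of ``expected`` occurs in ``actual``."""
        missing = set(expected) - set(actual)
        if missing:
            self.fail(msg or u'%r not found in %r' % (sorted(missing), list(actual)))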