Example #1
    def __init__(self, lemmatizer=None, stemmer=None, url_parser=None,
                 unicode_form='NFKC', nltk_stop_words="english",
                 sentence_tokenizer=('nltk_data', 'tokenizers/punkt/english.pickle'),
                 max_char_repeats=3, lru_cache_size=50000, translate_map_inv=None,
                 replace_map=None, html_renderer='default', add_abbrev_types=None,
                 del_sent_starters=None):
        self._unicode_normalize = partial(unicodedata.normalize, unicode_form)
        self._replace_inplace = InPlaceReplacer(replace_map).replace \
            if replace_map else lambda x: x
        self._tokenize = RegexpFeatureTokenizer().tokenize
        self._stopwords = frozenset(stopwords.words(nltk_stop_words))
        self._url_parser = url_parser

        self._sentence_tokenizer, self._sentence_tokenize = \
            self.load_sent_tokenizer(sentence_tokenizer, add_abbrev_types, del_sent_starters)

        self.sentence_tokenizer = None
        self._lemmatize = lru_wrap(lemmatizer.lemmatize, lru_cache_size) if lemmatizer else None
        self._stem = stemmer.stem if stemmer else None
        self._pos_tag = pos_tag
        self._replace_char_repeats = \
            RepeatReplacer(max_repeats=max_char_repeats).replace \
            if max_char_repeats > 0 else self._identity

        # translation of Unicode characters
        translator = Translator(EXTRA_TRANSLATE_MAP, translated=True)
        translator.add_inverse_map(translate_map_inv, translated=False)
        self._replace_chars = translator.replace

        if html_renderer is None:
            self.strip_html = lambda x: x
        elif html_renderer == u'default':
            self.strip_html = HTMLCleaner().clean
        elif html_renderer == u'beautifulsoup':
            self.strip_html = strip_html_bs
        else:
            raise ValueError('Invalid parameter value given for `html_renderer`')

        # tokenize a dummy string because the lemmatizer and other tools can take
        # a while to initialize, which would otherwise skew performance measurements
        self.tokenize(u"dummy string")
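A minimal usage sketch for the constructor above (the enclosing class name, `FeatureTokenizer`, is an assumption since the excerpt omits it; the lemmatizer and stemmer are the standard NLTK ones):

# Hedged usage sketch -- FeatureTokenizer stands in for the class this __init__ belongs to.
from nltk.stem import WordNetLemmatizer, PorterStemmer

tokenizer = FeatureTokenizer(
    lemmatizer=WordNetLemmatizer(),   # provides .lemmatize(word)
    stemmer=PorterStemmer(),          # provides .stem(word)
    max_char_repeats=3,               # collapse character runs like "soooooo"
    html_renderer='default',          # strip HTML with the built-in HTMLCleaner
)
tokens = tokenizer.tokenize(u"I <3 this movie... 10/10")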
Example #2
    def setUp(self):
        TOKENIZER = tokenizer_builder(features=FEATURES)
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(features=FEATURES, debug=True)
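`tokenizer_builder` and `FEATURES` come from the surrounding test module and are not part of this listing; from the calls here, the factory returns an object exposing `preprocess`, `tokenize(text, remove_stopwords=...)`, and `sentence_tokenize`. A rough sketch of such a factory (the wiring and the `FeatureTokenizer` name are assumptions):

# Assumed factory sketch -- the real tokenizer_builder lives in the project under test.
from nltk.stem import WordNetLemmatizer, PorterStemmer

def tokenizer_builder(features=None, **kwargs):
    # One shared, fully configured tokenizer per test class; forwarding of
    # `features` to the underlying RegexpFeatureTokenizer is omitted in this sketch.
    return FeatureTokenizer(
        lemmatizer=WordNetLemmatizer(),
        stemmer=PorterStemmer(),
        **kwargs)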
Example #3
class TestTwitterTokens(unittest.TestCase, SetComparisonMixin):

    maxDiff = 2000

    def setUp(self):
        TOKENIZER = tokenizer_builder(features=FEATURES)
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(features=FEATURES, debug=True)

    def test_preprocess(self):
        text = u"wow \u2014 such \u2013 doge"
        preprocessed = self.tokenizer.preprocess(text)
        self.assertEqual(u'wow --- such -- doge', preprocessed)

    def test_dashes(self):
        text = u"wow \u2014 such \u2013 doge -- and --- are dashes"
        counts = Counter(self.tokenize(text))
        self.assertEqual(2, counts[u'--'])
        self.assertEqual(2, counts[u'---'])

    def test_censored(self):
        text = u"she's a b*tch in a f***d world"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'she', u"'s", u'b*tch', u'f***d'], tokens)

    def test_sentence_split_ellipsis(self):
        """
        Make sure there is a sentence break after ellipsis

        Note: The sentence splitter we use does not treat ellipsis as a
        sentence terminator if the word after it is not capitalized.
        """
        text = u"I had a feeling that after \"Submerged\", this one wouldn't " \
               u"be any better... I was right."
        sentences = self.sentence_tokenize(text)
        self.assertEqual(2, len(sentences))

    def test_sentence_split_br(self):
        """
        Make sure there is a sentence break before "O.K."
        """
        text = u'Memorable lines like: "You son-of-a-gun!", "You son-of-a-witch!",' \
               u' "Shoot!", and "Well, Forget You!"<br /><br />O.K. Bye.'
        sentences = self.sentence_tokenize(text)
        joint = u' | '.join([u' '.join(sentence) for sentence in sentences])
        self.assertIn(u' | o', joint)

    def test_western_emoticons_happy(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))

    def test_western_emoticons_sad(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O :@ D:"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)

        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))

    def test_western_emoticons_misc(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":0 :l :s :x \o/ \m/"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u':0', u':l', u':s', u':x', u'\o/', u'\m/'], tokens)

    def test_hearts(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u"<3 full heart </3 heartbreak"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)

        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertSetContainsSubset([u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'],
                                     tokens)
        self.assertEqual(len(tokens) - 3, count_prefix(u"EMOTIC", group_names))

    def test_no_emoticon(self):
        """No emoticon should be detected in this text
        """
        text = u"(8) such is the game): -  (7 or 8) and also (8 inches)" \
            u" and spaces next to parentheses ( space ) ."
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(0, count_prefix(u"EMOTIC", group_names))

    def test_eastern_emoticons(self):
        text = u"*.* (^_^) *_* *-* +_+ ~_~ -.- -__- -___- t_t q_q ;_; t.t q.q ;.;"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not (token.startswith(u"<") and token.endswith(u">")))
        self.assertEqual(text, reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))

    def test_russian_emoticons(self):
        text = u"haha! ))))) )) how sad (("
        tokens = self.tokenize(text)
        reconstructed = u' '.join(tokens)
        self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(len(tokens) - 4, count_prefix(u"EMOTIC", group_names))

    def test_ascii_arrow(self):
        text = u"Look here -->> such doge <<<"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            {'<ASCIIARROW_R>', '<ASCIIARROW_L>'}, tokens)

    def test_abbrev(self):
        text = u"S&P index of X-men in the U.S."
        tokens = self.tokenize(text)
        self.assertListEqual(
            [u's&p', u'index', u'of', u'x-men', u'in', u'the', u'u.s.'],
            tokens)

    def test_url_email(self):
        text = u"a dummy comment with http://www.google.com/ and [email protected]"
        tokens = self.tokenize(text)
        self.assertListEqual(
            [u'a', u'dummy', u'comment', u'with', u'<URI>', u'and', u'<EMAIL>'],
            tokens)

    def test_contraction(self):
        text = u"Daniel's life isn't great"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'daniel', u"'s", u'be', u"n't"], tokens)

    def test_contraction_lookalike(self):
        text = u"abr'acad'a'bra"
        tokens = self.tokenize(text)
        self.assertEqual(text, u"'".join(tokens))

    def test_special_3d(self):
        text = u"3-d (3D) effect"
        tokens = self.tokenize(text)
        self.assertListEqual([u"<3D>", u"<3D>", u"effect"], tokens)

    def test_grade_1(self):
        text = u"can save this boring, Grade B+ western."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B+>'], tokens)

    def test_grade_2(self):
        text = u"can save this boring, Grade B western."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_grade_3(self):
        text = u"My grade: F."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_F>'], tokens)

    def test_grade_4(self):
        text = u"mindless B-grade \"entertainment.\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_decade_1(self):
        text = u"Nice 1950s & 60s \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950s', u'60s', u'americana'], tokens)

    def test_decade_2(self):
        text = u"Nice 1950s & 60's \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950s', u'60s', u'americana'], tokens)

    def test_mention(self):
        text = u"@RayFranco is answering to @AnPel, this is a real '@username83' " \
               u"but this is [email protected], and this is a @probablyfaketwitterusername"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(4, token_counts['<MENTION>'])
        self.assertEqual(1, token_counts['<EMAIL>'])

    def test_emphasis_star(self):
        text = u"@hypnotic I know  *cries*"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<EMPHASIS_B>', u'cry'], tokens)

    def test_emphasis_underscore(self):
        text = u"I _hate_ sunblock"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<EMPHASIS_U>', u'hate'], tokens)

    def test_unescape(self):
        text = u"@artmeanslove I &lt;3 that book"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<3', u'<EMOTIC_HEART_HAPPY>'], tokens)

    def test_kisses(self):
        text = u"ohh lovely  x hehe x count naked people hehe that's " \
            u"what you always tell me to do hehe x x x night night xxxxx"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(2, token_counts[u'<XX>'])

    def test_kisses_hugs_1(self):
        text = u"all right, xo xo vou mimi now xoxo"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(2, token_counts[u'<XOXO>'])

    def test_kisses_hugs_2(self):
        text = u"Night world xoxx, kisses xox"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(2, token_counts[u'<XOXO>'])

    def test_timeofday(self):
        text = u"its okay its only 10.00 for us perth kiddos"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(1, token_counts[u'<TIMEOFDAY>'])

    def test_timeofday_neg(self):
        text = u"but i couldnt find my visa. Shipping Sweden $22.22"
        token_counts = Counter(self.tokenize(text))
        self.assertEqual(0, token_counts[u'<TIMEOFDAY>'])
        self.assertEqual(1, token_counts[u'$'])
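The tests above rely on two helpers that are not shown in this listing: `count_prefix`, which counts how many regex group names begin with a given prefix, and `SetComparisonMixin.assertSetContainsSubset`, which checks that every expected token appears in the result. A minimal sketch consistent with how they are called (implementations are assumptions):

# Assumed helper implementations, inferred from how the tests call them.
class SetComparisonMixin(object):
    def assertSetContainsSubset(self, expected, actual, msg=None):
        # Every expected item must be present in the actual collection.
        missing = set(expected) - set(actual)
        self.assertFalse(missing, msg or u"missing items: %r" % (missing,))

def count_prefix(prefix, names):
    # Count entries (e.g. regex group names) that start with the given prefix.
    return sum(1 for name in names if name is not None and name.startswith(prefix))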
Example #4
class TestFeatureTokens(unittest.TestCase, SetComparisonMixin):

    maxDiff = 2000

    def setUp(self):
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(debug=True)

    def test_preprocess(self):
        text = u"wow \u2014 such \u2013 doge"
        preprocessed = self.tokenizer.preprocess(text)
        self.assertEqual(u'wow --- such -- doge', preprocessed)

    def test_dashes(self):
        text = u"wow \u2014 such \u2013 doge -- and --- are dashes"
        counts = Counter(self.tokenize(text))
        self.assertEqual(2, counts[u'--'])
        self.assertEqual(2, counts[u'---'])

    def test_censored(self):
        text = u"she's a b*tch in a f***d world"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'she', u"'s", u'b*tch', u'f***d'], tokens)

    def test_sentence_split_ellipsis(self):
        """
        The sentence splitter we use does not treat ellipsis as a sentence terminator
        if the word after it is not capitalized
        """
        text = u"I had a feeling that after \"Submerged\", this one wouldn't be any better... I was right."
        self.assertEqual(2, len(self.sentence_tokenize(text)))

    def test_sentence_split_br(self):
        """
        Make sure HTML <BR> tag is recognized as newline/break. In this case,
        make sure that there is a sentence break before "O.K."
        """
        text = u'Memorable lines like: "You son-of-a-gun!", "You son-of-a-witch!",' \
            u' "Shoot!", and "Well, Forget You!"<br /><br />O.K. Bye.'
        disjoint = self.sentence_tokenize(text)
        joint = u' | '.join([u' '.join(sent) for sent in disjoint])
        self.assertIn(u' | o', joint)

    def test_western_emoticons_happy(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(34, count_prefix(u"EMOTIC", group_names))

    def test_western_emoticons_sad(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)

        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(34, count_prefix(u"EMOTIC", group_names))

    def test_hearts(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u"<3 full heart </3 heartbreak"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)

        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertSetContainsSubset([u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'],
                                     tokens)
        self.assertEqual(4, count_prefix(u"EMOTIC", group_names))

    def test_no_emoticon(self):
        """No emoticon should be detected in this text
        """
        text = u"(8) such is the game): -  (7 or 8) and also (8 inches)" \
            u" and spaces next to parentheses ( space ) ."
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(0, count_prefix(u"EMOTIC", group_names))

    def test_eastern_emoticons(self):
        text = u"*.* (^_^) *_* *-* +_+ ~_~"
        tokens = self.tokenize(text)
        reconstructed = u' '.join(token for token in tokens if not (token.startswith(u"<") and token.endswith(u">")))
        self.assertEqual(text, reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(6, count_prefix(u"EMOTIC", group_names))

    def test_russian_emoticons(self):
        text = u"haha! ))))) )) how sad (("
        tokens = self.tokenize(text)
        reconstructed = u' '.join(tokens)
        self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
        group_names = [m.lastgroup for m in zip(*self.base_tokenizer.tokenize(text))[1]]
        self.assertEqual(3, count_prefix(u"EMOTIC", group_names))

    def test_ascii_arrow(self):
        text = u"Look here -->> such doge <<<"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            {'<ASCIIARROW_RIGHT>', '<ASCIIARROW_LEFT>'}, tokens)

    def test_abbrev(self):
        text = u"S&P index of X-men in the U.S."
        tokens = self.tokenize(text)
        self.assertListEqual(
            [u's&p', u'index', u'of', u'x-men', u'in', u'the', u'u.s.'],
            tokens)

    def test_url_email(self):
        text = u"a dummy comment with http://www.google.com/ and [email protected]"
        tokens = self.tokenize(text)
        self.assertListEqual(
            [u'a', u'dummy', u'comment', u'with', u'<URI>', u'and', u'<EMAIL>'],
            tokens)

    def test_contraction(self):
        text = u"Daniel's life isn't great"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'daniel', u"'s", u'be', u"n't"], tokens)

    def test_contraction_lookalike(self):
        text = u"abr'acad'a'bra"
        tokens = self.tokenize(text)
        self.assertEqual(text, u"'".join(tokens))

    def test_special_3d(self):
        text = u"3-d (3D) effect"
        tokens = self.tokenize(text)
        self.assertListEqual([u"<3D>", u"<3D>", u"effect"], tokens)

    def test_rating_false_0(self):
        text = u"I re-lived 1939/40 and my own evacuation from London"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'40'], tokens)

    def test_rating_false_1(self):
        text = u"Update: 9/4/07-I've now read Breaking Free"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<DATE>'], tokens)

    def test_rating_false_2(self):
        text = u"the humility of a 10 year old in cooking class"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'10'], tokens)

    def test_rating_0(self):
        text = u"My rating: 8.75/10----While most of this show is good"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<9/10>'], tokens)

    def test_rating_1(self):
        text = u"which deserves 11 out of 10,"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<11/10>'], tokens)

    def test_rating_2(self):
        text = u"I give this film 10 stars out of 10."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<10/10>'], tokens)

    def test_rating_3(self):
        text = u"A must-see for fans of Japanese horror.10 out of 10."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<10/10>'], tokens)

    def test_rating_4(self):
        text = u"a decent script.<br /><br />3/10"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<3/10>'], tokens)

    def test_rating_5(self):
        text = u"give it five stars out of ten"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<5/10>'], tokens)

    def test_rating_6(self):
        text = u"give it 3 1/2 stars out of five"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<7/10>'], tokens)

    def test_rating_7(self):
        text = u"give it ** 1/2 stars out of four"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<6/10>'], tokens)

    def test_rating_8(self):
        text = u"has been done so many times.. 7 of 10"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<7/10>'], tokens)

    def test_rating_9(self):
        text = u"has been done so many times.. 8 / 10"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<8/10>'], tokens)

    def test_rating_10(self):
        text = u"I give it a 7 star rating"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<7/10>'], tokens)

    def test_rating_11(self):
        text = u"Grade: * out of *****"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<2/10>'], tokens)

    def test_rating_12(self):
        text = u"Final Judgement: **/****"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<5/10>'], tokens)

    def test_rating_13(self):
        text = u'on March 18th, 2007.<br /><br />84/100 (***)'
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<8/10>'], tokens)

    def test_rating_14(self):
        text = u'I give it a full 10.'
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<10/10>'], tokens)

    def test_rating_15(self):
        text = u'I give it a -50 out of 10. MY GOD!!!!'
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<0/10>'], tokens)

    def test_rating_16(self):
        text = u"* * 1/2 / * * * *"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<6/10>'], tokens)

    def test_rating_17(self):
        text = u"i gave this movie a 2 for the actors"
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<2/10>'], tokens)

    def test_grade_1(self):
        text = u"can save this boring, Grade B+ western."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B+>'], tokens)

    def test_grade_2(self):
        text = u"can save this boring, Grade B western."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_grade_3(self):
        text = u"My grade: F."
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_F>'], tokens)

    def test_grade_4(self):
        text = u"mindless B-grade \"entertainment.\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset([u'<GRADE_B>'], tokens)

    def test_decade_1(self):
        text = u"Nice 1950s & 60s \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950', u'60', u'americana'], tokens)

    def test_decade_2(self):
        text = u"Nice 1950s & 60's \"Americana\""
        tokens = self.tokenize(text)
        self.assertSetContainsSubset(
            [u'nice', u'1950', u'60', u'americana'], tokens)
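The rating tests above imply a normalization convention: whatever the source scale (four stars, five stars, ten points, one hundred points), the score is rescaled to tenths and emitted as an <n/10> token, clamped at zero but not at ten (hence <11/10> and <0/10>). For instance, `** 1/2` out of four stars is 2.5/4 = 6.25 tenths, rounded to <6/10>, and 84/100 is 8.4, rounded to <8/10>. A small arithmetic sketch of that conversion (inferred from the expected tokens, not the project's actual code):

# Assumed rating-normalization sketch, inferred from the <n/10> tokens expected above.
def normalize_rating(score, out_of=10.0):
    # Rescale to tenths: 2.5/4 stars -> 6, 3.5/5 stars -> 7, 84/100 -> 8.
    tenths = int(round(10.0 * score / out_of))
    # The tests clamp below zero (<0/10>) but allow overshoot (<11/10>).
    return u"<%d/10>" % max(0, tenths)

normalize_rating(2.5, 4)     # u'<6/10>'  (test_rating_7 / test_rating_16)
normalize_rating(-50, 10)    # u'<0/10>'  (test_rating_15)
normalize_rating(11, 10)     # u'<11/10>' (test_rating_1)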
Example #5
    def setUp(self):
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(debug=True)
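Example #1 memoizes the lemmatizer via `lru_wrap(lemmatizer.lemmatize, lru_cache_size)`, a helper not shown in this listing. Judging by the name and arguments it is a bounded LRU memoizer; a sketch of an equivalent wrapper (an assumption, and on Python 2 a backport such as `functools32` would stand in for `functools.lru_cache`):

# Assumed stand-in for lru_wrap: memoize a function behind a bounded LRU cache.
from functools import lru_cache   # Python 3; use a backport on Python 2

def lru_wrap(func, maxsize):
    if not maxsize:
        return func                # caching disabled when size is zero or None
    return lru_cache(maxsize=maxsize)(func)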