Example 1
 def test_eastern_emoticons(self):
     """Eastern-style emoticons tokenize losslessly and every token is tagged EMOTIC*."""
     text = u"*.* (^_^) *_* *-* +_+ ~_~ -.- -__- -___- t_t q_q ;_; t.t q.q ;.;"
     tokens = self.tokenize(text)
     # Joining all non-feature tokens (those not wrapped in <...>) must
     # reproduce the input exactly: tokenization is lossless.
     reconstructed = u' '.join(token for token in tokens if not (token.startswith(u"<") and token.endswith(u">")))
     self.assertEqual(text, reconstructed)
     # base_tokenizer.tokenize yields (token, match) pairs; transpose and take
     # the match column.  list(...) is required on Python 3, where zip()
     # returns a non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # Every single token should have matched an EMOTIC*-named regex group.
     self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))
Example 2
 def test_russian_emoticons(self):
     """Russian-style parenthesis emoticons are detected and long runs are normalized."""
     text = u"haha! ))))) )) how sad (("
     tokens = self.tokenize(text)
     reconstructed = u' '.join(tokens)
     # The five-paren run ')))))' is normalized to ')))' by the tokenizer;
     # the shorter runs pass through unchanged.
     self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # All tokens except the 4 words ('haha', '!', 'how', 'sad') are emoticons.
     self.assertEqual(len(tokens) - 4, count_prefix(u"EMOTIC", group_names))
Example 3
 def test_eastern_emoticons(self):
     """Eastern-style emoticons tokenize losslessly; all six are tagged EMOTIC*."""
     text = u"*.* (^_^) *_* *-* +_+ ~_~"
     tokens = self.tokenize(text)
     # Joining all non-feature tokens (those not wrapped in <...>) must
     # reproduce the input exactly: tokenization is lossless.
     reconstructed = u' '.join(token for token in tokens if not (token.startswith(u"<") and token.endswith(u">")))
     self.assertEqual(text, reconstructed)
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # Exactly the six emoticons in the input should match EMOTIC* groups.
     self.assertEqual(6, count_prefix(u"EMOTIC", group_names))
Example 4
 def test_no_emoticon(self):
     """No emoticon should be detected in this text
     """
     # Deliberately tricky punctuation that resembles emoticons: '(8)',
     # 'game):', '(7 or 8)', '(8 inches)', and spaced parentheses.
     text = u"(8) such is the game): -  (7 or 8) and also (8 inches)" \
         u" and spaces next to parentheses ( space ) ."
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # None of these tokens should match an EMOTIC*-named group.
     self.assertEqual(0, count_prefix(u"EMOTIC", group_names))
Example 5
 def test_no_emoticon(self):
     """No emoticon should be detected in this text
     """
     # Deliberately tricky punctuation that resembles emoticons: '(8)',
     # 'game):', '(7 or 8)', '(8 inches)', and spaced parentheses.
     text = u"(8) such is the game): -  (7 or 8) and also (8 inches)" \
         u" and spaces next to parentheses ( space ) ."
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # None of these tokens should match an EMOTIC*-named group.
     self.assertEqual(0, count_prefix(u"EMOTIC", group_names))
Example 6
 def test_russian_emoticons(self):
     """Russian-style parenthesis emoticons: exactly the three paren runs are tagged."""
     text = u"haha! ))))) )) how sad (("
     tokens = self.tokenize(text)
     reconstructed = u' '.join(tokens)
     # The five-paren run ')))))' is normalized to ')))' by the tokenizer;
     # the shorter runs pass through unchanged.
     self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # Three emoticon tokens: ')))', '))' and '(('.
     self.assertEqual(3, count_prefix(u"EMOTIC", group_names))
Example 7
 def test_western_emoticons_happy(self):
     """With custom features removed, this text should be idempotent on tokenization
     """
     text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
     tokens = self.tokenize(text)
     # Drop the injected <EMOTIC...> feature tokens before rejoining.
     reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
     # Compared against text.lower() because the tokenizer lowercases
     # (e.g. ':D' -> ':d').
     self.assertEqual(text.lower(), reconstructed)
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # Every token should have matched an EMOTIC*-named regex group.
     self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))
Example 8
 def test_western_emoticons_happy(self):
     """With custom features removed, this text should be idempotent on tokenization
     """
     text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
     tokens = self.tokenize(text)
     # Drop the injected <EMOTIC...> feature tokens before rejoining.
     reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
     # Compared against text.lower() because the tokenizer lowercases
     # (e.g. ':D' -> ':d').
     self.assertEqual(text.lower(), reconstructed)
     # list(...) is required on Python 3, where zip() returns a
     # non-subscriptable iterator (behavior-identical on Python 2).
     group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
     # 34 = 17 emoticons x 2 tokens each — presumably each emoticon yields
     # the surface token plus an <EMOTIC...> feature token; verify.
     self.assertEqual(34, count_prefix(u"EMOTIC", group_names))
Example 9
    def test_western_emoticons_sad(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O :@ D:"
        tokens = self.tokenize(text)
        # Drop the injected <EMOTIC...> feature tokens before rejoining.
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        # Compared against text.lower() because the tokenizer lowercases
        # (e.g. ':C' -> ':c').
        self.assertEqual(text.lower(), reconstructed)

        # list(...) is required on Python 3, where zip() returns a
        # non-subscriptable iterator (behavior-identical on Python 2).
        group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
        # Every token should have matched an EMOTIC*-named regex group.
        self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))
Example 10
    def test_western_emoticons_sad(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O"
        tokens = self.tokenize(text)
        # Drop the injected <EMOTIC...> feature tokens before rejoining.
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        # Compared against text.lower() because the tokenizer lowercases
        # (e.g. ':C' -> ':c').
        self.assertEqual(text.lower(), reconstructed)

        # list(...) is required on Python 3, where zip() returns a
        # non-subscriptable iterator (behavior-identical on Python 2).
        group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
        # 34 = 17 emoticons x 2 tokens each — presumably each emoticon yields
        # the surface token plus an <EMOTIC...> feature token; verify.
        self.assertEqual(34, count_prefix(u"EMOTIC", group_names))
Example 11
    def test_hearts(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u"<3 full heart </3 heartbreak"
        tokens = self.tokenize(text)
        # Drop the injected <EMOTIC...> feature tokens before rejoining.
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)

        # list(...) is required on Python 3, where zip() returns a
        # non-subscriptable iterator (behavior-identical on Python 2).
        group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
        # NOTE(review): assertSetContainsSubset is not a stdlib unittest
        # method — presumably a helper on the test base class; verify.
        self.assertSetContainsSubset([u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'],
                                     tokens)
        # All tokens except the 3 plain words are emoticon-derived.
        self.assertEqual(len(tokens) - 3, count_prefix(u"EMOTIC", group_names))
Example 12
    def test_hearts(self):
        """With custom features removed, this text should be idempotent on tokenization
        """
        text = u"<3 full heart </3 heartbreak"
        tokens = self.tokenize(text)
        # Drop the injected <EMOTIC...> feature tokens before rejoining.
        reconstructed = u' '.join(token for token in tokens if not token.startswith(u"<EMOTIC"))
        self.assertEqual(text.lower(), reconstructed)

        # list(...) is required on Python 3, where zip() returns a
        # non-subscriptable iterator (behavior-identical on Python 2).
        group_names = [m.lastgroup for m in list(zip(*self.base_tokenizer.tokenize(text)))[1]]
        # NOTE(review): assertSetContainsSubset is not a stdlib unittest
        # method — presumably a helper on the test base class; verify.
        self.assertSetContainsSubset([u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'],
                                     tokens)
        # Four EMOTIC-matched entries expected (two hearts, each contributing
        # a surface token and a feature token — presumably; verify).
        self.assertEqual(4, count_prefix(u"EMOTIC", group_names))