def test_eastern_emoticons(self):
    """Eastern-style emoticons survive tokenization and every token is tagged EMOTIC."""
    # NOTE(review): another `test_eastern_emoticons` exists in this file; Python keeps
    # only the last definition in a class body — consider renaming one of them.
    text = u"*.* (^_^) *_* *-* +_+ ~_~ -.- -__- -___- t_t q_q ;_; t.t q.q ;.;"
    tokens = self.tokenize(text)
    # Drop angle-bracketed marker tokens; the remainder must reproduce the input verbatim.
    reconstructed = u' '.join(
        token for token in tokens
        if not (token.startswith(u"<") and token.endswith(u">")))
    self.assertEqual(text, reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))
def test_russian_emoticons(self):
    """Russian-style parenthesis emoticons are detected; long runs are truncated."""
    # NOTE(review): another `test_russian_emoticons` exists in this file; Python keeps
    # only the last definition in a class body — consider renaming one of them.
    text = u"haha! ))))) )) how sad (("
    tokens = self.tokenize(text)
    reconstructed = u' '.join(tokens)
    # The five-char run ")))))" is normalized to ")))" by the tokenizer.
    self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    # 4 non-emoticon tokens: "haha", "!", "how", "sad".
    self.assertEqual(len(tokens) - 4, count_prefix(u"EMOTIC", group_names))
def test_eastern_emoticons(self):
    """Six eastern-style emoticons are each detected as an EMOTIC group."""
    # NOTE(review): another `test_eastern_emoticons` exists in this file; Python keeps
    # only the last definition in a class body — consider renaming one of them.
    text = u"*.* (^_^) *_* *-* +_+ ~_~"
    tokens = self.tokenize(text)
    # Drop angle-bracketed marker tokens; the remainder must reproduce the input verbatim.
    reconstructed = u' '.join(
        token for token in tokens
        if not (token.startswith(u"<") and token.endswith(u">")))
    self.assertEqual(text, reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    self.assertEqual(6, count_prefix(u"EMOTIC", group_names))
def test_no_emoticon(self):
    """No emoticon should be detected in this text."""
    # Parentheses adjacent to digits/words or surrounded by spaces must NOT
    # be mistaken for emoticons.
    text = (u"(8) such is the game): - (7 or 8) and also (8 inches)"
            u" and spaces next to parentheses ( space ) .")
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    self.assertEqual(0, count_prefix(u"EMOTIC", group_names))
def test_russian_emoticons(self):
    """Exactly three Russian-style parenthesis emoticons are detected."""
    # NOTE(review): another `test_russian_emoticons` exists in this file; Python keeps
    # only the last definition in a class body — consider renaming one of them.
    text = u"haha! ))))) )) how sad (("
    tokens = self.tokenize(text)
    reconstructed = u' '.join(tokens)
    # The five-char run ")))))" is normalized to ")))" by the tokenizer.
    self.assertEqual(u'haha ! ))) )) how sad ((', reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    self.assertEqual(3, count_prefix(u"EMOTIC", group_names))
def test_western_emoticons_happy(self):
    """With custom features removed, this text should be idempotent on tokenization."""
    # NOTE(review): another `test_western_emoticons_happy` exists in this file; Python
    # keeps only the last definition in a class body — consider renaming one of them.
    text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
    tokens = self.tokenize(text)
    # Strip <EMOTIC...> markers; the remaining tokens must reproduce the lowercased input.
    reconstructed = u' '.join(
        token for token in tokens if not token.startswith(u"<EMOTIC"))
    self.assertEqual(text.lower(), reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))
def test_western_emoticons_happy(self):
    """With custom features removed, this text should be idempotent on tokenization."""
    # NOTE(review): another `test_western_emoticons_happy` exists in this file; Python
    # keeps only the last definition in a class body — consider renaming one of them.
    text = u":-) :) =) =)) :=) >:) :] :') :^) (: [: ((= (= (=: :-p :D :o"
    tokens = self.tokenize(text)
    # Strip <EMOTIC...> markers; the remaining tokens must reproduce the lowercased input.
    reconstructed = u' '.join(
        token for token in tokens if not token.startswith(u"<EMOTIC"))
    self.assertEqual(text.lower(), reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    # 17 emoticons, each producing two tokens (emoticon + marker) = 34.
    self.assertEqual(34, count_prefix(u"EMOTIC", group_names))
def test_western_emoticons_sad(self):
    """With custom features removed, this text should be idempotent on tokenization."""
    # NOTE(review): another `test_western_emoticons_sad` exists in this file; Python
    # keeps only the last definition in a class body — consider renaming one of them.
    text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O :@ D:"
    tokens = self.tokenize(text)
    # Strip <EMOTIC...> markers; the remaining tokens must reproduce the lowercased input.
    reconstructed = u' '.join(
        token for token in tokens if not token.startswith(u"<EMOTIC"))
    self.assertEqual(text.lower(), reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    self.assertEqual(len(tokens), count_prefix(u"EMOTIC", group_names))
def test_western_emoticons_sad(self):
    """With custom features removed, this text should be idempotent on tokenization."""
    # NOTE(review): another `test_western_emoticons_sad` exists in this file; Python
    # keeps only the last definition in a class body — consider renaming one of them.
    text = u":-( :( =( =(( :=( >:( :[ :'( :^( ): ]: ))= )= )=: :-c :C :O"
    tokens = self.tokenize(text)
    # Strip <EMOTIC...> markers; the remaining tokens must reproduce the lowercased input.
    reconstructed = u' '.join(
        token for token in tokens if not token.startswith(u"<EMOTIC"))
    self.assertEqual(text.lower(), reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    # 17 emoticons, each producing two tokens (emoticon + marker) = 34.
    self.assertEqual(34, count_prefix(u"EMOTIC", group_names))
def test_hearts(self):
    """With custom features removed, this text should be idempotent on tokenization."""
    # NOTE(review): another `test_hearts` exists in this file; Python keeps only the
    # last definition in a class body — consider renaming one of them.
    text = u"<3 full heart </3 heartbreak"
    tokens = self.tokenize(text)
    # Strip <EMOTIC...> markers; the remaining tokens must reproduce the lowercased input.
    reconstructed = u' '.join(
        token for token in tokens if not token.startswith(u"<EMOTIC"))
    self.assertEqual(text.lower(), reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    # Both heart emoticons and their semantic markers must appear in the token stream.
    self.assertSetContainsSubset(
        [u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'], tokens)
    # 3 non-emoticon tokens: "full", "heart", "heartbreak".
    self.assertEqual(len(tokens) - 3, count_prefix(u"EMOTIC", group_names))
def test_hearts(self):
    """With custom features removed, this text should be idempotent on tokenization."""
    # NOTE(review): another `test_hearts` exists in this file; Python keeps only the
    # last definition in a class body — consider renaming one of them.
    text = u"<3 full heart </3 heartbreak"
    tokens = self.tokenize(text)
    # Strip <EMOTIC...> markers; the remaining tokens must reproduce the lowercased input.
    reconstructed = u' '.join(
        token for token in tokens if not token.startswith(u"<EMOTIC"))
    self.assertEqual(text.lower(), reconstructed)
    # pair[1] is the regex match object. Indexing each pair directly replaces
    # zip(*...)[1], which breaks on Python 3 (zip objects are not subscriptable).
    group_names = [pair[1].lastgroup
                   for pair in self.base_tokenizer.tokenize(text)]
    # Both heart emoticons and their semantic markers must appear in the token stream.
    self.assertSetContainsSubset(
        [u'<3', u'<EMOTIC_HEART_HAPPY>', u'</3', u'<EMOTIC_HEART_SAD>'], tokens)
    # 2 heart emoticons, each producing two tokens (emoticon + marker) = 4.
    self.assertEqual(4, count_prefix(u"EMOTIC", group_names))