Esempio n. 1
0
 def test_emails(self):
     """Check that replace_email swaps an e-mail address for the token "1".

     Each case pairs an input sentence with the text expected after the
     address (at the end, start, or middle of the sentence) is replaced.
     Every case runs in its own subTest so one failure does not hide the
     others.
     """
     # NOTE(review): the inputs previously contained the literal text
     # "[email protected]" (an e-mail-obfuscation artefact with no "@"),
     # so no case actually held an address. The original addresses are
     # unknown; a reserved example.com address is used instead -- confirm.
     data = [[u"Has a user@example.com", u"Has a 1"],
             [u"user@example.com is a nice site", u"1 is a nice site"],
             [u"super cool user@example.com site", u"super cool 1 site"],
             ]
     for i, (text, truth) in enumerate(data):
         with self.subTest(i=i):
             new_text = replace_email("1", text)
             self.assertEqual(new_text, truth)
Esempio n. 2
0
def word_count(text, lang=None):
    """Return a word count for ``text`` after tags and special tokens are normalised.

    The text is first passed through ``remove_tags``; URLs, e-mails, dates,
    phone numbers and money amounts are then each replaced by a fixed
    placeholder word before counting.

    Args:
        text: The raw text to count.
        lang: Optional language identifier. When it is given and is not in
            ``asian_languages``, only ``word_count_aux`` is used.

    Returns:
        The ``word_count_aux`` result for the cleaned text, or -- when
        ``lang`` is ``None`` or listed in ``asian_languages`` -- the
        ``word_count_aux`` result for the text mapped through
        ``filter_jchars`` plus the sum of ``is_asian`` over every character.
    """
    cleaned, _ = remove_tags(text)

    # Substitute every breakable element with a single placeholder word.
    substitutions = (
        (replace_url, u'url'),
        (replace_email, u'email'),
        (replace_date, u'date'),
        (replace_phone, u'phone'),
        (replace_money, u'money'),
    )
    for substitute, placeholder in substitutions:
        cleaned = substitute(placeholder, cleaned)

    # A known, non-Asian language needs no per-character handling.
    if lang is not None and lang not in asian_languages:
        return word_count_aux(cleaned)

    # Otherwise add the per-character is_asian total to the word count of
    # the text after it has been mapped through filter_jchars.
    asian_total = sum(is_asian(char) for char in cleaned)
    remainder = "".join(filter_jchars(char) for char in cleaned)
    return word_count_aux(remainder) + asian_total