def extract_names(sender):
    """Try to extract the sender's names from a ``From:`` header value.

    The result is not limited to personal names: any alphabetical word of
    the header (company name, parts of the e-mail address, ...) that is
    longer than one character and is not listed in ``BAD_SENDER_NAMES``
    is returned.

    Words are returned without duplicates, in the order in which they
    first appear in the header, so the result is deterministic.

    >>> extract_names('Sergey N. Obukhov <*****@*****.**>')
    ['Sergey', 'Obukhov']
    >>> extract_names('')
    []

    :param sender: raw header value; ``None`` is treated as an empty string.
    :return: list of unique candidate name words.
    """
    # A missing header (None) is handled like an empty one.
    sender = sender or ""
    sender = to_unicode(sender, precise=True)

    # Turn every non-alphabetical character into a separator so that
    # split() below yields purely alphabetical words.
    sender = "".join([char if char.isalpha() else " " for char in sender])

    # Drop too short words and "black"-listed words (BAD_SENDER_NAMES),
    # de-duplicating while keeping first-seen order; a plain set() would
    # return the names in arbitrary order.
    names = []
    seen = set()
    for word in sender.split():
        if len(word) <= 1 or word in BAD_SENDER_NAMES or word in seen:
            continue
        seen.add(word)
        names.append(word)
    return names
def test_unicode():
    """Check that ``u.to_unicode`` returns text and keeps the content intact."""
    eq_(u"hi", u.to_unicode("hi"))

    # Whatever string flavour goes in, a text-type object must come out.
    for sample in ("hi", u"hi", "привет", u"привет"):
        converted = u.to_unicode(sample)
        eq_(type(converted), six.text_type)

    # Non-ASCII content must come back unchanged.
    for sample in ("привет", u"привет"):
        eq_(u"привет", u.to_unicode(sample))

    # Bytes in a single-byte encoding (ISO-8859-2), converted with precise=True.
    encoded = u"Versi\xf3n".encode("iso-8859-2")
    eq_(u"Versión", u.to_unicode(encoded, precise=True))
def test_unicode():
    """Check that ``u.to_unicode`` returns ``str`` and keeps the content intact."""
    # ASCII input: value and type.
    eq_('hi', u.to_unicode('hi'))
    eq_(type(u.to_unicode('hi')), str)

    # Non-ASCII input: type and value.
    eq_(type(u.to_unicode('привет')), str)
    eq_("привет", u.to_unicode('привет'))

    # Single-byte encoded input. The data has to be passed as bytes: a plain
    # str literal is already text, which would make this assertion vacuous.
    # NOTE(review): assumes to_unicode accepts bytes here, as the sibling
    # versions of this test do - confirm against the current implementation.
    eq_("Versión", u.to_unicode('Versi\xf3n'.encode('iso-8859-2'), precise=True))
def test_unicode():
    """Exercise ``u.to_unicode`` on native and unicode string literals."""
    convert = u.to_unicode
    text_type = six.text_type

    # Value of a plain ASCII string.
    eq_(u'hi', convert('hi'))

    # The result type is the text type for every kind of literal.
    eq_(type(convert('hi')), text_type)
    eq_(type(convert(u'hi')), text_type)
    eq_(type(convert('привет')), text_type)
    eq_(type(convert(u'привет')), text_type)

    # Non-ASCII values come back unchanged.
    eq_(u"привет", convert('привет'))
    eq_(u"привет", convert(u'привет'))

    # ISO-8859-2 encoded bytes, converted with precise=True.
    latin_bytes = u'Versi\xf3n'.encode('iso-8859-2')
    eq_(u"Versión", convert(latin_bytes, precise=True))
def test_unicode():
    """Verify the value and the type returned by ``u.to_unicode``."""

    def check_text_type(value):
        # The converted value must be an instance of exactly the text type.
        eq_(type(u.to_unicode(value)), six.text_type)

    eq_(u'hi', u.to_unicode('hi'))

    check_text_type('hi')
    check_text_type(u'hi')
    check_text_type('привет')
    check_text_type(u'привет')

    expected = u"привет"
    eq_(expected, u.to_unicode('привет'))
    eq_(expected, u.to_unicode(u'привет'))

    # Encoded (ISO-8859-2) input together with precise=True.
    eq_(
        u"Versión",
        u.to_unicode(u'Versi\xf3n'.encode('iso-8859-2'), precise=True),
    )
def test_unicode():
    """Check ``u.to_unicode`` results against the ``unicode`` builtin type."""
    result = u.to_unicode('hi')
    eq_(u'hi', result)

    # The type of the result must be ``unicode`` for each kind of literal.
    result = u.to_unicode('hi')
    eq_(type(result), unicode)
    result = u.to_unicode(u'hi')
    eq_(type(result), unicode)
    result = u.to_unicode('привет')
    eq_(type(result), unicode)
    result = u.to_unicode(u'привет')
    eq_(type(result), unicode)

    # Non-ASCII content must come back unchanged.
    result = u.to_unicode('привет')
    eq_(u"привет", result)
    result = u.to_unicode(u'привет')
    eq_(u"привет", result)

    # Literal with a \xf3 escape, converted with precise=True.
    # NOTE(review): presumably a byte string here (Python 2, no
    # unicode_literals import visible) - confirm.
    result = u.to_unicode('Versi\xf3n', precise=True)
    eq_(u"Versión", result)
def capitalized_words_percent(s):
    """Return the percentage of capitalized words in ``s``.

    The text is converted with ``to_unicode(s, precise=True)`` and split on
    whitespace. Words whose beginning matches ``INVALID_WORD_START`` are
    ignored; among the remaining ("valid") words, those starting with an
    uppercase character are counted as capitalized.

    :param s: text to analyse.
    :return: ``100 * capitalized / valid`` as a float, or ``0`` when there
        are no valid words or the text contains at most one word.
    """
    s = to_unicode(s, precise=True)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Empty / whitespace-only chunks produced by the split are dropped.
    words = [w for w in re.split(r'\s', s) if w.strip()]

    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            # `word` is never empty here, so indexing is safe.
            if word[0].isupper():
                capitalized_words_counter += 1

    # A single word is not enough to judge capitalization.
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
    return 0
def capitalized_words_percent(s):
    """Return the percentage of capitalized words in ``s``.

    The text is converted with ``to_unicode`` and split on whitespace.
    Words whose beginning matches ``INVALID_WORD_START`` are ignored; among
    the remaining ("valid") words, those starting with an uppercase
    character are counted as capitalized.

    :param s: text to analyse.
    :return: ``100 * capitalized / valid`` as a float, or ``0`` when there
        are no valid words or the text contains at most one word.
    """
    s = to_unicode(s)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    # Empty / whitespace-only chunks produced by the split are dropped.
    words = [w for w in re.split(r'\s', s) if w.strip()]

    capitalized_words_counter = 0
    valid_words_counter = 0
    for word in words:
        if not INVALID_WORD_START.match(word):
            valid_words_counter += 1
            # `word` is never empty here, so indexing is safe.
            if word[0].isupper():
                capitalized_words_counter += 1

    # A single word is not enough to judge capitalization.
    if valid_words_counter > 0 and len(words) > 1:
        return 100 * float(capitalized_words_counter) / valid_words_counter
    return 0
def categories_percent(s, categories):
    """Return the percentage of characters of ``s`` in the given categories.

    ``s`` is first converted with ``to_unicode(s, precise=True)``; a
    character counts when its ``unicodedata.category`` value is contained
    in ``categories``. Empty text yields ``0``.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    text = to_unicode(s, precise=True)
    total = len(text)
    if not total:
        # Nothing to measure; also avoids a division by zero.
        return 0
    matching = sum(
        1 for char in text if unicodedata.category(char) in categories
    )
    return 100 * float(matching) / total
def categories_percent(s, categories):
    '''Returns category characters percent.

    The string is converted with ``to_unicode`` and every character whose
    ``unicodedata.category`` is found in ``categories`` is counted; the
    count is reported relative to the length of the converted string.
    For an empty string the result is ``0``.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    '''
    text = to_unicode(s)
    # Keep only the characters that belong to one of the categories.
    hits = [ch for ch in text if unicodedata.category(ch) in categories]
    size = len(text)
    if size:
        return 100 * float(len(hits)) / size
    return 0
def extract_names(sender):
    """Try to extract the sender's names from a ``From:`` header value.

    The result is not limited to personal names: any alphabetical word of
    the header (company name, parts of the e-mail address, ...) that is
    longer than one character and is not listed in ``BAD_SENDER_NAMES``
    is returned.

    Words are returned without duplicates, in the order in which they
    first appear in the header, so the result is deterministic.

    >>> extract_names('Sergey N. Obukhov <*****@*****.**>')
    ['Sergey', 'Obukhov']
    >>> extract_names('')
    []

    :param sender: raw header value; ``None`` is treated as an empty string.
    :return: list of unique candidate name words.
    """
    # A missing header (None) is handled like an empty one.
    sender = sender or ''
    sender = to_unicode(sender)

    # Turn every non-alphabetical character into a separator so that
    # split() below yields purely alphabetical words.
    sender = "".join([char if char.isalpha() else ' ' for char in sender])

    # Drop too short words and "black"-listed words (BAD_SENDER_NAMES),
    # de-duplicating while keeping first-seen order; a plain set() would
    # return the names in arbitrary order.
    names = []
    seen = set()
    for word in sender.split():
        if len(word) <= 1 or word in BAD_SENDER_NAMES or word in seen:
            continue
        seen.add(word)
        names.append(word)
    return names