Ejemplo n.º 1
0
def extract_names(sender):
    """Try to extract the sender's names from a `From:` header value.

    Besides actual names, the result may contain other words found in the
    header, e.g. the name of a company or parts of the email address.

    Args:
        sender: raw header value; a falsy value (``None``, ``''``) is
            treated as an empty string.

    Returns:
        A list of the distinct words that are longer than one character
        and not listed in ``BAD_SENDER_NAMES``, in order of first
        appearance.

    >>> extract_names('Sergey N.  Obukhov <*****@*****.**>')
    ['Sergey', 'Obukhov']
    >>> extract_names('')
    []
    """
    # Make sure sender is not None, then convert to unicode
    sender = to_unicode(sender or "", precise=True)

    # Replace every non-alphabetical character with a space so that
    # split() below yields purely alphabetical words.
    letters_only = "".join(
        char if char.isalpha() else " " for char in sender
    )

    names = []
    seen = set()
    for word in letters_only.split():
        # Skip too short words and words from the "black" list, i.e.
        # words like `ru`, `gmail`, `com`, `org`, etc.
        if len(word) <= 1 or word in BAD_SENDER_NAMES:
            continue
        # Drop duplicates while keeping first-seen order, so the result
        # is deterministic (a plain set() would yield arbitrary order).
        if word not in seen:
            seen.add(word)
            names.append(word)
    return names
Ejemplo n.º 2
0
def test_unicode():
    """Check that `u.to_unicode` yields `six.text_type` values with the expected content."""
    eq_(u"hi", u.to_unicode("hi"))

    # Plain and u-prefixed literals must both come back as text.
    for sample in ("hi", u"hi", "привет", u"привет"):
        eq_(type(u.to_unicode(sample)), six.text_type)

    # The Cyrillic content must be unchanged by the conversion.
    for sample in ("привет", u"привет"):
        eq_(u"привет", u.to_unicode(sample))

    # ISO-8859-2 encoded bytes, converted with precise=True
    encoded = u"Versi\xf3n".encode("iso-8859-2")
    eq_(u"Versión", u.to_unicode(encoded, precise=True))
Ejemplo n.º 3
0
def test_unicode():
    """Check that `u.to_unicode` yields `str` values with the expected content."""
    eq_('hi', u.to_unicode('hi'))

    # Each input is listed twice on purpose: every check is performed
    # two times in a row.
    for sample in ('hi', 'hi', 'привет', 'привет'):
        eq_(type(u.to_unicode(sample)), str)

    for sample in ('привет', 'привет'):
        eq_("привет", u.to_unicode(sample))

    # a string containing a non-ASCII Latin letter, converted with precise=True
    accented = 'Versi\xf3n'
    eq_("Versión", u.to_unicode(accented, precise=True))
Ejemplo n.º 4
0
def test_unicode():
    """Check that `u.to_unicode` yields `six.text_type` values with the expected content."""
    eq_(u'hi', u.to_unicode('hi'))

    # Plain and u-prefixed literals must both come back as text.
    for sample in ('hi', u'hi', 'привет', u'привет'):
        eq_(type(u.to_unicode(sample)), six.text_type)

    # The Cyrillic content must be unchanged by the conversion.
    for sample in ('привет', u'привет'):
        eq_(u"привет", u.to_unicode(sample))

    # ISO-8859-2 encoded bytes, converted with precise=True
    encoded = u'Versi\xf3n'.encode('iso-8859-2')
    eq_(u"Versión", u.to_unicode(encoded, precise=True))
Ejemplo n.º 5
0
def test_unicode():
    """Check that `u.to_unicode` yields `six.text_type` values with the expected content."""
    eq_(u'hi', u.to_unicode('hi'))

    # The result type must be text regardless of the literal's prefix.
    type_samples = ('hi', u'hi', 'привет', u'привет')
    for sample in type_samples:
        eq_(type(u.to_unicode(sample)), six.text_type)

    # The Cyrillic content must be unchanged by the conversion.
    content_samples = ('привет', u'привет')
    for sample in content_samples:
        eq_(u"привет", u.to_unicode(sample))

    # ISO-8859-2 encoded bytes, converted with precise=True
    encoded = u'Versi\xf3n'.encode('iso-8859-2')
    eq_(u"Versión", u.to_unicode(encoded, precise=True))
Ejemplo n.º 6
0
def test_unicode():
    """Check that `u.to_unicode` yields `unicode` values with the expected content."""
    eq_(u'hi', u.to_unicode('hi'))

    # Plain and u-prefixed literals must both come back as `unicode`.
    for sample in ('hi', u'hi', 'привет', u'привет'):
        eq_(type(u.to_unicode(sample)), unicode)

    # The Cyrillic content must be unchanged by the conversion.
    for sample in ('привет', u'привет'):
        eq_(u"привет", u.to_unicode(sample))

    # a plain literal with a \xf3 escape, converted with precise=True
    raw = 'Versi\xf3n'
    eq_(u"Versión", u.to_unicode(raw, precise=True))
Ejemplo n.º 7
0
def capitalized_words_percent(s):
    """Return the percentage of valid words that start with an uppercase letter.

    The text is converted with ``to_unicode`` and split on whitespace.
    A word is "valid" when ``INVALID_WORD_START`` does not match it.

    Args:
        s: the text to analyse.

    Returns:
        ``100 * capitalized / valid`` as a float, or ``0`` when there is
        no valid word or the text has fewer than two words.
    """
    s = to_unicode(s, precise=True)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # Splitting on single whitespace characters yields empty strings for
    # consecutive whitespace, hence the filter.
    words = [w for w in re.split(r'\s', s) if w.strip()]
    valid_words = [w for w in words if not INVALID_WORD_START.match(w)]

    # A single word is not enough to say anything about capitalization.
    if not valid_words or len(words) <= 1:
        return 0

    capitalized = sum(1 for w in valid_words if w[0].isupper())
    return 100 * float(capitalized) / len(valid_words)
Ejemplo n.º 8
0
def capitalized_words_percent(s):
    """Return the percentage of valid words that start with an uppercase letter.

    The text is converted with ``to_unicode`` and split on whitespace.
    A word is "valid" when ``INVALID_WORD_START`` does not match it.

    Args:
        s: the text to analyse.

    Returns:
        ``100 * capitalized / valid`` as a float, or ``0`` when there is
        no valid word or the text has fewer than two words.
    """
    s = to_unicode(s)
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # Splitting on single whitespace characters yields empty strings for
    # consecutive whitespace, hence the filter.
    words = [w for w in re.split(r'\s', s) if w.strip()]
    valid_words = [w for w in words if not INVALID_WORD_START.match(w)]

    # A single word is not enough to say anything about capitalization.
    if not valid_words or len(words) <= 1:
        return 0

    capitalized = sum(1 for w in valid_words if w[0].isupper())
    return 100 * float(capitalized) / len(valid_words)
Ejemplo n.º 9
0
def categories_percent(s, categories):
    """Return the percentage of characters whose Unicode category is listed.

    ``s`` is converted with ``to_unicode`` first; every character whose
    ``unicodedata.category`` is found in ``categories`` is counted.
    An empty text gives ``0``.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    text = to_unicode(s, precise=True)
    total = len(text)
    if not total:
        # Nothing to measure; also avoids dividing by zero.
        return 0

    matching = sum(
        1 for char in text if unicodedata.category(char) in categories
    )
    return 100 * float(matching) / total
Ejemplo n.º 10
0
def categories_percent(s, categories):
    """Return the percentage of characters whose Unicode category is listed.

    ``s`` is converted with ``to_unicode`` first; every character whose
    ``unicodedata.category`` is found in ``categories`` is counted.
    An empty text gives ``0``.

    >>> categories_percent("qqq ggg hhh", ["Po"])
    0.0
    >>> categories_percent("q,w.", ["Po"])
    50.0
    >>> categories_percent("qqq ggg hhh", ["Nd"])
    0.0
    >>> categories_percent("q5", ["Nd"])
    50.0
    >>> categories_percent("s.s,5s", ["Po", "Nd"])
    50.0
    """
    text = to_unicode(s)
    total = len(text)
    if not total:
        # Nothing to measure; also avoids dividing by zero.
        return 0

    matching = sum(
        1 for char in text if unicodedata.category(char) in categories
    )
    return 100 * float(matching) / total
Ejemplo n.º 11
0
def extract_names(sender):
    """Try to extract the sender's names from a `From:` header value.

    Besides actual names, the result may contain other words found in the
    header, e.g. the name of a company or parts of the email address.

    Args:
        sender: raw header value; a falsy value (``None``, ``''``) is
            treated as an empty string.

    Returns:
        A list of the distinct words that are longer than one character
        and not listed in ``BAD_SENDER_NAMES``, in order of first
        appearance.

    >>> extract_names('Sergey N.  Obukhov <*****@*****.**>')
    ['Sergey', 'Obukhov']
    >>> extract_names('')
    []
    """
    # Guard against a missing header before converting to unicode
    sender = to_unicode(sender or "")

    # Replace every non-alphabetical character with a space so that
    # split() below yields purely alphabetical words.
    letters_only = "".join(
        char if char.isalpha() else ' ' for char in sender
    )

    names = []
    seen = set()
    for word in letters_only.split():
        # Skip too short words and words from the "black" list, i.e.
        # words like `ru`, `gmail`, `com`, `org`, etc.
        if len(word) <= 1 or word in BAD_SENDER_NAMES:
            continue
        # Drop duplicates while keeping first-seen order, so the result
        # is deterministic (a plain set() would yield arbitrary order).
        if word not in seen:
            seen.add(word)
            names.append(word)
    return names