Beispiel #1
0
def test_extract_names():
    """Check ``h.extract_names`` against a table of sender strings.

    For every sender, each expected name must be present in the extracted
    list (additional extracted names are tolerated).  When the expectation
    is empty, the extracted list must be exactly empty.  The extracted names
    must also join with ``|`` into a pattern ``re.compile`` accepts.
    Afterwards every entry of ``h.BAD_SENDER_NAMES`` must yield no names,
    and a name repeated in one sender string must be reported once.
    """
    # NOTE(review): several keys below are the same redacted placeholder
    # ('<*****@*****.**>').  In a dict literal the last duplicate wins, so
    # only that final expectation is exercised -- restore distinct
    # addresses to get the coverage the comments describe.
    senders_names = {
        # from example dataset
        ('Jay Rickerts <*****@*****.**>@EXAMPLE <XXX-Jay+20Rickerts'
         '*****@*****.**>'):
        ['Jay', 'Rickerts'],
        # if `,` is used in sender's name
        'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':
        ['Williams', 'III', 'Bill'],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <*****@*****.**>':
        ['Laura', 'Goldberg'],
        # extract from sender's email address
        '<*****@*****.**>': ['sergey'],
        # extract from sender's email address
        # if dots are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if dashes are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if `_` are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # old style From field, found in jangada dataset
        '[email protected] (Wayne Long)': ['Wayne', 'Long'],
        # if only sender's name provided
        'Wayne Long': ['Wayne', 'Long'],
        # if middle name is shortened with dot
        'Sergey N.  Obukhov <*****@*****.**>': ['Sergey', 'Obukhov'],
        # not only spaces could be used as name splitters
        '  Sergey  Obukhov  <*****@*****.**>': ['Sergey', 'Obukhov'],
        # finally normal example
        'Sergey <*****@*****.**>': ['Sergey'],
        # if middle name is shortened with `,`
        'Sergey N, Obukhov': ['Sergey', 'Obukhov'],
        # if mailto used with email address and sender's name is specified
        'Sergey N, Obukhov [mailto: [email protected]]': ['Sergey', 'Obukhov'],
        # when only email address is given
        '*****@*****.**': ['serobnic'],
        # when nothing is given
        '': [],
        # if phone is specified in the `From:` header
        '[email protected] (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],
        # from crash reports `nothing to repeat`
        '* * * * <*****@*****.**>': ['the', 'pod'],
        '"**Bobby B**" <*****@*****.**>':
        ['Bobby', 'copymycashsystem'],
        # from crash reports `bad escape`
        # (raw strings: the backslashes are part of the sender value, and
        # `\(` is not a valid escape sequence in a normal string literal)
        r'"M Ali B Azlan \(GHSE/PETH\)" <*****@*****.**>':
        ['Ali', 'Azlan'],
        (r'"Ridthauddin B A Rahim \(DD/PCSB\)"'
         ' <*****@*****.**>'): ['Ridthauddin', 'Rahim'],
        (r'"Boland, Patrick \(Global Xxx Group, Ireland \)"'
         ' <*****@*****.**>'): ['Boland', 'Patrick'],
        r'"Mates Rate \(Wine\)" <*****@*****.**>':
        ['Mates', 'Rate', 'Wine'],
        (r'"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
         ' <*****@*****.**>'): ['Morgan', 'Paul'],
        r'"David DECOSTER \(Domicile\)" <*****@*****.**>':
        ['David', 'DECOSTER', 'Domicile']
        }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)
        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception as e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            # only membership is checked, so extra extracted names pass
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            # an empty expectation demands an exactly empty result
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"])
Beispiel #2
0
def test_extract_names():
    """Check ``h.extract_names`` against a table of sender strings.

    For every sender, each expected name must be present in the extracted
    list (additional extracted names are tolerated).  When the expectation
    is empty, the extracted list must be exactly empty.  The extracted names
    must also join with ``|`` into a pattern ``re.compile`` accepts.
    Afterwards every entry of ``h.BAD_SENDER_NAMES`` must yield no names,
    and a name repeated in one sender string must be reported once.
    """
    # NOTE(review): several keys below are the same redacted placeholder
    # ("<*****@*****.**>").  In a dict literal the last duplicate wins, so
    # only that final expectation is exercised -- restore distinct
    # addresses to get the coverage the comments describe.
    senders_names = {
        # from example dataset
        ("Jay Rickerts <*****@*****.**>@EXAMPLE <XXX-Jay+20Rickerts"
         "*****@*****.**>"):
        ["Jay", "Rickerts"],
        # if `,` is used in sender's name
        "Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>": [
            "Williams",
            "III",
            "Bill",
        ],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <*****@*****.**>': ["Laura", "Goldberg"],
        # extract from sender's email address
        "<*****@*****.**>": ["sergey"],
        # extract from sender's email address
        # if dots are used in the email address
        "<*****@*****.**>": ["sergey", "obukhov"],
        # extract from sender's email address
        # if dashes are used in the email address
        "<*****@*****.**>": ["sergey", "obukhov"],
        # extract from sender's email address
        # if `_` are used in the email address
        "<*****@*****.**>": ["sergey", "obukhov"],
        # old style From field, found in jangada dataset
        "[email protected] (Wayne Long)": ["Wayne", "Long"],
        # if only sender's name provided
        "Wayne Long": ["Wayne", "Long"],
        # if middle name is shortened with dot
        "Sergey N.  Obukhov <*****@*****.**>": ["Sergey", "Obukhov"],
        # not only spaces could be used as name splitters
        "  Sergey  Obukhov  <*****@*****.**>": ["Sergey", "Obukhov"],
        # finally normal example
        "Sergey <*****@*****.**>": ["Sergey"],
        # if middle name is shortened with `,`
        "Sergey N, Obukhov": ["Sergey", "Obukhov"],
        # if mailto used with email address and sender's name is specified
        "Sergey N, Obukhov [mailto: [email protected]]": ["Sergey", "Obukhov"],
        # when only email address is given
        "*****@*****.**": ["serobnic"],
        # when nothing is given
        "": [],
        # if phone is specified in the `From:` header
        "[email protected] (Wayne Long +7 920 -256 - 35-09)": ["Wayne", "Long"],
        # from crash reports `nothing to repeat`
        "* * * * <*****@*****.**>": ["the", "pod"],
        '"**Bobby B**" <*****@*****.**>':
        ["Bobby", "copymycashsystem"],
        # from crash reports `bad escape`
        # (raw strings: the backslashes are part of the sender value, and
        # `\(` is not a valid escape sequence in a normal string literal)
        r'"M Ali B Azlan \(GHSE/PETH\)" <*****@*****.**>':
        ["Ali", "Azlan"],
        (r'"Ridthauddin B A Rahim \(DD/PCSB\)"'
         " <*****@*****.**>"): [
            "Ridthauddin",
            "Rahim",
        ],
        (r'"Boland, Patrick \(Global Xxx Group, Ireland \)"'
         " <*****@*****.**>"): ["Boland", "Patrick"],
        r'"Mates Rate \(Wine\)" <*****@*****.**>':
        ["Mates", "Rate", "Wine"],
        (r'"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
         " <*****@*****.**>"): ["Morgan", "Paul"],
        r'"David DECOSTER \(Domicile\)" <*****@*****.**>': [
            "David",
            "DECOSTER",
            "Domicile",
        ],
    }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)
        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception as e:
            ok_(
                False,
                ("Failed to compile extracted names {}"
                 "\n\nReason: {}").format(extracted_names, e),
            )
        if expected_names:
            # only membership is checked, so extra extracted names pass
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            # an empty expectation demands an exactly empty result
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"])
Beispiel #3
0
        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception, e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"])


def test_categories_percent():
    """Check ``h.categories_percent`` on a few (text, categories) samples.

    Each row holds the expected percentage, the input text and the list of
    category codes passed to the helper.
    """
    # NOTE(review): "Po" / "Nd" look like Unicode general-category codes
    # (other punctuation / decimal digit) -- confirm against the helper.
    samples = (
        (0.0, "qqq ggg hhh", ["Po"]),
        (50.0, "q,w.", ["Po"]),
        (0.0, "qqq ggg hhh", ["Nd"]),
        (50.0, "q5", ["Nd"]),
        (50.0, "s.s,5s", ["Po", "Nd"]),
        # empty text is expected to give 0.0
        (0.0, "", ["Po", "Nd"]),
    )
    for expected_percent, text, categories in samples:
        eq_(expected_percent, h.categories_percent(text, categories))


@patch.object(h, 'categories_percent')
Beispiel #4
0
def test_extract_names():
    """Check ``h.extract_names`` against a table of sender strings.

    For every sender, each expected name must be present in the extracted
    list (additional extracted names are tolerated).  When the expectation
    is empty, the extracted list must be exactly empty.  The extracted names
    must also join with ``|`` into a pattern ``re.compile`` accepts.
    """
    # NOTE(review): several keys below are the same redacted placeholder
    # ('<*****@*****.**>').  In a dict literal the last duplicate wins, so
    # only that final expectation is exercised -- restore distinct
    # addresses to get the coverage the comments describe.
    senders_names = {
        # from example dataset
        ('Jay Rickerts <*****@*****.**>@EXAMPLE <XXX-Jay+20Rickerts'
         '*****@*****.**>'):
        ['Jay', 'Rickerts'],
        # if `,` is used in sender's name
        'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':
        ['Williams', 'III', 'Bill'],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <*****@*****.**>':
        ['Laura', 'Goldberg'],
        # extract from sender's email address
        '<*****@*****.**>': ['sergey'],
        # extract from sender's email address
        # if dots are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if dashes are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if `_` are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # old style From field, found in jangada dataset
        '[email protected] (Wayne Long)': ['Wayne', 'Long'],
        # if only sender's name provided
        'Wayne Long': ['Wayne', 'Long'],
        # if middle name is shortened with dot
        'Sergey N.  Obukhov <*****@*****.**>': ['Sergey', 'Obukhov'],
        # not only spaces could be used as name splitters
        '  Sergey  Obukhov  <*****@*****.**>': ['Sergey', 'Obukhov'],
        # finally normal example
        'Sergey <*****@*****.**>': ['Sergey'],
        # if middle name is shortened with `,`
        'Sergey N, Obukhov': ['Sergey', 'Obukhov'],
        # if mailto used with email address and sender's name is specified
        'Sergey N, Obukhov [mailto: [email protected]]': ['Sergey', 'Obukhov'],
        # when only email address is given
        '*****@*****.**': ['serobnic'],
        # when nothing is given
        '': [],
        # if phone is specified in the `From:` header
        '[email protected] (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],
        # from crash reports `nothing to repeat`
        '* * * * <*****@*****.**>': ['the', 'pod'],
        '"**Bobby B**" <*****@*****.**>':
        ['Bobby', 'copymycashsystem'],
        # from crash reports `bad escape`
        # (raw strings: the backslashes are part of the sender value, and
        # `\(` is not a valid escape sequence in a normal string literal)
        r'"M Ali B Azlan \(GHSE/PETH\)" <*****@*****.**>':
        ['Ali', 'Azlan'],
        (r'"Ridthauddin B A Rahim \(DD/PCSB\)"'
         ' <*****@*****.**>'): ['Ridthauddin', 'Rahim'],
        (r'"Boland, Patrick \(Global Xxx Group, Ireland \)"'
         ' <*****@*****.**>'): ['Boland', 'Patrick'],
        r'"Mates Rate \(Wine\)" <*****@*****.**>':
        ['Mates', 'Rate', 'Wine'],
        (r'"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
         ' <*****@*****.**>'): ['Morgan', 'Paul'],
        r'"David DECOSTER \(Domicile\)" <*****@*****.**>':
        ['David', 'DECOSTER', 'Domicile']
        }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)
        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        # `as` form is valid on Python 2.6+ and Python 3; the old
        # `except Exception, e` spelling is a SyntaxError on Python 3
        except Exception as e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            # only membership is checked, so extra extracted names pass
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            # an empty expectation demands an exactly empty result
            eq_(expected_names, extracted_names)
Beispiel #5
0
        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception, e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"])


def test_categories_percent():
    """Run ``h.categories_percent`` over a table of inputs and expected results.

    Every entry pairs the arguments given to the helper (text, category
    codes) with the percentage it is expected to return.
    """
    # NOTE(review): "Po" / "Nd" are presumably Unicode general-category
    # codes -- verify against the helper's implementation.
    table = [
        (("qqq ggg hhh", ["Po"]), 0.0),
        (("q,w.", ["Po"]), 50.0),
        (("qqq ggg hhh", ["Nd"]), 0.0),
        (("q5", ["Nd"]), 50.0),
        (("s.s,5s", ["Po", "Nd"]), 50.0),
        (("", ["Po", "Nd"]), 0.0),
    ]
    for call_args, want in table:
        got = h.categories_percent(*call_args)
        eq_(want, got)


@patch.object(h, 'categories_percent')