def test_extract_names():
    """Check ``h.extract_names`` against a table of sender strings.

    Every key of ``senders_names`` is a sender value and the mapped list
    holds the names that must be present in the extraction result; an empty
    list means the result must be exactly empty.  For every sender the
    extracted names are also joined with ``|`` and passed to ``re.compile``
    to make sure they form a valid pattern.
    """
    # NOTE(review): several address-only keys below are textually identical
    # in this copy of the file (the addresses look redacted), so as dict keys
    # they collapse into one entry holding the last value -- confirm against
    # the unredacted fixture.
    senders_names = {
        # from example dataset
        ('Jay Rickerts <*****@*****.**>@EXAMPLE <XXX-Jay+20Rickerts'
         '*****@*****.**>'): ['Jay', 'Rickerts'],
        # if `,` is used in sender's name
        'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':
        ['Williams', 'III', 'Bill'],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <*****@*****.**>': ['Laura', 'Goldberg'],
        # extract from senders email address
        '<*****@*****.**>': ['sergey'],
        # extract from sender's email address
        # if dots are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if dashes are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if `_` are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # old style From field, found in jangada dataset
        '[email protected] (Wayne Long)': ['Wayne', 'Long'],
        # if only sender's name provided
        'Wayne Long': ['Wayne', 'Long'],
        # if middle name is shortened with dot
        'Sergey N. Obukhov <*****@*****.**>': ['Sergey', 'Obukhov'],
        # not only spaces could be used as name splitters
        ' Sergey Obukhov <*****@*****.**>': ['Sergey', 'Obukhov'],
        # finally normal example
        'Sergey <*****@*****.**>': ['Sergey'],
        # if middle name is shortened with `,`
        'Sergey N, Obukhov': ['Sergey', 'Obukhov'],
        # if mailto used with email address and sender's name is specified
        'Sergey N, Obukhov [mailto: [email protected]]': ['Sergey', 'Obukhov'],
        # when only email address is given
        '*****@*****.**': ['serobnic'],
        # when nothing is given
        '': [],
        # if phone is specified in the `From:` header
        '[email protected] (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],
        # from crash reports `nothing to repeat`
        '* * * * <*****@*****.**>': ['the', 'pod'],
        '"**Bobby B**" <*****@*****.**>':
        ['Bobby', 'copymycashsystem'],
        # from crash reports `bad escape`
        # (raw strings keep the literal backslashes without relying on
        # invalid escape sequences such as `\(`)
        r'"M Ali B Azlan \(GHSE/PETH\)" <*****@*****.**>': ['Ali', 'Azlan'],
        (r'"Ridthauddin B A Rahim \(DD/PCSB\)"'
         ' <*****@*****.**>'): ['Ridthauddin', 'Rahim'],
        (r'"Boland, Patrick \(Global Xxx Group, Ireland \)"'
         ' <*****@*****.**>'): ['Boland', 'Patrick'],
        r'"Mates Rate \(Wine\)" <*****@*****.**>': ['Mates', 'Rate', 'Wine'],
        (r'"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
         ' <*****@*****.**>'): ['Morgan', 'Paul'],
        r'"David DECOSTER \(Domicile\)" <*****@*****.**>':
        ['David', 'DECOSTER', 'Domicile']
    }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)

        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception as e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            # only membership is checked: extra extracted names are allowed
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"])
def test_extract_names():
    """Run ``h.extract_names`` over known sender strings and verify the output.

    ``senders_names`` maps a sender string to the names expected among the
    extracted ones (an empty list requires an empty result).  The extracted
    names of every sender must additionally be usable as alternatives of a
    single regular expression, which is verified with ``re.compile``.
    """
    # NOTE(review): the address-only keys in this table are identical as
    # written here (they appear to have been redacted), which makes later
    # entries overwrite earlier ones -- verify against the original data.
    senders_names = {
        # from example dataset
        (
            "Jay Rickerts <*****@*****.**>@EXAMPLE <XXX-Jay+20Rickerts"
            "*****@*****.**>"
        ): ["Jay", "Rickerts"],
        # if `,` is used in sender's name
        "Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>": [
            "Williams",
            "III",
            "Bill",
        ],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <*****@*****.**>': ["Laura", "Goldberg"],
        # extract from senders email address
        "<*****@*****.**>": ["sergey"],
        # extract from sender's email address
        # if dots are used in the email address
        "<*****@*****.**>": ["sergey", "obukhov"],
        # extract from sender's email address
        # if dashes are used in the email address
        "<*****@*****.**>": ["sergey", "obukhov"],
        # extract from sender's email address
        # if `_` are used in the email address
        "<*****@*****.**>": ["sergey", "obukhov"],
        # old style From field, found in jangada dataset
        "[email protected] (Wayne Long)": ["Wayne", "Long"],
        # if only sender's name provided
        "Wayne Long": ["Wayne", "Long"],
        # if middle name is shortened with dot
        "Sergey N. Obukhov <*****@*****.**>": ["Sergey", "Obukhov"],
        # not only spaces could be used as name splitters
        " Sergey Obukhov <*****@*****.**>": ["Sergey", "Obukhov"],
        # finally normal example
        "Sergey <*****@*****.**>": ["Sergey"],
        # if middle name is shortened with `,`
        "Sergey N, Obukhov": ["Sergey", "Obukhov"],
        # if mailto used with email address and sender's name is specified
        "Sergey N, Obukhov [mailto: [email protected]]": ["Sergey", "Obukhov"],
        # when only email address is given
        "*****@*****.**": ["serobnic"],
        # when nothing is given
        "": [],
        # if phone is specified in the `From:` header
        "[email protected] (Wayne Long +7 920 -256 - 35-09)": ["Wayne", "Long"],
        # from crash reports `nothing to repeat`
        "* * * * <*****@*****.**>": ["the", "pod"],
        '"**Bobby B**" <*****@*****.**>': ["Bobby", "copymycashsystem"],
        # from crash reports `bad escape`
        # (written as raw strings: `\(` is not a valid escape sequence in a
        # regular string literal)
        r'"M Ali B Azlan \(GHSE/PETH\)" <*****@*****.**>': ["Ali", "Azlan"],
        (r'"Ridthauddin B A Rahim \(DD/PCSB\)"' " <*****@*****.**>"): [
            "Ridthauddin",
            "Rahim",
        ],
        (
            r'"Boland, Patrick \(Global Xxx Group, Ireland \)"'
            " <*****@*****.**>"
        ): ["Boland", "Patrick"],
        r'"Mates Rate \(Wine\)" <*****@*****.**>': ["Mates", "Rate", "Wine"],
        (
            r'"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
            " <*****@*****.**>"
        ): ["Morgan", "Paul"],
        r'"David DECOSTER \(Domicile\)" <*****@*****.**>': [
            "David",
            "DECOSTER",
            "Domicile",
        ],
    }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)

        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        except Exception as e:
            ok_(
                False,
                ("Failed to compile extracted names {}" "\n\nReason: {}").format(
                    extracted_names, e
                ),
            )

        if expected_names:
            # each expected name has to show up; extras are tolerated
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)

    # words like `ru`, `gmail`, `com`, `org`, etc. are not considered
    # sender's names
    for word in h.BAD_SENDER_NAMES:
        eq_(h.extract_names(word), [])

    # duplicates are not allowed
    eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"])
# check that extracted names could be compiled try: re.compile("|".join(extracted_names)) except Exception, e: ok_(False, ("Failed to compile extracted names {}" "\n\nReason: {}").format(extracted_names, e)) if expected_names: for name in expected_names: assert_in(name, extracted_names) else: eq_(expected_names, extracted_names) # words like `ru`, `gmail`, `com`, `org`, etc. are not considered # sender's names for word in h.BAD_SENDER_NAMES: eq_(h.extract_names(word), []) # duplicates are not allowed eq_(h.extract_names("sergey <*****@*****.**"), ["sergey"]) def test_categories_percent(): eq_(0.0, h.categories_percent("qqq ggg hhh", ["Po"])) eq_(50.0, h.categories_percent("q,w.", ["Po"])) eq_(0.0, h.categories_percent("qqq ggg hhh", ["Nd"])) eq_(50.0, h.categories_percent("q5", ["Nd"])) eq_(50.0, h.categories_percent("s.s,5s", ["Po", "Nd"])) eq_(0.0, h.categories_percent("", ["Po", "Nd"])) @patch.object(h, 'categories_percent')
def test_extract_names():
    """Verify the names ``h.extract_names`` pulls out of sender strings.

    The ``senders_names`` table pairs a sender string with the names that
    have to be found in the result; an empty list means nothing may be
    extracted.  Each result is also joined with ``|`` and compiled with
    ``re.compile`` so that names which break a regular expression are caught.
    """
    # NOTE(review): the bare-address keys are the same text in this copy of
    # the file (presumably redacted), so only the last of them survives in
    # the dict -- check against the unredacted source.
    senders_names = {
        # from example dataset
        ('Jay Rickerts <*****@*****.**>@EXAMPLE <XXX-Jay+20Rickerts'
         '*****@*****.**>'): ['Jay', 'Rickerts'],
        # if `,` is used in sender's name
        'Williams III, Bill </O=EXAMPLE/OU=NA/CN=RECIPIENTS/CN=BWILLIA5>':
        ['Williams', 'III', 'Bill'],
        # if somehow `'` or `"` are used in sender's name
        'Laura" "Goldberg <*****@*****.**>': ['Laura', 'Goldberg'],
        # extract from senders email address
        '<*****@*****.**>': ['sergey'],
        # extract from sender's email address
        # if dots are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if dashes are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # extract from sender's email address
        # if `_` are used in the email address
        '<*****@*****.**>': ['sergey', 'obukhov'],
        # old style From field, found in jangada dataset
        '[email protected] (Wayne Long)': ['Wayne', 'Long'],
        # if only sender's name provided
        'Wayne Long': ['Wayne', 'Long'],
        # if middle name is shortened with dot
        'Sergey N. Obukhov <*****@*****.**>': ['Sergey', 'Obukhov'],
        # not only spaces could be used as name splitters
        ' Sergey Obukhov <*****@*****.**>': ['Sergey', 'Obukhov'],
        # finally normal example
        'Sergey <*****@*****.**>': ['Sergey'],
        # if middle name is shortened with `,`
        'Sergey N, Obukhov': ['Sergey', 'Obukhov'],
        # if mailto used with email address and sender's name is specified
        'Sergey N, Obukhov [mailto: [email protected]]': ['Sergey', 'Obukhov'],
        # when only email address is given
        '*****@*****.**': ['serobnic'],
        # when nothing is given
        '': [],
        # if phone is specified in the `From:` header
        '[email protected] (Wayne Long +7 920 -256 - 35-09)': ['Wayne', 'Long'],
        # from crash reports `nothing to repeat`
        '* * * * <*****@*****.**>': ['the', 'pod'],
        '"**Bobby B**" <*****@*****.**>':
        ['Bobby', 'copymycashsystem'],
        # from crash reports `bad escape`
        # (raw literals: the backslashes are part of the sender text)
        r'"M Ali B Azlan \(GHSE/PETH\)" <*****@*****.**>': ['Ali', 'Azlan'],
        (r'"Ridthauddin B A Rahim \(DD/PCSB\)"'
         ' <*****@*****.**>'): ['Ridthauddin', 'Rahim'],
        (r'"Boland, Patrick \(Global Xxx Group, Ireland \)"'
         ' <*****@*****.**>'): ['Boland', 'Patrick'],
        r'"Mates Rate \(Wine\)" <*****@*****.**>': ['Mates', 'Rate', 'Wine'],
        (r'"Morgan, Paul \(Business Xxx RI, Xxx Xxx Group\)"'
         ' <*****@*****.**>'): ['Morgan', 'Paul'],
        r'"David DECOSTER \(Domicile\)" <*****@*****.**>':
        ['David', 'DECOSTER', 'Domicile']
    }

    for sender, expected_names in senders_names.items():
        extracted_names = h.extract_names(sender)

        # check that extracted names could be compiled
        try:
            re.compile("|".join(extracted_names))
        # `except X as e` works on Python 2.6+ and 3.x; the older
        # `except X, e` form is a syntax error on Python 3
        except Exception as e:
            ok_(False, ("Failed to compile extracted names {}"
                        "\n\nReason: {}").format(extracted_names, e))
        if expected_names:
            # membership only -- additional extracted names do not fail
            for name in expected_names:
                assert_in(name, extracted_names)
        else:
            eq_(expected_names, extracted_names)