def test_dm_tokenize_name_with_soft_sign():
    """Check tokenization of names containing the cyrillic soft sign.

    The soft sign is written as an apostrophe in the inputs below.
    """
    # Apostrophe followed by a lowercase letter: the surname is expected
    # to be encoded as the single word "Arefev".
    joined = dm_tokenize_name("Aref'ev, M.")
    assert joined == ((dm(u"Arefev")[0],), (dm(u"M")[0],))

    # Apostrophe followed by an uppercase letter: the surname is expected
    # to be split into two tokens.
    split = dm_tokenize_name("An'Sun, J.")
    assert split == ((dm(u"An")[0], dm(u"Sun")[0]), (dm(u"J")[0],))
def test_dm_tokenize_name_with_soft_sign():
    """Check tokenization of names containing the cyrillic soft sign.

    The soft sign is written as an apostrophe in the inputs below.
    """
    def codes(*words):
        # Tuple holding the first element of dm(word) for every word.
        return tuple(dm(word)[0] for word in words)

    # Lowercase letter after the apostrophe: one surname token expected.
    tokens = dm_tokenize_name("Aref'ev, M.")
    assert tokens == (codes(u"Arefev"), codes(u"M"))

    # Uppercase letter after the apostrophe: the surname must be split.
    tokens = dm_tokenize_name("An'Sun, J.")
    assert tokens == (codes(u"An", u"Sun"), codes(u"J"))
def test_dm_tokenize_name_remove_common_affixes():
    """Check that common affixes are removed from surnames only."""
    # A surname preceded by common affixes must tokenize exactly like the
    # bare surname.
    with_affixes = dm_tokenize_name("von und zu Hohenstein, F.")
    bare_surname = dm_tokenize_name("Hohenstein, F.")
    assert with_affixes == bare_surname

    # When the surname consists of nothing but a common prefix it must be
    # kept, because it might actually be the correct surname.
    prefix_only = dm_tokenize_name("Ben, Robert")
    assert prefix_only == ((dm(u"Ben")[0],), (dm(u"Robert")[0],))

    # Affixes among the first names must not be dropped.
    with_initials = dm_tokenize_name("Robert, L. W.")
    assert with_initials == ((dm(u"Robert")[0],),
                             (dm(u"L")[0], dm(u"W")[0]))
def test_dm_tokenize_name_remove_common_affixes():
    """Check that common affixes are removed from surnames only."""
    def codes(*words):
        # Tuple holding the first element of dm(word) for every word.
        return tuple(dm(word)[0] for word in words)

    # Common affixes in front of the surname make no difference to the
    # resulting tokens.
    tokens = dm_tokenize_name("von und zu Hohenstein, F.")
    assert tokens == dm_tokenize_name("Hohenstein, F.")

    # A surname made only of a common prefix is not dropped: it might
    # actually be the correct surname.
    tokens = dm_tokenize_name("Ben, Robert")
    assert tokens == (codes(u"Ben"), codes(u"Robert"))

    # Affixes among the first names are not dropped either.
    tokens = dm_tokenize_name("Robert, L. W.")
    assert tokens == (codes(u"Robert"), codes(u"L", u"W"))
def test_dm_tokenize_name_simple():
    """Exercise dm_tokenize_name on a set of simple name forms."""
    def codes(*words):
        # Tuple holding the first element of dm(word) for every word.
        return tuple(dm(word)[0] for word in words)

    # Plain "surname, first name" input.
    tokens = dm_tokenize_name("Doe, John")
    assert tokens == (codes(u"Doe"), codes(u"John"))

    # A trailing dot after an initial makes no difference.
    dotted = dm_tokenize_name("Doe, J.")
    assert dotted == dm_tokenize_name(u"Doe, J")

    # A hyphenated surname yields one token per part.
    tokens = dm_tokenize_name("Doe-Foe, Willem")
    assert tokens == (codes(u"Doe", u"Foe"), codes(u"Willem"))

    # A stray dot after the surname makes no difference.
    clean = dm_tokenize_name("Dupont, René")
    assert clean == dm_tokenize_name("Dupont., René")

    # A hyphenated first name yields one token per part; the expected
    # codes are computed from the unaccented spelling.
    tokens = dm_tokenize_name("Dupont, Jean-René")
    assert tokens == (codes(u"Dupont"), codes(u"Jean", u"Rene"))

    # Anything after a second comma is expected among the first names.
    tokens = dm_tokenize_name("Dupont, René, III")
    assert tokens == (codes(u"Dupont"), codes(u"Rene", u"III"))

    tokens = dm_tokenize_name("Dupont, René, Jr.")
    assert tokens == (codes(u"Dupont"), codes(u"Rene", u"Jr"))

    # Initials with and without a hyphen between them are equivalent.
    glued = dm_tokenize_name("Dupont, J.R.")
    assert glued == dm_tokenize_name("Dupont, J.-R.")

    # A lone surname gives a single empty string as first-name token.
    tokens = dm_tokenize_name("Dupont")
    assert tokens == (codes(u"Dupont"), ('',))

    # "First Last" without a comma is equivalent to "Last, First".
    natural_order = dm_tokenize_name("Jean Dupont")
    assert natural_order == dm_tokenize_name("Dupont, Jean")
def test_phonetic_tokenize_name_simple():
    """Exercise phonetic_tokenize_name on a set of simple name forms."""
    def codes(*words):
        # Tuple holding the first element of dm(word) for every word.
        return tuple(dm(word)[0] for word in words)

    # Plain "surname, first name" input.
    tokens = phonetic_tokenize_name("Doe, John")
    assert tokens == (codes(u"Doe"), codes(u"John"))

    # A trailing dot after an initial makes no difference.
    dotted = phonetic_tokenize_name("Doe, J.")
    assert dotted == phonetic_tokenize_name(u"Doe, J")

    # A hyphenated surname yields one token per part.
    tokens = phonetic_tokenize_name("Doe-Foe, Willem")
    assert tokens == (codes(u"Doe", u"Foe"), codes(u"Willem"))

    # A stray dot after the surname makes no difference.
    clean = phonetic_tokenize_name("Dupont, René")
    assert clean == phonetic_tokenize_name("Dupont., René")

    # A hyphenated first name yields one token per part.
    # NOTE(review): the accented spelling u"René" is passed to dm here,
    # while the next two assertions use the unaccented u"Rene" -- confirm
    # that this difference is intended.
    tokens = phonetic_tokenize_name("Dupont, Jean-René")
    assert tokens == (codes(u"Dupont"), codes(u"Jean", u"René"))

    # Anything after a second comma is expected among the first names.
    tokens = phonetic_tokenize_name("Dupont, René, III")
    assert tokens == (codes(u"Dupont"), codes(u"Rene", u"III"))

    tokens = phonetic_tokenize_name("Dupont, René, Jr.")
    assert tokens == (codes(u"Dupont"), codes(u"Rene", u"Jr"))

    # Initials with and without a hyphen between them are equivalent.
    glued = phonetic_tokenize_name("Dupont, J.R.")
    assert glued == phonetic_tokenize_name("Dupont, J.-R.")

    # A lone surname gives a single empty string as first-name token.
    tokens = phonetic_tokenize_name("Dupont")
    assert tokens == (codes(u"Dupont"), ('',))

    # "First Last" without a comma is equivalent to "Last, First".
    natural_order = phonetic_tokenize_name("Jean Dupont")
    assert natural_order == phonetic_tokenize_name("Dupont, Jean")