Example #1
0
def test_dm_tokenize_name_with_soft_sign():
    """Check tokenization of names containing the cyrillic soft sign.

    The soft sign is written as an apostrophe in the inputs below.
    """
    # Soft sign followed by a lowercase letter: the surname is expected
    # to come out as a single token.
    joined = dm_tokenize_name("Aref'ev, M.")
    assert joined == (
        (dm(u"Arefev")[0],),
        (dm(u"M")[0],),
    )

    # Soft sign followed by an uppercase letter: the surname is expected
    # to be split into two tokens.
    split = dm_tokenize_name("An'Sun, J.")
    assert split == (
        (dm(u"An")[0], dm(u"Sun")[0]),
        (dm(u"J")[0],),
    )
Example #2
0
def test_dm_tokenize_name_with_soft_sign():
    """Verify how an apostrophe (cyrillic soft sign) affects the tokens."""

    def code(word):
        # First element of whatever dm returns for the given word.
        return dm(word)[0]

    # Lowercase letter after the soft sign: one surname token expected.
    result = dm_tokenize_name("Aref'ev, M.")
    assert result == ((code(u"Arefev"), ), (code(u"M"), ))

    # Uppercase letter after the soft sign: the surname is expected to
    # be split at that position.
    result = dm_tokenize_name("An'Sun, J.")
    assert result == ((code(u"An"), code(u"Sun")), (code(u"J"), ))
Example #3
0
def test_dm_tokenize_name_remove_common_affixes():
    """Check how common affixes are treated when tokenizing a name."""
    # Affixes in front of the surname are expected to be ignored, so both
    # spellings must tokenize identically.
    with_affixes = dm_tokenize_name("von und zu Hohenstein, F.")
    bare_surname = dm_tokenize_name("Hohenstein, F.")
    assert with_affixes == bare_surname

    # A surname made up solely of a common prefix must be kept, because
    # it may well be the real surname.
    prefix_only = dm_tokenize_name("Ben, Robert")
    assert prefix_only == (
        (dm(u"Ben")[0],),
        (dm(u"Robert")[0],),
    )

    # Affixes that show up among the first names must not be removed.
    initials = dm_tokenize_name("Robert, L. W.")
    assert initials == (
        (dm(u"Robert")[0],),
        (dm(u"L")[0], dm(u"W")[0]),
    )
Example #4
0
def test_dm_tokenize_name_remove_common_affixes():
    """Verify the affix-stripping expectations of dm_tokenize_name."""
    # "von und zu" in front of the surname should make no difference.
    prefixed = dm_tokenize_name("von und zu Hohenstein, F.")
    plain = dm_tokenize_name("Hohenstein, F.")
    assert prefixed == plain

    # When the whole surname is a common prefix it has to survive: it
    # might actually be the correct surname.
    actual = dm_tokenize_name("Ben, Robert")
    expected = ((dm(u"Ben")[0], ), (dm(u"Robert")[0], ))
    assert actual == expected

    # First names are left alone even if they look like affixes.
    actual = dm_tokenize_name("Robert, L. W.")
    expected = ((dm(u"Robert")[0], ), (dm(u"L")[0], dm(u"W")[0]))
    assert actual == expected
Example #5
0
def test_dm_tokenize_name_simple():
    """Exercise dm_tokenize_name on a range of basic name spellings."""
    # Plain "Surname, Firstname" input.
    tokens = dm_tokenize_name("Doe, John")
    assert tokens == ((dm(u"Doe")[0],), (dm(u"John")[0],))

    # A dot after an initial is expected to make no difference.
    dotted = dm_tokenize_name("Doe, J.")
    undotted = dm_tokenize_name(u"Doe, J")
    assert dotted == undotted

    # A hyphenated surname is expected to give one token per part.
    tokens = dm_tokenize_name("Doe-Foe, Willem")
    assert tokens == (
        (dm(u"Doe")[0], dm(u"Foe")[0]),
        (dm(u"Willem")[0],),
    )

    # A stray dot after the surname is expected to be ignored.
    clean = dm_tokenize_name("Dupont, René")
    stray_dot = dm_tokenize_name("Dupont., René")
    assert clean == stray_dot

    # A hyphenated, accented first name is compared against the codes of
    # its unaccented parts.
    tokens = dm_tokenize_name("Dupont, Jean-René")
    assert tokens == (
        (dm(u"Dupont")[0],),
        (dm(u"Jean")[0], dm(u"Rene")[0]),
    )

    # Whatever follows a second comma is expected in the second tuple.
    tokens = dm_tokenize_name("Dupont, René, III")
    assert tokens == (
        (dm(u"Dupont")[0],),
        (dm(u"Rene")[0], dm(u"III")[0]),
    )
    tokens = dm_tokenize_name("Dupont, René, Jr.")
    assert tokens == (
        (dm(u"Dupont")[0],),
        (dm(u"Rene")[0], dm(u"Jr")[0]),
    )

    # Initials written with or without a hyphen must tokenize the same.
    glued = dm_tokenize_name("Dupont, J.R.")
    hyphenated = dm_tokenize_name("Dupont, J.-R.")
    assert glued == hyphenated

    # A single word is expected as the surname with an empty first name.
    tokens = dm_tokenize_name("Dupont")
    assert tokens == ((dm(u"Dupont")[0],), ('',))

    # "Firstname Surname" without a comma must match the comma form.
    no_comma = dm_tokenize_name("Jean Dupont")
    with_comma = dm_tokenize_name("Dupont, Jean")
    assert no_comma == with_comma
Example #6
0
def test_phonetic_tokenize_name_simple():
    """Exercise phonetic_tokenize_name on a range of basic name spellings."""
    tokenize = phonetic_tokenize_name

    # Plain "Surname, Firstname" input.
    result = tokenize("Doe, John")
    assert result == ((dm(u"Doe")[0], ), (dm(u"John")[0], ))

    # A dot after an initial is expected to make no difference.
    dotted = tokenize("Doe, J.")
    undotted = tokenize(u"Doe, J")
    assert dotted == undotted

    # A hyphenated surname is expected to give one token per part.
    result = tokenize("Doe-Foe, Willem")
    assert result == (
        (dm(u"Doe")[0], dm(u"Foe")[0]),
        (dm(u"Willem")[0], ),
    )

    # A stray dot after the surname is expected to be ignored.
    clean = tokenize("Dupont, René")
    stray_dot = tokenize("Dupont., René")
    assert clean == stray_dot

    # Hyphenated first name.
    # NOTE(review): this expectation uses the accented u"René", whereas
    # the two assertions below use u"Rene" -- confirm that dm yields the
    # same code for both spellings.
    result = tokenize("Dupont, Jean-René")
    assert result == (
        (dm(u"Dupont")[0],),
        (dm(u"Jean")[0], dm(u"René")[0]),
    )

    # Whatever follows a second comma is expected in the second tuple.
    result = tokenize("Dupont, René, III")
    assert result == (
        (dm(u"Dupont")[0],),
        (dm(u"Rene")[0], dm(u"III")[0]),
    )
    result = tokenize("Dupont, René, Jr.")
    assert result == (
        (dm(u"Dupont")[0],),
        (dm(u"Rene")[0], dm(u"Jr")[0]),
    )

    # Initials written with or without a hyphen must tokenize the same.
    glued = tokenize("Dupont, J.R.")
    hyphenated = tokenize("Dupont, J.-R.")
    assert glued == hyphenated

    # A single word is expected as the surname with an empty first name.
    result = tokenize("Dupont")
    assert result == ((dm(u"Dupont")[0], ), ('', ))

    # "Firstname Surname" without a comma must match the comma form.
    no_comma = tokenize("Jean Dupont")
    with_comma = tokenize("Dupont, Jean")
    assert no_comma == with_comma