Example #1
0
def test_phonetic_normalize_name_tokenize_sign():
    """Test correct handling of the cyrillic soft sign."""
    # Soft sign (apostrophe) followed by a lowercase letter: drop the sign.
    expected = ((dm(u"Arefev")[0],), (dm(u"M")[0],))
    assert phonetic_tokenize_name("Aref'ev, M.") == expected
    # Soft sign followed by an uppercase letter: split into two tokens.
    expected = ((dm(u"An")[0], dm(u"Sun")[0]), (dm(u"J")[0],))
    assert phonetic_tokenize_name("An'Sun, J.") == expected
Example #2
0
def test_phonetic_normalize_name_tokenize_sign():
    """Test correct handling of the cyrillic soft sign."""
    # An apostrophe before a lowercase letter is simply removed.
    surnames, given_names = phonetic_tokenize_name("Aref'ev, M.")
    assert surnames == (dm(u"Arefev")[0],)
    assert given_names == (dm(u"M")[0],)
    # An apostrophe before an uppercase letter splits the name in two.
    surnames, given_names = phonetic_tokenize_name("An'Sun, J.")
    assert surnames == (dm(u"An")[0], dm(u"Sun")[0])
    assert given_names == (dm(u"J")[0],)
Example #3
0
def test_phonetic_tokenize_name_python2():
    """Test checking if custom phonetic algorithms from fuzzy packages work."""
    import fuzzy
    soundex = fuzzy.Soundex(5)
    expected_nysiis = ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == expected_nysiis
    # Soundex has no direct unicode support, hence the ASCII "Rene".
    expected_soundex = ((soundex(u"Dupont"),), (soundex(u"Rene"),))
    assert phonetic_tokenize_name("Dupont, René", "soundex") == expected_soundex
Example #4
0
def test_phonetic_normalize_name_remove_tokenizefixes():
    """Test correct removal of the common affixes."""
    with_affixes = phonetic_tokenize_name("von und zu Hohenstein, F.")
    assert with_affixes == phonetic_tokenize_name("Hohenstein, F.")
    # A name made up solely of common prefixes is kept, since it might
    # actually be the correct surname.
    assert phonetic_tokenize_name("Ben, Robert") == (
        (dm(u"Ben")[0],), (dm(u"Robert")[0],))
    # Affixes occurring among the first names must not be dropped.
    assert phonetic_tokenize_name("Robert, L. W.") == (
        (dm(u"Robert")[0],), (dm(u"L")[0], dm(u"W")[0]))
Example #5
0
def test_phonetic_normalize_name_remove_tokenizefixes():
    """Test correct removal of the common affixes."""
    assert (phonetic_tokenize_name("von und zu Hohenstein, F.")
            == phonetic_tokenize_name("Hohenstein, F."))
    # When the whole surname consists of common prefixes it is kept, as it
    # might actually be the correct surname.
    surnames, given_names = phonetic_tokenize_name("Ben, Robert")
    assert surnames == (dm(u"Ben")[0],)
    assert given_names == (dm(u"Robert")[0],)
    # Affixes among the first names are preserved, not removed.
    surnames, given_names = phonetic_tokenize_name("Robert, L. W.")
    assert surnames == (dm(u"Robert")[0],)
    assert given_names == (dm(u"L")[0], dm(u"W")[0])
Example #6
0
def block_phonetic(X, threshold=1000, phonetic_algorithm="double_metaphone"):
    """Block the signatures.

    This blocking algorithm takes into consideration the cases, where
    author has more than one surname. Such a signature can be assigned
    to a block for the first author surname or the last one.

    The names are preprocessed by ``phonetic_tokenize_name`` function. As a
    result, here the algorithm operates on ``Double Metaphone`` tokens which
    are previously normalized.

    The algorithm has two phases. In the first phase, all the signatures with
    one surname are clustered together. Every different surname token creates
    a new block. In the second phase, the signatures
    with multiple surnames are compared with the blocks for the first and
    last surname.

    If the first surnames of author were already used as the last given names
    on some of the signatures, the new signature will be assigned to the block
    of the last surname.

    Otherwise, the signature will be assigned to the block of
    the first surname.

    To prevent creation of too big clusters, the ``threshold`` parameter can
    be set. The algorithm will split every block which size is bigger than
    ``threshold`` into smaller ones using given names initials as the
    condition.

    Parameters
    ----------
    :param X: numpy array
        Array of one element arrays of dictionaries. Each dictionary
        represents a signature. The algorithm needs ``author_name`` field in
        the dictionaries in order to work.
    :param threshold: integer
        Size above which the blocks will be split into smaller ones.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :returns: numpy array
        Array with ids of the blocks. The ids are strings. The order of the
        array is the same as in the ``X`` input parameter.
    """
    # Stores all clusters. It is the only way to access them.
    # Every cluster can be accessed by the token that was used to create it.
    # It is the last token from the surnames tokens passed to the constructor.
    id_to_block = {}

    # List of tuples. Used as the in-between state of the algorithm between
    # the first and the second states. The tuple contain the block name
    # if the signature has been already blocked or None otherwise, and the
    # tokens.
    ordered_tokens = []

    # First phase.
    # Create blocks for signatures with single surname

    for signature_array in X[:, 0]:
        tokens = phonetic_tokenize_name(signature_array['author_name'],
                                        phonetic_algorithm=phonetic_algorithm)
        # tokens is a pair: (surname tokens, given-name tokens).
        surname_tokens = tokens[0]
        if len(surname_tokens) == 1:
            # Single surname case
            surname = surname_tokens[0]
            if surname not in id_to_block:
                id_to_block[surname] = _Block(*tokens)
            else:
                id_to_block[surname].add_signature(*tokens)
            ordered_tokens.append((surname, tokens))
        else:
            # Multiple surnames - defer to the second phase, once all
            # single-surname blocks exist.
            ordered_tokens.append((None, tokens))

    # Second phase.
    # Assign every signature with multiple surnames to the block of the
    # first surname or the block of the last surname.

    blocks = []

    for token_tuple in ordered_tokens:

        if token_tuple[0] is not None:

            # There is already a block
            blocks.append(id_to_block[token_tuple[0]])

        else:

            # Case of multiple surnames
            tokens = token_tuple[1]
            surnames, given_names = tokens

            # Check if this combination of surnames was already included
            try:
                # First surname

                cluster = id_to_block[surnames[0]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue
            except KeyError:
                # No such block
                pass

            try:
                # Last surname

                cluster = id_to_block[surnames[-1]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue

                # No match, compute heuristically the match over initials

                # Firstly, check if some of the surnames were used as the
                # last given names on some of the signatures.
                # Try progressively shorter prefixes of the surnames
                # (see _Block.compare_tokens_from_last for the matching rule).
                index = len(surnames) - 1
                match_found = False

                while index > 0:
                    token_prefix = surnames[:index]
                    if cluster.compare_tokens_from_last(token_prefix,
                                                        (surnames[-1],)):
                        cluster.add_signature(*tokens)
                        match_found = True
                        break
                    index -= 1

                if match_found:
                    # There was a full name match, so it must be the same
                    # author.
                    blocks.append(cluster)
                    continue

            except KeyError:
                # No such block
                pass

            try:
                # No match with last surname. Match with the first one.
                # Note: unconditional here - any existing first-surname block
                # absorbs the signature.
                cluster = id_to_block[surnames[0]]
                cluster.add_signature(*tokens)
                blocks.append(cluster)

                continue

            except KeyError:
                # No such block
                pass

            # No block for the first surname and no good match for the
            # last surname.
            if surnames[-1] not in id_to_block:
                # Create new block.
                id_to_block[surnames[-1]] = _Block(*tokens)
            blocks.append(id_to_block[surnames[-1]])

    # Oversized blocks are split by given-name initials before returning ids.
    return np.array(_split_blocks(blocks, X, threshold))
Example #7
0
def block_phonetic(X, threshold=1000, phonetic_algorithm="double_metaphone"):
    """Block the signatures.

    This blocking algorithm takes into consideration the cases, where
    author has more than one surname. Such a signature can be assigned
    to a block for the first author surname or the last one.

    The names are preprocessed by ``phonetic_tokenize_name`` function. As a
    result, here the algorithm operates on ``Double Metaphone`` tokens which
    are previously normalized.

    The algorithm has two phases. In the first phase, all the signatures with
    one surname are clustered together. Every different surname token creates
    a new block. In the second phase, the signatures
    with multiple surnames are compared with the blocks for the first and
    last surname.

    If the first surnames of author were already used as the last given names
    on some of the signatures, the new signature will be assigned to the block
    of the last surname.

    Otherwise, the signature will be assigned to the block of
    the first surname.

    To prevent creation of too big clusters, the ``threshold`` parameter can
    be set. The algorithm will split every block which size is bigger than
    ``threshold`` into smaller ones using given names initials as the
    condition.

    Parameters
    ----------
    :param X: numpy array
        Array of one element arrays of dictionaries. Each dictionary
        represents a signature. The algorithm needs ``author_name`` field in
        the dictionaries in order to work.
    :param threshold: integer
        Size above which the blocks will be split into smaller ones.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :returns: numpy array
        Array with ids of the blocks. The ids are strings. The order of the
        array is the same as in the ``X`` input parameter.
    """
    # Stores all clusters. It is the only way to access them.
    # Every cluster can be accessed by the token that was used to create it.
    # It is the last token from the surnames tokens passed to the constructor.
    id_to_block = {}

    # List of tuples. Used as the in-between state of the algorithm between
    # the first and the second states. The tuple contain the block name
    # if the signature has been already blocked or None otherwise, and the
    # tokens.
    ordered_tokens = []

    # First phase.
    # Create blocks for signatures with single surname

    for signature_array in X[:, 0]:
        tokens = phonetic_tokenize_name(signature_array['author_name'],
                                        phonetic_algorithm=phonetic_algorithm)
        # tokens is a pair: (surname tokens, given-name tokens).
        surname_tokens = tokens[0]
        if len(surname_tokens) == 1:
            # Single surname case
            surname = surname_tokens[0]
            if surname not in id_to_block:
                id_to_block[surname] = _Block(*tokens)
            else:
                id_to_block[surname].add_signature(*tokens)
            ordered_tokens.append((surname, tokens))
        else:
            # Multiple surnames - defer to the second phase, once all
            # single-surname blocks exist.
            ordered_tokens.append((None, tokens))

    # Second phase.
    # Assign every signature with multiple surnames to the block of the
    # first surname or the block of the last surname.

    blocks = []

    for token_tuple in ordered_tokens:

        if token_tuple[0] is not None:

            # There is already a block
            blocks.append(id_to_block[token_tuple[0]])

        else:

            # Case of multiple surnames
            tokens = token_tuple[1]
            surnames, given_names = tokens

            # Check if this combination of surnames was already included
            try:
                # First surname

                cluster = id_to_block[surnames[0]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue
            except KeyError:
                # No such block
                pass

            try:
                # Last surname

                cluster = id_to_block[surnames[-1]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue

                # No match, compute heuristically the match over initials

                # Firstly, check if some of the surnames were used as the
                # last given names on some of the signatures.
                # Try progressively shorter prefixes of the surnames
                # (see _Block.compare_tokens_from_last for the matching rule).
                index = len(surnames) - 1
                match_found = False

                while index > 0:
                    token_prefix = surnames[:index]
                    if cluster.compare_tokens_from_last(
                            token_prefix, (surnames[-1], )):
                        cluster.add_signature(*tokens)
                        match_found = True
                        break
                    index -= 1

                if match_found:
                    # There was a full name match, so it must be the same
                    # author.
                    blocks.append(cluster)
                    continue

            except KeyError:
                # No such block
                pass

            try:
                # No match with last surname. Match with the first one.
                # Note: unconditional here - any existing first-surname block
                # absorbs the signature.
                cluster = id_to_block[surnames[0]]
                cluster.add_signature(*tokens)
                blocks.append(cluster)

                continue

            except KeyError:
                # No such block
                pass

            # No block for the first surname and no good match for the
            # last surname.
            if surnames[-1] not in id_to_block:
                # Create new block.
                id_to_block[surnames[-1]] = _Block(*tokens)
            blocks.append(id_to_block[surnames[-1]])

    # Oversized blocks are split by given-name initials before returning ids.
    return np.array(_split_blocks(blocks, X, threshold))
Example #8
0
def test_phonetic_tokenize_name_nysiis():
    """Test tokenization using the NYSIIS phonetic algorithm."""
    result = phonetic_tokenize_name("Dupont, René", "nysiis")
    assert result == ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))
Example #9
0
def test_phonetic_tokenize_name_simple():
    """Test of tokenize_name."""
    result = phonetic_tokenize_name("Doe, John")
    assert result == ((dm(u"Doe")[0],), (dm(u"John")[0],))
    # A trailing dot after an initial makes no difference.
    assert (phonetic_tokenize_name("Doe, J.")
            == phonetic_tokenize_name(u"Doe, J"))
    # Hyphenated surnames are split into separate tokens.
    result = phonetic_tokenize_name("Doe-Foe, Willem")
    assert result == ((dm(u"Doe")[0], dm(u"Foe")[0]), (dm(u"Willem")[0],))
    assert (phonetic_tokenize_name("Dupont, René")
            == phonetic_tokenize_name("Dupont., René"))
    # Hyphenated given names are split as well.
    result = phonetic_tokenize_name("Dupont, Jean-René")
    assert result == ((dm(u"Dupont")[0],), (dm(u"Jean")[0], dm(u"René")[0]))
    # Generational suffixes are kept as given-name tokens.
    result = phonetic_tokenize_name("Dupont, René, III")
    assert result == ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"III")[0]))
    result = phonetic_tokenize_name("Dupont, René, Jr.")
    assert result == ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"Jr")[0]))
    assert (phonetic_tokenize_name("Dupont, J.R.")
            == phonetic_tokenize_name("Dupont, J.-R."))
    # A lone surname yields an empty given-name part.
    result = phonetic_tokenize_name("Dupont")
    assert result == ((dm(u"Dupont")[0],), ('',))
    assert (phonetic_tokenize_name("Jean Dupont")
            == phonetic_tokenize_name("Dupont, Jean"))
Example #10
0
def test_phonetic_tokenize_name_simple():
    """Test of tokenize_name."""
    surnames, given_names = phonetic_tokenize_name("Doe, John")
    assert surnames == (dm(u"Doe")[0],)
    assert given_names == (dm(u"John")[0],)
    # An initial with and without a trailing dot tokenizes identically.
    assert (phonetic_tokenize_name("Doe, J.")
            == phonetic_tokenize_name(u"Doe, J"))
    # A hyphenated surname becomes two surname tokens.
    surnames, given_names = phonetic_tokenize_name("Doe-Foe, Willem")
    assert surnames == (dm(u"Doe")[0], dm(u"Foe")[0])
    assert given_names == (dm(u"Willem")[0],)
    assert (phonetic_tokenize_name("Dupont, René")
            == phonetic_tokenize_name("Dupont., René"))
    # A hyphenated given name becomes two given-name tokens.
    expected = ((dm(u"Dupont")[0],), (dm(u"Jean")[0], dm(u"René")[0]))
    assert phonetic_tokenize_name("Dupont, Jean-René") == expected
    # Generational suffixes stay among the given-name tokens.
    expected = ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"III")[0]))
    assert phonetic_tokenize_name("Dupont, René, III") == expected
    expected = ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"Jr")[0]))
    assert phonetic_tokenize_name("Dupont, René, Jr.") == expected
    assert (phonetic_tokenize_name("Dupont, J.R.")
            == phonetic_tokenize_name("Dupont, J.-R."))
    # A bare surname produces an empty given-name part.
    assert phonetic_tokenize_name("Dupont") == ((dm(u"Dupont")[0],), ('',))
    assert (phonetic_tokenize_name("Jean Dupont")
            == phonetic_tokenize_name("Dupont, Jean"))