def test_phonetic_normalize_name_tokenize_sign():
    """Test correct handling of the cyrillic soft sign."""
    # Soft sign followed by a lowercase letter: the apostrophe is dropped.
    expected = ((dm(u"Arefev")[0],), (dm(u"M")[0],))
    assert phonetic_tokenize_name("Aref'ev, M.") == expected
    # Soft sign followed by an uppercase letter: the surname is split in two.
    expected = ((dm(u"An")[0], dm(u"Sun")[0]), (dm(u"J")[0],))
    assert phonetic_tokenize_name("An'Sun, J.") == expected
def test_phonetic_normalize_name_tokenize_sign():
    """Test correct handling of the cyrillic soft sign."""
    # When the letter after the soft sign is lowercase, the apostrophe is
    # simply removed from the token.
    surname_tokens, given_tokens = phonetic_tokenize_name("Aref'ev, M.")
    assert surname_tokens == (dm(u"Arefev")[0],)
    assert given_tokens == (dm(u"M")[0],)
    # When the letter after the soft sign is uppercase, the name is split
    # into two separate tokens.
    surname_tokens, given_tokens = phonetic_tokenize_name("An'Sun, J.")
    assert surname_tokens == (dm(u"An")[0], dm(u"Sun")[0])
    assert given_tokens == (dm(u"J")[0],)
def test_phonetic_tokenize_name_python2():
    """Test checking if custom phonetic algorithms from fuzzy packages work."""
    import fuzzy
    soundex = fuzzy.Soundex(5)
    # nysiis handles the unicode input directly.
    nysiis_expected = ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == nysiis_expected
    # soundex has no direct unicode support, hence the ASCII "Rene".
    soundex_expected = ((soundex(u"Dupont"),), (soundex(u"Rene"),))
    assert phonetic_tokenize_name("Dupont, René", "soundex") == soundex_expected
def test_phonetic_normalize_name_remove_tokenizefixes():
    """Test correct removal of the common affixes."""
    with_affixes = phonetic_tokenize_name("von und zu Hohenstein, F.")
    assert with_affixes == phonetic_tokenize_name("Hohenstein, F.")
    # A name made up entirely of common prefixes must be kept, since it
    # might actually be the correct surname.
    expected = ((dm(u"Ben")[0],), (dm(u"Robert")[0],))
    assert phonetic_tokenize_name("Ben, Robert") == expected
    # Affixes appearing among the first names are never dropped.
    expected = ((dm(u"Robert")[0],), (dm(u"L")[0], dm(u"W")[0]))
    assert phonetic_tokenize_name("Robert, L. W.") == expected
def test_phonetic_normalize_name_remove_tokenizefixes():
    """Test correct removal of the common affixes."""
    # Leading nobility affixes are stripped before tokenization.
    assert (phonetic_tokenize_name("von und zu Hohenstein, F.") ==
            phonetic_tokenize_name("Hohenstein, F."))
    # If the whole surname consists of common prefixes, it is not dropped:
    # it might actually be the correct surname.
    surnames, given_names = phonetic_tokenize_name("Ben, Robert")
    assert surnames == (dm(u"Ben")[0],)
    assert given_names == (dm(u"Robert")[0],)
    # Affixes among the first names are left untouched.
    surnames, given_names = phonetic_tokenize_name("Robert, L. W.")
    assert surnames == (dm(u"Robert")[0],)
    assert given_names == (dm(u"L")[0], dm(u"W")[0])
def block_phonetic(X, threshold=1000, phonetic_algorithm="double_metaphone"):
    """Block the signatures.

    This blocking algorithm takes into consideration the cases, where
    author has more than one surname. Such a signature can be assigned
    to a block for the first author surname or the last one.

    The names are preprocessed by ``phonetic_tokenize_name`` function. As a
    result, here the algorithm operates on ``Double Metaphone`` tokens which
    are previously normalized.

    The algorithm has two phases. In the first phase, all the signatures with
    one surname are clustered together. Every different surname token creates
    a new block. In the second phase, the signatures with multiple surnames
    are compared with the blocks for the first and last surname.

    If the first surnames of author were already used as the last given
    names on some of the signatures, the new signature will be assigned
    to the block of the last surname. Otherwise, the signature will be
    assigned to the block of the first surname.

    To prevent creation of too big clusters, the ``threshold`` parameter can
    be set. The algorithm will split every block which size is bigger than
    ``threshold`` into smaller ones using given names initials as the
    condition.

    Parameters
    ----------
    :param X: numpy array
        Array of one element arrays of dictionaries. Each dictionary
        represents a signature. The algorithm needs ``author_name`` field in
        the dictionaries in order to work.
    :param threshold: integer
        Size above which the blocks will be split into smaller ones.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :returns: numpy array
        Array with ids of the blocks. The ids are strings. The order of the
        array is the same as in the ``X`` input parameter.
    """
    # Stores all clusters. It is the only way to access them.
    # Every cluster can be accessed by the token that was used to create it.
    # It is the last token from the surnames tokens passed to the constructor.
    id_to_block = {}

    # List of tuples. Used as the in-between state of the algorithm between
    # the first and the second states. The tuple contain the block name
    # if the signature has been already blocked or None otherwise, and the
    # tokens.
    ordered_tokens = []

    # First phase.
    # Create blocks for signatures with single surname.
    for signature_array in X[:, 0]:
        tokens = phonetic_tokenize_name(signature_array['author_name'],
                                        phonetic_algorithm=phonetic_algorithm)
        surname_tokens = tokens[0]
        if len(surname_tokens) == 1:
            # Single surname case: create or extend the block keyed by the
            # (only) surname token.
            surname = surname_tokens[0]
            if surname not in id_to_block:
                id_to_block[surname] = _Block(*tokens)
            else:
                id_to_block[surname].add_signature(*tokens)
            ordered_tokens.append((surname, tokens))
        else:
            # Multiple surnames: defer to the second phase.
            ordered_tokens.append((None, tokens))

    # Second phase.
    # Assign every signature with multiple surnames to the block of the
    # first surname or the block of the last surname.
    blocks = []
    for token_tuple in ordered_tokens:
        if token_tuple[0] is not None:
            # There is already a block (single-surname signature).
            blocks.append(id_to_block[token_tuple[0]])
        else:
            # Case of multiple surnames.
            tokens = token_tuple[1]
            surnames, given_names = tokens
            # Check if this combination of surnames was already included.
            try:
                # First surname: exact multi-surname match in its block.
                cluster = id_to_block[surnames[0]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue
            except KeyError:
                # No such block.
                pass
            try:
                # Last surname: exact multi-surname match in its block.
                cluster = id_to_block[surnames[-1]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue
                # No exact match; compute heuristically the match over
                # initials. Firstly, check if some of the surnames were used
                # as the last given names on some of the signatures.
                # Try progressively shorter prefixes of the surname list.
                index = len(surnames) - 1
                match_found = False
                while index > 0:
                    token_prefix = surnames[:index]
                    if cluster.compare_tokens_from_last(token_prefix,
                                                        (surnames[-1],)):
                        cluster.add_signature(*tokens)
                        match_found = True
                        break
                    index -= 1
                if match_found:
                    # There was a full name match, so it must be the same
                    # author.
                    blocks.append(cluster)
                    continue
            except KeyError:
                # No such block.
                pass
            try:
                # No match with last surname. Match with the first one.
                cluster = id_to_block[surnames[0]]
                cluster.add_signature(*tokens)
                blocks.append(cluster)
                continue
            except KeyError:
                # No such block.
                pass
            # No block for the first surname and no good match for the
            # last surname.
            if surnames[-1] not in id_to_block:
                # Create new block keyed by the last surname.
                id_to_block[surnames[-1]] = _Block(*tokens)
            # NOTE(review): if the last-surname block already exists here,
            # the signature is assigned to it without calling add_signature
            # — confirm this is intended.
            blocks.append(id_to_block[surnames[-1]])

    return np.array(_split_blocks(blocks, X, threshold))
def block_phonetic(X, threshold=1000, phonetic_algorithm="double_metaphone"):
    """Block the signatures.

    This blocking algorithm takes into consideration the cases, where
    author has more than one surname. Such a signature can be assigned
    to a block for the first author surname or the last one.

    The names are preprocessed by ``phonetic_tokenize_name`` function. As a
    result, here the algorithm operates on ``Double Metaphone`` tokens which
    are previously normalized.

    The algorithm has two phases. In the first phase, all the signatures with
    one surname are clustered together. Every different surname token creates
    a new block. In the second phase, the signatures with multiple surnames
    are compared with the blocks for the first and last surname.

    If the first surnames of author were already used as the last given
    names on some of the signatures, the new signature will be assigned
    to the block of the last surname. Otherwise, the signature will be
    assigned to the block of the first surname.

    To prevent creation of too big clusters, the ``threshold`` parameter can
    be set. The algorithm will split every block which size is bigger than
    ``threshold`` into smaller ones using given names initials as the
    condition.

    Parameters
    ----------
    :param X: numpy array
        Array of one element arrays of dictionaries. Each dictionary
        represents a signature. The algorithm needs ``author_name`` field in
        the dictionaries in order to work.
    :param threshold: integer
        Size above which the blocks will be split into smaller ones.
    :param phonetic_algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :returns: numpy array
        Array with ids of the blocks. The ids are strings. The order of the
        array is the same as in the ``X`` input parameter.
    """
    # Stores all clusters. It is the only way to access them.
    # Every cluster can be accessed by the token that was used to create it.
    # It is the last token from the surnames tokens passed to the constructor.
    id_to_block = {}

    # List of tuples. Used as the in-between state of the algorithm between
    # the first and the second states. The tuple contain the block name
    # if the signature has been already blocked or None otherwise, and the
    # tokens.
    ordered_tokens = []

    # First phase.
    # Create blocks for signatures with single surname.
    for signature_array in X[:, 0]:
        tokens = phonetic_tokenize_name(signature_array['author_name'],
                                        phonetic_algorithm=phonetic_algorithm)
        surname_tokens = tokens[0]
        if len(surname_tokens) == 1:
            # Single surname case: create or extend the block keyed by the
            # (only) surname token.
            surname = surname_tokens[0]
            if surname not in id_to_block:
                id_to_block[surname] = _Block(*tokens)
            else:
                id_to_block[surname].add_signature(*tokens)
            ordered_tokens.append((surname, tokens))
        else:
            # Multiple surnames: defer to the second phase.
            ordered_tokens.append((None, tokens))

    # Second phase.
    # Assign every signature with multiple surnames to the block of the
    # first surname or the block of the last surname.
    blocks = []
    for token_tuple in ordered_tokens:
        if token_tuple[0] is not None:
            # There is already a block (single-surname signature).
            blocks.append(id_to_block[token_tuple[0]])
        else:
            # Case of multiple surnames.
            tokens = token_tuple[1]
            surnames, given_names = tokens
            # Check if this combination of surnames was already included.
            try:
                # First surname: exact multi-surname match in its block.
                cluster = id_to_block[surnames[0]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue
            except KeyError:
                # No such block.
                pass
            try:
                # Last surname: exact multi-surname match in its block.
                cluster = id_to_block[surnames[-1]]
                if cluster.contains(surnames):
                    cluster.add_signature(*tokens)
                    blocks.append(cluster)
                    continue
                # No exact match; compute heuristically the match over
                # initials. Firstly, check if some of the surnames were used
                # as the last given names on some of the signatures.
                # Try progressively shorter prefixes of the surname list.
                index = len(surnames) - 1
                match_found = False
                while index > 0:
                    token_prefix = surnames[:index]
                    if cluster.compare_tokens_from_last(
                            token_prefix, (surnames[-1], )):
                        cluster.add_signature(*tokens)
                        match_found = True
                        break
                    index -= 1
                if match_found:
                    # There was a full name match, so it must be the same
                    # author.
                    blocks.append(cluster)
                    continue
            except KeyError:
                # No such block.
                pass
            try:
                # No match with last surname. Match with the first one.
                cluster = id_to_block[surnames[0]]
                cluster.add_signature(*tokens)
                blocks.append(cluster)
                continue
            except KeyError:
                # No such block.
                pass
            # No block for the first surname and no good match for the
            # last surname.
            if surnames[-1] not in id_to_block:
                # Create new block keyed by the last surname.
                id_to_block[surnames[-1]] = _Block(*tokens)
            # NOTE(review): if the last-surname block already exists here,
            # the signature is assigned to it without calling add_signature
            # — confirm this is intended.
            blocks.append(id_to_block[surnames[-1]])

    return np.array(_split_blocks(blocks, X, threshold))
def test_phonetic_tokenize_name_nysiis():
    """Test tokenization with the nysiis phonetic algorithm."""
    expected = ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),))
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == expected
def test_phonetic_tokenize_name_simple():
    """Test of tokenize_name."""
    # Basic "surname, given" split.
    assert phonetic_tokenize_name("Doe, John") == ((dm(u"Doe")[0],),
                                                   (dm(u"John")[0],))
    # Trailing dot on an initial makes no difference.
    assert phonetic_tokenize_name("Doe, J.") == \
        phonetic_tokenize_name(u"Doe, J")
    # Hyphenated surnames are split into separate tokens.
    assert phonetic_tokenize_name("Doe-Foe, Willem") == ((dm(u"Doe")[0],
                                                          dm(u"Foe")[0]),
                                                         (dm(u"Willem")[0],))
    assert phonetic_tokenize_name("Dupont, René") == \
        phonetic_tokenize_name("Dupont., René")
    # Hyphenated given names are split as well.
    assert phonetic_tokenize_name("Dupont, Jean-René") == \
        ((dm(u"Dupont")[0],), (dm(u"Jean")[0], dm(u"René")[0]))
    # Generational suffixes are kept as given-name tokens.
    assert phonetic_tokenize_name("Dupont, René, III") == \
        ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"III")[0]))
    assert phonetic_tokenize_name("Dupont, René, Jr.") == \
        ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"Jr")[0]))
    assert phonetic_tokenize_name("Dupont, J.R.") == \
        phonetic_tokenize_name("Dupont, J.-R.")
    # A bare surname yields an empty given-name token.
    assert phonetic_tokenize_name("Dupont") == ((dm(u"Dupont")[0],), ('',))
    # "Given Surname" order is normalized to "Surname, Given".
    assert phonetic_tokenize_name("Jean Dupont") == \
        phonetic_tokenize_name("Dupont, Jean")
def test_phonetic_tokenize_name_simple():
    """Test of tokenize_name."""
    # Simple "surname, given" name.
    surnames, given = phonetic_tokenize_name("Doe, John")
    assert surnames == (dm(u"Doe")[0],)
    assert given == (dm(u"John")[0],)
    # A dot after an initial is irrelevant.
    assert (phonetic_tokenize_name("Doe, J.") ==
            phonetic_tokenize_name(u"Doe, J"))
    # A hyphenated surname produces two surname tokens.
    surnames, given = phonetic_tokenize_name("Doe-Foe, Willem")
    assert surnames == (dm(u"Doe")[0], dm(u"Foe")[0])
    assert given == (dm(u"Willem")[0],)
    assert (phonetic_tokenize_name("Dupont, René") ==
            phonetic_tokenize_name("Dupont., René"))
    # A hyphenated given name produces two given-name tokens.
    assert (phonetic_tokenize_name("Dupont, Jean-René") ==
            ((dm(u"Dupont")[0],), (dm(u"Jean")[0], dm(u"René")[0])))
    # Generational suffixes stay among the given-name tokens.
    assert (phonetic_tokenize_name("Dupont, René, III") ==
            ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"III")[0])))
    assert (phonetic_tokenize_name("Dupont, René, Jr.") ==
            ((dm(u"Dupont")[0],), (dm(u"Rene")[0], dm(u"Jr")[0])))
    assert (phonetic_tokenize_name("Dupont, J.R.") ==
            phonetic_tokenize_name("Dupont, J.-R."))
    # A name without given names yields an empty given-name token.
    assert phonetic_tokenize_name("Dupont") == ((dm(u"Dupont")[0],), ('',))
    # "Given Surname" order is recognized and normalized.
    assert (phonetic_tokenize_name("Jean Dupont") ==
            phonetic_tokenize_name("Dupont, Jean"))