def names_are_substrings(name1, name2):
    '''
    Checks if two names are substrings of each other;
    e.g. "Christoph" vs. "Ch"
    Only checks the beginning of the names.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: are the names substrings of each other
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    onames = name1[2]
    tnames = name2[2]
#    oname = "".join(onames).lower()
#    tname = "".join(tnames).lower()
    oname = clean_name_string("".join(onames).lower(), "", False, True)
    tname = clean_name_string("".join(tnames).lower(), "", False, True)
    names_are_substrings_b = False

    if oname.startswith(tname) or tname.startswith(oname):
        names_are_substrings_b = True

    return names_are_substrings_b
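The check above boils down to joining the cleaned, lowercased first names of both sides and testing startswith in both directions. A minimal self-contained sketch of that idea; the helpers below are crude stand-ins for split_name_parts / clean_name_string, not the module's real functions:

import re

def _first_names(name):
    # crude "Last, First ..." split, for illustration only
    first_part = name.split(',', 1)[1] if ',' in name else ''
    return [p.lower() for p in first_part.split()]

def _names_are_substrings_sketch(name1, name2):
    # join the first names, keep letters only, then test prefix both ways
    oname = re.sub('[^a-z]', '', ''.join(_first_names(name1)))
    tname = re.sub('[^a-z]', '', ''.join(_first_names(name2)))
    return oname.startswith(tname) or tname.startswith(oname)

assert _names_are_substrings_sketch("Mueller, Christoph", "Mueller, Ch.")
assert not _names_are_substrings_sketch("Mueller, Christoph", "Mueller, Charles")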
def names_are_equal_composites(name1, name2): ''' Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng" @param name1: Name string of the first name (w/ last name) @type name1: string @param name2: Name string of the second name (w/ last name) @type name2: string @return: Are the names equal composites? @rtype: boolean ''' if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) is_equal_composite = False oname_variations = create_name_tuples(name1[2]) tname_variations = create_name_tuples(name2[2]) for oname_variation in oname_variations: for tname_variation in tname_variations: oname = clean_name_string(oname_variation.lower(), "", False, True) tname = clean_name_string(tname_variation.lower(), "", False, True) if oname == tname: is_equal_composite = True break return is_equal_composite
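create_name_tuples is assumed here to generate spacing variants of the first-name list (e.g. joining adjacent parts so that ['Guang', 'Sheng'] also yields 'GuangSheng'); combined with whitespace stripping in clean_name_string, this is what lets "guang sheng" match "guangsheng". A hedged sketch under that assumption:

# Stand-in for create_name_tuples: the real helper may produce more variants.
def _name_variants(first_names):
    variants = set([' '.join(first_names)])
    for i in range(len(first_names) - 1):
        joined = (first_names[:i] + [first_names[i] + first_names[i + 1]]
                  + first_names[i + 2:])
        variants.add(' '.join(joined))
    return variants

def _equal_composites_sketch(names1, names2):
    clean = lambda s: s.replace(' ', '').lower()   # simplified cleaning step
    v1 = set(clean(v) for v in _name_variants(names1))
    v2 = set(clean(v) for v in _name_variants(names2))
    return bool(v1 & v2)

assert _equal_composites_sketch(['Guang', 'Sheng'], ['Guangsheng'])
assert not _equal_composites_sketch(['Guang', 'Ming'], ['Guangsheng'])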
def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of names, to use in search engines and similar.
    Base results:
    If surname is equal in [0.6,1.0]
    If surname similar in [0.4,0.8]
    If surname differs in [0.0,0.4]
    all depending on average compatibility of names and initials.
    '''
    jaro_fctn = None

    try:
        from Levenshtein import jaro_winkler
        jaro_fctn = jaro_winkler
    except ImportError:
        jaro_fctn = jaro_winkler_str_similarity

    score = 0.0
    oname = deepcopy(origin_name)
    tname = deepcopy(target_name)
    orig_name = split_name_parts(oname.lower())
    targ_name = split_name_parts(tname.lower())
    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if orig_name[0] == targ_name[0]:
        score += 0.6
    else:
        if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95) or
                min(len(orig_name[0]), len(targ_name[0])) <= 4):
            score += 0.0
        else:
            score += 0.4

    if orig_name[1] and targ_name[1]:
        max_initials = max(len(orig_name[1]), len(targ_name[1]))
        matching_i = 0

        if len(orig_name[1]) >= 1 and len(targ_name[1]) >= 1:
            for i in orig_name[1]:
                if i in targ_name[1]:
                    matching_i += 1

        max_names = max(len(orig_name[2]), len(targ_name[2]))
        matching_n = 0

        if len(orig_name[2]) >= 1 and len(targ_name[2]) >= 1:
            cleaned_targ_name = [clean_name_string(i,
                                                   replacement="",
                                                   keep_whitespace=False)
                                 for i in targ_name[2]]

            for i in orig_name[2]:
                if clean_name_string(i,
                                     replacement="",
                                     keep_whitespace=False) in cleaned_targ_name:
                    matching_n += 1

        name_score = ((matching_i + matching_n) * 0.4
                      / (max_names + max_initials))
        score += name_score

    return score
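The score is the sum of a surname base (0.6 equal, 0.4 similar, 0.0 different) and a name/initial term of at most 0.4, computed as (matching_i + matching_n) * 0.4 / (max_names + max_initials); this is what produces the bands listed in the docstring. A small self-contained check of that arithmetic, with the surname band passed in directly instead of being computed:

def _soft_score_sketch(surname_base, matching_i, max_initials, matching_n, max_names):
    # surname_base: 0.6 equal, 0.4 similar, 0.0 different
    score = surname_base
    if max_initials + max_names:
        score += (matching_i + matching_n) * 0.4 / (max_names + max_initials)
    return score

# equal surnames, all initials and first names matching -> top of the [0.6, 1.0] band
assert abs(_soft_score_sketch(0.6, 2, 2, 2, 2) - 1.0) < 1e-9
# similar surnames, half of the name evidence matching -> middle of the [0.4, 0.8] band
assert abs(_soft_score_sketch(0.4, 1, 2, 1, 2) - 0.6) < 1e-9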
def test_clean_name_string(self): """bibauthorid - test cleaning of name strings""" self.assertEqual('this is a full name', baidu.clean_name_string('this is a full name')) self.assertEqual('this is a full ,. pz', baidu.clean_name_string('this is a full ;,.$&[{{}}(=*)+]pz')) self.assertEqual('', baidu.clean_name_string(''))
def names_are_equal_gender(name1, name2, gendernames):
    '''
    Checks if two names are gender-equal, based on a word list

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param gendernames: dictionary of male/female names
    @type gendernames: dict

    @return: Are names gender-equal?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_equal_gender_b = True
    ogender = None
    tgender = None
    oname = name1[2][0].lower()
    tname = name2[2][0].lower()
    oname = clean_name_string(oname, "", False, True)
    tname = clean_name_string(tname, "", False, True)

    if oname in gendernames['boys']:
        ogender = 'Male'
    elif oname in gendernames['girls']:
        ogender = 'Female'

    if tname in gendernames['boys']:
        tgender = 'Male'
    elif tname in gendernames['girls']:
        tgender = 'Female'

    if print_debug:
        print '   Gender check: ', oname, ' is a ', ogender
        print '   Gender check: ', tname, ' is a ', tgender

    if ogender and tgender:
        if ogender != tgender:
            if print_debug:
                print '   Gender differs, force split!'

            names_are_equal_gender_b = False

    return names_are_equal_gender_b
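gendernames is expected to map 'boys' and 'girls' to collections of lowercase first names; a split is only forced when both first names are found in the lists and classified differently. A self-contained sketch of that classification step with toy word lists (not the module's real gender name data):

def _gender_of(first_name, gendernames):
    name = first_name.lower()
    if name in gendernames['boys']:
        return 'Male'
    if name in gendernames['girls']:
        return 'Female'
    return None

def _names_are_equal_gender_sketch(first1, first2, gendernames):
    g1 = _gender_of(first1, gendernames)
    g2 = _gender_of(first2, gendernames)
    # only force a split when both names are classified and disagree
    return not (g1 and g2 and g1 != g2)

gendernames = {'boys': set(['robert', 'john']), 'girls': set(['maria', 'anna'])}
assert not _names_are_equal_gender_sketch('Robert', 'Maria', gendernames)
assert _names_are_equal_gender_sketch('Robert', 'Kim', gendernames)  # unknown name: no evidence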
def update_virtualauthor_cluster(va_id):
    """
    Computes the clusterID for the virtualauthor. The clustering at this
    level is done by accumulating all the compatible surnames, but there is
    still space for a smarter choice.

    @note: The clustering number is used as a speed-up tool, and it is meant
        to accumulate all the possible compatible virtualauthors so we do not
        have to search anywhere else. This means that a worst case cluster is
        no clustering at all (only one cluster for the whole virtualauthors
        set) and a best case cluster is a clusterID for the minimum set of
        virtualauthors representing a real author.
        It is important _NOT to lose_ virtualauthors! It is better to have a
        bigger cluster than to lose possibly precious information later.

    @param va_id: ID of the virtual author the cluster shall be updated for
    @type va_id: int
    """
    ori_name = get_virtualauthor_records(va_id, 'orig_name_string')[0]['value']
    ori_name = clean_name_string(ori_name)
    current_cluster_ids = get_clusterids_from_name(ori_name, True)

    for va_item in [row for row in dat.VIRTUALAUTHORS
                    if row['virtualauthorid'] == va_id]:
        va_item['clusterid'] = current_cluster_ids[1]

    bconfig.LOGGER.debug("| Found %s cluster(s) for %s. Now set to %s."
                         % (len(current_cluster_ids[0]), ori_name,
                            current_cluster_ids[1]))
    update_virtualauthor_record(va_id, 'updated', 'True')
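The update itself is a plain in-memory scan over row dictionaries, setting 'clusterid' on every row whose 'virtualauthorid' matches. A minimal sketch of that pattern with toy rows shaped like the dat.VIRTUALAUTHORS entries used above (not the real table):

VIRTUALAUTHORS = [
    {'virtualauthorid': 1, 'clusterid': 0},
    {'virtualauthorid': 2, 'clusterid': 0},
]

def _set_cluster(va_id, cluster_id, rows):
    for row in [r for r in rows if r['virtualauthorid'] == va_id]:
        row['clusterid'] = cluster_id

_set_cluster(2, 7, VIRTUALAUTHORS)
assert VIRTUALAUTHORS[1]['clusterid'] == 7
assert VIRTUALAUTHORS[0]['clusterid'] == 0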
def names_are_synonymous(name1, name2, name_variations): ''' Checks if two names are synonymous; e.g. "Robert" vs. "Bob" @param name1: Name string of the first name (w/ last name) @type name1: string @param name2: Name string of the second name (w/ last name) @type name2: string @param name_variations: name variations list @type name_variations: list of lists @return: are names synonymous @rtype: boolean ''' if not isinstance(name1, list): name1 = split_name_parts(name1) if not isinstance(name2, list): name2 = split_name_parts(name2) print_debug = False names_are_synonymous_b = False max_matches = min(len(name1[2]), len(name2[2])) matches = [] for i in xrange(max_matches): matches.append(False) for nvar in name_variations: for i in xrange(max_matches): oname = name1[2][i].lower() tname = name2[2][i].lower() oname = clean_name_string(oname, "", False, True) tname = clean_name_string(tname, "", False, True) if oname in nvar and tname in nvar: if print_debug: print ' ', oname, ' and ', tname, ' are synonyms! Not splitting!' matches[i] = True if sum(matches) == max_matches: names_are_synonymous_b = True break return names_are_synonymous_b
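name_variations is expected to be a list of synonym groups (each group a collection of equivalent given names), and the check requires every compared first-name position to share at least one group. A self-contained sketch with toy synonym groups standing in for the module's real variation data:

name_variations = [
    ['robert', 'bob', 'rob'],
    ['william', 'bill', 'will'],
]

def _names_are_synonymous_sketch(first_names1, first_names2, variations):
    groups = [set(v) for v in variations]
    for oname, tname in zip(first_names1, first_names2):
        oname, tname = oname.lower(), tname.lower()
        # every compared position must share at least one synonym group
        if not any(oname in g and tname in g for g in groups):
            return False
    return True

assert _names_are_synonymous_sketch(['Robert'], ['Bob'], name_variations)
assert not _names_are_synonymous_sketch(['Robert', 'William'], ['Bob', 'John'], name_variations)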
def names_minimum_levenshtein_distance(name1, name2):
    '''
    Determines the minimum distance D between two names.
    Comparison is based on the minimum number of first names.
    Examples:
    D("guang", "guang sheng") = 0
    D("guang", "guangsheng") = 5
    D("guang sheng", "guangsheng") = 5
    D("guang sheng", "guang shing") = 1
    D("guang ming", "guang fin") = 2

    @precondition: Names have been checked for composition equality.
    @note: The truncation to the minimum number of first names is currently
        disabled (see the commented-out code below); all first names are
        joined before the distance is computed.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: the minimum Levenshtein distance between two names
    @rtype: int
    '''
    try:
        from Levenshtein import distance
    except ImportError:
        bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1

    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    onames = name1[2]
    tnames = name2[2]

#    min_names_count = min(len(onames), len(tnames))
#
#    if min_names_count <= 0:
#        return -1
#
#    oname = "".join(onames[:min_names_count]).lower()
#    tname = "".join(tnames[:min_names_count]).lower()

    oname = clean_name_string("".join(onames).lower(), "", False, True)
    tname = clean_name_string("".join(tnames).lower(), "", False, True)

    return distance(oname, tname)
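With the truncation disabled, the function is essentially Levenshtein.distance over the joined first names. A self-contained check of some of the distance values quoted in the docstring, with a small pure-Python fallback in case python-Levenshtein is not installed:

try:
    from Levenshtein import distance
except ImportError:
    def distance(a, b):
        # classic dynamic-programming edit distance, row by row
        prev = range(len(b) + 1)
        for i, ca in enumerate(a, 1):
            cur = [i]
            for j, cb in enumerate(b, 1):
                cur.append(min(prev[j] + 1,           # deletion
                               cur[j - 1] + 1,        # insertion
                               prev[j - 1] + (ca != cb)))  # substitution
            prev = cur
        return prev[-1]

assert distance("guang", "guangsheng") == 5        # append "sheng"
assert distance("guangsheng", "guangshing") == 1   # one substitution
assert distance("guangming", "guangfin") == 2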
def get_clusterids_from_name(name, return_matching=False):
    '''
    Returns a list of cluster IDs that fit the parameter 'name'.
    First checks if, in general, a cluster for this name exists. If not,
    one is created. If there is a cluster, all other fitting clusters are
    searched and their IDs are added to the list to be returned.

    @param name: The name to be on the lookout for.
    @type name: string
    @param return_matching: also return the reference name's matching cluster
    @type return_matching: boolean

    @return: if return_matching: list of
                 1) list of cluster IDs
                 2) the cluster ID matching the name
             if not return_matching: list of cluster IDs
    @rtype: if return_matching: list of (list of int, int)
            if not return_matching: list of int
    '''
    search_string = create_unified_name(name)
    search_string = clean_name_string(search_string)

    if len(search_string) > 150:
        search_string = search_string[:150]

    clusterids = set()
    matching_cluster = -1
    initials = ""
    split_string = ""

    if search_string[:-1].count(",") > 0:
        split_string = search_string[:-1].replace(' ', '').split(',')

        if split_string[1]:
            initials = split_string[1].split('.')

    if len(initials) > 2 and len(initials) <= 5:
        permutation_list = initials
        permutation_base = ("%s, %s." % (search_string.split(',')[0],
                                         permutation_list[0]))

        for permutation in permutations(permutation_list[1:]):
            name_string = "%s %s." % (permutation_base,
                                      ". ".join(permutation))
            clusters = _get_clusterids_from_name(name_string, return_matching)

            if return_matching:
                matching_cluster = clusters[1]

                for clusterid in clusters[0]:
                    clusterids.add(clusterid)
            else:
                for clusterid in clusters:
                    clusterids.add(clusterid)
    else:
        clusters = _get_clusterids_from_name(search_string, return_matching)

        if return_matching:
            matching_cluster = clusters[1]
            clusterids = clusters[0]
        else:
            clusterids = clusters

    if return_matching:
        return [clusterids, matching_cluster]
    else:
        return clusterids
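For three to five initials, the first initial stays fixed and one query string is built per permutation of the rest. A self-contained sketch of how those strings are assembled; the surname and initials are toy values, and permutations is taken from itertools here, which is assumed to match the helper used above:

from itertools import permutations

surname = "Ellis"
initials = ['J', 'R', 'K']          # as produced by splitting "J.R.K" on '.'
base = "%s, %s." % (surname, initials[0])

queries = ["%s %s." % (base, ". ".join(p)) for p in permutations(initials[1:])]
assert queries == ["Ellis, J. R. K.", "Ellis, J. K. R."]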
def compare_names(origin_name, target_name):
    """
    Compute an index of confidence that indicates whether two names might
    represent the same person. The computation is based on similarities of
    name structure, in particular:
        Initials:
            We assign a high score if all the initial matches are in the
            right order, a much lower one if they are in the wrong order
        Names:
            We assign a lower score for mismatching names and a higher
            score for fully matching names
    If there is nothing to compare we are forced to assume a high score.

    Example for splitting names:
        In : bibauthorid.split_name_parts("Ellis, John R")
        Out: ['Ellis', ['J', 'R'], ['John']]

        Ellis, R. Keith      => [ [Ellis], [R, K], [Keith] ]
        Ellis, Richard Keith => [ [Ellis], [R, K], [Richard, Keith] ]

    Since the initials are computed both from the real initials present in
    the name string and from the full names, if there is no initials match
    we are 100% confident that:
        1. we have no names/initials at all, or
        2. we have completely different names;
    hence if there is no initial match we skip this step.

    @param origin_name: The first author's last name, first name(s) and initials
    @type origin_name: string
    @param target_name: The second author's last name, first name(s) and initials
    @type target_name: string

    @return: a value that describes the likelihood of the names being the same
    @rtype: float
    """
    jaro_fctn = None

    try:
        from Levenshtein import jaro_winkler
        jaro_fctn = jaro_winkler
    except ImportError:
        jaro_fctn = jaro_winkler_str_similarity

    oname = deepcopy(origin_name)
    tname = deepcopy(target_name)
    orig_name = split_name_parts(oname.lower())
    targ_name = split_name_parts(tname.lower())

    bconfig.LOGGER.info("|--> Comparing Names: \"%s\" and \"%s\""
                        % (origin_name, target_name))

    lastname_modifier = 0.0

    if not (orig_name[0] == targ_name[0]):
        # last names are not equal before cleaning them. Assign entry penalty.
        lastname_modifier = 0.15

    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if not (orig_name[0] == targ_name[0]):
        if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95) or
                min(len(orig_name[0]), len(targ_name[0])) <= 4):
            bconfig.LOGGER.warn("Unequal lastnames (%s vs. %s). "
                                "Skipping Comparison"
                                % (orig_name[0], targ_name[0]))
            return 0.0
        else:
            bconfig.LOGGER.log(25, "Last names are not equal, "
                               "but similar enough to continue the comparison")
            # Let it go through...however, reduce the final result a little.
            lastname_modifier = 0.24
    else:
        # last names are equal after cleaning them. Reduce penalty.
        if lastname_modifier == 0.15:
            lastname_modifier = 0.02

    if orig_name[2] and targ_name[2]:
        if len(orig_name[2]) > 1 or len(targ_name[2]) > 1:
            variation_ps = []
            oname_variations = create_name_tuples(orig_name[2])
            tname_variations = create_name_tuples(targ_name[2])

            for oname_variation in oname_variations:
                for tname_variation in tname_variations:
                    oname_var = split_name_parts("%s, %s"
                                    % (orig_name[0], oname_variation))
                    tname_var = split_name_parts("%s, %s"
                                    % (targ_name[0], tname_variation))
                    variation_ps.append(_perform_matching(oname_var,
                                                          tname_var))

            return max(variation_ps) - lastname_modifier

    return _perform_matching(orig_name, targ_name) - lastname_modifier
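The last-name handling above sets a penalty (lastname_modifier) that is later subtracted from the matching score: 0.0 for identical raw surnames, 0.02 when they only become equal after cleaning, 0.24 when they merely look similar (Jaro-Winkler >= .95 and length > 4), and an early 0.0 return otherwise. A self-contained sketch of just that decision, with the similarity and equality checks passed in as stand-in inputs rather than computed:

def _lastname_modifier_sketch(raw_equal, cleaned_equal, jaro_sim, min_len):
    modifier = 0.0 if raw_equal else 0.15      # entry penalty before cleaning
    if cleaned_equal:
        if modifier == 0.15:
            modifier = 0.02                    # equal only after cleaning
        return modifier
    if jaro_sim < .95 or min_len <= 4:
        return None                            # comparison skipped, score 0.0
    return 0.24                                # similar enough, larger penalty

assert _lastname_modifier_sketch(True, True, 1.0, 10) == 0.0
assert _lastname_modifier_sketch(False, True, 1.0, 10) == 0.02
assert _lastname_modifier_sketch(False, False, 0.97, 10) == 0.24
assert _lastname_modifier_sketch(False, False, 0.80, 10) is None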