def soft_compare_names(origin_name, target_name):
    '''
    Loosely score how well two names agree (meant for search-like lookups).

    Each name is copied, run through translate_to_ascii, lower-cased and
    split with split_name_parts. The code below treats index 0 of the split
    result as the surname, index 1 as the initials and index 2 as the
    first names.

    Surname contribution:
        0.6  the cleaned surnames are identical
        0.4  they differ, the comparison function does not return a value
             below 0.95 and both surnames are longer than 4 characters
        0.0  otherwise

    When both names have initials, up to 0.4 more is added, proportional
    to the share of initials and first names of origin_name that are also
    found in target_name.

    @param origin_name: name to compare
    @param target_name: name to compare against
    @return: score between 0.0 and 1.0
    @rtype: float
    '''
    # NOTE(review): the 0.95 threshold reads like a similarity cut-off (a
    # Jaro-Winkler import used to be sketched here in a comment), yet the
    # function actually used is `distance` -- confirm it returns a
    # similarity in [0, 1] and not an edit distance.
    surname_metric = distance

    def strip_name(text):
        # Shared cleaning settings for surnames and first names.
        return clean_name_string(text, replacement="", keep_whitespace=False)

    origin_copy = deepcopy(origin_name)
    target_copy = deepcopy(target_name)
    origin_ascii = translate_to_ascii(origin_copy)[0]
    target_ascii = translate_to_ascii(target_copy)[0]
    origin_parts = split_name_parts(origin_ascii.lower())
    target_parts = split_name_parts(target_ascii.lower())
    for parts in (origin_parts, target_parts):
        parts[0] = strip_name(parts[0])

    origin_surname = origin_parts[0]
    target_surname = target_parts[0]
    if origin_surname == target_surname:
        score = 0.6
    elif (not (surname_metric(origin_surname.lower(),
                              target_surname.lower()) < .95)
          and min(len(origin_surname), len(target_surname)) > 4):
        score = 0.4
    else:
        score = 0.0

    origin_initials = origin_parts[1]
    target_initials = target_parts[1]
    if origin_initials and target_initials:
        initials_total = max(len(origin_initials), len(target_initials))
        initials_hits = sum(1 for initial in origin_initials
                            if initial in target_initials)

        origin_firstnames = origin_parts[2]
        target_firstnames = target_parts[2]
        names_total = max(len(origin_firstnames), len(target_firstnames))
        names_hits = 0
        if origin_firstnames and target_firstnames:
            target_pool = [strip_name(name) for name in target_firstnames]
            names_hits = len([name for name in origin_firstnames
                              if strip_name(name) in target_pool])

        score += (initials_hits + names_hits) * 0.4 / (names_total + initials_total)

    return score
def soft_compare_names(origin_name, target_name):
    '''
    Loosely score how well two names agree (meant for search-like lookups).

    Each name is copied, run through translate_to_ascii, lower-cased and
    split with split_name_parts; surnames and first names are additionally
    normalised with clean_string. The code below treats index 0 of the
    split result as the surname, index 1 as the initials and index 2 as
    the first names.

    Surname contribution:
        0.6  the cleaned, lower-cased surnames are identical
        0.4  they differ, the comparison function does not return a value
             below 0.95 and both surnames are longer than 4 characters
        0.0  otherwise

    When both names have initials, up to 0.4 more is added, proportional
    to the share of initials and first names of origin_name that are also
    found in target_name.

    @param origin_name: name to compare
    @param target_name: name to compare against
    @return: score between 0.0 and 1.0
    @rtype: float
    '''
    # NOTE(review): the 0.95 threshold reads like a similarity cut-off, yet
    # the function used is `distance` -- confirm it returns a similarity in
    # [0, 1] and not an edit distance.
    surname_metric = distance
    total = 0.0

    origin_copy = deepcopy(origin_name)
    target_copy = deepcopy(target_name)
    origin_ascii = translate_to_ascii(origin_copy)[0]
    target_ascii = translate_to_ascii(target_copy)[0]
    origin_parts = split_name_parts(origin_ascii.lower())
    target_parts = split_name_parts(target_ascii.lower())
    origin_parts[0] = clean_string(origin_parts[0])
    target_parts[0] = clean_string(target_parts[0])

    origin_surname = origin_parts[0]
    target_surname = target_parts[0]
    if origin_surname.lower() == target_surname.lower():
        total += 0.6
    else:
        surnames_close = (
            not (surname_metric(unicode(origin_surname.lower()),
                                unicode(target_surname.lower())) < .95)
            and min(len(origin_surname), len(target_surname)) > 4)
        if surnames_close:
            total += 0.4

    origin_initials = origin_parts[1]
    target_initials = target_parts[1]
    if not (origin_initials and target_initials):
        # Without initials on both sides only the surname counts.
        return total

    initials_total = max(len(origin_initials), len(target_initials))
    initials_hits = len([initial for initial in origin_initials
                         if initial in target_initials])

    origin_firstnames = origin_parts[2]
    target_firstnames = target_parts[2]
    names_total = max(len(origin_firstnames), len(target_firstnames))
    names_hits = 0
    if origin_firstnames and target_firstnames:
        target_pool = [clean_string(name) for name in target_firstnames]
        for name in origin_firstnames:
            if clean_string(name) in target_pool:
                names_hits += 1

    total += (initials_hits + names_hits) * 0.4 / (names_total + initials_total)
    return total
def clean_string(string, title_strings=False):
    """
    Normalise a name string through a fixed sequence of cleaning helpers.

    The value is passed, in order, through
    _replace_content_in_parentheses (with '' as replacement),
    _apply_character_mapping_to_name for the locale mapping and then the
    special-character mapping, translate_to_ascii (first element of its
    result) and _remove_special_characters_and_numbers.

    @param string: the text to clean
    @param title_strings: when true, the cleaned text is returned
        title-cased via str.title()
    @return: the cleaned text
    """
    cleaned = _replace_content_in_parentheses(string, '')
    for char_mapping in (M_NAME_LOCALE_CHARACTER_MAPPING,
                         M_NAME_SPECIAL_CHARACTER_MAPPING):
        cleaned = _apply_character_mapping_to_name(cleaned, char_mapping)
    cleaned = translate_to_ascii(cleaned)[0]
    cleaned = _remove_special_characters_and_numbers(cleaned)
    return cleaned.title() if title_strings else cleaned
def _sort_alphanumerically_remove_leading_articles_strip_accents(self, val):
    """
    Build a sort key: ASCII-folded, lower-cased, without a leading article.

    Provided 'the' and 'a' are listed in LEADING_ARTICLES:
        'The title' => 'title'
        'A title'   => 'title'
        'Title'     => 'title'

    A false value (None, '') yields ''.
    """
    if not val:
        return ''
    # translate_to_ascii returns a sequence; its last element is used.
    lowered = translate_to_ascii(val).pop().lower()
    # Cut at the first space only: leading word vs. the rest of the phrase.
    leading_word, separator, remainder = lowered.partition(" ")
    if separator and leading_word.strip() in LEADING_ARTICLES:
        return remainder.strip()
    return lowered.strip()
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and return a similarity score.

    Both names are converted with translate_to_ascii and split with
    split_name_parts; the code reads index 0 of the split result as the
    surname, index 1 as the list of initials and index 2 as the list of
    first names. The score is then built up in stages:

      1. surname: 1.0 when the surnames are identical (or identical once
         every non-alphanumeric character is removed), otherwise
         max(0, 0.5 - distance / MAX_ALLOWED_SURNAME_DISTANCE);
      2. initials: the score is reduced according to position-weighted
         mismatches, the overlap of the two initial sets and the
         normalised distance between the joined initials;
      3. first names: reduced according to the worst and the average
         normalised distance of position-aligned names;
      4. bonuses for synonymous names, substring names and equal
         composite names;
      5. penalties: divided by 3 when the genders differ, forced to 0.0
         when the surname distance exceeds MAX_ALLOWED_SURNAME_DISTANCE,
         multiplied by 0.9 when at least one name has initials only.

    Every stage is traced through name_comparison_print.

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @param initials_penalty: when True, the 0.9 "initials only" penalty is
        applied even if neither name has any first name
    @return: score between 0.0 and 1.0
    @rtype: float
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: " , origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    # --- 1. surname ------------------------------------------------------
    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames that differ only by punctuation/whitespace artifacts
        # still count as a perfect match.
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # At least one side has no first name at all...
    initials_only = (min(len(no[2]), len(nt[2])) == 0)
    # ...and, when the counts are equal too, neither side has one.
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    # NOTE(review): this call receives the ASCII name strings whereas the
    # call further down receives the split name parts; the value is only
    # traced, never used for scoring -- confirm which argument form
    # full_names_are_equal_composites expects.
    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    # --- 2. initials -----------------------------------------------------
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)

    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    # alo is the longer list of initials, alt the shorter one.
    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Walk the longer list from its end; only positions that also exist
        # in the shorter list are compared. The weight i + 1 grows towards
        # the front, so a mismatch on an earlier initial costs more.
        mismatch_weight = sum([i + 1 for i, k in enumerate(reversed(alo))
                               if lo - 1 - i < lt and k != alt[lo - 1 - i]])
        max_weight = float(max_n_initials * (max_n_initials + 1)) / 2
        initials_screwup = mismatch_weight / max_weight
        # Explicit float cast, like every other ratio in this function:
        # with two ints the division would floor under Python 2 and the
        # normalised distance would collapse to 0 or 1.
        initials_distance = float(initials_distance) / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    score = max((score - ((0.75 * initials_screwup + 0.10 * (1. - initials_c)
                           + 0.15 * initials_distance) * score)), 0.0)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    # --- 3. first names --------------------------------------------------
    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    if not initials_only:
        # nalo is the longer list of first names, nalt the shorter one.
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # (distance, longest length) for each position present in both lists.
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]),
                               max(len(k), len(nalt[nlo - 1 - i])))
                              for i, k in enumerate(reversed(nalo))
                              if nlo - 1 - i < nlt]
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = (sum([float(i[0]) / i[1] for i in names_screwup_list])
                             / len(names_screwup_list))
    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = max(score - score * (0.75 * max_names_screwup + 0.25 * avg_names_screwup), 0.0)
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    # --- 4. bonuses ------------------------------------------------------
    if vars_eq:
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            # Synonyms found at the same position earn the larger bonus.
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")
    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    # --- 5. penalties ----------------------------------------------------
    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|-- no gender penalty")
    name_comparison_print("|-- gender score: ", score)

    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score, initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only, only_initials_available)

    name_comparison_print("||- final score: ", score)

    return score
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

        ('260__', [{ 'tags' : '260__c',
                     'threshold' : 0.8,
                     'compare_mode' : 'lazy',
                     'match_mode' : 'date',
                     'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    Rules for which either record has no value are skipped and do not count
    as a comparison; 'joker' rules never count as a comparison either.

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the rules used when validating, as an iterable of
        (field_tags, threshold, compare_mode, match_mode, result_mode)
        tuples, where field_tags is a comma-separated string of tags
    @type ruleset: list

    @param verbose: be loud (diagnostics go to stderr when above 8)
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done.
        1.0 as soon as a 'final' or 'joker' rule matches; 0.0 when a 'final'
        rule fails, when a non-joker 'strict' rule finds a different number
        of values, when no comparison was counted, or when fewer than
        CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS matches succeeded.
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0

    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        if compare_mode == 'ignored':
            # Honour the documented 'ignored' mode: the rule takes no part
            # in the validation and is not counted as a comparison.
            continue

        field_tag_list = field_tags.split(',')
        if verbose > 8:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which leaked stray characters into the message.
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison "
                             "mode '%s' as '%s' result with threshold %0.2f\n"
                             % (field_tag_list, compare_mode, match_mode,
                                result_mode, threshold))

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is not None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match, dropping empty values
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if not original_record_values or not matched_record_values:
            # At least one of the records has no value for these tags:
            # nothing to compare, skip the rule without counting it.
            if verbose > 8:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # we first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        # How many field-value matches are needed for successful validation
        # of this record
        matches_needed = 0
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match.
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False

        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values,
                                                   matched_record_values,
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons,
                                                               threshold,
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" %
                                 (str(original_record_values),
                                  str(matched_record_values),
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 indicating exact-match when
                # final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")

    # Nothing compared, or too few successful matches: not a valid match.
    if total_number_of_comparisons == 0 \
       or total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
def _sort_nosymbols_case_insensitive_strip_accents(self, val):
    """
    Build a sort key: ASCII-folded, lower-cased, reduced to the pieces
    matched by _RE_NOSYMBOLS (joined back together without separator).

    A false value (None, '') yields ''.
    """
    if val:
        # translate_to_ascii returns a sequence; its last element is used.
        lowered = translate_to_ascii(val).pop().lower()
        kept_pieces = _RE_NOSYMBOLS.findall(lowered)
        return ''.join(kept_pieces)
    return ''
def _sort_case_insensitive_strip_accents(self, val):
    """
    Build a sort key: the ASCII-folded, lower-cased form of val.

    A false value (None, '') yields ''.
    """
    if val:
        # translate_to_ascii returns a sequence; its last element is used.
        ascii_forms = translate_to_ascii(val)
        return ascii_forms.pop().lower()
    return ''