Ejemplo n.º 1
0
def soft_compare_names(origin_name, target_name):
    '''
    Loosely score how well two person names match (e.g. for search ranking).

    The surname comparison sets a base score and the initials / first names
    add at most 0.4 on top of it:
      - identical surnames: result in [0.6, 1.0]
      - similar surnames:   result in [0.4, 0.8]
      - different surnames: result in [0.0, 0.4]

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @return: the combined score as a float
    '''
    # NOTE(review): the value returned by `distance` is compared against 0.95
    # below as if it were a similarity ratio (higher = closer); confirm that
    # this is what `distance` actually returns.
    similarity = distance

    def _squash(text):
        # Same cleaning call everywhere: drop unwanted characters and spaces.
        return clean_name_string(text, replacement="", keep_whitespace=False)

    # Work on copies so the ASCII translation cannot touch the caller's data.
    ascii_origin = deepcopy(origin_name)
    ascii_target = deepcopy(target_name)
    ascii_origin = translate_to_ascii(ascii_origin)[0]
    ascii_target = translate_to_ascii(ascii_target)[0]

    # Index 0 is used as the surname, 1 as the initials, 2 as the first names.
    left = split_name_parts(ascii_origin.lower())
    right = split_name_parts(ascii_target.lower())
    left[0] = _squash(left[0])
    right[0] = _squash(right[0])

    if left[0] == right[0]:
        score = 0.6
    elif (not (similarity(left[0].lower(), right[0].lower()) < .95)
          and min(len(left[0]), len(right[0])) > 4):
        # Only surnames longer than four characters can count as "similar".
        score = 0.4
    else:
        score = 0.0

    # The name bonus is only computed when both sides carry initials.
    if left[1] and right[1]:
        initial_hits = sum(1 for initial in left[1] if initial in right[1])

        name_hits = 0
        if left[2] and right[2]:
            right_names = [_squash(name) for name in right[2]]
            name_hits = sum(1 for name in left[2]
                            if _squash(name) in right_names)

        slots = (max(len(left[2]), len(right[2]))
                 + max(len(left[1]), len(right[1])))
        score += (initial_hits + name_hits) * 0.4 / slots
    return score
Ejemplo n.º 2
0
def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of two names, for use in search engines and similar.

    Score ranges, depending on how the surnames relate:
      - equal surnames:   [0.6, 1.0]
      - similar surnames: [0.4, 0.8]
      - other surnames:   [0.0, 0.4]
    The position inside each range depends on how many initials and first
    names the two names have in common.

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @return: the score as a float
    '''
    # NOTE(review): `distance` is used as if it returned a similarity ratio
    # (it is tested against 0.95 below); confirm against its definition.
    compare = distance
    total = 0.0

    # Copies protect the caller's objects from the ASCII translation step.
    source = deepcopy(origin_name)
    candidate = deepcopy(target_name)
    source = translate_to_ascii(source)[0]
    candidate = translate_to_ascii(candidate)[0]

    # Parts are indexed as: 0 surname, 1 initials, 2 first names.
    src_parts = split_name_parts(source.lower())
    cand_parts = split_name_parts(candidate.lower())
    src_parts[0] = clean_string(src_parts[0])
    cand_parts[0] = clean_string(cand_parts[0])

    src_surname = src_parts[0].lower()
    cand_surname = cand_parts[0].lower()
    if src_surname == cand_surname:
        total += 0.6
    else:
        too_far = compare(unicode(src_surname), unicode(cand_surname)) < .95
        too_short = min(len(src_parts[0]), len(cand_parts[0])) <= 4
        if not (too_far or too_short):
            total += 0.4

    # Initials and first names only contribute when both names have initials.
    if src_parts[1] and cand_parts[1]:
        shared_initials = len([i for i in src_parts[1] if i in cand_parts[1]])

        shared_names = 0
        if src_parts[2] and cand_parts[2]:
            cand_names = [clean_string(n) for n in cand_parts[2]]
            shared_names = len([n for n in src_parts[2]
                                if clean_string(n) in cand_names])

        slots = (max(len(src_parts[2]), len(cand_parts[2]))
                 + max(len(src_parts[1]), len(cand_parts[1])))
        total += (shared_initials + shared_names) * 0.4 / slots
    return total
Ejemplo n.º 3
0
def soft_compare_names(origin_name, target_name):
    '''
    Return a rough similarity score for two names (search-engine use).

    Base result by surname:
      - surname equal:   [0.6, 1.0]
      - surname similar: [0.4, 0.8]
      - surname differs: [0.0, 0.4]
    The remaining 0.4 is distributed according to the share of matching
    initials and first names.

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @return: the score as a float
    '''
    def _count_shared(items, pool):
        # Number of entries of `items` that also occur in `pool`.
        return sum(1 for item in items if item in pool)

    # NOTE(review): the 0.95 threshold below treats the result of `distance`
    # as a similarity ratio; verify that this matches its real semantics.
    surname_similarity = distance

    # Translate copies, so the arguments themselves are never modified.
    first = deepcopy(origin_name)
    second = deepcopy(target_name)
    first = translate_to_ascii(first)[0]
    second = translate_to_ascii(second)[0]

    # Each split result is used as [surname, initials, first names, ...].
    first_parts = split_name_parts(first.lower())
    second_parts = split_name_parts(second.lower())
    first_parts[0] = clean_string(first_parts[0])
    second_parts[0] = clean_string(second_parts[0])

    result = 0.0
    if first_parts[0].lower() == second_parts[0].lower():
        result += 0.6
    elif not (surname_similarity(unicode(first_parts[0].lower()),
                                 unicode(second_parts[0].lower())) < .95
              or min(len(first_parts[0]), len(second_parts[0])) <= 4):
        result += 0.4

    if first_parts[1] and second_parts[1]:
        initials_total = max(len(first_parts[1]), len(second_parts[1]))
        initials_matched = _count_shared(first_parts[1], second_parts[1])

        names_total = max(len(first_parts[2]), len(second_parts[2]))
        names_matched = 0
        if first_parts[2] and second_parts[2]:
            second_cleaned = [clean_string(name) for name in second_parts[2]]
            first_cleaned = [clean_string(name) for name in first_parts[2]]
            names_matched = _count_shared(first_cleaned, second_cleaned)

        bonus = ((initials_matched + names_matched) * 0.4
                 / (names_total + initials_total))
        result += bonus
    return result
Ejemplo n.º 4
0
def clean_string(string, title_strings=False):
    """
    Run a string through the name clean-up pipeline and return the result.

    The steps are applied in this order:
      1. `_replace_content_in_parentheses` with an empty replacement
      2. `_apply_character_mapping_to_name` with the locale character mapping
      3. `_apply_character_mapping_to_name` with the special character mapping
      4. `translate_to_ascii` (first element of its result is kept)
      5. `_remove_special_characters_and_numbers`

    @param string: the text to clean
    @param title_strings: when true, return the cleaned text title-cased
    @return: the cleaned string
    """
    cleaned = _replace_content_in_parentheses(string, '')
    for mapping in (M_NAME_LOCALE_CHARACTER_MAPPING,
                    M_NAME_SPECIAL_CHARACTER_MAPPING):
        cleaned = _apply_character_mapping_to_name(cleaned, mapping)
    cleaned = translate_to_ascii(cleaned)[0]
    cleaned = _remove_special_characters_and_numbers(cleaned)

    return cleaned.title() if title_strings else cleaned
Ejemplo n.º 5
0
 def _sort_alphanumerically_remove_leading_articles_strip_accents(self, val):
     """
     Build a sort key: ASCII-translated, lower-cased, leading article removed.

     Examples (assuming 'the' and 'a' are listed in LEADING_ARTICLES):
     'The title' => 'title'
     'A title' => 'title'
     'Title' => 'title'

     Falsy input yields the empty string.
     """
     if not val:
         return ''
     lowered = translate_to_ascii(val).pop().lower()
     # Cut at the first space: leading word on the left, rest on the right.
     first_word, space, remainder = lowered.partition(" ")
     if space and first_word.strip() in LEADING_ARTICLES:
         return remainder.strip()
     return lowered.strip()
Ejemplo n.º 6
0
def compare_names(origin_name, target_name, initials_penalty=False):
    '''
    Compare two names and return a score for how well they match.

    The score is built up in stages, each one logged through
    name_comparison_print:
      1. surname score from the distance between the two surnames
      2. penalty for differing initials
      3. penalty for differing first names (skipped when either side has
         no full first names)
      4. bonuses for synonymous names, substring names and composite names
      5. division by 3 when the genders of the names differ
      6. reset to 0.0 when the surname distance exceeds the allowed maximum
      7. factor 0.9 when only initials could be compared (see below)

    @param origin_name: first name to compare
    @param target_name: second name to compare
    @param initials_penalty: when true, the 0.9 "initials only" factor is
        applied even if neither name has full first names; otherwise it is
        applied only when exactly one of the two names lacks them
    @return: the final score as a float; higher means a better match
    '''
    MAX_ALLOWED_SURNAME_DISTANCE = 2
    name_comparison_print("\nComparing: " , origin_name, ' ', target_name)
    gendernames = GLOBAL_gendernames
    name_variations = GLOBAL_name_variations

    origin_name = translate_to_ascii(origin_name)[0]
    target_name = translate_to_ascii(target_name)[0]

    # The split results are used below as: [0] surname, [1] initials,
    # [2] full first names.
    no = split_name_parts(origin_name, True, "", True)
    nt = split_name_parts(target_name, True, "", True)

    name_comparison_print("|- splitted no: ", no)
    name_comparison_print("|- splitted nt: ", nt)

    score = 0.0

    ## 1. SURNAME
    surname_dist = distance(no[0], nt[0])
    name_comparison_print("|- surname distance: ", surname_dist)

    if surname_dist > 0:
        # Surnames that differ only in non-alphanumeric characters are
        # treated as identical.
        l_artifact_removal = re.compile("[^a-zA-Z0-9]")
        fn1 = l_artifact_removal.sub("", no[0])
        fn2 = l_artifact_removal.sub("", nt[0])

        if fn1 == fn2:
            score = 1.0
        else:
            # NOTE(review): with MAX_ALLOWED_SURNAME_DISTANCE = 2 this is 0.0
            # for every distance >= 1 (0.5 - 1/2 = 0) -- confirm intended.
            score = max(0.0, 0.5 - (float(surname_dist) / float(MAX_ALLOWED_SURNAME_DISTANCE)))
    else:
        score = 1.0
    name_comparison_print('||- surname score: ', score)

    # initials_only: at least one of the names has no full first names.
    # only_initials_available: neither of the names has full first names.
    initials_only = ((min(len(no[2]), len(nt[2]))) == 0)
    only_initials_available = False
    if len(no[2]) == len(nt[2]) and initials_only:
        only_initials_available = True

    name_comparison_print('|- initials only: ', initials_only)
    name_comparison_print('|- only initials available: ', only_initials_available)

    # This value is only logged; the composite check that influences the
    # score is `composits_eq`, computed further down from the split parts.
    names_are_equal_composites = False
    if not initials_only:
        names_are_equal_composites = full_names_are_equal_composites(origin_name, target_name)
    name_comparison_print("|- equal composites: ", names_are_equal_composites)

    ## 2. INITIALS
    max_n_initials = max(len(no[1]), len(nt[1]))
    initials_intersection = set(no[1]).intersection(set(nt[1]))
    n_initials_intersection = len(initials_intersection)
    initials_union = set(no[1]).union(set(nt[1]))
    n_initials_union = len(initials_union)


    initials_distance = distance("".join(no[1]), "".join(nt[1]))
    # initials_c: shared initials divided by all distinct initials
    # (1 when neither name has any initials).
    if n_initials_union > 0:
        initials_c = float(n_initials_intersection) / float(n_initials_union)
    else:
        initials_c = 1

    # alo is the longer list of initials, alt the other one
    # (on equal length alo is the target's list).
    if len(no[1]) > len(nt[1]):
        alo = no[1]
        alt = nt[1]
    else:
        alo = nt[1]
        alt = no[1]
    lo = len(alo)
    lt = len(alt)
    if max_n_initials > 0:
        # Walk the longer list from its end; every position that also exists
        # in the shorter list and holds a different initial adds a weight of
        # (i + 1), so mismatches nearer the front weigh more. The sum is
        # normalised by 1 + 2 + ... + max_n_initials.
        initials_screwup = sum([i + 1 for i, k in enumerate(reversed(alo))
                            if lo - 1 - i < lt and k != alt[lo - 1 - i] ]) / \
                            float(float(max_n_initials * (max_n_initials + 1)) / 2)
        initials_distance = initials_distance / max_n_initials
    else:
        initials_screwup = 0
        initials_distance = 0

    # Scale the score down by the weighted initials penalties, never below 0.
    score = max((score - ((0.75 * initials_screwup + 0.10 * (1. - initials_c)\
            + 0.15 * initials_distance) * score)), 0.0)
    name_comparison_print("|- initials sets: ", no[1], " ", nt[1])
    name_comparison_print("|- initials distance: ", initials_distance)
    name_comparison_print("|- initials c: ", initials_c)
    name_comparison_print("|- initials screwup: ", initials_screwup)
    name_comparison_print("||- initials score: ", score)

    # Relations between the two full names, used for bonuses/penalties below.
    composits_eq = full_names_are_equal_composites(no, nt)
    if len(no[2]) > 0 and len(nt[2]) > 0:
        gender_eq = full_names_are_equal_gender(no, nt, gendernames)
    else:
        # Without full first names on both sides no gender check is made.
        gender_eq = True
    vars_eq = full_names_are_synonymous(no, nt, name_variations)
    substr_eq = full_names_are_substrings(no, nt)

    ## 3. FIRST NAMES
    if not initials_only:
        # nalo is the longer list of first names, nalt the other one.
        if len(no[2]) > len(nt[2]):
            nalo = no[2]
            nalt = nt[2]
        else:
            nalo = nt[2]
            nalt = no[2]
        nlo = len(nalo)
        nlt = len(nalt)
        # For every position present in both lists: (distance between the two
        # names at that position, length of the longer of the two names).
        names_screwup_list = [(distance(k, nalt[nlo - 1 - i]), max(len(k), len(nalt[nlo - 1 - i])))
                             for i, k in enumerate(reversed(nalo)) \
                             if nlo - 1 - i < nlt]
        # Worst and average length-relative distance over those positions.
        max_names_screwup = max([float(i[0]) / i[1] for i in names_screwup_list])
        avg_names_screwup = sum([float(i[0]) / i[1] for i in names_screwup_list])\
                            / len(names_screwup_list)

    else:
        max_names_screwup = 0
        avg_names_screwup = 0

    score = max(score - score * ( 0.75 * max_names_screwup + 0.25 * avg_names_screwup), 0.0)
    name_comparison_print("|- max names screwup: ", max_names_screwup)
    name_comparison_print("|- avg screwup: ", avg_names_screwup)
    name_comparison_print("||- names score: ", score)
    name_comparison_print("|- names composites: ", composits_eq)
    name_comparison_print("|- same gender: ", gender_eq)
    name_comparison_print("|- synonims: ", vars_eq)
    name_comparison_print("|- substrings: ", substr_eq)

    ## 4. BONUSES
    # Every bonus moves the score a fixed fraction of the way towards 1.0.
    if vars_eq:
        # All (origin name, target name) pairs reported as synonymous.
        synmap = [[i, j, names_are_synonymous(i, j, name_variations)] for i in no[2] for j in nt[2]]
        synmap = [i for i in synmap if i[2] == True]
        name_comparison_print("|-- synmap: ", synmap)
        for i in synmap:
            # Synonyms found at the same position earn the larger bonus
            # (index() returns the position of the first occurrence).
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.5
            else:
                score = score + (1 - score) * 0.15
    else:
        name_comparison_print("|-- synmap: empty")
    name_comparison_print("|-- synmap score: ", score)

    if substr_eq and not initials_only:
        # All (origin name, target name) pairs reported as substrings.
        ssmap = [[i, j, names_are_substrings(i, j)] for i in no[2] for j in nt[2]]
        ssmap = [i for i in ssmap if i[2] == True]
        name_comparison_print("|-- substr map: ", ssmap)
        for i in ssmap:
            if no[2].index(i[0]) == nt[2].index(i[1]):
                score = score + (1 - score) * 0.2
            else:
                score = score + (1 - score) * 0.05
    else:
        name_comparison_print("|-- substr map: empty")

    name_comparison_print("|-- substring score: ", score)

    if composits_eq and not initials_only:
        name_comparison_print("|-- composite names")
        score = score + (1 - score) * 0.2
    else:
        name_comparison_print("|-- not composite names")
    name_comparison_print("|-- composite score: ", score)

    ## 5. GENDER PENALTY
    if not gender_eq:
        score = score / 3.
        name_comparison_print("|-- apply gender penalty")
    else:
        name_comparison_print("|--   no  gender penalty")

    name_comparison_print("|-- gender score: ", score)

    ## 6. SURNAME CUT-OFF
    # Applied after the bonuses, so it overrides everything computed so far.
    if surname_dist > MAX_ALLOWED_SURNAME_DISTANCE:
        score = 0.0
        name_comparison_print("|- surname trim: ", score)
    else:
        name_comparison_print("|- no surname trim: ", score)

    ## 7. INITIALS-ONLY PENALTY
    if initials_only and (not only_initials_available or initials_penalty):
        score = score * .9
        name_comparison_print("|- initials only penalty: ", score, initials_only, only_initials_available)
    else:
        name_comparison_print("|- no initials only penalty", initials_only, only_initials_available)

    name_comparison_print("||- final score:  ", score)

    return score
Ejemplo n.º 7
0
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Note that this function itself consumes the rules in flattened form: it
    iterates over `ruleset` and unpacks every item into the five values
    (field_tags, threshold, compare_mode, match_mode, result_mode), where
    field_tags is a comma-separated string of tags.

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
                      If the two records hold a different number of values the
                      validation exits immediately as a failure (unless the
                      rule is a 'joker', which is then just skipped).
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison (also used for any other value).

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    A rule is skipped entirely (and not counted as a comparison) when at
    least one of the two records has no non-empty value for its tags.

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the rules used when validating, as an iterable of
        (field_tags, threshold, compare_mode, match_mode, result_mode) items
    @type ruleset: iterable of 5-tuples

    @param verbose: be loud (debug output goes to stderr when > 8)
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done
        ('joker' rules are not counted as comparisons). 1.0 / 0.0 are returned
        directly on a decisive 'final' or 'joker' result. 0.0 is also returned
        when fewer than CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS rules matched
        or when no comparison was made at all.
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0
    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison"
                             " mode '%s' as '%s' result with threshold %0.2f\n"
                             % (field_tag_list, compare_mode, match_mode,
                                result_mode, threshold))

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is not None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match, dropping empty values
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if not original_record_values or not matched_record_values:
            # At least one of the records has no values for this rule:
            # nothing can be compared, so the rule is ignored.
            if verbose > 8:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        matches_needed = 0
        # How many field-value matches are needed for successful validation of this record
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values,
                                                   matched_record_values,
                                                   ignore_order)

        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons,
                                                               threshold,
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" %
                                 (str(original_record_values),
                                  str(matched_record_values),
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ('final', 'joker'):
                # Matching success. Return 1.0 indicating exact-match when final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")

    # Too few successful rules (or nothing compared at all) is a failure.
    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
        or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
Ejemplo n.º 8
0
 def _sort_nosymbols_case_insensitive_strip_accents(self, val):
     """
     Build a sort key: ASCII-translated, lower-cased, symbols removed.

     Only the fragments matched by _RE_NOSYMBOLS are kept and joined
     together. Falsy input yields the empty string.
     """
     if val:
         lowered = translate_to_ascii(val).pop().lower()
         kept_fragments = _RE_NOSYMBOLS.findall(lowered)
         return ''.join(kept_fragments)
     return ''
Ejemplo n.º 9
0
 def _sort_case_insensitive_strip_accents(self, val):
     """
     Build a sort key: ASCII-translated and lower-cased.

     Falsy input yields the empty string.
     """
     if val:
         ascii_values = translate_to_ascii(val)
         return ascii_values.pop().lower()
     return ''