Exemple #1
0
def validate_matches(bibmatch_recid, record, server, result_recids,
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will
    try to find any search-result that "really" is a correct match, based on
    various methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    The candidate records are fetched from the server as MARCXML in a single
    query, converted to BibRec structures and compared one-by-one with the
    original record using validate_match().

    This function returns a tuple of two lists: the record IDs whose match
    ratio is exactly 1.0 (exact matches) and the record IDs whose match ratio
    is below 1.0 but at least CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT
    (fuzzy matches). Records below that limit are simply left out of both
    lists.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: collections to search, if specified. Passed on
        unchanged as the 'c' search parameter when truthy.
    @type collections: list

    @param verbose: be loud (debug output is written to stderr when > 8)
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: tuple (exact_matches, fuzzy_matches) of lists of record IDs
    @rtype: tuple

    @raise BibMatchValidationError: if no validation rule-set could be
        generated for the record, or if the server returned data from which
        no record could be created.
    """
    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        raise BibMatchValidationError("Bad configuration rule-set. "
                                      "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS"
                                      " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    if not result_recids:
        # Nothing to validate. An empty id list would produce an empty query
        # string, which would no longer restrict the search to the candidates.
        return matches_found, fuzzy_matches_found

    # Fetch all candidate records in MARCXML with one query on field 001
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])
    search_params = dict(p=query, of="xm")
    if collections:
        search_params["c"] = collections
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)

    # Convert the MARCXML to BibRec structures; an empty response simply
    # leaves nothing to validate.
    found_record_list = []
    if result_marcxml:
        # Only the first element of each create_records() result is used
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError("Error retrieving MARCXML for possible matches from %s. Aborting."
                                          % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write("\nError retrieving all MARCXML for possible matched records from %s.\n"
                             % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of matching record IDs
    for current_index, matched_record in enumerate(found_record_list, 1):
        # NOTE(review): assumes every fetched record carries a 001 field;
        # a record without one raises IndexError here - confirm with server.
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" %
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" %
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset,
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" % (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" %
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))

    # Return lists of exact and fuzzy matching record IDs
    return matches_found, fuzzy_matches_found
Exemple #2
0
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
                      If the two records hold a different number of values the
                      validation exits immediately as a failure (unless the
                      rule is a 'joker', in which case the rule is skipped).
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison. Also used for any unrecognized mode.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    Rules for which either record has no (non-empty) values are skipped and
    do not count as a comparison.

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the rules used when validating, as an iterable of
        (tags, threshold, compare_mode, match_mode, result_mode) tuples where
        tags is a comma-separated string of field tags
    @type ruleset: list

    @param verbose: be loud (debug output is written to stderr when > 8)
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done.
        1.0 is returned as soon as a joker or final rule matches, 0.0 as soon
        as a final rule fails. 0.0 is also returned when fewer than
        CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS rules matched or no comparison
        was done at all.
    @rtype: float
    """
    debug = verbose > 8

    # Specialized comparison per match mode; anything else is compared normally
    comparison_functions = {
        'title': compare_fieldvalues_title,
        'author': compare_fieldvalues_authorname,
        'identifier': compare_fieldvalues_identifier,
        'date': compare_fieldvalues_date,
    }

    total_number_of_matches = 0
    total_number_of_comparisons = 0
    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        field_tag_list = field_tags.split(',')
        if debug:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison"
                             " mode '%s' as '%s' result with threshold %0.2f\n"
                             % (field_tag_list, compare_mode, match_mode,
                                result_mode, threshold))

        ## 1. COMPARE MODE
        # Fetch defined fields from both records, dropping empty values
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is None:
                # validate_tag() gave no structure for this tag: skip it
                continue
            tag, ind1, ind2, code = tag_structure
            # Fetch all field instances to match
            original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
            original_record_values.extend([value for value in original_values if value])
            matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
            matched_record_values.extend([value for value in matched_values if value])

        if not original_record_values or not matched_record_values:
            # At least one of the records has no values here: nothing to
            # compare, so the rule is skipped without counting as a comparison.
            if debug:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        # How many field-value matches are needed for successful validation of
        # this rule. Any compare mode not handled below keeps these defaults.
        # NOTE(review): presumably 'ignored' rules are filtered out before
        # they reach this function - confirm against the rule-set builder.
        ignore_order = True
        matches_needed = 0
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if debug:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        comparison_function = comparison_functions.get(match_mode,
                                                       compare_fieldvalues_normal)

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values,
                                                   matched_record_values,
                                                   ignore_order)

        if debug:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons,
                                                               threshold,
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" %
                                 (str(original_record_values),
                                  str(matched_record_values),
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if debug:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 indicating exact-match when final or joker.
                return 1.0
            total_number_of_matches += 1
        elif result_mode == 'final':
            # Matching failed and final does not allow failure
            return 0.0
        elif result_mode == 'joker':
            # A failed joker has no impact on the result
            if debug:
                sys.stderr.write("Fields not matching. (Joker)\n")
        else:
            if debug:
                sys.stderr.write("Fields not matching. \n")

    # Too few successful rules (or nothing comparable at all) is not a match
    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
        or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)