def validate_matches(bibmatch_recid, record, server, result_recids,
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will try to
    find any search-result that "really" is a correct match, based on various
    methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    satisfying the count of field matching needed for exact matches and a
    similar list for fuzzy matches that have fewer fields matching than the
    threshold. Records that are not matching at all are simply left out of
    the lists.

    If result_recids is empty there is nothing to validate and two empty
    lists are returned without contacting the server. Two empty lists are
    also returned when the server hands back an empty result.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @raise BibMatchValidationError: when no validation rule-set could be
        generated for the record, or when the server returned MARCXML from
        which no record could be created.

    @return: tuple (exact matches, fuzzy matches) of lists of record IDs
    @rtype: tuple
    """
    if not result_recids:
        # Joining an empty list would yield an empty query string, which
        # would still be sent to the server. No candidates means no matches.
        return [], []

    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        raise BibMatchValidationError(
            "Bad configuration rule-set. "
            "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS"
            " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])

    search_params = dict(p=query, of="xm")
    if collections:
        search_params["c"] = collections
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)

    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        # Check if BibRecord generation was successful
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError(
                "Error retrieving MARCXML for possible matches from %s. Aborting."
                % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write(
                "\nError retrieving all MARCXML for possible matched records from %s.\n"
                % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of
    # matching record IDs. The index is 1-based and only used for output.
    for current_index, matched_record in enumerate(found_record_list, 1):
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" %
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" %
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset,
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" %
                                     (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" %
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))

    # Return lists of matching record IDs
    return matches_found, fuzzy_matches_found
def validate_matches(bibmatch_recid, record, server, result_recids,
                     collections="", verbose=0, ascii_mode=False):
    """
    Perform record validation on a set of matches. This function will try to
    find any search-result that "really" is a correct match, based on various
    methods defined in a given rule-set. See more about rule-sets in
    validate_match() function documentation.

    This function will return a tuple containing a list of all record IDs
    satisfying the count of field matching needed for exact matches and a
    similar list for fuzzy matches that have fewer fields matching than the
    threshold. Records that are not matching at all are simply left out of
    the lists.

    If result_recids is empty there is nothing to validate and two empty
    lists are returned without contacting the server. Two empty lists are
    also returned when the server hands back an empty result.

    @param bibmatch_recid: Current record number. Used for logging.
    @type bibmatch_recid: int

    @param record: bibrec structure of original record
    @type record: dict

    @param server: InvenioConnector object to matched record source repository
    @type server: InvenioConnector object

    @param result_recids: the list of record ids from search result.
    @type result_recids: list

    @param collections: list of collections to search, if specified
    @type collections: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @raise BibMatchValidationError: when no validation rule-set could be
        generated for the record, or when the server returned MARCXML from
        which no record could be created.

    @return: tuple (exact matches, fuzzy matches) of lists of record IDs
    @rtype: tuple
    """
    if not result_recids:
        # Joining an empty list would yield an empty query string, which
        # would still be sent to the server. No candidates means no matches.
        return [], []

    matches_found = []
    fuzzy_matches_found = []

    # Generate final rule-set by analyzing the record
    final_ruleset = get_validation_ruleset(record)
    if not final_ruleset:
        raise BibMatchValidationError(
            "Bad configuration rule-set. "
            "Please check that CFG_BIBMATCH_MATCH_VALIDATION_RULESETS"
            " is formed correctly.")

    if verbose > 8:
        sys.stderr.write("\nStart record validation:\n\nFinal validation ruleset used:\n")
        pp = pprint.PrettyPrinter(stream=sys.stderr, indent=2)
        pp.pprint(final_ruleset)
    CFG_BIBMATCH_LOGGER.info("Final validation ruleset used: %s" % (final_ruleset,))

    # Fetch all records in MARCXML and convert to BibRec
    found_record_list = []
    query = " OR ".join(["001:%d" % (recid,) for recid in result_recids])

    search_params = dict(p=query, of="xm")
    if collections:
        search_params["c"] = collections
    CFG_BIBMATCH_LOGGER.info("Fetching records to match: %s" % (str(search_params),))
    result_marcxml = server.search_with_retry(**search_params)

    # Check if record was found
    if result_marcxml:
        found_record_list = [r[0] for r in create_records(result_marcxml)]
        # Check if BibRecord generation was successful
        if not found_record_list:
            # Error fetching records. Unable to validate. Abort.
            raise BibMatchValidationError(
                "Error retrieving MARCXML for possible matches from %s. Aborting."
                % (server.server_url,))
        if len(found_record_list) < len(result_recids):
            # Error fetching all records. Will still continue.
            sys.stderr.write(
                "\nError retrieving all MARCXML for possible matched records from %s.\n"
                % (server.server_url,))

    # Validate records one-by-one, adding any matches to the list of
    # matching record IDs. The index is 1-based and only used for output.
    for current_index, matched_record in enumerate(found_record_list, 1):
        recid = record_get_field_values(matched_record, tag="001")[0]
        if verbose > 8:
            sys.stderr.write("\n Validating matched record #%d (%s):\n" %
                             (current_index, recid))
        CFG_BIBMATCH_LOGGER.info("Matching of record %d: Comparing to matched record %s" %
                                 (bibmatch_recid, recid))
        match_ratio = validate_match(record, matched_record, final_ruleset,
                                     verbose, ascii_mode)
        if match_ratio == 1.0:
            # All matches were a success, this is an exact match
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Exact match found -> %s" %
                                     (bibmatch_recid, recid))
            matches_found.append(recid)
        elif match_ratio >= CFG_BIBMATCH_FUZZY_MATCH_VALIDATION_LIMIT:
            # This means that some matches failed, but some succeeded as well. That's fuzzy...
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Fuzzy match found -> %s" %
                                     (bibmatch_recid, recid))
            fuzzy_matches_found.append(recid)
        else:
            CFG_BIBMATCH_LOGGER.info("Matching of record %d: Not a match" % (bibmatch_recid,))

    # Return lists of matching record IDs
    return matches_found, fuzzy_matches_found
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    Note that a 'strict' rule whose value counts differ between the two records
    makes the validation return 0.0 right away, unless the rule is a 'joker'.

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the rules used when validating, each one a tuple
        (tags, threshold, compare_mode, match_mode, result_mode) where tags
        is a comma-separated string of field tags
    @type ruleset: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done.
        1.0 as soon as a 'final' or 'joker' rule matches. 0.0 when fewer than
        CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS rules matched or when no
        comparison could be done.
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0

    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        if compare_mode == 'ignored':
            # Documented as a disabled rule: it must neither be compared nor
            # counted, otherwise it would skew the final ratio.
            continue

        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison"
                             " mode '%s' as '%s' result with threshold %0.2f\n"
                             % (field_tag_list, compare_mode, match_mode,
                                result_mode, threshold))
        current_matching_status = False

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is not None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match, dropping empty values
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if not original_record_values or not matched_record_values:
            # At least one of the records has no values for this rule, so
            # there is nothing to compare. The rule is ignored.
            if verbose > 8:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        # How many field-value matches are needed for successful validation
        # of this rule
        matches_needed = 0
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values,
                                                   matched_record_values,
                                                   ignore_order)
        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons,
                                                               threshold,
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" %
                                 (str(original_record_values),
                                  str(matched_record_values),
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 indicating exact-match when final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")

    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
            or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)
def validate_match(org_record, matched_record, ruleset, verbose=0, ascii_mode=False):
    """
    This function will try to match the original record with matched record.
    This comparison uses various methods defined in configuration and/or
    determined from the source record.

    These methods can be derived from each rule-set defined, which contains a
    mapping of a certain pattern to a list of rules defining the "match-strategy".

    For example:

    ('260__', [{ 'tags' : '260__c',
                 'threshold' : 0.8,
                 'compare_mode' : 'lazy',
                 'match_mode' : 'date',
                 'result_mode' : 'normal' }])

    Quick run-down of possible values:
      Compare mode:
        'strict'    : all (sub-)fields are compared, and all must match. Order is significant.
        'normal'    : all (sub-)fields are compared, and all must match. Order is ignored.
        'lazy'      : all (sub-)fields are compared with each other and at least one must match
        'ignored'   : the tag is ignored in the match. Used to disable previously defined rules.

      Match mode:
        'title'     : uses a method specialized for comparing titles, e.g. looking for subtitles
        'author'    : uses a special authorname comparison. Will take initials into account.
        'identifier': special matching for identifiers, stripping away punctuation
        'date'      : matches dates by extracting and comparing the year
        'normal'    : normal string comparison.

      Result mode:
        'normal'    : a failed match will cause the validation to continue on other rules (if any)
                      a successful match will cause the validation to continue on other rules (if any)
        'final'     : a failed match will cause the validation to immediately exit as a failure.
                      a successful match will cause validation to immediately exit as a success.
        'joker'     : a failed match will cause the validation to continue on other rules (if any).
                      a successful match will cause validation to immediately exit as a success.

    Fields are considered matching when all its subfields or values match. ALL matching strategy
    must return successfully for a match to be validated (except for 'joker' mode).

    Note that a 'strict' rule whose value counts differ between the two records
    makes the validation return 0.0 right away, unless the rule is a 'joker'.

    @param org_record: bibrec structure of original record
    @type org_record: dict

    @param matched_record: bibrec structure of matched record
    @type matched_record: dict

    @param ruleset: the rules used when validating, each one a tuple
        (tags, threshold, compare_mode, match_mode, result_mode) where tags
        is a comma-separated string of field tags
    @type ruleset: list

    @param verbose: be loud
    @type verbose: int

    @param ascii_mode: True to transform values to its ascii representation
    @type ascii_mode: bool

    @return: Number of matches succeeded divided by number of comparisons done.
        1.0 as soon as a 'final' or 'joker' rule matches. 0.0 when fewer than
        CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS rules matched or when no
        comparison could be done.
    @rtype: float
    """
    total_number_of_matches = 0
    total_number_of_comparisons = 0

    for field_tags, threshold, compare_mode, match_mode, result_mode in ruleset:
        if compare_mode == 'ignored':
            # Documented as a disabled rule: it must neither be compared nor
            # counted, otherwise it would skew the final ratio.
            continue

        field_tag_list = field_tags.split(',')
        if verbose > 8:
            sys.stderr.write("\nValidating tags: %s in parsing mode '%s' and comparison"
                             " mode '%s' as '%s' result with threshold %0.2f\n"
                             % (field_tag_list, compare_mode, match_mode,
                                result_mode, threshold))
        current_matching_status = False

        ## 1. COMPARE MODE
        # Fetch defined fields from both records
        original_record_values = []
        matched_record_values = []
        for field_tag in field_tag_list:
            tag_structure = validate_tag(field_tag)
            if tag_structure is not None:
                tag, ind1, ind2, code = tag_structure
                # Fetch all field instances to match, dropping empty values
                original_values = record_get_field_values(org_record, tag, ind1, ind2, code)
                original_record_values.extend([value for value in original_values if value])
                matched_values = record_get_field_values(matched_record, tag, ind1, ind2, code)
                matched_record_values.extend([value for value in matched_values if value])

        if not original_record_values or not matched_record_values:
            # At least one of the records has no values for this rule, so
            # there is nothing to compare. The rule is ignored.
            if verbose > 8:
                sys.stderr.write("\nBoth records do not have this field. Continue.\n")
            continue

        if result_mode != 'joker':
            # Since joker is a special beast (should have no impact on failure),
            # We first check if it is the current mode before incrementing number
            # of matching comparisons / attempts
            total_number_of_comparisons += 1

        if ascii_mode:
            original_record_values = translate_to_ascii(original_record_values)
            matched_record_values = translate_to_ascii(matched_record_values)

        ignore_order = True
        # How many field-value matches are needed for successful validation
        # of this rule
        matches_needed = 0
        if compare_mode == 'lazy':
            # 'lazy' : all fields are matched with each other, if any match = success
            matches_needed = 1
        elif compare_mode == 'normal':
            # 'normal' : all fields are compared, and all must match.
            # Order is ignored. The number of matches needed is equal
            # to the value count of original record
            matches_needed = len(original_record_values)
        elif compare_mode == 'strict':
            # 'strict' : all fields are compared, and all must match. Order matters.
            if len(original_record_values) != len(matched_record_values):
                # Not the same number of fields, not a valid match
                # Unless this is a joker, we return indicating failure
                if result_mode != 'joker':
                    return 0.0
                continue
            matches_needed = len(original_record_values)
            ignore_order = False
        if verbose > 8:
            sys.stderr.write("Total matches needed: %d -> " % (matches_needed,))

        ## 2. MATCH MODE
        if match_mode == 'title':
            # Special title mode
            comparison_function = compare_fieldvalues_title
        elif match_mode == 'author':
            # Special author mode
            comparison_function = compare_fieldvalues_authorname
        elif match_mode == 'identifier':
            # Special identifier mode
            comparison_function = compare_fieldvalues_identifier
        elif match_mode == 'date':
            # Special date mode
            comparison_function = compare_fieldvalues_date
        else:
            # Normal mode
            comparison_function = compare_fieldvalues_normal

        # Get list of comparisons to perform containing extracted values
        field_comparisons = get_paired_comparisons(original_record_values,
                                                   matched_record_values,
                                                   ignore_order)
        if verbose > 8:
            sys.stderr.write("Field comparison values:\n%s\n" % (field_comparisons,))

        # Run comparisons according to match_mode
        current_matching_status, matches = comparison_function(field_comparisons,
                                                               threshold,
                                                               matches_needed)
        CFG_BIBMATCH_LOGGER.info("-- Comparing fields %s with %s = %d matches of %d" %
                                 (str(original_record_values),
                                  str(matched_record_values),
                                  matches, matches_needed))

        ## 3. RESULT MODE
        if current_matching_status:
            if verbose > 8:
                sys.stderr.write("Fields matched successfully.\n")
            if result_mode in ['final', 'joker']:
                # Matching success. Return 1.0 indicating exact-match when final or joker.
                return 1.0
            total_number_of_matches += 1
        else:
            # Matching failed. Not a valid match
            if result_mode == 'final':
                # Final does not allow failure
                return 0.0
            elif result_mode == 'joker':
                if verbose > 8:
                    sys.stderr.write("Fields not matching. (Joker)\n")
            else:
                if verbose > 8:
                    sys.stderr.write("Fields not matching. \n")

    if total_number_of_matches < CFG_BIBMATCH_MIN_VALIDATION_COMPARISONS \
            or total_number_of_comparisons == 0:
        return 0.0
    return total_number_of_matches / float(total_number_of_comparisons)