def get_trusted_and_untrusted_fields(fields1, fields2, tag):
    """
    Selects the most trusted fields.

    NOTE(review): as written, this definition only resolves the origin of
    ``fields1`` and its importance for ``tag`` and then falls off the end
    (implicitly returning None). A longer definition with the same name
    appears later in this file; presumably this one is a truncated
    duplicate -- confirm and remove if so.

    Raises OriginValueNotFound (after logging it at critical level) when the
    origin or its importance cannot be determined.
    """
    try:
        origin1 = get_origin(fields1)
        origin_val1 = get_origin_importance(tag, origin1)
    except OriginValueNotFound, error:
        # Log the failure, then let the caller deal with the exception.
        logger.critical(error)
        raise
def merge_creation_modification_dates(merged_record):
    """Merge the creation/modification date fields of a merged record.

    Only the date fields whose origin is also used by at least one other
    field of the record are taken into account. Among those, the result
    keeps the oldest creation date, the newest modification date and the
    origin with the highest importance (as returned by
    ``get_origin_importance`` for the date tag).

    The input record is never modified: a deep copy is updated and returned.

    Returns the copy unchanged (after logging a warning) when the record has
    no creation/modification field, or when none of its date fields shares
    an origin with the other fields.

    Raises IndexError if a selected date field lacks the creation or the
    modification date subfield.
    """
    date_tag = FIELD_TO_MARC['creation and modification date']

    # Work on a local copy so the caller's record is left untouched.
    record = deepcopy(merged_record)

    try:
        creat_mod = record[date_tag]
    except KeyError:
        logger.warning(' No Creation-Modification field available!')
        return record

    # Collect the (non-empty) origins used by every field except the dates.
    origins = set()
    for field_code in record:
        if field_code == date_tag:
            continue
        for field in record[field_code]:
            try:
                origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
            except IndexError:
                # A field without origin is a problem, but it is not
                # managed here.
                continue
            if origin != '':
                origins.add(origin)

    # Fold every date field whose origin is actually used into one entry.
    merged = None
    for field in creat_mod:
        try:
            origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
        except IndexError:
            origin = ''
        if origin not in origins:
            continue

        creation = bibrecord.field_get_subfield_values(field, CREATION_DATE_SUBFIELD)[0]
        modification = bibrecord.field_get_subfield_values(field, MODIFICATION_DATE_SUBFIELD)[0]
        importance = get_origin_importance(date_tag, origin)

        if merged is None:
            # First usable field: take it as it is.
            merged = {
                CREATION_DATE_SUBFIELD: creation,
                MODIFICATION_DATE_SUBFIELD: modification,
                ORIGIN_SUBFIELD: origin,
                'origin_importance': importance,
            }
            continue

        # Oldest creation date and newest modification date win. The
        # modification date must be read from and written to its own
        # subfield, not the creation one.
        merged[CREATION_DATE_SUBFIELD] = min(merged[CREATION_DATE_SUBFIELD], creation)
        merged[MODIFICATION_DATE_SUBFIELD] = max(merged[MODIFICATION_DATE_SUBFIELD], modification)

        # The origin of the merged field is the most trusted one; on equal
        # importance the origin already stored is kept.
        if importance > merged['origin_importance']:
            merged[ORIGIN_SUBFIELD] = origin
            merged['origin_importance'] = importance

    if merged is None:
        # No date field shares an origin with the rest of the record, so
        # there is nothing to merge: leave the date fields as they are.
        logger.warning(' No Creation-Modification field with an origin used by the other fields!')
        return record

    # Replace all the date fields with the single merged one, keeping the
    # remaining elements (everything after the subfield list) of the first
    # original date field.
    merged_subfields = [
        (MODIFICATION_DATE_SUBFIELD, merged[MODIFICATION_DATE_SUBFIELD]),
        (CREATION_DATE_SUBFIELD, merged[CREATION_DATE_SUBFIELD]),
        (ORIGIN_SUBFIELD, merged[ORIGIN_SUBFIELD]),
    ]
    record[date_tag] = [(merged_subfields, ) + creat_mod[0][1:]]
    return record
def references_merger(fields1, fields2, tag):
    """Merging function for references.

    The references of both lists are split by origin: the ones whose origin
    is in ``REFERENCES_MERGING_TAKE_ALL_ORIGINS`` are merged with
    ``take_all``, the others with ``priority_based_merger``; the two results
    are then combined with ``take_all``.

    Resolved references (the ones carrying a ``REFERENCE_RESOLVED_KEY``
    subfield) are finally made unique per resolved bibcode. When the same
    bibcode appears more than once the subfields are merged: the reference
    string (and its extension) comes from the most trusted origin, or from
    the other one when the most trusted origin only has an empty or
    bibcode-only reference string.

    Parameters:
        fields1, fields2: lists of reference fields; each field is indexed
            as ``field[0]`` = list of ``(code, value)`` subfields.
        tag: field tag, used for logging and to look up origin importance.

    Returns a list: the unique resolved references followed by the
    unresolved ones. If one of the two input lists is empty the plain
    concatenation ``fields1 + fields2`` is returned without further work.

    Raises IndexError if an input field has no origin subfield.
    """
    # If one of the two lists is empty there is nothing to merge.
    if not fields1 or not fields2:
        logger.info(' Only one field for "%s".' % tag)
        return fields1 + fields2

    # Split each list into the references to merge as a whole and the ones
    # to select by priority (fields2 should in theory always be of a single
    # origin type).
    take_all_1, priority_1 = _split_references_by_origin(fields1)
    take_all_2, priority_2 = _split_references_by_origin(fields2)

    global_list = take_all(
        take_all(take_all_1, take_all_2, tag),
        priority_based_merger(priority_1, priority_2, tag),
        tag)

    # Unique the resolved references, keyed by resolved bibcode.
    unique_references_dict = {}
    unresolved_references = []
    for field in global_list:
        fieldcp = deepcopy(field)
        try:
            bibcode_res = bibrecord.field_get_subfield_values(fieldcp, REFERENCE_RESOLVED_KEY)[0]
        except IndexError:
            bibcode_res = None

        if not bibcode_res:
            unresolved_references.append(fieldcp)
        elif bibcode_res not in unique_references_dict:
            # First reference found for this bibcode.
            unique_references_dict[bibcode_res] = fieldcp
        else:
            known = unique_references_dict[bibcode_res]
            merged_subfields = _merge_reference_subfields(known[0], fieldcp[0], bibcode_res, tag)
            # Only the subfield list is replaced; the rest of the field is
            # the one of the first reference found.
            unique_references_dict[bibcode_res] = (merged_subfields, ) + known[1:]

    # list() keeps the concatenation valid where dict.values() is a view.
    return list(unique_references_dict.values()) + unresolved_references


def _split_references_by_origin(fields):
    """Return (take_all_fields, priority_fields) for a list of references."""
    take_all_fields = []
    priority_fields = []
    for field in fields:
        origin = bibrecord.field_get_subfield_values(field, ORIGIN_SUBFIELD)[0]
        if origin in REFERENCES_MERGING_TAKE_ALL_ORIGINS:
            take_all_fields.append(field)
        else:
            priority_fields.append(field)
    return take_all_fields, priority_fields


def _first_subfield_value(subfields, code):
    """Return the value of the first subfield with this code, else None."""
    for subfield in subfields:
        if subfield[0] == code:
            return subfield[1]
    return None


def _merge_reference_subfields(inlist, outlist, bibcode_res, tag):
    """Merge the subfields of two references resolved to the same bibcode.

    ``inlist`` holds the subfields already stored for ``bibcode_res`` and
    ``outlist`` the ones of the new reference. Returns the merged subfields
    as a list of ``(code, value)`` pairs (one value per code).
    """
    # Start from the subfields already stored.
    new_subfields = {}
    for subfield in inlist:
        new_subfields[subfield[0]] = subfield[1]
    origin_imp_inlist = get_origin_importance(tag, new_subfields[ORIGIN_SUBFIELD])

    # Origin (with its importance) and extension of the incoming reference.
    # They are computed for every merge, so a value from a previously
    # processed reference can never be reused by mistake.
    origin_outlist = _first_subfield_value(outlist, ORIGIN_SUBFIELD)
    if origin_outlist is not None:
        origin_imp_outlist = get_origin_importance(tag, origin_outlist)
    else:
        origin_imp_outlist = None
    extension_outlist = _first_subfield_value(outlist, REFERENCE_EXTENSION)

    # A reference without origin is never considered more trusted.
    out_more_trusted = (origin_imp_outlist is not None
                        and origin_imp_outlist > origin_imp_inlist)

    for subfield in outlist:
        code = subfield[0]
        value = subfield[1]

        # A subfield that is missing is simply added, unless it is an
        # extension: that one only travels with its reference string.
        if code not in new_subfields and code != REFERENCE_EXTENSION:
            logger.info(' Subfield "%s" added to reference "%s".' % (code, bibcode_res))
            new_subfields[code] = value
            continue

        if not (code in new_subfields and code == REFERENCE_STRING):
            continue

        refstring_in = new_subfields[REFERENCE_STRING]
        in_is_placeholder = refstring_in == bibcode_res or len(refstring_in) == 0
        out_is_placeholder = value == bibcode_res or len(value) == 0

        if in_is_placeholder and len(value) != 0:
            # The stored string is empty or only the bibcode: take the
            # other one whatever its origin is.
            take_string = True
            message = ' Reference string (bibcode only or empty) replaced by the one with origin "%s" for reference %s".'
        elif out_more_trusted and not out_is_placeholder:
            # Both have a real string: the most trusted origin wins. An
            # empty or bibcode-only string from the most trusted origin
            # must not replace a real one.
            take_string = True
            message = ' Reference string replaced by the one with origin "%s" for reference %s".'
        else:
            take_string = False

        if take_string:
            new_subfields[REFERENCE_STRING] = value
            logger.info(message % (origin_outlist, bibcode_res))
            # The extension belongs to the string, so it is copied too.
            if extension_outlist is not None:
                new_subfields[REFERENCE_EXTENSION] = extension_outlist
                logger.info(' Reference extension replaced by the one with value "%s" for reference %s".' % (extension_outlist, bibcode_res))

        # The merged reference always carries the most trusted origin.
        if out_more_trusted:
            # The message is printed first because it needs the old origin.
            logger.info(' Reference origin "%s" replaced by the more trusted "%s".' % (new_subfields[ORIGIN_SUBFIELD], origin_outlist))
            new_subfields[ORIGIN_SUBFIELD] = origin_outlist

    return list(new_subfields.items())
return unique_references_dict.values() + unresolved_references def get_trusted_and_untrusted_fields(fields1, fields2, tag): """ Selects the most trusted fields. """ try: origin1 = get_origin(fields1) origin_val1 = get_origin_importance(tag, origin1) except OriginValueNotFound, error: logger.critical(error) raise try: origin2 = get_origin(fields2) origin_val2 = get_origin_importance(tag, origin2) except OriginValueNotFound, error: logger.critical(error) raise if origin_val1 > origin_val2: logger.info(' Selected fields from record 1 (%s over %s).' % (origin1, origin2)) return fields1, fields2 elif origin_val1 < origin_val2: logger.info(' Selected fields from record 2 (%s over %s).' % (origin2, origin1)) return fields2, fields1 else: raise EqualOrigins(str(origin1) + ' - ' + str(origin2)) def _get_best_fields(fields1, fields2, tag):