def soft_compare_names(origin_name, target_name):
    '''
    Soft comparison of names, to use in search engine and similar.
    Base results:
        If surname is equal in [0.6,1.0]
        If surname similar in [0.4,0.8]
        If surname differs in [0.0,0.4]
    all depending on average compatibility of names and initials.
    '''
    jaro_fctn = None

    try:
        from Levenshtein import jaro_winkler
        jaro_fctn = jaro_winkler
    except ImportError:
        jaro_fctn = jaro_winkler_str_similarity

    score = 0.0
    oname = deepcopy(origin_name)
    tname = deepcopy(target_name)
    orig_name = split_name_parts(oname.lower())
    targ_name = split_name_parts(tname.lower())
    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if orig_name[0] == targ_name[0]:
        score += 0.6
    else:
        if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95)
                or min(len(orig_name[0]), len(targ_name[0])) <= 4):
            score += 0.0
        else:
            score += 0.4

    if orig_name[1] and targ_name[1]:
        max_initials = max(len(orig_name[1]), len(targ_name[1]))
        matching_i = 0

        if len(orig_name[1]) >= 1 and len(targ_name[1]) >= 1:
            for i in orig_name[1]:
                if i in targ_name[1]:
                    matching_i += 1

        max_names = max(len(orig_name[2]), len(targ_name[2]))
        matching_n = 0

        if len(orig_name[2]) >= 1 and len(targ_name[2]) >= 1:
            cleaned_targ_name = [clean_name_string(i, replacement="",
                                                   keep_whitespace=False)
                                 for i in targ_name[2]]

            for i in orig_name[2]:
                if clean_name_string(i, replacement="",
                                     keep_whitespace=False) in cleaned_targ_name:
                    matching_n += 1

        name_score = ((matching_i + matching_n) * 0.4
                      / (max_names + max_initials))
        score += name_score

    return score

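# Illustrative usage (not part of the original module; exact values depend on
# the split_name_parts/clean_name_string helpers and on whether the optional
# python-Levenshtein package is installed):
#
#   soft_compare_names('Ellis, John R.', 'Ellis, John Richard')
#   # equal surnames land the score in the documented [0.6, 1.0] band; the
#   # remaining 0.4 is apportioned over matching initials and first names.
#
#   soft_compare_names('Ellis, J.', 'Smith, J.')
#   # dissimilar surnames contribute 0.0, so the score stays in [0.0, 0.4].
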
def names_are_substrings(name1, name2):
    '''
    Checks if two names are substrings of each other;
    e.g. "Christoph" vs. "Ch". Only checks the beginning of the names.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: are the names substrings of each other
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    onames = name1[2]
    tnames = name2[2]
#    oname = "".join(onames).lower()
#    tname = "".join(tnames).lower()
    oname = clean_name_string("".join(onames).lower(), "", False, True)
    tname = clean_name_string("".join(tnames).lower(), "", False, True)
    names_are_substrings_b = False

    if (oname.startswith(tname)
            or tname.startswith(oname)):
        names_are_substrings_b = True

    return names_are_substrings_b

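# Illustrative check (not from the original module): only the leading
# characters of the joined, cleaned first names are compared, so a prefix
# such as the docstring's "Christoph" vs. "Ch" matches.
#
#   names_are_substrings('Mayer, Christoph', 'Mayer, Ch')    # True
#   names_are_substrings('Mayer, Christoph', 'Mayer, Hans')  # False
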
def names_are_equal_composites(name1, name2):
    '''
    Checks if names are equal composites; e.g. "guangsheng" vs. "guang sheng"

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: Are the names equal composites?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    is_equal_composite = False
    oname_variations = create_name_tuples(name1[2])
    tname_variations = create_name_tuples(name2[2])

    for oname_variation in oname_variations:
        for tname_variation in tname_variations:
            oname = clean_name_string(oname_variation.lower(), "", False, True)
            tname = clean_name_string(tname_variation.lower(), "", False, True)

            if oname == tname:
                is_equal_composite = True
                break

    return is_equal_composite

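# Illustrative check (not from the original module): create_name_tuples
# produces the concatenation variants of the first names, so the docstring
# example holds:
#
#   names_are_equal_composites('Li, Guang Sheng', 'Li, Guangsheng')  # True
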
def search_matching_names(authorname_string, match_function=name_matching,
                          consider_surname_only=True):
    """
    Search for matching names given a matching function.

    @warning: searching for matching names with consider_surname_only=False
        will be painfully slow! You've been warned.
    @warning: for mental sanity purposes, surnames not ending with a comma
        are being ignored; if you're searching for a surname without a comma
        or first names, the comma is added automatically to the end of the
        string.

    @param authorname_string: The author name string
    @type authorname_string: string
    @param match_function: The function to use for the name matching
    @type match_function: function descriptor
    @param consider_surname_only: Decides if only names with the same surname
        shall be considered or _all_ other names.
    @type consider_surname_only: boolean

    @return: an array containing a tuple
    @rtype: list of tuples

    @note: example: search_matching_names('einstein, albert')
        Out[7]:
        [[(962L, 'Einstein, Albert'), ['Einstein', ['A'], ['Albert']]],
         [(1128L, 'Einstein, A.'), ['Einstein', ['A'], []]]]
    """
    possible_names = []
    names = []

    if authorname_string.count(',') == 0:
        authorname_string += ','

    authorname = bibauthorid_utils.split_name_parts(authorname_string)

    if consider_surname_only:
        names = [row for row in dat.AUTHOR_NAMES
                 if row['name'].startswith(authorname[0])]
    else:
        names = [row for row in dat.AUTHOR_NAMES]

    for name in names:
        if match_function(authorname_string, name['name']):
            possible_names.append([(name['id'], name['name']),
                                   bibauthorid_utils.split_name_parts(
                                       name['name'])])

    return possible_names

def names_are_equal_gender(name1, name2, gendernames):
    '''
    Checks the gender equality of two names based on a word list.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param gendernames: dictionary of male/female names
    @type gendernames: dict

    @return: Are names gender-equal?
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_equal_gender_b = True
    ogender = None
    tgender = None
    oname = name1[2][0].lower()
    tname = name2[2][0].lower()
    oname = clean_name_string(oname, "", False, True)
    tname = clean_name_string(tname, "", False, True)

    if oname in gendernames['boys']:
        ogender = 'Male'
    elif oname in gendernames['girls']:
        ogender = 'Female'

    if tname in gendernames['boys']:
        tgender = 'Male'
    elif tname in gendernames['girls']:
        tgender = 'Female'

    if print_debug:
        print '    Gender check: ', oname, ' is a ', ogender
        print '    Gender check: ', tname, ' is a ', tgender

    if ogender and tgender:
        if ogender != tgender:
            if print_debug:
                print '    Gender differs, force split!'

            names_are_equal_gender_b = False

    return names_are_equal_gender_b

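# Illustrative check (not from the original module; the gendernames dict below
# is a hypothetical stand-in for the word list loaded elsewhere):
#
#   gendernames = {'boys': ['robert'], 'girls': ['mary']}
#   names_are_equal_gender('Smith, Robert', 'Smith, Mary', gendernames)
#   # -> False: both first names are gender-resolved and the genders differ.
#   names_are_equal_gender('Smith, Robert', 'Smith, Kim', gendernames)
#   # -> True: 'kim' is in neither list, so no contradiction is detected.
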
def get_va_ids_by_recid_lname(bibrec, lastname):
    '''
    Finds all the virtual author ids that belong to a certain record and
    hold a certain last name.

    @param bibrec: bibrec id of a record
    @type bibrec: int
    @param lastname: The last name of a person
    @type lastname: string

    @return: list of virtual author ids
    @rtype: list of int
    '''
    va_ids = set()
    pot_va_ids = [row['virtualauthorid'] for row in dat.VIRTUALAUTHOR_DATA
                  if ((row['tag'] == 'bibrec_id')
                      and (row['value'] == str(bibrec)))]

    for va_id in [row['virtualauthorid'] for row in dat.VIRTUALAUTHOR_DATA
                  if ((row['virtualauthorid'] in pot_va_ids)
                      and (row['tag'] == 'orig_name_string')
                      and (split_name_parts(row['value'])[0] == lastname))]:
        va_ids.add(va_id)

    return list(va_ids)

def find_and_process_updates(process_initials):
    '''
    Finds and processes not-yet-updated virtual authors (which are identified
    by the 'updated' tag) and delivers the ID of each such virtual author to
    the function responsible for assigning the virtual author to a real
    author.

    @param process_initials: If names with initials only shall be processed
        or not
    @type process_initials: boolean
    '''
    if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
        init_va_process_queue()

    while True:
        va_id = -1

        if dat.VIRTUALAUTHOR_PROCESS_QUEUE.empty():
            bconfig.LOGGER.debug("Empty Queue. Job finished. Nothing to do.")
            break
        else:
            va_id = dat.VIRTUALAUTHOR_PROCESS_QUEUE.get()

        va_name = (bibauthorid_virtualauthor_utils.
                   get_virtualauthor_records(va_id,
                                             tag='orig_name_string')
                   [0]['value'])

        if not process_initials:
            if bibauthorid_utils.split_name_parts(va_name)[2]:
                (bibauthorid_virtualauthor_utils.
                 delete_virtualauthor_record(va_id, 'updated'))
                bconfig.LOGGER.log(25, "|> Inserting VA:"
                                   " %s Orig. name: %s" % (va_id, va_name))
                add_virtualauthor(va_id)
        else:
            (bibauthorid_virtualauthor_utils.
             delete_virtualauthor_record(va_id, 'updated'))
            bconfig.LOGGER.log(25, "|> Inserting VA: %s Orig. name: %s"
                               % (va_id, va_name))
            add_virtualauthor(va_id)

def test_split_name_parts(self):
    """bibauthorid - test split name parts"""
    self.assertEqual(['This', ['I', 'F'], ['Isacorrect', 'Fullname'], [0, 1]],
                     baidu.split_name_parts('This, Isacorrect Fullname'))
    self.assertEqual(['', [], []],
                     baidu.split_name_parts(''))
    self.assertEqual(['name', ['F', 'I'], ['Full', 'Inverted'], [0, 1]],
                     baidu.split_name_parts('full inverted name'))
    self.assertEqual(['Two Words', ['S', 'N'], ['Surname', 'Name'], [0, 1]],
                     baidu.split_name_parts('Two Words, Surname Name'))
    self.assertEqual(['Strange+)*{ (=]&-$Char', ['N'], ['Name'], [0]],
                     baidu.split_name_parts('Strange+)*{ (=]&-$Char, Name'))

def names_are_synonymous(name1, name2, name_variations):
    '''
    Checks if two names are synonymous; e.g. "Robert" vs. "Bob"

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string
    @param name_variations: name variations list
    @type name_variations: list of lists

    @return: are names synonymous
    @rtype: boolean
    '''
    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    print_debug = False
    names_are_synonymous_b = False
    max_matches = min(len(name1[2]), len(name2[2]))
    matches = []

    for i in xrange(max_matches):
        matches.append(False)

    for nvar in name_variations:
        for i in xrange(max_matches):
            oname = name1[2][i].lower()
            tname = name2[2][i].lower()
            oname = clean_name_string(oname, "", False, True)
            tname = clean_name_string(tname, "", False, True)

            if oname in nvar and tname in nvar:
                if print_debug:
                    print '    ', oname, ' and ', tname, \
                          ' are synonyms! Not splitting!'

                matches[i] = True

        if sum(matches) == max_matches:
            names_are_synonymous_b = True
            break

    return names_are_synonymous_b

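# Illustrative check (not from the original module; name_variations is a
# hypothetical stand-in for the synonym list loaded elsewhere): every compared
# first-name position must fall into a common variation group.
#
#   name_variations = [['bob', 'robert'], ['bill', 'william']]
#   names_are_synonymous('Smith, Robert', 'Smith, Bob', name_variations)
#   # -> True, since 'robert' and 'bob' share a variation group.
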
def names_minimum_levenshtein_distance(name1, name2):
    '''
    Determines the minimum Levenshtein distance D between two names.
    Comparison is based on the minimum number of first names.
    Examples:
        D("guang", "guang sheng") = 0
        D("guang", "guangsheng") = 5
        D("guang sheng", "guangsheng") = 5
        D("guang sheng", "guang shing") = 1
        D("guang ming", "guang fin") = 2

    @precondition: Names have been checked for composition equality.

    @param name1: Name string of the first name (w/ last name)
    @type name1: string
    @param name2: Name string of the second name (w/ last name)
    @type name2: string

    @return: the minimum Levenshtein distance between two names
    @rtype: int
    '''
    try:
        from Levenshtein import distance
    except ImportError:
        bconfig.LOGGER.exception("Levenshtein Module not available!")
        return -1

    if not isinstance(name1, list):
        name1 = split_name_parts(name1)

    if not isinstance(name2, list):
        name2 = split_name_parts(name2)

    onames = name1[2]
    tnames = name2[2]

#    min_names_count = min(len(onames), len(tnames))
#
#    if min_names_count <= 0:
#        return -1
#
#    oname = "".join(onames[:min_names_count]).lower()
#    tname = "".join(tnames[:min_names_count]).lower()

    oname = clean_name_string("".join(onames).lower(), "", False, True)
    tname = clean_name_string("".join(tnames).lower(), "", False, True)

    return distance(oname, tname)

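# Illustrative call (not from the original module; requires the optional
# python-Levenshtein package, otherwise -1 is returned): the joined, cleaned
# first names are compared character-wise, e.g.
#
#   names_minimum_levenshtein_distance('Li, Guang Sheng', 'Li, Guang Shing')
#   # -> 1, matching the docstring example D("guang sheng", "guang shing") = 1
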
def test_create_normalized_name(self):
    """bibauthorid - test creation of normalized name strings"""
    self.assertEqual('this, Isa Fullname', baidu.create_normalized_name(
        baidu.split_name_parts('this, isa fullname')))
    self.assertEqual('fullname, This Isa', baidu.create_normalized_name(
        baidu.split_name_parts('this isa fullname')))
    self.assertEqual('Strange&][{}) ==}{$*]!, Name',
                     baidu.create_normalized_name(baidu.split_name_parts(
                         'Strange&][{}) ==}{$*]!, Name')))
    self.assertEqual(',', baidu.create_normalized_name(
        baidu.split_name_parts('')))

def name_matching(orig_name, target_name):
    """
    Checks the compatibility of the given names.

    @param orig_name: The original name string
    @type orig_name: string
    @param target_name: The target name string
    @type target_name: string

    @return: true or false in respect to the compatibility of the given names
    @rtype: boolean
    """
    orig = bibauthorid_utils.split_name_parts(orig_name)
    targ = bibauthorid_utils.split_name_parts(target_name)

    if (len(orig[1]) == 0) or (len(targ[1]) == 0):
        return True
    else:
        initials_set = set(orig[1])
        names_set = set(orig[2])
        comp_initials_set = set(targ[1])
        comp_names_set = set(targ[2])

        names_intersection = names_set.intersection(comp_names_set)
        initials_intersection = initials_set.intersection(comp_initials_set)

        if len(initials_intersection) == 0:
            if len(names_intersection) != 0:
                bconfig.LOGGER.error("length of names intersection != 0..."
                                     "This should never happen!")

        if ((len(names_intersection) == 0)
                and (len(comp_names_set) > 0)
                and (len(names_set) > 0)):
            return False

        if orig[1][0] == targ[1][0]:
            return True

    return False

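# Illustrative behaviour (not from the original module; assumes
# split_name_parts yields no initials for a bare surname): names without
# initials are trivially compatible; otherwise the first initials decide,
# provided the full first names do not contradict each other.
#
#   name_matching('Einstein,', 'Einstein, Albert')     # True (no initials)
#   name_matching('Einstein, A.', 'Einstein, Albert')  # True (initials match)
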
def _update_authorid_universe():
    '''
    Updates all data related to the authorid algorithm.
    Sequence of operations:
        - Get all recently updated papers and remember time in the log
        - Get all authors on all papers
        - Extract collection of last names
        - For each last name:
            - Populate mem cache with cluster data
            - Delete updated records and their virtual authors from mem cache
            - Create virtual authors for new and updated records
            - Start matching algorithm
        - Update tables with results of the computation
        - Start personid update procedure
    '''
    def create_vas_from_specific_doclist(bibrec_ids):
        '''
        Processes the document list and creates a new minimal virtual author
        for each author in each record specified in the given list.

        @param bibrec_ids: Record IDs to concern in this update
        @type bibrec_ids: list of int
        '''
        num_docs = len([row for row in dat.DOC_LIST
                        if row['bibrecid'] in bibrec_ids])
        bconfig.LOGGER.log(25, "Creating minimal virtual authors for "
                               "all loaded docs (%s)" % (num_docs))

        for docs in [row for row in dat.DOC_LIST
                     if row['bibrecid'] in bibrec_ids]:
            for author_id in docs['authornameids']:
                author_name = [an['name'] for an in dat.AUTHOR_NAMES
                               if an['id'] == author_id]
                refrecs = [ref[1] for ref in docs['authornameid_bibrefrec']
                           if ref[0] == author_id]
                refrec = -1

                if len(refrecs) > 1:
                    print "SCREEEEEEWWWWWWED!!! Several bibrefs on one paper?!"
                    refrec = refrecs[0]
                elif refrecs:
                    refrec = refrecs[0]

                if refrec and author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [], refrec)
                elif author_name:
                    add_minimum_virtualauthor(author_id, author_name[0],
                                              docs['bibrecid'], 0, [])

    dat.reset_mem_cache(True)
    last_log = get_user_log(userinfo='daemon',
                            action='update_aid',
                            only_most_recent=True)
    updated_records = []

    if last_log:
        # select only the most recent papers
        recently_modified, last_update_time = get_papers_recently_modified(
            date=last_log[0][2])
        insert_user_log('daemon', '-1', 'update_aid', 'bibsched', 'status',
                        comment='bibauthorid_daemon, update_authorid_universe',
                        timestamp=last_update_time[0][0])
        bibtask.write_message("Update authorid will operate on %s records."
                              % (len(recently_modified)),
                              stream=sys.stdout, verbose=0)

        if not recently_modified:
            bibtask.write_message("Update authorid: Nothing to do",
                                  stream=sys.stdout, verbose=0)
            return

        for rec in recently_modified:
            updated_records.append(rec[0])
            dat.update_log("rec_updates", rec[0])
    else:
        bibtask.write_message("Update authorid: Nothing to do",
                              stream=sys.stdout, verbose=0)
        return

    authors = []
    author_last_names = set()

    bibtask.task_update_progress('Reading authors from updated records')
    bibtask.write_message("Reading authors from updated records",
                          stream=sys.stdout, verbose=0)
    updated_ras = set()

    # get all authors from all updated records
    for rec in updated_records:
        rec_authors = get_field_values_on_condition(rec, ['100', '700'], "a",
                                                    source="API")

        for rec_author in rec_authors:
            if not rec_author:
                bconfig.LOGGER.error("Invalid empty author string, which "
                                     "will be skipped on record %s" % (rec))
                continue

            author_in_list = [row for row in authors
                              if row['db_name'] == rec_author]

            if author_in_list:
                for upd in [row for row in authors
                            if row['db_name'] == rec_author]:
                    upd['records'].append(rec)
            else:
                last_name = split_name_parts(rec_author)[0]
                author_last_names.add(last_name)
                authors.append({'db_name': rec_author,
                                'records': [rec],
                                'last_name': last_name})

    for status, author_last_name in enumerate(author_last_names):
        current_authors = [row for row in authors
                           if row['last_name'] == author_last_name]
        total_lnames = len(author_last_names)
        total_authors = len(current_authors)
        bibtask.task_update_progress('Processing %s of %s cluster: "%s" '
                                     '(%s authors)'
                                     % (status + 1, total_lnames,
                                        author_last_name, total_authors))
        bibtask.write_message('Processing %s of %s cluster: "%s" '
                              '(%s authors)'
                              % (status + 1, total_lnames,
                                 author_last_name, total_authors),
                              stream=sys.stdout, verbose=0)
        dat.reset_mem_cache(True)
        init_authornames(author_last_name)
        load_mem_cache_from_tables()
        bconfig.LOGGER.log(25, "-- Relevant data successfully read into "
                               "memory to start processing")

        for current_author in current_authors:
            load_records_to_mem_cache(current_author['records'])
            authornamesid = [row['id'] for row in dat.AUTHOR_NAMES
                             if row['db_name'] == current_author['db_name']]

            if not authornamesid:
                bconfig.LOGGER.error("The author '%s' rec '%s' is not in "
                                     "authornames and will be skipped. "
                                     "You might want to run authornames "
                                     "update before?"
                                     % (current_author['db_name'], rec))
                continue
            else:
                try:
                    authornamesid = int(authornamesid[0])
                except (IndexError, TypeError, ValueError):
                    bconfig.LOGGER.error("Invalid authornames ID!")
                    continue

            if not current_author['records']:
                bconfig.LOGGER.error("The author '%s' is not associated to "
                                     "any document and will be skipped."
                                     % (current_author['db_name']))
                continue

            for rec in current_author['records']:
                # remove VAs already existing for the record
                va_ids = get_va_ids_by_recid_lname(
                    rec, current_author["last_name"])

                if va_ids:
                    for va_id in va_ids:
                        ra_list = get_realauthors_by_virtuala_id(va_id)

                        for ra_id in ra_list:
                            remove_va_from_ra(ra_id, va_id)
                            del_ra_data_by_vaid(ra_id, va_id)

                        va_anames_id = get_virtualauthor_records(
                            va_id, "orig_authorname_id")

                        for an_list in [row['authornameids'] for row in
                                        dat.DOC_LIST
                                        if row['bibrecid'] == rec]:
                            try:
                                an_list.remove(va_anames_id)
                            except (ValueError):
                                # This names id is not in the list...don't care
                                pass

                        delete_virtual_author(va_id)

                # create new VAs for the record.
                update_doclist(rec, authornamesid)
                dat.update_log("rec_updates", rec)

            create_vas_from_specific_doclist(current_author['records'])

        bconfig.LOGGER.log(25, "-- Relevant data pre-processed successfully.")
        start_computation(process_doclist=False,
                          process_orphans=True,
                          print_stats=True)
        bconfig.LOGGER.log(25, "-- Computation finished. Will write back to "
                               "the database now.")
        update_db_result = update_tables_from_mem_cache(return_ra_updates=True)

        if not update_db_result[0]:
            bconfig.LOGGER.log(25, "Writing to persistence layer failed.")
        else:
            if update_db_result[1]:
                for updated_ra in update_db_result[1]:
                    if updated_ra:
                        updated_ras.add(updated_ra[0])

            bconfig.LOGGER.log(25, "Done updating authorid universe.")

    personid_ra_format = []

    for ra_id in updated_ras:
        personid_ra_format.append((ra_id,))

    bconfig.LOGGER.log(25, "Will now run personid update to make the "
                           "changes visible also on the front end and to "
                           "create person IDs for %s newly created and "
                           "changed authors." % len(updated_ras))
    bibtask.task_update_progress('Updating persistent Person IDs')
    update_personID_from_algorithm(personid_ra_format)
    bconfig.LOGGER.log(25, "Done updating everything. Thanks for flying "
                           "with bibauthorid!")

def compare_names(origin_name, target_name):
    """
    Compute an index of confidence that indicates whether two names might
    represent the same person. The computation is based on similarities of
    name structure, in particular:
        Initials:
            We assign a high score if all the initials match in the right
            order, a much lower one if they are in the wrong order.
        Names:
            We assign a lower score for mismatching names and a higher
            score for fully matching names.
    If there is nothing to compare we are forced to assume a high score.

    Example for splitting names:
        In : bibauthorid.split_name_parts("Ellis, John R")
        Out: ['Ellis', ['J', 'R'], ['John']]

        Ellis, R. Keith      => [ [Ellis], [R, K], [Keith] ]
        Ellis, Richard Keith => [ [Ellis], [R, K], [Richard, Keith] ]

    Since the initials are computed both from the real initials present in
    the name string and from the full name, if there is no initials match
    we are 100% confident that:
        1. we have no names/initials at all, or
        2. we have completely different names;
    hence if there is no initial match we skip this step.

    @param origin_name: The first author's last name, first name(s) and
        initial
    @type origin_name: string
    @param target_name: The second author's last name, first name(s) and
        initial
    @type target_name: string

    @return: a value that describes the likelihood of the names being the same
    @rtype: float
    """
    jaro_fctn = None

    try:
        from Levenshtein import jaro_winkler
        jaro_fctn = jaro_winkler
    except ImportError:
        jaro_fctn = jaro_winkler_str_similarity

    oname = deepcopy(origin_name)
    tname = deepcopy(target_name)
    orig_name = split_name_parts(oname.lower())
    targ_name = split_name_parts(tname.lower())

    bconfig.LOGGER.info("|--> Comparing Names: \"%s\" and \"%s\""
                        % (origin_name, target_name))

    lastname_modifier = 0.0

    if not (orig_name[0] == targ_name[0]):
        # last names are not equal before cleaning them. Assign entry penalty.
        lastname_modifier = 0.15

    orig_name[0] = clean_name_string(orig_name[0],
                                     replacement="",
                                     keep_whitespace=False)
    targ_name[0] = clean_name_string(targ_name[0],
                                     replacement="",
                                     keep_whitespace=False)

    if not (orig_name[0] == targ_name[0]):
        if ((jaro_fctn(orig_name[0].lower(), targ_name[0].lower()) < .95)
                or min(len(orig_name[0]), len(targ_name[0])) <= 4):
            bconfig.LOGGER.warn("Unequal lastnames (%s vs. %s). "
                                "Skipping comparison."
                                % (orig_name[0], targ_name[0]))
            return 0.0
        else:
            bconfig.LOGGER.log(25, "Last names are not equal, but similar "
                                   "enough to continue the comparison.")
            # Let it go through...however, reduce the final result a little.
            lastname_modifier = 0.24
    else:
        # last names are equal after cleaning them. Reduce penalty.
        if lastname_modifier == 0.15:
            lastname_modifier = 0.02

    if orig_name[2] and targ_name[2]:
        if len(orig_name[2]) > 1 or len(targ_name[2]) > 1:
            variation_ps = []
            oname_variations = create_name_tuples(orig_name[2])
            tname_variations = create_name_tuples(targ_name[2])

            for oname_variation in oname_variations:
                for tname_variation in tname_variations:
                    oname_var = split_name_parts(
                        "%s, %s" % (orig_name[0], oname_variation))
                    tname_var = split_name_parts(
                        "%s, %s" % (targ_name[0], tname_variation))
                    variation_ps.append(_perform_matching(oname_var,
                                                          tname_var))

            return max(variation_ps) - lastname_modifier

    return _perform_matching(orig_name, targ_name) - lastname_modifier

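# Illustrative usage (not from the original module; the exact float returned
# depends on _perform_matching and the helpers above): composite first names
# are expanded via create_name_tuples and the best-scoring variant pair wins,
# minus the surname penalty accumulated above.
#
#   compare_names('Ellis, John Richard', 'Ellis, J. R.')
#   # equal surnames, compatible initials: scored by _perform_matching with
#   # no lastname penalty
#   compare_names('Ellis, John', 'Smith, John')
#   # -> 0.0: surnames too dissimilar, comparison skipped entirely
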