def rabbit(bibrecs=None, check_invalid_papers=False, personids_to_update_extids=None, verbose=False): logger = Logger("Rabbit") if verbose: logger.verbose = True if not bibrecs: logger.log("Running on all records") else: logger.log("Running on %s " % (str(bibrecs))) populate_mnames_pids_cache() global M_NAME_PIDS_CACHE memoized_compare_names = memoized(comp_names) compare_names = lambda x, y: memoized_compare_names(*sorted((x, y))) def find_pids_by_matchable_name_with_cache(matchable_name): try: matched_pids = [M_NAME_PIDS_CACHE[matchable_name]] except KeyError: matched_pids = get_authors_by_name(matchable_name, use_matchable_name=True) if matched_pids: M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0] return matched_pids if USE_EXT_IDS: def get_matched_pids_by_external_ids(sig, rec, pids_having_rec): ''' This function returns all the matched pids after iterating through all available external IDs of the system. ''' for get_external_id_of_signature in external_id_getters: external_id = get_external_id_of_signature(sig + (rec, )) if external_id: matched_pids = list( get_author_by_external_id(external_id[0])) if matched_pids and int( matched_pids[0][0]) in pids_having_rec: matched_pids = list() return matched_pids threshold = 0.8 if not bibrecs or check_invalid_papers: all_bibrecs = get_all_valid_bibrecs() if not bibrecs: bibrecs = all_bibrecs if check_invalid_papers: filter_bibrecs_outside(all_bibrecs) updated_pids = set() deleted = frozenset(p[0] for p in get_deleted_papers()) bibrecs = list(bibrecs) for idx, rec in enumerate(bibrecs): logger.log("Considering %s" % str(rec)) if idx % 100 == 0: task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec)) if idx % 1000 == 0: destroy_partial_marc_caches() populate_partial_marc_caches(bibrecs[idx:idx + 1000]) logger.log( float(idx) / len(bibrecs), "%d/%d" % (idx, len(bibrecs))) if rec in deleted: remove_papers([rec]) continue author_refs = get_author_refs_of_paper(rec) coauthor_refs = get_coauthor_refs_of_paper(rec) markrefs = frozenset( chain(izip(cycle([100]), imap(itemgetter(0), author_refs)), izip(cycle([700]), imap(itemgetter(0), coauthor_refs)))) personid_rows = [ map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec) ] personidrefs_names = dict( ((row[1], row[2]), row[3]) for row in personid_rows) personidrefs = frozenset(personidrefs_names.keys()) new_signatures = list(markrefs - personidrefs) old_signatures = list(personidrefs - markrefs) new_signatures_names = dict( (new, get_name_by_bibref(new)) for new in new_signatures) # matrix |new_signatures| X |old_signatures| matrix = [[ compare_names(new_signatures_names[new], personidrefs_names[old]) for old in old_signatures ] for new in new_signatures] logger.log(" - Deleted signatures: %s" % str(old_signatures)) logger.log(" - Added signatures: %s" % str(new_signatures)) logger.log(" - Matrix: %s" % str(matrix)) #[new_signatures, old_signatures] best_match = [(new_signatures[new], old_signatures[old]) for new, old, score in maximized_mapping(matrix) if score > threshold] logger.log(" - Best match: %s " % str(best_match)) for new, old in best_match: logger.log(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new])) modify_signature(old, rec, new, new_signatures_names[new]) remove_signatures(tuple(list(old) + [rec]) for old in old_signatures) not_matched = frozenset(new_signatures) - frozenset( map(itemgetter(0), best_match)) remaining_personid_rows = ([ x for x in personid_rows if x[1:3] in old_signatures ]) pids_having_rec = set([int(row[0]) for row in remaining_personid_rows]) logger.log(" - Not matched: %s" % str(not_matched)) if not_matched: used_pids = set(r[0] for r in personid_rows) for sig in not_matched: name = new_signatures_names[sig] matchable_name = create_matchable_name(name) matched_pids = list() if USE_EXT_IDS: matched_pids = get_matched_pids_by_external_ids( sig, rec, pids_having_rec) if matched_pids: add_signature(list(sig) + [rec], name, matched_pids[0][0], m_name=matchable_name) M_NAME_PIDS_CACHE[matchable_name] = matched_pids[0][0] updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) continue matched_pids = find_pids_by_matchable_name_with_cache( matchable_name) if not matched_pids: for matching_function in M_NAME_FUNCTIONS[1:]: matchable_name = matching_function(name) matched_pids = find_pids_by_matchable_name_with_cache( matchable_name) if matched_pids: break matched_pids = [p for p in matched_pids if int(p) not in used_pids] best_matched_pid = None for matched_pid in matched_pids: # Because of the wrongly labeled data in the db, all # of the possible choices have to be checked. If one of the # coauthors, who had his signature already considered, claimed # in the past one of the signatures of currently considered # author, the algorithm will think that two signatures belong # to the same person, and, will create an unnecessary new # profile. if not int(matched_pid) in pids_having_rec: best_matched_pid = matched_pid break if not best_matched_pid: new_pid = new_person_from_signature( list(sig) + [rec], name, matchable_name) M_NAME_PIDS_CACHE[matchable_name] = new_pid used_pids.add(new_pid) updated_pids.add(new_pid) else: add_signature(list(sig) + [rec], name, best_matched_pid, m_name=matchable_name) M_NAME_PIDS_CACHE[matchable_name] = best_matched_pid used_pids.add(best_matched_pid) updated_pids.add(best_matched_pid) pids_having_rec.add(best_matched_pid) logger.log('Finished with %s' % str(rec)) logger.update_status_final() destroy_partial_marc_caches() if personids_to_update_extids: updated_pids |= set(personids_to_update_extids) if updated_pids: # an empty set will update all canonical_names update_canonical_names_of_authors(updated_pids) update_external_ids_of_authors( updated_pids, limit_to_claimed_papers=bconfig. LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS, force_cache_tables=True) destroy_partial_marc_caches() destroy_mnames_pids_cache() remove_empty_authors() task_update_progress("Done!")
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False): ''' @param bibrecs: an iterable full of bibrecs @type bibrecs: an iterable of ints @return: none ''' logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w') logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs))) def logwrite(msg, is_error): verb = 9 if is_error or verbose: verb = 1 write_message(msg, verbose=verb) if bconfig.RABBIT_USE_CACHED_PID: PID_NAMES_CACHE = get_name_to_authors_mapping() def find_pids_by_exact_names_cache(name): try: return zip(PID_NAMES_CACHE[name]) except KeyError: return [] def add_signature_using_names_cache(sig, name, pid): try: PID_NAMES_CACHE[name].add(pid) except KeyError: PID_NAMES_CACHE[name] = set([pid]) _add_signature(sig, name, pid) def new_person_from_signature_using_names_cache(sig, name): pid = get_free_author_id() add_signature_using_names_cache(sig, name, pid) return pid add_signature = add_signature_using_names_cache new_person_from_signature = new_person_from_signature_using_names_cache find_pids_by_exact_name = find_pids_by_exact_names_cache else: add_signature = _add_signature new_person_from_signature = _new_person_from_signature find_pids_by_exact_name = _find_pids_by_exact_name compare_names = cached_sym(lambda x: x)(comp_names) # fast assign threshold threshold = 0.80 if not bibrecs or check_invalid_papers: all_bibrecs = get_all_valid_papers() if not bibrecs: bibrecs = all_bibrecs if check_invalid_papers: filter_bibrecs_outside(all_bibrecs) if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD): populate_partial_marc_caches() SWAPPED_GET_GROUPED_RECORDS = True else: SWAPPED_GET_GROUPED_RECORDS = False updated_pids = set() deleted = frozenset(p[0] for p in get_deleted_papers()) for idx, rec in enumerate(bibrecs): logwrite("\nConsidering %s" % str(rec), False) if idx % 200 == 0: task_sleep_now_if_required(True) update_status( float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec)) task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec)) if rec in deleted: logwrite( " - Record was deleted, removing from pid and continuing with next record", True) remove_papers([rec]) continue markrefs = frozenset( chain( izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))), izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec))))) personid_rows = [ map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec) ] personidrefs_names = dict( ((row[1], row[2]), row[3]) for row in personid_rows) personidrefs = frozenset(personidrefs_names.keys()) new_signatures = list(markrefs - personidrefs) old_signatures = list(personidrefs - markrefs) new_signatures_names = dict( (new, create_normalized_name(split_name_parts(get_name_by_bibref(new)))) for new in new_signatures) # matrix |new_signatures| X |old_signatures| matrix = [[ compare_names(new_signatures_names[new], personidrefs_names[old]) for old in old_signatures ] for new in new_signatures] logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures)) logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures)) logwrite(" - Matrix: %s" % str(matrix), bool(matrix)) # [(new_signatures, old_signatures)] best_match = [(new_signatures[new], old_signatures[old]) for new, old, score in maximized_mapping(matrix) if score > threshold] logwrite(" - Best match: %s " % str(best_match), bool(best_match)) for new, old in best_match: logwrite( " - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True) modify_signature(old, rec, new, new_signatures_names[new]) remove_signatures(tuple(list(old) + [rec]) for old in old_signatures) not_matched = frozenset(new_signatures) - frozenset( map(itemgetter(0), best_match)) pids_having_rec = set( [int(row[0]) for row in get_signatures_of_paper(rec)]) logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched)) if not_matched: used_pids = set(r[0] for r in personid_rows) for sig in not_matched: name = new_signatures_names[sig] matched_pids = list() if USE_EXT_IDS: if USE_INSPIREID: inspire_id = get_inspire_id_of_signature(sig + (rec, )) if inspire_id: matched_pids = list( get_author_by_external_id(inspire_id[0])) if matched_pids and int( matched_pids[0][0]) in pids_having_rec: matched_pids = list() if matched_pids: add_signature(list(sig) + [rec], name, matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) continue matched_pids = find_pids_by_exact_name(name) matched_pids = [ p for p in matched_pids if int(p[0]) not in used_pids ] if not matched_pids or int(matched_pids[0][0]) in pids_having_rec: new_pid = new_person_from_signature(list(sig) + [rec], name) used_pids.add(new_pid) updated_pids.add(new_pid) else: add_signature(list(sig) + [rec], name, matched_pids[0][0]) used_pids.add(matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) logwrite('Finished with %s' % str(rec), False) update_status_final() if personids_to_update_extids: updated_pids |= personids_to_update_extids if updated_pids: # an empty set will update all canonical_names update_canonical_names_of_authors(updated_pids) update_external_ids_of_authors( updated_pids, limit_to_claimed_papers=bconfig. LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS) if SWAPPED_GET_GROUPED_RECORDS: destroy_partial_marc_caches() remove_empty_authors()
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False): ''' @param bibrecs: an iterable full of bibrecs @type bibrecs: an iterable of ints @return: none ''' logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w') logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs))) def logwrite(msg, is_error): verb = 9 if is_error or verbose: verb = 1 write_message(msg, verbose=verb) if bconfig.RABBIT_USE_CACHED_PID: PID_NAMES_CACHE = get_name_to_authors_mapping() def find_pids_by_exact_names_cache(name): try: return zip(PID_NAMES_CACHE[name]) except KeyError: return [] def add_signature_using_names_cache(sig, name, pid): try: PID_NAMES_CACHE[name].add(pid) except KeyError: PID_NAMES_CACHE[name] = set([pid]) _add_signature(sig, name, pid) def new_person_from_signature_using_names_cache(sig, name): pid = get_free_author_id() add_signature_using_names_cache(sig, name, pid) return pid add_signature = add_signature_using_names_cache new_person_from_signature = new_person_from_signature_using_names_cache find_pids_by_exact_name = find_pids_by_exact_names_cache else: add_signature = _add_signature new_person_from_signature = _new_person_from_signature find_pids_by_exact_name = _find_pids_by_exact_name compare_names = cached_sym(lambda x: x)(comp_names) # fast assign threshold threshold = 0.80 if not bibrecs or check_invalid_papers: all_bibrecs = get_all_valid_papers() if not bibrecs: bibrecs = all_bibrecs if check_invalid_papers: filter_bibrecs_outside(all_bibrecs) if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD): populate_partial_marc_caches() SWAPPED_GET_GROUPED_RECORDS = True else: SWAPPED_GET_GROUPED_RECORDS = False updated_pids = set() deleted = frozenset(p[0] for p in get_deleted_papers()) for idx, rec in enumerate(bibrecs): logwrite("\nConsidering %s" % str(rec), False) if idx%200 == 0: task_sleep_now_if_required(True) update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec)) task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec)) if rec in deleted: logwrite(" - Record was deleted, removing from pid and continuing with next record", True) remove_papers([rec]) continue markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))), izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec))))) personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec)] personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows) personidrefs = frozenset(personidrefs_names.keys()) new_signatures = list(markrefs - personidrefs) old_signatures = list(personidrefs - markrefs) new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new)))) for new in new_signatures) # matrix |new_signatures| X |old_signatures| matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old]) for old in old_signatures] for new in new_signatures] logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures)) logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures)) logwrite(" - Matrix: %s" % str(matrix), bool(matrix)) # [(new_signatures, old_signatures)] best_match = [(new_signatures[new], old_signatures[old]) for new, old, score in maximized_mapping(matrix) if score > threshold] logwrite(" - Best match: %s " % str(best_match), bool(best_match)) for new, old in best_match: logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True) modify_signature(old, rec, new, new_signatures_names[new]) remove_signatures(tuple(list(old) + [rec]) for old in old_signatures) not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match)) pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)]) logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched)) if not_matched: used_pids = set(r[0] for r in personid_rows) for sig in not_matched: name = new_signatures_names[sig] matched_pids = list() if USE_EXT_IDS: if USE_INSPIREID: inspire_id = get_inspire_id_of_signature(sig + (rec,)) if inspire_id: matched_pids = list(get_author_by_external_id(inspire_id[0])) if matched_pids and int(matched_pids[0][0]) in pids_having_rec: matched_pids = list() if matched_pids: add_signature(list(sig) + [rec], name, matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) continue matched_pids = find_pids_by_exact_name(name) matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids] if not matched_pids or int(matched_pids[0][0]) in pids_having_rec: new_pid = new_person_from_signature(list(sig) + [rec], name) used_pids.add(new_pid) updated_pids.add(new_pid) else: add_signature(list(sig) + [rec], name, matched_pids[0][0]) used_pids.add(matched_pids[0][0]) updated_pids.add(matched_pids[0][0]) pids_having_rec.add(matched_pids[0][0]) logwrite('Finished with %s' % str(rec), False) update_status_final() if personids_to_update_extids: updated_pids |= personids_to_update_extids if updated_pids: # an empty set will update all canonical_names update_canonical_names_of_authors(updated_pids) update_external_ids_of_authors(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS) if SWAPPED_GET_GROUPED_RECORDS: destroy_partial_marc_caches() remove_empty_authors()