def convert_personid():
    '''
    Copy the contents of the `aidPERSONID` table into the
    `aidPERSONIDPAPERS` and `aidPERSONIDDATA` tables.

    Every row read from `aidPERSONID` is handled according to its tag:
      - 'paper': the data column is split as "<table>:<ref>,<rec>", the
        author name is looked up for that reference, normalized, and a
        row is queued for `aidPERSONIDPAPERS`. Rows whose name lookup
        fails are skipped.
      - 'gathered_name': the row is dropped.
      - anything else: the row is queued unchanged for `aidPERSONIDDATA`
        (the `flag` and `lcul` values land in `opt1` and `opt2`).

    Queued values are written with multi-row INSERT statements.

    @return: none
    '''
    from dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat

    # A batch is flushed once more than this many *values* (not rows)
    # have been queued.
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        # args is a flat list holding 7 values per row. Floor division
        # keeps the row count an integer, which repeat() requires.
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7))
                , tuple(args))

    def flush_data(args):
        # args is a flat list holding 5 values per row.
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5))
                , tuple(args))

    paper_args = []
    data_args = []

    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')

            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # Best effort: a signature whose name cannot be resolved
                # is left out of the new table. Interpreter-exit requests
                # (KeyboardInterrupt, SystemExit) are no longer swallowed.
                continue

            name = split_name_parts(name)
            name = create_normalized_name(name)

            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]

            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)

            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # Write whatever is left over from the last incomplete batch.
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    Bring the stored signatures of the given records in line with the
    author references currently read from those records.

    For every record the (field, reference) pairs read from the record
    (100 for authors, 700 for coauthors) are compared with the signatures
    stored for it. Stored signatures missing from the record are paired
    with new ones by name similarity and rewritten in place when the
    score exceeds the threshold; all stored signatures missing from the
    record are then passed to remove_sigs(). New signatures that found no
    partner are attached to a person found through an external id or an
    exact name match, or to a newly created person. Records listed by
    get_deleted_papers() are removed instead.

    NOTE(review): this file contains a second top-level definition of
    rabbit further down; the later definition is the one bound at import
    time. Confirm which one is meant to stay.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        bibrecs are processed. len() is called on it, so it has to be a
        sized container.
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside() is
        called with all valid bibrecs before the records are processed
    @type check_invalid_papers: bool
    @param personids_to_update_extids: additional person ids whose
        canonical names and external ids are refreshed at the end
    @type personids_to_update_extids: an iterable of person ids, or None
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        # name -> set of pids; kept current by the wrappers below.
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                # zip() wraps each pid in a 1-tuple; the code below reads
                # a pid as p[0].
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    # Everything after the caches were populated runs under try/finally
    # so that they are destroyed even when processing a record fails.
    try:
        updated_pids = set()
        deleted = frozenset(p[0] for p in get_deleted_papers())

        for idx, rec in enumerate(bibrecs):
            task_sleep_now_if_required(True)
            total = len(bibrecs)
            update_status(float(idx) / total, "%d/%d current: %d" % (idx, total, rec))

            if rec in deleted:
                delete_paper_from_personid(rec)
                continue

            # (field, reference) pairs as currently read from the record.
            markrefs = frozenset(chain(
                izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

            # Stored rows, reduced to [row[0], row[1], row[2], row[4]]
            # with the first three cast to int. Below, element 0 is used
            # as the pid, elements 1-2 as the reference pair and element
            # 3 as the name.
            personid_rows = [map(int, row[:3]) + [row[4]]
                             for row in get_signatures_from_rec(rec)]
            personidrefs_names = dict(((row[1], row[2]), row[3])
                                      for row in personid_rows)

            personidrefs = frozenset(personidrefs_names.keys())
            new_signatures = list(markrefs - personidrefs)
            old_signatures = list(personidrefs - markrefs)

            new_signatures_names = dict(
                (new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                for new in new_signatures)

            # matrix |new_signatures| X |old_signatures|
            matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                       for old in old_signatures]
                      for new in new_signatures]

            # [(new_signatures, old_signatures)]
            # maximized_mapping yields (row index, column index, score)
            # triples; only pairs scoring above the threshold are kept.
            best_match = [(new_signatures[new], old_signatures[old])
                          for new, old, score in maximized_mapping(matrix)
                          if score > threshold]
            for new, old in best_match:
                modify_signature(old, rec, new, new_signatures_names[new])

            remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

            not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

            if not_matched:
                # Pids already present on this record; exact-name
                # candidates among them are not reused for it.
                used_pids = set(r[0] for r in personid_rows)

                for sig in not_matched:
                    name = new_signatures_names[sig]
                    matched_pids = []
                    if USE_EXT_IDS:
                        if USE_INSPIREID:
                            inspire_id = get_inspire_id(sig + (rec,))
                            if inspire_id:
                                matched_pids = list(get_person_with_extid(inspire_id[0]))
                        if matched_pids:
                            # An external-id match is used directly and
                            # is not checked against used_pids.
                            add_signature(list(sig) + [rec], name, matched_pids[0][0])
                            updated_pids.add(matched_pids[0][0])
                            continue

                    matched_pids = find_pids_by_exact_name(name)
                    matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                    if not matched_pids:
                        new_pid = new_person_from_signature(list(sig) + [rec], name)
                        used_pids.add(new_pid)
                        updated_pids.add(new_pid)
                    else:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        used_pids.add(matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])

        update_status_final()

        if personids_to_update_extids:
            # update() accepts any iterable, not only sets.
            updated_pids.update(personids_to_update_extids)
        if updated_pids: # an empty set will update all canonical_names
            update_personID_canonical_names(updated_pids)
            update_personID_external_ids(
                updated_pids,
                limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)
    finally:
        if SWAPPED_GET_GROUPED_RECORDS:
            destroy_partial_marc_caches()
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    Bring the stored signatures of the given records in line with the
    author references currently read from those records.

    For every record the (field, reference) pairs read from the record
    (100 for authors, 700 for coauthors) are compared with the signatures
    stored for it. Stored signatures missing from the record are paired
    with new ones by name similarity and rewritten in place when the
    score exceeds the threshold; all stored signatures missing from the
    record are then passed to remove_sigs(). New signatures that found no
    partner are attached to a person found through an external id or an
    exact name match, or to a newly created person. Records listed by
    get_deleted_papers() are removed instead.

    NOTE(review): an earlier top-level definition of rabbit precedes this
    one in the file; being the later definition, this is the one bound at
    import time. Confirm that the duplicate is intended.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        bibrecs are processed. len() is called on it, so it has to be a
        sized container.
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside() is
        called with all valid bibrecs before the records are processed
    @type check_invalid_papers: bool
    @param personids_to_update_extids: additional person ids whose
        canonical names and external ids are refreshed at the end
    @type personids_to_update_extids: an iterable of person ids, or None
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        # name -> set of pids; kept current by the wrappers below.
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                # zip() wraps each pid in a 1-tuple; the code below reads
                # a pid as p[0].
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    # Everything after the caches were populated runs under try/finally
    # so that they are destroyed even when processing a record fails.
    try:
        updated_pids = set()
        deleted = frozenset(p[0] for p in get_deleted_papers())

        for idx, rec in enumerate(bibrecs):
            task_sleep_now_if_required(True)
            total = len(bibrecs)
            update_status(float(idx) / total, "%d/%d current: %d" % (idx, total, rec))

            if rec in deleted:
                delete_paper_from_personid(rec)
                continue

            # (field, reference) pairs as currently read from the record.
            markrefs = frozenset(chain(
                izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

            # Stored rows, reduced to [row[0], row[1], row[2], row[4]]
            # with the first three cast to int. Below, element 0 is used
            # as the pid, elements 1-2 as the reference pair and element
            # 3 as the name.
            personid_rows = [map(int, row[:3]) + [row[4]]
                             for row in get_signatures_from_rec(rec)]
            personidrefs_names = dict(((row[1], row[2]), row[3])
                                      for row in personid_rows)

            personidrefs = frozenset(personidrefs_names.keys())
            new_signatures = list(markrefs - personidrefs)
            old_signatures = list(personidrefs - markrefs)

            new_signatures_names = dict(
                (new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                for new in new_signatures)

            # matrix |new_signatures| X |old_signatures|
            matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                       for old in old_signatures]
                      for new in new_signatures]

            # [(new_signatures, old_signatures)]
            # maximized_mapping yields (row index, column index, score)
            # triples; only pairs scoring above the threshold are kept.
            best_match = [(new_signatures[new], old_signatures[old])
                          for new, old, score in maximized_mapping(matrix)
                          if score > threshold]
            for new, old in best_match:
                modify_signature(old, rec, new, new_signatures_names[new])

            remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

            not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

            if not_matched:
                # Pids already present on this record; exact-name
                # candidates among them are not reused for it.
                used_pids = set(r[0] for r in personid_rows)

                for sig in not_matched:
                    name = new_signatures_names[sig]
                    matched_pids = []
                    if USE_EXT_IDS:
                        if USE_INSPIREID:
                            inspire_id = get_inspire_id(sig + (rec,))
                            if inspire_id:
                                matched_pids = list(get_person_with_extid(inspire_id[0]))
                        if matched_pids:
                            # An external-id match is used directly and
                            # is not checked against used_pids.
                            add_signature(list(sig) + [rec], name, matched_pids[0][0])
                            updated_pids.add(matched_pids[0][0])
                            continue

                    matched_pids = find_pids_by_exact_name(name)
                    matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                    if not matched_pids:
                        new_pid = new_person_from_signature(list(sig) + [rec], name)
                        used_pids.add(new_pid)
                        updated_pids.add(new_pid)
                    else:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        used_pids.add(matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])

        update_status_final()

        if personids_to_update_extids:
            # update() accepts any iterable, not only sets.
            updated_pids.update(personids_to_update_extids)
        if updated_pids: # an empty set will update all canonical_names
            update_personID_canonical_names(updated_pids)
            update_personID_external_ids(
                updated_pids,
                limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)
    finally:
        if SWAPPED_GET_GROUPED_RECORDS:
            destroy_partial_marc_caches()