def calculate_name_score(query_string, nameids):
    '''
    Scores the names behind the given indexable name ids against the query
    string, using a common-prefix weighting of the surnames damped by their
    edit distance.

    @param query_string: name the user searched for
    @type query_string: str
    @param nameids: indexable name identifiers to score against the query
    @type nameids: iterable of int
    @return: (name, score, author identifiers) tuples for scores above 0.5
    @rtype: list
    '''
    name_personids_list = get_authors_data_from_indexable_name_ids(nameids)

    query_last_name = split_name_parts(query_string)[0]
    query_last_name_len = len(query_last_name)

    name_score_list = list()
    for name, personids in name_personids_list:
        current_last_name = split_name_parts(name)[0]
        current_last_name_len = len(current_last_name)
        if abs(query_last_name_len - current_last_name_len) == 0:
            dist = distance(query_last_name, current_last_name)
            limit = min([query_last_name_len, current_last_name_len])
            name_score = sum([1 / float(2 ** (i + 1))
                              for i in range(limit)
                              if query_last_name[i] == current_last_name[i]]) / (dist + 1)
            if name_score > 0.5:
                name_score_list.append((name, name_score, deserialize(personids)))

    return name_score_list
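# A minimal, self-contained sketch of the scoring used above: matching surname
# characters are weighted by position (1/2, 1/4, 1/8, ...) and the result is
# damped by the edit distance. The small _levenshtein() helper is only a
# stand-in for the project's distance() function; both helper names below are
# local to this illustration.

def _levenshtein(a, b):
    # textbook dynamic-programming edit distance
    prev = range(len(b) + 1)
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

def _prefix_weighted_score(query_surname, candidate_surname):
    dist = _levenshtein(query_surname, candidate_surname)
    limit = min(len(query_surname), len(candidate_surname))
    matches = sum(1 / float(2 ** (i + 1))
                  for i in range(limit)
                  if query_surname[i] == candidate_surname[i])
    return matches / (dist + 1)

# Identical surnames score close to 1.0, near misses decay quickly:
#   _prefix_weighted_score('ellis', 'ellis')  -> ~0.97
#   _prefix_weighted_score('ellis', 'eltis')  -> ~0.42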
def crossref_normalize_name(record):
    """
    Changes the format of author's name (often with initials) to the
    proper, unified one, using bibauthor_name_utils tools
    @return: changed record
    """
    # pattern for removing the spaces between two initials
    pattern_initials = '([A-Z]\\.)\\s([A-Z]\\.)'

    # first, change the main author
    for field in record_get_field_instances(record, '100'):
        main_author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(main_author))
        # remove spaces between initials
        # two iterations are required
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='100', subfield_code='a',
                               value=new_author, subfield_position=0,
                               field_position_global=position)

    # then, change additional authors
    for field in record_get_field_instances(record, '700'):
        author = field[0][0][1]
        new_author = create_normalized_name(split_name_parts(author))
        for _ in range(2):
            new_author = re.sub(pattern_initials, r'\g<1>\g<2>', new_author)
        position = field[4]
        record_modify_subfield(rec=record, tag='700', subfield_code='a',
                               value=new_author, subfield_position=0,
                               field_position_global=position)
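# Why two re.sub() passes are needed above: the pattern consumes both initials
# of a match, so in "J. A. B." the first pass only collapses non-overlapping
# pairs and a second pass picks up the pair created by the first. A quick
# standalone check (the variable names here are local to this example):
import re

_pattern_initials = r'([A-Z]\.)\s([A-Z]\.)'
name = 'Smith, J. A. B.'
once = re.sub(_pattern_initials, r'\g<1>\g<2>', name)   # 'Smith, J.A. B.'
twice = re.sub(_pattern_initials, r'\g<1>\g<2>', once)  # 'Smith, J.A.B.'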
def convert_personid():
    from invenio.dbquery import run_sql  # oh come on, the whole function will be removed soon
    from itertools import repeat
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) / 7)),
                tuple(args))

    def flush_data(args):
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) / 5)),
                tuple(args))

    paper_args = []
    data_args = []

    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')
            try:
                name = get_name_by_bibref((int(tab), int(ref), int(rec)))
            except:
                continue
            name = split_name_parts(name)
            name = create_normalized_name(name)
            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]
            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []
        elif row[1] == 'gathered_name':
            continue
        else:
            data_args += list(row)
            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
def _split_and_index(el):
    name, pids = el
    asciified_name = translate_to_ascii(name)[0]
    split_name = split_name_parts(indexable_name_re.sub(' ', asciified_name))
    indexable_name = create_indexable_name(split_name)
    surname = split_name[0] + ','
    indexable_surname = create_indexable_name([surname, [], [], []])
    return (name, pids, indexable_name, indexable_surname)
def create_bibauthorid_indexer():
    '''
    It constructs the disk-based indexer. It consists of the dense index
    (which maps a name to the set of personids who hold that name) and the
    inverted lists (which map a qgram to the set of name ids that share
    that qgram).
    '''
    name_pids_dict = get_confirmed_name_to_authors_mapping()
    if not name_pids_dict:
        return

    indexable_name_pids_dict = dict()

    for name in name_pids_dict.keys():
        asciified_name = translate_to_ascii(name)[0]
        indexable_name = create_indexable_name(asciified_name)
        if indexable_name:
            try:
                asciified_name, pids = indexable_name_pids_dict[indexable_name]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_name] = (asciified_name, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_name] = (asciified_name, name_pids_dict[name])

        surname = split_name_parts(name)[0]
        asciified_surname = translate_to_ascii(surname)[0]
        indexable_surname = create_indexable_name(asciified_surname)
        if indexable_surname:
            try:
                asciified_surname, pids = indexable_name_pids_dict[indexable_surname]
                updated_pids = pids | name_pids_dict[name]
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, updated_pids)
            except KeyError:
                indexable_name_pids_dict[indexable_surname] = (asciified_surname, name_pids_dict[name])

    indexable_names_list = indexable_name_pids_dict.keys()

    # If an exception/error occurs in any of the threads it is not detectable
    # so inter-thread communication is necessary to make it visible.
    q = Queue()
    threads = list()
    threads.append(Thread(target=create_dense_index,
                          args=(indexable_name_pids_dict, indexable_names_list, q)))
    threads.append(Thread(target=create_inverted_lists,
                          args=(indexable_names_list, q)))

    for t in threads:
        t.start()

    for t in threads:
        all_ok, error = q.get(block=True)
        if not all_ok:
            raise error
        q.task_done()

    for t in threads:
        t.join()
def find_personids_by_name(query_string):
    query_string_surname = split_name_parts(query_string)[0]
    name_score_list = set(find_personids_by_name1(query_string) +
                          find_personids_by_name1(query_string_surname))
    name_ranking_list = sorted(name_score_list, key=itemgetter(1), reverse=True)

    pid_score_list = calculate_pid_score(name_ranking_list)
    pids_ranking_list = sorted(pid_score_list, key=itemgetter(2), reverse=True)

    ranked_pid_name_list = [pid for pid, name, final_score in pids_ranking_list]
    return ranked_pid_name_list
def fallback_find_personids_by_name_string(target):
    '''
    Search engine to find persons matching the given string.
    The matching is done on the surname first, and on the names if present.
    An ordered list (per compatibility) of pids and found names is returned.

    @param target: string name, 'surname, names I.'
    @type target: string
    @return: pid list of lists
        [pid, [[name string, occur count, compatibility]]]
    '''
    splitted_name = split_name_parts(target)
    family = splitted_name[0]

    levels = (# target + '%', #this introduces a weird problem: different results for mele, salvatore and salvatore mele
              family + ',%',
              family[:-2] + '%',
              '%' + family + ',%',
              '%' + family[1:-1] + '%')

    if len(family) <= 4:
        levels = [levels[0], levels[2]]

    for lev in levels:
        names = dbinter.get_authors_by_name_regexp(lev)
        if names:
            print "%s" % lev
            break

    is_canonical = False
    if not names:
        names = dbinter.get_authors_by_canonical_name_regexp(target)
        is_canonical = True

    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)), soft_compare_names(target, key[1]))
             for key, data in names]
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data
                           if (d[3] > 0.5 or is_canonical)],
                          key=itemgetter(2), reverse=True))
             for key, data in names]
    names = [name for name in names if name[1]]
    names = sorted(names,
                   key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]),
                   reverse=True)

    return names
def cache_name_variants_of_authors(author_to_name_and_occurrence_mapping):
    args = list()
    for author, names_and_occurrence in author_to_name_and_occurrence_mapping.iteritems():
        indexable_names_and_occurrence = dict()
        for name, occurrences in names_and_occurrence.iteritems():
            asciified_name = translate_to_ascii(name)[0]
            indexable_name = create_indexable_name(
                split_name_parts(indexable_name_re.sub(' ', asciified_name)))
            try:
                indexable_names_and_occurrence[indexable_name] += occurrences
            except KeyError:
                indexable_names_and_occurrence[indexable_name] = occurrences
        args += [author, serialize(indexable_names_and_occurrence), 1]

    populate_table('aidDENSEINDEX', ['id', 'personids', 'flag'], args,
                   empty_table_first=False)
pdata = {}
ndata = {}
artifact_removal = re.compile("[^a-zA-Z0-9]")

for i in claimnames:
    pid = i[0]
    n = i[1]
    if not pid in pdata:
        pdata[pid] = {}
        pdata[pid]['name'] = []
        pdata[pid]['lname'] = set()
        pdata[pid]['olnames'] = set()
        pdata[pid]['vclaims'] = run_sql("select count(id) from aidPERSONID where flag = 2 "
                                        "and tag='paper' and personid = %s", (pid,))[0][0]
    pdata[pid]['name'].append(n)
    clname = artifact_removal.sub("", split_name_parts(n)[0].lower())
    pdata[pid]['lname'].add(clname)
    pdata[pid]['olnames'].add(split_name_parts(n)[0])

for p in pdata:
    if len(pdata[p]['lname']) > 1:
        print "multiple names in", pdata[p]['lname']
    l = list(pdata[p]['lname'])[0]
    if l in ndata:
        ndata[l]['pc'] += 1
        ndata[l]['vp'] += pdata[p]['vclaims']
    else:
        ndata[l] = {}
        ndata[l]['pc'] = 1
        ndata[l]['vp'] = pdata[p]['vclaims']
def test_create_normalized_name(self):
    for tn in self.tc.keys():
        self.assertEqual(create_normalized_name(split_name_parts(tn)), self.tc[tn])
from invenio.dbquery import run_sql
import invenio.bibauthorid_name_utils as nu

names = run_sql("select name from aidAUTHORNAMES")
fnames = set([i[0].split(',')[0] for i in names])

splitnames = []
for i in names:
    splitnames.append(nu.split_name_parts(i[0]))

multinames = 0
cntnames = 0
cntinitonly = 0
multiinitials = 0
fnlt5 = 0
fnonly = 0

for i in splitnames:
    if len(i[0]) < 5:
        fnlt5 += 1
    if i[2] and len(i[2]) > 1:
        multinames += 1
    if i[2]:
        cntnames += 1
    else:
        if i[1] and len(i[1]) > 1:
            multiinitials += 1
        if i[1]:
            cntinitonly += 1
        else:
            fnonly += 1
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs),
                      "%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = []
                if USE_EXT_IDS:
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id(sig + (rec,))
                        if inspire_id:
                            matched_pids = list(get_person_with_extid(inspire_id[0]))
                    if matched_pids:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        continue

                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                if not matched_pids:
                    new_pid = new_person_from_signature(list(sig) + [rec], name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids,
                                     limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
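# The best_match step in rabbit() relies on maximized_mapping(matrix) returning
# (row, column, score) assignments that pair each new signature with at most one
# old signature while maximizing total similarity. The helper below is only an
# illustrative greedy sketch of that contract, not the project's implementation
# (which may solve the assignment problem exactly); the name is hypothetical.

def _greedy_maximized_mapping(matrix):
    # matrix[i][j] is the similarity between new signature i and old signature j
    pairs = sorted(((score, i, j)
                    for i, row in enumerate(matrix)
                    for j, score in enumerate(row)), reverse=True)
    used_rows, used_cols, mapping = set(), set(), []
    for score, i, j in pairs:
        if i not in used_rows and j not in used_cols:
            used_rows.add(i)
            used_cols.add(j)
            mapping.append((i, j, score))
    return mapping

# e.g. _greedy_maximized_mapping([[0.9, 0.2], [0.3, 0.8]])
#      -> [(0, 0, 0.9), (1, 1, 0.8)]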
from invenio.dbquery import run_sql
from invenio.bibauthorid_name_utils import split_name_parts
from invenio.bibauthorid_dbinterface import get_all_authors
from invenio.bibauthorid_dbinterface import get_all_bibrecs
from bibauthorid_general_utils import update_status

print("Getting records...")
#records = [p[0] for p in run_sql("select id from bibrec")]
records = get_all_bibrecs()

lastnames = []
for index, bibrec in enumerate(records):
    if index % 1000 == 0:
        percent = float(index) / len(records)
        update_status(percent, "%s of all %s records done." % (index, len(records)))
    for author in get_all_authors(bibrec):
        lastnames.append(split_name_parts(author)[0])

fp = open("/tmp/lastnames.txt", "w")
fp.write("\n".join(lastnames))
fp.close()
def worker(i, gpid_queue, qdata_queue):
    if gpid_queue.empty():
        return

    gpid = gpid_queue.get()
    gpdata = {}
    rpdata = {}

    if not PRINT_STATS:
        sys.stdout.write(".")
        sys.stdout.flush()

    artifact_removal = re.compile("[^a-zA-Z0-9]")

    gpdata[gpid] = {}
    pdata = gpdata[gpid]
    gpids_scores = {}
    gperson_papers = []
    gperson_papers_set = set()
    total_papers = set()

    gpapers = run_sql("select data from aidGOLD where tag='paper' and personid=%s", (gpid,))
    msg('|-- %s: Starting w/ %s gold papers' % (i, len(gpapers)))

    for gp in gpapers:
        total_papers.add(gp[0])
        gperson_papers.append(gp[0])
        gperson_papers_set.add(gp[0])

    rpids = run_sql("select distinct o.personid from aidRESULTS o, "
                    "(select i.data as idata from aidGOLD i "
                    "where tag='paper' and i.personid=%s) as dummy "
                    "where dummy.idata = o.data", (gpid,))

    if not rpids:
        msg("%s: Nothing to do. No person entities in result set" % i)
        return

    msg(' |-- %s: Collecting data for rpids' % i)

    for rpid in rpids:
        rpp = run_sql("select data from aidRESULTS where tag='paper' and personid=%s", (rpid[0],))
        rpdata[rpid[0]] = {}
        rpdata[rpid[0]]['papers'] = []
        for rp in rpp:
            total_papers.add(rp[0])
            rpdata[rpid[0]]['papers'].append(rp[0])

    # construct reference vectors for total papers and gold person papers for the C measure
    total_papers_vector = list(total_papers)
    gp_papers_vector = zeros(len(total_papers_vector))

    for p in gperson_papers:
        gp_papers_vector[total_papers_vector.index(p)] = 1

    msg(' |-- %s: Performing QA data for %s rpids' % (i, len(rpids)))

    # assess quality for each result person for this gold person
    for rpid in rpdata:
        rpd = rpdata[rpid]
        rpaperset = set(rpd['papers'])

        # F1 measure...
        true_positives = rpaperset.intersection(gperson_papers_set)
        false_positives = set()
        true_negatives = set()
        false_negatives = set()

        for rpp in rpaperset:
            if not rpp in gperson_papers_set:
                false_positives.add(rpp)

        for gpp in gperson_papers_set:
            if not gpp in rpaperset:
                false_negatives.add(gpp)

        tp = float(len(true_positives))
        fp = float(len(false_positives))
        fn = float(len(false_negatives))

        precision = tp / max(tp + fp, 1.0)
        recall = tp / max(tp + fn, 1.0)
        f1 = 2.0 * (float(precision * recall) / max(float(precision + recall), 1.0))

        # C measure...
        c = 0.0
        if len(total_papers) > 0 and len(rpaperset) > 0:
            k = zeros(len(total_papers_vector))
            for p in rpd['papers']:
                k[total_papers_vector.index(p)] = 1
            c = linalg.norm(gp_papers_vector - k)  #* (tp / float(len(total_papers_vector)))  #/ float(len(total_papers_vector))

        # store results for this person:
        pdata[rpid] = {}
        pdata[rpid]['true_positives'] = tp
        pdata[rpid]['false_positives'] = fp
        pdata[rpid]['false_negatives'] = fn
        pdata[rpid]['true_positives_set'] = true_positives
        pdata[rpid]['false_positives_set'] = false_positives
        pdata[rpid]['false_negatives_set'] = false_negatives
        pdata[rpid]['f1'] = f1
        pdata[rpid]['c'] = c

        # print the f1 score parts
        msg(' |-- GOLDPID %s || True Positives for ResultPID %s: %s' % (gpid, rpid, str(true_positives)))
        msg(' |-- GOLDPID %s || False Positives for ResultPID %s: %s' % (gpid, rpid, str(false_positives)))
        msg(' |-- GOLDPID %s || False Negatives for ResultPID %s: %s' % (gpid, rpid, str(false_negatives)))

    pdata['gpid_f1'] = 0.0
    pdata['gpid_c'] = 0.0
    gpid_f1 = []
    gpid_c = []

    for rpid in rpdata:
        gpid_f1.append(pdata[rpid]['f1'])
        gpid_c.append((pdata[rpid]['c'] * pdata[rpid]['true_positives']) / float(len(gp_papers_vector)))

    pdata['gpid_f1'] = average(gpid_f1)
    pdata['gpid_c'] = average(gpid_c)

    pdata['lastnames'] = set()
    all_refs = set([br.split(',')[0] for br in gperson_papers])
    all_last_names = set()
    all_names = set()

    msg(' |-- %s: Finding last name for gPid' % i)

    for ref in all_refs:
        try:
            all_names.add(run_sql("select o.name from aidAUTHORNAMES o, "
                                  "(select i.name_id as nid from aidAUTHORNAMESBIBREFS i "
                                  "where bibref=%s) as dummy where o.id = dummy.nid", (ref,))[0][0])
        except IndexError:
            msg("Ignoring bibref (no name found): %s" % (ref))

    for name in all_names:
        cln = artifact_removal.sub("", split_name_parts(name)[0].lower())
        pdata['lastnames'].add(cln)

    pdata['lastname'] = list(pdata['lastnames'])[0]

    qdata_queue.put(gpdata)
    msg(' |-- %s: Done with golden pid %s.' % (i, gpid))
    return
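# Tiny standalone illustration of the set-based F1 computed in worker() (the
# variable names below are local to this example, not part of the code above).
# With a gold cluster {A, B, C} and a result cluster {A, B, D}: tp = 2, fp = 1,
# fn = 1, so precision = recall = 2/3 and F1 = 2 * (2/3 * 2/3) / (2/3 + 2/3) = 2/3.
gold = set(['A', 'B', 'C'])
result = set(['A', 'B', 'D'])
tp = float(len(result & gold))                                # 2.0
fp = float(len(result - gold))                                # 1.0
fn = float(len(gold - result))                                # 1.0
precision = tp / max(tp + fp, 1.0)                            # 0.666...
recall = tp / max(tp + fn, 1.0)                               # 0.666...
f1 = 2.0 * precision * recall / max(precision + recall, 1.0)  # 0.666...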
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)
            update_status(float(idx) / len(bibrecs),
                          "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])

        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = list()
                if USE_EXT_IDS:
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id_of_signature(sig + (rec,))
                        if inspire_id:
                            matched_pids = list(get_author_by_external_id(inspire_id[0]))
                            if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                                matched_pids = list()
                    if matched_pids:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        pids_having_rec.add(matched_pids[0][0])
                        continue

                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                    new_pid = new_person_from_signature(list(sig) + [rec], name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids,
                                       limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
def main():
    """
    Reads the import file and verifies the md5 hash.
    For each line in the import file:
        find new record from bibcode, find new ref from name on record
        find old row in personid tables
        copy row with new authorref (tab:bibref,rec) to temp table
    overwrite personid tables w/ temp table
    """
    ## create temporary tables...
    print "Creating temporary tables..."
    create_temp_pid_sql_table()
    create_temp_piddata_sql_table()
    create_temp_user_input_log_sql_table()

    ## fill temp tables w/ static values...
    print "Filling temporary tables with static, unchanged content"
    copy_unaltered_piddata_rows_to_temp()
    copy_unaltered_user_input_log_table_rows_to_temp()

    ## compile regexp for line break removal
    nlr = re.compile('[\n\r]+')

    # verify file integrity
    print ("Verifying file integrity of %s with"
           " MD5 checksum from %s" % (IMPORT_FILE_NAME, IMPORT_MD5_FILE_NAME))
    fp = open(IMPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()
    fp = open(IMPORT_MD5_FILE_NAME, "r")
    vmd5 = fp.read()
    fp.close()

    if not fmd5 == vmd5:
        print "WARNING: Detected a disturbance in the file. Will exit here."
        return

    total_lines = file_len()
    fp = open(IMPORT_FILE_NAME, "r")

    print "Processing file %s..." % IMPORT_FILE_NAME

    for index, line in enumerate(fp.readlines()):
#        if index == 100:
#            break
        if index % 5000 == 0:
            percent = float(index) / float(total_lines)
            update_status(percent, "%s of %s lines processed in %s" % (index, total_lines, IMPORT_FILE_NAME))

        new_ref = None
        tab1, old_ref, old_rec, tab2, enname, bibcode = line.split(" ")
        assert tab1 == tab2

        if tab1 == "table":
            continue

        name = base64.b64decode(enname)
#        name = nq.sub("", name)
        bibcode = nlr.sub("", bibcode)
        new_rec = get_bibrec_from_bibcode(bibcode)

        for ref in get_authorrefs_and_names_from_bibrec(new_rec):
#            refname = create_normalized_name(split_name_parts(ref[2]))
            refname = ref[2]
            if refname == name and str(ref[0]) == tab1:
                # MySQL equivalent: col_name COLLATE utf8_bin = 'Case SenSitive name'
                new_ref = ref[1]

        if not new_ref:
            print "WARN: Authorref not found for name %s on new record %s?!" % (name, new_rec)
            continue

        # get personid, flag, lcul and last_updated from old aidPERSONIDPAPERS
        old_data = find_old_pidtable_row(tab1, old_ref, old_rec)

        if old_data:
            ## prepare data in temporary tables...
            pid, flag, lcul, lupdate = old_data
            old_authorref = "%s:%s,%s" % (tab1, old_ref, old_rec)
            new_authorref = "%s:%s,%s" % (tab1, new_ref, new_rec)
            ## Transform the name into a more consistent form
            inname = create_normalized_name(split_name_parts(name))
            ## Insert transformed data into temp tables...
            insert_into_temp_table(pid, tab1, new_ref, new_rec, inname, flag, lcul, lupdate)
            update_temp_piddata_table(old_authorref, new_authorref)
            update_temp_user_input_log_table(old_authorref, new_authorref)
        else:
            print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec])

        # The following is true only if applied on the same data set
        # Commented out by default. For testing/debug uses only
        try:
            if RUN_IN_TEST_MODE:
                assert str(old_rec) == str(new_rec)
                assert str(old_ref) == str(new_ref)
            pass
        except AssertionError, e:
            print "ERROR: ", e
            print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec,
                                                      tab1, new_ref, new_rec,
                                                      bibcode, name)
def test_split_name_parts(self):
    for tn in self.names_split_name_parts.keys():
        self.assertEqual(split_name_parts(tn), self.names_split_name_parts[tn])
def find_personids_by_name(query_string, trust_is_operating=False):
    '''
    It returns all the authors that match the query string, sorted by
    compatibility.

    WARNING: this is just querying the search engine, for a proper person
    search query one should use person_search_engine_query in
    bibauthorid_dbinterface

    @param query_string: the query string
    @type query_string: str

    @return: author identifiers
    @rtype: list [int,]
    '''
    if not trust_is_operating:
        search_engine_is_oper = search_engine_is_operating()
        if not search_engine_is_oper:
            return None

    asciified_qstring = translate_to_ascii(query_string)[0]
    indexable_qstring = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring)))

    surname = split_name_parts(query_string)[0] + ','
    asciified_qstring_sur = translate_to_ascii(surname)[0]
    indexable_qstring_sur = create_indexable_name(
        split_name_parts(indexable_name_re.sub(' ', asciified_qstring_sur)))

    qstring_first_names = indexable_qstring.split(' ')[len(indexable_qstring_sur.split(' ')):]

    string_ids = solve_T_occurence_problem(indexable_qstring) | solve_T_occurence_problem(indexable_qstring_sur)
    if not string_ids:
        return list()

    strings_to_ids_mapping = get_indexed_strings(string_ids)

    passing_string_ids, surname_score_cache = remove_false_positives(indexable_qstring_sur,
                                                                     strings_to_ids_mapping)
    if not passing_string_ids:
        return list()

    author_groups = get_author_groups_from_string_ids(passing_string_ids)

    authors = set()
    for author_group in author_groups:
        authors |= set(deserialize(author_group[0]))

    author_to_names_mapping = get_name_variants_for_authors(authors)

    surname_score_clusters = create_surname_score_clusters(indexable_qstring_sur,
                                                           author_to_names_mapping,
                                                           surname_score_cache,
                                                           strings_to_ids_mapping)

    sorted_authors = sort_authors(indexable_qstring,
                                  qstring_first_names,
                                  surname_score_clusters,
                                  author_to_names_mapping,
                                  strings_to_ids_mapping)

    return sorted_authors
def arxiv_login(req, picked_profile=None):
    '''
    Log in through arXiv. If the user is already associated to a personid,
    returns that personid. If the user has no pid, tries to guess which
    personid to associate based on the surname and the papers from arXiv.
    If no compatible person is found, creates a new person. At the end of
    the process opens a ticket for the user claiming the papers from arXiv.

    !!! the user will find the open ticket, which will require him to go
    through the final review before getting committed.

    @param req: Apache request object
    @type req: Apache request object

    @return: Returns the pid resulting from the process
    @rtype: int
    '''
    def session_bareinit(req):
        session = get_session(req)
        try:
            pinfo = session["personinfo"]
            if 'ticket' not in pinfo:
                pinfo["ticket"] = []
        except KeyError:
            pinfo = dict()
            session['personinfo'] = pinfo
            pinfo["ticket"] = []
        session.dirty = True

    session_bareinit(req)
    session = get_session(req)
    pinfo = session['personinfo']
    ticket = session['personinfo']['ticket']

    uinfo = collect_user_info(req)
    pinfo['external_first_entry'] = False

    try:
        name = uinfo['external_firstname']
    except KeyError:
        name = ''
    try:
        surname = uinfo['external_familyname']
    except KeyError:
        surname = ''

    if surname:
        session['personinfo']['arxiv_name'] = nameapi.create_normalized_name(
            nameapi.split_name_parts(surname + ', ' + name))
    else:
        session['personinfo']['arxiv_name'] = ''

    session.dirty = True

    try:
        arxiv_p_ids = uinfo['external_arxivids'].split(';')
    except KeyError:
        arxiv_p_ids = []

    #'external_arxivids': 'hep-th/0112017;hep-th/0112020',
    #'external_familyname': 'Weiler',
    #'external_firstname': 'Henning',

    try:
        found_bibrecs = set(reduce(add, [perform_request_search(p='037:' + str(arx), of='id', rg=0)
                                         for arx in arxiv_p_ids]))
    except (IndexError, TypeError):
        found_bibrecs = set()
    #found_bibrecs = [567700, 567744]

    uid = getUid(req)
    pid, pid_found = dbapi.get_personid_from_uid([[uid]])

    if pid_found:
        pid = pid[0]
    else:
        if picked_profile == None:
            top5_list = dbapi.find_top5_personid_for_new_arXiv_user(found_bibrecs,
                nameapi.create_normalized_name(nameapi.split_name_parts(surname + ', ' + name)))
            return ("top5_list", top5_list)
        else:
            pid = dbapi.check_personids_availability(picked_profile, uid)

    pid_bibrecs = set([i[0] for i in dbapi.get_all_personids_recs(pid, claimed_only=True)])
    missing_bibrecs = found_bibrecs - pid_bibrecs
    #present_bibrecs = found_bibrecs.intersection(pid_bibrecs)

    #assert len(found_bibrecs) == len(missing_bibrecs) + len(present_bibrecs)

    tempticket = []
    #now we have to open the tickets...
    #person_papers contains the papers which are already assigned to the person and came from arXiv,
    #they can be claimed regardless

    for bibrec in missing_bibrecs:
        tempticket.append({'pid': pid, 'bibref': str(bibrec), 'action': 'confirm'})

    #check if ticket targets (bibref for pid) are already in ticket
    for t in list(tempticket):
        for e in list(ticket):
            if e['pid'] == t['pid'] and e['bibref'] == t['bibref']:
                ticket.remove(e)
        ticket.append(t)

    session.dirty = True

    if picked_profile != None and picked_profile != pid and picked_profile != -1:
        return ("chosen pid not available", pid)
    elif picked_profile != None and picked_profile == pid and picked_profile != -1:
        return ("pid assigned by user", pid)
    else:
        return ("pid", pid)