def convert_personid():
    '''
    One-off migration of the rows of `aidPERSONID` into the tables
    `aidPERSONIDPAPERS` and `aidPERSONIDDATA`.

    Rows are dispatched on their tag:
      - 'paper' rows carry "table:ref,rec" in their data column; they are
        split up, the normalized author name is looked up and the result
        is written to `aidPERSONIDPAPERS`. Papers whose name lookup fails
        are skipped.
      - 'gathered_name' rows are dropped.
      - every other row is copied as it is to `aidPERSONIDDATA`.

    The inserts are buffered and sent as multi-row statements.

    @return: none
    '''
    from dbquery import run_sql # oh come on, the whole function will be removed soon
    from itertools import repeat

    # Flush threshold, counted in buffered SQL arguments (not in rows).
    chunk = 1000

    old_personid = run_sql("SELECT `personid`, `tag`, `data`, `flag`, `lcul` FROM `aidPERSONID`")

    def flush_papers(args):
        # args is a flat list holding 7 values per row; floor division keeps
        # the row count an integer even under true division.
        run_sql("INSERT INTO `aidPERSONIDPAPERS` "
                "(`personid`, "
                " `bibref_table`, "
                " `bibref_value`, "
                " `bibrec`, "
                " `name`, "
                " `flag`, "
                " `lcul`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s, %s, %s)", len(args) // 7))
                , tuple(args))

    def flush_data(args):
        # args is a flat list holding 5 values per row.
        run_sql("INSERT INTO `aidPERSONIDDATA` "
                "(`personid`, "
                " `tag`, "
                " `data`, "
                " `opt1`, "
                " `opt2`) "
                "VALUES " + " , ".join(repeat("(%s, %s, %s, %s, %s)", len(args) // 5))
                , tuple(args))

    paper_args = []
    data_args = []

    for row in old_personid:
        if row[1] == 'paper':
            bibref, rec = row[2].split(',')
            tab, ref = bibref.split(':')

            try:
                name = get_name_by_bibrecref((int(tab), int(ref), int(rec)))
            except Exception:
                # Best effort: skip papers whose name cannot be resolved,
                # but let KeyboardInterrupt / SystemExit through.
                continue

            name = split_name_parts(name)
            name = create_normalized_name(name)

            paper_args += [row[0], tab, ref, rec, name, row[3], row[4]]

            if len(paper_args) > chunk:
                flush_papers(paper_args)
                paper_args = []

        elif row[1] == 'gathered_name':
            continue

        else:
            data_args += list(row)

            if len(data_args) > chunk:
                flush_data(data_args)
                data_args = []

    # Send whatever is left in the buffers.
    if paper_args:
        flush_papers(paper_args)

    if data_args:
        flush_data(data_args)
def find_personIDs_by_name_string(target):
    '''
    Search engine to find persons matching the given string.

    The surname (first element of split_name_parts(target)) is turned into
    a sequence of increasingly loose patterns which are tried in order with
    dbinter.get_all_personids_by_name; the first pattern giving any result
    wins. For surnames of at most 4 characters only the two patterns which
    use the complete surname are tried. If no pattern gives a result, the
    target is looked up with dbinter.get_personids_by_canonical_name.

    Each found name is scored against the target with soft_compare_names.
    Names scoring 0.5 or less are dropped, unless the result comes from the
    canonical name lookup, and persons left without names are dropped too.

    @param target: string name, 'surname, names I.'
    @type target: string

    @return: list of (pid, [(name string, occur count, compatibility)]).
        The names of each person are sorted by decreasing compatibility;
        the persons are sorted in decreasing order of the
        (compatibility, name string, occur count) of their best name.
    '''
    family = split_name_parts(target)[0]

    # target + '%' is deliberately not tried as a first level: this
    # introduces a weird problem: different results for
    # mele, salvatore and salvatore mele
    levels = (family + ',%',
              family[:-2] + '%',
              '%' + family + ',%',
              '%' + family[1:-1] + '%')

    if len(family) <= 4:
        # The truncated patterns are skipped for short surnames.
        levels = [levels[0], levels[2]]

    for lev in levels:
        names = dbinter.get_all_personids_by_name(lev)
        if names:
            break

    is_canonical = False
    if not names:
        names = dbinter.get_personids_by_canonical_name(target)
        is_canonical = True

    # Collapse identical (pid, name) rows into
    # (pid, name, occur count, compatibility).
    names = groupby(sorted(names))
    names = [(key[0], key[1], len(list(data)), soft_compare_names(target, key[1]))
             for key, data in names]

    # The list is still sorted by pid, so grouping on it is safe.
    names = groupby(names, itemgetter(0))
    names = [(key, sorted([(d[1], d[2], d[3]) for d in data if (d[3] > 0.5 or is_canonical)],
                          key=itemgetter(2), reverse=True))
             for key, data in names]

    # Forget the persons whose names were all filtered out.
    names = [name for name in names if name[1]]

    names = sorted(names, key=lambda x: (x[1][0][2], x[1][0][0], x[1][0][1]), reverse=True)

    return names
def create_lastname_list_from_personid():
    '''
    This function generates a dictionary from a last name
    to list of personids which have this lastname.

    The last name is the first element of split_name_parts applied to the
    utf-8 decoded full name, with every character outside a-z, A-Z and 0-9
    removed, in lower case.

    @return: { last_name : [personid ... ] ... }, the personids of each
        last name are in the order in which get_all_names_from_personid
        returns them
    '''
    # ((personid, full name) ... )
    all_names = get_all_names_from_personid()

    artifact_removal = re.compile("[^a-zA-Z0-9]")

    # { last_name : [personid ... ] ... }
    ret = {}
    for row in all_names:
        surname = split_name_parts(row[1].decode('utf-8'))[0]
        last_name = artifact_removal.sub("", surname).lower()
        # Append in place: rebuilding the list on every insert is quadratic
        # in the number of persons sharing a last name.
        ret.setdefault(last_name, []).append(row[0])

    return ret
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    Brings the personid signatures of the given records in line with the
    author (100) and coauthor (700) references found on those records.

    For every record:
      - a record listed by get_deleted_papers is passed to
        delete_paper_from_personid and nothing else is done for it;
      - the references on the record are compared with the signatures
        stored for it; the ones found only on the record are "new", the
        ones found only in the storage are "old";
      - new and old signatures are paired on the similarity of their names;
        the pairs scoring above the fast assign threshold are passed to
        modify_signature;
      - all the old signatures are passed to remove_sigs;
      - each new signature left without a pair is added to the first person
        with exactly the same name which is not yet used on this record, or
        to a newly created person if there is none.

    At the end the canonical names of the touched persons are updated.

    @param bibrecs: the bibrecs to process; if empty, all the valid bibrecs
        are processed. len() is called on it, so it must be sized.
    @type bibrecs: a sized iterable of ints
    @param check_invalid_papers: if true, filter_bibrecs_outside is called
        with all the valid bibrecs before the processing starts
    @type check_invalid_papers: boolean

    @return: none
    '''
    compare_names = cached_sym(lambda x: x)(comp_names)

    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(paper[0] for paper in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        total = len(bibrecs)
        update_status(float(idx) / total, "%d/%d current: %d" % (idx, total, rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # The references as they are on the record: (100, ref) and (700, ref).
        author_refs = ((100, author[0]) for author in get_authors_from_paper(rec))
        coauthor_refs = ((700, coauthor[0]) for coauthor in get_coauthors_from_paper(rec))
        markrefs = frozenset(chain(author_refs, coauthor_refs))

        # The signatures as they are stored: [personid, table, ref, name].
        personid_rows = []
        for row in get_signatures_from_rec(rec):
            personid_rows.append([int(field) for field in row[:3]] + [row[4]])

        personidrefs_names = {}
        for row in personid_rows:
            personidrefs_names[(row[1], row[2])] = row[3]
        personidrefs = frozenset(personidrefs_names)

        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = {}
        for fresh in new_signatures:
            raw_name = get_name_by_bibrecref(fresh)
            new_signatures_names[fresh] = create_normalized_name(split_name_parts(raw_name))

        # matrix |new_signatures| X |old_signatures|
        matrix = []
        for fresh in new_signatures:
            fresh_name = new_signatures_names[fresh]
            matrix.append([compare_names(fresh_name, personidrefs_names[stale])
                           for stale in old_signatures])

        # [(new_signatures, old_signatures)]
        best_match = []
        for new_idx, old_idx, score in maximized_mapping(matrix):
            if score > threshold:
                best_match.append((new_signatures[new_idx], old_signatures[old_idx]))

        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        matched_new = frozenset(pair[0] for pair in best_match)
        not_matched = frozenset(new_signatures) - matched_new

        if not_matched:
            # A person can not get two signatures on the same record.
            used_pids = set(row[0] for row in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                candidates = [p for p in find_pids_by_exact_name(name)
                              if int(p[0]) not in used_pids]

                if candidates:
                    pid = candidates[0][0]
                    add_signature(list(sig) + [rec], name, pid)
                else:
                    pid = new_person_from_signature(list(sig) + [rec], name)

                used_pids.add(pid)
                updated_pids.add(pid)

    update_status_final()

    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)