def _get_personids_to_update_extids(papers=None):
    '''
    Collect the persons whose external ids have to be recalculated.

    The candidate records are the ones returned by
    get_recently_modified_record_ids for the most recent 'PID_UPDATE'
    action logged for the 'daemon' user, or all the valid bibrecs when no
    such log entry exists. When papers are given, the candidates are
    restricted to them.

    @param papers: papers to restrict the search to (all candidates if empty)
    @type papers: set or None

    @return: personids attached to the candidate records, or None when
        there is no candidate record at all
    @rtype: set or None
    '''
    daemon_logs = get_user_log(userinfo='daemon', action='PID_UPDATE', only_most_recent=True)

    if not daemon_logs:
        # no previous run found in the log: consider every valid record
        candidate_recs = get_all_valid_bibrecs()
    else:
        # third field of the first log row; presumably the time of that run
        last_run_time = daemon_logs[0][2]
        candidate_recs = get_recently_modified_record_ids(last_run_time)

    if papers:
        candidate_recs &= set(papers)

    if not candidate_recs:
        return None

    if bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS:
        claimed_rows = get_claimed_papers_from_papers(candidate_recs)
        candidate_recs = [row[0] for row in claimed_rows]

    pids = set()
    for rec in candidate_recs:
        pids |= set(get_personids_from_bibrec(rec))

    return pids
def cluster_sets_from_marktables():
    '''
    Build the initial cluster sets from the author (100) and coauthor (700)
    references.

    Every (table, ref, rec) signature is grouped under the string that
    generate_last_name_cluster_str yields for the name of its (table, ref).
    Each group becomes one Cluster_set holding one single-signature Cluster
    per signature.

    @return: one cluster set per distinct last name cluster string
    @rtype: list of Cluster_set
    '''
    ref100 = get_bib10x()
    ref700 = get_bib70x()

    # { (100, 123) -> name }
    bibref_2_name = dict([((100, ref), generate_last_name_cluster_str(name)) for ref, name in ref100] +
                         [((700, ref), generate_last_name_cluster_str(name)) for ref, name in ref700])

    all_recs = get_all_valid_bibrecs()

    # the sets drop duplicated (table, ref, rec) triples inside each table
    sigs100 = set((100, ref, rec)
                  for rec, ref in get_bibrefrec_subset(100, all_recs, map(itemgetter(0), ref100)))
    sigs700 = set((700, ref, rec)
                  for rec, ref in get_bibrefrec_subset(700, all_recs, map(itemgetter(0), ref700)))
    all_bibrefrecs = chain(sigs100, sigs700)

    last_name_2_bibs = {}
    for bibrefrec in all_bibrefrecs:
        table, ref, unused = bibrefrec
        name = bibref_2_name[(table, ref)]
        # append in place: rebuilding the list with "+" on every signature
        # is quadratic in the number of signatures sharing a last name
        last_name_2_bibs.setdefault(name, []).append(bibrefrec)

    cluster_sets = []
    for name, bibrecrefs in last_name_2_bibs.items():
        new_cluster_set = Cluster_set()
        new_cluster_set.clusters = [Cluster_set.Cluster([bib]) for bib in bibrecrefs]
        new_cluster_set.last_name = name
        cluster_sets.append(new_cluster_set)

    return cluster_sets
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    Bring the stored person signatures of the given records in line with
    the author (100) and coauthor (700) references found on them.

    For every record:
      - records listed by get_deleted_papers are handed to
        delete_paper_from_personid and skipped;
      - the (table, ref) pairs of the record are compared with the ones
        returned by get_signatures_from_rec: pairs found only on the record
        are "new", pairs found only among the stored signatures are "old";
      - new and old signatures are paired through maximized_mapping on
        their name comparison scores; pairs scoring above the threshold are
        passed to modify_signature;
      - all the old signatures are then given to remove_sigs;
      - every new signature left unmatched is attached with add_signature
        to the first person returned by find_pids_by_exact_name which is
        not yet used on this record, or goes to new_person_from_signature
        when there is none.

    Finally update_personID_canonical_names is called with the persons
    that received a signature, if there are any.

    @param bibrecs: the bibrecs to process (must support len()); when
        empty, all the valid bibrecs are processed instead
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when True, filter_bibrecs_outside is
        called with all the valid bibrecs before the processing starts
    @type check_invalid_papers: bool

    @return: none
    '''
    name_similarity = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    process_everything = not bibrecs
    if process_everything or check_invalid_papers:
        valid_bibrecs = get_all_valid_bibrecs()

        if process_everything:
            bibrecs = valid_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(valid_bibrecs)

    touched_pids = set()
    deleted_recs = frozenset(paper[0] for paper in get_deleted_papers())

    for position, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        total = len(bibrecs)
        update_status(float(position) / total, "%d/%d current: %d" % (position, total, rec))

        if rec in deleted_recs:
            delete_paper_from_personid(rec)
            continue

        # (table, ref) pairs currently found on the record
        author_rows = get_authors_from_paper(rec)
        coauthor_rows = get_coauthors_from_paper(rec)
        markrefs = frozenset([(100, row[0]) for row in author_rows] +
                             [(700, row[0]) for row in coauthor_rows])

        # first three columns as ints followed by the fifth column; used
        # below as [pid, table, ref, name]
        personid_rows = [[int(field) for field in row[:3]] + [row[4]]
                         for row in get_signatures_from_rec(rec)]
        stored_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)
        personidrefs = frozenset(stored_names.keys())

        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = {}
        for sig in new_signatures:
            raw_name = get_name_by_bibrecref(sig)
            new_signatures_names[sig] = create_normalized_name(split_name_parts(raw_name))

        # matrix |new_signatures| X |old_signatures|
        matrix = []
        for new in new_signatures:
            new_name = new_signatures_names[new]
            matrix.append([name_similarity(new_name, stored_names[old])
                           for old in old_signatures])

        # [(new_signatures, old_signatures)]
        best_match = []
        for new_idx, old_idx, score in maximized_mapping(matrix):
            if score > threshold:
                best_match.append((new_signatures[new_idx], old_signatures[old_idx]))

        # the whole mapping is computed before any signature is modified
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        matched_new = frozenset([pair[0] for pair in best_match])
        not_matched = frozenset(new_signatures) - matched_new

        if not_matched:
            # persons already holding a signature on this record
            used_pids = set(row[0] for row in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                candidates = [person for person in find_pids_by_exact_name(name)
                              if int(person[0]) not in used_pids]

                if candidates:
                    chosen_pid = candidates[0][0]
                    add_signature(list(sig) + [rec], name, chosen_pid)
                else:
                    chosen_pid = new_person_from_signature(list(sig) + [rec], name)

                used_pids.add(chosen_pid)
                touched_pids.add(chosen_pid)

    update_status_final()

    # an empty set will update all canonical_names
    if touched_pids:
        update_personID_canonical_names(touched_pids)