from itertools import chain, cycle, imap, izip
from operator import itemgetter
# NOTE: the Invenio bibauthorid helpers used below (get_user_log,
# get_all_valid_bibrecs, get_signatures_from_rec, ...) are assumed to be
# imported from the surrounding module; only the stdlib imports are added here.


def _get_personids_to_update_extids(papers=None):
    '''
    Return the set of personids whose external ids should be recalculated.
    @param papers: if given, restrict the computation to these paper
        (bibrec) ids
    @type papers: set or None
    @return: personids to update, or None if no relevant records were modified
    @rtype: set or None
    '''
    last_log = get_user_log(userinfo='daemon', action='PID_UPDATE', only_most_recent=True)
    if last_log:
        daemon_last_time_run = last_log[0][2]
        modified_bibrecs = get_recently_modified_record_ids(daemon_last_time_run)
    else:
        modified_bibrecs = get_all_valid_bibrecs()
    if papers:
        modified_bibrecs &= set(papers)
    if not modified_bibrecs:
        return None
    if bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS:
        modified_bibrecs = [rec[0] for rec in get_claimed_papers_from_papers(modified_bibrecs)]
    personids_to_update_extids = set()
    for bibrec in modified_bibrecs:
        personids_to_update_extids |= set(get_personids_from_bibrec(bibrec))
    return personids_to_update_extids
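

# Usage sketch (not part of the original module): the set returned above would
# typically be handed to whatever routine recomputes each person's external
# ids; here we only report how many persons are affected.
def _count_extid_candidates(papers=None):
    pids = _get_personids_to_update_extids(papers)
    return len(pids) if pids else 0

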
def cluster_sets_from_marktables():
    '''
    Build one Cluster_set per last name: every bibrefrec (table, refid, recid)
    found in the bib10x/bib70x tables of valid records becomes its own
    singleton cluster in the set for its last name.
    '''
    # Map a bibref (table, refid), e.g. (100, 123), to its last-name cluster string.
    ref100 = get_bib10x()
    ref700 = get_bib70x()
    bibref_2_name = dict([((100, ref), generate_last_name_cluster_str(name)) for ref, name in ref100] +
                         [((700, ref), generate_last_name_cluster_str(name)) for ref, name in ref700])

    all_recs = get_all_valid_bibrecs()

    all_bibrefrecs = chain(set((100, ref, rec) for rec, ref in get_bibrefrec_subset(100, all_recs, map(itemgetter(0), ref100))),
                           set((700, ref, rec) for rec, ref in get_bibrefrec_subset(700, all_recs, map(itemgetter(0), ref700))))

    last_name_2_bibs = {}

    for bibrefrec in all_bibrefrecs:
        table, ref, unused = bibrefrec
        name = bibref_2_name[(table, ref)]
        last_name_2_bibs.setdefault(name, []).append(bibrefrec)

    cluster_sets = []

    for name, bibrefrecs in last_name_2_bibs.items():
        new_cluster_set = Cluster_set()
        new_cluster_set.clusters = [Cluster_set.Cluster([bib]) for bib in bibrefrecs]
        new_cluster_set.last_name = name
        cluster_sets.append(new_cluster_set)

    return cluster_sets
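

# Usage sketch (assumption): each Cluster_set returned above starts as one
# singleton cluster per bibrefrec, so the initial partition can be inspected
# like this before any further clustering is done.
def _summarize_initial_partition():
    for cs in cluster_sets_from_marktables():
        print "%s: %d singleton clusters" % (cs.last_name, len(cs.clusters))

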
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    Update the personid tables for the given records: match the MARC
    signatures against the stored ones, remove stale signatures and create
    new persons where nothing matches.
    @param bibrecs: bibrec (record) ids to process
    @type bibrecs: an iterable of ints (must support len())
    @param check_invalid_papers: if True, also drop no-longer-valid records
        from the personid tables
    @return: None
    '''

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # bibref (table, refid) pairs present in the record's MARC data
        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        # signatures already stored in the personid tables for this record;
        # personidrefs_names maps (table, refid) -> stored author name
        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        # drop the stored signatures that no longer appear in the MARC data
        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        # personids that already hold a signature on this record
        used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
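

# Usage sketch (assumption): a daemon pass would typically feed recently
# modified record ids straight into rabbit(), mirroring the record selection
# done in _get_personids_to_update_extids() above.
def _rabbit_recently_modified(last_run):
    recs = list(get_recently_modified_record_ids(last_run))
    if recs:
        rabbit(recs)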