예제 #1
0
def matched_claims(inspect=None):
    '''
        Checks how many claims are violated in aidRESULTS.
        Returs the number of preserved and the total number of claims.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())
    r_match = 0
    r_total = 0

    for lname in last_names:
        if inspect and lname != inspect:
            continue

        results_dict = dict(((row[1], row[2], row[3]), int(row[0].split(".")[1]))
                        for row in get_lastname_results(lname))

        results_clusters = max(results_dict.values()) + 1
        assert frozenset(results_dict.values()) == frozenset(range(results_clusters))

        pids = frozenset(x[0] for x in chain.from_iterable(personid_name_from_signature(r) for r in results_dict.keys()))

        matr = ((results_dict[x] for x in get_claimed_papers(pid) if x in results_dict) for pid in pids)
        matr = (dict((k, len(list(d))) for k, d in groupby(sorted(row))) for row in matr)
        matr = [[row.get(i, 0) for i in xrange(results_clusters)] for row in matr]

        r_match += sum(m[2] for m in maximized_mapping(matr))
        r_total += sum(sum(row) for row in matr)

    return r_match, r_total
예제 #2
0
    def match_cluster_sets(cs1, cs2):
        """
        This functions tries to generate the best matching
        between cs1 and cs2 acoarding to the shared bibrefrecs.
        It returns a dictionary with keys, clsuters in cs1,
        and values, clusters in cs2.
        @param and type of cs1 and cs2: cluster_set
        @return: dictionary with the matching clusters.
        @return type: { cluster : cluster }
        """

        matr = [[len(cl1.bibs & cl2.bibs) for cl2 in cs2.clusters] for cl1 in cs1.clusters]
        mapping = maximized_mapping(matr)
        return dict((cs1.clusters[mappy[0]], cs2.clusters[mappy[1]]) for mappy in mapping)
예제 #3
0
def merge():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_signature_info(sig)
        claimed = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        rejected = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            move_signature(sig, free_pids.next())
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, unused in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
예제 #4
0
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)