def group_edges(cs):
    '''
    Sort the outgoing edges of every cluster in cs.clusters into three lists.

    For each cluster the first element of tuple(cluster.bibs) is used as the
    source vertex (bib1) and the index into cluster.out_edges as the target
    vertex (bib2). The first component of each edge value decides the bucket:

      * equal to Bib_matrix.special_symbols['+'] -> (bib1, bib2) goes to plus
      * equal to Bib_matrix.special_symbols['-'] -> (bib1, bib2) goes to minus
      * not in Bib_matrix.special_numbers and strictly greater than
        edge_cut_prob -> (bib1, bib2, val) goes to pairs
      * equal to Bib_matrix.special_symbols[None] -> the edge is dropped

    Any other special number trips the "Invalid Edge" assertion.

    @return: the tuple (plus, minus, pairs)
    '''
    plus = []
    minus = []
    pairs = []

    # The collector is switched off while the lists are filled (presumably to
    # avoid collection passes while many small tuples are allocated).  The
    # try/finally guarantees it is switched back on even when the loop raises,
    # e.g. on the "Invalid Edge" assertion.
    gc.disable()
    try:
        for current, cl1 in enumerate(cs.clusters):
            update_status(float(current) / len(cs.clusters),
                          "Grouping all edges...")

            bib1 = tuple(cl1.bibs)[0]
            pointers = cl1.out_edges
            for bib2 in xrange(len(cl1.out_edges)):
                val = pointers[bib2]
                head = val[0]
                if head not in Bib_matrix.special_numbers:
                    # Ordinary probability edge: keep it only above the cut.
                    if head > edge_cut_prob:
                        pairs.append((bib1, bib2, val))
                elif head == Bib_matrix.special_symbols['+']:
                    plus.append((bib1, bib2))
                elif head == Bib_matrix.special_symbols['-']:
                    minus.append((bib1, bib2))
                else:
                    assert head == Bib_matrix.special_symbols[None], \
                        "Invalid Edge"

        update_status_final("Finished with the edge grouping.")
        bibauthor_print(
            "Positive edges: %d, Negative edges: %d, Value edges: %d."
            % (len(plus), len(minus), len(pairs)))
    finally:
        gc.enable()

    return plus, minus, pairs
def group_edges(cs):
    '''
    Split the outgoing edges of all clusters in cs.clusters into three lists.

    The source vertex of an edge is the first element of tuple(cluster.bibs),
    the target vertex is the index of the edge inside cluster.out_edges.

    @return: (plus, minus, pairs) where plus / minus hold (bib1, bib2) for
        edges marked with special_symbols["+"] / special_symbols["-"], and
        pairs holds (bib1, bib2, val) for edges whose first component is not
        a special number and is strictly greater than edge_cut_prob.
    '''
    positive, negative, valued = [], [], []

    for position, cluster in enumerate(cs.clusters):
        progress = float(position) / len(cs.clusters)
        update_status(progress, "Grouping all edges...")

        source = tuple(cluster.bibs)[0]
        out_edges = cluster.out_edges
        for target in xrange(len(cluster.out_edges)):
            edge = out_edges[target]
            marker = edge[0]

            if marker not in special_numbers:
                # Plain value edge: only the ones above the cut are kept.
                if marker > edge_cut_prob:
                    valued.append((source, target, edge))
                continue

            if marker == special_symbols["+"]:
                positive.append((source, target))
            elif marker == special_symbols["-"]:
                negative.append((source, target))
            else:
                # The only remaining legal marker is the "no edge" symbol.
                assert marker == special_symbols[None]

    update_status_final("Finished with the edge grouping.")
    counts = (len(positive), len(negative), len(valued))
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d." % counts)
    return positive, negative, valued
def group_edges(cs):
    '''
    Sort the outgoing edges of every cluster in cs.clusters into three lists.

    For each cluster the first element of tuple(cluster.bibs) is the source
    vertex (bib1) and the index into cluster.out_edges is the target vertex
    (bib2). The first component of the edge value selects the bucket:

      * Bib_matrix.special_symbols['+'] -> (bib1, bib2) appended to plus
      * Bib_matrix.special_symbols['-'] -> (bib1, bib2) appended to minus
      * not a special number and strictly greater than edge_cut_prob
        -> (bib1, bib2, val) appended to pairs
      * Bib_matrix.special_symbols[None] -> ignored

    Any other special number trips the "Invalid Edge" assertion.
    The progress display is refreshed once every 1000 clusters.

    @return: the tuple (plus, minus, pairs)
    '''
    plus = []
    minus = []
    pairs = []
    interval = 1000

    # Garbage collection is suspended while the lists are built (presumably
    # to avoid collector passes during the bulk allocation).  The finally
    # clause makes sure it is re-enabled even if the loop raises; previously
    # an exception here left the collector disabled for the whole process.
    gc.disable()
    try:
        for current, cl1 in enumerate(cs.clusters):
            if (current % interval) == 0:
                update_status(float(current) / len(cs.clusters),
                              "Grouping all edges...")

            bib1 = tuple(cl1.bibs)[0]
            pointers = cl1.out_edges
            for bib2 in xrange(len(cl1.out_edges)):
                val = pointers[bib2]
                head = val[0]
                if head not in Bib_matrix.special_numbers:
                    # Ordinary probability edge: keep it only above the cut.
                    if head > edge_cut_prob:
                        pairs.append((bib1, bib2, val))
                elif head == Bib_matrix.special_symbols['+']:
                    plus.append((bib1, bib2))
                elif head == Bib_matrix.special_symbols['-']:
                    minus.append((bib1, bib2))
                else:
                    assert head == Bib_matrix.special_symbols[None], "Invalid Edge"

        update_status_final("Finished with the edge grouping.")
        bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                        % (len(plus), len(minus), len(pairs)))
    finally:
        gc.enable()

    return plus, minus, pairs
def main():
    """Start the bibauthorid daemon.

    If the bibauthorid_daemon module cannot be imported a message is printed
    through bibauthor_print and nothing else happens.
    """
    try:
        import bibauthorid_daemon as daemon
    except ImportError:
        bibauthor_print("Hmm...No Daemon process running.")
    else:
        # Only reached when the import succeeded; errors raised by the daemon
        # itself are deliberately not caught here.
        daemon.bibauthorid_daemon()
def tortoise_last_name(name, pure=False):
    '''
    Run disambiguate_last_name() for a single last name.

    The entries returned by create_lastname_list_from_personid() are searched
    for one whose first element equals name; the second element of the first
    such entry is taken as the list of pids.

    @param name: the name to look up
    @param pure: passed through unchanged to disambiguate_last_name()
    '''
    lname = generate_last_name_cluster_str(name)

    # NOTE(review): the lookup compares against the raw `name`, while the
    # derived `lname` is what gets handed on -- confirm which of the two
    # create_lastname_list_from_personid() uses as its key.
    candidates = create_lastname_list_from_personid()
    matches = [entry for entry in candidates if entry[0] == name]

    if not matches:
        bibauthor_print("Sorry, %s(%s) not found in the last name clusters" % (name, lname))
        return

    pids = matches[0][1]
    bibauthor_print("Found %s(%s), %d pids" % (name, lname, len(pids)))
    disambiguate_last_name(pids, lname, pure, False)
def disambiguate(cluster_set):
    '''
    Run the wedge on cluster_set and store the result.

    Before the work starts, the total number of bibs over all clusters and
    the number of bib pairs (reported as the maximum number of comparisons)
    are printed.

    @param cluster_set: object providing .clusters, .last_name and .store()
    '''
    bibs = 0
    for cluster in cluster_set.clusters:
        bibs += len(cluster.bibs)

    # Number of unordered pairs that can be formed from `bibs` elements.
    expected = bibs * (bibs - 1) / 2

    report = ("Start working on %s. Total number of bibs: %d, maximum number of comparisons: %d"
              % (cluster_set.last_name, bibs, expected))
    bibauthor_print(report)

    wedge(cluster_set)
    cluster_set.store()
def prepare_matirx(cluster_set, force):
    '''
    Bring the stored ProbabilityMatrix of cluster_set.last_name up to date.

    @param cluster_set: the cluster set the matrix belongs to
    @param force: when true, recompute even if the stored matrix is reported
        as up to date
    @return: True if the matrix was recalculated and stored, False if it was
        already up to date and nothing was done
    '''
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    matrix = ProbabilityMatrix()
    # First pass: only the map is loaded, which is what the freshness check
    # below is run against.
    matrix.load(cluster_set.last_name, load_map=True, load_matrix=False)

    must_rebuild = force or not matrix.is_up_to_date(cluster_set)
    if must_rebuild:
        # Second pass: load the matrix itself, recompute and persist it.
        matrix.load(cluster_set.last_name, load_map=False, load_matrix=True)
        matrix.recalculate(cluster_set)
        matrix.store(cluster_set.last_name)
        return True

    bibauthor_print("Cluster %s is up-to-date and therefore will not be computed."
                    % cluster_set.last_name)
    return False
def prepare_matirx(cluster_set, force):
    '''
    Recompute and store the ProbabilityMatrix for cluster_set if needed.

    The map of the stored matrix is loaded first; unless force is set, an
    up-to-date matrix is left untouched.

    @param cluster_set: the cluster set the matrix belongs to
    @param force: skip the up-to-date check and always recompute
    @return: False when nothing had to be done, True after a recalculation
    '''
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    prob_matrix = ProbabilityMatrix()
    prob_matrix.load(cluster_set.last_name, load_map=True, load_matrix=False)

    # Equivalent to "not force and is_up_to_date": the check is only
    # consulted when no recomputation is forced.
    if not (force or not prob_matrix.is_up_to_date(cluster_set)):
        message = ("Cluster %s is up-to-date and therefore will not be computed."
                   % cluster_set.last_name)
        bibauthor_print(message)
        # nothing to do
        return False

    prob_matrix.load(cluster_set.last_name, load_map=False, load_matrix=True)
    prob_matrix.recalculate(cluster_set)
    prob_matrix.store(cluster_set.last_name)
    return True
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the clusters of cluster_set according to the edge values
    returned by group_edges().

    Three passes are made:
      1. clusters connected by a '+' edge are joined,
      2. clusters connected by a '-' edge are made to quarrel,
      3. the remaining value edges are visited in descending order of
         (probability + certainty / 10); for every edge whose two clusters
         are distinct and do not hate each other, the clusters are either
         joined or made to quarrel depending on whether their combined
         score exceeds bconfig.WEDGE_THRESHOLD.

    The deep debug option writes a .dot file under /tmp for every edge that
    reaches the decision step, plus a final one. It produces a lot of
    output; avoid using it with more than 20 bibs in the cluster set.
    '''

    def concentration(values):
        # Weighted sum of the values in descending order (weights 1, 3, 5,
        # ...) divided by count * total.
        ordered = sorted(values, reverse=True)
        dividend = sum(value * (2 * rank + 1)
                       for rank, value in enumerate(ordered))
        divisor = len(ordered) * sum(ordered)
        return float(dividend) / divisor

    def directed_score(source, target):
        # Score of the edges leading from `source` to every bib of `target`.
        links = [source.out_edges[bib] for bib in target.bibs]
        assert links, "Wedge: no edges between clusters!"
        vals, probs = zip(*links)
        avg = sum(vals) / len(vals)
        if not avg > eps:
            return 0
        normalized = ((val / avg) ** prob for val, prob in links)
        coeff = concentration(normalized)
        weight = sum(val * prob for val, prob in links) / sum(probs)
        wedge_print("Wedge: Decide: vals = %s, probs = %s" % (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f" % (coeff, weight))
        return coeff * weight

    def should_join(first, second):
        # Both directions are scored; the sum is compared to the threshold.
        forward = directed_score(first, second)
        backward = directed_score(second, first)
        return forward + backward > bconfig.WEDGE_THRESHOLD

    def strength(edge):
        ''' probability + certainty / 10 '''
        payload = edge[2]
        return payload[0] + payload[1] / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    def absorb(keeper, absorbed):
        # Merge `absorbed` into `keeper` and repoint its bibs in the map.
        join(keeper, absorbed)
        cluster_set.clusters.remove(absorbed)
        for bib in absorbed.bibs:
            bib_map[bib] = keeper

    for step, (bib1, bib2) in enumerate(plus_edges):
        update_status(float(step) / len(plus_edges),
                      "Agglomerating obvious clusters...")
        left = bib_map[bib1]
        right = bib_map[bib2]
        if left != right and not left.hates(right):
            absorb(left, right)
    update_status_final("Agglomerating obvious clusters done.")

    for step, (bib1, bib2) in enumerate(minus_edges):
        update_status(float(step) / len(minus_edges),
                      "Dividing obvious clusters...")
        left = bib_map[bib1]
        right = bib_map[bib2]
        if left != right and not left.hates(right):
            left.quarrel(right)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=strength, reverse=True)

    report_every = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, value) in enumerate(edges):
        if current % report_every == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert value != '+' and value != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)"
                    % (v1, v2, value[0], value[1]))

        left = bib_map[v1]
        right = bib_map[v2]
        if left != right and not left.hates(right):
            if deep_debug:
                dot_path = "/tmp/%s%d.dot" % (cluster_set.last_name, current)
                export_to_dot(cluster_set, dot_path, bib_map, (v1, v2, value))

            if should_join(left, right):
                wedge_print("Wedge: Joined!")
                absorb(left, right)
            else:
                wedge_print("Wedge: Quarreled!")
                left.quarrel(right)
        elif left == right:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the clusters of cluster_set according to the edge values
    returned by group_edges().

    Clusters connected by '+' edges are joined first, clusters connected by
    '-' edges are made to quarrel next, and finally the value edges are
    visited in the descending order given by _edge_sorting; for each of them
    _decide() determines whether the two clusters are joined or quarrel.
    Within the last pass the cluster with the lower index in
    cluster_set.clusters is always the one that is kept.

    The deep debug option writes a .dot file under /tmp for every edge that
    reaches the decision step, plus a final one. It produces a lot of
    output; avoid using it with more than 20 bibs in the cluster set.
    '''
    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    def merge_into(keeper, absorbed):
        # Merge `absorbed` into `keeper` and repoint its bibs in the map.
        join(keeper, absorbed)
        cluster_set.clusters.remove(absorbed)
        for bib in absorbed.bibs:
            bib_map[bib] = keeper

    # The two "obvious" passes refresh the progress display every 1000 edges.
    obvious_interval = 1000

    for step, (bib1, bib2) in enumerate(plus_edges):
        if step % obvious_interval == 0:
            update_status(float(step) / len(plus_edges),
                          "Agglomerating obvious clusters...")
        left = bib_map[bib1]
        right = bib_map[bib2]
        if left != right and not left.hates(right):
            merge_into(left, right)
    update_status_final("Agglomerating obvious clusters done.")

    for step, (bib1, bib2) in enumerate(minus_edges):
        if step % obvious_interval == 0:
            update_status(float(step) / len(minus_edges),
                          "Dividing obvious clusters...")
        left = bib_map[bib1]
        right = bib_map[bib2]
        if left != right and not left.hates(right):
            left.quarrel(right)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    wedge_interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, value) in enumerate(edges):
        if current % wedge_interval == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert value != '+' and value != '-', PID() + "Signed edge after filter!"

        left = bib_map[v1]
        right = bib_map[v2]
        left_id = cluster_set.clusters.index(left)
        right_id = cluster_set.clusters.index(right)

        # keep the ids low: the cluster with the smaller index goes first
        if left_id > right_id:
            left_id, right_id, left, right = right_id, left_id, right, left

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
                    % (left_id, right_id, v1, v2, value[0], value[1]))

        if left != right and not left.hates(right):
            if deep_debug:
                dot_path = "/tmp/%s%d.dot" % (cluster_set.last_name, current)
                export_to_dot(cluster_set, dot_path, bib_map, (v1, v2, value))

            decision, score = _decide(left, right)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" % (left_id, right_id, score))
                merge_into(left, right)
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (left_id, right_id, score))
                left.quarrel(right)
        elif left == right:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (left_id, right_id))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (left_id, right_id))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the clusters of cluster_set according to the edge values
    returned by group_edges().

    Clusters connected by '+' edges are joined, clusters connected by '-'
    edges are made to quarrel, and the remaining value edges are processed
    in descending order of (probability + certainty / 10): the two clusters
    of an edge are joined when the sum of their two directed scores exceeds
    bconfig.WEDGE_THRESHOLD, otherwise they quarrel.

    The deep debug option writes a .dot file under /tmp for every edge that
    reaches the decision step, plus a final one. It produces a lot of
    output; avoid using it with more than 20 bibs in the cluster set.
    '''

    def spread(values):
        # Values in descending order weighted by 1, 3, 5, ..., normalised by
        # count * total.
        ranked = sorted(values, reverse=True)
        dividend = sum(item * (2 * position + 1)
                       for position, item in enumerate(ranked))
        divisor = len(ranked) * sum(ranked)
        return float(dividend) / divisor

    def one_way_score(origin, destination):
        # Score of the edges going from `origin` to the bibs of `destination`.
        pointers = [origin.out_edges[bib] for bib in destination.bibs]
        assert pointers, "Wedge: no edges between clusters!"
        vals, probs = zip(*pointers)
        avg = sum(vals) / len(vals)
        if not avg > eps:
            return 0
        scaled = ((val / avg) ** prob for val, prob in pointers)
        coeff = spread(scaled)
        weight = sum(starmap(mul, pointers)) / sum(probs)
        wedge_print("Wedge: Decide: vals = %s, probs = %s" % (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f" % (coeff, weight))
        return coeff * weight

    def over_threshold(score1, score2):
        return score1 + score2 > bconfig.WEDGE_THRESHOLD

    def verdict(first, second):
        there = one_way_score(first, second)
        back = one_way_score(second, first)
        return over_threshold(there, back)

    def sort_key(edge):
        ''' probability + certainty / 10 '''
        return edge[2][0] + edge[2][1] / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    for index, (bib1, bib2) in enumerate(plus_edges):
        update_status(float(index) / len(plus_edges),
                      "Agglomerating obvious clusters...")
        keeper, other = bib_map[bib1], bib_map[bib2]
        if keeper != other and not keeper.hates(other):
            join(keeper, other)
            cluster_set.clusters.remove(other)
            for bib in other.bibs:
                bib_map[bib] = keeper
    update_status_final("Agglomerating obvious clusters done.")

    for index, (bib1, bib2) in enumerate(minus_edges):
        update_status(float(index) / len(minus_edges),
                      "Dividing obvious clusters...")
        keeper, other = bib_map[bib1], bib_map[bib2]
        if keeper != other and not keeper.hates(other):
            keeper.quarrel(other)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=sort_key, reverse=True)

    interval = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, value) in enumerate(edges):
        if current % interval == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert value != '+' and value != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)"
                    % (v1, v2, value[0], value[1]))

        keeper, other = bib_map[v1], bib_map[v2]
        if keeper != other and not keeper.hates(other):
            if deep_debug:
                # NOTE(review): this export is given cluster_set.mapping,
                # whereas the joins in this function only update the local
                # bib_map -- confirm that mapping is the intended argument.
                dot_path = "/tmp/%s%d.dot" % (cluster_set.last_name, current)
                export_to_dot(cluster_set, dot_path, cluster_set.mapping,
                              (v1, v2, value))

            if verdict(keeper, other):
                wedge_print("Wedge: Joined!")
                join(keeper, other)
                cluster_set.clusters.remove(other)
                for bib in other.bibs:
                    bib_map[bib] = keeper
            else:
                wedge_print("Wedge: Quarreled!")
                keeper.quarrel(other)
        elif keeper == other:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name,
                      cluster_set.mapping)