def group_edges(cs):
    '''
    Split the outgoing edges of every cluster into three buckets.

    @param cs: cluster set; each cluster exposes `bibs` and `out_edges`
    @return: (plus, minus, pairs). plus and minus hold (bib1, bib2)
        tuples for edges coded with the "+" / "-" special symbols.
        pairs holds (bib1, bib2, value) triples for edges whose first
        component is not a special number and exceeds edge_cut_prob.
    '''
    positive = []
    negative = []
    valued = []

    total = len(cs.clusters)
    for position, cluster in enumerate(cs.clusters):
        update_status(float(position) / total, "Grouping all edges...")

        # The first bib of the cluster is used as the edge origin.
        origin = tuple(cluster.bibs)[0]
        out_edges = cluster.out_edges

        for target in xrange(len(out_edges)):
            edge = out_edges[target]
            head = edge[0]

            if head not in special_numbers:
                # Ordinary edge: keep it only above the cut-off.
                if head > edge_cut_prob:
                    valued.append((origin, target, edge))
                continue

            if head == special_symbols["+"]:
                positive.append((origin, target))
            elif head == special_symbols["-"]:
                negative.append((origin, target))
            else:
                assert head == special_symbols[None]

    update_status_final("Finished with the edge grouping.")
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                    % (len(positive), len(negative), len(valued)))
    return positive, negative, valued
def group_edges(cs):
    '''
    Split the outgoing edges of every cluster into three buckets.

    The garbage collector is switched off while the lists are built and
    is switched back on when the function exits, including when an
    assertion fails.

    @param cs: cluster set; each cluster exposes `bibs` and `out_edges`
    @return: (plus, minus, pairs). plus and minus hold (bib1, bib2)
        tuples for edges coded with the '+' / '-' special symbols.
        pairs holds (bib1, bib2, value) triples for edges whose first
        component is not a special number and exceeds edge_cut_prob.
    '''
    plus = []
    minus = []
    pairs = []

    # Bind the lookup tables once instead of on every edge.
    special_numbers = Bib_matrix.special_numbers
    special_symbols = Bib_matrix.special_symbols
    num_clusters = len(cs.clusters)

    gc.disable()
    try:
        for current, cl1 in enumerate(cs.clusters):
            update_status(float(current) / num_clusters,
                          "Grouping all edges...")

            bib1 = tuple(cl1.bibs)[0]
            pointers = cl1.out_edges
            for bib2 in xrange(len(pointers)):
                val = pointers[bib2]
                if val[0] not in special_numbers:
                    if val[0] > edge_cut_prob:
                        pairs.append((bib1, bib2, val))
                elif val[0] == special_symbols['+']:
                    plus.append((bib1, bib2))
                elif val[0] == special_symbols['-']:
                    minus.append((bib1, bib2))
                else:
                    assert val[0] == special_symbols[None], "Invalid Edge"

        update_status_final("Finished with the edge grouping.")
        bibauthor_print(
            "Positive edges: %d, Negative edges: %d, Value edges: %d."
            % (len(plus), len(minus), len(pairs)))
    finally:
        # Without this, a failed assertion would leave the collector
        # disabled for the rest of the process.
        gc.enable()

    return plus, minus, pairs
def group_edges(cs):
    '''
    Split the outgoing edges of every cluster into three buckets.

    The garbage collector is switched off while the lists are built and
    is switched back on when the function exits, including when an
    assertion fails. Progress is reported once every 1000 clusters.

    @param cs: cluster set; each cluster exposes `bibs` and `out_edges`
    @return: (plus, minus, pairs). plus and minus hold (bib1, bib2)
        tuples for edges coded with the '+' / '-' special symbols.
        pairs holds (bib1, bib2, value) triples for edges whose first
        component is not a special number and exceeds edge_cut_prob.
    '''
    plus = []
    minus = []
    pairs = []

    # Bind the lookup tables once instead of on every edge.
    special_numbers = Bib_matrix.special_numbers
    special_symbols = Bib_matrix.special_symbols
    num_clusters = len(cs.clusters)
    interval = 1000

    gc.disable()
    try:
        for current, cl1 in enumerate(cs.clusters):
            if (current % interval) == 0:
                update_status(float(current) / num_clusters,
                              "Grouping all edges...")

            bib1 = tuple(cl1.bibs)[0]
            pointers = cl1.out_edges
            for bib2 in xrange(len(pointers)):
                val = pointers[bib2]
                if val[0] not in special_numbers:
                    if val[0] > edge_cut_prob:
                        pairs.append((bib1, bib2, val))
                elif val[0] == special_symbols['+']:
                    plus.append((bib1, bib2))
                elif val[0] == special_symbols['-']:
                    minus.append((bib1, bib2))
                else:
                    assert val[0] == special_symbols[None], "Invalid Edge"

        update_status_final("Finished with the edge grouping.")
        bibauthor_print(
            "Positive edges: %d, Negative edges: %d, Value edges: %d."
            % (len(plus), len(minus), len(pairs)))
    finally:
        # Without this, a failed assertion would leave the collector
        # disabled for the rest of the process.
        gc.enable()

    return plus, minus, pairs
def convert_cluster_set(cs, prob_matr):
    '''
    Convert a normal cluster set to a wedge cluster set, in place.

    Every bibrefrec is replaced by its position in one flat list, and
    that list is stored as cs.new2old. Each cluster then receives an
    `out_edges` attribute built by melding one edge vector per member.

    The garbage collector is switched off for the duration of the call
    and is switched back on when the function exits, including when an
    assertion fails.

    @param cs: a cluster set to be converted (modified in place)
    @type cs: cluster set
    @param prob_matr: object indexed as prob_matr[bibrefrec1, bibrefrec2]
    @return: None; the number -> bibrefrec mapping is left in cs.new2old
    '''
    gc.disable()
    try:
        # step 1:
        #    + Assign a number to each bibrefrec.
        #    + Replace the arrays of bibrefrecs with arrays of numbers.
        #    + Store the result in cs.new2old.
        result_mapping = []
        for clus in cs.clusters:
            start = len(result_mapping)
            result_mapping += list(clus.bibs)
            end = len(result_mapping)
            clus.bibs = range(start, end)

        assert len(result_mapping) == len(set(result_mapping)), \
            PID()+"Cluster set conversion failed"
        assert len(result_mapping) == cs.num_all_bibs, \
            PID()+"Cluster set conversion failed"

        cs.new2old = result_mapping

        # step 2:
        #    + Using the prob matrix create a vector values to all other bibs.
        #    + Meld those vectors into one for each cluster.
        special_symbols = Bib_matrix.special_symbols  # locality optimization
        none_code = special_symbols[None]
        num_bibs = len(result_mapping)
        num_clusters = len(cs.clusters)

        interval = 10000
        for current, c1 in enumerate(cs.clusters):
            if (current % interval) == 0:
                update_status(float(current) / num_clusters,
                              "Converting the cluster set...")

            assert len(c1.bibs) > 0, PID()+"Empty cluster send to wedge"
            pointers = []

            for v1 in c1.bibs:
                pointer = numpy.ndarray(shape=(num_bibs, 2),
                                        dtype=float, order='C')
                pointer.fill(none_code)
                rm = result_mapping[v1]  # locality optimization

                for c2 in cs.clusters:
                    if c1 != c2 and not c1.hates(c2):
                        for v2 in c2.bibs:
                            val = prob_matr[rm, result_mapping[v2]]
                            # Special values are coded as a pair of the
                            # same number; anything else is stored as is.
                            try:
                                numb = special_symbols[val]
                            except KeyError:
                                pass
                            else:
                                val = (numb, numb)
                            assert len(val) == 2, "Edge coding failed"
                            pointer[v2] = val

                pointers.append((pointer, 1))

            c1.out_edges = reduce(meld_edges, pointers)[0]

        update_status_final("Converting the cluster set done.")
    finally:
        # Without this, a failed assertion would leave the collector
        # disabled for the rest of the process.
        gc.enable()
def convert_cluster_set(cs, prob_matr):
    '''
    Convert a normal cluster set to a wedge cluster set, in place.

    Every bibrefrec is replaced by its position in one flat list, and
    that list is stored as cs.new2old. Each cluster then receives an
    `out_edges` attribute built by melding one edge vector per member.

    The garbage collector is switched off for the duration of the call
    and is switched back on when the function exits, including when an
    assertion fails.

    @param cs: a cluster set to be converted (modified in place)
    @type cs: cluster set
    @param prob_matr: object indexed as prob_matr[bibrefrec1, bibrefrec2]
    @return: None; the number -> bibrefrec mapping is left in cs.new2old
    '''
    gc.disable()
    try:
        # step 1:
        #    + Assign a number to each bibrefrec.
        #    + Replace the arrays of bibrefrecs with arrays of numbers.
        #    + Store the result in cs.new2old.
        result_mapping = []
        for clus in cs.clusters:
            start = len(result_mapping)
            result_mapping += list(clus.bibs)
            end = len(result_mapping)
            clus.bibs = range(start, end)

        assert len(result_mapping) == len(set(result_mapping)), \
            "Cluster set conversion failed"
        assert len(result_mapping) == cs.num_all_bibs, \
            "Cluster set conversion failed"

        cs.new2old = result_mapping

        # step 2:
        #    + Using the prob matrix create a vector values to all other bibs.
        #    + Meld those vectors into one for each cluster.
        special_symbols = Bib_matrix.special_symbols  # locality optimization
        none_code = special_symbols[None]
        num_bibs = len(result_mapping)
        num_clusters = len(cs.clusters)

        for current, c1 in enumerate(cs.clusters):
            update_status(float(current) / num_clusters,
                          "Converting the cluster set...")

            assert len(c1.bibs) > 0, "Empty cluster send to wedge"
            pointers = []

            for v1 in c1.bibs:
                pointer = numpy.ndarray(shape=(num_bibs, 2),
                                        dtype=float, order='C')
                pointer.fill(none_code)
                rm = result_mapping[v1]  # locality optimization

                for c2 in cs.clusters:
                    if c1 != c2 and not c1.hates(c2):
                        for v2 in c2.bibs:
                            val = prob_matr[rm, result_mapping[v2]]
                            # Special values are coded as a pair of the
                            # same number; anything else is stored as is.
                            try:
                                numb = special_symbols[val]
                            except KeyError:
                                pass
                            else:
                                val = (numb, numb)
                            assert len(val) == 2, "Edge coding failed"
                            pointer[v2] = val

                pointers.append((pointer, 1))

            c1.out_edges = reduce(meld_edges, pointers)[0]

        update_status_final("Converting the cluster set done.")
    finally:
        # Without this, a failed assertion would leave the collector
        # disabled for the rest of the process.
        gc.enable()
def recalculate(self, cluster_set):
    '''
    Rebuild the probability matrix for the given cluster set.

    For a pair whose two bibs are both among the bibs returned by
    __get_up_to_date_bibs(), a truthy value found in the previous
    matrix is reused; every other pair is computed with
    compare_bibrefrecs. Comparison caches are cleared periodically and
    once more at the end.

    @param cluster_set: A cluster set object, used to initialize the matrix.
    '''
    computed_at_last_flush = 0

    previous = self._bib_matrix
    fresh_bibs = self.__get_up_to_date_bibs()
    any_fresh = bool(fresh_bibs)
    self._bib_matrix = Bib_matrix(cluster_set)

    total_bibs = cluster_set.num_all_bibs
    expected = ((total_bibs * (total_bibs - 1)) / 2)
    if expected == 0:
        # Avoid dividing by zero in the progress ratio.
        expected = 1

    computed = 0
    reused = 0
    for left in cluster_set.clusters:
        update_status((float(reused) + computed) / expected,
                      "Prob matrix: calc %d, opti %d." % (computed, reused))

        # clean caches
        if computed - computed_at_last_flush > 2000000:
            clear_comparison_caches()
            computed_at_last_flush = computed

        for right in cluster_set.clusters:
            # id() ordering visits each unordered pair of clusters once.
            if id(left) < id(right) and not left.hates(right):
                for bib1 in left.bibs:
                    for bib2 in right.bibs:
                        val = None
                        if (any_fresh and bib1 in fresh_bibs
                                and bib2 in fresh_bibs):
                            val = previous[bib1, bib2]

                        if val:
                            reused += 1
                            if bconfig.DEBUG_CHECKS:
                                assert _debug_is_eq_v(
                                    val, compare_bibrefrecs(bib1, bib2))
                        else:
                            computed += 1
                            val = compare_bibrefrecs(bib1, bib2)

                        self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (computed, reused))
def convert_cluster_set(cs, prob_matr):
    """
    Convert a normal cluster set to a wedge cluster set.

    The bibs of every cluster are replaced by consecutive numbers and
    each cluster receives an `out_edges` attribute melded from one edge
    vector per member bib.

    @param cs: a cluster set to be converted (modified in place)
    @type cs: cluster set
    @param prob_matr: object indexed as prob_matr[bibrefrec1, bibrefrec2]
    @return: list mapping each assigned number to its bibrefrec
    """
    # step 1:
    #    + Number the bibrefrecs in cluster order.
    #    + Replace each cluster's bibrefrecs with their numbers.
    new2old = []
    for cluster in cs.clusters:
        first = len(new2old)
        new2old += list(cluster.bibs)
        cluster.bibs = range(first, len(new2old))

    assert len(new2old) == len(set(new2old))

    # step 2:
    #    + Build, for every bib, a vector of values to all other bibs.
    #    + Meld the vectors of a cluster into one.
    for position, source in enumerate(cs.clusters):
        update_status(float(position) / len(cs.clusters),
                      "Converting the cluster set...")

        assert len(source.bibs) > 0
        vectors = []

        for src_bib in source.bibs:
            vector = numpy.ndarray(shape=(len(new2old), 2),
                                   dtype=float, order="C")
            vector.fill(special_symbols[None])

            for target in cs.clusters:
                if source != target and not source.hates(target):
                    for dst_bib in target.bibs:
                        edge = prob_matr[new2old[src_bib], new2old[dst_bib]]
                        if edge in special_symbols:
                            # Special values are coded as a pair of the
                            # same number.
                            code = special_symbols[edge]
                            edge = (code, code)
                        assert len(edge) == 2
                        vector[dst_bib] = edge

            vectors.append((vector, 1))

        source.out_edges = reduce(meld_edges, vectors)[0]

    update_status_final("Converting the cluster set done.")

    return new2old
def recalculate(self, cluster_set):
    '''
    Construct the probability matrix anew for the given cluster set.

    The previous matrix is consulted only for pairs whose two bibs are
    both among the bibs returned by __get_up_to_date_bibs(); a falsy
    or missing entry is recomputed with compare_bibrefrecs.

    @param cluster_set: A cluster set object, used to initialize the matrix.
    '''
    flushed_at = 0
    stale_matrix = self._bib_matrix
    trusted = self.__get_up_to_date_bibs()
    has_trusted = bool(trusted)
    self._bib_matrix = Bib_matrix(cluster_set)

    ncl = cluster_set.num_all_bibs
    expected = ((ncl * (ncl - 1)) / 2)
    if expected == 0:
        # Avoid dividing by zero in the progress ratio.
        expected = 1

    calc_count, opti_count = 0, 0

    for first in cluster_set.clusters:
        progress = (float(opti_count) + calc_count) / expected
        message = "Prob matrix: calc %d, opti %d." % (calc_count, opti_count)
        update_status(progress, message)

        # clean caches
        if calc_count - flushed_at > 2000000:
            clear_comparison_caches()
            flushed_at = calc_count

        for second in cluster_set.clusters:
            # id() ordering visits each unordered pair of clusters once.
            if not id(first) < id(second):
                continue
            if first.hates(second):
                continue

            for bib1 in first.bibs:
                for bib2 in second.bibs:
                    key = (bib1, bib2)
                    from_cache = (has_trusted and bib1 in trusted
                                  and bib2 in trusted)

                    val = stale_matrix[key] if from_cache else None
                    if val:
                        opti_count += 1
                        if bconfig.DEBUG_CHECKS:
                            assert _debug_is_eq_v(
                                val, compare_bibrefrecs(bib1, bib2))
                    else:
                        calc_count += 1
                        val = compare_bibrefrecs(bib1, bib2)

                    self._bib_matrix[key] = val

    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt."
                        % (calc_count, opti_count))
pid = os.fork() if pid == 0: # child os.nice(int((float(sizs[idx]) * 20.0 / biggest))) run_job(job_idx) else: # parent pid_2_idx[pid] = job_idx assert free > sizs[job_idx] free -= sizs[job_idx] del free_idxs[idx] else: break pid, status = os.wait() assert pid in pid_2_idx idx = pid_2_idx[pid] freed = sizs[idx] done += freed ret_status[idx] = status free += freed del pid_2_idx[pid] update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs))) update_status_final("%d / %d" % (len(jobs), len(jobs))) assert is_eq(free, initial) assert not pid_2_idx assert not free_idxs assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) assert all(stat != None for stat in ret_status) return ret_status
def merge():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.

    For every last name the result clusters are matched against the
    person ids their signatures currently carry (maximized_mapping).
    Matched clusters are moved to the matched person id and unmatched
    clusters to a newly allocated one. Empty persons are deleted and
    canonical names are refreshed at the end.
    '''
    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    def get_free_pids():
        # Endless supply of newly allocated person ids.
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        '''
        Move `sig` to `target_pid` unless its current rows forbid it.

        Rows from get_signature_info are read as (pid, flag, ...).
        '''
        paps = get_signature_info(sig)
        # NOTE(review): the names below are kept from the original; confirm
        # that flag <= -2 really means "claimed" and flag >= 2 "rejected".
        claimed = [p for p in paps if p[1] <= -2]
        assigned = [p for p in paps if -2 < p[1] < 2]
        rejected = [p for p in paps if 2 <= p[1] and p[0] == target_pid]

        # Compare the pid of the assigned row with the target. Comparing
        # the whole row with a pid can never be equal, so the
        # "already on target" case was never detected.
        if claimed or not assigned or assigned[0][0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            move_signature(sig, next(free_pids))
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, next(free_pids))
                else:
                    # Evict the conflicting signature, then take its place.
                    move_signature(conflicts[0][:3], next(free_pids))
                    move_signature(sig, target_pid)

    num_names = len(last_names)
    for idx, last in enumerate(last_names):
        update_status(float(idx) / num_names,
                      "%d/%d current: %s" % (idx, num_names, last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        # Built as real lists because they are iterated more than once.
        results = [(k, [pair[1] for pair in d])
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for unused_key, ds in results:
            pids = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, unused_flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)

            # Count how many signatures of this cluster sit on each pid.
            matr.append(dict((k, len(list(d)))
                             for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, unused in best_match]
        not_matched_clusters = (frozenset(xrange(len(results)))
                                - frozenset(match[0] for match in best_match))
        # izip stops at the shorter input; free_pids is endless.
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    Bring the stored signatures of the given records in line with the
    author/coauthor references currently found on those records.

    @param bibrecs: an iterable full of bibrecs; when empty, all valid
        bibrecs are processed
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: when true, filter_bibrecs_outside is
        called with all valid bibrecs before processing
    @return: none
    '''
    compare_names = cached_sym(lambda x: x)(comp_names)

    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    touched_pids = set()
    deleted = frozenset(paper[0] for paper in get_deleted_papers())

    for position, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        total = len(bibrecs)
        update_status(float(position) / total,
                      "%d/%d current: %d" % (position, total, rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # Signatures as found on the record: (100, ref) for authors and
        # (700, ref) for coauthors.
        authors = get_authors_from_paper(rec)
        coauthors = get_coauthors_from_paper(rec)
        markrefs = frozenset(chain(((100, author[0]) for author in authors),
                                   ((700, coauthor[0])
                                    for coauthor in coauthors)))

        # Signatures as currently stored, with their names.
        personid_rows = [[int(field) for field in row[:3]] + [row[4]]
                         for row in get_signatures_from_rec(rec)]
        stored_names = dict(((row[1], row[2]), row[3])
                            for row in personid_rows)
        stored_refs = frozenset(stored_names)

        new_signatures = list(markrefs - stored_refs)
        old_signatures = list(stored_refs - markrefs)

        new_names = {}
        for fresh in new_signatures:
            raw_name = get_name_by_bibrecref(fresh)
            new_names[fresh] = create_normalized_name(
                split_name_parts(raw_name))

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_names[fresh], stored_names[stale])
                   for stale in old_signatures]
                  for fresh in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new_idx], old_signatures[old_idx])
                      for new_idx, old_idx, score in maximized_mapping(matrix)
                      if score > threshold]

        for fresh, stale in best_match:
            modify_signature(stale, rec, fresh, new_names[fresh])

        remove_sigs(tuple(list(stale) + [rec]) for stale in old_signatures)

        not_matched = (frozenset(new_signatures)
                       - frozenset(pair[0] for pair in best_match))

        if not_matched:
            used_pids = set(row[0] for row in personid_rows)

        for sig in not_matched:
            name = new_names[sig]
            candidates = [p for p in find_pids_by_exact_name(name)
                          if int(p[0]) not in used_pids]

            if candidates:
                chosen = candidates[0][0]
                add_signature(list(sig) + [rec], name, chosen)
            else:
                chosen = new_person_from_signature(list(sig) + [rec], name)

            used_pids.add(chosen)
            touched_pids.add(chosen)

    update_status_final()

    if touched_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(touched_pids)
def __init__(self, cluster_set, use_cache=False, save_cache=False):
    '''
    Constructs probability matrix. If use_cache is true, it will
    try to load old computations from the database. If save cache
    is true it will save the current results into the database.

    @param cluster_set: A cluster set object, used to initialize
        the matrix.
    @param use_cache: when true, try to load a stored matrix for
        cluster_set.last_name and reuse its values
    @param save_cache: when true, store the new matrix under
        cluster_set.last_name once it is complete
    '''
    self._bib_matrix = bib_matrix(cluster_set)
    previous = bib_matrix()

    bib_total = sum(len(cl.bibs) for cl in cluster_set.clusters)
    expected = ((bib_total * (bib_total - 1)) / 2)
    if expected == 0:
        # Avoid dividing by zero in the progress ratio.
        expected = 1

    reusable = set()
    if use_cache and previous.load(cluster_set.last_name):
        reusable = set(filter_modified_record_ids(previous.get_keys(),
                                                  previous.creation_time))

    if save_cache:
        creation_time = get_sql_time()

    computed = 0
    reused = 0
    for left in cluster_set.clusters:
        update_status((float(reused) + computed) / expected,
                      "Prob matrix: calc %d, opti %d." % (computed, reused))

        for right in cluster_set.clusters:
            # id() ordering visits each unordered pair of clusters once.
            if id(left) < id(right) and not left.hates(right):
                for bib1 in left.bibs:
                    for bib2 in right.bibs:
                        val = None
                        if bib1 in reusable and bib2 in reusable:
                            val = previous[bib1, bib2]

                        if val:
                            reused += 1
                            if bconfig.DEBUG_CHECKS:
                                assert _debug_is_eq_v(
                                    val, compare_bibrefrecs(bib1, bib2))
                        else:
                            computed += 1
                            # Drop the comparison caches once every
                            # 10000000 computed pairs.
                            if computed % 10000000 == 0:
                                clear_comparison_caches()
                            val = compare_bibrefrecs(bib1, bib2)

                        self._bib_matrix[bib1, bib2] = val

    clear_comparison_caches()

    if save_cache:
        update_status(1., "saving...")
        self._bib_matrix.store(cluster_set.last_name, creation_time)

    update_status_final("Matrix done. %d calc, %d opt." % (computed, reused))
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the
    probability matrix.

    '+' edges are joined first, '-' edges are made to quarrel next, and
    the remaining value edges are processed in descending order, each
    being joined or quarreled depending on the score of its clusters.

    The deep debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.

    @param cluster_set: the cluster set to rearrange (modified in place)
    @param deep_debug: when true, export dot files under /tmp
    '''

    def gini(values):
        ordered = sorted(values, reverse=True)
        odd_weights = xrange(1, 2 * len(ordered), 2)
        dividend = sum(starmap(mul, izip(ordered, odd_weights)))
        divisor = len(ordered) * sum(ordered)
        return float(dividend) / divisor

    def compare_to(cl1, cl2):
        pointers = [cl1.out_edges[v] for v in cl2.bibs]
        assert pointers, "Wedge: no edges between clusters!"

        vals, probs = zip(*pointers)
        avg = sum(vals) / len(vals)
        if not avg > eps:
            return 0

        coeff = gini((val / avg)**prob for val, prob in pointers)
        weight = sum(starmap(mul, pointers)) / sum(probs)

        wedge_print("Wedge: Decide: vals = %s, probs = %s"
                    % (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f"
                    % (coeff, weight))
        return coeff * weight

    def decide(cl1, cl2):
        forward = compare_to(cl1, cl2)
        backward = compare_to(cl2, cl1)
        return forward + backward > bconfig.WEDGE_THRESHOLD

    def absorb(keeper, absorbed):
        # Merge `absorbed` into `keeper` and repoint its bibs.
        join(keeper, absorbed)
        cluster_set.clusters.remove(absorbed)
        for v in absorbed.bibs:
            bib_map[v] = keeper

    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    plus_total = len(plus_edges)
    for i, (bib1, bib2) in enumerate(plus_edges):
        update_status(float(i) / plus_total,
                      "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            absorb(cl1, cl2)
    update_status_final("Agglomerating obvious clusters done.")

    minus_total = len(minus_edges)
    for i, (bib1, bib2) in enumerate(minus_edges):
        update_status(float(i) / minus_total,
                      "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    # Sort key: probability + certainty / 10
    edges = sorted(edges,
                   key=lambda edge: edge[2][0] + edge[2][1] / 10.,
                   reverse=True)

    interval = 1000
    edge_total = len(edges)
    wedge_print("Wedge: New wedge, %d edges." % edge_total)
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / edge_total, "Wedge...")

        assert unused != '+' and unused != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)"
                    % (v1, v2, unused[0], unused[1]))

        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        undecided = cl1 != cl2 and not cl1.hates(cl2)

        if not undecided:
            if cl1 == cl2:
                wedge_print("Wedge: Clusters already joined!")
            else:
                wedge_print("Wedge: Clusters hate each other!")
            continue

        if deep_debug:
            export_to_dot(cluster_set,
                          "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                          bib_map, (v1, v2, unused))

        if decide(cl1, cl2):
            wedge_print("Wedge: Joined!")
            absorb(cl1, cl2)
        else:
            wedge_print("Wedge: Quarreled!")
            cl1.quarrel(cl2)

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
os.nice(int((float(sizs[idx]) * 20.0 / biggest))) run_job(job_idx) else: # parent pid_2_idx[pid] = job_idx assert free > sizs[job_idx] free -= sizs[job_idx] del free_idxs[idx] else: break pid, status = os.wait() assert pid in pid_2_idx idx = pid_2_idx[pid] freed = sizs[idx] done += freed ret_status[idx] = status free += freed del pid_2_idx[pid] update_status( done / total, "%d / %d" % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs))) update_status_final("%d / %d" % (len(jobs), len(jobs))) assert is_eq(free, initial) assert not pid_2_idx assert not free_idxs assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) assert all(stat != None for stat in ret_status) return ret_status
def store(self, name):
    '''
    Store the wrapped matrix under the given name, reporting progress
    before and after.

    @param name: passed unchanged to self._bib_matrix.store
    '''
    update_status(0., "Saving probability matrix...")
    self._bib_matrix.store(name)
    update_status_final("Probability matrix saved.")
def load(self, lname, load_map=True, load_matrix=True):
    '''
    Load the wrapped matrix for the given name, reporting progress
    before and after. The result of the underlying load is not returned.

    @param lname: passed unchanged to self._bib_matrix.load
    @param load_map: passed unchanged to self._bib_matrix.load
    @param load_matrix: passed unchanged to self._bib_matrix.load
    '''
    update_status(0., "Loading probability matrix...")
    self._bib_matrix.load(lname, load_map, load_matrix)
    update_status_final("Probability matrix loaded.")
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the
    probability matrix.

    Edges coded '+' are joined, edges coded '-' are quarreled, and the
    remaining value edges are then handled from the highest sort key
    down.

    The deep debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.

    @param cluster_set: the cluster set to rearrange (modified in place)
    @param deep_debug: when true, export dot files under /tmp
    '''

    def score(source, target):
        # One-directional score of `source` towards the bibs of `target`.
        pointers = [source.out_edges[v] for v in target.bibs]
        assert pointers, "Wedge: no edges between clusters!"

        vals, probs = zip(*pointers)
        avg = sum(vals) / len(vals)
        if not avg > eps:
            return 0

        # Gini-style coefficient over the normalized values.
        normalized = sorted(((val / avg) ** prob for val, prob in pointers),
                            reverse=True)
        count = len(normalized)
        dividend = sum(starmap(mul, izip(normalized,
                                         xrange(1, 2 * count, 2))))
        divisor = count * sum(normalized)
        coeff = float(dividend) / divisor

        weight = sum(starmap(mul, pointers)) / sum(probs)

        wedge_print("Wedge: Decide: vals = %s, probs = %s"
                    % (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f"
                    % (coeff, weight))
        return coeff * weight

    def should_join(first, second):
        there = score(first, second)
        back = score(second, first)
        return there + back > bconfig.WEDGE_THRESHOLD

    def sort_key(edge):
        ''' probability + certainty / 10 '''
        probability, certainty = edge[2][0], edge[2][1]
        return probability + certainty / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    for index, (bib1, bib2) in enumerate(plus_edges):
        update_status(float(index) / len(plus_edges),
                      "Agglomerating obvious clusters...")
        keeper = bib_map[bib1]
        other = bib_map[bib2]
        if keeper != other and not keeper.hates(other):
            join(keeper, other)
            cluster_set.clusters.remove(other)
            for bib in other.bibs:
                bib_map[bib] = keeper
    update_status_final("Agglomerating obvious clusters done.")

    for index, (bib1, bib2) in enumerate(minus_edges):
        update_status(float(index) / len(minus_edges),
                      "Dividing obvious clusters...")
        keeper = bib_map[bib1]
        other = bib_map[bib2]
        if keeper != other and not keeper.hates(other):
            keeper.quarrel(other)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=sort_key, reverse=True)

    interval = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)"
                    % (v1, v2, unused[0], unused[1]))

        keeper = bib_map[v1]
        other = bib_map[v2]

        if keeper != other and not keeper.hates(other):
            if deep_debug:
                dot_path = "/tmp/%s%d.dot" % (cluster_set.last_name, current)
                export_to_dot(cluster_set, dot_path,
                              cluster_set.mapping, (v1, v2, unused))

            if should_join(keeper, other):
                wedge_print("Wedge: Joined!")
                join(keeper, other)
                cluster_set.clusters.remove(other)
                for bib in other.bibs:
                    bib_map[bib] = keeper
            else:
                wedge_print("Wedge: Quarreled!")
                keeper.quarrel(other)
        elif keeper == other:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name,
                      cluster_set.mapping)
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the
    probability matrix.

    Edges coded '+' are joined, edges coded '-' are quarreled, and the
    remaining value edges are then handled in the order given by
    _edge_sorting (descending), with _decide choosing between join and
    quarrel.

    The deep debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.

    @param cluster_set: the cluster set to rearrange (modified in place)
    @param deep_debug: when true, export dot files under /tmp
    '''

    def absorb(keeper, absorbed):
        # Merge `absorbed` into `keeper` and repoint its bibs.
        join(keeper, absorbed)
        cluster_set.clusters.remove(absorbed)
        for v in absorbed.bibs:
            bib_map[v] = keeper

    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    report_every = 1000

    plus_total = len(plus_edges)
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % report_every) == 0:
            update_status(float(i) / plus_total,
                          "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            absorb(cl1, cl2)
    update_status_final("Agglomerating obvious clusters done.")

    minus_total = len(minus_edges)
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % report_every) == 0:
            update_status(float(i) / minus_total,
                          "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    report_every = 500000
    edge_total = len(edges)
    wedge_print("Wedge: New wedge, %d edges." % edge_total)
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % report_every) == 0:
            update_status(float(current) / edge_total, "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"

        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        # keep the ids low: the cluster with the lower index comes first
        if idcl1 > idcl2:
            cl1, cl2 = cl2, cl1
            idcl1, idcl2 = idcl2, idcl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
                    % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        undecided = cl1 != cl2 and not cl1.hates(cl2)
        if not undecided:
            if cl1 == cl2:
                wedge_print("Wedge: Clusters already joined! (%s,%s)"
                            % (idcl1, idcl2))
            else:
                wedge_print("Wedge: Clusters hate each other! (%s,%s)"
                            % (idcl1, idcl2))
            continue

        if deep_debug:
            export_to_dot(cluster_set,
                          "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                          bib_map, (v1, v2, unused))

        decision, value = _decide(cl1, cl2)
        if decision:
            wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
            absorb(cl1, cl2)
        else:
            wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
            cl1.quarrel(cl2)

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)