def _decide(cl1, cl2):
    """Decide whether two clusters should be merged.

    Scores the pair in both directions with _compare_to and compares the
    sum against the module-level wedge threshold.

    Returns a (should_join, combined_score) tuple.
    """
    forward = _compare_to(cl1, cl2)
    backward = _compare_to(cl2, cl1)
    total = forward + backward
    wedge_print("Wedge: _decide (%f+%f) = %f cmp to %f" % (forward, backward, total, wedge_thrsh))
    return total > wedge_thrsh, total
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False): # The lower bound of the edges being processed by the wedge algorithm. global edge_cut_prob global wedge_thrsh if not force_wedge_thrsh: edge_cut_prob = bconfig.WEDGE_THRESHOLD / 4. wedge_thrsh = bconfig.WEDGE_THRESHOLD else: edge_cut_prob = force_wedge_thrsh / 4. wedge_thrsh = force_wedge_thrsh matr = ProbabilityMatrix(cluster_set.last_name) matr.load() global h5file h5filepath = bconfig.TORTOISE_FILES_PATH+'wedge_cache_'+str(PID()) h5file = h5py.File(h5filepath) convert_cluster_set(cluster_set, matr) del matr # be sure that this is the last reference! do_wedge(cluster_set) report = [] if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status: msg = [] for cl1 in cluster_set.clusters: for cl2 in cluster_set.clusters: if cl2 > cl1: id1 = cluster_set.clusters.index(cl1) id2 = cluster_set.clusters.index(cl2) c12 = _compare_to(cl1,cl2) c21 = _compare_to(cl2,cl1) report.append((id1,id2,c12+c21)) msg.append( ' %s vs %s : %s + %s = %s -- %s' % (id1, id2, c12, c21, c12+c21, cl1.hates(cl2))) msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(msg) if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES: print print msg print wedge_print(msg) restore_cluster_set(cluster_set) if bconfig.DEBUG_CHECKS: assert cluster_set._debug_test_hate_relation() assert cluster_set._debug_duplicated_recs() if report_cluster_status: destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (str(PID()),str(cluster_set.last_name),str(wedge_thrsh)) f = filehandler.open(destfile, 'w') SER.dump([wedge_thrsh,cluster_set.last_name,report,cluster_set.num_all_bibs],f) f.close() gc.collect() h5file.close() os.remove(h5filepath)
def _compare_to(cl1, cl2):
    """Return the one-directional compatibility score of cl1 towards cl2.

    Looks up the cached (value, probability) edge pairs stored for cl1 in
    the module-level ``h5file`` (one pair per bib in cl2) and folds them
    into a single score in [0, 0.5].  Special edge values short-circuit:
    a quarrel edge forces 0 and a confirmed edge forces 0.5.
    """
    # Edge table for cl1, keyed by the cluster object's id() (the same key
    # the conversion step used when populating h5file).
    cl1_out_edges = h5file[str(id(cl1))]
    pointers = [cl1_out_edges[v] for v in cl2.bibs]

    assert pointers, PID() + "Wedge: no edges between clusters!"

    # pointers is a sequence of (value, probability) pairs.
    vals, probs = zip(*pointers)

    wedge_print("Wedge: _compare_to: vals = %s, probs = %s" %
                (str(vals), str(probs)))

    if SP_QUARREL in vals:
        # At least one explicit "different person" edge: never join.
        ret = 0.
        wedge_print('Wedge: _compare_to: - edge present, returning 0')

    elif SP_CONFIRM in vals:
        # At least one explicitly confirmed edge: maximal one-way score.
        ret = 0.5
        wedge_print('Wedge: _compare_to: + edge present, returning 0.5')

    else:
        avg = sum(vals) / len(vals)
        if avg > eps:
            # Normalize each value by the mean, exponent-weighted by its
            # probability.
            nvals = [(val / avg)**prob for val, prob in pointers]
        else:
            wedge_print(
                "Wedge: _compare_to: vals too low to compare, skipping")
            return 0

        # Spread measure of the normalized values (see _gini) times the
        # probability-weighted average value, halved to stay within 0.5.
        coeff = _gini(nvals)

        weight = sum(starmap(mul, pointers)) / sum(probs)

        ret = (coeff * weight) / 2.

        assert ret <= 0.5, PID(
        ) + 'COMPARE_TO big value returned ret %s coeff %s weight %s nvals %s vals %s prob %s' % (
            ret, coeff, weight, nvals, vals, probs)

        wedge_print(
            "Wedge: _compare_to: coeff = %f, weight = %f, retval = %f" % (
                coeff, weight, ret))

    return ret
def _compare_to(cl1, cl2):
    """One-directional compatibility score of cl1 towards cl2, in [0, 0.5].

    Reads the cached (value, probability) edge pairs for cl1 against every
    bib of cl2 from the module-level h5file.  A quarrel edge forces 0 and
    a confirmed edge forces 0.5; otherwise the score combines the spread
    of the normalized values with their probability-weighted average.
    """
    edge_table = h5file[str(id(cl1))]
    pairs = [edge_table[bib] for bib in cl2.bibs]
    assert pairs, PID()+"Wedge: no edges between clusters!"

    vals, probs = zip(*pairs)
    wedge_print("Wedge: _compare_to: vals = %s, probs = %s" % (str(vals), str(probs)))

    # Special edges short-circuit the computation.
    if SP_QUARREL in vals:
        wedge_print('Wedge: _compare_to: - edge present, returning 0')
        return 0.
    if SP_CONFIRM in vals:
        wedge_print('Wedge: _compare_to: + edge present, returning 0.5')
        return 0.5

    mean = sum(vals) / len(vals)
    if mean <= eps:
        wedge_print("Wedge: _compare_to: vals too low to compare, skipping")
        return 0

    # Normalize each value by the mean, exponent-weighted by probability.
    normalized = [(value / mean) ** prob for value, prob in pairs]
    coeff = _gini(normalized)
    weight = sum(starmap(mul, pairs)) / sum(probs)
    score = (coeff * weight) / 2.
    assert score <= 0.5, PID()+'COMPARE_TO big value returned ret %s coeff %s weight %s nvals %s vals %s prob %s' % (score, coeff, weight, normalized, vals, probs)
    wedge_print("Wedge: _compare_to: coeff = %f, weight = %f, retval = %f" % (coeff, weight, score))
    return score
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange ``cluster_set`` according to the values in the probability
    matrix: '+' edges join clusters outright, '-' edges make them quarrel,
    and the remaining value edges are streamed through the wedge decision
    procedure (_decide).

    The edges are produced by ``group_sort_edges`` in a child process and
    exchanged through per-PID cache files on disk; the caches are closed
    and removed before returning.

    @param cluster_set: cluster set to rearrange in place.
    @param deep_debug: dump a .dot snapshot of the cluster graph for every
        processed edge.  Very verbose -- avoid with more than 20 bibs.
    '''
    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()

    # Run the edge grouping/sorting in a separate process so its working
    # memory is returned to the OS when it exits.
    p = Process(target=group_sort_edges, args=(cluster_set, original_process_id))
    p.start()
    p.join()

    # Cache files written by group_sort_edges: plus edges, minus edges,
    # value edges, and the pickled (len_plus, len_minus, len_edges) triple.
    cache_prefix = bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_'
    plus_edges_fp = open(cache_prefix + 'p_' + str(original_process_id), 'r')
    minus_edges_fp = open(cache_prefix + 'm_' + str(original_process_id), 'r')
    edges_fp = open(cache_prefix + 'e_' + str(original_process_id), 'r')
    data_fp = open(cache_prefix + 'data_' + str(original_process_id), 'r')

    # Fix: the original closed the handles and removed the cache files only
    # on the success path (and closed data_fp twice); do it in a finally so
    # nothing leaks when an error occurs mid-run.
    try:
        len_plus, len_minus, len_edges = cPickle.load(data_fp)

        interval = 1000
        for i, s in enumerate(plus_edges_fp):
            bib1, bib2, unused = _unpack_vals(s)
            if (i % interval) == 0:
                update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
            cl1 = bib_map[bib1]
            cl2 = bib_map[bib2]
            if cl1 != cl2 and not cl1.hates(cl2):
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                # Re-point every bib of the absorbed cluster to the survivor.
                for v in cl2.bibs:
                    bib_map[v] = cl1
        update_status_final("Agglomerating obvious clusters done.")

        interval = 1000
        for i, s in enumerate(minus_edges_fp):
            bib1, bib2, unused = _unpack_vals(s)
            if (i % interval) == 0:
                update_status(float(i) / len_minus, "Dividing obvious clusters...")
            cl1 = bib_map[bib1]
            cl2 = bib_map[bib2]
            if cl1 != cl2 and not cl1.hates(cl2):
                cl1.quarrel(cl2)
        update_status_final("Dividing obvious clusters done.")

        interval = 50000
        wedge_print("Wedge: New wedge, %d edges." % len_edges)
        current = -1
        for s in edges_fp:
            v1, v2, unused = _unpack_vals(s)
            current += 1
            if (current % interval) == 0:
                update_status(float(current) / len_edges, "Wedge...")

            # Signed edges must have been consumed by the two loops above.
            assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"

            cl1 = bib_map[v1]
            cl2 = bib_map[v2]
            # id() is used instead of clusters.index() to avoid an O(n)
            # list search on every edge.
            idcl1 = id(cl1)
            idcl2 = id(cl2)

            #keep the ids low!
            if idcl1 > idcl2:
                idcl1, idcl2 = idcl2, idcl1
                cl1, cl2 = cl2, cl1

            wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

            if cl1 != cl2 and not cl1.hates(cl2):
                if deep_debug:
                    export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

                decision, value = _decide(cl1, cl2)
                if decision:
                    wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                    join(cl1, cl2)
                    cluster_set.clusters.remove(cl2)
                    for v in cl2.bibs:
                        bib_map[v] = cl1
                else:
                    wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                    cl1.quarrel(cl2)
            elif cl1 == cl2:
                wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
            else:
                wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

        update_status_final("Wedge done.")
        bibauthor_print("")

        if deep_debug:
            export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
    finally:
        plus_edges_fp.close()
        minus_edges_fp.close()
        edges_fp.close()
        data_fp.close()
        # Best-effort removal of the cache files.  Fix: catch OSError only,
        # instead of a bare except that also swallowed KeyboardInterrupt.
        for suffix in ('p_', 'm_', 'e_', 'data_'):
            try:
                os.remove(cache_prefix + suffix + str(original_process_id))
            except OSError:
                pass
def _decide(cl1, cl2):
    """Return (join?, score): the clusters should be joined when the
    symmetric compatibility score exceeds the global wedge threshold."""
    scores = (_compare_to(cl1, cl2), _compare_to(cl2, cl1))
    combined = sum(scores)
    wedge_print("Wedge: _decide (%f+%f) = %f cmp to %f" % (scores[0], scores[1], combined, wedge_thrsh))
    return combined > wedge_thrsh, combined
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability
    matrix: '+' edges join clusters outright, '-' edges make them quarrel,
    and the remaining value edges (sorted best-first) are fed through the
    wedge decision procedure (_decide).

    The deep debug option will produce a lot of output.  Avoid using it
    with more than 20 bibs in the cluster set.
    '''
    # Maps every bib to the cluster currently containing it; kept in sync
    # as clusters are joined below.
    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            # Re-point every bib of the absorbed cluster to the survivor.
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    # Process the most promising edges first.
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        # Signed edges must have been consumed by the two loops above.
        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"

        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        # NOTE(review): list.index is O(n) per edge; the file's other
        # do_wedge variant switches to id() for this reason.
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability
    matrix: '+' edges join clusters outright, '-' edges make them quarrel,
    and the remaining value edges are streamed through the wedge decision
    procedure (_decide).

    The edges are produced by group_sort_edges in a child process and
    exchanged through per-PID cache files on disk, which are removed
    before returning.

    The deep debug option will produce a lot of output.  Avoid using it
    with more than 20 bibs in the cluster set.
    '''
    # Maps every bib to the cluster currently containing it; kept in sync
    # as clusters are joined below.
    bib_map = create_bib_2_cluster_dict(cluster_set)

    original_process_id = PID()

    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    # Run the grouping/sorting in a child process -- presumably so its
    # working memory is returned to the OS when it exits (TODO confirm).
    p = Process(target=group_sort_edges, args=(cluster_set, original_process_id))
    p.start()
    p.join()

    # Cache files written by group_sort_edges: plus edges, minus edges,
    # value edges, and the pickled (len_plus, len_minus, len_edges) triple.
    plus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'r')
    minus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'r')
    edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
        str(original_process_id), 'r')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'r')

    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(
                float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            # Re-point every bib of the absorbed cluster to the survivor.
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        # Signed edges must have been consumed by the two loops above.
        assert unused != '+' and unused != '-', PID(
        ) + "Signed edge after filter!"

        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print(
            "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
            % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" %
                            (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %
                            (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" %
                        (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" %
                        (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set,
                      "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    # NOTE(review): data_fp was already closed above; this second close is
    # redundant (harmless on a file object).
    data_fp.close()

    # Best-effort cleanup of the cache files.
    # NOTE(review): the bare except also swallows KeyboardInterrupt and
    # SystemExit -- consider narrowing to OSError.
    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
                  str(original_process_id))
    except:
        pass