def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    """Run the wedge algorithm on ``cluster_set`` and optionally report on it.

    The module-level globals ``edge_cut_prob`` and ``wedge_thrsh`` are
    reassigned on every call, before any clustering work is done.

    :param cluster_set: object whose ``last_name``, ``clusters`` and
        ``num_all_bibs`` attributes are read here; it is passed to
        ``convert_cluster_set``, ``do_wedge`` and ``restore_cluster_set``.
    :param report_cluster_status: when true, pairwise cluster scores are
        collected and dumped with ``SER.dump`` to a file under
        ``/tmp/baistats/``. The directory is not created here, so ``open``
        raises if it does not exist.
    :param force_wedge_thrsh: threshold to use instead of
        ``bconfig.WEDGE_THRESHOLD``. Any falsy value (``False``, ``None``,
        ``0``) selects the configured threshold.
    :return: ``None``.
    """
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh

    if not force_wedge_thrsh:
        threshold = bconfig.WEDGE_THRESHOLD
    else:
        threshold = force_wedge_thrsh
    edge_cut_prob = threshold / 3.
    wedge_thrsh = threshold

    matr = ProbabilityMatrix()
    matr.load(cluster_set.last_name)

    convert_cluster_set(cluster_set, matr)
    del matr  # be sure that this is the last reference!

    do_wedge(cluster_set)

    report = []
    if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status:
        lines = []
        clusters = cluster_set.clusters
        # enumerate() supplies each cluster's position directly instead of
        # two linear clusters.index() scans per pair.
        for id1, cl1 in enumerate(clusters):
            for id2, cl2 in enumerate(clusters):
                # Only pairs with cl2 > cl1 are scored (a cluster is never
                # paired with itself; presumably each unordered pair is
                # visited once -- depends on the cluster ordering).
                if cl2 > cl1:
                    c12 = _compare_to(cl1, cl2)
                    c21 = _compare_to(cl2, cl1)
                    total = c12 + c21
                    report.append((id1, id2, total))
                    lines.append(' %s vs %s : %s + %s = %s -- %s'
                                 % (id1, id2, c12, c21, total, cl1.hates(cl2)))
        msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(lines)
        if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES:
            # Single-argument parenthesised form: same output as the bare
            # Python 2 print statements, and also valid Python 3 syntax.
            print('')
            print(msg)
            print('')
        wedge_print(msg)

    restore_cluster_set(cluster_set)

    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    if report_cluster_status:
        destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (
            str(PID()), str(cluster_set.last_name), str(wedge_thrsh))
        f = open(destfile, 'w')
        try:
            SER.dump([wedge_thrsh, cluster_set.last_name, report,
                      cluster_set.num_all_bibs], f)
        finally:
            # Close the handle even if serialisation fails.
            f.close()
    gc.collect()
def wedge(cluster_set):
    """Run the wedge algorithm on ``cluster_set``.

    A ``ProbabilityMatrix`` is loaded for ``cluster_set.last_name`` and
    handed to ``convert_cluster_set``; then ``do_wedge`` and
    ``restore_cluster_set`` are applied to the cluster set in that order.
    When ``bconfig.DEBUG_CHECKS`` is set, two debug self-checks of the
    cluster set are asserted afterwards.

    NOTE(review): an earlier ``wedge`` with extra optional parameters is
    defined above; if both live in the same module, this later definition
    replaces it -- confirm that is intended.

    :param cluster_set: object exposing ``last_name`` and the
        ``_debug_*`` check methods used below.
    :return: ``None``.
    """
    prob_matrix = ProbabilityMatrix()
    prob_matrix.load(cluster_set.last_name)
    convert_cluster_set(cluster_set, prob_matrix)
    # be sure that this is the last reference!
    del prob_matrix

    do_wedge(cluster_set)
    restore_cluster_set(cluster_set)

    if not bconfig.DEBUG_CHECKS:
        return
    assert cluster_set._debug_test_hate_relation()
    assert cluster_set._debug_duplicated_recs()