def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    # The lower bound of the edges being processed by the wedge algorithm.
    global edge_cut_prob
    global wedge_thrsh

    if not force_wedge_thrsh:
        edge_cut_prob = bconfig.WEDGE_THRESHOLD / 3.
        wedge_thrsh = bconfig.WEDGE_THRESHOLD
    else:
        edge_cut_prob = force_wedge_thrsh / 3.
        wedge_thrsh = force_wedge_thrsh

    matr = ProbabilityMatrix()
    matr.load(cluster_set.last_name)

    convert_cluster_set(cluster_set, matr)
    del matr # be sure that this is the last reference!

    do_wedge(cluster_set)

    report = []
    if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status:
        msg = []
        for cl1 in cluster_set.clusters:
            for cl2 in cluster_set.clusters:
                if cl2 > cl1:
                    id1 = cluster_set.clusters.index(cl1)
                    id2 = cluster_set.clusters.index(cl2)
                    c12 = _compare_to(cl1,cl2)
                    c21 = _compare_to(cl2,cl1)
                    report.append((id1,id2,c12+c21))
                    msg.append( ' %s vs %s : %s + %s = %s -- %s' %  (id1, id2, c12, c21, c12+c21, cl1.hates(cl2)))
        msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(msg)
        if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES:
            print
            print msg
            print
        wedge_print(msg)


    restore_cluster_set(cluster_set)

    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    if report_cluster_status:
        destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (str(PID()),str(cluster_set.last_name),str(wedge_thrsh))
        f = open(destfile, 'w')
        SER.dump([wedge_thrsh,cluster_set.last_name,report,cluster_set.num_all_bibs],f)
        f.close()
    gc.collect()
Esempio n. 2
0
def wedge(cluster_set):
    matr = ProbabilityMatrix()
    matr.load(cluster_set.last_name)

    convert_cluster_set(cluster_set, matr)
    del matr  # be sure that this is the last reference!

    do_wedge(cluster_set)

    restore_cluster_set(cluster_set)

    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()
def wedge(cluster_set):
    matr = ProbabilityMatrix()
    matr.load(cluster_set.last_name)

    convert_cluster_set(cluster_set, matr)
    del matr # be sure that this is the last reference!

    do_wedge(cluster_set)

    restore_cluster_set(cluster_set)

    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()