Example #1
import itertools
import logging
from multiprocessing import Pool


def exhaustive_ll(N, nparts, parallel=True):
    global ex_ll_graph, ex_ll_nparts, ex_ll_author_prod_map, ex_ll_ref_prt
    ex_ll_graph, ex_ll_author_prod_map, cluster_sizes = gen_synthetic_graph(N, nparts)
    N = sum(cluster_sizes)      # the generated cluster sizes may not sum exactly to the requested N, so use the actual total
    ex_ll_nparts = nparts
    ex_ll_graph, ex_ll_author_prod_map = HardEM._preprocess_graph_and_map(ex_ll_graph, ex_ll_author_prod_map)
    # reference partitioning
    ex_ll_ref_prt = []
    for i in range(len(cluster_sizes)):
        ex_ll_ref_prt.extend([i]*cluster_sizes[i])
    ex_ll_ref_prt = tuple(ex_ll_ref_prt)
    # all possible partitionings into at most `nparts` non-empty parts
    partitions = itertools.chain(*[gen_partition(N, nparts_i) for nparts_i in range(1, nparts + 1)])
    logging.info('Processing %d partitions' % sum(stirling2(N, nparts_i) for nparts_i in range(1, nparts + 1)))
    if parallel:
        p = Pool()
        v = p.imap(em_ll_map, partitions)
        p.close(); p.join()     # stop accepting work and wait for the workers to score every partition
    else:
        v = itertools.imap(em_ll_map, partitions)   # Python 2's lazy map
    v = list(v)     # v is a lazy iterator; materialize it as a list so it can be read more than once
    # find the logl for the presumed correct partitioning
    ref_ll = 0
    for vv in v:
        if vv[0] == ex_ll_ref_prt:
            ref_ll = vv[1]
            break
    else:
        # the for-else branch runs only if no break occurred, i.e. the reference partitioning was never enumerated
        logging.error('The correct partitioning was not found')
    # keep only one partitioning per log-likelihood value and print the top ten (diagnostic code, disabled)
    # v_dict = {ll: prt for prt, ll in v}
    # v = v_dict.items()
    # v.sort(key=lambda tup: tup[0], reverse=True)
    # for i in range(0, min(10, len(v))):
    #     print '#%d\t%s' % (i, v[i])
    # print '##\t%s' % ((ref_ll, ex_ll_ref_prt),)
    return v, cluster_sizes, ex_ll_graph
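
The module-level helpers used above (gen_synthetic_graph, em_ll_map, HardEM, gen_partition, stirling2) are defined elsewhere in the repository. From how their results are consumed, gen_partition(N, k) must yield each partition of the N nodes into exactly k non-empty blocks as a length-N tuple of block labels (the same encoding as the reference partitioning ex_ll_ref_prt), and stirling2(N, k) must count those partitions. The sketch below is one plausible, self-contained implementation under those assumptions, not the repository's actual code:

def gen_partition(n, k):
    """Yield every partition of n labeled items into exactly k non-empty blocks,
    encoded as a length-n tuple of block labels in canonical (restricted growth)
    form, e.g. (0, 0, 1, 0, 2) -- the same encoding as ex_ll_ref_prt above."""
    def rec(prefix, used):
        if len(prefix) == n:
            if used == k:
                yield tuple(prefix)
            return
        # reuse one of the existing labels, or open a new block (label == used)
        for label in range(min(used + 1, k)):
            for p in rec(prefix + [label], max(used, label + 1)):
                yield p
    return rec([], 0)


def stirling2(n, k):
    """Stirling number of the second kind S(n, k): the number of partitions of
    n labeled items into exactly k non-empty blocks, computed with the
    recurrence S(n, k) = k*S(n-1, k) + S(n-1, k-1)."""
    if k > n or k < 0:
        return 0
    if n == 0:
        return 1 if k == 0 else 0
    row = [1] + [0] * k                 # row[j] = S(0, j)
    for i in range(1, n + 1):
        new = [0] * (k + 1)             # S(i, 0) = 0 for i >= 1
        for j in range(1, min(i, k) + 1):
            new[j] = j * row[j] + row[j - 1]
        row = new
    return row[k]


# sanity check: the generator and the count agree, e.g. S(4, 2) == 7
assert sum(1 for _ in gen_partition(4, 2)) == stirling2(4, 2) == 7

With helpers of this shape in place, a call such as exhaustive_ll(8, 3) would enumerate every partitioning of the (roughly eight-node) synthetic graph into at most three clusters, score each one via em_ll_map, and return the scored list together with the generated cluster sizes and graph.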