コード例 #1
0
def g_update(comm, theta, g_theta, norm_const, K, rec, max_len, send_only=False, hash_mode=False):
    """Non-root side of a global update: report ``rec`` and send slices of ``theta`` to rank 0.

    Every collective used here (Bcast, Gather, Reduce, barrier) and every Send
    has to be matched, in the same order, by the code running on rank 0; that
    counterpart is not part of this function.

    Args:
        comm: mpi4py communicator shared with rank 0.
        theta: 2-D array-like, read as ``theta[:, mask][map_list]``.
        g_theta: not used by this function; kept so existing callers keep working.
        norm_const: buffer reduced onto rank 0 with ``MPI.SUM``; unless
            ``send_only`` is true it is then overwritten by a broadcast from rank 0.
        K: length of the default row selection (``range(K)``) and first
            dimension of the code buffer received in ``hash_mode``.
        rec: mask over the second axis of ``theta`` selecting what this rank sends.
        max_len: threshold on the number of entries set in the mask broadcast by
            rank 0; above it the masks are split with ``part_rec`` and sent piecewise.
        send_only: when true, do not receive ``norm_const`` back from rank 0.
        hash_mode: when true, receive a code matrix from rank 0 and let
            ``search`` choose the row order instead of ``range(K)``.
    """
    if hash_mode:
        # Receive buffer for rank 0's code matrix; its shape has to agree with
        # whatever rank 0 broadcasts.
        code = np.zeros((K, int(1e5)), dtype=np.int32)
        comm.Bcast([code, MPI.INT], root=0)
        code = code > 0

        code_b = get_code(theta[:, :])
        map_list = search(code, code_b)
    else:
        map_list = range(K)

    # Hand the local mask to rank 0, then receive the mask rank 0 broadcasts back
    # (presumably the combination of all ranks' masks -- confirm on the root side).
    comm.Gather([np.int32(rec), MPI.INT], [None, MPI.INT], root=0)
    g_rec = np.int32(rec).copy()
    comm.Bcast([g_rec, MPI.INT], root=0)
    g_rec = g_rec > 0

    comm.Reduce([norm_const, MPI.FLOAT], [None, MPI.FLOAT], op=MPI.SUM, root=0)
    if not send_only:
        comm.Bcast([norm_const, MPI.FLOAT], root=0)

    true_len = g_rec.sum()
    if true_len <= max_len:
        # Everything fits in one message; ranks with an empty mask send nothing.
        if rec.sum() != 0:
            theta_batch = theta[:, rec][map_list]
            comm.Send([theta_batch, MPI.FLOAT], dest=0, tag=112)
    else:
        # NOTE(review): the result of this call is never read in this function.
        # It is kept because part_rec is not visible here; drop it once
        # part_rec is confirmed to be side-effect free.
        g_mask_list = part_rec(g_rec, max_len)
        mask_list = part_rec(rec, max_len, g_rec=g_rec)

        for mask in mask_list:
            if mask.sum() != 0:
                theta_batch = theta[:, mask][map_list]
                comm.Send([theta_batch, MPI.FLOAT], dest=0, tag=112)

    comm.barrier()
コード例 #2
0
ファイル: LWsampler_l.py プロジェクト: gblackout/very_large
def g_update(comm, theta, g_theta, norm_const, K, rec, max_len, send_only=False, hash_mode=False):
    """Send this rank's marked slices of ``theta`` to rank 0 as part of a global update.

    The sequence of MPI calls below (Bcast, Gather, Bcast, Reduce, optional
    Bcast, Sends, barrier) must be mirrored step for step on rank 0; the
    root-side code is not part of this function.

    Args:
        comm: mpi4py communicator that includes rank 0.
        theta: 2-D array-like; slices are taken as ``theta[:, mask][map_list]``.
        g_theta: unused in this function; retained for interface compatibility.
        norm_const: buffer summed onto rank 0 (``MPI.SUM``) and, unless
            ``send_only`` is set, replaced by the value rank 0 broadcasts back.
        K: size of the default row selection ``range(K)``; also the first
            dimension of the code buffer received when ``hash_mode`` is set.
        rec: mask over the second axis of ``theta`` marking what to send.
        max_len: if the mask broadcast by rank 0 has more than this many entries
            set, the data is sent in several pieces produced by ``part_rec``.
        send_only: skip the broadcast that returns ``norm_const`` from rank 0.
        hash_mode: obtain the row order from ``search`` on a code matrix
            received from rank 0 rather than using ``range(K)``.
    """
    if hash_mode:
        # The receive buffer must have the same shape as the matrix rank 0 sends.
        code = np.zeros((K, int(1e5)), dtype=np.int32)
        comm.Bcast([code, MPI.INT], root=0)
        code = code > 0

        code_b = get_code(theta[:, :])
        map_list = search(code, code_b)
    else:
        map_list = range(K)

    # Report the local mask, then pick up the mask rank 0 broadcasts in return
    # (presumably the merged mask of all ranks -- confirm against the root side).
    comm.Gather([np.int32(rec), MPI.INT], [None, MPI.INT], root=0)
    g_rec = np.int32(rec).copy()
    comm.Bcast([g_rec, MPI.INT], root=0)
    g_rec = g_rec > 0

    comm.Reduce([norm_const, MPI.FLOAT], [None, MPI.FLOAT], op=MPI.SUM, root=0)
    if not send_only:
        comm.Bcast([norm_const, MPI.FLOAT], root=0)

    true_len = g_rec.sum()
    if true_len <= max_len:
        # Single-message case; a rank whose mask is empty stays silent.
        if rec.sum() != 0:
            theta_batch = theta[:, rec][map_list]
            comm.Send([theta_batch, MPI.FLOAT], dest=0, tag=112)
    else:
        # NOTE(review): g_mask_list is never read below. The call is preserved
        # because part_rec is defined elsewhere; remove it once part_rec is
        # confirmed to have no side effects.
        g_mask_list = part_rec(g_rec, max_len)
        mask_list = part_rec(rec, max_len, g_rec=g_rec)

        for mask in mask_list:
            if mask.sum() != 0:
                theta_batch = theta[:, mask][map_list]
                comm.Send([theta_batch, MPI.FLOAT], dest=0, tag=112)

    comm.barrier()
コード例 #3
0
def lw_frame(num, out_dir, dir, K, V, apprx, train_set_size=20726, doc_per_set=200, alpha=0.01, beta=0.0001,
             batch_size=50, step_size_param=(10**5.2, 10**(-6), 0.33), MH_max=2, word_partition=10000, max_send_times=3):
    """Run the master/worker sampling loop on ``MPI.COMM_WORLD``.

    Rank 0 is the master: it waits until the value returned by
    ``get_iters_mean`` reaches each entry of a fixed schedule, then asks every
    worker for an update (tag 102), collects it twice with ``g_recv`` (the
    second round after broadcasting ``get_code`` of the saved theta) and logs
    via ``get_per_LW``. All other ranks are workers: they call
    ``sampler.update`` in a loop, answer update requests with two ``g_update``
    calls, and stop once a tag-101 message from rank 0 is pending.

    Args:
        num: number of samples; currently unused (only referenced by a
            commented-out schedule).
        out_dir: prefix for the ``LW_perplexity`` output file name.
        dir: root folder of each data folder; the per-rank tmp folder holding
            the ``g_theta`` HDF5 file is expected inside it.
        K, V: dimensions of the ``g_theta`` dataset, created as ``(K, V)``;
            both are also forwarded to ``LDSampler`` and ``g_recv``.
        apprx: forwarded to ``LDSampler`` and ``g_recv``.
        train_set_size, doc_per_set: their product is forwarded to ``LDSampler``.
        alpha, beta, batch_size: forwarded to ``LDSampler``.
        step_size_param: three values forwarded to ``LDSampler`` as ``a``, ``b``, ``c``.
        MH_max: forwarded to ``sampler.update``.
        word_partition, max_send_times: their product is the chunk length of
            the initial broadcast and the ``max_len`` given to ``g_update`` /
            ``g_recv``.
    """
    # ************************************ init params *******************************************************
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    suffix = time.strftime('_%m%d_%H%M%S', time.localtime()) + '_' + str(rank)
    g_name = dir + 'tmp' + suffix + '/' + 'g_theta_file' + suffix + '.h5'
    g_theta = None
    g_theta_file = None
    iters = 0
    iters_mean = 0
    # NOTE(review): 1 ** (1 + 0.3) is always 1.0, so H is just sqrt(size - 1);
    # confirm whether a different base was intended.
    H = 1 ** (1 + 0.3) * np.sqrt(size - 1)
    start_time = time.time()
    output_name = out_dir + 'LW_perplexity' + suffix + '.txt'
    max_len = word_partition * max_send_times
    sampler = LDSampler(H, dir, rank, train_set_size * doc_per_set, K, V, max_len, apprx,
                        batch_size=batch_size, alpha=alpha, beta=beta, a=step_size_param[0],
                        b=step_size_param[1], c=step_size_param[2], suffix=suffix)
    try:
        if rank != 0:
            rec = np.zeros(V, dtype=bool)
            g_theta_file = h5py.File(g_name, 'w')
            g_theta = g_theta_file.create_dataset('g_theta', (K, V), dtype='float32')

        # init theta and g_theta
        # NOTE(review): the chunk bounds step through range(V) but slice the
        # FIRST axis, while g_theta is created as (K, V) and g_update indexes
        # the second axis with a length-V mask. Confirm sampler.theta's layout.
        start = 0
        while start < V:
            end = min(start + max_len, V)

            dummy = sampler.theta[start:end, :]
            collect()
            comm.Bcast([dummy, MPI.FLOAT], root=0)
            sampler.theta[start:end, :] = dummy
            if rank != 0:
                g_theta[start:end, :] = dummy

            start = end

        comm.Bcast([sampler.norm_const, MPI.FLOAT], root=0)

        # ************************************ worker *******************************************************
        if rank != 0:
            # TODO wait for initial perplexity
            # comm.barrier()
            # A pending tag-101 message from rank 0 is the stop signal.
            while not comm.Iprobe(source=0, tag=101):
                comm.isend(iters, dest=0, tag=111)

                sampler.update(MH_max, LWsampler=True, g_theta=g_theta, rec=rec)

                if comm.Iprobe(source=0, tag=102):
                    comm.recv(source=0, tag=102)

                    # Two rounds, matching the master's two g_recv calls: the
                    # second one runs in hash mode.
                    g_update(comm, sampler.theta, g_theta, sampler.norm_const, K, rec, max_len)
                    g_update(comm, sampler.theta, g_theta, sampler.norm_const, K, rec, max_len,
                             hash_mode=True)

                    rec.fill(0)
                    sampler.time_bak = 0

                iters += 1

        # ************************************ master *******************************************************
        else:
            # sche = [2*i**2 for i in xrange(1, num) if 2*i**2 <= num]
            sche = [1, 250, 500, 1000]
            # TODO
            # start_time = get_per_LW(output_name, sampler, start_time, 0)
            # comm.barrier()

            for i, target in enumerate(sche):
                print('0---> update %i of %i' % (i, len(sche)))

                while iters_mean < target:
                    iters_mean = get_iters_mean(comm, size)
                    print('0---> iter_mean %i' % iters_mean)

                # inform to update
                for j in range(1, size):
                    comm.isend(None, dest=j, tag=102)

                # Kept so the hash-mode round starts from the same theta.
                # NOTE(review): this is only a real copy if slicing
                # sampler.theta returns a new array -- confirm.
                t_save = sampler.theta[:, :]
                g_recv(comm, sampler.theta, sampler.norm_const, size - 1, K, V, max_len, apprx)
                start_time = get_per_LW(output_name, sampler, start_time, 0)
                comm.barrier()

                sampler.theta[:, :] = t_save
                code = get_code(t_save)
                # Pairs with the Bcast at the top of g_update(hash_mode=True).
                comm.Bcast([np.int32(code), MPI.INT], root=0)
                g_recv(comm, sampler.theta, sampler.norm_const, size - 1, K, V, max_len, apprx)
                start_time = get_per_LW(output_name, sampler, start_time, 0)
                comm.barrier()

            # stop workers, obtain final
            for i in range(1, size):
                comm.send(None, dest=i, tag=101)
    finally:
        # The worker's HDF5 file was previously never closed.
        if g_theta_file is not None:
            g_theta_file.close()
コード例 #4
0
ファイル: LWsampler_l.py プロジェクト: gblackout/very_large
def lw_frame(
    num,
    out_dir,
    dir,
    K,
    V,
    apprx,
    train_set_size=20726,
    doc_per_set=200,
    alpha=0.01,
    beta=0.0001,
    batch_size=50,
    step_size_param=(10 ** 5.2, 10 ** (-6), 0.33),
    MH_max=2,
    word_partition=10000,
    max_send_times=3,
):
    """Drive the master/worker sampling loop over ``MPI.COMM_WORLD``.

    Rank 0 (master) polls ``get_iters_mean`` until each entry of a fixed
    schedule is reached, then sends every worker a tag-102 request, gathers
    the update twice through ``g_recv`` (the second time after broadcasting
    ``get_code`` of the saved theta) and logs with ``get_per_LW``. Every other
    rank (worker) repeatedly calls ``sampler.update``, serves update requests
    with two ``g_update`` calls, and leaves the loop once a tag-101 message
    from rank 0 is pending.

    Args:
        num: number of samples; currently unused (only a commented-out
            schedule refers to it).
        out_dir: prefix for the ``LW_perplexity`` output file name.
        dir: root folder of each data folder; the per-rank tmp folder that
            holds the ``g_theta`` HDF5 file is expected inside it.
        K, V: dimensions of the ``g_theta`` dataset, created as ``(K, V)``;
            both are also passed to ``LDSampler`` and ``g_recv``.
        apprx: passed to ``LDSampler`` and ``g_recv``.
        train_set_size, doc_per_set: their product is passed to ``LDSampler``.
        alpha, beta, batch_size: passed to ``LDSampler``.
        step_size_param: three values passed to ``LDSampler`` as ``a``, ``b``, ``c``.
        MH_max: passed to ``sampler.update``.
        word_partition, max_send_times: their product is the chunk length of
            the initial broadcast and the ``max_len`` handed to ``g_update`` /
            ``g_recv``.
    """
    # ************************************ init params *******************************************************
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    suffix = time.strftime("_%m%d_%H%M%S", time.localtime()) + "_" + str(rank)
    g_name = dir + "tmp" + suffix + "/" + "g_theta_file" + suffix + ".h5"
    g_theta = None
    g_theta_file = None
    iters = 0
    iters_mean = 0
    # NOTE(review): 1 ** (1 + 0.3) always evaluates to 1.0, so H reduces to
    # sqrt(size - 1); confirm whether another base was meant.
    H = 1 ** (1 + 0.3) * np.sqrt(size - 1)
    start_time = time.time()
    output_name = out_dir + "LW_perplexity" + suffix + ".txt"
    max_len = word_partition * max_send_times
    sampler = LDSampler(
        H,
        dir,
        rank,
        train_set_size * doc_per_set,
        K,
        V,
        max_len,
        apprx,
        batch_size=batch_size,
        alpha=alpha,
        beta=beta,
        a=step_size_param[0],
        b=step_size_param[1],
        c=step_size_param[2],
        suffix=suffix,
    )
    try:
        if rank != 0:
            rec = np.zeros(V, dtype=bool)
            g_theta_file = h5py.File(g_name, "w")
            g_theta = g_theta_file.create_dataset("g_theta", (K, V), dtype="float32")

        # init theta and g_theta
        # NOTE(review): the chunk bounds walk over range(V) yet slice the FIRST
        # axis, whereas g_theta is created as (K, V) and g_update applies a
        # length-V mask to the second axis. Confirm sampler.theta's layout.
        start = 0
        while start < V:
            end = min(start + max_len, V)

            dummy = sampler.theta[start:end, :]
            collect()
            comm.Bcast([dummy, MPI.FLOAT], root=0)
            sampler.theta[start:end, :] = dummy
            if rank != 0:
                g_theta[start:end, :] = dummy

            start = end

        comm.Bcast([sampler.norm_const, MPI.FLOAT], root=0)

        # ************************************ worker *******************************************************
        if rank != 0:
            # TODO wait for initial perplexity
            # comm.barrier()
            # A pending tag-101 message from rank 0 ends the worker loop.
            while not comm.Iprobe(source=0, tag=101):
                comm.isend(iters, dest=0, tag=111)

                sampler.update(MH_max, LWsampler=True, g_theta=g_theta, rec=rec)

                if comm.Iprobe(source=0, tag=102):
                    comm.recv(source=0, tag=102)

                    # Two rounds to match the master's two g_recv calls; the
                    # second round runs in hash mode.
                    g_update(comm, sampler.theta, g_theta, sampler.norm_const, K, rec, max_len)
                    g_update(
                        comm,
                        sampler.theta,
                        g_theta,
                        sampler.norm_const,
                        K,
                        rec,
                        max_len,
                        hash_mode=True,
                    )

                    rec.fill(0)
                    sampler.time_bak = 0

                iters += 1

        # ************************************ master *******************************************************
        else:
            # sche = [2*i**2 for i in xrange(1, num) if 2*i**2 <= num]
            sche = [1, 250, 500, 1000]
            # TODO
            # start_time = get_per_LW(output_name, sampler, start_time, 0)
            # comm.barrier()

            for i, target in enumerate(sche):
                print("0---> update %i of %i" % (i, len(sche)))

                while iters_mean < target:
                    iters_mean = get_iters_mean(comm, size)
                    print("0---> iter_mean %i" % iters_mean)

                # inform to update
                for j in range(1, size):
                    comm.isend(None, dest=j, tag=102)

                # Saved so the hash-mode round starts from the same theta.
                # NOTE(review): only a true copy if slicing sampler.theta
                # yields a new array -- confirm.
                t_save = sampler.theta[:, :]
                g_recv(comm, sampler.theta, sampler.norm_const, size - 1, K, V, max_len, apprx)
                start_time = get_per_LW(output_name, sampler, start_time, 0)
                comm.barrier()

                sampler.theta[:, :] = t_save
                code = get_code(t_save)
                # Matches the Bcast at the start of g_update(hash_mode=True).
                comm.Bcast([np.int32(code), MPI.INT], root=0)
                g_recv(comm, sampler.theta, sampler.norm_const, size - 1, K, V, max_len, apprx)
                start_time = get_per_LW(output_name, sampler, start_time, 0)
                comm.barrier()

            # stop workers, obtain final
            for i in range(1, size):
                comm.send(None, dest=i, tag=101)
    finally:
        # The worker's HDF5 file was previously left open.
        if g_theta_file is not None:
            g_theta_file.close()