Example #1
# Imports assumed for this snippet; the exact cuGraph module paths may differ
# between versions.
import os
import time

import numpy as np

import dask_cudf
from dask.distributed import default_client, futures_of, wait

from cugraph.dask.common.read_utils import get_n_workers
from cugraph.dask.common.part_utils import concat_within_workers
from cugraph.tests import utils


def test_parquet_concat_within_workers(client_connection):
    if not os.path.exists("test_files_parquet"):
        print("Generate data... ")
        os.mkdir("test_files_parquet")
    for x in range(10):
        if not os.path.exists("test_files_parquet/df" + str(x)):
            df = utils.random_edgelist(e=100,
                                       ef=16,
                                       dtypes={
                                           "src": np.int32,
                                           "dst": np.int32
                                       },
                                       seed=x)
            df.to_parquet("test_files_parquet/df" + str(x), index=False)

    n_gpu = get_n_workers()

    print("Read_parquet... ")
    t1 = time.time()
    ddf = dask_cudf.read_parquet("test_files_parquet/*",
                                 dtype=["int32", "int32"])
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t1 = time.time() - t1
    print("*** Read Time: ", t1, "s")
    print(ddf)

    assert ddf.npartitions > n_gpu

    print("Drop_duplicates... ")
    t2 = time.time()
    # Dask dataframes are immutable, so drop_duplicates has no inplace option;
    # reassign the result instead
    ddf = ddf.drop_duplicates()
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t2 = time.time() - t2
    print("*** Drop duplicate time: ", t2, "s")
    assert t2 < t1

    print("Repartition... ")
    t3 = time.time()
    # Ideally we would simply use:
    #   ddf = ddf.repartition(npartitions=n_gpu)
    # However, that is slower than the read itself and requires more memory,
    # so the custom concat_within_workers helper is used instead
    client = default_client()
    ddf = concat_within_workers(client, ddf)
    ddf = ddf.persist()
    futures_of(ddf)
    wait(ddf)
    t3 = time.time() - t3
    print("*** repartition Time: ", t3, "s")
    print(ddf)

    assert t3 < t1
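
The test above relies on concat_within_workers to reduce the partition count
without shuffling data between GPUs. The sketch below is only a rough
illustration of that idea, not cuGraph's implementation: it groups the futures
of a persisted dask_cudf dataframe by the worker holding them (via
Client.who_has) and runs one cudf.concat per worker, pinned to that worker.
The helper name concat_on_same_worker and the use of ddf._meta are assumptions
of this sketch.

import cudf
import dask.dataframe as dd
from dask.distributed import futures_of, wait


def concat_on_same_worker(client, ddf):
    # Futures backing the persisted partitions
    parts = futures_of(ddf)
    wait(parts)
    # Map each partition key to the worker address currently holding it
    key_to_workers = client.who_has(parts)
    by_worker = {}
    for part in parts:
        worker = key_to_workers[part.key][0]
        by_worker.setdefault(worker, []).append(part)
    # One cudf.concat per worker, pinned to that worker so no partition moves
    merged = [
        client.submit(cudf.concat, group,
                      workers=[worker], allow_other_workers=False)
        for worker, group in by_worker.items()
    ]
    wait(merged)
    # Rebuild a dask dataframe with one partition per worker
    return dd.from_delayed(merged, meta=ddf._meta)
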
Example #2
# Imports assumed; module paths may differ across cuGraph/RAFT versions.
from cugraph.dask.common import read_utils
from cugraph.dask.comms.comms import get_raft_comm_state


def get_n_workers(sID=None):
    # Count the Dask workers directly, or read 'nworkers' from the RAFT comms
    # session state when a session id is given.
    if sID is None:
        return read_utils.get_n_workers()
    else:
        sessionstate = get_raft_comm_state(sID)
        return sessionstate['nworkers']
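
When sID is None the helper presumably just counts the workers attached to the
default Dask client, so it should agree with a direct Client.scheduler_info
query. Below is a hypothetical usage sketch; the two-worker LocalCluster is
only for illustration.

from dask.distributed import Client, LocalCluster

cluster = LocalCluster(n_workers=2)
client = Client(cluster)

n_direct = len(client.scheduler_info()["workers"])  # plain Dask view
n_helper = get_n_workers()                          # same count via the helper
assert n_direct == n_helper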