Example #1
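These test snippets omit their imports. Below is a minimal sketch of what they appear to rely on; the module path bound to the `dcg` alias is an assumption, since the multi-GPU (dask) helpers have moved between cugraph releases.

import gc
import time

import pandas as pd
import networkx as nx

import dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# Assumption: `dcg` refers to cugraph's multi-GPU (dask) helpers, which in the
# release these tests target exposed get_chunksize, pagerank, drop_duplicates
# and read_split_csv. The exact module path may differ between releases.
import cugraph.dask.pagerank as dcg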
def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/karate.csv"
    # NetworkX call: compute the reference PageRank scores
    pd_df = pd.read_csv(input_data_path,
                        delimiter=' ',
                        names=['src', 'dst', 'value'])
    G = nx.Graph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])
    # cuGraph SNMG (single-node multi-GPU) PageRank call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    pr = dcg.pagerank(ddf, alpha=0.85, max_iter=50)
    res_df = pr.compute()

    # Compare cuGraph and NetworkX scores vertex by vertex; res_df is
    # assumed to be ordered by vertex id, matching the sorted NetworkX result.
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1:
            err += 1
    print("Mismatches:", err)
    assert err < (0.01 * len(res_df))

    client.close()
    cluster.close()
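For reference, a single-GPU version of the same karate comparison would bypass Dask entirely. This is a minimal sketch, assuming the cudf/cugraph single-GPU API (cudf.read_csv, Graph.from_cudf_edgelist, cugraph.pagerank); column names and defaults may differ across releases.

import cudf
import cugraph

# Read the same karate edge list into a single-GPU cudf DataFrame.
gdf = cudf.read_csv("../datasets/karate.csv",
                    delimiter=' ',
                    names=['src', 'dst', 'value'],
                    dtype=['int32', 'int32', 'float32'])

# Build the graph and run single-GPU PageRank with the same parameters.
G = cugraph.Graph()
G.from_cudf_edgelist(gdf, source='src', destination='dst')
pr_df = cugraph.pagerank(G, alpha=0.85, max_iter=50)

# One row per vertex with its PageRank score.
print(pr_df.sort_values('vertex').head())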
Example #2
def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # NetworkX call: compute the reference PageRank scores
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    G = nx.DiGraph()
    for i in range(0, len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # cuGraph SNMG (single-node multi-GPU) PageRank call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path,
                             chunksize=chunksize,
                             delimiter='\t',
                             names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    # Materialize the partitions on the workers (one future per partition)
    # so the read has actually finished before it is timed.
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)
    new_ddf = dcg.drop_duplicates(x)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
    wait(pr)
    t3 = time.time()
    print("Running PR algo time: ", t3 - t2)
    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)
    t6 = time.time()
    # For the bigdatax4 dataset, pass chunksize=100000000 to to_csv
    # to avoid OOM when writing the CSV.
    res_df.to_csv('~/pagerank.csv', header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    # Comparison: check cuGraph against NetworkX vertex by vertex; res_df is
    # assumed to be ordered by vertex id, matching the sorted NetworkX result.
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1:
            err += 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))

    client.close()
    cluster.close()
Example #3
def test_splitting():
    gc.collect()

    # Experimental setup for the 300 GB bigdatax8 dataset.
    # This test can be run on 16 32-GB GPUs; the dataset is split into 32 files.
    input_data_path = r"/datasets/pagerank_demo/1/Input-bigdatax8/edges/"
    input_files = [
        'file-00000.csv', 'file-00001.csv', 'file-00002.csv', 'file-00003.csv',
        'file-00004.csv', 'file-00005.csv', 'file-00006.csv', 'file-00007.csv',
        'file-00008.csv', 'file-00009.csv', 'file-00010.csv', 'file-00011.csv',
        'file-00012.csv', 'file-00013.csv', 'file-00014.csv', 'file-00015.csv',
        'file-00016.csv', 'file-00017.csv', 'file-00018.csv', 'file-00019.csv',
        'file-00020.csv', 'file-00021.csv', 'file-00022.csv', 'file-00023.csv',
        'file-00024.csv', 'file-00025.csv', 'file-00026.csv', 'file-00027.csv',
        'file-00028.csv', 'file-00029.csv', 'file-00030.csv', 'file-00031.csv'
    ]

    # cuGraph SNMG (single-node multi-GPU) PageRank call
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    files = [input_data_path + f for f in input_files]

    # Read two files per GPU/worker and concatenate the resulting dataframes.
    # This is a workaround so that large files fit within the memory
    # requirements of cudf.read_csv.
    t0 = time.time()
    new_ddf = dcg.read_split_csv(files)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=3)
    wait(pr)
    t3 = time.time()
    print("Pagerank (Dask) time: ", t3 - t2)
    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)
    t6 = time.time()
    res_df.to_csv('~/pagerank.csv',
                  chunksize=40000000,
                  header=False,
                  index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    client.close()
    cluster.close()