Example #1
0
def test_pagerank():
    """Compare the Dask/cuGraph pagerank result with NetworkX on karate.csv.

    The edge list is read twice: once with pandas to build an undirected
    NetworkX graph (the reference), and once with dask_cudf to feed
    ``dcg.pagerank``. A vertex counts as a mismatch when the two scores
    differ by more than ``tol * 1.1``; the test passes when fewer than 1%
    of the result rows mismatch.

    The cluster and client are always shut down, even when the comparison
    (or any step before it) fails.
    """
    gc.collect()
    input_data_path = r"../datasets/karate.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path,
                        delimiter=' ',
                        names=['src', 'dst', 'value'])
    G = nx.Graph()
    G.add_edges_from(zip(pd_df['src'], pd_df['dst']))
    nx_pr = nx.pagerank(G, alpha=0.85)
    # Sort by vertex id so entries can be addressed positionally below.
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    try:
        client = Client(cluster)
        try:
            chunksize = dcg.get_chunksize(input_data_path)
            ddf = dask_cudf.read_csv(input_data_path,
                                     chunksize=chunksize,
                                     delimiter=' ',
                                     names=['src', 'dst', 'value'],
                                     dtype=['int32', 'int32', 'float32'])

            pr = dcg.pagerank(ddf, alpha=0.85, max_iter=50)
            res_df = pr.compute()

            # NOTE(review): row i of res_df is compared with the i-th
            # vertex in sorted order; this assumes both are ordered by
            # vertex id -- TODO confirm.
            tol = 1.0e-05
            err = sum(
                1 for i in range(len(res_df))
                if abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1
            )
            print("Mismatches:", err)
            assert err < (0.01 * len(res_df))
        finally:
            # Release the client even if the assertion above fails.
            client.close()
    finally:
        # Tear down the workers no matter how the test ended.
        cluster.close()
Example #2
0
def test_pagerank():
    """Compare the Dask/cuGraph pagerank result with NetworkX on hibench.

    The tab-separated edge list is read with pandas to build a directed
    NetworkX graph (the reference), and with dask_cudf to feed
    ``dcg.pagerank`` after ``dcg.drop_duplicates``. Wall-clock timings for
    reading, running pagerank, computing the result and writing it to
    ``~/pagerank.csv`` are printed along the way. A vertex counts as a
    mismatch when the two scores differ by more than ``tol * 1.1``; the
    test passes when fewer than 2% of the result rows mismatch.

    The cluster and client are always shut down, even when the comparison
    (or any step before it) fails.
    """
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # Networkx Call
    pd_df = pd.read_csv(input_data_path, delimiter='\t', names=['src', 'dst'])
    G = nx.DiGraph()
    G.add_edges_from(zip(pd_df['src'], pd_df['dst']))
    nx_pr = nx.pagerank(G, alpha=0.85)
    # Sort by vertex id so entries can be addressed positionally below.
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # Cugraph snmg pagerank Call
    cluster = LocalCUDACluster(threads_per_worker=1)
    try:
        client = Client(cluster)
        try:
            t0 = time.time()
            chunksize = dcg.get_chunksize(input_data_path)
            ddf = dask_cudf.read_csv(input_data_path,
                                     chunksize=chunksize,
                                     delimiter='\t',
                                     names=['src', 'dst'],
                                     dtype=['int32', 'int32'])
            # Submit the partitions and block on them so the read is
            # finished before the timer stops.
            y = ddf.to_delayed()
            x = client.compute(y)
            wait(x)
            t1 = time.time()
            print("Reading Csv time: ", t1 - t0)

            new_ddf = dcg.drop_duplicates(x)
            t2 = time.time()
            pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
            wait(pr)
            t3 = time.time()
            print("Running PR algo time: ", t3 - t2)

            t4 = time.time()
            res_df = pr.compute()
            t5 = time.time()
            print("Compute time: ", t5 - t4)
            print(res_df)

            t6 = time.time()
            # For bigdatax4, chunksize=100000000 to avoid oom on write csv
            res_df.to_csv('~/pagerank.csv', header=False, index=False)
            t7 = time.time()
            print("Write csv time: ", t7 - t6)

            # Comparison
            # NOTE(review): row i of res_df is compared with the i-th
            # vertex in sorted order; this assumes both are ordered by
            # vertex id -- TODO confirm.
            tol = 1.0e-05
            err = sum(
                1 for i in range(len(res_df))
                if abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1
            )
            print("Mismatches:", err)
            assert err < (0.02 * len(res_df))
        finally:
            # Release the client even if the assertion above fails.
            client.close()
    finally:
        # Tear down the workers no matter how the test ended.
        cluster.close()