import gc
import time

import pandas as pd
import networkx as nx

import dask_cudf
from dask.distributed import Client, wait
from dask_cuda import LocalCUDACluster

# 'dcg' is the cuGraph multi-GPU (dask) entry point used throughout these
# tests; the exact import path may differ across cuGraph versions.
import cugraph.dask as dcg


def test_pagerank():
    gc.collect()
    input_data_path = r"../datasets/karate.csv"

    # NetworkX call: build the reference PageRank scores on the CPU.
    pd_df = pd.read_csv(input_data_path, delimiter=' ',
                        names=['src', 'dst', 'value'])
    G = nx.Graph()
    for i in range(len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # cuGraph SNMG (single-node multi-GPU) PageRank call.
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ', names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])
    pr = dcg.pagerank(ddf, alpha=0.85, max_iter=50)
    res_df = pr.compute()

    # Compare against the NetworkX baseline; tolerate up to 1% mismatches.
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1:
            err += 1
    print("Mismatches:", err)
    assert err < (0.01 * len(res_df))

    client.close()
    cluster.close()
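
# The positional comparison above assumes the cuGraph result rows are
# ordered by vertex id, matching the sorted NetworkX output. Below is a
# minimal sketch of a vertex-aligned comparison helper; it is hypothetical
# and assumes the result frame carries a 'vertex' column alongside
# 'pagerank' (an assumption, not something the test above verifies).
def compare_pagerank(res_df, nx_pr, tol=1.0e-05):
    # Sort GPU results by vertex id so that row i of res_df lines up
    # with entry i of the vertex-sorted NetworkX list.
    aligned = res_df.sort_values('vertex').reset_index(drop=True)
    err = 0
    for i in range(len(aligned)):
        if abs(aligned['pagerank'][i] - nx_pr[i][1]) > tol * 1.1:
            err += 1
    return err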

def test_pagerank_hibench():
    gc.collect()
    input_data_path = r"../datasets/hibench_small/1/part-00000.csv"

    # NetworkX call: build the reference PageRank scores on the CPU.
    pd_df = pd.read_csv(input_data_path, delimiter='\t',
                        names=['src', 'dst'])
    G = nx.DiGraph()
    for i in range(len(pd_df)):
        G.add_edge(pd_df['src'][i], pd_df['dst'][i])
    nx_pr = nx.pagerank(G, alpha=0.85)
    nx_pr = sorted(nx_pr.items(), key=lambda x: x[0])

    # cuGraph SNMG PageRank call.
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    t0 = time.time()
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter='\t', names=['src', 'dst'],
                             dtype=['int32', 'int32'])
    y = ddf.to_delayed()
    x = client.compute(y)
    wait(x)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)

    new_ddf = dcg.drop_duplicates(x)
    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=50)
    wait(pr)
    t3 = time.time()
    print("Running PR algo time: ", t3 - t2)

    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)

    t6 = time.time()
    # For bigdatax4, pass chunksize=100000000 to to_csv to avoid OOM
    # when writing the csv.
    res_df.to_csv('~/pagerank.csv', header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    # Compare against the NetworkX baseline; tolerate up to 2% mismatches.
    err = 0
    tol = 1.0e-05
    for i in range(len(res_df)):
        if abs(res_df['pagerank'][i] - nx_pr[i][1]) > tol * 1.1:
            err += 1
    print("Mismatches:", err)
    assert err < (0.02 * len(res_df))

    client.close()
    cluster.close()
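
# The t0..t7 bookkeeping above repeats the same measure-and-print pattern
# for every stage. A minimal sketch of a reusable timer (a hypothetical
# helper, not part of cuGraph) that would express the same measurements:
from contextlib import contextmanager


@contextmanager
def timed(label):
    # Print the wall-clock time spent inside the 'with' block.
    start = time.time()
    yield
    print(label, time.time() - start)


# Usage sketch:
#     with timed("Reading Csv time: "):
#         ddf = dask_cudf.read_csv(...)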

def test_splitting():
    gc.collect()
    # This is an experimental setup for the 300GB bigdatax8 dataset.
    # This test can be run on 16 32GB GPUs. The dataset is split into
    # 32 files, file-00000.csv through file-00031.csv.
    input_data_path = r"/datasets/pagerank_demo/1/Input-bigdatax8/edges/"
    input_files = ['file-{:05d}.csv'.format(i) for i in range(32)]

    # cuGraph SNMG PageRank call.
    cluster = LocalCUDACluster(threads_per_worker=1)
    client = Client(cluster)

    files = [input_data_path + f for f in input_files]

    # Read 2 files per GPU/worker and concatenate the dataframes.
    # This is a workaround to fit the memory requirements of
    # cudf.read_csv for large files.
    t0 = time.time()
    new_ddf = dcg.read_split_csv(files)
    t1 = time.time()
    print("Reading Csv time: ", t1 - t0)

    t2 = time.time()
    pr = dcg.pagerank(new_ddf, alpha=0.85, max_iter=3)
    wait(pr)
    t3 = time.time()
    print("Pagerank (Dask) time: ", t3 - t2)

    t4 = time.time()
    res_df = pr.compute()
    t5 = time.time()
    print("Compute time: ", t5 - t4)
    print(res_df)

    t6 = time.time()
    res_df.to_csv('~/pagerank.csv', chunksize=40000000,
                  header=False, index=False)
    t7 = time.time()
    print("Write csv time: ", t7 - t6)

    client.close()
    cluster.close()
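
# dcg.read_split_csv pairs the input files across workers, per the comment
# above (two files per GPU/worker, concatenated after reading). A rough,
# hypothetical sketch of that idea is below; it illustrates the concept
# only and is not cuGraph's actual implementation. The delimiter, column
# names, and dtypes are assumptions for the bigdatax8 edge files.
def read_two_and_concat(file_a, file_b):
    import cudf
    # Read each half separately so neither read exceeds the per-call
    # memory limits of cudf.read_csv, then stitch the halves together.
    df_a = cudf.read_csv(file_a, delimiter=' ', names=['src', 'dst'],
                         dtype=['int32', 'int32'])
    df_b = cudf.read_csv(file_b, delimiter=' ', names=['src', 'dst'],
                         dtype=['int32', 'int32'])
    return cudf.concat([df_a, df_b])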