def cugraph_nx_call(G, max_iter, tol, alpha, personalization, nstart):
    """Time a cuGraph PageRank run on *G* and return the result frame."""
    start = time.time()
    result = cugraph.pagerank(
        G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=personalization,
        nstart=nstart,
    )
    elapsed = time.time() - start
    print("Cugraph Time : " + str(elapsed))
    return result
def test_dask_pagerank(client_connection):
    """Distributed PageRank on karate.csv must agree with the single-GPU
    result within the convergence tolerance."""
    gc.collect()
    pandas.set_option("display.max_rows", 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the multi-GPU graph.
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the single-GPU reference graph.
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    mismatches = 0
    for row in range(len(compare_pr)):
        gap = abs(
            compare_pr["pagerank_local"].iloc[row]
            - compare_pr["pagerank_dask"].iloc[row]
        )
        # Allow a 10% margin over the convergence tolerance.
        if gap > tol * 1.1:
            mismatches = mismatches + 1
    print("Mismatches:", mismatches)
    assert mismatches == 0
def test_dask_pagerank(client_connection):
    """Compare dask-cugraph PageRank against the single-GPU implementation
    on the karate dataset."""
    gc.collect()
    pandas.set_option('display.max_rows', 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=' ',
        names=['src', 'dst', 'value'],
        dtype=['int32', 'int32', 'float32'],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, 'src', 'dst')

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')

    # Pre compute local data
    # dg.compute_local_data(by='dst')

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg)

    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)
    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=['_local', '_dask']
    )

    mismatches = 0
    for idx in range(len(compare_pr)):
        # Allow a 10% margin over the convergence tolerance.
        gap = abs(
            compare_pr['pagerank_local'].iloc[idx]
            - compare_pr['pagerank_dask'].iloc[idx]
        )
        if gap > tol * 1.1:
            mismatches = mismatches + 1
    print("Mismatches:", mismatches)
    assert mismatches == 0
def cugraph_call(cu_M, max_iter, tol, alpha, personalization, nstart):
    """Build a DiGraph from the edge frame *cu_M*, run cuGraph PageRank,
    and return the scores as (positional index, score) tuples."""
    G = cugraph.DiGraph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1')

    start = time.time()
    df = cugraph.pagerank(
        G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=personalization,
        nstart=nstart,
    )
    elapsed = time.time() - start
    print('Cugraph Time : ' + str(elapsed))

    # Pair every PageRank score with its positional index.
    return list(enumerate(df['pagerank'].to_array()))
def test_dask_pagerank(dask_client):
    """Distributed PageRank on karate.csv must agree with the single-GPU
    result within the convergence tolerance."""
    pandas.set_option("display.max_rows", 10000)

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    # Distributed edge list for the multi-GPU graph.
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    # Local edge list for the single-GPU reference graph.
    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.Graph(directed=True)
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    tol = 1.0e-05
    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(
        result_pr, on="vertex", suffixes=["_local", "_dask"]
    )
    mismatches = 0
    for row in range(len(compare_pr)):
        gap = abs(
            compare_pr["pagerank_local"].iloc[row]
            - compare_pr["pagerank_dask"].iloc[row]
        )
        # Allow a 10% margin over the convergence tolerance.
        if gap > tol * 1.1:
            mismatches = mismatches + 1
    print("Mismatches:", mismatches)
    assert mismatches == 0
def test_pagerank():
    """PageRank scores from cuGraph must match the precomputed expectations."""
    import cugraph

    # Data has already been loaded into a cuDF DataFrame (via read_csv);
    # build the Graph from the source ('0') and destination ('1') columns.
    gdf = read_csv_file(csvFile)
    sources = gdf['0']
    destinations = gdf['1']
    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, None)

    # Compute the PageRank scores.
    gdf_page = cugraph.pagerank(G)

    assert len(expectedPageRanks) == len(gdf_page["pagerank"])
    for actual, expected in zip(gdf_page["pagerank"], expectedPageRanks):
        assert actual == pytest.approx(expected)
def test_pagerank():
    """Verify cuGraph PageRank output against the expected reference scores."""
    import cugraph

    # Data has already been loaded into a cuDF DataFrame (via read_csv);
    # build the Graph from the "0"/"1" vertex columns.
    gdf = read_csv_file(csvFile)
    G = cugraph.Graph()
    G.from_cudf_edgelist(gdf, "0", "1")

    # Compute PageRank, then sort by vertex id because renumbering may
    # have changed the expected row order.
    gdf_page = cugraph.pagerank(G)
    gdf_page = gdf_page.sort_values('vertex').reset_index(drop=True)

    assert len(expectedPageRanks) == len(gdf_page["pagerank"])
    for actual, expected in zip(gdf_page["pagerank"].to_pandas(),
                                expectedPageRanks):
        assert actual == pytest.approx(expected)
def data(df):
    """Run BFS and PageRank on the edge list in *df* and return a
    (bfs group frame, pagerank frame) pair of pandas DataFrames."""
    edges = cudf.from_pandas(df)
    edges['to'] = edges['to'].astype('int32')
    edges['from'] = edges['from'].astype('int32')

    # BFS starts from the first source vertex in the frame.
    root = edges.iloc[0, 0]

    G = cugraph.Graph()
    G.add_edge_list(edges['from'], edges['to'], None)

    bfs_out = cugraph.bfs(G, root, directed=True).to_pandas()
    page_out = cugraph.pagerank(G).to_pandas()

    # Bucket vertices by BFS distance: nearer than 3 -> group 2,
    # exactly 3 -> group 0, farther than 3 -> group 1.
    bfs_out.loc[bfs_out['distance'] < 3, 'group'] = 2
    bfs_out.loc[bfs_out['distance'] == 3, 'group'] = 0
    bfs_out.loc[bfs_out['distance'] > 3, 'group'] = 1

    return bfs_out[['vertex', 'group']], page_out
def cugraph_Call(M, max_iter, tol, alpha):
    """Run cuGraph PageRank on the COO matrix *M* and return (index, score)
    pairs sorted by score, highest first."""
    # Move the edge endpoints onto the device.
    sources = cudf.Series(M.row)
    destinations = cudf.Series(M.col)
    # values = cudf.Series(np.ones(len(sources), dtype = np.float64))

    G = cugraph.Graph()
    G.add_edge_list(sources, destinations, None)

    start = time.time()
    df = cugraph.pagerank(G, alpha=alpha, max_iter=max_iter, tol=tol)
    elapsed = time.time() - start
    print('Time : ' + str(elapsed))

    # Pair each score with its positional index, then rank by score.
    pairs = list(enumerate(df['pagerank']))
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)
def cugraph_call(G, max_iter, tol, alpha, personalization, nstart):
    """Time a cuGraph PageRank run on *G* and return (index, score) pairs
    in vertex order."""
    start = time.time()
    df = cugraph.pagerank(
        G,
        alpha=alpha,
        max_iter=max_iter,
        tol=tol,
        personalization=personalization,
        nstart=nstart,
    )
    elapsed = time.time() - start
    print("Cugraph Time : " + str(elapsed))

    # Renumbering may permute rows, so order by vertex id before pairing
    # each score with its positional index.
    df = df.sort_values("vertex").reset_index(drop=True)
    return list(enumerate(df["pagerank"].to_array()))
def pagerank(G):
    """Thin wrapper: compute PageRank scores for *G* with cuGraph."""
    scores = cugraph.pagerank(G)
    return scores
def test_dask_pagerank(client_connection):
    """Run distributed PageRank on two graphs sharing one communicator and
    verify each against its single-GPU reference result."""
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator
    input_data_path1 = r"../datasets/karate.csv"
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    chunksize2 = dcg.get_chunksize(input_data_path2)

    def distributed_graph(path, chunksize):
        # Build a distributed DiGraph from a CSV edge list.
        edges = dask_cudf.read_csv(path, chunksize=chunksize, delimiter=' ',
                                   names=['src', 'dst', 'value'],
                                   dtype=['int32', 'int32', 'float32'])
        graph = cugraph.DiGraph()
        graph.from_dask_cudf_edgelist(edges, 'src', 'dst')
        return graph

    result_pr1 = dcg.pagerank(distributed_graph(input_data_path1, chunksize1))
    result_pr2 = dcg.pagerank(distributed_graph(input_data_path2, chunksize2))

    # Calculate single GPU pagerank for verification of results
    def local_graph(path):
        edges = cudf.read_csv(path, delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])
        graph = cugraph.DiGraph()
        graph.from_cudf_edgelist(edges, 'src', 'dst')
        return graph

    expected_pr1 = cugraph.pagerank(local_graph(input_data_path1))
    expected_pr2 = cugraph.pagerank(local_graph(input_data_path2))

    # Compare and verify pagerank results
    tol = 1.0e-05

    def count_mismatches(merged):
        # A pair counts as a mismatch when the scores differ by more than
        # a 10% margin over the convergence tolerance.
        bad = 0
        for row in range(len(merged)):
            gap = abs(merged['pagerank_local'].iloc[row]
                      - merged['pagerank_dask'].iloc[row])
            if gap > tol * 1.1:
                bad = bad + 1
        return bad

    compare_pr1 = expected_pr1.merge(result_pr1, on="vertex",
                                     suffixes=['_local', '_dask'])
    assert len(expected_pr1) == len(result_pr1)
    err1 = count_mismatches(compare_pr1)
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)
    compare_pr2 = expected_pr2.merge(result_pr2, on="vertex",
                                     suffixes=['_local', '_dask'])
    err2 = count_mismatches(compare_pr2)
    print("Mismatches in ", input_data_path2, ": ", err2)

    assert err1 == err2 == 0
def cugraph_pagerank(graph: CuGraph, damping: float, maxiter: int, tolerance: float) -> CuDFNodeMap:
    """Compute PageRank on *graph* and wrap the per-vertex scores in a
    CuDFNodeMap keyed by vertex id."""
    frame = cugraph.pagerank(
        graph.value, alpha=damping, max_iter=maxiter, tol=tolerance
    )
    scores = frame.set_index("vertex")["pagerank"]
    return CuDFNodeMap(scores)
def test_dask_pagerank(client_connection):
    """Run distributed PageRank on two graphs over the same communicator and
    check each against its single-GPU reference result."""
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path1 = r"../datasets/karate.csv"
    print(f"dataset1={input_data_path1}")
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    print(f"dataset2={input_data_path2}")
    chunksize2 = dcg.get_chunksize(input_data_path2)

    def distributed_graph(path, chunksize):
        # Build a distributed DiGraph from a CSV edge list.
        edges = dask_cudf.read_csv(
            path,
            chunksize=chunksize,
            delimiter=" ",
            names=["src", "dst", "value"],
            dtype=["int32", "int32", "float32"],
        )
        graph = cugraph.DiGraph()
        graph.from_dask_cudf_edgelist(edges, "src", "dst")
        return graph

    result_pr1 = dcg.pagerank(
        distributed_graph(input_data_path1, chunksize1)).compute()
    result_pr2 = dcg.pagerank(
        distributed_graph(input_data_path2, chunksize2)).compute()

    # Calculate single GPU pagerank for verification of results
    def local_graph(path):
        edges = cudf.read_csv(
            path,
            delimiter=" ",
            names=["src", "dst", "value"],
            dtype=["int32", "int32", "float32"],
        )
        graph = cugraph.DiGraph()
        graph.from_cudf_edgelist(edges, "src", "dst")
        return graph

    expected_pr1 = cugraph.pagerank(local_graph(input_data_path1))
    expected_pr2 = cugraph.pagerank(local_graph(input_data_path2))

    # Compare and verify pagerank results
    tol = 1.0e-05

    def count_mismatches(merged):
        # A pair counts as a mismatch when the scores differ by more than
        # a 10% margin over the convergence tolerance.
        bad = 0
        for row in range(len(merged)):
            gap = abs(merged["pagerank_local"].iloc[row]
                      - merged["pagerank_dask"].iloc[row])
            if gap > tol * 1.1:
                bad = bad + 1
        return bad

    compare_pr1 = expected_pr1.merge(result_pr1, on="vertex",
                                     suffixes=["_local", "_dask"])
    assert len(expected_pr1) == len(result_pr1)
    err1 = count_mismatches(compare_pr1)
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)
    compare_pr2 = expected_pr2.merge(result_pr2, on="vertex",
                                     suffixes=["_local", "_dask"])
    err2 = count_mismatches(compare_pr2)
    print("Mismatches in ", input_data_path2, ": ", err2)

    assert err1 == err2 == 0
def test_pagerank_multi_column(graph_file, max_iter, tol, alpha, personalization_perc, has_guess): gc.collect() # NetworkX PageRank M = utils.read_csv_for_nx(graph_file) nnz_vtx = np.unique(M[['0', '1']]) Gnx = nx.from_pandas_edgelist(M, source="0", target="1", edge_attr="weight", create_using=nx.DiGraph()) networkx_pr, networkx_prsn = networkx_call(Gnx, max_iter, tol, alpha, personalization_perc, nnz_vtx) cu_nstart = None if has_guess == 1: cu_nstart_temp = cudify(networkx_pr) max_iter = 100 cu_nstart = cudf.DataFrame() cu_nstart["vertex_0"] = cu_nstart_temp["vertex"] cu_nstart["vertex_1"] = cu_nstart["vertex_0"] + 1000 cu_nstart["values"] = cu_nstart_temp["values"] cu_prsn_temp = cudify(networkx_prsn) if cu_prsn_temp is not None: cu_prsn = cudf.DataFrame() cu_prsn["vertex_0"] = cu_prsn_temp["vertex"] cu_prsn["vertex_1"] = cu_prsn["vertex_0"] + 1000 cu_prsn["values"] = cu_prsn_temp["values"] else: cu_prsn = cu_prsn_temp cu_M = cudf.DataFrame() cu_M["src_0"] = cudf.Series(M["0"]) cu_M["dst_0"] = cudf.Series(M["1"]) cu_M["src_1"] = cu_M["src_0"] + 1000 cu_M["dst_1"] = cu_M["dst_0"] + 1000 cu_M["weights"] = cudf.Series(M["weight"]) cu_G = cugraph.DiGraph() cu_G.from_cudf_edgelist(cu_M, source=["src_0", "src_1"], destination=["dst_0", "dst_1"], edge_attr="weights") df = cugraph.pagerank( cu_G, alpha=alpha, max_iter=max_iter, tol=tol, personalization=cu_prsn, nstart=cu_nstart, ) cugraph_pr = [] df = df.sort_values("0_vertex").reset_index(drop=True) pr_scores = df["pagerank"].to_array() for i, rank in enumerate(pr_scores): cugraph_pr.append((i, rank)) # Calculating mismatch networkx_pr = sorted(networkx_pr.items(), key=lambda x: x[0]) err = 0 assert len(cugraph_pr) == len(networkx_pr) for i in range(len(cugraph_pr)): if (abs(cugraph_pr[i][1] - networkx_pr[i][1]) > tol * 1.1 and cugraph_pr[i][0] == networkx_pr[i][0]): err = err + 1 print("Mismatches:", err) assert err < (0.01 * len(cugraph_pr))