def test_from_edgelist(dask_client):
    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf, source="src", destination="dst", edge_attr="value",
        create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    assert dg1.EdgeList == dg2.EdgeList
def test_compute_local_data(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                               edge_attr='value')

    # Compute_local_data
    dg.compute_local_data(by='dst')
    data = dg.local_data['data']
    by = dg.local_data['by']

    assert by == 'dst'
    assert Comms.is_initialized()

    global_num_edges = data.local_data['edges'].sum()
    assert global_num_edges == dg.number_of_edges()

    global_num_verts = data.local_data['verts'].sum()
    assert global_num_verts == dg.number_of_nodes()
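# The `client_connection` (and, in newer tests, `dask_client`) fixture used
# throughout these tests is defined elsewhere, typically in conftest.py, and
# is not shown in this section. A minimal sketch of what it could look like,
# mirroring the manual setup in the standalone test_dask_bfs further below
# (LocalCUDACluster + Client + Comms); the name and scope here are
# assumptions, not the project's exact fixture.
@pytest.fixture
def client_connection():
    cluster = LocalCUDACluster()
    client = Client(cluster)
    Comms.initialize()

    yield client

    Comms.destroy()
    client.close()
    cluster.close()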
def test_from_edgelist(client_connection):
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf, source="src", destination="dst", edge_attr="value",
        create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    assert dg1.EdgeList == dg2.EdgeList
def test_compute_local_data(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    # Compute_local_data
    dg.compute_local_data(by="dst")
    data = dg.local_data["data"]
    by = dg.local_data["by"]

    assert by == "dst"
    assert Comms.is_initialized()

    global_num_edges = data.local_data["edges"].sum()
    assert global_num_edges == dg.number_of_edges()

    global_num_verts = data.local_data["verts"].sum()
    assert global_num_verts == dg.number_of_nodes()
def test_consolidation(graph_file):
    gc.collect()

    cluster = LocalCUDACluster()
    client = Client(cluster)
    chunksize = dcg.get_chunksize(graph_file)

    M = utils.read_csv_for_nx(graph_file)

    df = pd.DataFrame()
    df['source'] = pd.Series(M['0'])
    df['target'] = pd.Series(M['1'])

    ddf = dask_cudf.read_csv(graph_file, chunksize=chunksize,
                             delimiter=' ',
                             names=['source', 'target', 'weight'],
                             dtype=['int32', 'int32', 'float32'],
                             header=None)

    Gnx = nx.from_pandas_edgelist(df, source='source', target='target',
                                  create_using=nx.DiGraph)
    G = cugraph.from_cudf_edgelist(ddf, source='source',
                                   destination='target',
                                   create_using=cugraph.DiGraph)

    assert compare_graphs(Gnx, G)

    Gnx.clear()
    G.clear()
    client.close()
    cluster.close()
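# `compare_graphs` above is assumed to be defined elsewhere in the test
# module and is not shown here. A minimal sketch of what such a helper could
# look like, comparing the NetworkX and cugraph graphs on basic invariants;
# the real helper may compare the edge lists in more detail.
def compare_graphs(nx_graph, cu_graph):
    # Compare node and edge counts; a fuller check would also compare the
    # edge lists themselves after renumbering.
    if nx_graph.number_of_nodes() != cu_graph.number_of_nodes():
        return False
    if nx_graph.number_of_edges() != cu_graph.number_of_edges():
        return False
    return True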
def test_dask_katz_centrality(client_connection):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    largest_out_degree = g.degrees().nlargest(n=1, columns="out_degree")
    largest_out_degree = largest_out_degree["out_degree"].iloc[0]
    katz_alpha = 1 / (largest_out_degree + 1)

    mg_res = dcg.katz_centrality(dg, alpha=katz_alpha, tol=1e-6)
    mg_res = mg_res.compute()

    import networkx as nx
    import pandas as pd
    from cugraph.tests import utils

    NM = utils.read_csv_for_nx(input_data_path)
    Gnx = nx.from_pandas_edgelist(
        NM, create_using=nx.DiGraph(), source="0", target="1"
    )
    nk = nx.katz_centrality(Gnx, alpha=katz_alpha)
    pdf = pd.DataFrame(nk.items(), columns=['vertex', 'katz_centrality'])
    exp_res = cudf.DataFrame(pdf)

    err = 0
    tol = 1.0e-05
    compare_res = exp_res.merge(
        mg_res, on="vertex", suffixes=["_local", "_dask"]
    )
    for i in range(len(compare_res)):
        diff = abs(
            compare_res["katz_centrality_local"].iloc[i]
            - compare_res["katz_centrality_dask"].iloc[i]
        )
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
def test_dask_bfs(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        # Shift vertex IDs by 1000 and append, producing a second,
        # disconnected copy of the graph so BFS can start from two sources.
        temp_df = cudf.DataFrame()
        temp_df['src'] = df['src'] + 1000
        temp_df['dst'] = df['dst'] + 1000
        temp_df['value'] = df['value']
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data and personalize
    personalization = None
    if personalization_perc != 0:
        dg.compute_local_data(by="dst")
        personalization = personalize(dg.number_of_vertices(),
                                      personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])
    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
def test_dask_pagerank(client_connection, personalization_perc):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/karate.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])
    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
def test_dask_bfs_multi_column_depthlimit(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    ddf['src_b'] = ddf['src_a'] + 1000
    ddf['dst_b'] = ddf['dst_a'] + 1000

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df['src_b'] = df['src_a'] + 1000
    df['dst_b'] = df['dst_a'] + 1000

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, ["src_a", "src_b"], ["dst_a", "dst_b"])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, ["src_a", "src_b"], ["dst_a", "dst_b"])

    start = cudf.DataFrame()
    start['a'] = [0]
    start['b'] = [1000]

    depth_limit = 18
    expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit)
    result_dist = dcg.bfs(dg, start, depth_limit=depth_limit)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on=["0_vertex", "1_vertex"],
        suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] <= depth_limit and
                compare_dist["distance_dask"].iloc[i] <= depth_limit and
                compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
def test_dask_pagerank(dask_client, personalization_perc):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    personalization = None
    if personalization_perc != 0:
        personalization, p = personalize(g.nodes(), personalization_perc)

    expected_pr = cugraph.pagerank(g, personalization=personalization,
                                   tol=1e-6)
    result_pr = dcg.pagerank(dg, personalization=personalization, tol=1e-6)
    result_pr = result_pr.compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])
    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    assert err == 0
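# The `personalize` helper used by the personalized-PageRank tests above is
# not shown in this section. A minimal sketch of what it could look like,
# assuming it picks `personalization_perc` percent of the vertices at random
# and assigns them normalized random weights. The tuple return matches the
# `personalization, p = personalize(g.nodes(), ...)` call sites; the variant
# that passes a vertex count would need a slightly different signature.
def personalize(vertices, personalization_perc):
    import numpy as np

    # Choose a random subset of vertices to receive personalization values
    personalization = {}
    nnz_vtx = np.random.choice(
        vertices.values_host,
        max(1, int(len(vertices) * personalization_perc / 100.0)),
        replace=False,
    )
    nnz_val = np.random.random(nnz_vtx.shape[0])
    nnz_val = nnz_val / sum(nnz_val)
    for vtx, val in zip(nnz_vtx, nnz_val):
        personalization[vtx] = val

    # Build the cudf DataFrame form expected by cugraph.pagerank
    k = np.fromiter(personalization.keys(), dtype="int32")
    v = np.fromiter(personalization.values(), dtype="float32")
    cu_personalization = cudf.DataFrame({"vertex": k, "values": v})

    return cu_personalization, personalization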
def test_dask_pagerank(client_connection):
    gc.collect()

    pandas.set_option("display.max_rows", 10000)

    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    # Pre compute local data
    # dg.compute_local_data(by='dst')

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg)

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])
    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
def test_directed_graph_renumber_false(renumber, dask_client):
    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    dg = cugraph.Graph(directed=True)

    with pytest.raises(ValueError):
        dg.from_dask_cudf_edgelist(ddf, "src", "dst", renumber=renumber)
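# Note: `renumber` above arrives as a pytest parameter; the original
# parametrize decorator is not shown in this section. A plausible form,
# given that the test expects a ValueError when renumbering is disabled for
# a multi-GPU graph, would be:
# @pytest.mark.parametrize("renumber", [False])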
def test_dask_sssp(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", "value", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")

    expected_dist = cugraph.sssp(g, 0)
    print(expected_dist)
    result_dist = dcg.sssp(dg, 0)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(
        result_dist, on="vertex", suffixes=["_local", "_dask"]
    )

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
def test_dask_sssp(client_connection):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/netscience.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", "value", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst", "value")

    expected_dist = cugraph.sssp(g, 0)
    print(expected_dist)
    result_dist = dcg.sssp(dg, 0)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist, on="vertex",
                                       suffixes=["_local", "_dask"])

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
def test_dask_pagerank(dask_client):
    pandas.set_option("display.max_rows", 10000)

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv").as_posix()
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.Graph(directed=True)
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.Graph(directed=True)
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_pr = cugraph.pagerank(g)
    result_pr = dcg.pagerank(dg).compute()

    err = 0
    tol = 1.0e-05

    assert len(expected_pr) == len(result_pr)

    compare_pr = expected_pr.merge(result_pr, on="vertex",
                                   suffixes=["_local", "_dask"])
    for i in range(len(compare_pr)):
        diff = abs(compare_pr["pagerank_local"].iloc[i] -
                   compare_pr["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err = err + 1
    print("Mismatches:", err)
    assert err == 0
def test_dask_wcc(client_connection):
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/netscience.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.weakly_connected_components(g)
    result_dist = dcg.weakly_connected_components(dg)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist, on="vertex",
                                       suffixes=["_local", "_dask"])

    unique_local_labels = compare_dist['labels_local'].unique()
    for label in unique_local_labels.values.tolist():
        dask_labels_df = compare_dist[compare_dist['labels_local'] == label]
        dask_labels = dask_labels_df['labels_dask']
        assert (dask_labels.iloc[0] == dask_labels).all()
def test_dask_bfs():
    gc.collect()

    cluster = LocalCUDACluster()
    client = Client(cluster)
    Comms.initialize()

    input_data_path = r"../datasets/netscience.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    df = cudf.read_csv(input_data_path,
                       delimiter=' ',
                       names=['src', 'dst', 'value'],
                       dtype=['int32', 'int32', 'float32'])

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, 'src', 'dst', renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, renumber=True)

    expected_dist = cugraph.bfs(g, 0)
    result_dist = dcg.bfs(dg, 0, True)

    compare_dist = expected_dist.merge(result_dist, on="vertex",
                                       suffixes=['_local', '_dask'])

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist['distance_local'].iloc[i] !=
                compare_dist['distance_dask'].iloc[i]):
            err = err + 1
    assert err == 0

    Comms.destroy()
    client.close()
    cluster.close()
def daskGraphFromDataset(request, client_connection):
    """
    Returns a new DiGraph created from the dataset file param.
    """
    # Since parameterized fixtures do not assign param names to param values,
    # manually call the helper to do so.
    setFixtureParamNames(request, ["dataset"])
    dataset = request.param

    chunksize = dcg.get_chunksize(dataset)
    ddf = dask_cudf.read_csv(dataset, chunksize=chunksize,
                             delimiter=' ',
                             names=['src', 'dst', 'value'],
                             dtype=['int32', 'int32', 'float32'])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    return dg
def test_dask_wcc(dask_client):
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.weakly_connected_components(g)
    result_dist = dcg.weakly_connected_components(dg)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist, on="vertex",
                                       suffixes=["_local", "_dask"])

    unique_local_labels = compare_dist['labels_local'].unique()
    for label in unique_local_labels.values.tolist():
        dask_labels_df = compare_dist[compare_dist['labels_local'] == label]
        dask_labels = dask_labels_df['labels_dask']
        assert (dask_labels.iloc[0] == dask_labels).all()
def test_dask_bfs(client_connection):
    gc.collect()

    input_data_path = r"../datasets/netscience.csv"
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, 0)
    result_dist = dcg.bfs(dg, 0, True)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist, on="vertex",
                                       suffixes=["_local", "_dask"])

    err = 0
    for i in range(len(compare_dist)):
        if (compare_dist["distance_local"].iloc[i] !=
                compare_dist["distance_dask"].iloc[i]):
            err = err + 1
    assert err == 0
def test_from_edgelist(client_connection):
    input_data_path = r"../datasets/karate.csv"
    chunksize = dcg.get_chunksize(input_data_path)
    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.from_edgelist(
        ddf, source="src", destination="dst", edge_attr="value",
        create_using=cugraph.DiGraph)

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(
        ddf, source="src", destination="dst", edge_attr="value"
    )

    assert dg1.EdgeList == dg2.EdgeList
def daskGraphFromDataset(request, dask_client):
    """
    Returns a new DiGraph created from the dataset file param.
    """
    # Since parameterized fixtures do not assign param names to param values,
    # manually call the helper to do so.
    setFixtureParamNames(request, ["dataset"])
    dataset = request.param

    chunksize = dcg.get_chunksize(dataset)
    ddf = dask_cudf.read_csv(
        dataset,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")
    return dg
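# A hypothetical usage sketch (not part of the original tests) showing how a
# test would consume the daskGraphFromDataset fixture above, assuming it is
# registered with @pytest.fixture and parameterized over dataset paths in
# conftest.py; dcg.pagerank stands in for any multi-GPU algorithm.
def test_algo_on_dataset(daskGraphFromDataset):
    result = dcg.pagerank(daskGraphFromDataset)
    assert len(result.compute()) > 0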
def test_dask_pagerank(client_connection):
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator
    input_data_path1 = r"../datasets/karate.csv"
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(input_data_path1, chunksize=chunksize1,
                              delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])

    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, 'src', 'dst')
    result_pr1 = dcg.pagerank(dg1)

    ddf2 = dask_cudf.read_csv(input_data_path2, chunksize=chunksize2,
                              delimiter=' ',
                              names=['src', 'dst', 'value'],
                              dtype=['int32', 'int32', 'float32'])

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, 'src', 'dst')
    result_pr2 = dcg.pagerank(dg2)

    # Calculate single GPU pagerank for verification of results
    df1 = cudf.read_csv(input_data_path1,
                        delimiter=' ',
                        names=['src', 'dst', 'value'],
                        dtype=['int32', 'int32', 'float32'])

    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, 'src', 'dst')
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(input_data_path2,
                        delimiter=' ',
                        names=['src', 'dst', 'value'],
                        dtype=['int32', 'int32', 'float32'])

    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, 'src', 'dst')
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results
    err1 = 0
    err2 = 0
    tol = 1.0e-05

    compare_pr1 = expected_pr1.merge(result_pr1, on="vertex",
                                     suffixes=['_local', '_dask'])
    assert len(expected_pr1) == len(result_pr1)
    for i in range(len(compare_pr1)):
        diff = abs(compare_pr1['pagerank_local'].iloc[i] -
                   compare_pr1['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err1 = err1 + 1
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)
    compare_pr2 = expected_pr2.merge(result_pr2, on="vertex",
                                     suffixes=['_local', '_dask'])
    for i in range(len(compare_pr2)):
        diff = abs(compare_pr2['pagerank_local'].iloc[i] -
                   compare_pr2['pagerank_dask'].iloc[i])
        if diff > tol * 1.1:
            err2 = err2 + 1
    print("Mismatches in ", input_data_path2, ": ", err2)

    assert err1 == err2 == 0
def test_dask_pagerank(client_connection):
    gc.collect()

    # Initialize and run pagerank on two distributed graphs
    # with same communicator
    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path1 = r"../datasets/karate.csv"
    print(f"dataset1={input_data_path1}")
    chunksize1 = dcg.get_chunksize(input_data_path1)

    input_data_path2 = r"../datasets/dolphins.csv"
    print(f"dataset2={input_data_path2}")
    chunksize2 = dcg.get_chunksize(input_data_path2)

    ddf1 = dask_cudf.read_csv(
        input_data_path1,
        chunksize=chunksize1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg1 = cugraph.DiGraph()
    dg1.from_dask_cudf_edgelist(ddf1, "src", "dst")
    result_pr1 = dcg.pagerank(dg1).compute()

    ddf2 = dask_cudf.read_csv(
        input_data_path2,
        chunksize=chunksize2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    dg2 = cugraph.DiGraph()
    dg2.from_dask_cudf_edgelist(ddf2, "src", "dst")
    result_pr2 = dcg.pagerank(dg2).compute()

    # Calculate single GPU pagerank for verification of results
    df1 = cudf.read_csv(
        input_data_path1,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g1 = cugraph.DiGraph()
    g1.from_cudf_edgelist(df1, "src", "dst")
    expected_pr1 = cugraph.pagerank(g1)

    df2 = cudf.read_csv(
        input_data_path2,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g2 = cugraph.DiGraph()
    g2.from_cudf_edgelist(df2, "src", "dst")
    expected_pr2 = cugraph.pagerank(g2)

    # Compare and verify pagerank results
    err1 = 0
    err2 = 0
    tol = 1.0e-05

    compare_pr1 = expected_pr1.merge(result_pr1, on="vertex",
                                     suffixes=["_local", "_dask"])
    assert len(expected_pr1) == len(result_pr1)
    for i in range(len(compare_pr1)):
        diff = abs(compare_pr1["pagerank_local"].iloc[i] -
                   compare_pr1["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err1 = err1 + 1
    print("Mismatches in ", input_data_path1, ": ", err1)

    assert len(expected_pr2) == len(result_pr2)
    compare_pr2 = expected_pr2.merge(result_pr2, on="vertex",
                                     suffixes=["_local", "_dask"])
    for i in range(len(compare_pr2)):
        diff = abs(compare_pr2["pagerank_local"].iloc[i] -
                   compare_pr2["pagerank_dask"].iloc[i])
        if diff > tol * 1.1:
            err2 = err2 + 1
    print("Mismatches in ", input_data_path2, ": ", err2)

    assert err1 == err2 == 0