def _compare_bfs(G, Gnx, source):
    """Check cugraph's BFS from ``source`` against NetworkX.

    Runs ``cugraph.bfs`` on ``G`` with shortest-path counting disabled and
    compares the reported distances and predecessors with
    ``nx.single_source_shortest_path_length`` computed on ``Gnx``.

    Raises ``AssertionError`` if the result does not have exactly three
    columns, if a vertex reached by NetworkX is absent from the cugraph
    result, if a distance differs, or if a predecessor is not exactly one
    hop closer to the source than its vertex.
    """
    df = cugraph.bfs(G, source, return_sp_counter=False)

    # Only 'vertex', 'distance' and 'predecessor' are expected here.  A
    # fourth column would mean 'sp_counter' was produced, and computing it
    # triggers atomic operations in BFS that this call must not pay for.
    # NOTE: 'predecessor' is always returned even though the C++ function
    # accepts a nullptr for it.
    assert len(df.columns) == 3, (
        "The result of the BFS has an invalid number of columns"
    )

    cu_distances = dict(
        zip(df["vertex"].to_array(), df["distance"].to_array())
    )
    cu_predecessors = dict(
        zip(df["vertex"].to_array(), df["predecessor"].to_array())
    )

    nx_distances = nx.single_source_shortest_path_length(Gnx, source)
    # FIXME: Only vertices reached by NetworkX are verified below.
    # Distances are assumed to come back from BFS as integers, with
    # unreached vertices carrying np.iinfo(df['distance'].dtype).max.

    missing_vertex_error = 0
    distance_mismatch_error = 0
    invalid_predecessor_error = 0

    for vertex, expected in nx_distances.items():
        if vertex not in cu_distances:
            missing_vertex_error += 1
            continue

        result = cu_distances[vertex]
        if result != expected:
            print("[ERR] Mismatch on distances: "
                  "vid = {}, cugraph = {}, nx = {}".format(
                      vertex, result, expected))
            distance_mismatch_error += 1

        if vertex not in cu_predecessors:
            missing_vertex_error += 1
            continue

        # The source has no meaningful predecessor to validate.
        if vertex == source:
            continue

        pred = cu_predecessors[vertex]
        if pred not in nx_distances:
            invalid_predecessor_error += 1
        elif nx_distances[pred] + 1 != result:
            # The graph is unweighted, so a predecessor is exactly 1 away.
            print("[ERR] Invalid on predecessors: "
                  "vid = {}, cugraph = {}".format(vertex, pred))
            invalid_predecessor_error += 1

    assert missing_vertex_error == 0, "There are missing vertices"
    assert distance_mismatch_error == 0, "There are invalid distances"
    assert invalid_predecessor_error == 0, "There are invalid predecessors"
def breadth_first_search(graph: CuGraph, source_node: NodeID,
                         depth_limit: int) -> CuDFVector:
    """Return the vertices reachable from ``source_node`` in BFS order.

    Parameters
    ----------
    graph : CuGraph
        Wrapper whose ``value`` attribute is passed to ``cugraph.bfs``.
    source_node : NodeID
        Vertex the traversal starts from.
    depth_limit : int
        Maximum distance from ``source_node`` to include.  A negative value
        (or ``None``) is treated as "no limit", which reproduces the
        previous behaviour of this function, where the argument was
        accepted but silently ignored.
        NOTE(review): the "negative means unlimited" convention is an
        assumption; confirm against the callers of this function.

    Returns
    -------
    CuDFVector
        Reached vertices sorted by their BFS distance from the source.
    """
    bfs_df = cugraph.bfs(graph.value, source_node)

    # Keep the source itself (distance 0) and every vertex whose predecessor
    # is a real vertex of the result; this drops unreached vertices.
    reached = bfs_df.predecessor.isin(bfs_df.vertex) | (bfs_df.distance == 0)

    # Honour the requested depth limit instead of ignoring the parameter.
    if depth_limit is not None and depth_limit >= 0:
        reached = reached & (bfs_df.distance <= depth_limit)

    bfs_df = bfs_df[reached]
    bfs_ordered_vertices = bfs_df.sort_values(
        "distance").vertex.reset_index(drop=True)
    return CuDFVector(bfs_ordered_vertices)
def test_dask_bfs(dask_client):
    """Multi-GPU BFS must agree with single-GPU BFS on a doubled dataset.

    The netscience edge list is loaded once with dask_cudf and once with
    cudf.  Both copies are extended with a second copy of every edge whose
    endpoints are shifted by 1000, and BFS is started from the two sources
    ``[0, 1000]``.  Distances from ``cugraph.bfs`` and ``dcg.bfs`` are then
    joined on ``vertex`` and compared.
    """
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    def modify_dataset(df):
        # Append a copy of the edges with both endpoints offset by 1000.
        temp_df = cudf.DataFrame()
        temp_df['src'] = df['src'] + 1000
        temp_df['dst'] = df['dst'] + 1000
        temp_df['value'] = df['value']
        return cudf.concat([df, temp_df])

    meta = ddf._meta
    ddf = ddf.map_partitions(modify_dataset, meta=meta)

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df = modify_dataset(df)

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst")

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, [0, 1000])
    result_dist = dcg.bfs(dg, [0, 1000])
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on="vertex",
                                       suffixes=["_local", "_dask"])

    # Compare whole columns at once; a per-row ``.iloc[i]`` loop performs
    # one scalar lookup per vertex and is needlessly slow.
    mismatches = (compare_dist["distance_local"] !=
                  compare_dist["distance_dask"])
    err = int(mismatches.sum())
    assert err == 0
def test_dask_bfs_multi_column_depthlimit(dask_client):
    """Depth-limited BFS on multi-column vertices: dask vs. single GPU.

    Vertices are identified by two columns (``*_a`` and ``*_b = *_a +
    1000``).  BFS is started from the vertex ``(0, 1000)`` with
    ``depth_limit=18`` on both the cudf-backed and the dask_cudf-backed
    graph.  Only rows where both reported distances are within the depth
    limit are required to match.
    """
    gc.collect()

    input_data_path = (RAPIDS_DATASET_ROOT_DIR_PATH /
                       "netscience.csv").as_posix()
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    ddf['src_b'] = ddf['src_a'] + 1000
    ddf['dst_b'] = ddf['dst_a'] + 1000

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src_a", "dst_a", "value"],
        dtype=["int32", "int32", "float32"],
    )
    df['src_b'] = df['src_a'] + 1000
    df['dst_b'] = df['dst_a'] + 1000

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, ["src_a", "src_b"], ["dst_a", "dst_b"])

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, ["src_a", "src_b"], ["dst_a", "dst_b"])

    start = cudf.DataFrame()
    start['a'] = [0]
    start['b'] = [1000]

    depth_limit = 18
    expected_dist = cugraph.bfs(g, start, depth_limit=depth_limit)
    result_dist = dcg.bfs(dg, start, depth_limit=depth_limit)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on=["0_vertex", "1_vertex"],
                                       suffixes=["_local", "_dask"])

    # Vectorized replacement for the per-row ``.iloc[i]`` loop: count rows
    # where both distances respect the limit and still disagree.
    local_dist = compare_dist["distance_local"]
    dask_dist = compare_dist["distance_dask"]
    within_limit = (local_dist <= depth_limit) & (dask_dist <= depth_limit)
    err = int((within_limit & (local_dist != dask_dist)).sum())
    assert err == 0
def cugraph_call(cu_M, start_vertex):
    """Time a cugraph BFS over the edge list ``cu_M``.

    Builds a ``cugraph.DiGraph`` from columns '0' (source), '1' (target)
    and '2' (edge attribute), runs BFS from ``start_vertex``, prints the
    elapsed wall-clock time of the BFS call and returns the 'vertex' and
    'distance' columns as a pair of arrays.
    """
    graph = cugraph.DiGraph()
    graph.from_cudf_edgelist(cu_M, source='0', target='1', edge_attr='2')

    # Only the BFS call itself is timed, not the graph construction.
    started = time.time()
    bfs_result = cugraph.bfs(graph, start_vertex)
    elapsed = time.time() - started
    print('Time : ' + str(elapsed))

    vertices = bfs_result['vertex'].to_array()
    distances = bfs_result['distance'].to_array()
    return vertices, distances
def _compare_bfs_spc(G, Gnx, source):
    """Check cugraph's shortest-path counters against NetworkX.

    Runs ``cugraph.bfs`` with ``return_sp_counter=True`` and compares the
    'sp_counter' column with the counters returned by
    ``nxacb._single_source_shortest_path_basic`` for the same source.
    Distances and predecessors are not checked here; that is left to the
    ``_compare_bfs`` tests.

    Raises ``AssertionError`` on an unexpected column count, on vertices
    missing from the cugraph result, or on counters that differ by more
    than ``DEFAULT_EPSILON`` (relative tolerance).
    """
    df = cugraph.bfs(G, source, return_sp_counter=True)

    # Four columns are expected:
    # 'vertex', 'distance', 'predecessor', 'sp_counter'
    assert len(df.columns) == 4, (
        "The result of the BFS has an invalid number of columns"
    )

    _dist, _pred, nx_sp_counter = nxacb._single_source_shortest_path_basic(
        Gnx, source
    )
    # NetworkX counters listed in ascending vertex order.
    sorted_nx = [nx_sp_counter[vid] for vid in sorted(nx_sp_counter)]

    # `df` carries no ordering guarantee, so sort it by vertex to line it up
    # with `sorted_nx`.  Filtering with isin would also work but is slower,
    # and the NetworkX dictionary already contains every vertex.
    by_vertex = df.sort_values("vertex")
    sorted_df = by_vertex.rename(columns={"sp_counter": "cu_spc"}, copy=False)

    # Going through a set collapses vertex identifiers that might wrongly
    # appear several times, so the intersection size exposes missing ones.
    cu_vertices = set(sorted_df['vertex'].values_host)
    shared_vertices = cu_vertices.intersection(nx_sp_counter.keys())
    assert len(shared_vertices) == len(nx_sp_counter), (
        "There are missing vertices"
    )

    # Both sides are ordered by vertex identifier, so this is a row-wise
    # alignment of the two counters.
    sorted_df["nx_spc"] = sorted_nx

    # isclose (rather than allclose) keeps the offending rows available so
    # the discrepancies can be printed, not just detected.
    mismatch_mask = ~cupy.isclose(
        sorted_df["cu_spc"], sorted_df["nx_spc"], rtol=DEFAULT_EPSILON
    )
    shortest_path_counter_errors = sorted_df[mismatch_mask]

    if len(shortest_path_counter_errors) > 0:
        print(shortest_path_counter_errors)

    assert len(shortest_path_counter_errors) == 0, (
        "Shortest path counters are too different"
    )
def cugraph_call(cu_M, start_vertex):
    """Time a cugraph BFS over the edge list held in ``cu_M``.

    Columns '0', '1' and '2' of ``cu_M`` are handed to
    ``Graph.add_edge_list`` as sources, destinations and values.  BFS is run
    from ``start_vertex``, the elapsed wall-clock time of that call is
    printed, and the 'vertex' and 'distance' columns are returned as a pair
    of arrays.
    """
    # Device-side columns of the edge list.
    src_col = cu_M['0']
    dst_col = cu_M['1']
    val_col = cu_M['2']

    graph = cugraph.Graph()
    graph.add_edge_list(src_col, dst_col, val_col)

    # Only the BFS call itself is timed.
    begin = time.time()
    bfs_df = cugraph.bfs(graph, start_vertex)
    duration = time.time() - begin
    print('Time : ' + str(duration))

    return bfs_df['vertex'].to_array(), bfs_df['distance'].to_array()
def cugraph_call(M, start_vertex):
    """Time a cugraph BFS over a sparse matrix given as an adjacency list.

    ``M`` is converted with ``tocsr()``; its ``indptr``, ``indices`` and
    ``data`` arrays are wrapped in ``cudf.Series`` and passed to
    ``Graph.add_adj_list``.  BFS is run from ``start_vertex``, the elapsed
    wall-clock time of that call is printed, and the 'vertex' and
    'distance' columns are returned as a pair of arrays.
    """
    csr = M.tocsr()

    # Device copies of the three CSR arrays.
    offsets = cudf.Series(csr.indptr)
    indices = cudf.Series(csr.indices)
    weights = cudf.Series(csr.data)

    graph = cugraph.Graph()
    graph.add_adj_list(offsets, indices, weights)

    # Only the BFS call itself is timed.
    tic = time.time()
    bfs_df = cugraph.bfs(graph, start_vertex)
    toc = time.time() - tic
    print('Time : ' + str(toc))

    return bfs_df['vertex'].to_array(), bfs_df['distance'].to_array()
def test_dask_bfs(client_connection):
    """Multi-GPU BFS from vertex 0 must match single-GPU BFS distances.

    The netscience edge list is read with both dask_cudf and cudf, a
    ``DiGraph`` is built from each, and the distances returned by
    ``cugraph.bfs`` and ``dcg.bfs`` are joined on ``vertex`` and compared.
    """
    gc.collect()

    # FIXME: update this to allow dataset to be parameterized and have dataset
    # part of test param id (see other tests)
    input_data_path = r"../datasets/netscience.csv"
    print(f"dataset={input_data_path}")
    chunksize = dcg.get_chunksize(input_data_path)

    ddf = dask_cudf.read_csv(
        input_data_path,
        chunksize=chunksize,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    df = cudf.read_csv(
        input_data_path,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    g = cugraph.DiGraph()
    g.from_cudf_edgelist(df, "src", "dst", renumber=True)

    dg = cugraph.DiGraph()
    dg.from_dask_cudf_edgelist(ddf, "src", "dst")

    expected_dist = cugraph.bfs(g, 0)
    result_dist = dcg.bfs(dg, 0, True)
    result_dist = result_dist.compute()

    compare_dist = expected_dist.merge(result_dist,
                                       on="vertex",
                                       suffixes=["_local", "_dask"])

    # Compare whole columns at once; a per-row ``.iloc[i]`` loop performs
    # one scalar lookup per vertex and is needlessly slow.
    mismatches = (compare_dist["distance_local"] !=
                  compare_dist["distance_dask"])
    err = int(mismatches.sum())
    assert err == 0
def test_dask_bfs():
    """Multi-GPU BFS from vertex 0 must match single-GPU BFS distances.

    This variant creates its own ``LocalCUDACluster``, ``Client`` and
    ``Comms`` session.  Teardown is registered on an ``ExitStack`` so the
    session, client and cluster are released even when a step or the final
    assertion fails; previously they leaked on any failure.
    """
    # Local import keeps this block self-contained.
    from contextlib import ExitStack

    gc.collect()

    with ExitStack() as cleanup:
        # Callbacks run in reverse registration order, i.e.
        # Comms.destroy() -> client.close() -> cluster.close(),
        # which is the teardown order the test used originally.
        cluster = LocalCUDACluster()
        cleanup.callback(cluster.close)
        client = Client(cluster)
        cleanup.callback(client.close)
        Comms.initialize()
        cleanup.callback(Comms.destroy)

        input_data_path = r"../datasets/netscience.csv"
        chunksize = dcg.get_chunksize(input_data_path)

        ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])

        df = cudf.read_csv(input_data_path,
                           delimiter=' ',
                           names=['src', 'dst', 'value'],
                           dtype=['int32', 'int32', 'float32'])

        g = cugraph.DiGraph()
        g.from_cudf_edgelist(df, 'src', 'dst', renumber=True)

        dg = cugraph.DiGraph()
        dg.from_dask_cudf_edgelist(ddf, renumber=True)

        expected_dist = cugraph.bfs(g, 0)
        result_dist = dcg.bfs(dg, 0, True)

        compare_dist = expected_dist.merge(result_dist,
                                           on="vertex",
                                           suffixes=['_local', '_dask'])

        # Compare whole columns at once instead of one ``.iloc[i]`` scalar
        # lookup per vertex.
        mismatches = (compare_dist['distance_local'] !=
                      compare_dist['distance_dask'])
        err = int(mismatches.sum())
        assert err == 0
def data(df):
    """Run BFS and PageRank on the edge list in the pandas frame ``df``.

    ``df`` is moved to the GPU, its 'to' and 'from' columns are cast to
    int32, and a graph is built from them.  BFS starts at the value found in
    the first row / first column of the converted frame.

    Returns a pair of pandas DataFrames:
      * the BFS result reduced to 'vertex' and 'group', where group is 2 for
        distance < 3, 0 for distance == 3 and 1 for distance > 3;
      * the PageRank result as returned by ``cugraph.pagerank``.
    """
    net = cudf.from_pandas(df)
    for column in ('to', 'from'):
        net[column] = net[column].astype('int32')

    start_vertex = net.iloc[0, 0]

    graph = cugraph.Graph()
    graph.add_edge_list(net['from'], net['to'], None)

    bfs_gdf = cugraph.bfs(graph, start_vertex, directed=True)
    pagerank_gdf = cugraph.pagerank(graph)

    out_bfs = bfs_gdf.to_pandas()
    out_page = pagerank_gdf.to_pandas()

    # Bucket every vertex by its BFS distance from the start vertex.
    distance = out_bfs['distance']
    out_bfs.loc[distance < 3, 'group'] = 2
    out_bfs.loc[distance == 3, 'group'] = 0
    out_bfs.loc[distance > 3, 'group'] = 1

    return out_bfs[['vertex', 'group']], out_page
def test_bfs_paths_array():
    """``get_traversed_path_list`` returns a path and rejects unknown ids.

    BFS is run on the karate graph from vertex id 16.  The path to vertex
    id 0 must contain three entries, and asking for vertex id 100 must
    raise ``ValueError`` with a "not in the result set" message.

    Only the call expected to fail is placed inside ``pytest.raises``.
    Previously the whole body was inside the context manager, so the final
    message assertion was never executed and a ``ValueError`` raised by any
    earlier step would have made the test pass.
    """
    gc.collect()

    graph_file = '../datasets/karate.csv'
    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    # Run BFS starting at vertex id 16.
    df = cugraph.bfs(G, 16)

    # Path from the BFS source to vertex id 0.
    answer = cugraph.utils.get_traversed_path_list(df, 0)
    assert len(answer) == 3

    # Vertex id 100 is not part of the BFS result.
    with pytest.raises(ValueError) as exc_info:
        cugraph.utils.get_traversed_path_list(df, 100)

    assert "not in the result set" in str(exc_info.value)
def test_bfs_paths():
    """``get_traversed_path`` returns a path and rejects unknown ids.

    BFS is run on the karate graph from vertex id 16.  The path frame to
    vertex id 0 must contain three rows, and asking for vertex id 100 must
    raise ``ValueError`` with a "not in the result set" message.

    Only the call expected to fail is placed inside ``pytest.raises``.
    Previously the whole body was inside the context manager, so the final
    message assertion was never executed and a ``ValueError`` raised by any
    earlier step would have made the test pass.
    """
    gc.collect()

    graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / "karate.csv"
    cu_M = utils.read_csv_file(graph_file)

    G = cugraph.Graph()
    G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2')

    # Run BFS starting at vertex id 16.
    df = cugraph.bfs(G, 16)

    # Path from the BFS source to vertex id 0.
    p_df = cugraph.utils.get_traversed_path(df, 0)
    assert len(p_df) == 3

    # Vertex id 100 is not part of the BFS result.
    with pytest.raises(ValueError) as exc_info:
        cugraph.utils.get_traversed_path(df, 100)

    assert "not in the result set" in str(exc_info.value)
def bfs(G, start):
    """Run ``cugraph.bfs`` on ``G``, passing ``start`` as the ``start`` keyword.

    Returns whatever ``cugraph.bfs`` returns, unchanged.
    """
    result = cugraph.bfs(G, start=start)
    return result
def strong_connected_component(source, destination):
    """
    Generate the strongly connected components using the FW-BW-TRIM
    approach, but skipping the trimming step.

    A forward graph (source -> destination) and a backward graph
    (destination -> source) are built.  Vertices are processed from a queue
    ordered by descending out-degree: for the vertex at the head of the
    queue a BFS is run on both graphs, and the vertices reached by both
    searches form one component.

    Parameters
    ----------
    source : cudf.Series
        A cudf series that contains the source side of an edge list
    destination : cudf.Series
        A cudf series that contains the destination side of an edge list

    Returns
    -------
    cdf : cudf.DataFrame - a dataframe for components
        df['vertex']   - the vertex ID
        df['id']       - the component ID

    sdf : cudf.DataFrame - a dataframe with single vertex components
        df['vertex']   - the vertex ID

    count - int - the number of components found

    NOTE(review): ``cdf`` and ``sdf`` are whatever ``_compress_array``
    builds from the per-component frames collected below; the exact column
    layout depends on that helper and on the BFS merge - confirm.

    Examples
    --------
    >>> M = read_mtx_file(graph_file)
    >>> sources = cudf.Series(M.row)
    >>> destinations = cudf.Series(M.col)
    >>> components, single_components, count =
            scc.strong_connected_component(sources, destinations)
    """
    # Referenced only from the query strings below as "@max_value", which is
    # why the linter is silenced here.
    max_value = np.iinfo(np.int32).max  # NOQA

    # create the FW and BW graphs - this version does not modify the graphs
    G_fw = cugraph.Graph()
    G_bw = cugraph.Graph()

    G_fw.add_edge_list(source, destination)
    G_bw.add_edge_list(destination, source)

    # get a list of vertices and sort the list on out_degree
    d = G_fw.degrees()
    d = d.sort_values(by='out_degree', ascending=False)

    num_verts = len(d)

    # create space for the answers (one slot per vertex is the upper bound)
    components = [None] * num_verts
    single_components = [None] * num_verts

    # Counts - also used as the next free index in the two lists above
    count = 0
    single_count = 0

    # remove vertices that cannot be in a component: a vertex without any
    # incoming or without any outgoing edge cannot be on a cycle
    bad = d.query('in_degree == 0 or out_degree == 0')

    if len(bad):
        # NOTE(review): presumably drops the two degree columns so that only
        # 'vertex' remains - confirm against the cudf version in use
        bad = bad.drop(['in_degree', 'out_degree'])

        # all such vertices are stored together as one frame
        single_components[single_count] = bad
        single_count = single_count + 1
        d = _filter_list(d, bad)

    # ----- Start processing -----
    while len(d) > 0:

        # NOTE(review): ``[0]`` may be a label lookup rather than a
        # positional one; after sorting and ``d.iloc[1:]`` the label 0 is
        # not guaranteed to be the head of the queue - confirm
        v = d['vertex'][0]

        # compute the forward BFS, keeping only the vertices that were
        # reached (distance differs from the int32 max sentinel)
        bfs_fw = cugraph.bfs(G_fw, v)
        bfs_fw = bfs_fw.query("distance != @max_value")

        # Now backwards
        bfs_bw = cugraph.bfs(G_bw, v)
        bfs_bw = bfs_bw.query("distance != @max_value")

        # intersection: vertices reached in both directions
        common = bfs_fw.merge(bfs_bw, on='vertex', how='inner')

        if len(common) > 1:
            # the component is labelled with the vertex it was grown from
            common['id'] = v
            components[count] = common
            d = _filter_list(d, common)
            count = count + 1

        else:
            # v is an isolated vertex
            vdf = cudf.DataFrame()
            # NOTE(review): assigning a scalar to an empty DataFrame may
            # create an empty column instead of a single row - confirm
            vdf['vertex'] = v

            single_components[single_count] = vdf
            single_count = single_count + 1
            # drop the head of the queue
            d = d.iloc[1:]

    # end of loop until vertex queue is empty

    comp = _compress_array(components, count)
    sing = _compress_array(single_components, single_count)

    return comp, sing, count
def test_scipy_api_compat():
    """``cugraph.bfs`` keyword handling for Graph and sparse-matrix inputs.

    For a ``cugraph.Graph`` input, SciPy-only options, a missing start
    vertex, and passing both ``start`` and ``i_start`` must raise
    ``TypeError``, while the cugraph keywords must be accepted.  For a COO
    matrix input the SciPy-style keywords must be accepted, and requesting
    the shortest-path counter must return a 3-tuple.
    """
    graph_file = utils.DATASETS[0]

    input_cugraph_graph = utils.create_obj_from_csv(
        graph_file, cugraph.Graph, edgevals=True)
    input_coo_matrix = utils.create_obj_from_csv(
        graph_file, cp_coo_matrix, edgevals=True)

    # Rejected for cugraph inputs: scipy-only options, then a call with the
    # required start argument missing.
    for rejected_kwargs in ({"start": 0, "directed": False}, {}):
        with pytest.raises(TypeError):
            cugraph.bfs(input_cugraph_graph, **rejected_kwargs)

    # Accepted for cugraph inputs.
    for accepted_kwargs in ({"i_start": 0},
                            {"i_start": 0, "return_sp_counter": True}):
        cugraph.bfs(input_cugraph_graph, **accepted_kwargs)

    # start and i_start are mutually exclusive.
    with pytest.raises(TypeError):
        cugraph.bfs(input_cugraph_graph, start=0, i_start=0)

    # Accepted for matrix inputs.
    for matrix_kwargs in ({"i_start": 0},
                          {"i_start": 0, "directed": True},
                          {"i_start": 0, "directed": False}):
        cugraph.bfs(input_coo_matrix, **matrix_kwargs)

    result = cugraph.bfs(input_coo_matrix, i_start=0, return_sp_counter=True)
    assert type(result) is tuple
    assert len(result) == 3