def renumber(input_graph):
    """
    Run ``call_renumber`` over the edge list stored on ``input_graph``.

    If the edge list is a ``dask_cudf.DataFrame`` the call is submitted once
    per worker reported by ``get_distributed_data`` and the per-worker results
    are combined with ``dask_cudf.from_delayed``.  Otherwise ``call_renumber``
    is invoked directly in the calling process.

    Parameters
    ----------
    input_graph : graph object
        Must expose ``edgelist.edgelist_df`` and ``number_of_vertices()``.

    Returns
    -------
    dask_cudf.DataFrame built from the worker results in the multi-GPU case;
    the untouched edge list object in the single-GPU case.
    """
    # Acquired up front (even when unused) so a missing dask client is
    # reported before any other work happens.
    client = default_client()

    edges = input_graph.edgelist.edgelist_df
    num_edges = len(edges)
    is_mnmg = isinstance(edges, dask_cudf.DataFrame)
    num_verts = input_graph.number_of_vertices()

    if not is_mnmg:
        # NOTE(review): the return value of call_renumber is dropped here and
        # the original edge list is returned - confirm call_renumber works
        # in place for the single-GPU case.
        call_renumber(Comms.get_session_id(),
                      edges,
                      num_verts,
                      num_edges,
                      is_mnmg)
        return edges

    dist = get_distributed_data(edges)
    futures = [
        client.submit(call_renumber,
                      Comms.get_session_id(),
                      worker_parts,
                      num_verts,
                      num_edges,
                      is_mnmg,
                      workers=[worker])
        for worker, worker_parts in dist.worker_to_parts.items()
    ]
    wait(futures)
    return dask_cudf.from_delayed(futures)
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs.

    Parameters
    ----------
    input_graph : graph object
        Must expose ``local_data``, ``renumbered`` and ``unrenumber``.
    max_iter : int, optional (default=100)
        Forwarded unchanged to ``call_louvain``.
    resolution : float, optional (default=1.0)
        Forwarded unchanged to ``call_louvain``.
    load_balance : bool, optional (default=True)
        Passed to ``get_local_data`` when the graph has no cached local data
        partitioned by ``'src'``.

    Returns
    -------
    (parts, modularity_score)
        The result produced by the worker whose rank is 0; ``parts`` is
        un-renumbered on the ``"vertex"`` column when the graph was
        renumbered.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.
    # FIXME: dask methods to populate graphs from edgelists are only present
    # on DiGraph classes, so no "must be undirected" check is performed and
    # inputs are assumed to be symmetric DiGraphs.

    client = default_client()

    cached = input_graph.local_data
    if cached is not None and cached['by'] == 'src':
        data = cached['data']
    else:
        data = get_local_data(input_graph, by='src',
                              load_balance=load_balance)

    # One task per worker, keyed by that worker's rank.
    futures_by_rank = {
        data.worker_info[worker]["rank"]: client.submit(
            call_louvain,
            Comms.get_session_id(),
            worker_parts,
            data.local_data,
            max_iter,
            resolution,
            workers=[worker])
        for worker, worker_parts in data.worker_to_parts.items()
    }
    wait(futures_by_rank)

    # Only the rank-0 result is consumed.
    parts, modularity_score = futures_by_rank[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
def _mg_rmat(scale, num_edges, a, b, c, seed, clip_and_flip,
             scramble_vertex_ids, create_using=cugraph.DiGraph):
    """
    Run RMAT generation on every dask worker and gather the edges.

    The total ``num_edges`` is split across workers by
    ``_calc_num_edges_per_worker``.  Worker ``i`` (in scheduler order) is
    given ``seed + i`` as its seed, so the first worker receives ``seed``
    itself.

    Returns
    -------
    The ``dask_cudf.DataFrame`` edge list when ``create_using`` is None;
    otherwise an instance of ``create_using`` populated from that edge list
    using the ``"src"`` and ``"dst"`` columns.
    """
    client = default_client()
    workers = list(client.scheduler_info()['workers'].keys())
    edges_per_worker = _calc_num_edges_per_worker(len(workers), num_edges)

    futures = [
        client.submit(_call_rmat,
                      Comms.get_session_id(),
                      scale,
                      worker_num_edges,
                      a,
                      b,
                      c,
                      seed + rank,  # distinct seed per worker
                      clip_and_flip,
                      scramble_vertex_ids,
                      workers=workers[rank])
        for rank, worker_num_edges in enumerate(edges_per_worker)
    ]
    edgelist = dask_cudf.from_delayed(futures)

    if create_using is None:
        return edgelist

    graph = create_using()
    graph.from_dask_cudf_edgelist(edgelist, source="src", destination="dst")
    return graph
def weakly_connected_components(input_graph):
    """
    Generate the Weakly Connected Components and attach a component label to
    each vertex, using one task per dask worker.

    Parameters
    ----------
    input_graph : graph object
        Must provide ``compute_renumber_edge_list()``, an
        ``edgelist.edgelist_df`` edge list, a ``renumber_map`` carrying the
        renumbered source/destination column names, and
        ``aggregate_segment_offsets``.

    Returns
    -------
    dask_cudf.DataFrame
        Concatenation of the per-worker ``call_wcc`` results; the ``'vertex'``
        column is un-renumbered when ``input_graph.renumbered`` is set.
    """
    client = default_client()

    input_graph.compute_renumber_edge_list()
    edges = input_graph.edgelist.edgelist_df

    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edges)

    dist = get_distributed_data(edges)

    renumber_map = input_graph.renumber_map
    src_col_name = renumber_map.renumbered_src_col_name
    dst_col_name = renumber_map.renumbered_dst_col_name

    futures = [
        client.submit(call_wcc,
                      Comms.get_session_id(),
                      worker_parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      workers=[worker])
        for worker, worker_parts in dist.worker_to_parts.items()
    ]
    wait(futures)

    labels = dask_cudf.from_delayed(futures)
    if not input_graph.renumbered:
        return labels
    return input_graph.unrenumber(labels, 'vertex')
def katz_centrality(input_graph, alpha=None, beta=None, max_iter=100,
                    tol=1.0e-5, nstart=None, normalized=True):
    """
    Compute the Katz centrality for the nodes of the graph on multiple GPUs.

    Parameters
    ----------
    input_graph : cuGraph.Graph
        cuGraph graph descriptor with connectivity information, stored as a
        dask_cudf edge list.
    alpha : float, optional (default=None)
        Attenuation factor.  Forwarded unchanged to ``call_katz_centrality``;
        earlier documentation states that None makes the solver use
        1/(max out degree) - not verifiable from this function.
    beta : optional (default=None)
        A weight scalar.  Forwarded unchanged; documented upstream as
        currently not supported.
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
    tol : float, optional (default=1.0e-5)
        Convergence tolerance of the approximation.  Setting too small a
        tolerance can lead to non-convergence due to numerical roundoff.
    nstart : ignored
        Initial guess for Katz centrality.  Whatever is passed is replaced by
        None before the solver is called.
    normalized : bool, optional (default=True)
        If True normalize the resulting katz centrality values.

    Returns
    -------
    dask_cudf.DataFrame
        Concatenation of the per-worker results.  The ``'vertex'`` column is
        mapped back to external ids when the graph was renumbered.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.katz_centrality(dg)
    >>> Comms.destroy()
    """
    # The caller-supplied initial guess is deliberately discarded.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    # shuffle() also reports partition row/column sizes; they are not needed
    # here but the 5-way unpack is kept so a shape change is still detected.
    (edges,
     num_verts,
     _partition_row_size,
     _partition_col_size,
     vertex_partition_offsets) = shuffle(input_graph, transposed=True)
    num_edges = len(edges)
    dist = get_distributed_data(edges)

    futures = [
        client.submit(call_katz_centrality,
                      Comms.get_session_id(),
                      worker_parts,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      alpha,
                      beta,
                      max_iter,
                      tol,
                      nstart,
                      normalized,
                      workers=[worker])
        for worker, worker_parts in dist.worker_to_parts.items()
    ]
    wait(futures)

    scores = dask_cudf.from_delayed(futures)
    if not input_graph.renumbered:
        return scores
    return input_graph.unrenumber(scores, 'vertex')
def bfs(graph, start, depth_limit=None, return_distances=True):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph, running one task per dask worker.

    Parameters
    ----------
    graph : cugraph.DiGraph
        Graph whose connectivity is held as a dask_cudf edge list.
    start : Integer, cudf.DataFrame or dask_cudf.DataFrame
        Starting vertex.  When the graph is renumbered it is translated to
        the internal id; a DataFrame is looked up using all of its columns
        and only the first resulting id is used.
    depth_limit : Integer or None
        Limit the depth of the search.
    return_distances : bool, optional, default=True
        Indicates if distances should be returned.

    Returns
    -------
    dask_cudf.DataFrame
        Concatenated per-worker ``call_bfs`` results.  For renumbered graphs
        the ``'vertex'`` and ``'predecessor'`` columns are mapped back to
        external ids and nulls in every column are replaced by -1.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> ... Init a DASK Cluster
    >>    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    """
    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    edges = graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(graph)
    # The last offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edges)
    dist = get_distributed_data(edges)

    if graph.renumbered:
        # Translate the external start vertex into its internal id.
        if isinstance(start, (dask_cudf.DataFrame, cudf.DataFrame)):
            internal_ids = graph.lookup_internal_vertex_id(start,
                                                           start.columns)
        else:
            internal_ids = graph.lookup_internal_vertex_id(
                cudf.Series([start]))
        start = internal_ids.compute().iloc[0]

    futures = [
        client.submit(call_bfs,
                      Comms.get_session_id(),
                      worker_parts,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      graph.aggregate_segment_offsets,
                      start,
                      depth_limit,
                      return_distances,
                      workers=[worker])
        for worker, worker_parts in dist.worker_to_parts.items()
    ]
    wait(futures)

    traversal = dask_cudf.from_delayed(futures)
    if not graph.renumbered:
        return traversal

    traversal = graph.unrenumber(traversal, 'vertex')
    traversal = graph.unrenumber(traversal, 'predecessor')
    # presumably predecessors with no external id come back null - verify
    return traversal.fillna(-1)
def renumber(df, src_col_names, dst_col_names, preserve_order=False,
             store_transposed=False):
    """
    Renumber the source/destination columns of ``df`` and build the
    ``NumberMap`` that records the mapping.

    The "legacy" path (an indirection map plus ``add_internal_vertex_id``) is
    taken when ``src_col_names`` is a list, when the source column dtype is
    neither int32 nor int64, or when ``is_device_version_less_than((7, 0))``
    is true.  Otherwise the "experimental" path simply renames the two
    columns to ``"src"`` / ``"dst"`` before calling the renumber backend.

    Parameters
    ----------
    df : cudf.DataFrame or dask_cudf.DataFrame
        Edge list.  The exact type (not a subclass) selects the single-GPU
        or multi-GPU ``NumberMap`` implementation.
    src_col_names, dst_col_names : str or list of str
        Source / destination column name(s).
    preserve_order : bool, optional (default=False)
        Forwarded to ``add_internal_vertex_id`` on the legacy path.
    store_transposed : bool, optional (default=False)
        Forwarded to the ``NumberMap`` implementation and renumber backend.

    Returns
    -------
    (renumbered_df, renumber_map)

    Raises
    ------
    Exception
        If ``df`` is neither a cudf nor a dask_cudf DataFrame.
    """
    # Short-circuit order matters: the dtype lookup is only valid for a
    # single (non-list) column name.
    legacy = (
        isinstance(src_col_names, list)
        or not (df[src_col_names].dtype == np.int32
                or df[src_col_names].dtype == np.int64)
        or is_device_version_less_than((7, 0))
    )

    renumber_map = NumberMap()
    if not isinstance(src_col_names, list):
        src_col_names = [src_col_names]
        dst_col_names = [dst_col_names]

    if type(df) is cudf.DataFrame:
        impl_cls = NumberMap.SingleGPU
    elif type(df) is dask_cudf.DataFrame:
        impl_cls = NumberMap.MultiGPU
    else:
        raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame")
    renumber_map.implementation = impl_cls(
        df, src_col_names, dst_col_names, renumber_map.id_type,
        store_transposed)
    impl = renumber_map.implementation

    if legacy:
        indirection_map = impl.indirection_map(df,
                                               src_col_names,
                                               dst_col_names)
        df = renumber_map.add_internal_vertex_id(
            df, "src", src_col_names, drop=True,
            preserve_order=preserve_order)
        df = renumber_map.add_internal_vertex_id(
            df, "dst", dst_col_names, drop=True,
            preserve_order=preserve_order)
    else:
        df = df.rename(columns={src_col_names[0]: "src",
                                dst_col_names[0]: "dst"})

    num_edges = len(df)
    is_mnmg = isinstance(df, dask_cudf.DataFrame)

    if is_mnmg:
        client = default_client()
        dist = get_distributed_data(df)

        # Each entry pairs the worker's future with the worker address so the
        # follow-up extraction tasks can be pinned to the same worker.
        submitted = [
            (client.submit(call_renumber,
                           Comms.get_session_id(),
                           worker_parts,
                           num_edges,
                           is_mnmg,
                           store_transposed,
                           workers=[worker]),
             worker)
            for worker, worker_parts in dist.worker_to_parts.items()
        ]
        wait(submitted)

        # call_renumber's result is indexed as [0] = map, [1] = renumbered df.
        def get_renumber_map(pair):
            return pair[0]

        def get_renumbered_df(pair):
            return pair[1]

        renumbering_map = dask_cudf.from_delayed([
            client.submit(get_renumber_map, future, workers=[worker])
            for future, worker in submitted
        ])
        renumbered_df = dask_cudf.from_delayed([
            client.submit(get_renumbered_df, future, workers=[worker])
            for future, worker in submitted
        ])

        if legacy:
            merged = indirection_map.merge(renumbering_map,
                                           right_on='original_ids',
                                           left_on='global_id',
                                           how='right')
            merged = merged.drop(columns=['global_id', 'original_ids'])
            impl.ddf = merged.rename(columns={'new_ids': 'global_id'})
        else:
            impl.ddf = renumbering_map.rename(
                columns={'original_ids': '0', 'new_ids': 'global_id'})
        impl.numbered = True
        return renumbered_df, renumber_map

    # ---- single-GPU path ----
    if is_device_version_less_than((7, 0)):
        # On these devices the legacy path was necessarily taken above, so
        # the indirection map already is the final mapping.
        impl.df = indirection_map
        impl.numbered = True
        return df, renumber_map

    renumbering_map, renumbered_df = c_renumber.renumber(
        df,
        num_edges,
        0,
        Comms.get_default_handle(),
        is_mnmg,
        store_transposed)

    if legacy:
        merged = indirection_map.merge(renumbering_map,
                                       right_on='original_ids',
                                       left_on='id')
        merged = merged.drop(columns=['id', 'original_ids'])
        impl.df = merged.rename(columns={'new_ids': 'id'}, copy=False)
    else:
        impl.df = renumbering_map.rename(
            columns={'original_ids': '0', 'new_ids': 'id'}, copy=False)

    impl.numbered = True
    return renumbered_df, renumber_map
def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100,
             tol=1.0e-5, nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        Graph whose connectivity is held as a dask_cudf edge list.
    alpha : float, optional (default=0.85)
        The damping factor; forwarded unchanged to ``call_pagerank``.
    personalization : DataFrame, optional (default=None)
        Must contain ``'vertex'`` and ``'values'`` columns; both are passed
        through ``null_check``.  For renumbered graphs the ``'vertex'``
        column is converted to internal ids.  The frame is distributed with
        ``get_distributed_data`` and each worker receives the first of its
        own parts.
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
    tol : float, optional (default=1.0e-5)
        Convergence tolerance of the approximation.
    nstart : ignored
        Initial guess for pagerank.  Replaced by None before the solver runs.

    Returns
    -------
    dask_cudf.DataFrame
        Concatenated per-worker results; the ``'vertex'`` column is mapped
        back to external ids when the graph was renumbered.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> ... Init a DASK Cluster
    >>    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    """
    from cugraph.structure.graph_classes import null_check

    # The caller-supplied initial guess is deliberately discarded.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    edges = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edges)
    dist = get_distributed_data(edges)

    has_personalization = personalization is not None
    p_data = None
    if has_personalization:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
        p_data = get_distributed_data(personalization)

    # With personalization every worker gets its own first part of the
    # distributed personalization frame; without it, None is forwarded.
    futures = [
        client.submit(call_pagerank,
                      Comms.get_session_id(),
                      worker_parts,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      alpha,
                      max_iter,
                      tol,
                      (p_data.worker_to_parts[worker][0]
                       if has_personalization else personalization),
                      nstart,
                      workers=[worker])
        for worker, worker_parts in dist.worker_to_parts.items()
    ]
    wait(futures)

    ranks = dask_cudf.from_delayed(futures)
    if not input_graph.renumbered:
        return ranks
    return input_graph.unrenumber(ranks, 'vertex')
def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100,
             tol=1.0e-5, nstart=None, load_balance=True):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        Graph whose connectivity is held as a dask_cudf edge list.
    alpha : float, optional (default=0.85)
        The damping factor; forwarded unchanged to ``call_pagerank``.
    personalization : cudf.DataFrame, optional (default=None)
        Must contain ``'vertex'`` and ``'values'`` columns; both are passed
        through ``null_check``.  For renumbered graphs the ``'vertex'``
        column is converted to internal ids and the result is computed
        before being sent to the workers.
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
    tol : float, optional (default=1.0e-5)
        Convergence tolerance of the approximation.
    nstart : ignored
        Initial guess for pagerank.  Replaced by None before the solver runs.
    load_balance : bool, optional (default=True)
        Passed to ``get_local_data`` when the graph has no cached local data
        partitioned by ``'dst'``.

    Returns
    -------
    The result produced by the rank-0 worker.  For renumbered graphs it is
    un-renumbered on ``'vertex'`` and computed before being returned.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    # The caller-supplied initial guess is deliberately discarded.
    nstart = None

    client = default_client()

    cached = input_graph.local_data
    if cached is not None and cached['by'] == 'dst':
        data = cached['data']
    else:
        data = get_local_data(input_graph, by='dst',
                              load_balance=load_balance)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex").compute()

    # One task per worker, keyed by that worker's rank.
    futures_by_rank = {
        data.worker_info[worker]["rank"]: client.submit(
            call_pagerank,
            Comms.get_session_id(),
            worker_parts,
            data.local_data,
            alpha,
            max_iter,
            tol,
            personalization,
            nstart,
            workers=[worker])
        for worker, worker_parts in data.worker_to_parts.items()
    }
    wait(futures_by_rank)

    # Only the rank-0 result is consumed.
    if not input_graph.renumbered:
        return futures_by_rank[0].result()
    return input_graph.unrenumber(futures_by_rank[0].result(),
                                  'vertex').compute()
def sssp(input_graph, source):
    """
    Compute the distance and predecessors for shortest paths from the
    specified source to all the vertices in the input_graph.

    The input graph must contain its edge list as a dask_cudf dataframe.
    One ``call_sssp`` task is submitted per dask worker and the per-worker
    results are concatenated.

    Parameters
    ----------
    input_graph : directed cugraph.Graph
        cuGraph graph descriptor holding the connectivity information as a
        dask_cudf edge list.
    source : Integer
        Source vertex.  For renumbered graphs it is translated to the
        internal vertex id before being sent to the workers.

    Returns
    -------
    df : dask_cudf.DataFrame
        Per upstream documentation: ``df['vertex']`` is the vertex id,
        ``df['distance']`` the path distance from the source and
        ``df['predecessor']`` the vertex it was reached from.  For renumbered
        graphs ``'vertex'`` and ``'predecessor'`` are mapped back to external
        ids and null predecessors are replaced by -1.

    Raises
    ------
    RuntimeError
        If the graph was not renumbered and its source or destination column
        specification is a non-string iterable (e.g. a list of column names).

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> #... Init a DASK Cluster
    >>> #   see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # chunksize = dcg.get_chunksize(input_data_path)
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize...)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> # df = dcg.sssp(dg, 0)
    """
    # FIXME: Uncomment out the above (broken) example

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=False)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if input_graph.renumbered:
        src_col_name = input_graph.renumber_map.renumbered_src_col_name
        dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

        source = input_graph.lookup_internal_vertex_id(
            cudf.Series([source])).compute()
        source = source.iloc[0]
    else:
        # If the input graph was created with renumbering disabled (Graph(...,
        # renumber=False), the above compute_renumber_edge_list() call will not
        # perform a renumber step and the renumber_map will not have src/dst
        # col names. In that case, the src/dst values specified when reading
        # the edgelist dataframe are to be used, but only if they were single
        # string values (ie. not a list representing multi-columns).
        #
        # A plain str is itself an Iterable, so it has to be excluded
        # explicitly; otherwise every string column name would be rejected.
        src_col_name = input_graph.source_columns
        dst_col_name = input_graph.destination_columns

        if (isinstance(src_col_name, Iterable)
                and not isinstance(src_col_name, str)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string source column name (got: "
                               f"{input_graph.source_columns}). Re-create "
                               "input_graph with either renumbering enabled "
                               "or a source column specified as a string.")
        if (isinstance(dst_col_name, Iterable)
                and not isinstance(dst_col_name, str)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string destination column name (got: "
                               f"{input_graph.destination_columns}). "
                               "Re-create input_graph with either renumbering "
                               "enabled or a destination column specified as "
                               "a string.")

    result = [
        client.submit(call_sssp,
                      Comms.get_session_id(),
                      worker_parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      source,
                      workers=[worker])
        for worker, worker_parts in data.worker_to_parts.items()
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        ddf = input_graph.unrenumber(ddf, 'vertex')
        ddf = input_graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf
def bfs(input_graph, start, depth_limit=None, return_distances=True):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph, running one ``call_bfs`` task per dask worker.

    Parameters
    ----------
    input_graph : directed cugraph.Graph
        cuGraph graph instance holding the connectivity information as a
        dask_cudf edge list.
    start : Integer, cudf.DataFrame or dask_cudf.DataFrame
        Starting vertex.  For renumbered graphs it is matched against each
        worker's partition of the renumber map to obtain internal ids.
    depth_limit : Integer or None, optional (default=None)
        Limit the depth of the search.
    return_distances : bool, optional (default=True)
        Indicates if distances should be returned.

    Returns
    -------
    df : dask_cudf.DataFrame
        Concatenated per-worker results.  For renumbered graphs the
        ``'vertex'`` and ``'predecessor'`` columns are mapped back to
        external ids and nulls in every column are replaced by -1.

    Raises
    ------
    RuntimeError
        If the graph was not renumbered and its source or destination column
        specification is a non-string iterable (e.g. a list of column names).

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # df = dcg.bfs(dg, 0)
    """
    # FIXME: Uncomment out the above (broken) example

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=False)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    def df_merge(df, tmp_df, tmp_col_names):
        # Runs on a worker: df[0] is that worker's first renumber-map part.
        x = df[0].merge(tmp_df, on=tmp_col_names, how='inner')
        return x['global_id']

    if input_graph.renumbered:
        src_col_name = input_graph.renumber_map.renumbered_src_col_name
        dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
        renumber_ddf = input_graph.renumber_map.implementation.ddf
        col_names = input_graph.renumber_map.implementation.col_names

        if isinstance(start, (dask_cudf.DataFrame, cudf.DataFrame)):
            tmp_df = start
            tmp_col_names = start.columns
        else:
            tmp_df = cudf.DataFrame()
            tmp_df["0"] = cudf.Series(start)
            tmp_col_names = ["0"]

        # Align column names and dtypes with the renumber map so the merge
        # in df_merge matches on identical keys.
        tmp_ddf = tmp_df[tmp_col_names].rename(
            columns=dict(zip(tmp_col_names, col_names)))
        for name in col_names:
            tmp_ddf[name] = tmp_ddf[name].astype(renumber_ddf[name].dtype)

        renumber_data = get_distributed_data(renumber_ddf)
        start = [
            client.submit(df_merge,
                          worker_parts,
                          tmp_ddf,
                          col_names,
                          workers=[worker])
            for worker, worker_parts in renumber_data.worker_to_parts.items()
        ]
    else:
        # If the input graph was created with renumbering disabled (Graph(...,
        # renumber=False), the above compute_renumber_edge_list() call will not
        # perform a renumber step and the renumber_map will not have src/dst
        # col names. In that case, the src/dst values specified when reading
        # the edgelist dataframe are to be used, but only if they were single
        # string values (ie. not a list representing multi-columns).
        #
        # A plain str is itself an Iterable, so it has to be excluded
        # explicitly; otherwise every string column name would be rejected.
        src_col_name = input_graph.source_columns
        dst_col_name = input_graph.destination_columns

        if (isinstance(src_col_name, Iterable)
                and not isinstance(src_col_name, str)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string source column name (got: "
                               f"{input_graph.source_columns}). Re-create "
                               "input_graph with either renumbering enabled "
                               "or a source column specified as a string.")
        if (isinstance(dst_col_name, Iterable)
                and not isinstance(dst_col_name, str)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string destination column name (got: "
                               f"{input_graph.destination_columns}). "
                               "Re-create input_graph with either renumbering "
                               "enabled or a destination column specified as "
                               "a string.")
        # NOTE(review): on this path `start` is still the caller's value, yet
        # it is indexed per worker (`start[idx]`) below.  A scalar start will
        # fail there; the form call_bfs expects is not visible here - confirm
        # and adapt before relying on non-renumbered graphs.

    # NOTE(review): `start[idx]` pairs the idx-th edge-list worker with the
    # idx-th renumber-map worker; this assumes both worker_to_parts mappings
    # iterate workers in the same order - confirm.
    result = [
        client.submit(call_bfs,
                      Comms.get_session_id(),
                      worker_parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      start[idx],
                      depth_limit,
                      return_distances,
                      workers=[worker])
        for idx, (worker, worker_parts)
        in enumerate(data.worker_to_parts.items())
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        ddf = input_graph.unrenumber(ddf, 'vertex')
        ddf = input_graph.unrenumber(ddf, 'predecessor')
        ddf = ddf.fillna(-1)
    return ddf
def pagerank(input_graph, alpha=0.85, personalization=None, max_iter=100,
             tol=1.0e-5, nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        Graph whose connectivity is held as a dask_cudf edge list.
    alpha : float, optional (default=0.85)
        The damping factor; forwarded unchanged to ``call_pagerank``.
    personalization : DataFrame, optional (default=None)
        Personalization values with a ``'vertex'`` column.  For renumbered
        graphs the vertex ids are converted to internal ids.  Rows are then
        shuffled so that each row lands in the partition selected by
        ``vertex_partition_offsets``, and every worker receives the first of
        its own parts.
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
    tol : float, optional (default=1.0e-5)
        Convergence tolerance of the approximation.
    nstart : ignored
        Initial guess for pagerank.  Replaced by None before the solver runs.

    Returns
    -------
    dask_cudf.DataFrame
        Concatenated per-worker results; the ``'vertex'`` column is mapped
        back to external ids when the graph was renumbered.

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # pr = dcg.pagerank(dg)
    """
    # The caller-supplied initial guess is deliberately discarded.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    edges = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edges)
    dist = get_distributed_data(edges)

    renumber_map = input_graph.renumber_map
    src_col_name = renumber_map.renumbered_src_col_name
    dst_col_name = renumber_map.renumbered_dst_col_name

    has_personalization = personalization is not None
    p_data = None
    if has_personalization:
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")

        # Function to assign partition id to personalization dataframe
        def _set_partitions_pre(s, divisions):
            partitions = divisions.searchsorted(s, side="right") - 1
            # Values at/after the final division are folded into the last
            # partition.
            partitions[divisions.tail(1).searchsorted(
                s, side="right").astype("bool")] = (len(divisions) - 2)
            return partitions

        # Assign partition id column as per vertex_partition_offsets
        divisions = vertex_partition_offsets
        meta = personalization._meta._constructor_sliced([0])
        partition_ids = personalization[['vertex']].map_partitions(
            _set_partitions_pre, divisions=divisions, meta=meta)
        tagged = personalization.assign(_partitions=partition_ids)

        # Shuffle personalization values according to the partition id
        shuffled = rearrange_by_column(
            tagged,
            "_partitions",
            max_branch=None,
            npartitions=len(divisions) - 1,
            shuffle="tasks",
            ignore_index=False,
        ).drop(columns=["_partitions"])

        p_data = get_distributed_data(shuffled)

    # With personalization every worker gets its own first part of the
    # shuffled personalization frame; without it, None is forwarded.
    futures = [
        client.submit(call_pagerank,
                      Comms.get_session_id(),
                      worker_parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      alpha,
                      max_iter,
                      tol,
                      (p_data.worker_to_parts[worker][0]
                       if has_personalization else personalization),
                      nstart,
                      workers=[worker])
        for worker, worker_parts in dist.worker_to_parts.items()
    ]
    wait(futures)

    ranks = dask_cudf.from_delayed(futures)
    if not input_graph.renumbered:
        return ranks
    return input_graph.unrenumber(ranks, 'vertex')
def louvain(input_graph, max_iter=100, resolution=1.0):
    """
    Run the multi-GPU Louvain method on a distributed graph and return the
    modularity-optimizing vertex partition.

    The Louvain method is described in:

    VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
    community hierarchies in large networks, J Stat Mech P10008 (2008),
    http://arxiv.org/abs/0803.0476

    Parameters
    ----------
    input_graph : cugraph.Graph
        Graph descriptor holding the connectivity information and weights.
        ``compute_renumber_edge_list(transposed=False)`` is invoked on it
        before the algorithm is dispatched to the workers.

    max_iter : integer, optional (default=100)
        Upper bound on the number of levels/iterations of the Louvain
        algorithm. The algorithm stops after at most this many iterations;
        stopping early this way is not an error.

    resolution: float/double, optional (default=1.0)
        Called gamma in the modularity formula, this changes the size
        of the communities.  Higher resolutions lead to more smaller
        communities, lower resolutions lead to fewer larger communities.

    Returns
    -------
    parts : dask_cudf.DataFrame
        Distributed GPU data frame with one row per vertex.

        ddf['vertex'] : the vertex identifiers (un-renumbered when the
            input graph was renumbered)
        ddf['partition'] : the partition assigned to each vertex

    modularity_score : float
        The global modularity score of the partitioning, read from the
        first worker's result.

    Raises
    ------
    NotImplementedError
        If the installed CUDA version is older than 10.2.

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: Uncomment out the above (broken) example

    # MG Louvain currently requires CUDA 10.2 or higher.
    # FIXME: remove this check once RAPIDS drops support for CUDA < 10.2
    if is_cuda_version_less_than((10, 2)):
        raise NotImplementedError("Multi-GPU Louvain is not implemented for "
                                  "this version of CUDA. Ensure CUDA version "
                                  "10.2 or higher is installed.")

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")
    client = default_client()

    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is the total number of vertices
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    number_map = input_graph.renumber_map
    src_col_name = number_map.renumbered_src_col_name
    dst_col_name = number_map.renumbered_dst_col_name

    # One task per worker, pinned to the worker that owns the partition(s)
    futures = [
        client.submit(call_louvain,
                      Comms.get_session_id(),
                      part,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      max_iter,
                      resolution,
                      workers=[worker])
        for worker, part in data.worker_to_parts.items()
    ]
    wait(futures)

    # futures is a list of Futures containing tuples of (DataFrame, mod_score),
    # unpack using separate calls to client.submit with a callable to get
    # individual items.
    # FIXME: look into an alternate way (not returning a tuples, accessing
    # tuples differently, etc.) since multiple client.submit() calls may not be
    # optimal.
    df_futures = [client.submit(op.getitem, fut, 0) for fut in futures]
    mod_score_futures = [client.submit(op.getitem, fut, 1)
                         for fut in futures]

    parts = dask_cudf.from_delayed(df_futures)
    # Each worker should have computed the same mod_score
    mod_score = mod_score_futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, mod_score
def renumber_and_segment(df, src_col_names, dst_col_names,
                         preserve_order=False, store_transposed=False):
    """
    Renumber the source/destination columns of an edge list and return the
    renumbered edge list, the populated NumberMap and the segment offsets.

    Parameters
    ----------
    df : cudf.DataFrame or dask_cudf.DataFrame
        Edge list to renumber. A dask_cudf.DataFrame selects the multi-GPU
        path, a cudf.DataFrame the single-GPU path.

    src_col_names : str or list
        Name, or list of names, of the source vertex column(s). A list, or
        a single column whose dtype is neither int32 nor int64, selects the
        'legacy' path that goes through an indirection map.

    dst_col_names : str or list
        Name, or list of names, of the destination vertex column(s).

    preserve_order : bool, optional (default=False)
        Forwarded to ``add_internal_vertex_id`` on the 'legacy' path only.

    store_transposed : bool, optional (default=False)
        Forwarded to the NumberMap implementation and to the renumber call.

    Returns
    -------
    renumbered_df : cudf.DataFrame or dask_cudf.DataFrame
        The edge list carrying the renumbered source/destination columns.

    renumber_map : NumberMap
        Map whose implementation holds the original-to-internal id table
        and is flagged as ``numbered``.

    segment_offsets : list or object
        Multi-GPU: a flat list concatenating every worker's segment
        offsets. Single-GPU: the offsets exactly as returned by
        ``c_renumber.renumber``.

    Raises
    ------
    TypeError
        If ``df`` is neither a cudf.DataFrame nor a dask_cudf.DataFrame.
    """
    single_column = not isinstance(src_col_names, list)
    if single_column:
        col_dtype = df[src_col_names].dtype
        if col_dtype == np.int32 or col_dtype == np.int64:
            renumber_type = 'experimental'
        else:
            renumber_type = 'legacy'
    else:
        renumber_type = 'legacy'

    renumber_map = NumberMap()
    if single_column:
        # From here on the column names are always handled as lists
        src_col_names = [src_col_names]
        dst_col_names = [dst_col_names]

    # Assign the new src and dst column names to be used in the renumbered
    # dataframe to return (renumbered_src_col_name and
    # renumbered_dst_col_name)
    renumber_map.set_renumbered_col_names(
        src_col_names, dst_col_names, df.columns)

    id_type = df[src_col_names[0]].dtype
    if isinstance(df, cudf.DataFrame):
        implementation_cls = NumberMap.SingleGPU
    elif isinstance(df, dask_cudf.DataFrame):
        implementation_cls = NumberMap.MultiGPU
    else:
        raise TypeError("df must be cudf.DataFrame or dask_cudf.DataFrame")
    renumber_map.implementation = implementation_cls(
        df, src_col_names, dst_col_names, renumber_map.id_type,
        store_transposed
    )

    new_src_name = renumber_map.renumbered_src_col_name
    new_dst_name = renumber_map.renumbered_dst_col_name

    if renumber_type == 'legacy':
        # indirection_map is only defined (and only used) on the legacy path
        indirection_map = renumber_map.implementation.indirection_map(
            df, src_col_names, dst_col_names)
        df = renumber_map.add_internal_vertex_id(
            df, new_src_name, src_col_names,
            drop=True, preserve_order=preserve_order
        )
        df = renumber_map.add_internal_vertex_id(
            df, new_dst_name, dst_col_names,
            drop=True, preserve_order=preserve_order
        )
    else:
        df = df.rename(columns={src_col_names[0]: new_src_name,
                                dst_col_names[0]: new_dst_name})

    num_edges = len(df)
    is_mnmg = isinstance(df, dask_cudf.DataFrame)

    if not is_mnmg:
        # Single-GPU: renumber directly in this process
        renumbering_map, segment_offsets, renumbered_df = \
            c_renumber.renumber(df,
                                renumber_map.renumbered_src_col_name,
                                renumber_map.renumbered_dst_col_name,
                                num_edges,
                                0,
                                Comms.get_default_handle(),
                                is_mnmg,
                                store_transposed)
        if renumber_type == 'legacy':
            merged = indirection_map.merge(
                renumbering_map, right_on='original_ids', left_on='id')
            renumber_map.implementation.df = merged.drop(
                columns=['id', 'original_ids']
            ).rename(columns={'new_ids': 'id'}, copy=False)
        else:
            renumber_map.implementation.df = renumbering_map.rename(
                columns={'original_ids': '0', 'new_ids': 'id'}, copy=False)

        renumber_map.implementation.numbered = True
        return renumbered_df, renumber_map, segment_offsets

    # Multi-GPU: one renumber task per worker, pinned to that worker. Each
    # entry pairs the future with its worker so that the follow-up
    # extraction tasks can be pinned to the same place.
    client = default_client()
    data = get_distributed_data(df)
    result = [
        (client.submit(call_renumber,
                       Comms.get_session_id(),
                       part,
                       renumber_map.renumbered_src_col_name,
                       renumber_map.renumbered_dst_col_name,
                       num_edges,
                       is_mnmg,
                       store_transposed,
                       workers=[worker]),
         worker)
        for worker, part in data.worker_to_parts.items()
    ]
    wait(result)

    # Each worker result is indexed as:
    #   [0] renumbering map, [1] segment offsets, [2] renumbered dataframe
    def get_renumber_map(id_type, data):
        return data[0].astype(id_type)

    def get_segment_offsets(data):
        return data[1]

    def get_renumbered_df(id_type, data):
        renumbered = data[2]
        for col in (renumber_map.renumbered_src_col_name,
                    renumber_map.renumbered_dst_col_name):
            renumbered[col] = renumbered[col].astype(id_type)
        return renumbered

    renumbering_map = dask_cudf.from_delayed([
        client.submit(get_renumber_map, id_type, fut, workers=[worker])
        for fut, worker in result
    ])

    list_of_segment_offsets = client.gather([
        client.submit(get_segment_offsets, fut, workers=[worker])
        for fut, worker in result
    ])
    # Flatten the per-worker offsets into a single list
    aggregate_segment_offsets = [
        offset
        for worker_offsets in list_of_segment_offsets
        for offset in worker_offsets
    ]

    renumbered_df = dask_cudf.from_delayed([
        client.submit(get_renumbered_df, id_type, fut, workers=[worker])
        for fut, worker in result
    ])

    if renumber_type == 'legacy':
        merged = indirection_map.merge(
            renumbering_map,
            right_on='original_ids', left_on='global_id', how='right')
        renumber_map.implementation.ddf = merged.drop(
            columns=['global_id', 'original_ids']
        ).rename(columns={'new_ids': 'global_id'})
    else:
        renumber_map.implementation.ddf = renumbering_map.rename(
            columns={'original_ids': '0', 'new_ids': 'global_id'})

    renumber_map.implementation.numbered = True
    return renumbered_df, renumber_map, aggregate_segment_offsets
def louvain(input_graph, max_iter=100, resolution=1.0):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs

    Parameters
    ----------
    input_graph : cugraph graph
        Graph descriptor whose edge list is distributed;
        ``compute_renumber_edge_list(transposed=False)`` is invoked on it
        before the algorithm is dispatched to the workers.

    max_iter : integer, optional (default=100)
        Forwarded unchanged to the per-worker Louvain call.

    resolution : float, optional (default=1.0)
        Forwarded unchanged to the per-worker Louvain call.

    Returns
    -------
    parts : dask_cudf.DataFrame
        Per-vertex result assembled from every worker's output; its
        'vertex' column is un-renumbered when the graph was renumbered.

    modularity_score : float
        The modularity score reported by the first worker.

    Raises
    ------
    NotImplementedError
        If the installed CUDA version is older than 10.2.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # MG Louvain currently requires CUDA 10.2 or higher.
    # FIXME: remove this check once RAPIDS drops support for CUDA < 10.2
    if is_cuda_version_less_than((10, 2)):
        raise NotImplementedError("Multi-GPU Louvain is not implemented for "
                                  "this version of CUDA. Ensure CUDA version "
                                  "10.2 or higher is installed.")

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")
    client = default_client()

    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    sorted_by_degree = True

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is the total number of vertices
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    # One task per worker, pinned to the worker that owns the partition(s)
    futures = []
    for worker, part in data.worker_to_parts.items():
        futures.append(
            client.submit(call_louvain,
                          Comms.get_session_id(),
                          part,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          sorted_by_degree,
                          max_iter,
                          resolution,
                          workers=[worker])
        )
    wait(futures)

    # futures is a list of Futures containing tuples of (DataFrame, mod_score),
    # unpack using separate calls to client.submit with a callable to get
    # individual items.
    # FIXME: look into an alternate way (not returning a tuples, accessing
    # tuples differently, etc.) since multiple client.submit() calls may not be
    # optimal.
    df_futures = [client.submit(op.getitem, fut, 0) for fut in futures]
    mod_score_futures = [client.submit(op.getitem, fut, 1)
                         for fut in futures]

    parts = dask_cudf.from_delayed(df_futures)
    # Each worker should have computed the same mod_score
    mod_score = mod_score_futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, mod_score
def bfs(graph, start, return_distances=False):
    """
    Run a multi-GPU breadth first traversal from ``start`` and report the
    predecessor (and optionally the distance) of every vertex.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe(edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Starting vertex of the traversal, given as an external vertex id;
        it is translated to the internal id when the graph is renumbered.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex (Only if return_distances is True)

        df['predecessor'] gives the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """
    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    # shuffle() also returns the partition row/column sizes; only the
    # dataframe, vertex count and offsets are needed here.
    (edge_ddf,
     num_verts,
     _partition_row_size,
     _partition_col_size,
     vertex_partition_offsets) = shuffle(graph, transposed=False)
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    if graph.renumbered:
        # Translate the external start vertex into its internal id
        start_series = cudf.Series([start], dtype='int32')
        internal_ids = graph.lookup_internal_vertex_id(start_series).compute()
        start = internal_ids.iloc[0]

    # One task per worker, pinned to the worker that owns the partition(s)
    result = [
        client.submit(call_bfs,
                      Comms.get_session_id(),
                      part,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      start,
                      return_distances,
                      workers=[worker])
        for worker, part in data.worker_to_parts.items()
    ]
    wait(result)
    out_ddf = dask_cudf.from_delayed(result)

    if graph.renumbered:
        out_ddf = graph.unrenumber(out_ddf, 'vertex')
        out_ddf = graph.unrenumber(out_ddf, 'predecessor')
        # Predecessors left null after un-renumbering are reported as -1
        out_ddf["predecessor"] = out_ddf["predecessor"].fillna(-1)

    return out_ddf
def katz_centrality(input_graph,
                    alpha=None,
                    beta=None,
                    max_iter=100,
                    tol=1.0e-5,
                    nstart=None,
                    normalized=True):
    """
    Compute the Katz centrality for the nodes of the graph G.

    Parameters
    ----------
    input_graph : cuGraph.Graph
        cuGraph graph descriptor with connectivity information. The graph can
        contain either directed (DiGraph) or undirected edges (Graph).

    alpha : float, optional (default=None)
        Attenuation factor. If alpha is not specified then
        it is internally calculated as 1/(degree_max) where degree_max is the
        maximum out degree.

        NOTE
            The maximum acceptable value of alpha for convergence
            alpha_max = 1/(lambda_max) where lambda_max is the largest
            eigenvalue of the graph.
            Since lambda_max is always lesser than or equal to degree_max for a
            graph, alpha_max will always be greater than or equal to
            (1/degree_max). Therefore, setting alpha to (1/degree_max) will
            guarantee that it will never exceed alpha_max thus in turn
            fulfilling the requirement for convergence.

    beta : None
        A weight scalar - currently Not Supported

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned. This can
        be used to limit the execution time and do an early exit before the
        solver reaches the convergence tolerance.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 100.

    tol : float, optional (default=1.0e-5)
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0e-6.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 1e-2 and 1e-6 are
        acceptable.

    nstart : dask_cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the initial guess for katz centrality.

        NOTE: this argument is currently IGNORED by the multi-GPU
        implementation: it is replaced by None before the workers are
        called. A warning is emitted when a value is supplied.

        nstart['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        nstart['values'] : dask_cudf.Series
            Contains the katz centrality values of vertices

    normalized : bool, optional (default=True)
        If True normalize the resulting katz centrality values

    Returns
    -------
    katz_centrality : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding katz centrality values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['katz_centrality'] : dask_cudf.Series
            Contains the katz centrality of vertices

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # pr = dcg.katz_centrality(dg)
    """
    # FIXME: Uncomment out the above (broken) example

    # The initial guess is not forwarded to the workers. Keep that
    # behaviour, but tell the caller instead of silently dropping the value.
    if nstart is not None:
        import warnings
        warnings.warn(
            "The 'nstart' argument is currently ignored by the multi-GPU "
            "katz_centrality and has no effect on the result.",
            stacklevel=2,
        )
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last offset is the total number of vertices
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    # One task per worker, pinned to the worker that owns the partition(s)
    result = [
        client.submit(call_katz_centrality,
                      Comms.get_session_id(),
                      part,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      alpha,
                      beta,
                      max_iter,
                      tol,
                      nstart,
                      normalized,
                      workers=[worker])
        for worker, part in data.worker_to_parts.items()
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
def sssp(graph, source):
    """
    Compute, on multiple GPUs, the shortest-path distance and predecessor
    of every vertex relative to the specified source.
    The distances column will store the distance from the source to each
    vertex. The predecessors column will store each vertex's predecessor in
    the shortest path. Vertices that are unreachable will have a distance of
    infinity denoted by the maximum value of the data type and the predecessor
    set as -1. The source vertex's predecessor is also set to -1.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe.
        Undirected Graph not currently supported.
    source : Integer
        Source vertex, given as an external vertex id; it is translated to
        the internal id when the graph is renumbered.

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex

        df['predecessor'] gives the vertex id it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.sssp(dg, 0)
    """
    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    edge_ddf = graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(graph)
    # The last offset is the total number of vertices
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    if graph.renumbered:
        # Translate the external source vertex into its internal id
        internal_ids = graph.lookup_internal_vertex_id(
            cudf.Series([source])).compute()
        source = internal_ids.iloc[0]

    # One task per worker, pinned to the worker that owns the partition(s)
    result = []
    for worker, part in data.worker_to_parts.items():
        result.append(
            client.submit(call_sssp,
                          Comms.get_session_id(),
                          part,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          graph.aggregate_segment_offsets,
                          source,
                          workers=[worker])
        )
    wait(result)
    out_ddf = dask_cudf.from_delayed(result)

    if graph.renumbered:
        out_ddf = graph.unrenumber(out_ddf, 'vertex')
        out_ddf = graph.unrenumber(out_ddf, 'predecessor')
        # Predecessors left null after un-renumbering are reported as -1
        out_ddf["predecessor"] = out_ddf["predecessor"].fillna(-1)

    return out_ddf
def bfs(graph, start, return_distances=False):
    """
    Run a multi-GPU breadth first traversal from ``start`` and report the
    predecessor (and optionally the distance) of every vertex.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe(edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Starting vertex of the traversal, given as an external vertex id;
        it is translated to the internal id when the graph is renumbered.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : cudf.DataFrame
        The result held by the rank-0 worker.

        df['vertex'][i] gives the vertex id of the i'th vertex

        df['distance'][i] gives the path distance for the i'th vertex from the
        starting vertex (Only if return_distances is True)

        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf)
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """
    client = default_client()

    # Reuse the graph's cached local data when it is already keyed by
    # source; otherwise compute it.
    cached = graph.local_data
    if cached is not None and cached['by'] == 'src':
        data = cached['data']
    else:
        data = get_local_data(graph, by='src')

    if graph.renumbered:
        # Translate the external start vertex into its internal id
        internal_ids = graph.lookup_internal_vertex_id(
            cudf.Series([start])).compute()
        start = internal_ids.iloc[0]

    # One task per worker, keyed by the worker's rank
    result = {
        data.worker_info[worker]["rank"]: client.submit(
            call_bfs,
            Comms.get_session_id(),
            part,
            data.local_data,
            start,
            return_distances,
            workers=[worker])
        for worker, part in data.worker_to_parts.items()
    }
    wait(result)

    # Only the rank-0 result is fetched and returned
    df = result[0].result()

    if graph.renumbered:
        df = graph.unrenumber(df, 'vertex').compute()
        df = graph.unrenumber(df, 'predecessor').compute()
        # Predecessors left null after un-renumbering are reported as -1
        df["predecessor"].fillna(-1, inplace=True)

    return df