def overlap(input_graph, vertex_pair=None):
    """
    Compute the Overlap Coefficient between each pair of vertices connected
    by an edge, or between arbitrary pairs of vertices specified by the user.

    The Overlap Coefficient between two sets is the ratio of the volume of
    their intersection to the smaller of their two volumes. In the context
    of graphs, the neighborhood of a vertex is seen as a set, so the Overlap
    Coefficient weight of each edge represents the strength of connection
    between vertices based on the relative similarity of their neighbors.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The adjacency list will be computed if not already
        present.

    vertex_pair : cudf.DataFrame, optional
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the overlap coefficient is computed for the
        given vertex pairs; otherwise it is computed for the pairs of
        vertices connected by an edge. Its first two columns are passed
        through ``null_check`` before the computation.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given
        pairs containing the Overlap coefficients. The ordering is relative
        to the adjacency list, or that given by the specified vertex pairs.

        df['source'] : cudf.Series
            The source vertex ID.
        df['destination'] : cudf.Series
            The destination vertex ID.
        df['overlap_coeff'] : cudf.Series
            The computed Overlap coefficient between the source and
            destination vertices.

    Raises
    ------
    ValueError
        If ``vertex_pair`` is neither ``None`` nor a ``cudf.DataFrame``.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.overlap(G)
    """
    if vertex_pair is not None:
        # isinstance (rather than an exact type comparison) so that
        # subclasses of cudf.DataFrame are accepted as well.
        if not isinstance(vertex_pair, cudf.DataFrame):
            raise ValueError("vertex_pair must be a cudf dataframe")
        # Only the first two columns describe the pairs.
        null_check(vertex_pair[vertex_pair.columns[0]])
        null_check(vertex_pair[vertex_pair.columns[1]])

    # The second wrapper argument is the (unused here) weights slot.
    return overlap_wrapper.overlap(input_graph, None, vertex_pair)
def subgraph(G, vertices):
    """
    Extract the subgraph induced by a set of vertices.

    Works for both directed and undirected graphs. The edges are not
    traversed; every edge whose two endpoints are both contained in
    ``vertices`` is simply pulled out.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor.
    vertices : cudf.Series
        Specifies the vertices of the induced subgraph.

    Returns
    -------
    Sg : cugraph.Graph
        A graph object (of the same type as ``G``) containing the subgraph
        induced by the given vertex set.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> verts = numpy.zeros(3, dtype=numpy.int32)
    >>> verts[0] = 0
    >>> verts[1] = 1
    >>> verts[2] = 2
    >>> sverts = cudf.Series(verts)
    >>> Sg = cugraph.subgraph(G, sverts)
    """
    null_check(vertices)

    # A renumbered graph is addressed through its internal vertex ids.
    if G.renumbered:
        vertices = G.lookup_internal_vertex_id(vertices)

    induced = type(G)()
    edges = subgraph_extraction_wrapper.subgraph(G, vertices)

    # Map both endpoint columns back to the caller's vertex ids.
    if G.renumbered:
        for endpoint in ("src", "dst"):
            edges = G.unrenumber(edges, endpoint)

    # Carry the weight column over only when the source graph has weights.
    edgelist_kwargs = {"source": "src", "destination": "dst"}
    if G.edgelist.weights:
        edgelist_kwargs["edge_attr"] = "weight"
    induced.from_cudf_edgelist(edges, **edgelist_kwargs)

    return induced
def renumber(source_col, dest_col):
    """
    Renumber a (potentially sparse) set of vertex ids into a dense range.

    The source and destination ids are mapped onto contiguous values from 0
    to the number of unique vertices - 1. Input columns can be either int64
    or int32. The output will be mapped to int32, since many of the cugraph
    functions are limited to int32. If the number of unique values in
    source_col and dest_col > 2^31-1 then this function will return an
    error.

    Parameters
    ----------
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.

    Returns
    -------
    source_col : cudf.Series
        The renumbered source column.
    dest_col : cudf.Series
        The renumbered destination column.
    numbering_map : cudf.Series
        This cudf.Series wraps a gdf column of size V (V: number of
        vertices). The gdf column contains a numbering map that maps the
        new ids to the original ids.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> source_col, dest_col, numbering_map = cugraph.renumber(sources,
    ...                                                        destinations)
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(source_col, dest_col, None)
    """
    # Both endpoint columns go through the null check before renumbering.
    for column in (source_col, dest_col):
        csg.null_check(column)

    new_sources, new_destinations, id_map = graph_new_wrapper.renumber(
        source_col, dest_col)
    return new_sources, new_destinations, id_map
def subgraph(G, vertices):
    """
    Extract the subgraph induced by a set of vertices.

    Works for both directed and undirected graphs. The edges are not
    traversed; every edge whose two endpoints are both contained in
    ``vertices`` is simply pulled out.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor.
    vertices : cudf.Series
        Specifies the vertices of the induced subgraph.

    Returns
    -------
    Sg : cugraph.Graph
        A graph object containing the subgraph induced by the given vertex
        set.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(sources, destinations, None)
    >>> verts = numpy.zeros(3, dtype=numpy.int32)
    >>> verts[0] = 0
    >>> verts[1] = 1
    >>> verts[2] = 2
    >>> sverts = cudf.Series(verts)
    >>> Sg = cugraph.subgraph(G, sverts)
    """
    null_check(vertices)

    induced = Graph()
    # The wrapper's return value is not used: the new graph is handed over
    # by pointer and is presumably populated in place by the wrapper.
    subgraph_extraction_wrapper.subgraph(
        G.graph_ptr,
        vertices,
        induced.graph_ptr,
    )
    return induced
def symmetrize(source_col, dest_col, value_col=None):
    """
    Symmetrize a COO edge list so that every edge exists in both directions.

    Takes a COO set of source/destination pairs along with optional
    associated values and creates a new COO set in which all edges exist in
    both directions.

    Parameters
    ----------
    source_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series (optional)
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        For this function the values can be any type, they are not
        examined, just copied.

    Returns
    -------
    tuple of cudf.Series
        ``(source, destination)`` when ``value_col`` is None, otherwise
        ``(source, destination, value)``.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> values = cudf.Series(M['2'])
    >>> src, dst, val = cugraph.symmetrize(sources, destinations, values)
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(src, dst, val)
    """
    with_values = value_col is not None

    csg.null_check(source_col)
    csg.null_check(dest_col)

    coo = cudf.DataFrame({"source": source_col, "destination": dest_col})
    if with_values:
        csg.null_check(value_col)
        # Append the values as the last column of the frame.
        coo.insert(len(coo.columns), "value", value_col)

    symmetric = symmetrize_df(coo, "source", "destination")

    returned_columns = ["source", "destination"]
    if with_values:
        returned_columns.append("value")
    return tuple(symmetric[name] for name in returned_columns)
def traveling_salesperson(
    pos_list,
    restarts=100000,
    beam_search=True,
    k=4,
    nstart=None,
    verbose=False,
):
    """
    Finds an approximate solution to the traveling salesperson problem (TSP).

    cuGraph computes an approximation of the TSP problem using hill climbing
    optimization. The current implementation does not support a weighted
    graph.

    Parameters
    ----------
    pos_list : cudf.DataFrame
        Data frame with initial vertex positions containing three columns:
        'vertex' ids and 'x', 'y' positions.
    restarts : int
        Number of starts to try. The more restarts, the better the solution
        will be approximated. The number of restarts depends on the problem
        size and should be kept low for instances above 2k cities.
    beam_search : bool
        Specify if the initial solution should use KNN for an approximation
        solution.
    k : int
        Beam width to use in the search.
    nstart : int
        Vertex id to use as starting position. Must be one of the ids in
        ``pos_list['vertex']``.
    verbose : bool
        Logs configuration and iterative improvement.

    Returns
    -------
    route : cudf.Series
        cudf.Series of size V containing the ordered list of vertices that
        needs to be visited.
    cost : scalar
        Second value returned by the underlying solver alongside the route
        (presumably the cost of that route; confirm against the wrapper).

    Raises
    ------
    TypeError
        If ``pos_list`` is not a ``cudf.DataFrame``.
    ValueError
        If ``nstart`` is given but is not one of the vertex ids.
    """
    if not isinstance(pos_list, cudf.DataFrame):
        raise TypeError("Instance should be cudf.DataFrame")

    for column in ('vertex', 'x', 'y'):
        null_check(pos_list[column])

    # Test membership on the boolean mask itself. Relying on the truthiness
    # of the filtered frame's index is ambiguous for pandas-style objects.
    if nstart is not None and not (pos_list['vertex'] == nstart).any():
        raise ValueError("nstart should be in vertex ids")

    route, cost = traveling_salesperson_wrapper.traveling_salesperson(
        pos_list,
        restarts,
        beam_search,
        k,
        nstart,
        verbose)
    return route, cost
def symmetrize(source_col, dest_col, value_col=None, multi=False,
               symmetrize=True):
    """
    Symmetrize a COO edge list so that every edge exists in both directions.

    Takes a COO set of source/destination pairs along with optional
    associated values, stored on a single GPU or distributed, and creates a
    new COO set in which all edges exist in both directions.

    Parameters
    ----------
    source_col : cudf.Series or dask_cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the source index for each edge.
        Source indices must be an integer type.
    dest_col : cudf.Series or dask_cudf.Series
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains the destination index for each edge.
        Destination indices must be an integer type.
    value_col : cudf.Series or dask_cudf.Series (optional)
        This cudf.Series wraps a gdf_column of size E (E: number of edges).
        The gdf column contains values associated with this edge.
        For this function the values can be any type, they are not
        examined, just copied.
    multi : bool
        Forwarded to ``symmetrize_df`` on the single-GPU path; not used when
        ``source_col`` is a ``dask_cudf.Series``.
    symmetrize : bool
        Forwarded to ``symmetrize_df`` on the single-GPU path; not used when
        ``source_col`` is a ``dask_cudf.Series``.

    Returns
    -------
    tuple of cudf.Series or dask_cudf.Series
        ``(source, destination)`` when ``value_col`` is None, otherwise
        ``(source, destination, value)``.

    Notes
    -----
    The distributed path is selected by the exact type of ``source_col``.
    Null checks on the source and destination columns are only performed on
    the single-GPU path.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> values = cudf.Series(M['2'])
    >>> src, dst, val = cugraph.symmetrize(sources, destinations, values)
    """
    distributed = type(source_col) is dask_cudf.Series
    with_values = value_col is not None

    if distributed:
        # FIXME convoluted way of just wrapping dask cudf Series in a ddf
        edges = source_col.to_frame()
        edges = edges.rename(columns={source_col.name: "source"})
        edges["destination"] = dest_col
    else:
        edges = cudf.DataFrame(
            {"source": source_col, "destination": dest_col}
        )
        csg.null_check(source_col)
        csg.null_check(dest_col)

    weight_name = None
    if with_values:
        weight_name = "value"
        # Append the values as the last column of the frame.
        edges.insert(len(edges.columns), "value", value_col)

    if distributed:
        symmetric = symmetrize_ddf(
            edges, "source", "destination", weight_name
        ).persist()
    else:
        symmetric = symmetrize_df(edges, "source", "destination",
                                  multi, symmetrize)

    if not with_values:
        return symmetric["source"], symmetric["destination"]
    return (
        symmetric["source"],
        symmetric["destination"],
        symmetric["value"],
    )
def jaccard(input_graph, vertex_pair=None):
    """
    Compute the Jaccard similarity between each pair of vertices connected
    by an edge, or between arbitrary pairs of vertices specified by the user.

    Jaccard similarity is defined between two sets as the ratio of the
    volume of their intersection divided by the volume of their union. In
    the context of graphs, the neighborhood of a vertex is seen as a set.
    The Jaccard similarity weight of each edge represents the strength of
    connection between vertices based on the relative similarity of their
    neighbors.

    NOTE: If the vertex_pair parameter is not specified then the behavior
    of cugraph.jaccard is different from the behavior of
    networkx.jaccard_coefficient.

    cugraph.jaccard, in the absence of a specified vertex pair list, will
    use the edges of the graph to construct a vertex pair list and will
    return the jaccard coefficient for those vertex pairs.

    networkx.jaccard_coefficient, in the absence of a specified vertex
    pair list, will return an upper triangular dense matrix, excluding
    the diagonal as well as vertex pairs that are directly connected
    by an edge in the graph, of jaccard coefficients. Technically, networkx
    returns a lazy iterator across this upper triangular matrix where
    the actual jaccard coefficient is computed when the iterator is
    dereferenced. Computing a dense matrix of results is not feasible
    if the number of vertices in the graph is large (100,000 vertices
    would result in 4.9 billion values in that iterator).

    If your graph is small enough (or you have enough memory and patience)
    you can get the interesting (non-zero) values that are part of the
    networkx solution by doing the following:

    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> pairs = cugraph.get_two_hop_neighbors(G)
    >>> df = cugraph.jaccard(G, pairs)

    But please remember that cugraph will fill the dataframe with the
    entire solution you request, so you'll need enough memory to store the
    2-hop neighborhood dataframe.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The graph should be undirected where an undirected
        edge is represented by a directed edge in both direction.
        The adjacency list will be computed if not already present.

    vertex_pair : cudf.DataFrame, optional
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the jaccard coefficient is computed for the
        given vertex pairs. If the vertex_pair is not provided then the
        current implementation computes the jaccard coefficient for all
        adjacent vertices in the graph.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given
        pairs containing the Jaccard weights. The ordering is relative to
        the adjacency list, or that given by the specified vertex pairs.

        df['source'] : cudf.Series
            The source vertex ID.
        df['destination'] : cudf.Series
            The destination vertex ID.
        df['jaccard_coeff'] : cudf.Series
            The computed Jaccard coefficient between the source and
            destination vertices.

    Raises
    ------
    Exception
        If ``input_graph`` is not exactly of type ``Graph``.
    ValueError
        If ``vertex_pair`` is neither ``None`` nor a ``cudf.DataFrame``.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.jaccard(G)
    """
    # Deliberately an exact type check, not isinstance: judging by the error
    # message, subclasses of Graph are presumably directed and must be
    # rejected (confirm against the Graph class hierarchy).
    if type(input_graph) is not Graph:
        raise Exception("input graph must be undirected")

    # FIXME: Add support for multi-column vertices
    if vertex_pair is not None:
        # isinstance so that subclasses of cudf.DataFrame are accepted.
        if not isinstance(vertex_pair, cudf.DataFrame):
            raise ValueError("vertex_pair must be a cudf dataframe")

        # Snapshot the column names: vertex_pair is rebound inside the loop
        # when the graph is renumbered.
        for col in list(vertex_pair.columns):
            null_check(vertex_pair[col])
            if input_graph.renumbered:
                # Translate this column to internal vertex ids in place.
                vertex_pair = input_graph.add_internal_vertex_id(
                    vertex_pair, col, col)

    # The second wrapper argument is the (unused here) weights slot.
    df = jaccard_wrapper.jaccard(input_graph, None, vertex_pair)

    # Map the result back to the caller's vertex ids.
    if input_graph.renumbered:
        df = input_graph.unrenumber(df, "source")
        df = input_graph.unrenumber(df, "destination")

    return df
def jaccard(input_graph, first=None, second=None):
    """
    Compute the Jaccard similarity between each pair of vertices connected
    by an edge, or between arbitrary pairs of vertices specified by the user.

    Jaccard similarity is defined between two sets as the ratio of the
    volume of their intersection divided by the volume of their union. In
    the context of graphs, the neighborhood of a vertex is seen as a set.
    The Jaccard similarity weight of each edge represents the strength of
    connection between vertices based on the relative similarity of their
    neighbors. If first is specified but second is not, or vice versa, an
    exception will be thrown.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The graph should be undirected where an undirected
        edge is represented by a directed edge in both direction.
        The adjacency list will be computed if not already present.

    first : cudf.Series, optional
        Specifies the first vertices of each pair of vertices to compute
        for, must be specified along with second.

    second : cudf.Series, optional
        Specifies the second vertices of each pair of vertices to compute
        for, must be specified along with first.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given
        pairs (first, second) containing the Jaccard weights. The ordering
        is relative to the adjacency list, or that given by the specified
        vertex pairs.

        df['source'] : cudf.Series
            The source vertex ID (will be identical to first if specified)
        df['destination'] : cudf.Series
            The destination vertex ID (will be identical to second if
            specified)
        df['jaccard_coeff'] : cudf.Series
            The computed Jaccard coefficient between the source and
            destination vertices

    Raises
    ------
    ValueError
        Unless ``first`` and ``second`` are both ``cudf.Series`` or both
        ``None``.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(sources, destinations, None)
    >>> df = cugraph.jaccard(G)
    """
    if first is None and second is None:
        # No explicit pairs: the wrapper works on the graph's own edges.
        pass
    elif isinstance(first, cudf.Series) and isinstance(second, cudf.Series):
        # isinstance so that subclasses of cudf.Series are accepted.
        null_check(first)
        null_check(second)
    else:
        raise ValueError("Specify first and second or neither")

    return jaccard_wrapper.jaccard(input_graph.graph_ptr, first, second)
def force_atlas2(
    input_graph,
    max_iter=500,
    pos_list=None,
    outbound_attraction_distribution=True,
    lin_log_mode=False,
    prevent_overlapping=False,
    edge_weight_influence=1.0,
    jitter_tolerance=1.0,
    barnes_hut_optimize=True,
    barnes_hut_theta=0.5,
    scaling_ratio=2.0,
    strong_gravity_mode=False,
    gravity=1.0,
    verbose=False,
    callback=None,
):
    """
    ForceAtlas2 is a continuous graph layout algorithm for handy network
    visualization.

    NOTE: Peak memory allocation occurs at 17*V.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph graph descriptor with connectivity information.
        Edge weights, if present, should be single or double precision
        floating point values. A directed graph is converted with
        ``to_undirected()`` before the layout is computed.
    max_iter : integer
        This controls the maximum number of levels/iterations of the Force
        Atlas algorithm. When specified the algorithm will terminate after
        no more than the specified number of iterations. No error occurs
        when the algorithm terminates early in this manner. Good short-term
        quality can be achieved with 50-100 iterations. Above 1000
        iterations is discouraged.
    pos_list : cudf.DataFrame
        Data frame with initial vertex positions. Its 'vertex', 'x' and 'y'
        columns are null-checked when it is provided.
    outbound_attraction_distribution : bool
        Distributes attraction along outbound edges. Hubs attract less and
        thus are pushed to the borders.
    lin_log_mode : bool
        Switch Force Atlas model from lin-lin to lin-log. Makes clusters
        more tight.
    prevent_overlapping : bool
        Prevent nodes to overlap. Not supported: passing a true value
        raises an exception.
    edge_weight_influence : float
        How much influence you give to the edges weight. 0 is "no
        influence" and 1 is "normal".
    jitter_tolerance : float
        How much swinging you allow. Above 1 discouraged. Lower gives less
        speed and more precision.
    barnes_hut_optimize : bool
        Forwarded unchanged to the layout wrapper (presumably enables the
        Barnes Hut approximation; confirm against the wrapper).
    barnes_hut_theta : float
        Float between 0 and 1. Tradeoff for speed (1) vs accuracy (0) for
        Barnes Hut only.
    scaling_ratio : float
        How much repulsion you want. More makes a more sparse graph.
        Switching from regular mode to LinLog mode needs a readjustment of
        the scaling parameter.
    strong_gravity_mode : bool
        Forwarded unchanged to the layout wrapper (semantics not visible
        here; confirm against the wrapper).
    gravity : float
        Attracts nodes to the center. Prevents islands from drifting away.
    verbose : bool
        Output convergence info at each iteration.
    callback : GraphBasedDimRedCallback
        An instance of GraphBasedDimRedCallback class to intercept the
        internal state of positions while they are being trained.

        Example of callback usage:
            from cugraph.layout import GraphBasedDimRedCallback

            class CustomCallback(GraphBasedDimRedCallback):
                def on_preprocess_end(self, positions):
                    print(positions.copy_to_host())

                def on_train_end(self, positions):
                    print(positions.copy_to_host())

    Returns
    -------
    pos : cudf.DataFrame
        GPU data frame of size V containing three columns: the vertex
        identifiers and the x and y positions.
    """
    if pos_list is not None:
        for column in ('vertex', 'x', 'y'):
            null_check(pos_list[column])

    if prevent_overlapping:
        raise Exception("Feature not supported")

    # The layout is computed on an undirected view of the graph.
    layout_graph = (
        input_graph.to_undirected()
        if input_graph.is_directed()
        else input_graph
    )

    return force_atlas2_wrapper.force_atlas2(
        layout_graph,
        max_iter=max_iter,
        pos_list=pos_list,
        outbound_attraction_distribution=outbound_attraction_distribution,
        lin_log_mode=lin_log_mode,
        prevent_overlapping=prevent_overlapping,
        edge_weight_influence=edge_weight_influence,
        jitter_tolerance=jitter_tolerance,
        barnes_hut_optimize=barnes_hut_optimize,
        barnes_hut_theta=barnes_hut_theta,
        scaling_ratio=scaling_ratio,
        strong_gravity_mode=strong_gravity_mode,
        gravity=gravity,
        verbose=verbose,
        callback=callback,
    )
def pagerank(G, alpha=0.85, personalization=None, max_iter=100,
             tol=1.0e-5, nstart=None):
    """
    Find the PageRank score for every vertex in a graph.

    cuGraph computes an approximation of the Pagerank eigenvector using the
    power method. The number of iterations depends on the properties of the
    network itself; it increases when the tolerance decreases and/or alpha
    increases toward the limiting value of 1. The user is free to use
    default values or to provide inputs for the initial guess, tolerance
    and maximum number of iterations.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The transposed adjacency list will be computed if not
        already present.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random
        vertex. Alpha should be greater than 0.0 and strictly lower than
        1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned. This
        can be used to limit the execution time and do an early exit before
        the solver reaches the convergence tolerance.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 100.
    tol : float
        Set the tolerance the approximation, this parameter should be a
        small magnitude value.
        The lower the tolerance the better the approximation. If this value
        is 0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : cudf.Dataframe
        GPU Dataframe containing the initial guess for pagerank.

        nstart['vertex'] : cudf.Series
            Subset of vertices of graph for initial guess for pagerank
            values
        nstart['values'] : cudf.Series
            Pagerank values for vertices

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> pr = cugraph.pagerank(G, alpha = 0.85, max_iter = 500, tol = 1.0e-05)
    """
    if personalization is not None:
        for column in ("vertex", "values"):
            null_check(personalization[column])
        # Renumbered graphs are addressed through internal vertex ids.
        if G.renumbered is True:
            personalization = G.add_internal_vertex_id(
                personalization, "vertex", "vertex")

    # Note: unlike personalization, nstart is not null-checked here.
    if nstart is not None and G.renumbered is True:
        nstart = G.add_internal_vertex_id(nstart, "vertex", "vertex")

    scores = pagerank_wrapper.pagerank(
        G, alpha, personalization, max_iter, tol, nstart)

    # Map the result back to the caller's vertex ids.
    return G.unrenumber(scores, "vertex") if G.renumbered else scores
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None,
             load_balance=True):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.

    cuGraph computes an approximation of the Pagerank using the power
    method. The input graph must contain edge list as dask-cudf dataframe
    with one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity
        information as dask cudf edge list dataframe (edge weights are not
        used for this algorithm). Undirected Graph not currently supported.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random
        vertex. Alpha should be greater than 0.0 and strictly lower than
        1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
        NOTE(review): the Python-level default here is 100; confirm which
        default the solver applies.
    tol : float
        Set the tolerance the approximation, this parameter should be a
        small magnitude value.
        The lower the tolerance the better the approximation. If this value
        is 0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. Whatever is passed is overwritten with
        None before the computation.
    load_balance : bool
        Set as True to perform load_balancing after global sorting of
        dask-cudf DataFrame. This ensures that the data is uniformly
        distributed among multiple GPUs to avoid over-loading.

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    # An initial guess is not supported: discard whatever was passed.
    nstart = None

    client = default_client()

    # Reuse data already laid out by destination, otherwise build it.
    already_by_dst = (input_graph.local_data is not None
                      and input_graph.local_data['by'] == 'dst')
    if already_by_dst:
        data = input_graph.local_data['data']
    else:
        data = get_local_data(input_graph, by='dst',
                              load_balance=load_balance)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex").compute()

    # One task per worker, keyed by that worker's rank.
    result = {
        data.worker_info[worker]["rank"]: client.submit(
            call_pagerank,
            Comms.get_session_id(),
            parts,
            data.local_data,
            alpha,
            max_iter,
            tol,
            personalization,
            nstart,
            workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    }
    wait(result)

    # Only the result held under rank 0 is returned to the caller.
    if not input_graph.renumbered:
        return result[0].result()
    return input_graph.unrenumber(result[0].result(), 'vertex').compute()
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.

    cuGraph computes an approximation of the Pagerank using the power
    method. The input graph must contain edge list as dask-cudf dataframe
    with one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity
        information as dask cudf edge list dataframe (edge weights are not
        used for this algorithm). Undirected Graph not currently supported.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random
        vertex. Alpha should be greater than 0.0 and strictly lower than
        1.0.
    personalization : cudf.Dataframe
        GPU Dataframe containing the personalization information.
        Currently not supported.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
        NOTE(review): the Python-level default here is 100; confirm which
        default the solver applies.
    tol : float
        Set the tolerance the approximation, this parameter should be a
        small magnitude value.
        The lower the tolerance the better the approximation. If this value
        is 0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. Whatever is passed is overwritten with
        None before the computation.

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> ... Init a DASK Cluster
    >>    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    ...                            edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    """
    from cugraph.structure.graph import null_check

    # An initial guess is not supported: discard whatever was passed.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last partition offset is used as the total vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    # When personalization is given it is distributed too, and each worker
    # receives the first personalization part that lives on it.
    per_worker_personalization = personalization is not None
    p_data = None
    if per_worker_personalization:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
        p_data = get_distributed_data(personalization)

    result = [
        client.submit(
            call_pagerank,
            Comms.get_session_id(),
            parts,
            num_verts,
            num_edges,
            vertex_partition_offsets,
            alpha,
            max_iter,
            tol,
            (p_data.worker_to_parts[worker][0]
             if per_worker_personalization else personalization),
            nstart,
            workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    ]
    wait(result)

    # Stitch the per-worker results into one distributed frame.
    ddf = dask_cudf.from_delayed(result)
    if not input_graph.renumbered:
        return ddf
    return input_graph.unrenumber(ddf, 'vertex')
def jaccard_w(input_graph, weights, vertex_pair=None):
    """
    Compute the weighted Jaccard similarity between each pair of vertices
    connected by an edge, or between arbitrary pairs of vertices specified
    by the user.

    Jaccard similarity is defined between two sets as the ratio of the
    volume of their intersection divided by the volume of their union. In
    the context of graphs, the neighborhood of a vertex is seen as a set.
    The Jaccard similarity weight of each edge represents the strength of
    connection between vertices based on the relative similarity of their
    neighbors.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The adjacency list will be computed if not already
        present.

    weights : cudf.Series
        Specifies the weights to be used for each vertex.

    vertex_pair : cudf.DataFrame, optional
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the jaccard coefficient is computed for the
        given vertex pairs; otherwise it is computed for the pairs of
        vertices connected by an edge. Its first two columns are passed
        through ``null_check`` before the computation.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given
        pairs containing the Jaccard weights. The ordering is relative to
        the adjacency list, or that given by the specified vertex pairs.

        df['source'] : cudf.Series
            The source vertex ID
        df['destination'] : cudf.Series
            The destination vertex ID
        df['jaccard_coeff'] : cudf.Series
            The computed weighted Jaccard coefficient between the source
            and destination vertices.

    Raises
    ------
    Exception
        If ``input_graph`` is not exactly of type ``Graph``.
    ValueError
        If ``vertex_pair`` is neither ``None`` nor a ``cudf.DataFrame``.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sources = cudf.Series(M['0'])
    >>> destinations = cudf.Series(M['1'])
    >>> weights = cudf.Series(numpy.ones(
    ...     max(sources.max(), destinations.max()) + 1,
    ...     dtype=numpy.float32))
    >>> G = cugraph.Graph()
    >>> G.add_edge_list(sources, destinations, None)
    >>> df = cugraph.jaccard_w(G, weights)
    """
    # Deliberately an exact type check, not isinstance: judging by the error
    # message, subclasses of Graph are presumably directed and must be
    # rejected (confirm against the Graph class hierarchy).
    if type(input_graph) is not Graph:
        raise Exception("input graph must be undirected")

    if vertex_pair is not None:
        # isinstance so that subclasses of cudf.DataFrame are accepted.
        if not isinstance(vertex_pair, cudf.DataFrame):
            raise ValueError("vertex_pair must be a cudf dataframe")
        # Only the first two columns describe the pairs.
        null_check(vertex_pair[vertex_pair.columns[0]])
        null_check(vertex_pair[vertex_pair.columns[1]])

    return jaccard_wrapper.jaccard(input_graph, weights, vertex_pair)