def _convert_df_to_output_type(df, input_type, return_labels):
    """
    Convert a cudf.DataFrame of component labels into the return type that
    matches the type of graph object the caller originally supplied.

    Parameters
    ----------
    df : cudf.DataFrame
        Result frame holding a "vertex" column and a "labels" column.
    input_type : type
        Type of the object that was passed to the public algorithm.
    return_labels : bool
        Only consulted for matrix (cupy/scipy) input types. When true the
        per-vertex labels are returned alongside the component count.

    Returns
    -------
    cudf.DataFrame
        `df` unchanged, when input_type is Graph or DiGraph.
    dict
        Whatever df_score_to_dictionary(df, "labels", "vertex") produces,
        when is_nx_graph_type(input_type) is true.
    int or (int, ndarray)
        SciPy-style result when is_matrix_type(input_type) is true:
        n_components (number of unique labels) and, if return_labels is
        set, the array of labels ordered by vertex.

    Raises
    ------
    TypeError
        If input_type matches none of the supported categories.
    """
    if input_type in [Graph, DiGraph]:
        return df

    if is_nx_graph_type(input_type):
        return df_score_to_dictionary(df, "labels", "vertex")

    if is_matrix_type(input_type):
        # SciPy-style return value:
        #   n_components: int
        #       The number of connected components (unique labels).
        #   labels: ndarray
        #       The length-N array of labels of the connected components.
        n_components = len(df["labels"].unique())
        if not return_labels:
            # Only the count is wanted, so skip the sort entirely.
            return n_components

        # Order by vertex so that position i of the array is the label of
        # the i-th vertex.
        sorted_df = df.sort_values("vertex")
        if is_cp_matrix_type(input_type):
            labels = cp.fromDlpack(sorted_df["labels"].to_dlpack())
        else:
            labels = sorted_df["labels"].to_numpy()
        return (n_components, labels)

    raise TypeError(f"input type {input_type} is not a supported type.")
def test_core_number(graph_file):
    """
    Verify that the core numbers computed through calc_cg_core_number agree
    with those from calc_nx_core_number for the given graph file.
    """
    gc.collect()

    expected = calc_nx_core_number(graph_file)
    cg_frame = calc_cg_core_number(graph_file)

    # The cugraph result is a dataframe; turn it into a dictionary so it can
    # be compared directly against the reference value.
    actual = df_score_to_dictionary(cg_frame, k="core_number")

    assert actual == expected
def strongly_connected_components(G):
    """
    Generate the Strongly Connected Components and attach a component label
    to each vertex.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The graph can be either directed or undirected where an
        undirected edge is represented by a directed edge in both
        directions. The adjacency list will be computed if not already
        present. The number of vertices should fit into a 32b int.

    Returns
    -------
    df : cudf.DataFrame, or dictionary for NetworkX input
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding component identifier. When the
        input was a NetworkX graph, the frame is instead passed through
        df_score_to_dictionary and that result is returned.

        df['vertices']
            Contains the vertex identifier
        df['labels']
            The component identifier

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None)
    >>> df = cugraph.strongly_connected_components(G)
    """
    graph, from_nx = check_nx_graph(G)

    components = connectivity_wrapper.strongly_connected_components(graph)

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        components = graph.unrenumber(components, "vertices")

    if from_nx is not True:
        return components
    return df_score_to_dictionary(components, "labels", "vertices")
def core_number(G):
    """
    Compute the core numbers for the nodes of the graph G.

    A k-core of a graph is a maximal subgraph that contains nodes of degree
    k or more. A node has a core number of k if it belongs to a k-core but
    not to a (k+1)-core. This call does not support a graph with self-loops
    and parallel edges.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph should contain undirected edges where undirected edges
        are represented as directed edges in both directions. While this
        graph can contain edge weights, they don't participate in the
        calculation of the core numbers.

    Returns
    -------
    df : cudf.DataFrame or python dictionary (in NetworkX input)
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding core number values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['core_number'] : cudf.Series
            Contains the core number of vertices

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> cn = cugraph.core_number(G)
    """
    graph, from_nx = ensure_cugraph_obj_for_nx(G)

    cores = core_number_wrapper.core_number(graph)

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        cores = graph.unrenumber(cores, "vertex")

    if from_nx is not True:
        return cores
    return df_score_to_dictionary(cores, "core_number")
def hits(G, max_iter=100, tol=1.0e-5, nstart=None, normalized=True):
    """
    Compute HITS hubs and authorities values for each vertex

    The HITS algorithm computes two numbers for a node. Authorities
    estimates the node value based on the incoming links. Hubs estimates
    the node value based on outgoing links.

    The cuGraph implementation of HITS is a wrapper around the gunrock
    implementation of HITS.

    Note that the gunrock implementation uses a 2-norm, while networkx
    uses a 1-norm. The raw scores will be different, but the rank ordering
    should be comparable with networkx.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list (edge weights are not used for this
        algorithm). The adjacency list will be computed if not already
        present.
    max_iter : int
        The maximum number of iterations before an answer is returned.
        The gunrock implementation does not currently support tolerance,
        so this will in fact be the number of iterations the HITS
        algorithm executes.
    tol : float
        Set the tolerance of the approximation, this parameter should be a
        small magnitude value. This parameter is not currently supported.
    nstart : cudf.Dataframe
        Not currently supported
    normalized : bool
        Not currently supported, always used as True

    Returns
    -------
    HubsAndAuthorities : cudf.DataFrame
        GPU data frame containing three cudf.Series of size V: the vertex
        identifiers and the corresponding hubs values and the corresponding
        authorities values. When check_nx_graph reports a NetworkX input,
        a tuple (hubs, authorities) of two dictionaries is returned
        instead.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['hubs'] : cudf.Series
            Contains the hubs score
        df['authorities'] : cudf.Series
            Contains the authorities score

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> hits = cugraph.hits(G, max_iter = 50)
    """
    graph, from_nx = check_nx_graph(G)

    scores = hits_wrapper.hits(graph, max_iter, tol)

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        scores = graph.unrenumber(scores, "vertex")

    if from_nx is not True:
        return scores

    # NetworkX callers get one dictionary per score column.
    hubs = df_score_to_dictionary(scores[["vertex", "hubs"]], "hubs")
    authorities = df_score_to_dictionary(
        scores[["vertex", "authorities"]], "authorities"
    )
    return (hubs, authorities)
def pagerank(
    G, alpha=0.85, personalization=None, max_iter=100, tol=1.0e-5,
    nstart=None, weight=None, dangling=None
):
    """
    Find the PageRank score for every vertex in a graph. cuGraph computes
    an approximation of the Pagerank eigenvector using the power method.
    The number of iterations depends on the properties of the network
    itself; it increases when the tolerance decreases and/or alpha
    increases toward the limiting value of 1. The user is free to use
    default values or to provide inputs for the initial guess, tolerance
    and maximum number of iterations.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        cuGraph graph descriptor, should contain the connectivity
        information as an edge list. The transposed adjacency list will be
        computed if not already present.
    alpha : float, optional (default=0.85)
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85. Thus, 1.0-alpha is the
        probability to "teleport" to a random vertex. Alpha should be
        greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the personalization information.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
        This can be used to limit the execution time and do an early exit
        before the solver reaches the convergence tolerance. If this value
        is lower or equal to 0 cuGraph will use the default value, which
        is 100.
    tol : float, optional (default=1e-05)
        Set the tolerance of the approximation, this parameter should be a
        small magnitude value. The lower the tolerance the better the
        approximation. If this value is 0.0f, cuGraph will use the default
        value which is 1.0E-5. Setting too small a tolerance can lead to
        non-convergence due to numerical roundoff. Usually values between
        0.01 and 0.00001 are acceptable.
    nstart : cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the initial guess for pagerank.

        nstart['vertex'] : cudf.Series
            Subset of vertices of graph for initial guess for pagerank
            values
        nstart['values'] : cudf.Series
            Pagerank values for vertices
    weight : str, optional (default=None)
        The attribute column to be used as edge weights if Graph is a
        NetworkX Graph. This parameter is here for NetworkX compatibility
        and is ignored in case of a cugraph.Graph
    dangling : dict, optional (default=None)
        This parameter is here for NetworkX compatibility and ignored

    Returns
    -------
    PageRank : cudf.DataFrame, or dictionary for NetworkX input
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Raises
    ------
    NotImplementedError
        If personalization is given but is not a cudf.DataFrame.

    Examples
    --------
    >>> gdf = cudf.read_csv(datasets_path / 'karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> pr = cugraph.pagerank(G, alpha = 0.85, max_iter = 500, tol = 1.0e-05)
    """
    graph, from_nx = ensure_cugraph_obj_for_nx(G, weight)

    def _to_internal_ids(frame):
        # Attach the graph's internal vertex ids to a user-supplied frame.
        # Nothing to do unless the graph was renumbered.
        if graph.renumbered is not True:
            return frame
        if len(graph.renumber_map.implementation.col_names) > 1:
            # Multi-column vertex ids: every column but the last one is
            # treated as part of the identifier.
            id_cols = frame.columns[:-1].to_list()
        else:
            id_cols = 'vertex'
        return graph.add_internal_vertex_id(frame, "vertex", id_cols)

    if personalization is not None:
        if not isinstance(personalization, cudf.DataFrame):
            raise NotImplementedError(
                "personalization other than a cudf dataframe "
                "currently not supported"
            )
        personalization = _to_internal_ids(personalization)

    if nstart is not None:
        nstart = _to_internal_ids(nstart)

    ranks = pagerank_wrapper.pagerank(
        graph, alpha, personalization, max_iter, tol, nstart
    )

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        ranks = graph.unrenumber(ranks, "vertex")

    if from_nx is not True:
        return ranks
    return df_score_to_dictionary(ranks, 'pagerank')
def leiden(G, max_iter=100, resolution=1.):
    """
    Compute the modularity optimizing partition of the input graph using
    the Leiden algorithm

    It uses the Leiden method described in:

    Traag, V. A., Waltman, L., & van Eck, N. J. (2019). From Louvain to
    Leiden: guaranteeing well-connected communities. Scientific reports,
    9(1), 5233. doi: 10.1038/s41598-019-41695-z

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor of type Graph. The adjacency list will be
        computed if not already present. The input is first passed through
        check_nx_graph.
    max_iter : integer
        This controls the maximum number of levels/iterations of the
        Leiden algorithm. When specified the algorithm will terminate
        after no more than the specified number of iterations. No error
        occurs when the algorithm terminates early in this manner.
    resolution : float/double, optional
        Called gamma in the modularity formula, this changes the size of
        the communities. Higher resolutions lead to more smaller
        communities, lower resolutions lead to fewer larger communities.
        Defaults to 1.

    Returns
    -------
    parts : cudf.DataFrame, or dictionary for NetworkX input
        GPU data frame of size V containing two columns the vertex id and
        the partition id it is assigned to.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices
    modularity_score : float
        a floating point number containing the global modularity score of
        the partitioning.

    Raises
    ------
    Exception
        If the graph (after check_nx_graph) is not exactly of type Graph.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> parts, modularity_score = cugraph.leiden(G)
    """
    graph, from_nx = check_nx_graph(G)

    # Exact type comparison on purpose: anything that is not precisely
    # Graph is rejected as not undirected.
    if type(graph) is not Graph:
        raise Exception(f"input graph must be undirected was {type(graph)}")

    partition, score = leiden_wrapper.leiden(graph, max_iter, resolution)

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        partition = graph.unrenumber(partition, "vertex")

    if from_nx is True:
        partition = df_score_to_dictionary(partition, "partition")

    return partition, score
def louvain(G, max_iter=100, resolution=1.):
    """
    Compute the modularity optimizing partition of the input graph using
    the Louvain method

    It uses the Louvain method described in:

    VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding
    of community hierarchies in large networks, J Stat Mech P10008 (2008),
    http://arxiv.org/abs/0803.0476

    Parameters
    ----------
    G : cugraph.Graph or NetworkX Graph
        The graph descriptor should contain the connectivity information
        and weights. The adjacency list will be computed if not already
        present.
    max_iter : integer, optional (default=100)
        This controls the maximum number of levels/iterations of the
        Louvain algorithm. When specified the algorithm will terminate
        after no more than the specified number of iterations. No error
        occurs when the algorithm terminates early in this manner.
    resolution : float/double, optional (default=1.0)
        Called gamma in the modularity formula, this changes the size of
        the communities. Higher resolutions lead to more smaller
        communities, lower resolutions lead to fewer larger communities.
        Defaults to 1.

    Returns
    -------
    parts : cudf.DataFrame, or dictionary for NetworkX input
        GPU data frame of size V containing two columns the vertex id and
        the partition id it is assigned to.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices
    modularity_score : float
        a floating point number containing the global modularity score of
        the partitioning.

    Raises
    ------
    Exception
        If the graph (after conversion) is not exactly of type Graph.

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv',
    ...                   delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> parts, modularity_score = cugraph.louvain(G)
    """
    graph, from_nx = ensure_cugraph_obj_for_nx(G)

    # Exact type comparison on purpose: anything that is not precisely
    # Graph is rejected as not undirected.
    if type(graph) is not Graph:
        raise Exception("input graph must be undirected")

    partition, score = louvain_wrapper.louvain(graph, max_iter, resolution)

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        partition = graph.unrenumber(partition, "vertex")

    if from_nx is True:
        partition = df_score_to_dictionary(partition, "partition")

    return partition, score
def betweenness_centrality(
    G,
    k=None,
    normalized=True,
    weight=None,
    endpoints=False,
    seed=None,
    result_dtype=np.float64,
):
    """
    Compute the betweenness centrality for all vertices of the graph G.
    Betweenness centrality is a measure of the number of shortest paths
    that pass through a vertex. A vertex with a high betweenness
    centrality score has more paths passing through it and is therefore
    believed to be more important.

    Rather than doing an all-pair shortest path, a sample of k starting
    vertices can be used.

    CuGraph does not currently support the 'endpoints' and 'weight'
    parameters as seen in the corresponding networkX call.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph can be either directed (DiGraph) or undirected (Graph).
        Weights in the graph are ignored, the current implementation uses
        BFS traversals. Use weight parameter if weights need to be
        considered (currently not supported)
    k : int or list or None, optional, default=None
        If k is not None, use k node samples to estimate betweenness.
        Higher values give better approximation.
        If k is a list, use the content of the list for estimation: the
        list should contain vertices identifiers.
        If k is None (the default), all the vertices are used to estimate
        betweenness. Vertices obtained through sampling or defined as a
        list will be used as sources for traversals inside the algorithm.
    normalized : bool, optional
        Default is True.
        If true, the betweenness values are normalized by
        2 / ((n - 1) * (n - 2)) for Graphs (undirected), and
        1 / ((n - 1) * (n - 2)) for DiGraphs (directed graphs)
        where n is the number of nodes in G.
        Normalization will ensure that values are in [0, 1], this
        normalization scales for the highest possible value where one node
        is crossed by every single shortest path.
    weight : cudf.DataFrame, optional, default=None
        Specifies the weights to be used for each edge. Should contain a
        mapping between edges and weights. (Not Supported)
    endpoints : bool, optional, default=False
        If true, include the endpoints in the shortest path counts.
        (Not Supported)
    seed : optional
        If k is specified and k is an integer, use seed to initialize the
        random number generator. Using None as seed relies on
        random.seed() behavior: using current system time.
        If k is either None or list: seed parameter is ignored
    result_dtype : np.float32 or np.float64, optional, default=np.float64
        Indicate the data type of the betweenness centrality scores

    Returns
    -------
    df : cudf.DataFrame or Dictionary if using NetworkX
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding betweenness centrality values.
        Please note that the resulting 'vertex' column might not be in
        ascending order. The Dictionary contains the same two columns.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['betweenness_centrality'] : cudf.Series
            Contains the betweenness centrality of vertices

    Raises
    ------
    NotImplementedError
        If weight is not None.
    TypeError
        If result_dtype is neither np.float32 nor np.float64.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> bc = cugraph.betweenness_centrality(G)
    """
    # Validate the arguments before touching the graph at all.
    if weight is not None:
        raise NotImplementedError("weighted implementation of betweenness "
                                  "centrality not currently supported")

    if result_dtype not in [np.float32, np.float64]:
        raise TypeError("result type can only be np.float32 or np.float64")

    G, isNx = cugraph.utilities.check_nx_graph(G)

    # vertices is intended to be a cuDF series that contains a sampling of
    # k vertices out of the graph.
    vertices = _initialize_vertices(G, k, seed)

    df = betweenness_centrality_wrapper.betweenness_centrality(
        G, normalized, endpoints, weight, vertices, result_dtype
    )

    # Translate internal vertex ids back to the caller's ids if needed.
    if G.renumbered:
        df = G.unrenumber(df, "vertex")

    if isNx is True:
        return df_score_to_dictionary(df, 'betweenness_centrality')
    return df
def ecg(input_graph, min_weight=0.05, ensemble_size=16, weight=None):
    """
    Compute the Ensemble Clustering for Graphs (ECG) partition of the input
    graph. ECG runs truncated Louvain on an ensemble of permutations of the
    input graph, then uses the ensemble partitions to determine weights for
    the input graph. The final result is found by running full Louvain on
    the input graph using the determined weights.

    See https://arxiv.org/abs/1809.05578 for further information.

    Parameters
    ----------
    input_graph : cugraph.Graph or NetworkX Graph
        The graph descriptor should contain the connectivity information
        and weights. The adjacency list will be computed if not already
        present.
    min_weight : floating point
        The minimum value to assign as an edgeweight in the ECG algorithm.
        It should be a value in the range [0,1] usually left as the default
        value of .05
    ensemble_size : integer
        The number of graph permutations to use for the ensemble.
        The default value is 16, larger values may produce higher quality
        partitions for some graphs.
    weight : str
        This parameter is here for NetworkX compatibility and represents
        which NetworkX data column represents Edge weights.
        Default is None

    Returns
    -------
    parts : cudf.DataFrame or python dictionary
        GPU data frame of size V containing two columns, the vertex id and
        the partition id it is assigned to.

        df[vertex] : cudf.Series
            Contains the vertex identifiers
        df[partition] : cudf.Series
            Contains the partition assigned to the vertices

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> parts = cugraph.ecg(G)
    """
    graph, from_nx = check_nx_graph(input_graph, weight)

    partition = ecg_wrapper.ecg(graph, min_weight, ensemble_size)

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        partition = graph.unrenumber(partition, "vertex")

    if from_nx is not True:
        return partition
    return df_score_to_dictionary(partition, 'partition')
def spectralBalancedCutClustering(
    G,
    num_clusters,
    num_eigen_vects=2,
    evs_tolerance=0.00001,
    evs_max_iter=100,
    kmean_tolerance=0.00001,
    kmean_max_iter=100,
):
    """
    Compute a clustering/partitioning of the given graph using the spectral
    balanced cut method.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        Graph descriptor
    num_clusters : integer
        Specifies the number of clusters to find, must be greater than 1
    num_eigen_vects : integer, optional
        Specifies the number of eigenvectors to use. Must be lower or equal
        to num_clusters. Default is 2
    evs_tolerance : float, optional
        Specifies the tolerance to use in the eigensolver.
        Default is 0.00001
    evs_max_iter : integer, optional
        Specifies the maximum number of iterations for the eigensolver.
        Default is 100
    kmean_tolerance : float, optional
        Specifies the tolerance to use in the k-means solver.
        Default is 0.00001
    kmean_max_iter : integer, optional
        Specifies the maximum number of iterations for the k-means solver.
        Default is 100

    Returns
    -------
    df : cudf.DataFrame, or dictionary for NetworkX input
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding cluster assignments.

        df['vertex'] : cudf.Series
            contains the vertex identifiers
        df['cluster'] : cudf.Series
            contains the cluster assignments

    Examples
    --------
    >>> M = cudf.read_csv(datasets_path / 'karate.csv',
    ...                   delimiter = ' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> df = cugraph.spectralBalancedCutClustering(G, 5)
    """
    # No argument validation here: error checking happens in the C++ code.
    graph, from_nx = ensure_cugraph_obj_for_nx(G)

    solver_args = (
        num_clusters,
        num_eigen_vects,
        evs_tolerance,
        evs_max_iter,
        kmean_tolerance,
        kmean_max_iter,
    )
    clusters = spectral_clustering_wrapper.spectralBalancedCutClustering(
        graph, *solver_args
    )

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        clusters = graph.unrenumber(clusters, "vertex")

    if from_nx is not True:
        return clusters
    return df_score_to_dictionary(clusters, "cluster")
def spectralModularityMaximizationClustering(
    G,
    num_clusters,
    num_eigen_vects=2,
    evs_tolerance=0.00001,
    evs_max_iter=100,
    kmean_tolerance=0.00001,
    kmean_max_iter=100,
):
    """
    Compute a clustering/partitioning of the given graph using the spectral
    modularity maximization method.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        cuGraph graph descriptor. This graph should have edge weights.
    num_clusters : integer
        Specifies the number of clusters to find
    num_eigen_vects : integer
        Specifies the number of eigenvectors to use. Must be lower or equal
        to num_clusters. Default is 2
    evs_tolerance : float
        Specifies the tolerance to use in the eigensolver.
        Default is 0.00001
    evs_max_iter : integer
        Specifies the maximum number of iterations for the eigensolver.
        Default is 100
    kmean_tolerance : float
        Specifies the tolerance to use in the k-means solver.
        Default is 0.00001
    kmean_max_iter : integer
        Specifies the maximum number of iterations for the k-means solver.
        Default is 100

    Returns
    -------
    df : cudf.DataFrame, or dictionary for NetworkX input
        df['vertex'] : cudf.Series
            contains the vertex identifiers
        df['cluster'] : cudf.Series
            contains the cluster assignments

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> df = cugraph.spectralModularityMaximizationClustering(G, 5)
    """
    # No argument validation here: error checking happens in the C++ code.
    graph, from_nx = check_nx_graph(G)

    run_solver = (
        spectral_clustering_wrapper.spectralModularityMaximizationClustering
    )
    clusters = run_solver(
        graph,
        num_clusters,
        num_eigen_vects,
        evs_tolerance,
        evs_max_iter,
        kmean_tolerance,
        kmean_max_iter,
    )

    # Translate internal vertex ids back to the caller's ids if needed.
    if graph.renumbered:
        clusters = graph.unrenumber(clusters, "vertex")

    if from_nx is not True:
        return clusters
    return df_score_to_dictionary(clusters, "cluster")