Exemple #1
0
def jaccard_coefficient(G, ebunch=None):
    """
    NetworkX-compatible entry point for the Jaccard coefficient.  See
    `jaccard` for the underlying computation.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        Graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm). The
        graph should be undirected where an undirected edge is represented by a
        directed edge in both direction. The adjacency list will be computed if
        not already present.
    ebunch : cudf.DataFrame or iterable of vertex pairs, optional
        Pairs of vertices to score.  A cudf.DataFrame (two columns
        representing the pairs) is handed to `jaccard` unchanged; any other
        value is first wrapped in a pandas.DataFrame and copied to the GPU.
        If not provided, the current implementation computes the jaccard
        coefficient for all adjacent vertices in the graph.

    Returns
    -------
    df  : cudf.DataFrame
        GPU data frame of size E (the default) or the size of the given pairs
        (first, second) containing the Jaccard weights. The ordering is
        relative to the adjacency list, or that given by the specified vertex
        pairs.  When the input was a NetworkX graph, the frame is converted
        with `df_edge_score_to_dictionary` before being returned.

        df['source'] : cudf.Series
            The source vertex ID (will be identical to first if specified)
        df['destination'] : cudf.Series
            The destination vertex ID (will be identical to second if
            specified)
        df['jaccard_coeff'] : cudf.Series
            The computed Jaccard coefficient between the source and destination
            vertices

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> df = cugraph.jaccard_coefficient(G)
    """
    G, isNx = check_nx_graph(G)

    vertex_pair = None
    if ebunch is not None:
        if isinstance(ebunch, cudf.DataFrame):
            # Already a GPU frame of pairs: use it as-is.  Previously this
            # was silently dropped whenever G was a cuGraph graph.
            vertex_pair = ebunch
        else:
            # Host-side pairs (e.g. from a NetworkX caller) go through
            # pandas on their way to the GPU.
            vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))

    df = jaccard(G, vertex_pair)

    if isNx is True:
        df = df_edge_score_to_dictionary(df,
                                         k="jaccard_coeff",
                                         src="source",
                                         dst="destination")

    return df
Exemple #2
0
def maximum_spanning_tree(G,
                          weight=None,
                          algorithm="boruvka",
                          ignore_nan=False):
    """
    Build a maximum spanning tree (MST) or forest (MSF) of an undirected
    graph.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        cuGraph graph descriptor with connectivity information.
    weight : string
        Default to the weights in the graph, if the graph edges do not have a
        weight attribute a default weight of 1 will be used.
        Not read by this function's body.
    algorithm : string
        Default to 'boruvka'. The parallel algorithm to use when finding a
        maximum spanning tree.  Not read by this function's body.
    ignore_nan : bool
        Default to False.  Not read by this function's body.

    Returns
    -------
    G_mst : cuGraph.Graph or networkx.Graph
        A graph descriptor with a maximum spanning tree or forest.
        For NetworkX input the result is converted with `cugraph_to_nx`;
        the networkx graph will not have all attributes copied over.
    """
    cu_graph, from_nx = check_nx_graph(G)

    spanning = maximum_spanning_tree_subgraph(cu_graph)

    # Hand back the same flavour of graph the caller supplied.
    return cugraph_to_nx(spanning) if from_nx is True else spanning
Exemple #3
0
def k_truss(G, k):
    """
    Extract the K-Truss subgraph of a graph for a specific k.

    The k-truss of a graph is a subgraph where each edge is part of at least
    (k-2) triangles. K-trusses are used for finding tightly knit groups of
    vertices in a graph. A k-truss is a relaxation of a k-clique in the graph.
    Finding cliques is computationally demanding and finding the maximal
    k-clique is known to be NP-Hard.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        cuGraph graph descriptor with connectivity information. k-Trusses are
        defined for only undirected graphs as they are defined for
        undirected triangle in a graph.

    k : int
        The desired k to be used for extracting the k-truss subgraph.

    Returns
    -------
    G_truss : cuGraph.Graph or networkx.Graph
        A graph descriptor with the k-truss subgraph for the given k.
        For NetworkX input the result is converted with `cugraph_to_nx`;
        the networkx graph will NOT have all attributes copied over.
    """
    cu_graph, from_nx = check_nx_graph(G)

    truss = ktruss_subgraph(cu_graph, k)

    if from_nx is not True:
        return truss

    # NetworkX in, NetworkX out.
    return cugraph_to_nx(truss)
Exemple #4
0
def analyzeClustering_edge_cut(G,
                               n_clusters,
                               clustering,
                               vertex_col_name='vertex',
                               cluster_col_name='cluster'):
    """
    Score a partitioning/clustering by its edge cut.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor (passed through `check_nx_graph` first)
    n_clusters : integer
        Specifies the number of clusters in the given clustering
    clustering : cudf.DataFrame
        The cluster assignment to analyze.
    vertex_col_name : str
        The name of the column in the clustering dataframe identifying
        the external vertex id
    cluster_col_name : str
        The name of the column in the clustering dataframe identifying
        the cluster id

    Returns
    -------
    score : float
        The computed edge cut score

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None)
    >>> df = cugraph.spectralBalancedCutClustering(G, 5)
    >>> score = cugraph.analyzeClustering_edge_cut(G, 5, df,
    ...                                            'vertex', 'cluster')
    """
    cu_graph, _ = check_nx_graph(G)

    assignments = clustering
    if cu_graph.renumbered:
        # The vertex column is rewritten under its own name (drop=True);
        # presumably external ids become internal ids -- see
        # Graph.add_internal_vertex_id.
        assignments = cu_graph.add_internal_vertex_id(assignments,
                                                      vertex_col_name,
                                                      vertex_col_name,
                                                      drop=True)

    # Only the cluster column is handed to the wrapper, so the rows are put
    # in vertex order and re-indexed first.
    ordered = assignments.sort_values(vertex_col_name)
    ordered = ordered.reset_index(drop=True)

    return spectral_clustering_wrapper.analyzeClustering_edge_cut(
        cu_graph, n_clusters, ordered[cluster_col_name])
Exemple #5
0
def strongly_connected_components(G):
    """
    Generate the Strongly Connected Components and attach a component label
    to each vertex.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
      cuGraph graph descriptor, should contain the connectivity information as
      an edge list (edge weights are not used for this algorithm). The graph
      can be either directed or undirected where an undirected edge is
      represented by a directed edge in both directions.
      The adjacency list will be computed if not already present.
      The number of vertices should fit into a 32b int.

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding component identifier.  For
        NetworkX input the frame is converted with `df_score_to_dictionary`
        before being returned.

        df['vertices']
            Contains the vertex identifier
        df['labels']
            The component identifier

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr=None)
    >>> df = cugraph.strongly_connected_components(G)
    """
    cu_graph, from_nx = check_nx_graph(G)

    components = connectivity_wrapper.strongly_connected_components(cu_graph)

    if cu_graph.renumbered:
        components = cu_graph.unrenumber(components, "vertices")

    if from_nx is not True:
        return components

    return df_score_to_dictionary(components, "labels", "vertices")
Exemple #6
0
def core_number(G):
    """
    Compute the core numbers for the nodes of the graph G. A k-core of a graph
    is a maximal subgraph that contains nodes of degree k or more.
    A node has a core number of k if it belongs to a k-core but not to a
    k+1-core.  This call does not support a graph with self-loops and
    parallel edges.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        The graph should contain undirected edges where undirected edges are
        represented as directed edges in both directions. While this graph
        can contain edge weights, they don't participate in the calculation
        of the core numbers.

    Returns
    -------
    df : cudf.DataFrame or python dictionary (in NetworkX input)
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding core number values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['core_number'] : cudf.Series
            Contains the core number of vertices

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> cn = cugraph.core_number(G)
    """
    cu_graph, from_nx = check_nx_graph(G)

    cores = core_number_wrapper.core_number(cu_graph)

    if cu_graph.renumbered:
        cores = cu_graph.unrenumber(cores, "vertex")

    if from_nx is not True:
        return cores

    return df_score_to_dictionary(cores, 'core_number')
Exemple #7
0
def shortest_path(G, source):
    """
    Compute the distance and predecessors for shortest paths from the specified
    source to all the vertices in the graph. The distances column will store
    the distance from the source to each vertex. The predecessors column will
    store each vertex's predecessor in the shortest path. Vertices that are
    unreachable will have a distance of infinity denoted by the maximum value
    of the data type and the predecessor set as -1. The source vertex's
    predecessor is also set to -1. Graphs with negative weight cycles are not
    supported.

    This is a thin wrapper that delegates to `sssp`.

    Parameters
    ----------
    G : cuGraph.Graph or NetworkX.Graph
        cuGraph graph descriptor with connectivity information. Edge weights,
        if present, should be single or double precision floating point values.
    source : int
        Index of the source vertex.

    Returns
    -------
    df : cudf.DataFrame or pandas.DataFrame
        A pandas.DataFrame (via ``to_pandas``) when the input was a NetworkX
        graph, otherwise the frame returned by `sssp`.

        df['vertex']
            vertex id

        df['distance']
            gives the path distance from the starting vertex

        df['predecessor']
            the vertex it was reached from

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> distances = cugraph.shortest_path(G, 0)
    """
    cu_graph, from_nx = check_nx_graph(G)

    paths = sssp(cu_graph, source)

    # NetworkX callers receive a host-side frame.
    return paths.to_pandas() if from_nx is True else paths
Exemple #8
0
def overlap_coefficient(G, ebunch=None):
    """
    NetworkX-similar entry point for the overlap coefficient.  Delegates to
    `overlap`; see 'jaccard' for a description of the calling convention.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        Graph descriptor; it is passed through `check_nx_graph` first.
    ebunch : cudf.DataFrame or iterable of vertex pairs, optional
        Pairs of vertices to score.  A cudf.DataFrame is handed to `overlap`
        unchanged; any other value is first wrapped in a pandas.DataFrame
        and copied to the GPU.  If None, `overlap` is called without
        vertex pairs.

    Returns
    -------
    df : cudf.DataFrame
        The result of `overlap`.  When the input was a NetworkX graph the
        frame is converted with `df_edge_score_to_dictionary`, using the
        'source' and 'destination' columns as the edge and 'overlap_coeff'
        as the score.
    """
    G, isNx = check_nx_graph(G)

    vertex_pair = None
    if ebunch is not None:
        if isinstance(ebunch, cudf.DataFrame):
            # Already a GPU frame of pairs: use it as-is.  Previously this
            # was silently dropped whenever G was a cuGraph graph.
            vertex_pair = ebunch
        else:
            # Host-side pairs (e.g. from a NetworkX caller) go through
            # pandas on their way to the GPU.
            vertex_pair = cudf.from_pandas(pd.DataFrame(ebunch))

    df = overlap(G, vertex_pair)

    if isNx is True:
        df = df_edge_score_to_dictionary(df,
                                         k="overlap_coeff",
                                         src="source",
                                         dst="destination")

    return df
Exemple #9
0
def triangles(G):
    """
    Count the triangles (cycles of length three) in the input graph.

    Unlike NetworkX, this algorithm simply returns the total number of
    triangles and not the number per vertex.

    Parameters
    ----------
    G : cugraph.graph or networkx.Graph
        cuGraph graph descriptor, should contain the connectivity information,
        (edge weights are not used in this algorithm)

    Returns
    -------
    count : int64
        A 64 bit integer whose value gives the number of triangles in the
        graph.

    Raises
    ------
    Exception
        If, after `check_nx_graph`, the graph's type is not exactly `Graph`.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv',
    ...                     delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'],
    ...                     header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> count = cugraph.triangles(G)
    """
    cu_graph, _ = check_nx_graph(G)

    # Exact type match: anything other than a plain Graph is rejected.
    if type(cu_graph) is not Graph:
        raise Exception("input graph must be undirected")

    return triangle_count_wrapper.triangles(cu_graph)
Exemple #10
0
def bfs_edges(G,
              source,
              reverse=False,
              depth_limit=None,
              sort_neighbors=None,
              return_sp_counter=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.  Delegates to `bfs`.

    Parameters
    ----------
    G : cugraph.graph or NetworkX.Graph
        graph descriptor that contains connectivity information
    source : Integer
        The starting vertex index
    reverse : boolean
        If a directed graph, then process edges in a reverse direction
        Currently not implemented
    depth_limit : Int or None
        Limit the depth of the search
        Currently not implemented
    sort_neighbors : None or Function
        Currently not implemented (the value is not read)
    return_sp_counter : bool, optional, default=False
        Indicates if shortest path counters should be returned

    Returns
    -------
    df : cudf.DataFrame or Pandas.DataFrame
        A pandas frame (via ``to_pandas``) for NetworkX input.

        df['vertex'][i] gives the vertex id of the i'th vertex

        df['distance'][i] gives the path distance for the i'th vertex from the
        starting vertex

        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal

        df['sp_counter'][i] gives for the i'th vertex the number of shortest
        path leading to it during traversal (Only if return_sp_counter is True)

    Raises
    ------
    NotImplementedError
        If `reverse` is True or `depth_limit` is not None.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> df = cugraph.bfs_edges(G, 0)
    """
    # Reject the unsupported options before touching the graph at all.
    if reverse is True:
        raise NotImplementedError(
            "reverse processing of graph is currently not supported")

    if depth_limit is not None:
        raise NotImplementedError(
            "depth limit implementation of BFS is not currently supported")

    cu_graph, from_nx = check_nx_graph(G)

    traversal = bfs(cu_graph, source, return_sp_counter)

    return traversal.to_pandas() if from_nx is True else traversal
Exemple #11
0
def hits(G, max_iter=100, tol=1.0e-5, nstart=None, normalized=True):
    """
    Compute HITS hubs and authorities values for each vertex

    The HITS algorithm computes two numbers for a node.  Authorities
    estimates the node value based on the incoming links.  Hubs estimates
    the node value based on outgoing links.

    The cuGraph implementation of HITS is a wrapper around the gunrock
    implementation of HITS.

    Note that the gunrock implementation uses a 2-norm, while networkx
    uses a 1-norm.  The raw scores will be different, but the rank ordering
    should be comparable with networkx.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as an edge list (edge weights are not used for this algorithm).
        The adjacency list will be computed if not already present.
    max_iter : int
        The maximum number of iterations before an answer is returned.
        The gunrock implementation does not currently support tolerance,
        so this will in fact be the number of iterations the HITS algorithm
        executes.
    tol : float
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.  This parameter is not currently supported.
    nstart : cudf.Dataframe
        Not currently supported (the value is not read)
    normalized : bool
        Not currently supported, always used as True (the value is not read)

    Returns
    -------
    HubsAndAuthorities : cudf.DataFrame
        GPU data frame containing three cudf.Series of size V: the vertex
        identifiers and the corresponding hubs values and the corresponding
        authorities values.  For NetworkX input a ``(hubs, authorities)``
        tuple is returned instead, each element produced by
        `df_score_to_dictionary`.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['hubs'] : cudf.Series
            Contains the hubs score
        df['authorities'] : cudf.Series
            Contains the authorities score


    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> hits = cugraph.hits(G, max_iter = 50)
    """
    cu_graph, from_nx = check_nx_graph(G)

    scores = hits_wrapper.hits(cu_graph, max_iter, tol)

    if cu_graph.renumbered:
        scores = cu_graph.unrenumber(scores, "vertex")

    if from_nx is not True:
        return scores

    # NetworkX callers get one converted result per score column.
    hub_scores = df_score_to_dictionary(scores[["vertex", "hubs"]], "hubs")
    authority_scores = df_score_to_dictionary(
        scores[["vertex", "authorities"]], "authorities")
    return (hub_scores, authority_scores)
Exemple #12
0
def spectralBalancedCutClustering(
    G,
    num_clusters,
    num_eigen_vects=2,
    evs_tolerance=0.00001,
    evs_max_iter=100,
    kmean_tolerance=0.00001,
    kmean_max_iter=100,
):
    """
    Cluster/partition the given graph using the spectral balanced cut
    method.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        cuGraph graph descriptor
    num_clusters : integer
         Specifies the number of clusters to find
    num_eigen_vects : integer
         Specifies the number of eigenvectors to use. Must be lower or equal to
         num_clusters.  Default is 2
    evs_tolerance: float
         Specifies the tolerance to use in the eigensolver
         Default is 0.00001
    evs_max_iter: integer
         Specifies the maximum number of iterations for the eigensolver
         Default is 100
    kmean_tolerance: float
         Specifies the tolerance to use in the k-means solver
         Default is 0.00001
    kmean_max_iter: integer
         Specifies the maximum number of iterations for the k-means solver
         Default is 100

    Returns
    -------
    df : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding cluster assignments.  For NetworkX
        input the frame is converted with `df_score_to_dictionary`.

        df['vertex'] : cudf.Series
            contains the vertex identifiers
        df['cluster'] : cudf.Series
            contains the cluster assignments

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> df = cugraph.spectralBalancedCutClustering(G, 5)
    """
    cu_graph, from_nx = check_nx_graph(G)

    solver = spectral_clustering_wrapper.spectralBalancedCutClustering
    clusters = solver(cu_graph,
                      num_clusters,
                      num_eigen_vects,
                      evs_tolerance,
                      evs_max_iter,
                      kmean_tolerance,
                      kmean_max_iter)

    if cu_graph.renumbered:
        clusters = cu_graph.unrenumber(clusters, "vertex")

    if from_nx is not True:
        return clusters

    return df_score_to_dictionary(clusters, "cluster")
Exemple #13
0
def leiden(G, max_iter=100, resolution=1.):
    """
    Compute the modularity optimizing partition of the input graph using the
    Leiden algorithm

    It uses the Leiden method described in:

    Traag, V. A., Waltman, L., & van Eck, N. J. (2019). From Louvain to Leiden:
    guaranteeing well-connected communities. Scientific reports, 9(1), 5233.
    doi: 10.1038/s41598-019-41695-z

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor of type Graph

        The adjacency list will be computed if not already present.

    max_iter : integer
        This controls the maximum number of levels/iterations of the Leiden
        algorithm. When specified the algorithm will terminate after no more
        than the specified number of iterations. No error occurs when the
        algorithm terminates early in this manner.

    resolution: float/double, optional
        Called gamma in the modularity formula, this changes the size
        of the communities.  Higher resolutions lead to more smaller
        communities, lower resolutions lead to fewer larger communities.
        Defaults to 1.

    Returns
    -------
    parts : cudf.DataFrame
        GPU data frame of size V containing two columns the vertex id and the
        partition id it is assigned to.  For NetworkX input the frame is
        converted with `df_score_to_dictionary`.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices

    modularity_score : float
        a floating point number containing the global modularity score of the
        partitioning.

    Raises
    ------
    Exception
        If, after `check_nx_graph`, the graph's type is not exactly `Graph`.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> parts, modularity_score = cugraph.leiden(G)
    """
    G, from_nx = check_nx_graph(G)

    # Exact type match: anything other than a plain Graph is rejected.
    if type(G) is not Graph:
        raise Exception(f"input graph must be undirected was {type(G)}")

    partition, score = leiden_wrapper.leiden(G, max_iter, resolution)

    if G.renumbered:
        partition = G.unrenumber(partition, "vertex")

    if from_nx is True:
        partition = df_score_to_dictionary(partition, "partition")

    return partition, score
def analyzeClustering_modularity(G,
                                 n_clusters,
                                 clustering,
                                 vertex_col_name='vertex',
                                 cluster_col_name='cluster'):
    """
    Score a given partitioning/clustering by its modularity.
    The assumption is that "clustering" is the result of a call
    to a spectral clustering algorithm and contains columns named
    "vertex" and "cluster".

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        graph descriptor. This graph should have edge weights.
    n_clusters : integer
        Specifies the number of clusters in the given clustering
    clustering : cudf.DataFrame
        The cluster assignment to analyze.
    vertex_col_name : str
        The name of the column in the clustering dataframe identifying
        the external vertex id
    cluster_col_name : str
        The name of the column in the clustering dataframe identifying
        the cluster id

    Returns
    -------
    score : float
        The computed modularity score

    Raises
    ------
    Exception
        If `vertex_col_name` or `cluster_col_name` is not exactly of type
        ``str``.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> df = cugraph.spectralBalancedCutClustering(G, 5)
    >>> score = cugraph.analyzeClustering_modularity(G, 5, df)
    """
    # Validate the column names up front, before the graph is inspected.
    if type(vertex_col_name) is not str:
        raise Exception("vertex_col_name must be a string")
    if type(cluster_col_name) is not str:
        raise Exception("cluster_col_name must be a string")

    cu_graph, _ = check_nx_graph(G)

    assignments = clustering
    if cu_graph.renumbered:
        # The vertex column is rewritten under its own name (drop=True);
        # presumably external ids become internal ids -- see
        # Graph.add_internal_vertex_id.
        assignments = cu_graph.add_internal_vertex_id(assignments,
                                                      vertex_col_name,
                                                      vertex_col_name,
                                                      drop=True)

    # Only the cluster column is handed to the wrapper, so order by vertex.
    ordered = assignments.sort_values(vertex_col_name)

    return spectral_clustering_wrapper.analyzeClustering_modularity(
        cu_graph, n_clusters, ordered[cluster_col_name])
Exemple #15
0
def k_core(G, k=None, core_number=None):
    """
    Compute the k-core of the graph G based on the out degree of its nodes. A
    k-core of a graph is a maximal subgraph that contains nodes of degree k or
    more. This call does not support a graph with self-loops and parallel
    edges.

    Parameters
    ----------
    G : cuGraph.Graph or networkx.Graph
        cuGraph graph descriptor with connectivity information. The graph
        should contain undirected edges where undirected edges are represented
        as directed edges in both directions. While this graph can contain edge
        weights, they don't participate in the calculation of the k-core.
    k : int, optional
        Order of the core. This value must not be negative. If set to None, the
        main core is returned (k is taken as the largest core number).
    core_number : cudf.DataFrame, optional
        Precomputed core number of the nodes of the graph G containing two
        cudf.Series of size V: the vertex identifiers and the corresponding
        core number values. If set to None, the core numbers of the nodes are
        calculated internally.

        core_number['vertex'] : cudf.Series
            Contains the vertex identifiers
        core_number['values'] : cudf.Series
            Contains the core number of vertices

    Returns
    -------
    KCoreGraph : cuGraph.Graph or networkx.Graph
        K Core of the input graph.  For NetworkX input the result is
        converted with `cugraph_to_nx`.

    Raises
    ------
    Exception
        If, after `check_nx_graph`, the graph's type is not exactly `Graph`.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> KCoreGraph = cugraph.k_core(G)
    """

    G, isNx = check_nx_graph(G)

    mytype = type(G)

    # Reject unsupported graph types before doing any work (exact type match).
    if mytype is not Graph:
        raise Exception("directed graph not supported")

    if core_number is None:
        # No precomputed values: derive them and expose them under the
        # 'values' column name that the rest of this function reads.
        core_number = core_number_wrapper.core_number(G)
        core_number = core_number.rename(columns={"core_number": "values"},
                                         copy=False)
    elif G.renumbered is True:
        # Caller-supplied core numbers: the 'vertex' column is rewritten in
        # place via add_internal_vertex_id to match the renumbered graph.
        core_number = G.add_internal_vertex_id(core_number,
                                               "vertex",
                                               "vertex",
                                               drop=True)

    if k is None:
        k = core_number["values"].max()

    k_core_df = k_core_wrapper.k_core(G, k, core_number)

    if G.renumbered:
        k_core_df = G.unrenumber(k_core_df, "src")
        k_core_df = G.unrenumber(k_core_df, "dst")

    # The result is the same graph type as the (validated) input.
    KCoreGraph = mytype()

    if G.edgelist.weights:
        KCoreGraph.from_cudf_edgelist(k_core_df,
                                      source="src",
                                      destination="dst",
                                      edge_attr="weight")
    else:
        KCoreGraph.from_cudf_edgelist(k_core_df,
                                      source="src",
                                      destination="dst")

    if isNx is True:
        KCoreGraph = cugraph_to_nx(KCoreGraph)

    return KCoreGraph
def spectralModularityMaximizationClustering(
    G,
    num_clusters,
    num_eigen_vects=2,
    evs_tolerance=0.00001,
    evs_max_iter=100,
    kmean_tolerance=0.00001,
    kmean_max_iter=100,
):
    """
    Cluster/partition the given graph using the spectral modularity
    maximization method.

    Parameters
    ----------
    G : cugraph.Graph or networkx.Graph
        cuGraph graph descriptor. This graph should have edge weights.
    num_clusters : integer
         Specifies the number of clusters to find
    num_eigen_vects : integer
         Specifies the number of eigenvectors to use. Must be lower or equal to
         num_clusters.  Default is 2
    evs_tolerance: float
         Specifies the tolerance to use in the eigensolver.
         Default is 0.00001
    evs_max_iter: integer
         Specifies the maximum number of iterations for the eigensolver.
         Default is 100
    kmean_tolerance: float
         Specifies the tolerance to use in the k-means solver.
         Default is 0.00001
    kmean_max_iter: integer
         Specifies the maximum number of iterations for the k-means solver.
         Default is 100

    Returns
    -------
    df : cudf.DataFrame
        For NetworkX input the frame is converted with
        `df_score_to_dictionary` before being returned.

        df['vertex'] : cudf.Series
            contains the vertex identifiers
        df['cluster'] : cudf.Series
            contains the cluster assignments

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> df = cugraph.spectralModularityMaximizationClustering(G, 5)
    """
    # Argument validation is left to the C++ layer.
    cu_graph, from_nx = check_nx_graph(G)

    solver = (
        spectral_clustering_wrapper.spectralModularityMaximizationClustering
    )
    clusters = solver(cu_graph,
                      num_clusters,
                      num_eigen_vects,
                      evs_tolerance,
                      evs_max_iter,
                      kmean_tolerance,
                      kmean_max_iter)

    if cu_graph.renumbered:
        clusters = cu_graph.unrenumber(clusters, "vertex")

    if from_nx is not True:
        return clusters

    return df_score_to_dictionary(clusters, "cluster")
def subgraph(G, vertices):
    """
    Extract the subgraph of the existing graph induced by the specified
    vertices.  This algorithm works for both directed and undirected graphs,
    it does not actually traverse the edges, simply pulls out any edges that
    are incident on vertices that are both contained in the vertices list.

    Parameters
    ----------
    G : cugraph.Graph
        cuGraph graph descriptor (passed through `check_nx_graph` first)
    vertices : cudf.Series
        Specifies the vertices of the induced subgraph

    Returns
    -------
    Sg : cugraph.Graph
        A graph object containing the subgraph induced by the given vertex set.
        It is built with the same type as the graph being processed; for
        NetworkX input it is converted with `cugraph_to_nx`.

    Examples
    --------
    >>> gdf = cudf.read_csv('datasets/karate.csv',
    ...                     delimiter=' ',
    ...                     dtype=['int32', 'int32', 'float32'],
    ...                     header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(gdf, source='0', destination='1')
    >>> verts = numpy.zeros(3, dtype=numpy.int32)
    >>> verts[0] = 0
    >>> verts[1] = 1
    >>> verts[2] = 2
    >>> sverts = cudf.Series(verts)
    >>> Sg = cugraph.subgraph(G, sverts)
    """
    null_check(vertices)

    cu_graph, from_nx = check_nx_graph(G)

    selected = vertices
    if cu_graph.renumbered:
        selected = cu_graph.lookup_internal_vertex_id(selected)

    # The output graph has the same type as the graph being processed.
    induced = type(cu_graph)()

    edges = subgraph_extraction_wrapper.subgraph(cu_graph, selected)

    if cu_graph.renumbered:
        for endpoint in ("src", "dst"):
            edges = cu_graph.unrenumber(edges, endpoint)

    # Carry the weight column over only when the source graph has weights.
    if cu_graph.edgelist.weights:
        induced.from_cudf_edgelist(edges,
                                   source="src",
                                   destination="dst",
                                   edge_attr="weight")
    else:
        induced.from_cudf_edgelist(edges, source="src", destination="dst")

    return cugraph_to_nx(induced) if from_nx is True else induced
Exemple #18
0
def ecg(input_graph, min_weight=0.05, ensemble_size=16, weight=None):
    """
    Partition a graph with Ensemble Clustering for Graphs (ECG).

    ECG runs truncated Louvain on an ensemble of permutations of the input
    graph, derives edge weights for the input graph from the ensemble
    partitions, and then runs full Louvain on the input graph using those
    weights.

    See https://arxiv.org/abs/1809.05578 for further information.

    Parameters
    ----------
    input_graph : cugraph.Graph or NetworkX Graph
        The graph descriptor should contain the connectivity information
        and weights. The adjacency list will be computed if not already
        present.

    min_weight : floating point
        The minimum value to assign as an edge weight in the ECG algorithm.
        It should be a value in the range [0,1]; it is usually left at the
        default of .05.

    ensemble_size : integer
        The number of graph permutations to use for the ensemble.
        The default value is 16; larger values may produce higher quality
        partitions for some graphs.

    weight : str
        Present for NetworkX compatibility: names the NetworkX data column
        that holds the edge weights. It is forwarded to ``check_nx_graph``.
        Default is None.

    Returns
    -------
    parts : cudf.DataFrame or python dictionary
        GPU data frame of size V containing two columns, the vertex id and
        the partition id it is assigned to. For NetworkX input the frame is
        passed through ``df_score_to_dictionary`` keyed on 'partition'.

        df[vertex] : cudf.Series
            Contains the vertex identifiers
        df[partition] : cudf.Series
            Contains the partition assigned to the vertices

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1', edge_attr='2')
    >>> parts = cugraph.ecg(G)

    """

    graph, from_nx = check_nx_graph(input_graph, weight)

    partition_df = ecg_wrapper.ecg(graph, min_weight, ensemble_size)

    # Report vertices using the caller's ids when the graph was renumbered.
    if graph.renumbered:
        partition_df = graph.unrenumber(partition_df, "vertex")

    # cuGraph callers get the frame as is; only NetworkX callers get the
    # converted form.
    if from_nx is not True:
        return partition_df

    return df_score_to_dictionary(partition_df, 'partition')
Exemple #19
0
def louvain(G, max_iter=100, resolution=1.):
    """
    Find a modularity-optimizing partition of the graph with the Louvain
    method.

    The method is described in:

    VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
    community hierarchies in large networks, J Stat Mech P10008 (2008),
    http://arxiv.org/abs/0803.0476

    Parameters
    ----------
    G : cugraph.Graph or NetworkX Graph
        The graph descriptor should contain the connectivity information
        and weights. The adjacency list will be computed if not already
        present.

    max_iter : integer
        Upper bound on the number of levels/iterations of the Louvain
        algorithm. The algorithm stops after at most this many iterations;
        stopping early this way is not an error.

    resolution: float/double, optional
        Called gamma in the modularity formula, this changes the size
        of the communities.  Higher resolutions lead to more smaller
        communities, lower resolutions lead to fewer larger communities.
        Defaults to 1.

    Returns
    -------
    parts : cudf.DataFrame
        GPU data frame of size V containing two columns: the vertex id and
        the partition id it is assigned to. For NetworkX input the frame is
        passed through ``df_score_to_dictionary`` keyed on "partition".

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['partition'] : cudf.Series
            Contains the partition assigned to the vertices

    modularity_score : float
        a floating point number containing the global modularity score of the
        partitioning.

    Raises
    ------
    Exception
        If the (converted) graph is not exactly of type ``Graph``.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv',
    ...                   delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'],
    ...                   header=None)
    >>> G = cugraph.Graph()
    >>> G.from_cudf_edgelist(M, source='0', destination='1')
    >>> parts, modularity_score = cugraph.louvain(G)
    """

    G, isNx = check_nx_graph(G)

    # NOTE(review): this is an exact type comparison, not isinstance();
    # presumably so that other graph classes are rejected -- confirm before
    # relaxing it.
    if type(G) is not Graph:
        raise Exception("input graph must be undirected")

    partition_df, score = louvain_wrapper.louvain(G, max_iter, resolution)

    # Report vertices using the caller's ids when the graph was renumbered.
    if G.renumbered:
        partition_df = G.unrenumber(partition_df, "vertex")

    result = (
        df_score_to_dictionary(partition_df, "partition")
        if isNx is True
        else partition_df
    )

    return result, score