Example #1
0
def renumber(input_graph):
    """
    Run ``call_renumber`` over the edge list stored on ``input_graph``.

    Parameters
    ----------
    input_graph : graph object
        Must expose ``edgelist.edgelist_df`` and ``number_of_vertices()``.

    Returns
    -------
    dask_cudf.DataFrame or the original edge list
        When the edge list is a ``dask_cudf.DataFrame``, a new dask_cudf
        DataFrame assembled from the per-worker ``call_renumber`` results.
        Otherwise the edge list object itself is returned; the value
        returned by ``call_renumber`` is not used in that case.
    """
    client = default_client()

    edge_df = input_graph.edgelist.edgelist_df
    num_edges = len(edge_df)
    is_mnmg = isinstance(edge_df, dask_cudf.DataFrame)
    num_verts = input_graph.number_of_vertices()

    # Single-GPU path: one direct call, then hand back the edge list.
    if not is_mnmg:
        call_renumber(Comms.get_session_id(), edge_df, num_verts, num_edges,
                      is_mnmg)
        return edge_df

    # Multi-GPU path: one task per worker, pinned to the worker that holds
    # the corresponding partitions.
    data = get_distributed_data(edge_df)
    futures = []
    for worker, worker_parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_renumber,
                          Comms.get_session_id(),
                          worker_parts,
                          num_verts,
                          num_edges,
                          is_mnmg,
                          workers=[worker]))
    wait(futures)
    return dask_cudf.from_delayed(futures)
Example #2
0
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs.

    Parameters
    ----------
    input_graph : cugraph graph
        Graph whose ``local_data`` (partitioned by 'src') is used when
        already present; otherwise it is computed with ``get_local_data``.
    max_iter : int, optional (default=100)
        Forwarded unchanged to ``call_louvain``.
    resolution : float, optional (default=1.0)
        Forwarded unchanged to ``call_louvain``.
    load_balance : bool, optional (default=True)
        Forwarded to ``get_local_data`` when local data must be computed.

    Returns
    -------
    parts, modularity_score
        The pair produced by the task whose worker has rank 0. ``parts`` is
        un-renumbered on its "vertex" column when the graph is renumbered.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: import here to prevent circular import: cugraph->louvain
    # wrapper->cugraph/structure->cugraph/dask->dask/louvain->cugraph/structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()

    # Reuse cached local data only when it was partitioned by source.
    cached = input_graph.local_data
    if cached is not None and cached['by'] == 'src':
        data = cached['data']
    else:
        data = get_local_data(input_graph, by='src', load_balance=load_balance)

    # Map each worker's rank to the future running on that worker.
    futures_by_rank = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        futures_by_rank[rank] = client.submit(call_louvain,
                                              Comms.get_session_id(),
                                              worker_parts,
                                              data.local_data,
                                              max_iter,
                                              resolution,
                                              workers=[worker])
    wait(futures_by_rank)

    # Only the rank-0 result is read back.
    parts, modularity_score = futures_by_rank[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
Example #3
0
def _mg_rmat(scale,
             num_edges,
             a,
             b,
             c,
             seed,
             clip_and_flip,
             scramble_vertex_ids,
             create_using=cugraph.DiGraph):
    """
    Generate an RMAT edge list across the workers of the default Dask client.

    One ``_call_rmat`` task is submitted per entry returned by
    ``_calc_num_edges_per_worker``; task ``i`` runs on the i-th worker
    reported by the scheduler and is seeded with ``seed + i``, so worker 0
    receives ``seed`` itself.

    Returns the resulting Dask cuDF DataFrame as-is when ``create_using`` is
    None. Otherwise ``create_using()`` is instantiated, populated from the
    DataFrame using the "src" and "dst" columns, and returned.
    """
    client = default_client()
    worker_list = list(client.scheduler_info()['workers'])
    edges_per_worker = _calc_num_edges_per_worker(len(worker_list), num_edges)

    # Worker i gets its own edge count and a distinct seed (seed + i).
    futures = [
        client.submit(_call_rmat,
                      Comms.get_session_id(),
                      scale,
                      worker_num_edges,
                      a,
                      b,
                      c,
                      seed + i,
                      clip_and_flip,
                      scramble_vertex_ids,
                      workers=worker_list[i])
        for i, worker_num_edges in enumerate(edges_per_worker)
    ]

    ddf = dask_cudf.from_delayed(futures)
    if create_using is None:
        return ddf

    G = create_using()
    G.from_dask_cudf_edgelist(ddf, source="src", destination="dst")
    return G
Example #4
0
def weakly_connected_components(input_graph):
    """
    Generate the Weakly Connected Components and attach a component label to
    each vertex.

    Parameters
    ----------
    input_graph : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix

        Graph or matrix object, which should contain the connectivity
        information

    Returns
    -------
    dask_cudf.DataFrame
        Assembled from the per-worker ``call_wcc`` results. When the graph is
        renumbered, the "vertex" column is un-renumbered before returning.
    """
    client = default_client()

    input_graph.compute_renumber_edge_list()

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last partition offset is used as the vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    renumber_map = input_graph.renumber_map
    src_col_name = renumber_map.renumbered_src_col_name
    dst_col_name = renumber_map.renumbered_dst_col_name

    # One task per worker, pinned to the worker holding the partitions.
    futures = []
    for worker, worker_parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_wcc,
                          Comms.get_session_id(),
                          worker_parts,
                          src_col_name,
                          dst_col_name,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          input_graph.aggregate_segment_offsets,
                          workers=[worker]))
    wait(futures)
    labels = dask_cudf.from_delayed(futures)

    if not input_graph.renumbered:
        return labels
    return input_graph.unrenumber(labels, 'vertex')
Example #5
0
def katz_centrality(input_graph,
                    alpha=None,
                    beta=None,
                    max_iter=100,
                    tol=1.0e-5,
                    nstart=None,
                    normalized=True):
    """
    Compute the Katz centrality for the nodes of the graph G.

    Parameters
    ----------
    input_graph : cuGraph.Graph
        cuGraph graph descriptor with connectivity information. The graph can
        contain either directed (DiGraph) or undirected edges (Graph).
    alpha : float, optional (default=None)
        Attenuation factor. If alpha is not specified then it is internally
        calculated as 1/(degree_max) where degree_max is the maximum out
        degree.
        NOTE : The maximum acceptable value of alpha for convergence is
        alpha_max = 1/(lambda_max) where lambda_max is the largest eigenvalue
        of the graph. Since lambda_max is always less than or equal to
        degree_max for a graph, alpha_max will always be greater than or
        equal to (1/degree_max). Therefore, setting alpha to (1/degree_max)
        guarantees that it never exceeds alpha_max, which fulfils the
        requirement for convergence.
    beta : None
        A weight scalar - currently Not Supported
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned. This
        can be used to limit the execution time and do an early exit before
        the solver reaches the convergence tolerance.
    tol : float, optional (default=1.0e-5)
        Tolerance of the approximation; this should be a small magnitude
        value. The lower the tolerance the better the approximation. Setting
        too small a tolerance can lead to non-convergence due to numerical
        roundoff. Usually values between 1e-2 and 1e-6 are acceptable.
    nstart : dask_cudf.Dataframe, optional (default=None)
        Initial guess for katz centrality ('vertex' and 'values' columns).
        NOTE: this function currently overwrites the argument with None
        before dispatching work, so any value passed here is not used.
    normalized : bool, optional (default=True)
        If True normalize the resulting katz centrality values

    Returns
    -------
    katz_centrality : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding katz centrality values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['katz_centrality'] : dask_cudf.Series
            Contains the katz centrality of vertices

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.katz_centrality(dg)
    >>> Comms.destroy()
    """
    # The caller-supplied initial guess is dropped here; every task below
    # receives None for nstart.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    shuffled = shuffle(input_graph, transposed=True)
    # Only three of the five values returned by shuffle() are used.
    (edge_ddf, num_verts, _row_size, _col_size,
     vertex_partition_offsets) = shuffled
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    # One task per worker, pinned to the worker holding the partitions.
    futures = []
    for worker, worker_parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_katz_centrality,
                          Comms.get_session_id(),
                          worker_parts,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          alpha,
                          beta,
                          max_iter,
                          tol,
                          nstart,
                          normalized,
                          workers=[worker]))
    wait(futures)
    scores = dask_cudf.from_delayed(futures)

    if not input_graph.renumbered:
        return scores
    return input_graph.unrenumber(scores, 'vertex')
Example #6
0
def bfs(graph, start, depth_limit=None, return_distances=True):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.
        When the graph is renumbered, a cudf/dask_cudf DataFrame is also
        accepted and is looked up using all of its columns.
    depth_limit : Integer or None
        Limit the depth of the search
    return_distances : bool, optional, default=True
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex (Only if return_distances is True)

        df['predecessor'] gives the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    """
    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    edge_ddf = graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(graph)
    # The last partition offset is used as the vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]

    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    if graph.renumbered:
        # Translate the external start vertex into its internal id; the
        # first row of the lookup result is used.
        if isinstance(start, (dask_cudf.DataFrame, cudf.DataFrame)):
            internal_ids = graph.lookup_internal_vertex_id(start,
                                                           start.columns)
        else:
            internal_ids = graph.lookup_internal_vertex_id(
                cudf.Series([start]))
        start = internal_ids.compute().iloc[0]

    # One task per worker, pinned to the worker holding the partitions.
    futures = []
    for worker, worker_parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_bfs,
                          Comms.get_session_id(),
                          worker_parts,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          graph.aggregate_segment_offsets,
                          start,
                          depth_limit,
                          return_distances,
                          workers=[worker]))
    wait(futures)
    result_ddf = dask_cudf.from_delayed(futures)

    if graph.renumbered:
        for column in ('vertex', 'predecessor'):
            result_ddf = graph.unrenumber(result_ddf, column)
        # Nulls left after un-renumbering are replaced with -1.
        result_ddf = result_ddf.fillna(-1)
    return result_ddf
Example #7
0
    def renumber(df,
                 src_col_names,
                 dst_col_names,
                 preserve_order=False,
                 store_transposed=False):
        """
        Renumber the source/destination vertices of an edge list dataframe.

        Parameters
        ----------
        df : cudf.DataFrame or dask_cudf.DataFrame
            Edge list. Any other type raises an Exception.
        src_col_names : str or list
            Source column name, or a list of names. A list always selects
            the 'legacy' code path.
        dst_col_names : str or list
            Destination column name(s), matching ``src_col_names``.
        preserve_order : bool, optional (default=False)
            Forwarded to ``add_internal_vertex_id`` on the 'legacy' path.
        store_transposed : bool, optional (default=False)
            Forwarded to the NumberMap implementation and to the renumber
            calls.

        Returns
        -------
        (renumbered_df, renumber_map)
            The renumbered edge list and the populated ``NumberMap``.

        Raises
        ------
        Exception
            If ``df`` is neither a cudf.DataFrame nor a dask_cudf.DataFrame.
        """
        # Pick the code path: 'legacy' for multi-column ids, for a source
        # column whose dtype is not int32/int64, or when
        # is_device_version_less_than((7, 0)) is true; otherwise
        # 'experimental'.
        if isinstance(src_col_names, list):
            renumber_type = 'legacy'
        elif not (df[src_col_names].dtype == np.int32
                  or df[src_col_names].dtype == np.int64):
            renumber_type = 'legacy'
        elif is_device_version_less_than((7, 0)):
            renumber_type = 'legacy'
        else:
            renumber_type = 'experimental'

        renumber_map = NumberMap()
        # Normalize single column names to one-element lists.
        if not isinstance(src_col_names, list):
            src_col_names = [src_col_names]
            dst_col_names = [dst_col_names]
        # NOTE: exact type comparison, so subclasses of these dataframe
        # types fall through to the Exception below.
        if type(df) is cudf.DataFrame:
            renumber_map.implementation = NumberMap.SingleGPU(
                df, src_col_names, dst_col_names, renumber_map.id_type,
                store_transposed)
        elif type(df) is dask_cudf.DataFrame:
            renumber_map.implementation = NumberMap.MultiGPU(
                df, src_col_names, dst_col_names, renumber_map.id_type,
                store_transposed)
        else:
            raise Exception("df must be cudf.DataFrame or dask_cudf.DataFrame")

        if renumber_type == 'legacy':
            # indirection_map is only defined on the legacy path; every
            # later use of it is guarded by a legacy-only condition.
            indirection_map = renumber_map.implementation.\
                              indirection_map(df,
                                              src_col_names,
                                              dst_col_names)
            # Add "src"/"dst" internal-id columns; drop=True is passed for
            # the original columns.
            df = renumber_map.add_internal_vertex_id(
                df,
                "src",
                src_col_names,
                drop=True,
                preserve_order=preserve_order)
            df = renumber_map.add_internal_vertex_id(
                df,
                "dst",
                dst_col_names,
                drop=True,
                preserve_order=preserve_order)
        else:
            # Experimental path: single integer columns are just renamed.
            df = df.rename(columns={
                src_col_names[0]: "src",
                dst_col_names[0]: "dst"
            })

        num_edges = len(df)

        if isinstance(df, dask_cudf.DataFrame):
            is_mnmg = True
        else:
            is_mnmg = False

        if is_mnmg:
            client = default_client()
            data = get_distributed_data(df)
            # One call_renumber task per worker; each entry pairs the future
            # with the worker it was pinned to.
            result = [(client.submit(call_renumber,
                                     Comms.get_session_id(),
                                     wf[1],
                                     num_edges,
                                     is_mnmg,
                                     store_transposed,
                                     workers=[wf[0]]), wf[0])
                      for idx, wf in enumerate(data.worker_to_parts.items())]
            wait(result)

            # Each task result is indexed as [0] = renumber map and
            # [1] = renumbered dataframe. The `data` parameter of these
            # helpers shadows the outer `data` variable.
            def get_renumber_map(data):
                return data[0]

            def get_renumbered_df(data):
                return data[1]

            # Split the per-worker results with follow-up tasks pinned to
            # the same worker as the originating task.
            renumbering_map = dask_cudf.from_delayed([
                client.submit(get_renumber_map, data, workers=[wf])
                for (data, wf) in result
            ])
            renumbered_df = dask_cudf.from_delayed([
                client.submit(get_renumbered_df, data, workers=[wf])
                for (data, wf) in result
            ])
            if renumber_type == 'legacy':
                # Join the indirection map's 'global_id' against
                # 'original_ids' (right join) and let 'new_ids' become the
                # new 'global_id'.
                renumber_map.implementation.ddf = indirection_map.merge(
                    renumbering_map,
                    right_on='original_ids', left_on='global_id',
                    how='right').\
                    drop(columns=['global_id', 'original_ids'])\
                    .rename(columns={'new_ids': 'global_id'})
            else:
                renumber_map.implementation.ddf = renumbering_map.rename(
                    columns={
                        'original_ids': '0',
                        'new_ids': 'global_id'
                    })
            renumber_map.implementation.numbered = True
            return renumbered_df, renumber_map

        else:
            # When is_device_version_less_than((7, 0)) is true,
            # c_renumber.renumber is skipped and the legacy indirection map
            # is stored as-is (that condition always selects the legacy
            # path above, so indirection_map is defined).
            if is_device_version_less_than((7, 0)):
                renumbered_df = df
                renumber_map.implementation.df = indirection_map
                renumber_map.implementation.numbered = True
                return renumbered_df, renumber_map

            renumbering_map, renumbered_df = c_renumber.renumber(
                df, num_edges, 0, Comms.get_default_handle(), is_mnmg,
                store_transposed)
            if renumber_type == 'legacy':
                # Same join as the multi-GPU path, keyed on 'id' and using
                # merge's default join type.
                renumber_map.implementation.df = indirection_map.\
                    merge(renumbering_map,
                          right_on='original_ids', left_on='id').\
                    drop(columns=['id', 'original_ids'])\
                    .rename(columns={'new_ids': 'id'}, copy=False)
            else:
                renumber_map.implementation.df = renumbering_map.rename(
                    columns={
                        'original_ids': '0',
                        'new_ids': 'id'
                    }, copy=False)

            renumber_map.implementation.numbered = True
            return renumbered_df, renumber_map
Example #8
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float, optional (default=0.85)
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : dataframe, optional (default=None)
        Personalization information with 'vertex' and 'values' columns.
        Both columns are null-checked. When the graph is renumbered, the
        'vertex' column is mapped to internal ids. The frame is then
        distributed and each worker receives the first personalization
        partition held by that worker.
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
    tol : float, optional (default=1.0e-5)
        Tolerance of the approximation; this should be a small magnitude
        value. The lower the tolerance the better the approximation.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. The argument is overwritten with None
        before any work is dispatched.

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    """
    from cugraph.structure.graph_classes import null_check

    # Any caller-supplied initial guess is dropped.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last partition offset is used as the vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    personalized = personalization is not None
    if personalized:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
        p_data = get_distributed_data(personalization)

    # One task per worker, pinned to the worker holding the partitions.
    futures = []
    for worker, worker_parts in data.worker_to_parts.items():
        if personalized:
            # First personalization partition living on this worker.
            worker_personalization = p_data.worker_to_parts[worker][0]
        else:
            worker_personalization = personalization
        futures.append(
            client.submit(call_pagerank,
                          Comms.get_session_id(),
                          worker_parts,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          input_graph.aggregate_segment_offsets,
                          alpha,
                          max_iter,
                          tol,
                          worker_personalization,
                          nstart,
                          workers=[worker]))
    wait(futures)
    ranks = dask_cudf.from_delayed(futures)

    if not input_graph.renumbered:
        return ranks
    return input_graph.unrenumber(ranks, 'vertex')
Example #9
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None,
             load_balance=True):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    alpha : float, optional (default=0.85)
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : dataframe, optional (default=None)
        Personalization information with 'vertex' and 'values' columns.
        Both columns are null-checked. When the graph is renumbered, the
        'vertex' column is mapped to internal ids and the result is
        materialized with ``.compute()`` before being sent to the workers.
    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.
    tol : float, optional (default=1.0e-5)
        Tolerance of the approximation; this should be a small magnitude
        value. The lower the tolerance the better the approximation.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. The argument is overwritten with None
        before any work is dispatched.
    load_balance : bool, optional (default=True)
        Set as True to perform load_balancing after global sorting of
        dask-cudf DataFrame. This ensures that the data is uniformly
        distributed among multiple GPUs to avoid over-loading.

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values. This is the
        result of the task whose worker has rank 0, with the 'vertex'
        column un-renumbered when the graph is renumbered.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    # Any caller-supplied initial guess is dropped.
    nstart = None

    client = default_client()

    # Reuse cached local data only when it was partitioned by destination.
    cached = input_graph.local_data
    if cached is not None and cached['by'] == 'dst':
        data = cached['data']
    else:
        data = get_local_data(input_graph, by='dst', load_balance=load_balance)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        if input_graph.renumbered is True:
            with_internal_ids = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")
            personalization = with_internal_ids.compute()

    # Map each worker's rank to the future running on that worker.
    futures_by_rank = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        futures_by_rank[rank] = client.submit(call_pagerank,
                                              Comms.get_session_id(),
                                              worker_parts,
                                              data.local_data,
                                              alpha,
                                              max_iter,
                                              tol,
                                              personalization,
                                              nstart,
                                              workers=[worker])
    wait(futures_by_rank)

    # Only the rank-0 result is read back.
    if not input_graph.renumbered:
        return futures_by_rank[0].result()

    rank0_result = futures_by_rank[0].result()
    return input_graph.unrenumber(rank0_result, 'vertex').compute()
Example #10
0
def sssp(input_graph, source):
    """
    Compute the distance and predecessors for shortest paths from the specified
    source to all the vertices in the input_graph. The distances column will
    store the distance from the source to each vertex. The predecessors column
    will store each vertex's predecessor in the shortest path. Vertices that
    are unreachable will have a distance of infinity denoted by the maximum
    value of the data type and the predecessor set as -1. The source vertex's
    predecessor is also set to -1.  The input graph must contain edge list as
    dask-cudf dataframe with one partition per GPU.

    Parameters
    ----------
    input_graph : directed cugraph.Graph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe.
        Undirected Graph not currently supported.

    source : Integer
        Specify source vertex

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex

        df['predecessor'] gives the vertex id it was
        reached from in the traversal

    Raises
    ------
    RuntimeError
        If input_graph is not renumbered and its source or destination
        column specification is a non-string iterable (e.g. a list of
        column names).

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> #... Init a DASK Cluster
    >>> #   see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # chunksize = dcg.get_chunksize(input_data_path)
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize...)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> # df = dcg.sssp(dg, 0)
    """
    # FIXME: Uncomment out the above (broken) example

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=False)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last partition offset is used as the vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if input_graph.renumbered:
        src_col_name = input_graph.renumber_map.renumbered_src_col_name
        dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

        # Translate the external source vertex into its internal id.
        source = input_graph.lookup_internal_vertex_id(cudf.Series(
            [source])).compute()
        source = source.iloc[0]
    else:
        # If the input graph was created with renumbering disabled (Graph(...,
        # renumber=False), the above compute_renumber_edge_list() call will not
        # perform a renumber step and the renumber_map will not have src/dst
        # col names. In that case, the src/dst values specified when reading
        # the edgelist dataframe are to be used, but only if they were single
        # string values (ie. not a list representing multi-columns).
        #
        # A plain str is itself an Iterable, so it must be excluded
        # explicitly; otherwise the single-string case this branch exists to
        # support would always be rejected.
        src_col_name = input_graph.source_columns
        dst_col_name = input_graph.destination_columns
        if (not isinstance(src_col_name, str)
                and isinstance(src_col_name, Iterable)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string source column name (got: "
                               f"{input_graph.source_columns}). Re-create "
                               "input_graph with either renumbering enabled "
                               "or a source column specified as a string.")
        if (not isinstance(dst_col_name, str)
                and isinstance(dst_col_name, Iterable)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string destination column name (got: "
                               f"{input_graph.destination_columns}). "
                               "Re-create input_graph with either renumbering "
                               "enabled or a destination column specified as "
                               "a string.")

    # One task per worker, pinned to the worker holding the partitions.
    result = [
        client.submit(call_sssp,
                      Comms.get_session_id(),
                      worker_parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      source,
                      workers=[worker])
        for worker, worker_parts in data.worker_to_parts.items()
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        ddf = input_graph.unrenumber(ddf, 'vertex')
        ddf = input_graph.unrenumber(ddf, 'predecessor')
        # Null predecessors left after un-renumbering are replaced with -1.
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf
Example #11
0
def bfs(input_graph, start, depth_limit=None, return_distances=True):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : directed cugraph.Graph
        cuGraph graph instance, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.

    start : Integer, cudf.DataFrame or dask_cudf.DataFrame
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.
        For a renumbered graph, a dataframe is used as-is (its columns are
        matched positionally against the renumber map columns); any other
        value is first wrapped in a single-column cudf.DataFrame.

    depth_limit : Integer or None, optional (default=None)
        Limit the depth of the search

    return_distances : bool, optional (default=True)
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex (Only if return_distances is True)

        df['predecessor'] gives the vertex it was
        reached from in the traversal

        For a renumbered graph the ids are translated back to the original
        ids and any missing value in the result is replaced with -1.

    Raises
    ------
    RuntimeError
        If input_graph was not renumbered and its source or destination
        column was given as a non-string iterable (e.g. a list of columns).

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # df = dcg.bfs(dg, 0)

    """
    # FIXME: Uncomment out the above (broken) example

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=False)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]

    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    def df_merge(df, tmp_df, tmp_col_names):
        # Runs on a worker: keep only the rows of the local renumber-map
        # partition that match the requested start vertices and return their
        # internal ids.
        x = df[0].merge(tmp_df, on=tmp_col_names, how='inner')
        return x['global_id']

    if input_graph.renumbered:
        src_col_name = input_graph.renumber_map.renumbered_src_col_name
        dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
        renumber_ddf = input_graph.renumber_map.implementation.ddf
        col_names = input_graph.renumber_map.implementation.col_names
        if isinstance(start, (dask_cudf.DataFrame, cudf.DataFrame)):
            tmp_df = start
            tmp_col_names = start.columns
        else:
            tmp_df = cudf.DataFrame()
            tmp_df["0"] = cudf.Series(start)
            tmp_col_names = ["0"]
        # Rename the start columns to the renumber map's column names and
        # align dtypes so the per-worker merge can join on them.
        tmp_ddf = tmp_df[tmp_col_names].rename(
            columns=dict(zip(tmp_col_names, col_names)))
        for name in col_names:
            tmp_ddf[name] = tmp_ddf[name].astype(renumber_ddf[name].dtype)
        renumber_data = get_distributed_data(renumber_ddf)
        # One future per worker holding the internal start ids found in that
        # worker's renumber-map partition.
        start = [
            client.submit(df_merge, parts, tmp_ddf, col_names,
                          workers=[worker])
            for worker, parts in renumber_data.worker_to_parts.items()
        ]
    else:
        # If the input graph was created with renumbering disabled (Graph(...,
        # renumber=False), the above compute_renumber_edge_list() call will not
        # perform a renumber step and the renumber_map will not have src/dst
        # col names. In that case, the src/dst values specified when reading
        # the edgelist dataframe are to be used, but only if they were single
        # string values (ie. not a list representing multi-columns).
        #
        # A str is itself an Iterable, so it has to be excluded explicitly;
        # otherwise the supported single-string case would always be rejected.
        source_columns = input_graph.source_columns
        if (isinstance(source_columns, Iterable)
                and not isinstance(source_columns, str)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string source column name (got: "
                               f"{source_columns}). Re-create "
                               "input_graph with either renumbering enabled "
                               "or a source column specified as a string.")
        destination_columns = input_graph.destination_columns
        if (isinstance(destination_columns, Iterable)
                and not isinstance(destination_columns, str)):
            raise RuntimeError("input_graph was not renumbered but has a "
                               "non-string destination column name (got: "
                               f"{destination_columns}). "
                               "Re-create input_graph with either renumbering "
                               "enabled or a destination column specified as "
                               "a string.")
        src_col_name = source_columns
        dst_col_name = destination_columns
        # NOTE(review): on this path `start` is left exactly as the caller
        # passed it, yet it is indexed per worker (start[idx]) below --
        # confirm what form of `start` is expected for non-renumbered graphs.

    result = [
        client.submit(call_bfs,
                      Comms.get_session_id(),
                      parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      start[idx],
                      depth_limit,
                      return_distances,
                      workers=[worker])
        for idx, (worker, parts) in enumerate(data.worker_to_parts.items())
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        # Translate internal ids back to the caller's original vertex ids.
        ddf = input_graph.unrenumber(ddf, 'vertex')
        ddf = input_graph.unrenumber(ddf, 'predecessor')
        ddf = ddf.fillna(-1)
    return ddf
Example #12
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None):
    """
    Compute PageRank for every vertex of a graph using multiple GPUs.

    The edge list of the input graph must be a dask-cudf dataframe with one
    partition per GPU. One `call_pagerank` task is submitted per worker and
    the per-worker results are assembled into a single dask_cudf.DataFrame.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor holding the connectivity information as a
        dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.

    alpha : float, optional (default=0.85)
        Damping factor: the probability to follow an outgoing edge. Thus
        1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.

    personalization : dataframe, optional (default=None)
        Personalization information.

        personalization['vertex']
            Subset of vertices of graph for personalization
        personalization['values']
            Personalization values for vertices

        When given, rows are shuffled so that each vertex lands in the
        partition selected by the vertex partition offsets.
        NOTE(review): `map_partitions` is called on this object, so a dask
        dataframe appears to be expected -- confirm.

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned.

    tol : float, optional (default=1.0e-5)
        Tolerance of the approximation; should be a small magnitude value.
        The lower the tolerance the better the approximation. Setting too
        small a tolerance can lead to non-convergence due to numerical
        roundoff. Usually values between 0.01 and 0.00001 are acceptable.

    nstart : not supported
        Initial guess for pagerank. Any value passed in is discarded.

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding PageRank values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['pagerank'] : dask_cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # pr = dcg.pagerank(dg)

    """
    # nstart is not supported: whatever the caller supplied is dropped.
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    # Per-worker personalization partitions; stays None when the caller did
    # not ask for personalization.
    p_data = None
    if personalization is not None:
        if input_graph.renumbered is True:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex")

        # Function to assign partition id to personalization dataframe
        def _set_partitions_pre(s, divisions):
            partitions = divisions.searchsorted(s, side="right") - 1
            # Values at or beyond the last division are assigned to the
            # final partition.
            beyond_last = divisions.tail(1).searchsorted(
                s, side="right").astype("bool")
            partitions[beyond_last] = (len(divisions) - 2)
            return partitions

        # Assign partition id column as per vertex_partition_offsets
        divisions = vertex_partition_offsets
        series_meta = personalization._meta._constructor_sliced([0])
        partition_ids = personalization[['vertex']].map_partitions(
            _set_partitions_pre,
            divisions=divisions,
            meta=series_meta)
        tagged = personalization.assign(_partitions=partition_ids)

        # Shuffle personalization values according to the partition id
        shuffled = rearrange_by_column(
            tagged,
            "_partitions",
            max_branch=None,
            npartitions=len(divisions) - 1,
            shuffle="tasks",
            ignore_index=False,
        )
        shuffled = shuffled.drop(columns=["_partitions"])

        p_data = get_distributed_data(shuffled)

    # One task per worker. With personalization each worker receives its own
    # first personalization partition, otherwise None.
    result = [
        client.submit(call_pagerank,
                      Comms.get_session_id(),
                      parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      alpha,
                      max_iter,
                      tol,
                      (None if p_data is None
                       else p_data.worker_to_parts[worker][0]),
                      nstart,
                      workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    ]
    wait(result)
    ranks = dask_cudf.from_delayed(result)

    # Map internal vertex ids back to the original ids when needed.
    if input_graph.renumbered:
        ranks = input_graph.unrenumber(ranks, 'vertex')
    return ranks
Example #13
0
def louvain(input_graph, max_iter=100, resolution=1.0):
    """
    Partition the input graph by optimizing modularity with the Louvain
    method, running on multiple GPUs.

    The method is described in:

    VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of
    community hierarchies in large networks, J Stat Mech P10008 (2008),
    http://arxiv.org/abs/0803.0476

    Parameters
    ----------
    input_graph : cugraph.Graph
        The graph descriptor should contain the connectivity information
        and weights, with its edge list distributed across the dask workers.

    max_iter : integer, optional (default=100)
        Maximum number of levels/iterations of the Louvain algorithm. The
        algorithm terminates after no more than this many iterations; no
        error occurs when it terminates early in this manner.

    resolution: float/double, optional (default=1.0)
        Called gamma in the modularity formula, this changes the size
        of the communities.  Higher resolutions lead to more smaller
        communities, lower resolutions lead to fewer larger communities.

    Returns
    -------
    parts : dask_cudf.DataFrame
        Distributed GPU data frame with the vertex id and the partition id
        it is assigned to.

        df['vertex']
            Contains the vertex identifiers
        df['partition']
            Contains the partition assigned to the vertices

    modularity_score : float
        The global modularity score of the partitioning, as reported by the
        first worker.

    Raises
    ------
    NotImplementedError
        If the installed CUDA version is older than 10.2.

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: Uncomment out the above (broken) example

    # MG Louvain currently requires CUDA 10.2 or higher.
    # FIXME: remove this check once RAPIDS drops support for CUDA < 10.2
    if is_cuda_version_less_than((10, 2)):
        raise NotImplementedError("Multi-GPU Louvain is not implemented for "
                                  "this version of CUDA. Ensure CUDA version "
                                  "10.2 or higher is installed.")

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")
    client = default_client()
    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    renumber_map = input_graph.renumber_map
    src_col_name = renumber_map.renumbered_src_col_name
    dst_col_name = renumber_map.renumbered_dst_col_name

    # Launch one Louvain task on every worker that holds edge partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_louvain,
                          Comms.get_session_id(),
                          parts,
                          src_col_name,
                          dst_col_name,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          input_graph.aggregate_segment_offsets,
                          max_iter,
                          resolution,
                          workers=[worker]))

    wait(futures)

    # Every future resolves to a (DataFrame, mod_score) tuple; the two items
    # are pulled apart with extra client.submit() calls.
    # FIXME: look into an alternate way (not returning a tuples, accessing
    # tuples differently, etc.) since multiple client.submit() calls may not be
    # optimal.
    df_futures = [client.submit(op.getitem, fut, 0) for fut in futures]
    mod_score_futures = [client.submit(op.getitem, fut, 1) for fut in futures]

    partitions = dask_cudf.from_delayed(df_futures)
    # Each worker should have computed the same mod_score
    mod_score = mod_score_futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        partitions = input_graph.unrenumber(partitions, "vertex")

    return (partitions, mod_score)
Example #14
0
    def renumber_and_segment(df,
                             src_col_names,
                             dst_col_names,
                             preserve_order=False,
                             store_transposed=False):
        """
        Renumber the source/destination columns of an edge list and return
        the renumbered dataframe, the populated NumberMap and the segment
        offsets.

        Parameters
        ----------
        df : cudf.DataFrame or dask_cudf.DataFrame
            Edge list to renumber. A dask_cudf.DataFrame selects the
            multi-GPU path, a cudf.DataFrame the single-GPU path.

        src_col_names : str or list of str
            Name(s) of the source column(s). A list, or a single column
            whose dtype is neither int32 nor int64, selects the 'legacy'
            renumbering path; otherwise the 'experimental' path is used.

        dst_col_names : str or list of str
            Name(s) of the destination column(s).

        preserve_order : bool, optional (default=False)
            Forwarded to add_internal_vertex_id() on the legacy path.

        store_transposed : bool, optional (default=False)
            Forwarded to the NumberMap implementation and to the renumber
            call.

        Returns
        -------
        tuple
            (renumbered_df, renumber_map, segment_offsets). On the multi-GPU
            path the segment offsets are the per-worker offsets concatenated
            into one list.

        Raises
        ------
        TypeError
            If df is neither a cudf.DataFrame nor a dask_cudf.DataFrame.
        """
        # Choose the renumbering flavour from the shape/dtype of the source
        # column specification (see docstring).
        if isinstance(src_col_names, list):
            renumber_type = 'legacy'
        elif not (df[src_col_names].dtype == np.int32
                  or df[src_col_names].dtype == np.int64):
            renumber_type = 'legacy'
        else:
            renumber_type = 'experimental'

        renumber_map = NumberMap()
        # From here on the column names are always handled as lists.
        if not isinstance(src_col_names, list):
            src_col_names = [src_col_names]
            dst_col_names = [dst_col_names]

        # Assign the new src and dst column names to be used in the renumbered
        # dataframe to return (renumbered_src_col_name and
        # renumbered_dst_col_name)
        renumber_map.set_renumbered_col_names(src_col_names, dst_col_names,
                                              df.columns)

        # dtype of the first source column; used further down to cast the
        # multi-GPU results.
        # NOTE(review): the implementations below are built with
        # renumber_map.id_type rather than this local id_type -- confirm that
        # both are intended.
        id_type = df[src_col_names[0]].dtype
        if isinstance(df, cudf.DataFrame):
            renumber_map.implementation = NumberMap.SingleGPU(
                df, src_col_names, dst_col_names, renumber_map.id_type,
                store_transposed)
        elif isinstance(df, dask_cudf.DataFrame):
            renumber_map.implementation = NumberMap.MultiGPU(
                df, src_col_names, dst_col_names, renumber_map.id_type,
                store_transposed)
        else:
            raise TypeError("df must be cudf.DataFrame or dask_cudf.DataFrame")

        if renumber_type == 'legacy':
            # Legacy path: build an indirection map first, then replace the
            # original src/dst columns (drop=True) with internal vertex ids
            # stored under the renumbered column names.
            indirection_map = renumber_map.implementation.\
                              indirection_map(df,
                                              src_col_names,
                                              dst_col_names)
            df = renumber_map.add_internal_vertex_id(
                df,
                renumber_map.renumbered_src_col_name,
                src_col_names,
                drop=True,
                preserve_order=preserve_order)
            df = renumber_map.add_internal_vertex_id(
                df,
                renumber_map.renumbered_dst_col_name,
                dst_col_names,
                drop=True,
                preserve_order=preserve_order)
        else:
            # Experimental path: only rename the single src/dst columns to
            # the renumbered column names.
            df = df.rename(
                columns={
                    src_col_names[0]: renumber_map.renumbered_src_col_name,
                    dst_col_names[0]: renumber_map.renumbered_dst_col_name
                })
        num_edges = len(df)

        if isinstance(df, dask_cudf.DataFrame):
            is_mnmg = True
        else:
            is_mnmg = False

        if is_mnmg:
            client = default_client()
            data = get_distributed_data(df)
            # One call_renumber task per worker; each future is kept together
            # with the worker it runs on so follow-up tasks can be pinned to
            # the same worker.
            result = [(client.submit(call_renumber,
                                     Comms.get_session_id(),
                                     wf[1],
                                     renumber_map.renumbered_src_col_name,
                                     renumber_map.renumbered_dst_col_name,
                                     num_edges,
                                     is_mnmg,
                                     store_transposed,
                                     workers=[wf[0]]), wf[0])
                      for idx, wf in enumerate(data.worker_to_parts.items())]
            wait(result)

            # The helpers below run on the workers and pick one element out
            # of each call_renumber result: [0] renumber map, [1] segment
            # offsets, [2] renumbered dataframe.
            def get_renumber_map(id_type, data):
                return data[0].astype(id_type)

            def get_segment_offsets(data):
                return data[1]

            def get_renumbered_df(id_type, data):
                # Cast the renumbered src/dst columns to id_type; the column
                # names are read from the enclosing renumber_map.
                data[2][renumber_map.renumbered_src_col_name] = \
                    data[2][renumber_map.renumbered_src_col_name]\
                    .astype(id_type)
                data[2][renumber_map.renumbered_dst_col_name] = \
                    data[2][renumber_map.renumbered_dst_col_name]\
                    .astype(id_type)
                return data[2]

            renumbering_map = dask_cudf.from_delayed([
                client.submit(get_renumber_map, id_type, data, workers=[wf])
                for (data, wf) in result
            ])

            # Segment offsets are gathered to the client and flattened into a
            # single list across all workers.
            list_of_segment_offsets = client.gather([
                client.submit(get_segment_offsets, data, workers=[wf])
                for (data, wf) in result
            ])
            aggregate_segment_offsets = []
            for segment_offsets in list_of_segment_offsets:
                aggregate_segment_offsets.extend(segment_offsets)

            renumbered_df = dask_cudf.from_delayed([
                client.submit(get_renumbered_df, id_type, data, workers=[wf])
                for (data, wf) in result
            ])
            if renumber_type == 'legacy':
                # Join the indirection map with the renumbering result so the
                # stored map ends up keyed by the new ids ('global_id').
                renumber_map.implementation.ddf = indirection_map.merge(
                    renumbering_map,
                    right_on='original_ids', left_on='global_id',
                    how='right').\
                    drop(columns=['global_id', 'original_ids'])\
                    .rename(columns={'new_ids': 'global_id'})
            else:
                # The original ids are stored under column '0' and the new
                # ids under 'global_id'.
                renumber_map.implementation.ddf = renumbering_map.rename(
                    columns={
                        'original_ids': '0',
                        'new_ids': 'global_id'
                    })
            renumber_map.implementation.numbered = True
            return renumbered_df, renumber_map, aggregate_segment_offsets

        else:
            # Single-GPU path: renumber directly in this process.
            renumbering_map, segment_offsets, renumbered_df = \
                c_renumber.renumber(df,
                                    renumber_map.renumbered_src_col_name,
                                    renumber_map.renumbered_dst_col_name,
                                    num_edges,
                                    0,
                                    Comms.get_default_handle(),
                                    is_mnmg,
                                    store_transposed)
            if renumber_type == 'legacy':
                # Same join as the multi-GPU case, but the id column is named
                # 'id' here.
                renumber_map.implementation.df = indirection_map.\
                    merge(renumbering_map,
                          right_on='original_ids', left_on='id').\
                    drop(columns=['id', 'original_ids'])\
                    .rename(columns={'new_ids': 'id'}, copy=False)
            else:
                renumber_map.implementation.df = renumbering_map.rename(
                    columns={
                        'original_ids': '0',
                        'new_ids': 'id'
                    }, copy=False)

            renumber_map.implementation.numbered = True
            return renumbered_df, renumber_map, segment_offsets
Example #15
0
def louvain(input_graph, max_iter=100, resolution=1.0):
    """
    Partition the input graph by optimizing modularity with the Louvain
    method, running on multiple GPUs.

    Parameters
    ----------
    input_graph : cugraph.Graph
        Graph whose edge list is distributed across the dask workers.

    max_iter : integer, optional (default=100)
        Forwarded to the per-worker Louvain call.

    resolution : float, optional (default=1.0)
        Forwarded to the per-worker Louvain call.

    Returns
    -------
    (parts, modularity_score) : tuple
        parts is a dask_cudf.DataFrame assembled from the per-worker
        results; modularity_score is the score reported by the first worker.

    Raises
    ------
    NotImplementedError
        If the installed CUDA version is older than 10.2.

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # chunksize = dcg.get_chunksize(input_data_path)
    >>> # ddf = dask_cudf.read_csv('datasets/karate.csv',
    >>> #                          chunksize=chunksize,
    >>> #                          delimiter=' ',
    >>> #                          names=['src', 'dst', 'value'],
    >>> #                          dtype=['int32', 'int32', 'float32'])
    >>> # dg = cugraph.Graph()
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: finish docstring: describe parameters, etc.

    # MG Louvain currently requires CUDA 10.2 or higher.
    # FIXME: remove this check once RAPIDS drops support for CUDA < 10.2
    if is_cuda_version_less_than((10, 2)):
        raise NotImplementedError("Multi-GPU Louvain is not implemented for "
                                  "this version of CUDA. Ensure CUDA version "
                                  "10.2 or higher is installed.")

    # FIXME: dask methods to populate graphs from edgelists are only present on
    # DiGraph classes. Disable the Graph check for now and assume inputs are
    # symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")
    client = default_client()
    # Calling renumbering results in data that is sorted by degree
    input_graph.compute_renumber_edge_list(transposed=False)
    sorted_by_degree = True

    edge_ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edge_ddf)
    data = get_distributed_data(edge_ddf)

    # Launch one Louvain task on every worker that holds edge partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_louvain,
                          Comms.get_session_id(),
                          parts,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          sorted_by_degree,
                          max_iter,
                          resolution,
                          workers=[worker]))

    wait(futures)

    # Every future resolves to a (DataFrame, mod_score) tuple; the two items
    # are pulled apart with extra client.submit() calls.
    # FIXME: look into an alternate way (not returning a tuples, accessing
    # tuples differently, etc.) since multiple client.submit() calls may not be
    # optimal.
    df_futures = [client.submit(op.getitem, fut, 0) for fut in futures]
    mod_score_futures = [client.submit(op.getitem, fut, 1) for fut in futures]

    partitions = dask_cudf.from_delayed(df_futures)
    # Each worker should have computed the same mod_score
    mod_score = mod_score_futures[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        partitions = input_graph.unrenumber(partitions, "vertex")

    return (partitions, mod_score)
Example #16
0
def bfs(graph,
        start,
        return_distances=False):
    """
    Run a multi-GPU breadth first traversal and report, per vertex, its
    predecessor and (optionally) its distance from the start vertex.

    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Starting vertex for the breadth-first search. For a renumbered graph
        it is translated to its internal vertex id before the search.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex (Only if return_distances is True)

        df['predecessor'] gives the vertex it was
        reached from in the traversal; for a renumbered graph, missing
        predecessors are reported as -1

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
    ...                          delimiter=' ',
    ...                          names=['src', 'dst', 'value'],
    ...                          dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    # shuffle() hands back five values; the partition row/column sizes are
    # not needed here.
    shuffled = shuffle(graph, transposed=False)
    (ddf,
     num_verts,
     _partition_row_size,
     _partition_col_size,
     vertex_partition_offsets) = shuffled
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        # Translate the caller's vertex id into the internal id space.
        external_start = cudf.Series([start], dtype='int32')
        internal_start = graph.lookup_internal_vertex_id(
            external_start).compute()
        start = internal_start.iloc[0]

    # One BFS task per worker holding edge partitions.
    result = []
    for worker, parts in data.worker_to_parts.items():
        result.append(
            client.submit(call_bfs,
                          Comms.get_session_id(),
                          parts,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          start,
                          return_distances,
                          workers=[worker]))
    wait(result)
    traversal = dask_cudf.from_delayed(result)

    if graph.renumbered:
        # Map internal ids back to the original vertex ids.
        traversal = graph.unrenumber(traversal, 'vertex')
        traversal = graph.unrenumber(traversal, 'predecessor')
        traversal["predecessor"] = traversal["predecessor"].fillna(-1)
    return traversal
Example #17
0
def katz_centrality(input_graph,
                    alpha=None,
                    beta=None,
                    max_iter=100,
                    tol=1.0e-5,
                    nstart=None,
                    normalized=True):
    """
    Compute the Katz centrality for the nodes of the graph G.

    Parameters
    ----------
    input_graph : cuGraph.Graph
        cuGraph graph descriptor with connectivity information. The graph can
        contain either directed (DiGraph) or undirected edges (Graph).

    alpha : float, optional (default=None)
        Attenuation factor. If alpha is not specified then
        it is internally calculated as 1/(degree_max) where degree_max is the
        maximum out degree.

        NOTE
            The maximum acceptable value of alpha for convergence
            alpha_max = 1/(lambda_max) where lambda_max is the largest
            eigenvalue of the graph.
            Since lambda_max is always less than or equal to degree_max for a
            graph, alpha_max will always be greater than or equal to
            (1/degree_max). Therefore, setting alpha to (1/degree_max) will
            guarantee that it will never exceed alpha_max thus in turn
            fulfilling the requirement for convergence.

    beta : None
        A weight scalar - currently Not Supported

    max_iter : int, optional (default=100)
        The maximum number of iterations before an answer is returned. This can
        be used to limit the execution time and do an early exit before the
        solver reaches the convergence tolerance.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 100.

    tol : float, optional (default=1.0e-5)
        Set the tolerance of the approximation, this parameter should be a
        small magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0e-6.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 1e-2 and 1e-6 are
        acceptable.

    nstart : dask_cudf.Dataframe, optional (default=None)
        GPU Dataframe containing the initial guess for katz centrality

        nstart['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        nstart['values'] : dask_cudf.Series
            Contains the katz centrality values of vertices

        NOTE
            This implementation currently ignores the value: ``None`` is
            always forwarded to the workers. A ``UserWarning`` is emitted
            when a non-None value is supplied.

    normalized : bool, optional (default=True)
        If True normalize the resulting katz centrality values

    Returns
    -------
    katz_centrality : dask_cudf.DataFrame
        GPU data frame containing two dask_cudf.Series of size V: the
        vertex identifiers and the corresponding katz centrality values.

        ddf['vertex'] : dask_cudf.Series
            Contains the vertex identifiers
        ddf['katz_centrality'] : dask_cudf.Series
            Contains the katz centrality of vertices

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets_path / "karate.csv")
    >>> # ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize)
    >>> # dg = cugraph.Graph(directed=True)
    >>> # dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
    >>> #                            edge_attr='value')
    >>> # pr = dcg.katz_centrality(dg)

    """
    # FIXME: Uncomment out the above (broken) example

    # The initial guess is never forwarded to the workers. Keep that
    # behaviour, but tell the caller instead of dropping the argument
    # silently.
    if nstart is not None:
        import warnings
        warnings.warn(
            "katz_centrality: 'nstart' is currently ignored; "
            "None is passed to the workers instead",
            UserWarning,
            stacklevel=2)
    nstart = None

    client = default_client()

    input_graph.compute_renumber_edge_list(transposed=True)
    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    # The last partition offset is used as the vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    # One task per worker, pinned to the worker that holds the partitions.
    result = [
        client.submit(call_katz_centrality,
                      Comms.get_session_id(),
                      parts,
                      src_col_name,
                      dst_col_name,
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      input_graph.aggregate_segment_offsets,
                      alpha,
                      beta,
                      max_iter,
                      tol,
                      nstart,
                      normalized,
                      workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    # Map internal vertex ids back to the caller's ids when the graph was
    # renumbered.
    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
Example #18
0
def sssp(graph, source):
    """
    Compute single-source shortest paths from ``source`` to every vertex of
    the graph, returning distances and predecessors.

    The distance column holds the distance from the source to each vertex and
    the predecessor column holds each vertex's predecessor on its shortest
    path. Unreachable vertices have a distance of infinity, denoted by the
    maximum value of the data type, and a predecessor of -1. The source
    vertex's predecessor is also -1.
    The input graph must contain its edge list as a dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe.
        Undirected Graph not currently supported.
    source : Integer
        Specify source vertex

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex

        df['predecessor'] gives the vertex id it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #    see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.sssp(dg, 0)
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    edgelist_ddf = graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(graph)
    # The last partition offset is used as the vertex count.
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(edgelist_ddf)
    data = get_distributed_data(edgelist_ddf)

    # On a renumbered graph the caller's vertex id is translated to the
    # internal id before it is handed to the workers.
    internal_source = source
    if graph.renumbered:
        lookup = graph.lookup_internal_vertex_id(cudf.Series([source]))
        internal_source = lookup.compute().iloc[0]

    # One task per worker, pinned to the worker that holds the partitions.
    futures = []
    for worker, parts in data.worker_to_parts.items():
        futures.append(
            client.submit(call_sssp,
                          Comms.get_session_id(),
                          parts,
                          num_verts,
                          num_edges,
                          vertex_partition_offsets,
                          graph.aggregate_segment_offsets,
                          internal_source,
                          workers=[worker]))
    wait(futures)
    result_ddf = dask_cudf.from_delayed(futures)

    if not graph.renumbered:
        return result_ddf

    # Translate both id columns back to the caller's ids; predecessors left
    # null by that translation are reported as -1.
    result_ddf = graph.unrenumber(result_ddf, 'vertex')
    result_ddf = graph.unrenumber(result_ddf, 'predecessor')
    result_ddf["predecessor"] = result_ddf["predecessor"].fillna(-1)
    return result_ddf
Example #19
0
def bfs(graph, start, return_distances=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.

    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : cudf.DataFrame
        df['vertex'][i] gives the vertex id of the i'th vertex

        df['distance'][i] gives the path distance for the i'th vertex from the
        starting vertex (Only if return_distances is True)

        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal. On a renumbered graph, predecessors
        that are null after translating back to the original ids are
        reported as -1.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    # Reuse the graph's cached local data when it is already organised by
    # source; otherwise compute it.
    cached = graph.local_data
    if cached is not None and cached['by'] == 'src':
        data = cached['data']
    else:
        data = get_local_data(graph, by='src')

    # On a renumbered graph the caller's vertex id is translated to the
    # internal id before it is handed to the workers.
    if graph.renumbered:
        start = graph.lookup_internal_vertex_id(cudf.Series([start])).compute()
        start = start.iloc[0]

    # One task per worker, keyed by that worker's rank.
    result = {
        data.worker_info[worker]["rank"]: client.submit(
            call_bfs,
            Comms.get_session_id(),
            parts,
            data.local_data,
            start,
            return_distances,
            workers=[worker])
        for worker, parts in data.worker_to_parts.items()
    }
    wait(list(result.values()))

    # Only the frame produced by the rank-0 worker is returned.
    df = result[0].result()

    if graph.renumbered:
        df = graph.unrenumber(df, 'vertex').compute()
        df = graph.unrenumber(df, 'predecessor').compute()
        # Assign the filled column back: an in-place fillna on a column
        # selection is not guaranteed to update the parent frame.
        df["predecessor"] = df["predecessor"].fillna(-1)

    return df