Ejemplo n.º 1
0
    def compute_local_data(self, by, load_balance=True):
        """
        Build and cache the per-worker local data of a distributed graph.

        The work is delegated to ``get_local_data``; its result is stored in
        ``self.local_data`` together with the column name it was keyed on.
        For a graph stored as a dask-cudf dataframe this covers computing
        the local edges, vertices and offsets, initializing the
        communicator, global sorting and load balancing.

        Parameters
        ----------
        by : str
            Name of the column to sort and partition by. Use the source
            column name to generate CSR format and the destination column
            name to generate CSC format.
        load_balance : bool
            When True, load balancing is performed after the global sort of
            the dask-cudf DataFrame so that the data is spread uniformly
            over the GPUs and none of them is over-loaded.

        Raises
        ------
        Exception
            If the graph is not distributed (``self.distributed`` is falsy).
        """
        # Guard clause: only distributed graphs carry local data.
        if not self.distributed:
            raise Exception('Graph should be a distributed graph')

        local = get_local_data(self, by, load_balance)
        # Remember the sort column next to the data so that callers can
        # tell whether the cached result fits their needs.
        self.local_data = {'data': local, 'by': by}
Ejemplo n.º 2
0
def louvain(input_graph, max_iter=100, resolution=1.0, load_balance=True):
    """
    Compute the modularity optimizing partition of the input graph using the
    Louvain method on multiple GPUs.

    Parameters
    ----------
    input_graph : distributed cugraph graph
        Graph to partition. If ``input_graph.local_data`` is already
        populated and was keyed on 'src' it is reused; otherwise the local
        data is computed here through ``get_local_data``.
    max_iter : optional, default=100
        Forwarded unchanged to the Louvain call running on each worker.
    resolution : optional, default=1.0
        Forwarded unchanged to the Louvain call running on each worker.
    load_balance : bool, optional, default=True
        Passed to ``get_local_data`` when the local data has to be computed.
        It has no effect when cached local data is reused.

    Returns
    -------
    parts, modularity_score
        The pair produced by the worker with rank 0. When the graph is
        renumbered, ``parts`` is first passed through
        ``input_graph.unrenumber(parts, "vertex")``.

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv('datasets/karate.csv', chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.Graph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> parts, modularity_score = dcg.louvain(dg)
    """
    # FIXME: the docstring still lacks precise types for the parameters and
    # the returned values.

    # FIXME: Graph would have to be imported here to prevent a circular
    # import: cugraph->louvain wrapper->cugraph/structure->cugraph/dask->
    # dask/louvain->cugraph/structure
    # from cugraph.structure.graph import Graph

    # FIXME: dask methods to populate graphs from edgelists are only present
    # on DiGraph classes, so the Graph check below is disabled for now and
    # inputs are assumed to be symmetric DiGraphs.
    # if type(graph) is not Graph:
    #     raise Exception("input graph must be undirected")

    client = default_client()

    # Reuse previously computed local data only if it was keyed on 'src'.
    cached = input_graph.local_data
    if cached is not None and cached['by'] == 'src':
        data = cached['data']
    else:
        data = get_local_data(input_graph, by='src', load_balance=load_balance)

    # Submit one task per worker, pinned to that worker, and index the
    # resulting futures by the worker's rank.
    futures_by_rank = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        futures_by_rank[rank] = client.submit(call_louvain,
                                              Comms.get_session_id(),
                                              worker_parts,
                                              data.local_data,
                                              max_iter,
                                              resolution,
                                              workers=[worker])
    wait(futures_by_rank)

    # Only the result held by rank 0 is returned to the caller.
    (parts, modularity_score) = futures_by_rank[0].result()

    if input_graph.renumbered:
        # MG renumbering is lazy, but it's safe to assume it's been called at
        # this point if renumbered=True
        parts = input_graph.unrenumber(parts, "vertex")

    return parts, modularity_score
Ejemplo n.º 3
0
def pagerank(input_graph,
             alpha=0.85,
             personalization=None,
             max_iter=100,
             tol=1.0e-5,
             nstart=None,
             load_balance=True):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    input_graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
        If ``input_graph.local_data`` is already populated and was keyed on
        'dst' it is reused; otherwise the local data is computed here.
    alpha : float, optional, default=0.85
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to "teleport" to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    personalization : cudf.Dataframe, optional, default=None
        GPU Dataframe containing the personalization information. Both
        columns are checked for null values. When the graph is renumbered
        the 'vertex' column is translated to internal vertex ids before
        being sent to the workers.

        personalization['vertex'] : cudf.Series
            Subset of vertices of graph for personalization
        personalization['values'] : cudf.Series
            Personalization values for vertices
    max_iter : int, optional, default=100
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.
    tol : float, optional, default=1.0e-5
        Set the tolerance the approximation, this parameter should be a small
        magnitude value.
        The lower the tolerance the better the approximation. If this value is
        0.0f, cuGraph will use the default value which is 1.0E-5.
        Setting too small a tolerance can lead to non-convergence due to
        numerical roundoff. Usually values between 0.01 and 0.00001 are
        acceptable.
    nstart : not supported
        Initial guess for pagerank. Any value passed here is ignored: the
        function always forwards ``None`` to the workers.
    load_balance : bool, optional, default=True
        Set as True to perform load_balancing after global sorting of
        dask-cudf DataFrame. This ensures that the data is uniformly
        distributed among multiple GPUs to avoid over-loading.
        Only used when the local data has to be computed by this call.

    Returns
    -------
    PageRank : cudf.DataFrame
        GPU data frame containing two cudf.Series of size V: the vertex
        identifiers and the corresponding PageRank values.

        df['vertex'] : cudf.Series
            Contains the vertex identifiers
        df['pagerank'] : cudf.Series
            Contains the PageRank score

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst',
                                   edge_attr='value')
    >>> pr = dcg.pagerank(dg)
    >>> Comms.destroy()
    """
    from cugraph.structure.graph import null_check

    # An initial guess is not supported: whatever the caller passed is
    # discarded so that every worker receives None.
    nstart = None

    client = default_client()

    # Reuse previously computed local data only if it was keyed on 'dst'.
    if (input_graph.local_data is not None
            and input_graph.local_data['by'] == 'dst'):
        data = input_graph.local_data['data']
    else:
        data = get_local_data(input_graph, by='dst', load_balance=load_balance)

    if personalization is not None:
        null_check(personalization["vertex"])
        null_check(personalization["values"])
        # Same truthiness test as the one guarding unrenumber() below, so
        # that ids are translated in both directions or in neither.
        if input_graph.renumbered:
            personalization = input_graph.add_internal_vertex_id(
                personalization, "vertex", "vertex").compute()

    # One task per worker, pinned to that worker, keyed by the worker's rank.
    result = {
        data.worker_info[worker]["rank"]: client.submit(
            call_pagerank,
            Comms.get_session_id(),
            worker_parts,
            data.local_data,
            alpha,
            max_iter,
            tol,
            personalization,
            nstart,
            workers=[worker])
        for worker, worker_parts in data.worker_to_parts.items()
    }
    wait(result)

    # Only the result held by rank 0 is returned to the caller.
    ranks = result[0].result()

    if input_graph.renumbered:
        return input_graph.unrenumber(ranks, 'vertex').compute()

    return ranks
Ejemplo n.º 4
0
def bfs(graph, start, return_distances=False):
    """
    Run a multi-GPU breadth first traversal and report, for every vertex,
    its predecessor and optionally its distance from the start vertex.
    The input graph must contain edge list as dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe (edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
        If ``graph.local_data`` is already populated and was keyed on 'src'
        it is reused; otherwise the local data is computed here.
    start : Integer
        Starting vertex of the breadth-first search; the traversal iterates
        over the edges of the component reachable from this vertex. When the
        graph is renumbered the value is translated to its internal vertex
        id before the search is launched.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : cudf.DataFrame
        df['vertex'][i] gives the vertex id of the i'th vertex

        df['distance'][i] gives the path distance for the i'th vertex from the
        starting vertex (Only if return_distances is True)

        df['predecessor'][i] gives for the i'th vertex the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize()
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf)
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    # Compute the local data unless a cached copy keyed on 'src' exists.
    cached = graph.local_data
    if cached is None or cached['by'] != 'src':
        data = get_local_data(graph, by='src')
    else:
        data = cached['data']

    if graph.renumbered:
        # The workers operate on internal vertex ids.
        internal_ids = graph.lookup_internal_vertex_id(
            cudf.Series([start])).compute()
        start = internal_ids.iloc[0]

    # Submit one task per worker, pinned to that worker, and index the
    # resulting futures by the worker's rank.
    futures_by_rank = {}
    for worker, worker_parts in data.worker_to_parts.items():
        rank = data.worker_info[worker]["rank"]
        futures_by_rank[rank] = client.submit(call_bfs,
                                              Comms.get_session_id(),
                                              worker_parts,
                                              data.local_data,
                                              start,
                                              return_distances,
                                              workers=[worker])
    wait(futures_by_rank)

    # Only the result held by rank 0 is returned to the caller.
    df = futures_by_rank[0].result()

    if not graph.renumbered:
        return df

    # Map both id columns back to the caller's vertex ids; predecessors left
    # null after that mapping are replaced by -1.
    df = graph.unrenumber(df, 'vertex').compute()
    df = graph.unrenumber(df, 'predecessor').compute()
    df["predecessor"].fillna(-1, inplace=True)

    return df