Example #1
def renumber(input_graph):

    client = default_client()

    ddf = input_graph.edgelist.edgelist_df

    num_edges = len(ddf)

    if isinstance(ddf, dask_cudf.DataFrame):
        is_mnmg = True
    else:
        is_mnmg = False

    num_verts = input_graph.number_of_vertices()

    if is_mnmg:
        data = get_distributed_data(ddf)
        result = [
            client.submit(call_renumber,
                          Comms.get_session_id(),
                          wf[1],
                          num_verts,
                          num_edges,
                          is_mnmg,
                          workers=[wf[0]])
            for idx, wf in enumerate(data.worker_to_parts.items())
        ]
        wait(result)
        ddf = dask_cudf.from_delayed(result)
    else:
        call_renumber(Comms.get_session_id(), ddf, num_verts, num_edges,
                      is_mnmg)
    return ddf
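The pattern above (submit one task per worker, wait on the futures, then hand them to dask_cudf.from_delayed) recurs throughout this collection. A minimal, self-contained sketch of that pattern, assuming a running dask.distributed cluster with GPU workers; Client() below is only a stand-in for however the cluster is actually created:

import cudf
import dask_cudf
from dask.distributed import Client, wait


def make_part(i):
    # One small cudf partition per worker; the data is illustrative.
    return cudf.DataFrame({"part": [i] * 4})


client = Client()  # stand-in; a real setup would attach to a GPU cluster
workers = list(client.has_what().keys())
futures = [client.submit(make_part, i, workers=[w])
           for i, w in enumerate(workers)]
wait(futures)
# from_delayed also accepts futures, which is what the examples here rely on.
ddf = dask_cudf.from_delayed(futures)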
Example #2
    def __delayed_call_noports(self, inputs):

        def get_pout(df_out):
            '''Used for delayed unpacking.'''
            if isinstance(df_out, cudf.DataFrame):
                # Needed for the same reason as __make_copy: to prevent
                # columns from being added to the input data frames. In Python
                # objects are passed by reference and dataframes are mutable.
                # Handles the case where dask_cudf.DataFrames are the source
                # frames, which appear as cudf.DataFrame inside a dask-delayed
                # function.
                return df_out.copy(deep=False)

            return df_out

        # handle the dask dataframe automatically
        # use the to_delayed interface
        # TODO, currently only handles first input is dask_cudf df
        i_df = inputs[0]
        rest = inputs[1:]
        if isinstance(i_df, dask_cudf.DataFrame):
            output_df_dly_list = []
            for input_dly in i_df.to_delayed():
                inputs_ = [input_dly] + rest
                output_df_dly = dask.delayed(self.decorate_process())(inputs_)
                output_df_dly_per = output_df_dly.persist()
                df_out = dask.delayed(get_pout)(output_df_dly_per)
                output_df_dly_list.append(df_out.persist())

            output_df = dask_cudf.from_delayed(output_df_dly_list)

        else:
            output_df = self.decorate_process()(inputs)

        return output_df
Example #3
    def to_ddf(self, columns=None, shuffle=False, seed=None):
        """ Convert `Dataset` object to `dask_cudf.DataFrame`

        Parameters
        -----------
        columns : str or list(str); default None
            Columns to include in output `DataFrame`. If not specified,
            the output will contain all known columns in the Dataset.
        shuffle : bool; default False
            Whether to shuffle the order of partitions in the output
            `dask_cudf.DataFrame`.  Note that this does not shuffle
            the rows within each partition. This is because the data
            is not actually loaded into memory for this operation.
        seed : int; Optional
            The random seed to use if `shuffle=True`.  If nothing
            is specified, the current system time will be used by the
            `random` std library.
        """
        # Use DatasetEngine to create ddf
        ddf = self.engine.to_ddf(columns=columns)

        # Shuffle the partitions of ddf (optional)
        if shuffle and ddf.npartitions > 1:
            parts = ddf.to_delayed()
            random.seed(seed)
            random.shuffle(parts)
            ddf = dask_cudf.from_delayed(parts)

        # Special dtype conversion (optional)
        if self.dtypes:
            _meta = _set_dtypes(ddf._meta, self.dtypes)
            return ddf.map_partitions(_set_dtypes, self.dtypes, meta=_meta)
        return ddf
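A minimal, self-contained sketch of the partition-order shuffle performed in to_ddf above (column names and sizes are illustrative). Note that only the order of partitions changes, never the rows inside a partition:

import random

import cudf
import dask_cudf

gdf = cudf.DataFrame({"x": list(range(8))})
ddf = dask_cudf.from_cudf(gdf, npartitions=4)

parts = ddf.to_delayed()   # one delayed object per partition
random.seed(42)
random.shuffle(parts)      # reorder partitions only
ddf_shuffled = dask_cudf.from_delayed(parts)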
Example #4
def test_dataframe_from_delayed():
    delays = [load_data(10 * i, i) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.DataFrame)

    expected = gd.concat([d.compute() for d in delays])
    assert_frame_equal(res.to_pandas(), expected.to_pandas())
Example #5
def test_series_from_delayed():
    delays = [get_combined_column(load_data(10 * i, i)) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.Series)

    expected = gd.concat([d.compute() for d in delays])
    np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas())
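The two tests above call load_data and get_combined_column, which are not shown in the snippets. A hedged sketch of what they might look like; the signatures are inferred from the call sites and the bodies are assumptions:

import numpy as np
import cudf as gd
from dask import delayed


@delayed
def load_data(nelem, ident):
    # Build one cudf partition tagged with an identifier column.
    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["ident"] = np.asarray([ident] * nelem)
    return df


@delayed
def get_combined_column(df):
    # Combine two columns into a single cudf Series.
    return df.x * df.ident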
Example #6
    def apply(self, function):
        """Transform each group using a python function.
        """
        @delayed
        def apply_to_group(grp):
            return grp.apply(function)

        grouped = [apply_to_group(g) for g in self._grouped]
        return from_delayed(grouped).reset_index()
Example #7
    def kneighbors(self, X, k=None):

        """
        Queries the multi-gpu knn model given a dask-cudf as the query

        1. Create 2 new Dask dataframes to hold output
        (1 chunk each per chunk of X), co-locate pieces w/ X.
        2. Get IPC handles for each dataframe. Use IPCThread to hold onto
        them while calling query.

        :param X:
            A dask-cudf for calculating the kneighbors
        :param k:
            The number of nearest neighbors to query for each input vector.
        :return:
            dists and indices of the k-nearest neighbors to the input vectors
        """

        if k is None:
            k = self.n_neighbors

        client = default_client()
        dfs = client.sync(self._kneighbors, X, k).value

        dfs = [d for d in dfs if d.type != type(None)]  # NOQA

        local_divs = [client.submit(get_idx, f).result() for f in dfs]
        indices = [client.submit(get_I, f) for f in dfs]
        dists = [client.submit(get_D, f) for f in dfs]

        dfs_divs = list(zip(local_divs, indices, dists))

        # Sort delayed dfs by their starting index
        dfs_divs.sort(key=lambda x: x[0][0])

        I_meta = client.submit(get_I_meta, dfs[0]).result()
        D_meta = client.submit(get_D_meta, dfs[0]).result()

        I_ddf = dask_cudf.from_delayed(indices, meta=I_meta)
        D_ddf = dask_cudf.from_delayed(dists, meta=D_meta)

        return D_ddf, I_ddf
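kneighbors above depends on small helpers (get_idx, get_I, get_D, get_I_meta, get_D_meta) that are not shown. A hedged sketch, under the assumption that each per-worker result is a tuple of (index range, indices frame, distances frame); the real structure is not visible in the snippet:

def get_idx(f):
    # Row-index range covered by this worker's partition (used for sorting).
    return f[0]


def get_I(f):
    # cudf DataFrame of neighbor indices for this partition.
    return f[1]


def get_D(f):
    # cudf DataFrame of neighbor distances for this partition.
    return f[2]


def get_I_meta(f):
    # Zero-row frame with the indices schema, used as meta for from_delayed.
    return f[1].iloc[:0]


def get_D_meta(f):
    # Zero-row frame with the distances schema.
    return f[2].iloc[:0]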
Example #8
    def apply_grouped(self, *args, **kwargs):
        """Transform each group using a GPU function.

        Calls ``cudf.Groupby.apply_grouped`` concurrently
        """
        @delayed
        def apply_to_group(grp):
            return grp.apply_grouped(*args, **kwargs)

        grouped = [apply_to_group(g) for g in self._grouped]
        return from_delayed(grouped).reset_index()
Example #9
    def build_dask_df(nrows, ncols):
        workers = client.has_what().keys()

        # Create dfs on each worker (gpu)
        dfs = [
            client.submit(create_df, n, nrows, ncols, workers=[worker])
            for worker, n in list(zip(workers, list(range(len(workers)))))
        ]
        # Wait for completion
        wait(dfs)
        meta = client.submit(get_meta, dfs[0]).result()
        return dask_cudf.from_delayed(dfs, meta=meta)
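build_dask_df above assumes create_df and get_meta helpers that the snippet does not show. A hedged sketch of plausible implementations; only the names come from the snippet, the bodies are assumptions:

import cudf
import numpy as np


def create_df(n, nrows, ncols):
    # Build one cudf partition per worker; `n` seeds the generator so the
    # partitions differ.
    rng = np.random.default_rng(n)
    return cudf.DataFrame({f"c{i}": rng.random(nrows) for i in range(ncols)})


def get_meta(df):
    # Zero-row slice with the same schema, passed as meta to from_delayed.
    return df.iloc[:0]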
Example #10
def load_balance_func(ddf_, by, client=None):
    # Load balances the sorted dask_cudf DataFrame.
    # Input is a dask_cudf dataframe ddf_ which is sorted by
    # the column name passed as the 'by' argument.

    client = default_client() if client is None else client

    parts = persist_distributed_data(ddf_, client)
    wait(parts)

    who_has = client.who_has(parts)
    key_to_part = [(str(part.key), part) for part in parts]
    gpu_futures = [(first(who_has[key]), part.key[1], part)
                    for key, part in key_to_part]
    worker_to_data = create_dict(gpu_futures)

    # Calculate cumulative sum in each dataframe partition
    cumsum_parts = [
        client.submit(get_cumsum, wf[1][0][0], by, workers=[wf[0]]).result()
        for idx, wf in enumerate(worker_to_data.items())
    ]

    num_rows = []
    for cumsum in cumsum_parts:
        num_rows.append(cumsum.iloc[-1])

    # Calculate current partition divisions
    divisions = [sum(num_rows[0:x:1]) for x in range(0, len(num_rows) + 1)]
    divisions[-1] = divisions[-1] - 1
    divisions = tuple(divisions)

    # Set global index from 0 to len(dask_cudf_dataframe) so that global
    # indexing of divisions can be used for repartitioning.
    futures = [
        client.submit(set_global_index,
                      wf[1][0][0],
                      divisions[wf[1][0][1]],
                      workers=[wf[0]])
        for idx, wf in enumerate(worker_to_data.items())
    ]
    wait(futures)

    ddf = dask_cudf.from_delayed(futures)
    ddf.divisions = divisions

    # Repartition the data
    ddf = repartition(ddf, cumsum_parts)

    return ddf
Example #11
    def __call__(self, inputs_data):
        if self.load:
            if isinstance(self.load, bool):
                output_df = self.load_cache()
            else:
                output_df = self.load
        else:
            if self._using_ports():
                # nodes with ports take dictionary as inputs
                inputs = {iport: self.__make_copy(data_input)
                          for iport, data_input in inputs_data.items()}
            else:
                # nodes without ports take list as inputs
                inputs = [self.__make_copy(inputs_data[ient['to_port']])
                          for ient in self.inputs]
            if not self.delayed_process:
                output_df = self.decorate_process()(inputs)
            else:
                if self._using_ports():
                    use_delayed = self.__check_dly_processing_prereq(inputs)
                    if use_delayed:
                        output_df = self.__delayed_call(inputs)
                    else:
                        output_df = self.decorate_process()(inputs)
                else:
                    # handle the dask dataframe automatically
                    # use the to_delayed interface
                    # TODO, currently only handles first input is dask_cudf df
                    i_df = inputs[0]
                    rest = inputs[1:]
                    if isinstance(i_df, dask_cudf.DataFrame):
                        d_fun = dask.delayed(self.decorate_process())
                        output_df = dask_cudf.from_delayed([
                            d_fun([item] + rest)
                            for item in i_df.to_delayed()])
                    else:
                        output_df = self.decorate_process()(inputs)

        if self.uid != OUTPUT_ID and output_df is None:
            raise Exception("None output")
        else:
            self.__valide(output_df, self.output_columns)

        if self.save:
            self.save_cache(output_df)

        return output_df
Example #12
def concat_within_workers(client, ddf):
    """
    Concatenates all partitions within each worker without data transfers
    """
    df_delayed = get_delayed_dict(ddf)

    result = []
    for worker, tasks in client.has_what().items():
        worker_task_list = []

        for task in list(tasks):
            if task in df_delayed:
                worker_task_list.append(df_delayed[task])
        concat_tasks = delayed(concat_dfs)(worker_task_list)
        result.append(client.persist(collections=concat_tasks, workers=worker))

    return dask_cudf.from_delayed(result)
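concat_within_workers above relies on get_delayed_dict and concat_dfs, which the snippet does not define. A hedged sketch; the names come from the snippet and the bodies are assumptions:

import cudf


def concat_dfs(df_list):
    # Concatenate the cudf partitions that ended up on one worker.
    return cudf.concat(df_list)


def get_delayed_dict(ddf):
    # Map each partition's stringified task key to its delayed object so the
    # partitions can be matched against the keys reported by client.has_what().
    return {str(d.key): d for d in ddf.to_delayed()}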
Example #13
def _mg_rmat(scale,
             num_edges,
             a,
             b,
             c,
             seed,
             clip_and_flip,
             scramble_vertex_ids,
             create_using=cugraph.DiGraph):
    """
    Calls RMAT on multiple GPUs and uses the resulting Dask cuDF DataFrame to
    initialize and return a cugraph Graph object specified with create_using.
    If create_using is None, returns the Dask DataFrame edgelist as-is.

    seed is used as the initial seed for the first worker used (worker 0), then
    each subsequent worker will receive seed+<worker num> as the seed value.
    """
    client = default_client()
    worker_list = list(client.scheduler_info()['workers'].keys())
    num_workers = len(worker_list)
    num_edges_list = _calc_num_edges_per_worker(num_workers, num_edges)
    futures = []
    for (i, worker_num_edges) in enumerate(num_edges_list):
        unique_worker_seed = seed + i
        future = client.submit(_call_rmat,
                               Comms.get_session_id(),
                               scale,
                               worker_num_edges,
                               a,
                               b,
                               c,
                               unique_worker_seed,
                               clip_and_flip,
                               scramble_vertex_ids,
                               workers=worker_list[i])
        futures.append(future)

    ddf = dask_cudf.from_delayed(futures)

    if create_using is None:
        return ddf

    G = create_using()
    G.from_dask_cudf_edgelist(ddf, source="src", destination="dst")

    return G
Example #14
def test_mixing_series_frame_error():
    nelem = 20

    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["y"] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_cudf(df, npartitions=5)

    delay_frame = ddf.to_delayed()
    delay_series = ddf.x.to_delayed()
    combined = dgd.from_delayed(delay_frame + delay_series)

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
Example #15
    def process(self, inputs):
        df = inputs[self.INPUT_PORT_NAME]
        # df = df.drop('datetime', axis=1)
        output = {}
        if self.outport_connected(self.OUTPUT_PORT_NAME):
            offset = self.conf.get('offset', 0)
            out_df = self._process(df, offset)
            output.update({self.OUTPUT_PORT_NAME: out_df})
        if self.outport_connected(self.OUTPUT_DASK_PORT):
            partitions = self.conf['partitions']
            out_dfs = [
                dask.delayed(self._process)(df, i) for i in range(partitions)
            ]
            meta = self.meta_setup().outports[self.OUTPUT_DASK_PORT]
            meta['date'] = 'datetime64[ns]'
            dask_df = dask_cudf.from_delayed(out_dfs, meta=meta)
            output.update({self.OUTPUT_DASK_PORT: dask_df})
        return output
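Several examples in this collection pass an explicit meta to dask_cudf.from_delayed, built from small frames whose only job is to carry the right column names and dtypes. A minimal, self-contained sketch of that pattern with illustrative columns:

import cudf
import dask
import dask_cudf
import numpy as np


@dask.delayed
def make_part(i):
    return cudf.DataFrame({"id": np.arange(3, dtype="int64") + 3 * i,
                           "val": np.full(3, float(i))})


# Only the schema of `meta` matters: the column names and dtypes.
meta = cudf.DataFrame({"id": np.ones(1, dtype="int64"),
                       "val": np.ones(1, dtype="float64")})

ddf = dask_cudf.from_delayed([make_part(i) for i in range(4)], meta=meta)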
Example #16
def test_frame_extra_columns_error():
    nelem = 20

    df = gd.DataFrame()
    df["x"] = np.arange(nelem)
    df["y"] = np.random.randint(nelem, size=nelem)
    ddf1 = dgd.from_cudf(df, npartitions=5)

    df["z"] = np.arange(nelem)
    ddf2 = dgd.from_cudf(df, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"z")
Example #17
def weakly_connected_components(input_graph):
    """
    Generate the Weakly Connected Components and attach a component label to
    each vertex.

    Parameters
    ----------
    input_graph : cugraph.Graph, networkx.Graph, CuPy or SciPy sparse matrix

        Graph or matrix object, which should contain the connectivity
        information
    """

    client = default_client()

    input_graph.compute_renumber_edge_list()

    ddf = input_graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(input_graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    result = [client.submit(call_wcc,
                            Comms.get_session_id(),
                            wf[1],
                            src_col_name,
                            dst_col_name,
                            num_verts,
                            num_edges,
                            vertex_partition_offsets,
                            input_graph.aggregate_segment_offsets,
                            workers=[wf[0]])
              for idx, wf in enumerate(data.worker_to_parts.items())]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if input_graph.renumbered:
        return input_graph.unrenumber(ddf, 'vertex')

    return ddf
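A hedged usage sketch for weakly_connected_components, mirroring the doctest style of the sssp and bfs examples later in this collection; it assumes the function is exposed via cugraph.dask, and the file path and column names are illustrative:

import cugraph
import cugraph.dask as dcg
import dask_cudf

# (cluster and Comms initialization omitted, as in the sssp doctest below)
chunksize = dcg.get_chunksize("edgelist.csv")
ddf = dask_cudf.read_csv("edgelist.csv", chunksize=chunksize,
                         delimiter=' ',
                         names=['src', 'dst', 'value'],
                         dtype=['int32', 'int32', 'float32'])
dg = cugraph.DiGraph()
dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
wcc_df = dcg.weakly_connected_components(dg)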
Example #18
def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1["bad"] = np.arange(nelem)
    df1["bad"] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2["bad"] = np.arange(nelem)
    df2["bad"] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_cudf(df1, npartitions=5)
    ddf2 = dgd.from_cudf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    raises.match(r"same type")
Example #19
    def __call__(self, inputs):
        # validate inputs
        Class = type(self)
        cache = Class.cache_dir
        inputs = [self.__make_copy(i) for i in inputs]
        if not isinstance(self.load, bool) or self.load:
            if isinstance(self.load, bool):
                output_df = self.load_cache(cache+'/'+self.uid+'.hdf5')
            else:
                output_df = self.load
        else:
            if not self.delayed_process:
                output_df = self.process(inputs)
            else:
                # handle the dask dataframe automatically
                # use the to_delayed interface
                # TODO, currently only handles first input is dask_cudf df
                i_df = inputs[0]
                rest = inputs[1:]
                if isinstance(i_df, dask_cudf.DataFrame):
                    d_fun = dask.delayed(self.process)
                    output_df = dask_cudf.from_delayed([
                        d_fun([item] + rest) for item in i_df.to_delayed()])
                else:
                    output_df = self.process(inputs)

        if self.uid != 'unique_output' and output_df is None:
            raise Exception("None output")
        elif (isinstance(output_df, cudf.DataFrame) or
              isinstance(output_df, dask_cudf.DataFrame)
              ) and len(output_df) == 0:
            raise Exception("empty output")
        elif not self.__valide(output_df, self.output_columns):
            raise Exception("not valid output")

        if self.save:
            os.makedirs(cache, exist_ok=True)
            output_df.to_hdf(cache+'/'+self.uid+'.hdf5', key=self.uid)

        return output_df
Example #20
    def fit(self, X, y):
        """
        Fits a multi-gpu linear regression model such that each of the
        resulting coefficients are also distributed across the GPUs.
        :param futures:
        :return:
        """
        client = default_client()

        self.dtype = X[X.columns[0]].compute().dtype

        coef, intercept, locations = client.sync(self._do_fit, X, y,
                                                 self.dtype)

        self.intercept = intercept
        self._locations = locations

        self._model_fit = True

        self._ncols = X.shape[1]

        self.coef_ = dask_cudf.from_delayed(coef)
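fit above stores the distributed coefficients with dask_cudf.from_delayed(coef), and predict later hands those pieces back to the workers. A minimal, self-contained sketch of that storage pattern; the values are illustrative and this is not the cuml API:

import cudf
import dask
import dask_cudf


@dask.delayed
def coef_chunk(vals):
    # One chunk of coefficients living on one worker.
    return cudf.Series(vals)


coef_parts = [coef_chunk([0.5, 1.25]), coef_chunk([-0.75])]
coef_ = dask_cudf.from_delayed(coef_parts)
print(coef_.compute())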
Example #21
def test_frame_dtype_error():
    nelem = 20

    df1 = gd.DataFrame()
    df1['bad'] = np.arange(nelem)
    df1['bad'] = np.arange(nelem, dtype=np.float64)

    df2 = gd.DataFrame()
    df2['bad'] = np.arange(nelem)
    df2['bad'] = np.arange(nelem, dtype=np.float32)

    ddf1 = dgd.from_cudf(df1, npartitions=5)
    ddf2 = dgd.from_cudf(df2, npartitions=5)

    combined = dgd.from_delayed(ddf1.to_delayed() + ddf2.to_delayed())

    with pytest.raises(ValueError) as raises:
        combined.compute()

    print("out")
    raises.match(r"^Metadata mismatch found in `from_delayed`.")
    raises.match(r"\s+\|\s+".join(['bad', 'float32', 'float64']))
Example #22
    def predict(self, X):
        """
        Predict values for the multi-gpu linear regression model by making
        calls to the predict function with dask-cudf objects.

        :param X:
            a dask-cudf with data distributed one worker per GPU
        :return:
            a dask-cudf containing outputs of the linear regression
        """
        if self._model_fit:

            client = default_client()
            ret = client.sync(self._do_predict, X, self.coef_, self._locations,
                              self.intercept, self.dtype)

            ret = dask_cudf.from_delayed(ret)

            return ret

        else:
            raise ValueError('Model coefficients have not been fit. You need '
                             'to run the fit() method first. ')
Example #23
def sssp(graph, source):
    """
    Compute the distance and predecessors for shortest paths from the specified
    source to all the vertices in the graph. The distances column will store
    the distance from the source to each vertex. The predecessors column will
    store each vertex's predecessor in the shortest path. Vertices that are
    unreachable will have a distance of infinity denoted by the maximum value
    of the data type and the predecessor set as -1. The source vertex's
    predecessor is also set to -1.
    The input graph must contain the edge list as a dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe.
        Undirected Graph not currently supported.
    source : Integer
        Specify source vertex

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex

        df['predecessor'] gives the vertex id it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> # ... Init a DASK Cluster
    >>> #     see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.sssp(dg, 0)
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    ddf = graph.edgelist.edgelist_df
    vertex_partition_offsets = get_vertex_partition_offsets(graph)
    num_verts = vertex_partition_offsets.iloc[-1]
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        source = graph.lookup_internal_vertex_id(cudf.Series([source
                                                              ])).compute()
        source = source.iloc[0]

    result = [
        client.submit(call_sssp,
                      Comms.get_session_id(),
                      wf[1],
                      num_verts,
                      num_edges,
                      vertex_partition_offsets,
                      graph.aggregate_segment_offsets,
                      source,
                      workers=[wf[0]])
        for idx, wf in enumerate(data.worker_to_parts.items())
    ]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if graph.renumbered:
        ddf = graph.unrenumber(ddf, 'vertex')
        ddf = graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)

    return ddf
Example #24
def main(client):
    import dask_cudf
    import cudf

    item_df = read_tables()
    """
    Filter and Join web_clickstreams and item table.
    SELECT wcs_user_sk,
      (wcs_click_date_sk*24L*60L*60L + wcs_click_time_sk) AS tstamp_inSec,
      i_category_id
    FROM web_clickstreams wcs, item i
    WHERE wcs.wcs_item_sk = i.i_item_sk
    AND i.i_category_id IS NOT NULL
    AND wcs.wcs_user_sk IS NOT NULL
    """
    f_item_df = item_df[item_df["i_category_id"].notnull()].reset_index(
        drop=True)

    # The main idea: if we don't fuse the filtration task with the reading
    # task, we get more memory pressure, because the whole dataset is read
    # (and spilled) at once before any filtering happens.

    ### The PR below has the dashboard snapshot that makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    web_clickstream_flist = glob.glob(cli_args["data_dir"] +
                                      "web_clickstreams/*.parquet")
    task_ls = [
        delayed(pre_repartition_task)(fn, f_item_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    ### Repartition so that the clicks for each user end up in the same partition
    merged_df = merged_df.repartition(columns=["wcs_user_sk"])

    ### Main Query
    ### sessionize logic.
    distinct_session_df = merged_df.map_partitions(
        get_distinct_sessions,
        keep_cols=["wcs_user_sk", "i_category_id"],
        time_out=q30_session_timeout_inSec,
    )

    del merged_df
    ### create pairs out of item category id's.
    pair_df = distinct_session_df.map_partitions(
        get_pairs,
        pair_col="i_category_id",
        output_col_1="category_id_1",
        output_col_2="category_id_2",
    )

    del distinct_session_df
    ### apply groupby on "category_id_1", "category_id_2"
    grouped_df = (pair_df.groupby(["category_id_1", "category_id_2"
                                   ]).size(split_every=2).reset_index())

    grouped_df.columns = ["category_id_1", "category_id_2", "cnt"]

    result_df = grouped_df.repartition(npartitions=1).persist()
    ### sort records in desc order and reset index.
    ### below only has 40 rows so leaving as cudf frame should be fine
    result_df = result_df.map_partitions(
        lambda x: x.sort_values("cnt", ascending=False))
    result_df = result_df.reset_index(drop=True).head(q30_limit)
    return result_df
Example #25
def pagerank(edge_list, alpha=0.85, max_iter=30):
    """
    Find the PageRank values for each vertex in a graph using multiple GPUs.
    cuGraph computes an approximation of the Pagerank using the power method.
    The input edge list should be provided as a dask-cudf dataframe
    with one partition per GPU.

    Parameters
    ----------
    edge_list : dask_cudf.DataFrame
        Contain the connectivity information as an edge list.
        Source 'src' and destination 'dst' columns must be of type 'int32'.
        Edge weights are not used for this algorithm.
        Indices must be in the range [0, V-1], where V is the global number
        of vertices.
    alpha : float
        The damping factor alpha represents the probability to follow an
        outgoing edge, standard value is 0.85.
        Thus, 1.0-alpha is the probability to “teleport” to a random vertex.
        Alpha should be greater than 0.0 and strictly lower than 1.0.
    max_iter : int
        The maximum number of iterations before an answer is returned.
        If this value is lower or equal to 0 cuGraph will use the default
        value, which is 30.

    Returns
    -------
    PageRank : dask_cudf.DataFrame
        Dask GPU DataFrame containing two columns of size V: the vertex
        identifiers and the corresponding PageRank values.

    Examples
    --------
    >>> import dask_cugraph.pagerank as dcg
    >>> chunksize = dcg.get_chunksize(edge_list.csv)
    >>> ddf_edge_list = dask_cudf.read_csv(edge_list.csv,
    >>>                                    chunksize = chunksize,
    >>>                                    delimiter='\t',
    >>>                                    names=['src', 'dst'],
    >>>                                    dtype=['int32', 'int32'])
    >>> pr = dcg.pagerank(ddf_edge_list, alpha=0.85, max_iter=50)
    """

    client = default_client()
    gpu_futures = _get_mg_info(edge_list)
    # npartitions = len(gpu_futures)

    host_dict = _build_host_dict(gpu_futures, client).items()
    if len(host_dict) > 1:
        raise Exception("Dask cluster appears to span hosts. Current "
                        "multi-GPU version is limited to single host")

    master_host = [(host, random.sample(ports, 1)[0])
                   for host, ports in host_dict][0]

    host, port = master_host
    gpu_futures_for_host = list(filter(lambda d: d[0][0] == host, gpu_futures))
    exec_node = (host, port)
    # build ipc handles
    gpu_data_excl_worker = list(
        filter(lambda d: d[0] != exec_node, gpu_futures_for_host))
    gpu_data_incl_worker = list(
        filter(lambda d: d[0] == exec_node, gpu_futures_for_host))

    ipc_handles = [
        client.submit(get_ipc_handle, future, workers=[worker])
        for worker, future in gpu_data_excl_worker
    ]

    raw_arrays = [future for worker, future in gpu_data_incl_worker]
    pr = [
        client.submit(_mg_pagerank, (ipc_handles, raw_arrays, alpha, max_iter),
                      workers=[exec_node])
    ]
    c = cudf.DataFrame({
        'vertex': cudf.Series(dtype='int32'),
        'pagerank': cudf.Series(dtype='float32')
    })
    ddf = dc.from_delayed(pr, meta=c)
    return ddf
Example #26
    def __delayed_call(self, inputs):
        '''Delayed processing called when self.delayed_process is set. To
        handle delayed processing automatically, prerequisites are checked via
        call to:
            :meth:`__check_dly_processing_prereq`
        Additionally, all input dask_cudf dataframes have to be partitioned the
        same way, i.e. have an equal number of partitions.
        '''

        def get_pout(out_dict, port):
            '''Get the output in out_dict at key port. Used for delayed
            unpacking.'''
            # DEBUGGING
            # try:
            #     from dask.distributed import get_worker
            #     worker = get_worker()
            #     print('worker{} get_pout NODE "{}" port "{}" worker: {}'
            #           .format(worker.name, self.uid, port, worker))
            # except Exception as err:
            #     print(err)

            df_out = out_dict.get(port, cudf.DataFrame())

            if isinstance(df_out, cudf.DataFrame):
                # Needed for the same reason as __make_copy: to prevent
                # columns from being added to the input data frames. In Python
                # objects are passed by reference and dataframes are mutable.
                # Handles the case where dask_cudf.DataFrames are the source
                # frames, which appear as cudf.DataFrame inside a dask-delayed
                # function.
                return df_out.copy(deep=False)

            return df_out

        inputs_dly = {}
        # A dask_cudf object returns a list of dask delayed objects via the
        # to_delayed() API. The logic below assumes (and otherwise raises an
        # error) that all inputs are dask_cudf objects distributed in the
        # same manner. Ex. inputs_dly:
        #     inputs_dly = {
        #         p0: {
        #             iport0: ddf_dly_i0_p0,
        #             iport1: ddf_dly_i1_p0,
        #             ... for all iports
        #         },
        #         p1: {
        #             iport0: ddf_dly_i0_p1,
        #             iport1: ddf_dly_i1_p1,
        #             ... for all iports
        #         },
        #         ... for all partitions
        # i_x - iport
        # p_x - partition index

        npartitions = None
        for iport, dcudf in inputs.items():
            ddf_dly_list = dcudf.to_delayed()
            npartitions_ = len(ddf_dly_list)
            if npartitions is None:
                npartitions = npartitions_
            if npartitions != npartitions_:
                raise Exception(
                    'Error DASK_CUDF PARTITIONS MISMATCH: Node "{}" input "{}"'
                    ' has {} npartitions and other inputs have {} partitions'
                    .format(self.uid, iport, npartitions_, npartitions))
            for idly, dly in enumerate(ddf_dly_list):
                inputs_dly.setdefault(idly, {}).update({
                    # iport: dly.persist()  # DON'T PERSIST HERE
                    iport: dly
                })

        # DEBUGGING
        # print('INPUTS_DLY:\n{}'.format(inputs_dly))

        outputs_dly = {}
        # Formulate a list of delayed objects for each output port to be able
        # to call from_delayed to synthesize a dask_cudf object.
        # Ex. outputs_dly:
        #     outputs_dly = {
        #         o0: [ddf_dly_o0_p0, ddf_dly_o0_p1, ... _pN]
        #         o1: [ddf_dly_o1_p0, ddf_dly_o1_p1, ... _pN]
        #         ... for all output ports
        #     }
        # o_x - output port
        # p_x - delayed partition

        # VERY IMPORTANT TO USE PERSIST:
        # https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.DataFrame.persist
        # Otherwise process will run several times.
        for inputs_ in inputs_dly.values():
            output_df_dly = dask.delayed(self.decorate_process())(inputs_)
            output_df_dly_per = output_df_dly.persist()
            for oport in self._get_output_ports():
                oport_out = dask.delayed(get_pout)(
                    output_df_dly_per, oport)
                outputs_dly.setdefault(oport, []).append(oport_out.persist())

        # DEBUGGING
        # print('OUTPUTS_DLY:\n{}'.format(outputs_dly))

        output_df = {}
        # A dask_cudf object is synthesized from a list of delayed objects.
        # Per outputs_dly above use dask_cudf.from_delayed API.
        for oport in self._get_output_ports():
            output_df[oport] = dask_cudf.from_delayed(outputs_dly[oport])

        return output_df
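A self-contained sketch of the per-port unpacking pattern implemented above: a delayed function returns a dict of cudf frames, and each port is materialized as its own dask_cudf object via from_delayed. Port names and data are illustrative:

import cudf
import dask
import dask_cudf


@dask.delayed
def process_partition(i):
    # Stand-in for the node's process function: one dict of outputs per
    # partition, keyed by output port.
    return {"out_a": cudf.DataFrame({"a": [i]}),
            "out_b": cudf.DataFrame({"b": [i * 10]})}


@dask.delayed
def get_port(out_dict, port):
    # Delayed unpacking of a single port, analogous to get_pout above.
    return out_dict[port]


parts = [process_partition(i) for i in range(3)]
out_a = dask_cudf.from_delayed([get_port(p, "out_a") for p in parts])
out_b = dask_cudf.from_delayed([get_port(p, "out_b") for p in parts])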
Example #27
    def renumber_and_segment(df,
                             src_col_names,
                             dst_col_names,
                             preserve_order=False,
                             store_transposed=False):
        if isinstance(src_col_names, list):
            renumber_type = 'legacy'
        elif not (df[src_col_names].dtype == np.int32
                  or df[src_col_names].dtype == np.int64):
            renumber_type = 'legacy'
        else:
            renumber_type = 'experimental'

        renumber_map = NumberMap()
        if not isinstance(src_col_names, list):
            src_col_names = [src_col_names]
            dst_col_names = [dst_col_names]

        # Assign the new src and dst column names to be used in the renumbered
        # dataframe to return (renumbered_src_col_name and
        # renumbered_dst_col_name)
        renumber_map.set_renumbered_col_names(src_col_names, dst_col_names,
                                              df.columns)

        id_type = df[src_col_names[0]].dtype
        if isinstance(df, cudf.DataFrame):
            renumber_map.implementation = NumberMap.SingleGPU(
                df, src_col_names, dst_col_names, renumber_map.id_type,
                store_transposed)
        elif isinstance(df, dask_cudf.DataFrame):
            renumber_map.implementation = NumberMap.MultiGPU(
                df, src_col_names, dst_col_names, renumber_map.id_type,
                store_transposed)
        else:
            raise TypeError("df must be cudf.DataFrame or dask_cudf.DataFrame")

        if renumber_type == 'legacy':
            indirection_map = renumber_map.implementation.\
                              indirection_map(df,
                                              src_col_names,
                                              dst_col_names)
            df = renumber_map.add_internal_vertex_id(
                df,
                renumber_map.renumbered_src_col_name,
                src_col_names,
                drop=True,
                preserve_order=preserve_order)
            df = renumber_map.add_internal_vertex_id(
                df,
                renumber_map.renumbered_dst_col_name,
                dst_col_names,
                drop=True,
                preserve_order=preserve_order)
        else:
            df = df.rename(
                columns={
                    src_col_names[0]: renumber_map.renumbered_src_col_name,
                    dst_col_names[0]: renumber_map.renumbered_dst_col_name
                })
        num_edges = len(df)

        if isinstance(df, dask_cudf.DataFrame):
            is_mnmg = True
        else:
            is_mnmg = False

        if is_mnmg:
            client = default_client()
            data = get_distributed_data(df)
            result = [(client.submit(call_renumber,
                                     Comms.get_session_id(),
                                     wf[1],
                                     renumber_map.renumbered_src_col_name,
                                     renumber_map.renumbered_dst_col_name,
                                     num_edges,
                                     is_mnmg,
                                     store_transposed,
                                     workers=[wf[0]]), wf[0])
                      for idx, wf in enumerate(data.worker_to_parts.items())]
            wait(result)

            def get_renumber_map(id_type, data):
                return data[0].astype(id_type)

            def get_segment_offsets(data):
                return data[1]

            def get_renumbered_df(id_type, data):
                data[2][renumber_map.renumbered_src_col_name] = \
                    data[2][renumber_map.renumbered_src_col_name]\
                    .astype(id_type)
                data[2][renumber_map.renumbered_dst_col_name] = \
                    data[2][renumber_map.renumbered_dst_col_name]\
                    .astype(id_type)
                return data[2]

            renumbering_map = dask_cudf.from_delayed([
                client.submit(get_renumber_map, id_type, data, workers=[wf])
                for (data, wf) in result
            ])

            list_of_segment_offsets = client.gather([
                client.submit(get_segment_offsets, data, workers=[wf])
                for (data, wf) in result
            ])
            aggregate_segment_offsets = []
            for segment_offsets in list_of_segment_offsets:
                aggregate_segment_offsets.extend(segment_offsets)

            renumbered_df = dask_cudf.from_delayed([
                client.submit(get_renumbered_df, id_type, data, workers=[wf])
                for (data, wf) in result
            ])
            if renumber_type == 'legacy':
                renumber_map.implementation.ddf = indirection_map.merge(
                    renumbering_map,
                    right_on='original_ids', left_on='global_id',
                    how='right').\
                    drop(columns=['global_id', 'original_ids'])\
                    .rename(columns={'new_ids': 'global_id'})
            else:
                renumber_map.implementation.ddf = renumbering_map.rename(
                    columns={
                        'original_ids': '0',
                        'new_ids': 'global_id'
                    })
            renumber_map.implementation.numbered = True
            return renumbered_df, renumber_map, aggregate_segment_offsets

        else:
            renumbering_map, segment_offsets, renumbered_df = \
                c_renumber.renumber(df,
                                    renumber_map.renumbered_src_col_name,
                                    renumber_map.renumbered_dst_col_name,
                                    num_edges,
                                    0,
                                    Comms.get_default_handle(),
                                    is_mnmg,
                                    store_transposed)
            if renumber_type == 'legacy':
                renumber_map.implementation.df = indirection_map.\
                    merge(renumbering_map,
                          right_on='original_ids', left_on='id').\
                    drop(columns=['id', 'original_ids'])\
                    .rename(columns={'new_ids': 'id'}, copy=False)
            else:
                renumber_map.implementation.df = renumbering_map.rename(
                    columns={
                        'original_ids': '0',
                        'new_ids': 'id'
                    }, copy=False)

            renumber_map.implementation.numbered = True
            return renumbered_df, renumber_map, segment_offsets
Example #28
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get cat_id of review and dynamic type
    # see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas(
    )
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))

    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] +
                                      "web_clickstreams/*.parquet")

    task_ls = [
        delayed(etl_wcs)(fn, filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)
    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.repartition(columns=["wcs_user_sk"])
    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(web_sales_df,
                                               left_on=["d_date_sk"],
                                               right_on=["ws_sold_date_sk"],
                                               how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]

    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066
    # lands

    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()

    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [
        no_q08_review_sales_amount
    ]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
Example #29
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can filter now
    item_df_filtered = item_df.loc[item_df.i_category_id.isin(
        q03_purchased_item_category_IN)].reset_index(drop=True)

    # The main idea: if we don't fuse the filtration task with the reading
    # task, we get more memory pressure, because the whole dataset is read
    # (and spilled) at once before any filtering happens.

    ### The PR below has the dashboard snapshot that makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    web_clickstream_flist = glob.glob(
        os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn, item_df.to_delayed()[0],
                                      wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(reduction_function,
                                          item_df_filtered.to_delayed()[0],
                                          meta=meta_df)

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"
                                     ]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df
Example #30
def bfs(graph,
        start,
        return_distances=False):
    """
    Find the distances and predecessors for a breadth first traversal of a
    graph.
    The input graph must contain the edge list as a dask-cudf dataframe with
    one partition per GPU.

    Parameters
    ----------
    graph : cugraph.DiGraph
        cuGraph graph descriptor, should contain the connectivity information
        as dask cudf edge list dataframe(edge weights are not used for this
        algorithm). Undirected Graph not currently supported.
    start : Integer
        Specify starting vertex for breadth-first search; this function
        iterates over edges in the component reachable from this node.
    return_distances : bool, optional, default=False
        Indicates if distances should be returned

    Returns
    -------
    df : dask_cudf.DataFrame
        df['vertex'] gives the vertex id

        df['distance'] gives the path distance from the
        starting vertex (Only if return_distances is True)

        df['predecessor'] gives the vertex it was
        reached from in the traversal

    Examples
    --------
    >>> import cugraph.dask as dcg
    >>> Comms.initialize(p2p=True)
    >>> chunksize = dcg.get_chunksize(input_data_path)
    >>> ddf = dask_cudf.read_csv(input_data_path, chunksize=chunksize,
                                 delimiter=' ',
                                 names=['src', 'dst', 'value'],
                                 dtype=['int32', 'int32', 'float32'])
    >>> dg = cugraph.DiGraph()
    >>> dg.from_dask_cudf_edgelist(ddf, 'src', 'dst')
    >>> df = dcg.bfs(dg, 0)
    >>> Comms.destroy()
    """

    client = default_client()

    graph.compute_renumber_edge_list(transposed=False)
    (ddf,
     num_verts,
     partition_row_size,
     partition_col_size,
     vertex_partition_offsets) = shuffle(graph, transposed=False)
    num_edges = len(ddf)
    data = get_distributed_data(ddf)

    if graph.renumbered:
        start = graph.lookup_internal_vertex_id(cudf.Series([start],
                                                dtype='int32')).compute()
        start = start.iloc[0]

    result = [client.submit(
              call_bfs,
              Comms.get_session_id(),
              wf[1],
              num_verts,
              num_edges,
              vertex_partition_offsets,
              start,
              return_distances,
              workers=[wf[0]])
              for idx, wf in enumerate(data.worker_to_parts.items())]
    wait(result)
    ddf = dask_cudf.from_delayed(result)

    if graph.renumbered:
        ddf = graph.unrenumber(ddf, 'vertex')
        ddf = graph.unrenumber(ddf, 'predecessor')
        ddf["predecessor"] = ddf["predecessor"].fillna(-1)
    return ddf