Example #1
async def _extract_partitions(dask_obj, client=None, batch_enabled=False):
    client = default_client() if client is None else client
    worker_list = Comms.get_workers()
    # dask.dataframe or dask.array
    if isinstance(dask_obj, (daskDataFrame, daskArray, daskSeries)):
        # parts = persist_distributed_data(dask_obj, client)
        # FIXME: persist data to the same worker when batch_enabled=True
        if batch_enabled:
            persisted = client.persist(dask_obj, workers=worker_list[0])
        else:
            persisted = [
                client.persist(dask_obj.get_partition(p), workers=w)
                for p, w in enumerate(worker_list)
            ]
        parts = futures_of(persisted)
    # iterable of dask collections (need to colocate them)
    elif isinstance(dask_obj, collections.abc.Sequence):
        # NOTE: We colocate (X, y) here by zipping their delayed
        # partitions as (X1, y1), (X2, y2), ... and asking the client
        # to compute a single future for each tuple in the list
        dela = [np.asarray(d.to_delayed()) for d in dask_obj]

        # TODO: ravel() is causing strange behavior w/ delayed Arrays which are
        # not yet backed by futures. Need to investigate this behavior.
        # ref: https://github.com/rapidsai/cuml/issues/2045
        raveled = [d.flatten() for d in dela]
        parts = client.compute(list(zip(*raveled)))

    await wait(parts)
    key_to_part = [(str(part.key), part) for part in parts]
    who_has = await client.who_has(parts)
    return [(first(who_has[key]), part) for key, part in key_to_part]
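
A minimal usage sketch (hypothetical, not part of the snippet above): the coroutine can be driven through Client.sync, assuming a running GPU cluster whose workers have been registered with Comms; the scheduler address below is only a placeholder.

import cudf
import dask_cudf
from dask.distributed import Client

client = Client("tcp://scheduler:8786")   # placeholder address, assumes a GPU cluster
ddf = dask_cudf.from_cudf(
    cudf.DataFrame({"src": [0, 1, 2, 3], "dst": [1, 2, 3, 0]}),
    npartitions=2,
)

# Client.sync runs the async helper on the client's event loop.
worker_to_part = client.sync(_extract_partitions, ddf, client)
for worker, part in worker_to_part:
    print(worker, part.key)
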
Example #2
def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
    """
    Take a COO stored in a distributed DataFrame along with the names of
    its source and destination columns, and create a new data frame,
    using the same column names, that symmetrizes the graph so that all
    edges appear in both directions.

    Note that if other columns exist in the data frame (e.g. edge weights)
    the other columns will also be replicated.  That is, if (u,v,data)
    represents the source value (u), destination value (v) and some
    set of other columns (data) in the input data, then the output
    data will contain both (u,v,data) and (v,u,data) with matching
    data.

    If (u,v,data1) and (v,u,data2) exist in the input data where data1
    != data2, this code will arbitrarily pick the smaller data element
    to keep. If this is not desired, the caller should correct the data
    prior to calling symmetrize.

    Parameters
    ----------
    df : dask_cudf.DataFrame
        Input data frame containing COO.  Columns should contain source
        ids, destination ids and any properties associated with the
        edges.
    src_name : string
        Name of the column in the data frame containing the source ids
    dst_name : string
        Name of the column in the data frame containing the destination ids
    weight_name : string, optional (default=None)
        Name of the column in the data frame containing the weights

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> sym_df = cugraph.symmetrize(M, '0', '1')
    """
    if weight_name:
        ddf2 = df[[dst_name, src_name, weight_name]]
        ddf2.columns = [src_name, dst_name, weight_name]
    else:
        ddf2 = df[[dst_name, src_name]]
        ddf2.columns = [src_name, dst_name]
    worker_list = Comms.get_workers()
    num_workers = len(worker_list)
    ddf = df.append(ddf2).reset_index(drop=True)
    result = ddf.shuffle(on=[src_name, dst_name],
                         ignore_index=True,
                         npartitions=num_workers)
    result = result.map_partitions(lambda x: x.groupby(
        by=[src_name, dst_name], as_index=False).min().reset_index(drop=True))

    return result
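
A hypothetical usage sketch: symmetrize a tiny edge list. It assumes a running GPU cluster with an initialized Comms session, since the function asks Comms for the worker list to pick the number of shuffle partitions; every name below that is not in the snippet is illustrative only.

import cudf
import dask_cudf

edges = cudf.DataFrame({"src": [0, 1], "dst": [1, 2], "wgt": [1.0, 2.0]})
ddf = dask_cudf.from_cudf(edges, npartitions=2)

sym = symmetrize_ddf(ddf, "src", "dst", "wgt")
print(sym.compute())
# The result is expected to contain both directions of every edge,
# e.g. (0, 1, 1.0) and (1, 0, 1.0).
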
Example #3
def persist_distributed_data(dask_df, client):
    client = default_client() if client is None else client
    worker_addresses = Comms.get_workers()
    _keys = dask_df.__dask_keys__()
    # Map the string form of each partition key to exactly one worker
    # address, so that partition i is pinned to worker i.
    worker_dict = {}
    for i, key in enumerate(_keys):
        worker_dict[str(key)] = (worker_addresses[i],)
    # Persist with the explicit key-to-worker mapping and hand back the
    # underlying futures, one per partition.
    persisted = client.persist(dask_df, workers=worker_dict)
    parts = futures_of(persisted)
    return parts
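
A hypothetical sketch of pinning one partition per worker. It assumes an existing GPU cluster, an initialized Comms session, and that the DataFrame has no more partitions than Comms.get_workers() returns workers; otherwise the index lookup in the loop above raises an IndexError. The scheduler address is a placeholder.

import cudf
import dask_cudf
from dask.distributed import Client, wait

client = Client("tcp://scheduler:8786")   # placeholder address
ddf = dask_cudf.from_cudf(cudf.DataFrame({"x": list(range(8))}),
                          npartitions=len(Comms.get_workers()))
parts = persist_distributed_data(ddf, client)
wait(parts)   # block until every pinned partition is materialized
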
Example #4
def _workers_to_parts(futures):
    """
    Builds an ordered dict mapping each worker to its list
    of parts
    :param futures: list of (worker, part) tuples
    :return: OrderedDict mapping each worker address to a list of parts
    """
    w_to_p_map = OrderedDict.fromkeys(Comms.get_workers())
    for w, p in futures:
        if w_to_p_map[w] is None:
            w_to_p_map[w] = []
        w_to_p_map[w].append(p)
    return w_to_p_map
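
An illustrative call with plain strings standing in for worker addresses and futures; it only behaves as shown if Comms.get_workers() reports exactly these (made-up) addresses for the running cluster.

futures = [
    ("tcp://10.0.0.1:8786", "part-0"),
    ("tcp://10.0.0.2:8786", "part-1"),
    ("tcp://10.0.0.1:8786", "part-2"),
]
print(_workers_to_parts(futures))
# OrderedDict([('tcp://10.0.0.1:8786', ['part-0', 'part-2']),
#              ('tcp://10.0.0.2:8786', ['part-1'])])
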
Example #5
def _workers_to_parts(futures):
    """
    Builds an ordered dict mapping each worker to its list
    of parts
    :param futures: list of (worker, part) tuples
    :return: OrderedDict mapping each worker address to a list of parts
    """
    w_to_p_map = OrderedDict.fromkeys(Comms.get_workers())
    for w, p in futures:
        if w_to_p_map[w] is None:
            w_to_p_map[w] = []
        w_to_p_map[w].append(p)
    keys_to_delete = [w for (w, p) in w_to_p_map.items() if p is None]
    for k in keys_to_delete:
        del w_to_p_map[k]
    return w_to_p_map
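
This variant differs from Example #4 only in the final cleanup: workers that received no parts still carry the None placeholder from OrderedDict.fromkeys, and those entries are deleted so that every value in the returned dict is a non-empty list.
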
Example #6
def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
    """
    Take a COO stored in a distributed DataFrame along with the names of
    its source and destination columns, and create a new data frame,
    using the same column names, that symmetrizes the graph so that all
    edges appear in both directions.

    Note that if other columns exist in the data frame (e.g. edge weights)
    the other columns will also be replicated.  That is, if (u,v,data)
    represents the source value (u), destination value (v) and some
    set of other columns (data) in the input data, then the output
    data will contain both (u,v,data) and (v,u,data) with matching
    data.

    If (u,v,data1) and (v,u,data2) exist in the input data where data1
    != data2, this code will arbitrarily pick the smaller data element
    to keep. If this is not desired, the caller should correct the data
    prior to calling symmetrize.

    Parameters
    ----------
    df : dask_cudf.DataFrame
        Input data frame containing COO.  Columns should contain source
        ids, destination ids and any properties associated with the
        edges.

    src_name : string
        Name of the column in the data frame containing the source ids

    dst_name : string
        Name of the column in the data frame containing the destination ids

    weight_name : string, optional (default=None)
        Name of the column in the data frame containing the weights

    Examples
    --------
    >>> # import cugraph.dask as dcg
    >>> # from cugraph.structure.symmetrize import symmetrize_ddf
    >>> # Init a DASK Cluster
    >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/..
    >>> # chunksize = dcg.get_chunksize(datasets / 'karate.csv')
    >>> # ddf = dask_cudf.read_csv(datasets/'karate.csv', chunksize=chunksize,
    >>> #                          delimiter=' ',
    >>> #                          names=['src', 'dst', 'weight'],
    >>> #                          dtype=['int32', 'int32', 'float32'])
    >>> # sym_ddf = symmetrize_ddf(ddf, "src", "dst", "weight")

    """
    # FIXME: Fix the above (broken) example and uncomment it

    if weight_name:
        ddf2 = df[[dst_name, src_name, weight_name]]
        ddf2.columns = [src_name, dst_name, weight_name]
    else:
        ddf2 = df[[dst_name, src_name]]
        ddf2.columns = [src_name, dst_name]
    worker_list = Comms.get_workers()
    num_workers = len(worker_list)
    ddf = df.append(ddf2).reset_index(drop=True)
    result = ddf.shuffle(on=[src_name, dst_name],
                         ignore_index=True,
                         npartitions=num_workers)
    result = result.map_partitions(lambda x: x.groupby(
        by=[src_name, dst_name], as_index=False).min().reset_index(drop=True))

    return result
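
A hypothetical sketch of the duplicate handling described in the docstring: when (u, v, w1) and (v, u, w2) both appear, the per-partition groupby().min() keeps the smaller weight on the symmetrized edge. It assumes a running GPU cluster with an initialized Comms session; the data below is illustrative only.

import cudf
import dask_cudf

edges = cudf.DataFrame({"src": [0, 1], "dst": [1, 0], "wgt": [5.0, 2.0]})
ddf = dask_cudf.from_cudf(edges, npartitions=1)

sym = symmetrize_ddf(ddf, "src", "dst", "wgt")
print(sym.compute())
# Both (0, 1) and (1, 0) are expected to carry weight 2.0, the smaller of
# the two conflicting values.
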