Code Example #1
def _write_uniques(dfs, base_path, col_group, options):
    if options.concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=options.name_sep)]
    if isinstance(col_group, str):
        col_group = [col_group]
    if options.on_host:
        # Construct gpu DataFrame from pyarrow data.
        # `on_host=True` implies gpu-backed data.
        df = pa.concat_tables(dfs, promote=True)
        df = _from_host(df)
    else:
        df = _concat(dfs, ignore_index=True)
    rel_path = "unique.%s.parquet" % (_make_name(*col_group,
                                                 sep=options.name_sep))
    path = "/".join([base_path, rel_path])
    if len(df):
        # Make sure first category is Null
        df = df.sort_values(col_group, na_position="first")
        new_cols = {}
        nulls_missing = False
        for col in col_group:
            name_count = col + "_count"
            if options.max_size:
                max_emb_size = options.max_size
                if isinstance(options.max_size, dict):
                    max_emb_size = max_emb_size[col]
                if options.num_buckets:
                    if isinstance(options.num_buckets, int):
                        nlargest = max_emb_size - options.num_buckets - 1
                    else:
                        nlargest = max_emb_size - options.num_buckets[col] - 1
                else:
                    nlargest = max_emb_size - 1

                if nlargest <= 0:
                    raise ValueError("`nlargest` cannot be 0 or negative")

                if nlargest < len(df):
                    df = df.nlargest(n=nlargest, columns=name_count)
            if not _series_has_nulls(df[col]):
                nulls_missing = True
                new_cols[col] = _concat(
                    [
                        df._constructor_sliced([None], dtype=df[col].dtype),
                        df[col]
                    ],
                    ignore_index=True,
                )
            else:
                new_cols[col] = df[col].copy(deep=False)
        if nulls_missing:
            df = type(df)(new_cols)
        df.to_parquet(path, index=False, compression=None)
    else:
        df_null = type(df)({c: [None] for c in col_group})
        for c in col_group:
            df_null[c] = df_null[c].astype(df[c].dtype)
        df_null.to_parquet(path, index=False, compression=None)
    del df
    return path
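
Below is a minimal pandas-only sketch (not part of the NVTabular code above) of the "null category first" convention that `_write_uniques` enforces: sort with nulls first, and prepend a null row when none exists so that position 0 is always reserved for the null category. The toy column names are assumptions.

import pandas as pd

# Toy uniques table; pandas stands in for the cudf/dask-backed frames used above.
uniques = pd.DataFrame({"cat": ["b", "a", None], "cat_count": [3, 5, 2]})

# Sort so that an existing null lands in the first row.
uniques = uniques.sort_values("cat", na_position="first", ignore_index=True)

# If no null is present, prepend one so position 0 is reserved for nulls.
if not uniques["cat"].isna().any():
    uniques = pd.concat([pd.DataFrame({"cat": [None]}), uniques], ignore_index=True)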
Code Example #2
def _write_uniques(dfs, base_path, col_group, on_host, concat_groups, name_sep):
    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]
    ignore_index = True
    if isinstance(col_group, str):
        col_group = [col_group]
    df = _concat(dfs, ignore_index)
    if on_host:
        df.reset_index(drop=True, inplace=True)
        df = cudf.from_pandas(df)
    rel_path = "unique.%s.parquet" % (_make_name(*col_group, sep=name_sep))
    path = "/".join([base_path, rel_path])
    if len(df):
        # Make sure first category is Null
        df = df.sort_values(col_group, na_position="first")
        new_cols = {}
        nulls_missing = False
        for col in col_group:
            if not df[col]._column.has_nulls:
                nulls_missing = True
                new_cols[col] = _concat(
                    [cudf.Series([None], dtype=df[col].dtype), df[col]], ignore_index
                )
            else:
                new_cols[col] = df[col].copy(deep=False)
        if nulls_missing:
            df = cudf.DataFrame(new_cols)
        df.to_parquet(path, write_index=False, compression=None)
    else:
        df_null = cudf.DataFrame({c: [None] for c in col_group})
        for c in col_group:
            df_null[c] = df_null[c].astype(df[c].dtype)
        df_null.to_parquet(path, write_index=False, compression=None)
    del df
    return path
Code Example #3
File: test_categorical.py Project: zzzz123321/dask
def test_concat_unions_categoricals():
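    # Note: `frames` through `frames6` are module-level fixtures defined elsewhere in this test file.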
    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(_concat([i.y for i in frames]),
                           pd.concat([i.y for i in frames2]))

    # Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames3]),
                          pd.concat([i for i in frames4]).index)

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat([i[['x', 'z']] for i in frames3]),
                          pd.concat([i[['x', 'z']] for i in frames4]))

    # Categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.z for i in frames3]),
                           pd.concat([i.z for i in frames4]))

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(_concat([i.x for i in frames3]),
                           pd.concat([i.x for i in frames4]))

    # MultiIndex with Categorical Index
    tm.assert_index_equal(_concat([i.index for i in frames5]),
                          pd.concat([i for i in frames6]).index)

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
Code Example #4
def _mid_level_groupby(dfs, col, cont_cols, agg_list, freq_limit, on_host):
    ignore_index = True
    if on_host:
        gb = cudf.from_pandas(_concat(dfs, ignore_index)).groupby(
            col, dropna=False).sum()
    else:
        gb = _concat(dfs, ignore_index).groupby(col, dropna=False).sum()
    gb.reset_index(drop=False, inplace=True)

    name_count = _make_name(col, "count")
    if freq_limit:
        gb = gb[gb[name_count] >= freq_limit]

    required = [col]
    if "count" in agg_list:
        required.append(name_count)

    ddof = 1
    for cont_col in cont_cols:
        name_sum = _make_name(col, cont_col, "sum")
        if "sum" in agg_list:
            required.append(name_sum)

        if "mean" in agg_list:
            name_mean = _make_name(col, cont_col, "mean")
            required.append(name_mean)
            gb[name_mean] = gb[name_sum] / gb[name_count]

        if "var" in agg_list or "std" in agg_list:
            n = gb[name_count]
            x = gb[name_sum]
            x2 = gb[_make_name(col, cont_col, "pow2", "sum")]
            result = x2 - x**2 / n
            div = n - ddof
            div[div < 1] = 1
            result /= div
            result[(n - ddof) == 0] = np.nan

            if "var" in agg_list:
                name_var = _make_name(col, cont_col, "var")
                required.append(name_var)
                gb[name_var] = result
            if "std" in agg_list:
                name_std = _make_name(col, cont_col, "std")
                required.append(name_std)
                gb[name_std] = np.sqrt(result)

    if on_host:
        gb_pd = gb[required].to_pandas()
        del gb
        return gb_pd
    return gb[required]
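
As a quick sanity check (not part of the code above), the moments-based formula used for the "var"/"std" aggregations can be verified against pandas on a toy series; `ddof=1` matches the default of `Series.var`.

import numpy as np
import pandas as pd

x = pd.Series([1.0, 2.0, 4.0, 7.0])  # toy data, illustrative only
n, ddof = len(x), 1

# var = (sum(x**2) - sum(x)**2 / n) / (n - ddof), as computed in `_mid_level_groupby`
moment_var = (np.sum(x**2) - np.sum(x)**2 / n) / (n - ddof)
assert np.isclose(moment_var, x.var(ddof=ddof))
assert np.isclose(np.sqrt(moment_var), x.std(ddof=ddof))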
Code Example #5
File: groupby.py Project: vyasr/cudf
def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
    """ Node in groupby-aggregation reduction tree.

        Following the initial `_groupby_partition_agg` tasks,
        the `groupby_agg` algorithm will perform a tree reduction
        to combine the data from the input partitions into
        `split_out` different output partitions.  For each node in
        the reduction tree, the input DataFrame objects are
        concatenated, and "sum", "min" and/or "max" groupby
        aggregations are used to combine the necessary statistics.
    """

    df = _concat(dfs, ignore_index=True)
    agg_dict = {}
    for col in df.columns:
        if col in gb_cols:
            continue
        agg = col.split(sep)[-1]
        if agg in ("count", "sum"):
            agg_dict[col] = ["sum"]
        elif agg in ("min", "max"):
            agg_dict[col] = [agg]
        else:
            raise ValueError(f"Unexpected aggregation: {agg}")

    gb = df.groupby(gb_cols, dropna=dropna, as_index=False,
                    sort=sort).agg(agg_dict)

    # Don't include the last aggregation in the column names
    gb.columns = [_make_name(*name[:-1], sep=sep) for name in gb.columns]
    return gb
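
The sketch below is a minimal pandas illustration (column names and the "__" separator are assumptions, not taken from the code above) of the combination rule `_tree_node_agg` relies on: partial counts and sums are re-aggregated with "sum", while partial mins and maxes are combined with "min"/"max".

import pandas as pd

part1 = pd.DataFrame({"key": ["a", "b"], "x__count": [2, 1], "x__min": [1, 5]})
part2 = pd.DataFrame({"key": ["a", "c"], "x__count": [3, 4], "x__min": [0, 7]})

combined = pd.concat([part1, part2], ignore_index=True)
reduced = combined.groupby("key", as_index=False).agg(
    {"x__count": "sum", "x__min": "min"}  # counts add up; mins take the min of mins
)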
Code Example #6
File: categorify.py Project: hephaex/NVTabular
def _cat_level_2(dfs, col, freq_limit, on_host):
    ignore_index = True
    if on_host:
        # Pandas groupby does not have `dropna` arg
        gb = cudf.from_pandas(_concat(dfs, ignore_index)).groupby(
            col, dropna=False).sum()
    else:
        gb = _concat(dfs, ignore_index).groupby(col, dropna=False).sum()
    gb.reset_index(drop=False, inplace=True)
    if freq_limit:
        gb = gb[gb["count"] >= freq_limit]
    if on_host:
        gb_pd = gb[[col]].to_pandas()
        del gb
        return gb_pd
    return gb[[col]]
Code Example #7
def _tree_node_moments(inputs):
    out = {}
    for val in ["df-count", "df-sum", "df2-sum"]:
        df_list = [x.get(val, None) for x in inputs]
        df_list = [df for df in df_list if df is not None]
        out[val] = _concat(df_list, ignore_index=True).sum().to_frame().transpose()
    return out
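
A small pandas sketch (toy column names, not part of the code above) of the reduction performed per statistic in `_tree_node_moments`: concatenate the partial one-row frames, sum column-wise, and keep the result as a single-row frame.

import pandas as pd

partials = [pd.DataFrame({"x": [3.0], "y": [1.0]}),
            pd.DataFrame({"x": [2.0], "y": [4.0]})]
total = pd.concat(partials, ignore_index=True).sum().to_frame().transpose()
# `total` is a single row with x=5.0 and y=5.0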
Code Example #8
File: merge.py Project: Ethyling/dask-cuda
def df_concat(df_parts):
    """Making sure df_parts is a single dataframe or None"""
    if len(df_parts) == 0:
        return None
    elif len(df_parts) == 1:
        return df_parts[0]
    else:
        return _concat(df_parts)
Code Example #9
File: categorify.py Project: hephaex/NVTabular
def _cat_level_3(dfs, base_path, col, on_host):
    ignore_index = True
    df = _concat(dfs, ignore_index)
    if on_host:
        df = cudf.from_pandas(df)
    rel_path = "unique.%s.parquet" % (col)
    path = "/".join([base_path, rel_path])
    if len(df):
        # Make sure first category is Null
        df = df.sort_values(col, na_position="first")
        if not df[col]._column.has_nulls:
            df = cudf.DataFrame(
                {col: _concat([cudf.Series([None]), df[col]], ignore_index)})
        df.to_parquet(path, write_index=False, compression=None)
    else:
        df_null = cudf.DataFrame({col: [None]})
        df_null[col] = df_null[col].astype(df[col].dtype)
        df_null.to_parquet(path, write_index=False, compression=None)
    del df
    return path
Code Example #10
def _write_gb_stats(dfs, base_path, col, on_host):
    ignore_index = True
    df = _concat(dfs, ignore_index)
    if on_host:
        df = cudf.from_pandas(df)
    rel_path = "cat_stats.%s.parquet" % (col)
    path = os.path.join(base_path, rel_path)
    if len(df):
        df = df.sort_values(col, na_position="first")
        df.to_parquet(path, write_index=False, compression=None)
    else:
        df_null = cudf.DataFrame({col: [None]})
        df_null[col] = df_null[col].astype(df[col].dtype)
        df_null.to_parquet(path, write_index=False, compression=None)
    del df
    return path
Code Example #11
File: test_dataframe.py Project: StuartAxelOwen/dask
def test_concat():
    x = _concat([pd.DataFrame(columns=['a', 'b']),
                 pd.DataFrame(columns=['a', 'b'])])
    assert list(x.columns) == ['a', 'b']
    assert len(x) == 0
Code Example #12
def _mid_level_groupby(
    dfs, col_group, cont_cols, agg_list, freq_limit, on_host, concat_groups, name_sep
):
    if isinstance(col_group, str):
        col_group = [col_group]

    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]

    if on_host:
        df = pa.concat_tables(dfs, promote=True)
        df = cudf.DataFrame.from_arrow(df)
    else:
        df = _concat(dfs, ignore_index=True)
    groups = df.groupby(col_group, dropna=False)
    gb = groups.agg({
        col: _get_aggregation_type(col)
        for col in df.columns if col not in col_group
    })
    gb.reset_index(drop=False, inplace=True)

    name_count = _make_name(*(col_group + ["count"]), sep=name_sep)
    if freq_limit:
        gb = gb[gb[name_count] >= freq_limit]

    required = col_group.copy()
    if "count" in agg_list:
        required.append(name_count)

    ddof = 1
    for cont_col in cont_cols:
        name_sum = _make_name(*(col_group + [cont_col, "sum"]), sep=name_sep)
        if "sum" in agg_list:
            required.append(name_sum)

        if "mean" in agg_list:
            name_mean = _make_name(*(col_group + [cont_col, "mean"]), sep=name_sep)
            required.append(name_mean)
            gb[name_mean] = gb[name_sum] / gb[name_count]

        if "min" in agg_list:
            name_min = _make_name(*(col_group + [cont_col, "min"]), sep=name_sep)
            required.append(name_min)

        if "max" in agg_list:
            name_max = _make_name(*(col_group + [cont_col, "max"]), sep=name_sep)
            required.append(name_max)

        if "var" in agg_list or "std" in agg_list:
            n = gb[name_count]
            x = gb[name_sum]
            x2 = gb[_make_name(*(col_group + [cont_col, "pow2", "sum"]), sep=name_sep)]
            result = x2 - x ** 2 / n
            div = n - ddof
            div[div < 1] = 1
            result /= div
            result[(n - ddof) == 0] = np.nan

            if "var" in agg_list:
                name_var = _make_name(*(col_group + [cont_col, "var"]), sep=name_sep)
                required.append(name_var)
                gb[name_var] = result
            if "std" in agg_list:
                name_std = _make_name(*(col_group + [cont_col, "std"]), sep=name_sep)
                required.append(name_std)
                gb[name_std] = np.sqrt(result)

    if on_host:
        gb_pd = gb[required].to_arrow(preserve_index=False)
        del gb
        return gb_pd
    return gb[required]
Code Example #13
def _top_level_groupby(
    gdf, cat_col_groups, tree_width, cont_cols, agg_list, on_host, concat_groups, name_sep
):
    sum_sq = "std" in agg_list or "var" in agg_list
    calculate_min = "min" in agg_list
    calculate_max = "max" in agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(cat_col_groups):

        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=name_sep)

        if concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = cudf.DataFrame()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat([gdf[col] for col in cat_col_group], ignore_index)
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `cont_cols` is non-empty)
            df_gb = gdf[cat_col_group + cont_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in cont_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]

            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = cudf.DataFrame({cat_col_group[0]: df_gb[cat_col_group[0]].list.leaves})

        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]), sep=name_sep)
            if name[0] == cat_col_group[0]
            else _make_name(*(tuple(cat_col_group) + name), sep=name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        for j, split in enumerate(
            gb.partition_by_hash(cat_col_group, tree_width[cat_col_group_str], keep_index=False)
        ):
            if on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
Code Example #14
def _top_level_groupby(df, options: FitOptions):
    sum_sq = "std" in options.agg_list or "var" in options.agg_list
    calculate_min = "min" in options.agg_list
    calculate_max = "max" in options.agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(options.col_groups):
        if isinstance(cat_col_group, tuple):
            cat_col_group = list(cat_col_group)

        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=options.name_sep)

        if options.concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = type(df)()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat(
                [df[col] for col in cat_col_group], ignore_index)
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `agg_cols` is non-empty)
            df_gb = df[cat_col_group + options.agg_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in options.agg_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=options.name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]

            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf/pd support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = _flatten_list_column(df_gb[cat_col_group[0]])

        # NOTE: groupby(..., dropna=False) requires pandas>=1.1.0
        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]),
                       sep=options.name_sep) if name[0] == cat_col_group[0]
            else _make_name(*(tuple(cat_col_group) + name),
                            sep=options.name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        nsplits = options.tree_width[cat_col_group_str]
        for j, split in shuffle_group(gb, cat_col_group, 0, nsplits, nsplits,
                                      True, nsplits).items():
            if options.on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
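
The split at the end of `_top_level_groupby` routes each groupby-result row to one of `tree_width` buckets by hashing the categorical key. The sketch below is a plain-pandas illustration of that idea only; it does not use dask's `shuffle_group`, and the names and data are made up.

import pandas as pd
from pandas.util import hash_pandas_object

gb = pd.DataFrame({"cat": ["a", "b", "c", "d"], "cat_count": [4, 1, 3, 2]})
nsplits = 2  # stands in for tree_width[cat_col_group_str]

bucket = hash_pandas_object(gb["cat"], index=False) % nsplits
splits = {i: gb[bucket == i].reset_index(drop=True) for i in range(nsplits)}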
Code Example #15
File: shuffle.py Project: Ethyling/dask-cuda
def sort_in_parts(
    in_parts: List[Dict[int, DataFrame]],
    rank_to_out_part_ids: Dict[int, List[int]],
    ignore_index: bool,
    concat_dfs_of_same_output_partition: bool,
) -> Dict[int, List[List[DataFrame]]]:
    """ Sort the list of grouped dataframes in `in_parts`

    It returns a dict that for each worker-rank specifies the output partitions:
    '''
        for each worker:
            for each output partition:
                list of dataframes that make up an output partition
    '''
    If `concat_dfs_of_same_output_partition` is True, all the dataframes of an
    output partition are concatenated.

    Parameters
    ----------
    in_parts: list of dict of dataframes
        List of dataframe groups that need to be shuffled.
    rank_to_out_part_ids: dict
        dict that for each worker rank specifies a list of partition IDs that the
        worker should return. If the worker shouldn't return any partitions,
        it is excluded from the dict.
    ignore_index: bool
        Ignore index during shuffle.  If ``True``, performance may improve,
        but index values will not be preserved.
    concat_dfs_of_same_output_partition: bool
        Concatenate all dataframes of the same output partition.

    Returns
    -------
    rank_to_out_parts_list: dict of list of list of DataFrames
        Dict that maps each worker rank to its output partitions.
    """

    out_part_id_to_dataframes = defaultdict(
        list)  # part_id -> list of dataframes
    for bins in in_parts:
        for k, v in bins.items():
            out_part_id_to_dataframes[k].append(v)
        del bins

    # Create mapping: rank -> list of [list of dataframes]
    rank_to_out_parts_list: Dict[int, List[List[DataFrame]]] = {}
    for rank, part_ids in rank_to_out_part_ids.items():
        rank_to_out_parts_list[rank] = [
            out_part_id_to_dataframes[i] for i in part_ids
        ]
    del out_part_id_to_dataframes

    # Concatenate all dataframes of the same output partition.
    if concat_dfs_of_same_output_partition:
        for rank in rank_to_out_part_ids.keys():
            for i in range(len(rank_to_out_parts_list[rank])):
                if len(rank_to_out_parts_list[rank][i]) > 1:
                    rank_to_out_parts_list[rank][i] = [
                        _concat(rank_to_out_parts_list[rank][i],
                                ignore_index=ignore_index)
                    ]
    return rank_to_out_parts_list
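
A toy usage of `sort_in_parts` (assuming the function and `_concat` above are importable; the ranks, partition IDs, and frames are made-up values): two input groups are regrouped by output partition, and because `concat_dfs_of_same_output_partition=True` each output partition ends up as a single-element list holding one concatenated frame.

import pandas as pd

in_parts = [
    {0: pd.DataFrame({"x": [1]}), 1: pd.DataFrame({"x": [2]})},
    {0: pd.DataFrame({"x": [3]})},
]
rank_to_out_part_ids = {0: [0], 1: [1]}  # rank 0 owns partition 0, rank 1 owns partition 1

out = sort_in_parts(
    in_parts,
    rank_to_out_part_ids,
    ignore_index=True,
    concat_dfs_of_same_output_partition=True,
)
# out[0][0] is a one-element list containing the concatenation of both partition-0 frames.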
Code Example #16
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables must
        be the same length. This length determines the number of partitions
        in the output collection (only one element of each iterable will
        be passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For the string 'sorted', the delayed values will be computed to find the
        index values; this assumes the indexes are mutually sorted.
        If None, index information will not be used.
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition,
        and will raise an error if this doesn't work or types don't match.
    **kwargs:
        Keyword arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                        name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function
    to any number of iterable objects, it can be a very convenient
    means of implementing functionality that may be missing
    from other DataFrame-creation methods. For example, if you
    happen to have a priori knowledge about the number of rows
    in each of the files in a dataset, you can generate a
    DataFrame collection with a global RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset+len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """

    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True")
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(func, meta, inputs, args, divisions,
                              enforce_metadata, **kwargs)
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = func.columns if isinstance(
        func, DataFrameIOFunction) else None

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing the "
                "(name, dtype) of the columns.")
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
Code Example #17
def test_concat():
    x = _concat(
        [pd.DataFrame(columns=['a', 'b']),
         pd.DataFrame(columns=['a', 'b'])])
    assert list(x.columns) == ['a', 'b']
    assert len(x) == 0
Code Example #18
File: merge.py Project: Ethyling/dask-cuda
async def exchange_and_concat_bins(rank, eps, bins):
    ret = [bins[rank]]
    await asyncio.gather(recv_bins(eps, ret), send_bins(eps, bins))
    return _concat([df for df in ret if df is not None])
Code Example #19
def _mid_level_groupby(dfs, col_group, freq_limit_val, options: FitOptions):
    if isinstance(col_group, str):
        col_group = [col_group]
    elif isinstance(col_group, tuple):
        col_group = list(col_group)

    if options.concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=options.name_sep)]

    if options.on_host:
        # Construct gpu DataFrame from pyarrow data.
        # `on_host=True` implies gpu-backed data.
        df = pa.concat_tables(dfs, promote=True)
        df = _from_host(df)
    else:
        df = _concat(dfs, ignore_index=True)
    groups = df.groupby(col_group, dropna=False)
    gb = groups.agg({
        col: _get_aggregation_type(col)
        for col in df.columns if col not in col_group
    })
    gb.reset_index(drop=False, inplace=True)

    name_count = _make_name(*(col_group + ["count"]), sep=options.name_sep)
    if options.freq_limit and not options.max_size:
        gb = gb[gb[name_count] >= freq_limit_val]

    required = col_group.copy()
    if "count" in options.agg_list:
        required.append(name_count)

    ddof = 1
    for cont_col in options.agg_cols:
        name_sum = _make_name(*(col_group + [cont_col, "sum"]),
                              sep=options.name_sep)
        if "sum" in options.agg_list:
            required.append(name_sum)

        if "mean" in options.agg_list:
            name_mean = _make_name(*(col_group + [cont_col, "mean"]),
                                   sep=options.name_sep)
            required.append(name_mean)
            gb[name_mean] = gb[name_sum] / gb[name_count]

        if "min" in options.agg_list:
            name_min = _make_name(*(col_group + [cont_col, "min"]),
                                  sep=options.name_sep)
            required.append(name_min)

        if "max" in options.agg_list:
            name_max = _make_name(*(col_group + [cont_col, "max"]),
                                  sep=options.name_sep)
            required.append(name_max)

        if "var" in options.agg_list or "std" in options.agg_list:
            n = gb[name_count]
            x = gb[name_sum]
            x2 = gb[_make_name(*(col_group + [cont_col, "pow2", "sum"]),
                               sep=options.name_sep)]
            result = x2 - x**2 / n
            div = n - ddof
            div[div < 1] = 1
            result /= div
            result[(n - ddof) == 0] = np.nan

            if "var" in options.agg_list:
                name_var = _make_name(*(col_group + [cont_col, "var"]),
                                      sep=options.name_sep)
                required.append(name_var)
                gb[name_var] = result
            if "std" in options.agg_list:
                name_std = _make_name(*(col_group + [cont_col, "std"]),
                                      sep=options.name_sep)
                required.append(name_std)
                gb[name_std] = np.sqrt(result)

    if options.on_host:
        gb_pd = gb[required].to_arrow(preserve_index=False)
        del gb
        return gb_pd
    return gb[required]
Code Example #20
File: test_dataframe.py Project: rla3rd/dask
def test_concat():
    x = _concat([pd.DataFrame(columns=["a", "b"]), pd.DataFrame(columns=["a", "b"])])
    assert list(x.columns) == ["a", "b"]
    assert len(x) == 0
Code Example #21
File: shuffle.py Project: Ethyling/dask-cuda
async def local_shuffle(
    s,
    workers: Set[int],
    in_nparts: Dict[int, int],
    in_parts: List[Dict[int, DataFrame]],
    rank_to_out_part_ids: Dict[int, List[int]],
    ignore_index: bool,
) -> List[DataFrame]:
    """Local shuffle operation of the already grouped/partitioned dataframes

    This function is running on each worker participating in the shuffle.

    Parameters
    ----------
    s: dict
        Worker session state
    workers: set
        Set of ranks of all the participants
    in_nparts: dict
        dict that for each worker rank specifies the
        number of partitions that worker has of the input dataframe.
        If the worker doesn't have any partitions, it is excluded from the dict.
    in_parts: list of dict of dataframes
        List of dataframe groups that need to be shuffled.
    rank_to_out_part_ids: dict
        dict that for each worker rank specifies a list of partition IDs that the
        worker should return. If the worker shouldn't return any partitions,
        it is excluded from the dict.
    ignore_index: bool
        Ignore index during shuffle.  If ``True``, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    partitions: list of DataFrames
        List of dataframe-partitions
    """
    myrank = s["rank"]
    eps = s["eps"]
    assert s["rank"] in workers

    rank_to_out_parts_list = sort_in_parts(
        in_parts,
        rank_to_out_part_ids,
        ignore_index,
        concat_dfs_of_same_output_partition=True,
    )

    # Communicate all the dataframe-partitions all-to-all. The result is
    # `out_parts_list` that for each worker and for each output partition
    # contains a list of dataframes received.
    out_parts_list: List[List[List[DataFrame]]] = []
    futures = []
    if myrank in rank_to_out_parts_list:
        futures.append(recv(eps, in_nparts, out_parts_list))
    if myrank in in_nparts:
        futures.append(send(eps, rank_to_out_parts_list))
    await asyncio.gather(*futures)

    # At this point `send()` should have popped all output partitions
    # besides the partitions owned by `myrank`.
    assert len(rank_to_out_parts_list) == 1

    # Concatenate the received dataframes into the final output partitions
    ret = []
    for i in range(len(rank_to_out_part_ids[myrank])):
        dfs = []
        for out_parts in out_parts_list:
            dfs.extend(out_parts[i])
            out_parts[i] = None
        dfs.extend(rank_to_out_parts_list[myrank][i])
        rank_to_out_parts_list[myrank][i] = None
        if len(dfs) > 1:
            ret.append(_concat(dfs, ignore_index=ignore_index))
        else:
            ret.append(dfs[0])
    return ret