def _write_uniques(dfs, base_path, col_group, options):
    if options.concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=options.name_sep)]
    if isinstance(col_group, str):
        col_group = [col_group]

    if options.on_host:
        # Construct gpu DataFrame from pyarrow data.
        # `on_host=True` implies gpu-backed data.
        df = pa.concat_tables(dfs, promote=True)
        df = _from_host(df)
    else:
        df = _concat(dfs, ignore_index=True)

    rel_path = "unique.%s.parquet" % (_make_name(*col_group, sep=options.name_sep))
    path = "/".join([base_path, rel_path])
    if len(df):
        # Make sure first category is Null
        df = df.sort_values(col_group, na_position="first")
        new_cols = {}
        nulls_missing = False
        for col in col_group:
            name_count = col + "_count"
            if options.max_size:
                max_emb_size = options.max_size
                if isinstance(options.max_size, dict):
                    max_emb_size = max_emb_size[col]
                if options.num_buckets:
                    if isinstance(options.num_buckets, int):
                        nlargest = max_emb_size - options.num_buckets - 1
                    else:
                        nlargest = max_emb_size - options.num_buckets[col] - 1
                else:
                    nlargest = max_emb_size - 1
                if nlargest <= 0:
                    raise ValueError("`nlargest` cannot be 0 or negative")
                if nlargest < len(df):
                    df = df.nlargest(n=nlargest, columns=name_count)

            if not _series_has_nulls(df[col]):
                nulls_missing = True
                new_cols[col] = _concat(
                    [df._constructor_sliced([None], dtype=df[col].dtype), df[col]],
                    ignore_index=True,
                )
            else:
                new_cols[col] = df[col].copy(deep=False)
        if nulls_missing:
            df = type(df)(new_cols)
        df.to_parquet(path, index=False, compression=None)
    else:
        df_null = type(df)({c: [None] for c in col_group})
        for c in col_group:
            df_null[c] = df_null[c].astype(df[c].dtype)
        df_null.to_parquet(path, index=False, compression=None)
    del df
    return path
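# The null-prepend step above guarantees that the unique-value file
# reserves index 0 for nulls. A minimal, hypothetical sketch of the same
# idea in plain pandas (the real code uses backend-agnostic helpers such
# as `_series_has_nulls` and `_concat`):
import pandas as pd

uniques = pd.Series(["apple", "banana"], dtype="object")
if not uniques.isna().any():
    # Insert a null at position 0, mirroring the `nulls_missing` handling
    uniques = pd.concat(
        [pd.Series([None], dtype=uniques.dtype), uniques], ignore_index=True
    )
# uniques is now [None, "apple", "banana"]; nulls encode to index 0.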
def _write_uniques(dfs, base_path, col_group, on_host, concat_groups, name_sep):
    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]
    ignore_index = True
    if isinstance(col_group, str):
        col_group = [col_group]
    df = _concat(dfs, ignore_index)
    if on_host:
        df.reset_index(drop=True, inplace=True)
        df = cudf.from_pandas(df)
    rel_path = "unique.%s.parquet" % (_make_name(*col_group, sep=name_sep))
    path = "/".join([base_path, rel_path])
    if len(df):
        # Make sure first category is Null
        df = df.sort_values(col_group, na_position="first")
        new_cols = {}
        nulls_missing = False
        for col in col_group:
            if not df[col]._column.has_nulls:
                nulls_missing = True
                new_cols[col] = _concat(
                    [cudf.Series([None], dtype=df[col].dtype), df[col]], ignore_index
                )
            else:
                new_cols[col] = df[col].copy(deep=False)
        if nulls_missing:
            df = cudf.DataFrame(new_cols)
        df.to_parquet(path, write_index=False, compression=None)
    else:
        df_null = cudf.DataFrame({c: [None] for c in col_group})
        for c in col_group:
            df_null[c] = df_null[c].astype(df[c].dtype)
        df_null.to_parquet(path, write_index=False, compression=None)
    del df
    return path
def test_concat_unions_categoricals():
    # Categorical DataFrame, regular index
    tm.assert_frame_equal(_concat(frames), pd.concat(frames2))

    # Categorical Series, regular index
    tm.assert_series_equal(
        _concat([i.y for i in frames]), pd.concat([i.y for i in frames2])
    )

    # Categorical Index
    tm.assert_index_equal(
        _concat([i.index for i in frames3]), pd.concat([i for i in frames4]).index
    )

    # Categorical DataFrame, Categorical Index
    tm.assert_frame_equal(_concat(frames3), pd.concat(frames4))

    # Non-categorical DataFrame, Categorical Index
    tm.assert_frame_equal(
        _concat([i[['x', 'z']] for i in frames3]),
        pd.concat([i[['x', 'z']] for i in frames4]),
    )

    # Categorical Series, Categorical Index
    tm.assert_series_equal(
        _concat([i.z for i in frames3]), pd.concat([i.z for i in frames4])
    )

    # Non-categorical Series, Categorical Index
    tm.assert_series_equal(
        _concat([i.x for i in frames3]), pd.concat([i.x for i in frames4])
    )

    # MultiIndex with Categorical Index
    tm.assert_index_equal(
        _concat([i.index for i in frames5]), pd.concat([i for i in frames6]).index
    )

    # DataFrame, MultiIndex with CategoricalIndex
    tm.assert_frame_equal(_concat(frames5), pd.concat(frames6))
def _mid_level_groupby(dfs, col, cont_cols, agg_list, freq_limit, on_host):
    ignore_index = True
    if on_host:
        gb = cudf.from_pandas(_concat(dfs, ignore_index)).groupby(col, dropna=False).sum()
    else:
        gb = _concat(dfs, ignore_index).groupby(col, dropna=False).sum()
    gb.reset_index(drop=False, inplace=True)

    name_count = _make_name(col, "count")
    if freq_limit:
        gb = gb[gb[name_count] >= freq_limit]

    required = [col]
    if "count" in agg_list:
        required.append(name_count)
    ddof = 1
    for cont_col in cont_cols:
        name_sum = _make_name(col, cont_col, "sum")
        if "sum" in agg_list:
            required.append(name_sum)
        if "mean" in agg_list:
            name_mean = _make_name(col, cont_col, "mean")
            required.append(name_mean)
            gb[name_mean] = gb[name_sum] / gb[name_count]
        if "var" in agg_list or "std" in agg_list:
            n = gb[name_count]
            x = gb[name_sum]
            x2 = gb[_make_name(col, cont_col, "pow2", "sum")]
            result = x2 - x**2 / n
            div = n - ddof
            div[div < 1] = 1
            result /= div
            result[(n - ddof) == 0] = np.nan
            if "var" in agg_list:
                name_var = _make_name(col, cont_col, "var")
                required.append(name_var)
                gb[name_var] = result
            if "std" in agg_list:
                name_std = _make_name(col, cont_col, "std")
                required.append(name_std)
                gb[name_std] = np.sqrt(result)

    if on_host:
        gb_pd = gb[required].to_pandas()
        del gb
        return gb_pd
    return gb[required]
def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
    """Node in groupby-aggregation reduction tree.

    Following the initial `_groupby_partition_agg` tasks, the
    `groupby_agg` algorithm will perform a tree reduction to combine
    the data from the input partitions into `split_out` different
    output partitions. For each node in the reduction tree, the input
    DataFrame objects are concatenated, and "sum", "min", and/or "max"
    groupby aggregations are used to combine the necessary statistics.
    """
    df = _concat(dfs, ignore_index=True)
    agg_dict = {}
    for col in df.columns:
        if col in gb_cols:
            continue
        agg = col.split(sep)[-1]
        if agg in ("count", "sum"):
            agg_dict[col] = ["sum"]
        elif agg in ("min", "max"):
            agg_dict[col] = [agg]
        else:
            raise ValueError(f"Unexpected aggregation: {agg}")

    gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(agg_dict)

    # Don't include the last aggregation in the column names
    gb.columns = [_make_name(*name[:-1], sep=sep) for name in gb.columns]
    return gb
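# A toy illustration of the combine rule described in the docstring
# above, assuming pandas and a hypothetical "__" name separator:
# partial "count"/"sum" statistics are combined with "sum", while
# "min"/"max" columns are combined with the same aggregation again.
import pandas as pd

part1 = pd.DataFrame({"k": [0, 1], "x__sum": [3.0, 5.0], "x__min": [1.0, 2.0]})
part2 = pd.DataFrame({"k": [0, 1], "x__sum": [4.0, 1.0], "x__min": [0.5, 9.0]})
df = pd.concat([part1, part2], ignore_index=True)
combined = df.groupby("k", as_index=False).agg({"x__sum": "sum", "x__min": "min"})
# k=0 -> x__sum=7.0, x__min=0.5 ; k=1 -> x__sum=6.0, x__min=2.0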
def _cat_level_2(dfs, col, freq_limit, on_host):
    ignore_index = True
    if on_host:
        # Pandas groupby does not have `dropna` arg
        gb = cudf.from_pandas(_concat(dfs, ignore_index)).groupby(col, dropna=False).sum()
    else:
        gb = _concat(dfs, ignore_index).groupby(col, dropna=False).sum()
    gb.reset_index(drop=False, inplace=True)
    if freq_limit:
        gb = gb[gb["count"] >= freq_limit]
    if on_host:
        gb_pd = gb[[col]].to_pandas()
        del gb
        return gb_pd
    return gb[[col]]
def _tree_node_moments(inputs):
    out = {}
    for val in ["df-count", "df-sum", "df2-sum"]:
        df_list = [x.get(val, None) for x in inputs]
        df_list = [df for df in df_list if df is not None]
        out[val] = _concat(df_list, ignore_index=True).sum().to_frame().transpose()
    return out
def df_concat(df_parts):
    """Return `df_parts` as a single dataframe, or None if empty"""
    if len(df_parts) == 0:
        return None
    elif len(df_parts) == 1:
        return df_parts[0]
    else:
        return _concat(df_parts)
def _cat_level_3(dfs, base_path, col, on_host):
    ignore_index = True
    df = _concat(dfs, ignore_index)
    if on_host:
        df = cudf.from_pandas(df)
    rel_path = "unique.%s.parquet" % (col)
    path = "/".join([base_path, rel_path])
    if len(df):
        # Make sure first category is Null
        df = df.sort_values(col, na_position="first")
        if not df[col]._column.has_nulls:
            df = cudf.DataFrame(
                {col: _concat([cudf.Series([None]), df[col]], ignore_index)}
            )
        df.to_parquet(path, write_index=False, compression=None)
    else:
        df_null = cudf.DataFrame({col: [None]})
        df_null[col] = df_null[col].astype(df[col].dtype)
        df_null.to_parquet(path, write_index=False, compression=None)
    del df
    return path
def _write_gb_stats(dfs, base_path, col, on_host):
    ignore_index = True
    df = _concat(dfs, ignore_index)
    if on_host:
        df = cudf.from_pandas(df)
    rel_path = "cat_stats.%s.parquet" % (col)
    path = os.path.join(base_path, rel_path)
    if len(df):
        df = df.sort_values(col, na_position="first")
        df.to_parquet(path, write_index=False, compression=None)
    else:
        df_null = cudf.DataFrame({col: [None]})
        df_null[col] = df_null[col].astype(df[col].dtype)
        df_null.to_parquet(path, write_index=False, compression=None)
    del df
    return path
def test_concat():
    x = _concat([pd.DataFrame(columns=['a', 'b']), pd.DataFrame(columns=['a', 'b'])])
    assert list(x.columns) == ['a', 'b']
    assert len(x) == 0
def _mid_level_groupby(
    dfs, col_group, cont_cols, agg_list, freq_limit, on_host, concat_groups, name_sep
):
    if isinstance(col_group, str):
        col_group = [col_group]
    if concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=name_sep)]

    if on_host:
        df = pa.concat_tables(dfs, promote=True)
        df = cudf.DataFrame.from_arrow(df)
    else:
        df = _concat(dfs, ignore_index=True)
    groups = df.groupby(col_group, dropna=False)
    gb = groups.agg(
        {col: _get_aggregation_type(col) for col in df.columns if col not in col_group}
    )
    gb.reset_index(drop=False, inplace=True)

    name_count = _make_name(*(col_group + ["count"]), sep=name_sep)
    if freq_limit:
        gb = gb[gb[name_count] >= freq_limit]

    required = col_group.copy()
    if "count" in agg_list:
        required.append(name_count)
    ddof = 1
    for cont_col in cont_cols:
        name_sum = _make_name(*(col_group + [cont_col, "sum"]), sep=name_sep)
        if "sum" in agg_list:
            required.append(name_sum)
        if "mean" in agg_list:
            name_mean = _make_name(*(col_group + [cont_col, "mean"]), sep=name_sep)
            required.append(name_mean)
            gb[name_mean] = gb[name_sum] / gb[name_count]
        if "min" in agg_list:
            name_min = _make_name(*(col_group + [cont_col, "min"]), sep=name_sep)
            required.append(name_min)
        if "max" in agg_list:
            name_max = _make_name(*(col_group + [cont_col, "max"]), sep=name_sep)
            required.append(name_max)
        if "var" in agg_list or "std" in agg_list:
            n = gb[name_count]
            x = gb[name_sum]
            x2 = gb[_make_name(*(col_group + [cont_col, "pow2", "sum"]), sep=name_sep)]
            result = x2 - x ** 2 / n
            div = n - ddof
            div[div < 1] = 1
            result /= div
            result[(n - ddof) == 0] = np.nan
            if "var" in agg_list:
                name_var = _make_name(*(col_group + [cont_col, "var"]), sep=name_sep)
                required.append(name_var)
                gb[name_var] = result
            if "std" in agg_list:
                name_std = _make_name(*(col_group + [cont_col, "std"]), sep=name_sep)
                required.append(name_std)
                gb[name_std] = np.sqrt(result)

    if on_host:
        gb_pd = gb[required].to_arrow(preserve_index=False)
        del gb
        return gb_pd
    return gb[required]
def _top_level_groupby(
    gdf, cat_col_groups, tree_width, cont_cols, agg_list, on_host, concat_groups, name_sep
):
    sum_sq = "std" in agg_list or "var" in agg_list
    calculate_min = "min" in agg_list
    calculate_max = "max" in agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(cat_col_groups):
        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=name_sep)

        if concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = cudf.DataFrame()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat(
                [gdf[col] for col in cat_col_group], ignore_index
            )
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `cont_cols` is non-empty)
            df_gb = gdf[cat_col_group + cont_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in cont_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]
            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = cudf.DataFrame({cat_col_group[0]: df_gb[cat_col_group[0]].list.leaves})
        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]), sep=name_sep)
            if name[0] == cat_col_group[0]
            else _make_name(*(tuple(cat_col_group) + name), sep=name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        for j, split in enumerate(
            gb.partition_by_hash(cat_col_group, tree_width[cat_col_group_str], keep_index=False)
        ):
            if on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
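# The `partition_by_hash` split above sends all rows with the same key
# to the same output partition, so each key is aggregated exactly once
# downstream. A rough pandas approximation of that split (the real call
# is a cudf DataFrame method; `nsplits` here is hypothetical):
import pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a", "c"], "count": [1, 1, 1, 1]})
nsplits = 2
buckets = pd.util.hash_pandas_object(df["key"], index=False) % nsplits
splits = {int(i): part.reset_index(drop=True) for i, part in df.groupby(buckets)}
# Both "a" rows land in the same split, whichever bucket that is.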
def _top_level_groupby(df, options: FitOptions):
    sum_sq = "std" in options.agg_list or "var" in options.agg_list
    calculate_min = "min" in options.agg_list
    calculate_max = "max" in options.agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(options.col_groups):
        if isinstance(cat_col_group, tuple):
            cat_col_group = list(cat_col_group)
        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=options.name_sep)

        if options.concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = type(df)()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat([df[col] for col in cat_col_group], ignore_index)
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `agg_cols` is non-empty)
            df_gb = df[cat_col_group + options.agg_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in options.agg_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=options.name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]
            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf/pd support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = _flatten_list_column(df_gb[cat_col_group[0]])

        # NOTE: groupby(..., dropna=False) requires pandas>=1.1.0
        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]), sep=options.name_sep)
            if name[0] == cat_col_group[0]
            else _make_name(*(tuple(cat_col_group) + name), sep=options.name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        nsplits = options.tree_width[cat_col_group_str]
        for j, split in shuffle_group(gb, cat_col_group, 0, nsplits, nsplits, True, nsplits).items():
            if options.on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
def sort_in_parts(
    in_parts: List[Dict[int, DataFrame]],
    rank_to_out_part_ids: Dict[int, List[int]],
    ignore_index: bool,
    concat_dfs_of_same_output_partition: bool,
) -> Dict[int, List[List[DataFrame]]]:
    """Sort the list of grouped dataframes in `in_parts`

    Returns a dict that for each worker rank specifies the output partitions:
    '''
        for each worker:
            for each output partition:
                list of dataframes that make up an output partition
    '''
    If `concat_dfs_of_same_output_partition` is True, all the dataframes
    of an output partition are concatenated.

    Parameters
    ----------
    in_parts: list of dict of dataframes
        List of dataframe groups that need to be shuffled.
    rank_to_out_part_ids: dict
        dict that for each worker rank specifies a list of partition IDs
        that worker should return. If the worker shouldn't return any
        partitions, it is excluded from the dict.
    ignore_index: bool
        Ignore index during shuffle. If ``True``, performance may improve,
        but index values will not be preserved.
    concat_dfs_of_same_output_partition: bool
        Concatenate all dataframes of the same output partition.

    Returns
    -------
    rank_to_out_parts_list: dict of list of list of DataFrames
        Dict that maps each worker rank to its output partitions.
    """
    out_part_id_to_dataframes = defaultdict(list)  # part_id -> list of dataframes
    for bins in in_parts:
        for k, v in bins.items():
            out_part_id_to_dataframes[k].append(v)
        del bins

    # Create mapping: rank -> list of [list of dataframes]
    rank_to_out_parts_list: Dict[int, List[List[DataFrame]]] = {}
    for rank, part_ids in rank_to_out_part_ids.items():
        rank_to_out_parts_list[rank] = [out_part_id_to_dataframes[i] for i in part_ids]
    del out_part_id_to_dataframes

    # Concatenate all dataframes of the same output partition.
    if concat_dfs_of_same_output_partition:
        for rank in rank_to_out_part_ids.keys():
            for i in range(len(rank_to_out_parts_list[rank])):
                if len(rank_to_out_parts_list[rank][i]) > 1:
                    rank_to_out_parts_list[rank][i] = [
                        _concat(
                            rank_to_out_parts_list[rank][i],
                            ignore_index=ignore_index,
                        )
                    ]
    return rank_to_out_parts_list
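# A small worked example of the `sort_in_parts` data flow (assuming
# pandas frames and the function defined above): partition ids 0-1 are
# owned by rank 0 and id 2 by rank 1, and frames belonging to the same
# output partition are concatenated when requested.
import pandas as pd

in_parts = [
    {0: pd.DataFrame({"x": [1]}), 2: pd.DataFrame({"x": [2]})},
    {0: pd.DataFrame({"x": [3]})},
]
rank_to_out_part_ids = {0: [0, 1], 1: [2]}
out = sort_in_parts(
    in_parts,
    rank_to_out_part_ids,
    ignore_index=True,
    concat_dfs_of_same_output_partition=True,
)
# out[0] -> [[concat of both part-0 frames], []]  (partition 1 is empty)
# out[1] -> [[the single part-2 frame]]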
def from_map(
    func,
    *iterables,
    args=None,
    meta=None,
    divisions=None,
    label=None,
    token=None,
    enforce_metadata=True,
    **kwargs,
):
    """Create a DataFrame collection from a custom function map

    WARNING: The ``from_map`` API is experimental, and stability is not
    yet guaranteed. Use at your own risk!

    Parameters
    ----------
    func : callable
        Function used to create each partition. If ``func`` satisfies the
        ``DataFrameIOFunction`` protocol, column projection will be enabled.
    *iterables : Iterable objects
        Iterable objects to map to each output partition. All iterables must
        be the same length. This length determines the number of partitions
        in the output collection (only one element of each iterable will
        be passed to ``func`` for each partition).
    args : list or tuple, optional
        Positional arguments to broadcast to each output partition. Note
        that these arguments will always be passed to ``func`` after the
        ``iterables`` positional arguments.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see https://docs.dask.org/en/latest/dataframe-design.html#partitions
        For string 'sorted' will compute the delayed values to find index
        values. Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    label : str, optional
        String to use as the function-name label in the output
        collection-key names.
    token : str, optional
        String to use as the "token" in the output collection-key names.
    enforce_metadata : bool, default True
        Whether to enforce at runtime that the structure of the DataFrame
        produced by ``func`` actually matches the structure of ``meta``.
        This will rename and reorder columns for each partition, and will
        raise an error if this doesn't work or types don't match.
    **kwargs:
        Key-word arguments to broadcast to each output partition. These
        same arguments will be passed to ``func`` for every output partition.

    Examples
    --------
    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> func = lambda x, size=0: pd.Series([x] * size)
    >>> inputs = ["A", "B"]
    >>> dd.from_map(func, inputs, size=2).compute()
    0    A
    1    A
    0    B
    1    B
    dtype: object

    This API can also be used as an alternative to other file-based
    IO functions, like ``read_parquet`` (which are already just
    ``from_map`` wrapper functions):

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> dd.from_map(pd.read_parquet, paths).head()  # doctest: +SKIP
                           name
    timestamp
    2000-01-01 00:00:00   Laura
    2000-01-01 00:00:01  Oliver
    2000-01-01 00:00:02   Alice
    2000-01-01 00:00:03  Victor
    2000-01-01 00:00:04     Bob

    Since ``from_map`` allows you to map an arbitrary function to any
    number of iterable objects, it can be a very convenient means of
    implementing functionality that may be missing from other
    DataFrame-creation methods. For example, if you happen to have
    a priori knowledge about the number of rows in each of the files
    in a dataset, you can generate a DataFrame collection with a
    global RangeIndex:

    >>> import pandas as pd
    >>> import numpy as np
    >>> import dask.dataframe as dd
    >>> paths = ["0.parquet", "1.parquet", "2.parquet"]
    >>> file_sizes = [86400, 86400, 86400]
    >>> def func(path, row_offset):
    ...     # Read parquet file and set RangeIndex offset
    ...     df = pd.read_parquet(path)
    ...     return df.set_index(
    ...         pd.RangeIndex(row_offset, row_offset + len(df))
    ...     )
    >>> def get_ddf(paths, file_sizes):
    ...     offsets = [0] + list(np.cumsum(file_sizes))
    ...     return dd.from_map(
    ...         func, paths, offsets[:-1], divisions=offsets
    ...     )
    >>> ddf = get_ddf(paths, file_sizes)  # doctest: +SKIP
    >>> ddf.index  # doctest: +SKIP
    Dask Index Structure:
    npartitions=3
    0         int64
    86400       ...
    172800      ...
    259200      ...
    dtype: int64
    Dask Name: myfunc, 6 tasks

    See Also
    --------
    dask.dataframe.from_delayed
    dask.layers.DataFrameIOLayer
    """
    # Input validation
    if not callable(func):
        raise ValueError("`func` argument must be `callable`")
    lengths = set()
    iterables = list(iterables)
    for i, iterable in enumerate(iterables):
        if not isinstance(iterable, Iterable):
            raise ValueError(
                f"All elements of `iterables` must be Iterable, got {type(iterable)}"
            )
        try:
            lengths.add(len(iterable))
        except (AttributeError, TypeError):
            iterables[i] = list(iterable)
            lengths.add(len(iterables[i]))
    if len(lengths) == 0:
        raise ValueError("`from_map` requires at least one Iterable input")
    elif len(lengths) > 1:
        raise ValueError("All `iterables` must have the same length")
    if lengths == {0}:
        raise ValueError("All `iterables` must have a non-zero length")

    # Check for `produces_tasks` and `creation_info`.
    # These options are included in the function signature,
    # because they are not intended for "public" use.
    produces_tasks = kwargs.pop("produces_tasks", False)
    creation_info = kwargs.pop("creation_info", None)

    if produces_tasks or len(iterables) == 1:
        if len(iterables) > 1:
            # Tasks are not detected correctly when they are "packed"
            # within an outer list/tuple
            raise ValueError(
                "Multiple iterables not supported when produces_tasks=True"
            )
        inputs = iterables[0]
        packed = False
    else:
        inputs = list(zip(*iterables))
        packed = True

    # Define collection name
    label = label or funcname(func)
    token = token or tokenize(func, meta, inputs, args, divisions, enforce_metadata, **kwargs)
    name = f"{label}-{token}"

    # Get "projectable" column selection.
    # Note that this relies on the IO function
    # ducktyping with DataFrameIOFunction
    column_projection = func.columns if isinstance(func, DataFrameIOFunction) else None

    # NOTE: Most of the metadata-handling logic used here
    # is copied directly from `map_partitions`
    if meta is None:
        meta = _emulate(
            func,
            *(inputs[0] if packed else inputs[:1]),
            *(args or []),
            udf=True,
            **kwargs,
        )
        meta_is_emulated = True
    else:
        meta = make_meta(meta)
        meta_is_emulated = False

    if not (has_parallel_type(meta) or is_arraylike(meta) and meta.shape):
        if not meta_is_emulated:
            raise TypeError(
                "Meta is not valid, `from_map` expects output to be a pandas object. "
                "Try passing a pandas object as meta or a dict or tuple representing the "
                "(name, dtype) of the columns."
            )
        # If `meta` is not a pandas object, the concatenated results will be a
        # different type
        meta = make_meta(_concat([meta]))

    # Ensure meta is empty DataFrame
    meta = make_meta(meta)

    # Define io_func
    if packed or args or kwargs or enforce_metadata:
        io_func = _PackedArgCallable(
            func,
            args=args,
            kwargs=kwargs,
            meta=meta if enforce_metadata else None,
            enforce_metadata=enforce_metadata,
            packed=packed,
        )
    else:
        io_func = func

    # Construct DataFrameIOLayer
    layer = DataFrameIOLayer(
        name,
        column_projection,
        inputs,
        io_func,
        label=label,
        produces_tasks=produces_tasks,
        creation_info=creation_info,
    )

    # Return new DataFrame-collection object
    divisions = divisions or [None] * (len(inputs) + 1)
    graph = HighLevelGraph.from_collections(name, layer, dependencies=[])
    return new_dd_object(graph, name, meta, divisions)
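# Column projection in `from_map` relies on duck-typing against the
# `DataFrameIOFunction` protocol referenced above. A hedged sketch of a
# conforming reader; the import path and method set reflect recent dask
# versions and may differ in yours:
import pandas as pd
from dask.dataframe.io.utils import DataFrameIOFunction

class ParquetReader(DataFrameIOFunction):
    """Hypothetical projectable IO function for `from_map`."""

    def __init__(self, columns=None):
        self._columns = columns

    @property
    def columns(self):
        return self._columns

    def project_columns(self, columns):
        # Return a new reader restricted to `columns`
        return ParquetReader(columns=columns)

    def __call__(self, path):
        return pd.read_parquet(path, columns=self.columns)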
async def exchange_and_concat_bins(rank, eps, bins):
    ret = [bins[rank]]
    await asyncio.gather(recv_bins(eps, ret), send_bins(eps, bins))
    return _concat([df for df in ret if df is not None])
def _mid_level_groupby(dfs, col_group, freq_limit_val, options: FitOptions):
    if isinstance(col_group, str):
        col_group = [col_group]
    elif isinstance(col_group, tuple):
        col_group = list(col_group)

    if options.concat_groups and len(col_group) > 1:
        col_group = [_make_name(*col_group, sep=options.name_sep)]

    if options.on_host:
        # Construct gpu DataFrame from pyarrow data.
        # `on_host=True` implies gpu-backed data.
        df = pa.concat_tables(dfs, promote=True)
        df = _from_host(df)
    else:
        df = _concat(dfs, ignore_index=True)

    groups = df.groupby(col_group, dropna=False)
    gb = groups.agg(
        {col: _get_aggregation_type(col) for col in df.columns if col not in col_group}
    )
    gb.reset_index(drop=False, inplace=True)

    name_count = _make_name(*(col_group + ["count"]), sep=options.name_sep)
    if options.freq_limit and not options.max_size:
        gb = gb[gb[name_count] >= freq_limit_val]

    required = col_group.copy()
    if "count" in options.agg_list:
        required.append(name_count)
    ddof = 1
    for cont_col in options.agg_cols:
        name_sum = _make_name(*(col_group + [cont_col, "sum"]), sep=options.name_sep)
        if "sum" in options.agg_list:
            required.append(name_sum)
        if "mean" in options.agg_list:
            name_mean = _make_name(*(col_group + [cont_col, "mean"]), sep=options.name_sep)
            required.append(name_mean)
            gb[name_mean] = gb[name_sum] / gb[name_count]
        if "min" in options.agg_list:
            name_min = _make_name(*(col_group + [cont_col, "min"]), sep=options.name_sep)
            required.append(name_min)
        if "max" in options.agg_list:
            name_max = _make_name(*(col_group + [cont_col, "max"]), sep=options.name_sep)
            required.append(name_max)
        if "var" in options.agg_list or "std" in options.agg_list:
            n = gb[name_count]
            x = gb[name_sum]
            x2 = gb[_make_name(*(col_group + [cont_col, "pow2", "sum"]), sep=options.name_sep)]
            result = x2 - x**2 / n
            div = n - ddof
            div[div < 1] = 1
            result /= div
            result[(n - ddof) == 0] = np.nan
            if "var" in options.agg_list:
                name_var = _make_name(*(col_group + [cont_col, "var"]), sep=options.name_sep)
                required.append(name_var)
                gb[name_var] = result
            if "std" in options.agg_list:
                name_std = _make_name(*(col_group + [cont_col, "std"]), sep=options.name_sep)
                required.append(name_std)
                gb[name_std] = np.sqrt(result)

    if options.on_host:
        gb_pd = gb[required].to_arrow(preserve_index=False)
        del gb
        return gb_pd
    return gb[required]
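# The var/std block above implements the one-pass shortcut
#     var = (sum(x^2) - sum(x)**2 / n) / (n - ddof)
# computed from the per-group "sum" and "pow2, sum" statistics. A quick
# numeric check against pandas (toy data, hypothetical names):
import numpy as np
import pandas as pd

x = pd.Series([1.0, 2.0, 4.0, 7.0])
n, s, s2, ddof = len(x), x.sum(), (x**2).sum(), 1
var = (s2 - s**2 / n) / (n - ddof)
assert np.isclose(var, x.var(ddof=1))  # pandas uses ddof=1 by default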
async def local_shuffle(
    s,
    workers: Set[int],
    in_nparts: Dict[int, int],
    in_parts: List[Dict[int, DataFrame]],
    rank_to_out_part_ids: Dict[int, List[int]],
    ignore_index: bool,
) -> List[DataFrame]:
    """Local shuffle operation of the already grouped/partitioned dataframes

    This function runs on each worker participating in the shuffle.

    Parameters
    ----------
    s: dict
        Worker session state
    workers: set
        Set of ranks of all the participants
    in_nparts: dict
        dict that for each worker rank specifies the number of partitions
        that worker has of the input dataframe. If the worker doesn't have
        any partitions, it is excluded from the dict.
    in_parts: list of dict of dataframes
        List of dataframe groups that need to be shuffled.
    rank_to_out_part_ids: dict
        dict that for each worker rank specifies a list of partition IDs
        that worker should return. If the worker shouldn't return any
        partitions, it is excluded from the dict.
    ignore_index: bool
        Ignore index during shuffle. If ``True``, performance may improve,
        but index values will not be preserved.

    Returns
    -------
    partitions: list of DataFrames
        List of dataframe-partitions
    """
    myrank = s["rank"]
    eps = s["eps"]
    assert s["rank"] in workers

    rank_to_out_parts_list = sort_in_parts(
        in_parts,
        rank_to_out_part_ids,
        ignore_index,
        concat_dfs_of_same_output_partition=True,
    )

    # Communicate all the dataframe-partitions all-to-all. The result is
    # `out_parts_list` that for each worker and for each output partition
    # contains a list of dataframes received.
    out_parts_list: List[List[List[DataFrame]]] = []
    futures = []
    if myrank in rank_to_out_parts_list:
        futures.append(recv(eps, in_nparts, out_parts_list))
    if myrank in in_nparts:
        futures.append(send(eps, rank_to_out_parts_list))
    await asyncio.gather(*futures)

    # At this point `send()` should have popped all output partitions
    # besides the partitions owned by `myrank`.
    assert len(rank_to_out_parts_list) == 1

    # Concatenate the received dataframes into the final output partitions
    ret = []
    for i in range(len(rank_to_out_part_ids[myrank])):
        dfs = []
        for out_parts in out_parts_list:
            dfs.extend(out_parts[i])
            out_parts[i] = None
        dfs.extend(rank_to_out_parts_list[myrank][i])
        rank_to_out_parts_list[myrank][i] = None
        if len(dfs) > 1:
            ret.append(_concat(dfs, ignore_index=ignore_index))
        else:
            ret.append(dfs[0])
    return ret