import pytest
import cupy

from dask.dataframe.shuffle import shuffle_group
from dask_cuda.proxify_host_file import ProxifyHostFile


def test_dataframes_share_dev_mem():
    cudf = pytest.importorskip("cudf")

    df = cudf.DataFrame({"a": range(10)})
    grouped = shuffle_group(df, "a", 0, 2, 2, False, 2)
    view1 = grouped[0]
    view2 = grouped[1]
    # Even though the two dataframes don't point to the same cudf.Buffer object
    assert view1["a"].data is not view2["a"].data
    # They still share the same underlying device memory
    assert view1["a"].data._owner._owner is view2["a"].data._owner._owner

    dhf = ProxifyHostFile(device_memory_limit=160)
    dhf["v1"] = view1
    dhf["v2"] = view2
    v1 = dhf["v1"]
    v2 = dhf["v2"]
    # The device_memory_limit is not exceeded since both dataframes share device memory
    assert not v1._obj_pxy_is_serialized()
    assert not v2._obj_pxy_is_serialized()

    # Now the device_memory_limit is exceeded, which should evict both dataframes
    dhf["k1"] = cupy.arange(1)
    assert v1._obj_pxy_is_serialized()
    assert v2._obj_pxy_is_serialized()
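# A minimal sketch (not part of the test above) of the memory sharing the
# test relies on: slicing a device array yields views that keep a reference
# to the same underlying allocation, which is why a host file that
# deduplicates by base buffer can count view1/view2 only once against
# device_memory_limit. Assumes only that cupy is installed; uses cupy's
# public ndarray.base and MemoryPointer.mem attributes.
def _sketch_shared_device_views():
    base = cupy.arange(10)
    a, b = base[:5], base[5:]
    assert a.base is base and b.base is base  # both are views of `base`
    assert a.data.mem is b.data.mem  # one device allocation behind both views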
def _top_level_groupby(
    df, cat_col_groups, tree_width, cont_cols, agg_list, on_host, concat_groups, name_sep
):
    sum_sq = "std" in agg_list or "var" in agg_list
    calculate_min = "min" in agg_list
    calculate_max = "max" in agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(cat_col_groups):
        if isinstance(cat_col_group, tuple):
            cat_col_group = list(cat_col_group)
        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=name_sep)

        if concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = type(df)()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat([df[col] for col in cat_col_group], ignore_index)
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `cont_cols` is non-empty)
            df_gb = df[cat_col_group + cont_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in cont_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]
            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf/pd support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = _flatten_list_column(df_gb[cat_col_group[0]])

        # NOTE: groupby(..., dropna=False) requires pandas>=1.1.0
        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]), sep=name_sep)
            if name[0] == cat_col_group[0]
            else _make_name(*(tuple(cat_col_group) + name), sep=name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        nsplits = tree_width[cat_col_group_str]
        for j, split in shuffle_group(
            gb, cat_col_group, 0, nsplits, nsplits, True, nsplits
        ).items():
            if on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
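# A self-contained sketch (hypothetical stand-ins, not this module's real
# helpers) of the column-name flattening performed in _top_level_groupby
# above. `_make_name` is assumed to simply join its arguments with the
# separator; only pandas is required.
import pandas as pd


def _make_name_demo(*args, sep="_"):
    return sep.join(args)


def _sketch_flatten_groupby_columns():
    cat_col_group = ["cat"]
    name_sep = "_"
    flat = pd.MultiIndex.from_tuples(
        [("cat", "count"), ("x", "sum"), ("x_pow2", "sum")]
    ).to_flat_index()

    # Same comprehension as in _top_level_groupby: when the first level of a
    # flattened name is the groupby key itself, only the aggregation suffix
    # is appended; otherwise the full (column, agg) tuple is.
    cols = [
        _make_name_demo(*(tuple(cat_col_group) + name[1:]), sep=name_sep)
        if name[0] == cat_col_group[0]
        else _make_name_demo(*(tuple(cat_col_group) + name), sep=name_sep)
        for name in flat
    ]
    print(cols)  # ['cat_count', 'cat_x_sum', 'cat_x_pow2_sum']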