def test_normalize_lists(tmpdir, cpu):
    df = dispatch._make_df(device="cpu" if cpu else "gpu")
    df["vals"] = [
        [0.0, 1.0, 2.0],
        [3.0, 4.0],
        [5.0],
    ]

    features = ["vals"] >> nvt.ops.Normalize()
    workflow = nvt.Workflow(features)
    transformed = workflow.fit_transform(nvt.Dataset(df)).to_ddf().compute()

    expected = _flatten_list_column_values(df["vals"]).astype("float32")
    expected = (expected - expected.mean()) / expected.std()
    expected_df = type(transformed)({"vals": expected})

    assert_eq(expected_df, _flatten_list_column(transformed["vals"]))
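

# Illustrative sketch (not part of the original test module): the assertion
# above checks that Normalize standardizes the *flattened* leaf values of a
# list column. The helper below, with a hypothetical name, reproduces the
# test's `expected` series using pandas alone.
def _example_standardize_leaves(values):
    """Standardize flat leaf values the way the test builds `expected`:
    subtract the mean and divide by the (ddof=1) standard deviation."""
    import pandas as pd

    leaves = pd.Series(values, dtype="float32")
    return (leaves - leaves.mean()) / leaves.std()


# For the rows used above, _example_standardize_leaves([0.0, 1.0, 2.0, 3.0, 4.0, 5.0])
# yields values centered at 0 with unit sample standard deviation.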


def _encode(
    name,
    storage_name,
    path,
    df,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
    max_size=0,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    # this is to apply freq_hashing logic
    if max_size:
        freq_threshold = 1
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, df)
    if path:
        # Load the unique-category ("labels") table, using the worker-level
        # "cats" cache when one is configured.
        read_pq_func = _read_parquet_dispatch(df)
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(df):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache,
                        path,
                        columns=selection_r,
                        cache=cat_cache,
                        cats_only=True,
                        reader=read_pq_func,
                    )
        else:
            value = read_pq_func(path, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    # No category table was loaded; fall back to a single-row null mapping.
    if value is None:
        value = type(df)()
        for c in selection_r:
            typ = df[selection_l[0]].dtype if len(selection_l) == 1 else df[c].dtype
            value[c] = df._constructor_sliced([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = _flatten_list_column(df[selection_l[0]])
            codes["order"] = _arange(len(codes), like_df=df)
        else:
            codes = type(df)({"order": _arange(len(df), like_df=df)}, index=df.index)
            for c in selection_l:
                codes[c] = df[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(df, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(
                df._constructor_sliced(na_sentinel + max_id + 1), inplace=True
            )
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                df[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                df[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(df[selection_l[0]], labels)

    return labels
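

# Illustrative sketch (not part of the module): the "no hashing" branch of
# `_encode` above assigns integer labels by left-merging the input column
# against a unique-category table whose index has been materialized as a
# "labels" column, then filling unmatched/null rows with the sentinel 0. The
# pandas-only helper below mirrors that merge logic; `_example_merge_encode`
# and its arguments are hypothetical names used only for illustration.
def _example_merge_encode(df, col, categories):
    import pandas as pd

    # Build the "labels" mapping the same way `_encode` prepares `value`.
    value = pd.DataFrame({col: categories})
    value.index.name = "labels"
    value.reset_index(drop=False, inplace=True)

    # Merge, restore the original row order, and fill misses with 0.
    codes = pd.DataFrame({"order": range(len(df)), col: df[col].values})
    labels = codes.merge(value, on=col, how="left").sort_values("order")["labels"]
    return labels.fillna(0).astype("int64").values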


def _top_level_groupby(
    df, cat_col_groups, tree_width, cont_cols, agg_list, on_host, concat_groups, name_sep
):
    sum_sq = "std" in agg_list or "var" in agg_list
    calculate_min = "min" in agg_list
    calculate_max = "max" in agg_list

    # Top-level operation for category-based groupby aggregations
    output = {}
    k = 0
    for i, cat_col_group in enumerate(cat_col_groups):
        if isinstance(cat_col_group, tuple):
            cat_col_group = list(cat_col_group)
        if isinstance(cat_col_group, str):
            cat_col_group = [cat_col_group]
        cat_col_group_str = _make_name(*cat_col_group, sep=name_sep)

        if concat_groups and len(cat_col_group) > 1:
            # Concatenate columns and replace cat_col_group
            # with the single name
            df_gb = type(df)()
            ignore_index = True
            df_gb[cat_col_group_str] = _concat([df[col] for col in cat_col_group], ignore_index)
            cat_col_group = [cat_col_group_str]
        else:
            # Compile aggregation dictionary and add "squared-sum"
            # column(s) (necessary when `cont_cols` is non-empty)
            df_gb = df[cat_col_group + cont_cols].copy(deep=False)

        agg_dict = {}
        agg_dict[cat_col_group[0]] = ["count"]
        for col in cont_cols:
            agg_dict[col] = ["sum"]
            if sum_sq:
                name = _make_name(col, "pow2", sep=name_sep)
                df_gb[name] = df_gb[col].pow(2)
                agg_dict[name] = ["sum"]
            if calculate_min:
                agg_dict[col].append("min")
            if calculate_max:
                agg_dict[col].append("max")

        # Perform groupby and flatten column index
        # (flattening provides better cudf/pd support)
        if _is_list_col(cat_col_group, df_gb):
            # handle list columns by encoding the list values
            df_gb = _flatten_list_column(df_gb[cat_col_group[0]])

        # NOTE: groupby(..., dropna=False) requires pandas>=1.1.0
        gb = df_gb.groupby(cat_col_group, dropna=False).agg(agg_dict)
        gb.columns = [
            _make_name(*(tuple(cat_col_group) + name[1:]), sep=name_sep)
            if name[0] == cat_col_group[0]
            else _make_name(*(tuple(cat_col_group) + name), sep=name_sep)
            for name in gb.columns.to_flat_index()
        ]
        gb.reset_index(inplace=True, drop=False)
        del df_gb

        # Split the result by the hash value of the categorical column
        nsplits = tree_width[cat_col_group_str]
        for j, split in shuffle_group(
            gb, cat_col_group, 0, nsplits, nsplits, True, nsplits
        ).items():
            if on_host:
                output[k] = split.to_arrow(preserve_index=False)
            else:
                output[k] = split
            k += 1
        del gb
    return output
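

# Illustrative sketch (not part of the module): `_top_level_groupby` above
# produces *partial* aggregations per category (count, sum, and a
# "squared-sum" column when std/var are requested) so that a later tree
# reduction can combine them into exact global statistics. The pandas-only
# helper below shows the same partial aggregation for a single
# categorical/continuous column pair; `_example_partial_groupby` is a
# hypothetical name used only for illustration.
def _example_partial_groupby(df, cat_col, cont_col, name_sep="_"):
    part = df[[cat_col, cont_col]].copy(deep=False)
    pow2_name = f"{cont_col}{name_sep}pow2"
    part[pow2_name] = part[cont_col].pow(2)

    agg_dict = {cat_col: ["count"], cont_col: ["sum"], pow2_name: ["sum"]}
    gb = part.groupby(cat_col, dropna=False).agg(agg_dict)

    # Flatten the MultiIndex columns the way the real code does, producing
    # names such as "<cat>_count", "<cat>_<cont>_sum", "<cat>_<cont>_pow2_sum".
    gb.columns = [
        name_sep.join((cat_col,) + (name[1:] if name[0] == cat_col else name))
        for name in gb.columns.to_flat_index()
    ]
    return gb.reset_index(drop=False)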