    def transform(self, col_selector: ColumnSelector,
                  df: DataFrameType) -> DataFrameType:
        new_df = type(df)()
        tmp = "__tmp__"  # Temporary column for sorting
        df[tmp] = _arange(len(df), like_df=df, dtype="int32")

        cat_names, multi_col_group = nvt_cat._get_multicolumn_names(
            col_selector, df.columns, self.name_sep)

        _read_pq_func = _read_parquet_dispatch(df)
        for name in cat_names:
            new_part = type(df)()
            storage_name = self.storage_name.get(name, name)
            name = multi_col_group.get(name, name)
            path = self.categories[storage_name]
            selection_l = list(name) if isinstance(name, tuple) else [name]
            selection_r = list(name) if isinstance(name,
                                                   tuple) else [storage_name]

            stat_df = nvt_cat._read_groupby_stat_df(path, storage_name,
                                                    self.cat_cache,
                                                    _read_pq_func)
            tran_df = df[selection_l + [tmp]].merge(stat_df,
                                                    left_on=selection_l,
                                                    right_on=selection_r,
                                                    how="left")
            tran_df = tran_df.sort_values(tmp)
            tran_df.drop(columns=selection_l + [tmp], inplace=True)
            new_cols = [c for c in tran_df.columns if c not in new_df.columns]
            new_part = tran_df[new_cols].reset_index(drop=True)
            new_df = _concat_columns([new_df, new_part])
        df.drop(columns=[tmp], inplace=True)
        return new_df
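
The transform above joins pre-computed group statistics onto each partition and
relies on a temporary "__tmp__" column to restore the original row order after
the merge (hash merges on cuDF do not preserve ordering). A minimal pandas
sketch of that pattern, with made-up column and statistic names:

import numpy as np
import pandas as pd

# Hypothetical per-category statistic, standing in for the parquet table
# read by nvt_cat._read_groupby_stat_df above.
stat_df = pd.DataFrame({"cat": ["a", "b"], "cat_count": [3, 1]})

df = pd.DataFrame({"cat": ["b", "a", "a", "b"]})
tmp = "__tmp__"
df[tmp] = np.arange(len(df), dtype="int32")  # remember the original row order

tran_df = df[["cat", tmp]].merge(stat_df, on="cat", how="left")
tran_df = tran_df.sort_values(tmp).reset_index(drop=True)  # restore order
new_df = tran_df.drop(columns=["cat", tmp])  # keeps only the new statistic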
Example #2
    def _op_group_logic(self, cat_group, gdf, y_mean, fit_folds, group_ind):
        # Define name of new TE column
        if isinstance(self.out_col, list):
            if group_ind >= len(self.out_col):
                raise ValueError("out_col and cat_groups are different sizes.")
            out_col = self.out_col[group_ind]
            out_col = [out_col] if isinstance(out_col, str) else out_col
            # TODO: test
            if len(out_col) != len(self.target):
                raise ValueError("out_col and target are different sizes.")
        else:
            out_col = self._make_te_name(cat_group)

        # Initialize new data
        _read_pq_func = _read_parquet_dispatch(gdf)
        new_gdf = cudf.DataFrame()
        tmp = "__tmp__"

        if fit_folds:
            # Groupby Aggregation for each fold
            cols = ["__fold__"] + cat_group
            storage_name_folds = nvt_cat._make_name(*cols, sep=self.name_sep)
            path_folds = self.stats[storage_name_folds]
            agg_each_fold = nvt_cat._read_groupby_stat_df(
                path_folds, storage_name_folds, self.cat_cache, _read_pq_func)
            agg_each_fold.columns = cols + ["count_y"] + [
                x + "_sum_y" for x in self.target
            ]
        else:
            cols = cat_group

        # Groupby Aggregation for all data
        storage_name_all = nvt_cat._make_name(*cat_group, sep=self.name_sep)
        path_all = self.stats[storage_name_all]
        agg_all = nvt_cat._read_groupby_stat_df(path_all, storage_name_all,
                                                self.cat_cache, _read_pq_func)
        agg_all.columns = cat_group + ["count_y_all"] + [
            x + "_sum_y_all" for x in self.target
        ]

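        # Out-of-fold statistics: subtract each fold's own count and target
        # sums from the global aggregates so rows in a fold are encoded with
        # statistics computed from the other folds only.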
        if fit_folds:
            agg_each_fold = agg_each_fold.merge(agg_all,
                                                on=cat_group,
                                                how="left")
            agg_each_fold["count_y_all"] = agg_each_fold[
                "count_y_all"] - agg_each_fold["count_y"]
            for i, x in enumerate(self.target):
                agg_each_fold[x + "_sum_y_all"] = (
                    agg_each_fold[x + "_sum_y_all"] -
                    agg_each_fold[x + "_sum_y"])
                agg_each_fold[out_col[i]] = (
                    agg_each_fold[x + "_sum_y_all"] + self.p_smooth *
                    y_mean[x]) / (agg_each_fold["count_y_all"] + self.p_smooth)

            agg_each_fold = agg_each_fold.drop(
                ["count_y_all", "count_y"] +
                [x + "_sum_y" for x in self.target] +
                [x + "_sum_y_all" for x in self.target],
                axis=1,
            )
            tran_gdf = gdf[cols + [tmp]].merge(agg_each_fold,
                                               on=cols,
                                               how="left")
            del agg_each_fold
        else:
            for i, x in enumerate(self.target):
                agg_all[out_col[i]] = (
                    agg_all[x + "_sum_y_all"] + self.p_smooth * y_mean[x]) / (
                        agg_all["count_y_all"] + self.p_smooth)
            agg_all = agg_all.drop(["count_y_all"] +
                                   [x + "_sum_y_all" for x in self.target],
                                   axis=1)
            tran_gdf = gdf[cols + [tmp]].merge(agg_all, on=cols, how="left")
            del agg_all

        # TODO: There is no need to perform the `agg_each_fold.merge(agg_all, ...)` merge
        #     for every partition.  We can/should cache the result for better performance.

        for i, x in enumerate(self.target):
            tran_gdf[out_col[i]] = tran_gdf[out_col[i]].fillna(y_mean[x])
        if self.out_dtype is not None:
            tran_gdf[out_col] = tran_gdf[out_col].astype(self.out_dtype)

        tran_gdf = tran_gdf.sort_values(tmp, ignore_index=True)
        tran_gdf.drop(columns=cols + [tmp], inplace=True)
        new_cols = [c for c in tran_gdf.columns if c not in new_gdf.columns]
        new_gdf[new_cols] = tran_gdf[new_cols]

        # Make sure we are preserving the index of gdf
        new_gdf.index = gdf.index
        return new_gdf
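
The encoding above is the usual smoothed category mean,
te = (sum_y + p_smooth * y_mean) / (count + p_smooth), with the fit_folds
branch subtracting each fold's own counts and sums so rows are not encoded
with their own targets. A small pandas sketch of the non-folded computation,
using illustrative column names:

import pandas as pd

p_smooth = 20.0
df = pd.DataFrame({"cat": ["a", "a", "a", "b"], "y": [1, 0, 1, 1]})
y_mean = df["y"].mean()

# Per-category count and target sum, as the groupby statistics provide.
agg = df.groupby("cat", as_index=False).agg(count_y_all=("y", "size"),
                                            y_sum_y_all=("y", "sum"))

# Smoothed mean: small groups shrink toward the global mean y_mean.
agg["TE_cat_y"] = (agg["y_sum_y_all"] + p_smooth * y_mean) / (
    agg["count_y_all"] + p_smooth)

encoded = df.merge(agg[["cat", "TE_cat_y"]], on="cat", how="left")
encoded["TE_cat_y"] = encoded["TE_cat_y"].fillna(y_mean)  # unseen categories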
Example #3
def _encode(
    name,
    storage_name,
    path,
    df,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
    max_size=0,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    # this is to apply freq_hashing logic
    if max_size:
        freq_threshold = 1
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, df)
    if path:
        read_pq_func = _read_parquet_dispatch(df)
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(df):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache,
                        path,
                        columns=selection_r,
                        cache=cat_cache,
                        cats_only=True,
                        reader=read_pq_func,
                    )
        else:
            value = read_pq_func(path, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

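    # No pre-computed category table was loaded; fall back to a one-row
    # mapping holding only a null value so the merge/searchsorted below
    # still has something to join against.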
    if value is None:
        value = type(df)()
        for c in selection_r:
            typ = df[selection_l[0]].dtype if len(selection_l) == 1 else df[c].dtype
            value[c] = df._constructor_sliced([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = _flatten_list_column(df[selection_l[0]])
            codes["order"] = _arange(len(codes), like_df=df)
        else:
            codes = type(df)({"order": _arange(len(df), like_df=df)}, index=df.index)
            for c in selection_l:
                codes[c] = df[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(df, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(
                df._constructor_sliced(na_sentinel + max_id + 1), inplace=True
            )
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                df[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                df[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(df[selection_l[0]], labels)

    return labels
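
The merge path of _encode maps raw values to integer labels by joining
against a table of unique categories whose index supplies the label ids,
with unseen values falling back to a sentinel slot (or to hash buckets when
bucketing is enabled). A minimal pandas sketch of that path, with
illustrative names rather than the NVTabular API:

import numpy as np
import pandas as pd

# Unique-value table: the row index becomes the label, with a null
# placeholder in the 0 slot, similar to the fallback table built above.
value = pd.DataFrame({"cat": [None, "a", "b", "c"]})
value.index.name = "labels"
value = value.reset_index()

df = pd.DataFrame({"cat": ["c", "a", "z", "c"]})  # "z" was never seen

codes = pd.DataFrame({"order": np.arange(len(df)), "cat": df["cat"]})
labels = codes.merge(value, on="cat", how="left").sort_values("order")["labels"]
labels = labels.fillna(0).astype("int64").values  # sentinel 0 for unseen/null
# labels is now array([3, 1, 0, 3]); "order" undoes any merge reordering.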