Example #1
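Caches an output writer per worker: the first partition routed to a worker creates the writer through `writer_factory` and stores it in the `"writer"` worker cache under the output path, and every later partition just appends its data to the same writer.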
def _write_output_partition(
    gdf,
    processed_path,
    shuffle,
    out_files_per_proc,
    fs,
    cat_names,
    cont_names,
    label_names,
    output_format,
):
    gdf_size = len(gdf)
    out_files_per_proc = out_files_per_proc or 1

    # Get cached writer (or create/cache a new one)
    with get_worker_cache("writer") as writer_cache:
        writer = writer_cache.get(processed_path, None)
        if writer is None:
            writer = writer_factory(
                output_format,
                processed_path,
                out_files_per_proc,
                shuffle,
                use_guid=True,
                bytes_io=(shuffle == "full"),
            )
            writer.set_col_names(labels=label_names,
                                 cats=cat_names,
                                 conts=cont_names)
            writer_cache[processed_path] = writer

        # Add data
        writer.add_data(gdf)

    return gdf_size
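The snippet above only relies on `get_worker_cache` behaving as a context manager that yields a mutable dict outliving the task, so the writer created for one partition is found again by the next partition on the same worker. A minimal sketch of that contract (a hypothetical stand-in, not the library's actual implementation, which would hang the cache off the Dask worker object rather than module globals):

import contextlib
import threading

_lock = threading.Lock()
_caches = {}  # cache name -> dict; lives for the lifetime of the worker process

@contextlib.contextmanager
def get_worker_cache(name):
    # Yield the named cache dict under a lock so that expensive objects
    # (writers, stats tables, category tables) are created once per worker
    # and reused by every partition processed there.
    with _lock:
        yield _caches.setdefault(name, {})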
Example #2
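Reads a per-category statistics table, going through the worker-level `"stats"` cache when one is available. `cat_cache` may be a single policy string or a per-column dict; `cat_cache.get(name, "disk")` normalizes it to a string, defaulting to `"disk"`.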
def _read_groupby_stat_df(path, name, cat_cache):
    if cat_cache is not None:
        cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(name, "disk")
        with get_worker_cache("stats") as cache:
            if cache:
                return fetch_table_data(cache, path, cache=cat_cache)
    return cudf.io.read_parquet(path, index=False)
Example #3
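Resolves an "external" table from several possible sources: in-memory cuDF, pandas, and Arrow objects are converted directly, while parquet and CSV paths are loaded through the worker cache via `fetch_table_data`. Column selection and optional deduplication are applied at the end, whichever branch produced the table.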
    def _ext(self):
        # Define _ext, depending on `kind_ext`
        if self.kind_ext == "cudf":
            _ext = self.df_ext
        elif self.kind_ext == "pandas":
            _ext = cudf.DataFrame.from_pandas(self.df_ext)
        elif self.kind_ext == "arrow":
            _ext = cudf.DataFrame.from_arrow(self.df_ext)
        else:
            if self.kind_ext == "parquet":
                reader = cudf.read_parquet
            elif self.kind_ext == "csv":
                reader = cudf.read_csv
            else:
                raise ValueError("Disk format not yet supported")

            with get_worker_cache(self.df_ext) as cached_table:
                _ext = fetch_table_data(
                    cached_table,
                    self.df_ext,
                    cache=self.cache,
                    columns=self.columns_ext,
                    reader=reader,
                    **self.kwargs,
                )

        # Take subset of columns if a list is specified
        if self.columns_ext:
            _ext = _ext[self.columns_ext]

        # Drop duplicates if requested
        if self.drop_duplicates_ext:
            _ext.drop_duplicates(ignore_index=True, inplace=True)

        return _ext
Example #4
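The shutdown counterpart of Example #1: it looks up the writer cached for `processed_path` and, if one exists, closes it and returns the general and special metadata gathered during writing.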
def _worker_finish(processed_path):
    general_md, special_md = {}, {}
    with get_worker_cache("writer") as writer_cache:
        writer = writer_cache.get(processed_path, None)
        if writer:
            general_md, special_md = writer.close()

    return general_md, special_md
Example #5
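A later revision of Example #2: the parquet reader is injected as `read_pq_func` rather than hard-coded to `cudf.io.read_parquet`, so the same code path can serve different backends.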
def _read_groupby_stat_df(path, name, cat_cache, read_pq_func):
    if cat_cache is not None:
        cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(name, "disk")
        with get_worker_cache("stats") as cache:
            if cache:
                return fetch_table_data(cache,
                                        path,
                                        cache=cat_cache,
                                        reader=read_pq_func)
    return read_pq_func(path)
Example #6
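Encodes a categorical column against a persisted category table. With `freq_threshold > 0` the codes come from a left merge on the category values; otherwise the "full" encoding path uses `searchsorted` against the sorted category table and maps out-of-range positions to `na_sentinel`. A plain-pandas demo of the merge path follows the snippet.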
def _encode(name, path, gdf, cat_cache, na_sentinel=-1, freq_threshold=0):
    value = None
    if path:
        if cat_cache is not None:
            cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(name, "disk")
            cache = get_worker_cache("cats") if len(gdf) else None
            if cache is not None:
                value = fetch_data(cache,
                                   name,
                                   path,
                                   cache=cat_cache,
                                   kind="cats")
        else:
            value = cudf.io.read_parquet(path, index=False, columns=[name])
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    vals = gdf[name].copy(deep=False)
    if value is None:
        value = cudf.DataFrame({name: [None]})
        value[name] = value[name].astype(vals.dtype)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if freq_threshold > 0:
        codes = cudf.DataFrame({
            name: vals.copy(),
            "order": cp.arange(len(vals))
        })
        codes = codes.merge(value, on=name, how="left").sort_values("order")["labels"]
        codes.fillna(na_sentinel, inplace=True)
        return codes.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        labels = value[name].searchsorted(vals,
                                          side="left",
                                          na_position="first")
        labels[labels >= len(value[name])] = na_sentinel
        return labels
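The `freq_threshold > 0` branch above can be reproduced with plain pandas (a hypothetical mini-demo; the real code runs on cuDF and uses `cp.arange`):

import numpy as np
import pandas as pd

# "value" mimics the persisted category table: row position is the label.
value = pd.DataFrame({"item": ["apple", "banana"]})
value.index.name = "labels"
value.reset_index(drop=False, inplace=True)

vals = pd.Series(["banana", "durian", "apple"])
codes = pd.DataFrame({"item": vals, "order": np.arange(len(vals))})

# Left-merge to look up each value's label; unseen values become NaN and
# then the NA sentinel, while "order" restores the original row order.
labels = codes.merge(value, on="item", how="left").sort_values("order")["labels"]
labels = labels.fillna(-1).astype("int64")
print(labels.to_numpy())  # [ 1 -1  0]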
Example #7
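Extends Example #6 to multi-column (combo) encodings: `name` may be a list, so the left and right join keys are tracked separately as `selection_l` and `selection_r`, and `storage_name` identifies the persisted category table on disk.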
def _encode(name, storage_name, path, gdf, cat_cache, na_sentinel=-1, freq_threshold=0):
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    if path:
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(gdf):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache, path, columns=selection_r, cache=cat_cache, cats_only=True
                    )
        else:
            value = cudf.io.read_parquet(path, index=False, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = cudf.DataFrame()
        for c in selection_r:
            typ = gdf[selection_l[0]].dtype if len(selection_l) == 1 else gdf[c].dtype
            value[c] = cudf.Series([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if freq_threshold > 0:
        codes = cudf.DataFrame({"order": cp.arange(len(gdf))})
        for c in selection_l:
            codes[c] = gdf[c].copy()
        codes = codes.merge(
            value, left_on=selection_l, right_on=selection_r, how="left"
        ).sort_values("order")["labels"]
        codes.fillna(na_sentinel, inplace=True)
        return codes.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        labels = value[selection_r].searchsorted(gdf[selection_l], side="left", na_position="first")
        labels[labels >= len(value[selection_r])] = na_sentinel
        return labels
Example #8
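Adds list-column support and hash bucketing on top of Example #7: `buckets` caps the cardinality per column (optionally combined with frequency hashing), `search_sorted` now selects the "full" encoding path explicitly, and list columns are flattened to their leaves before encoding and re-wrapped afterwards with `_encode_list_column`.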
def _encode(
    name,
    storage_name,
    path,
    gdf,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
):

    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}

    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, gdf)
    if path:
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(gdf):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache, path, columns=selection_r, cache=cat_cache, cats_only=True
                    )
        else:
            value = cudf.io.read_parquet(path, index=False, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = cudf.DataFrame()
        for c in selection_r:
            typ = gdf[selection_l[0]].dtype if len(selection_l) == 1 else gdf[c].dtype
            value[c] = cudf.Series([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = cudf.DataFrame({selection_l[0]: gdf[selection_l[0]].list.leaves})
            codes["order"] = cp.arange(len(codes))
        else:
            codes = cudf.DataFrame({"order": cp.arange(len(gdf))}, index=gdf.index)
            for c in selection_l:
                codes[c] = gdf[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(gdf, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(cudf.Series(na_sentinel + max_id + 1), inplace=True)
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                gdf[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                gdf[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(gdf[selection_l[0]], labels)

    return labels
Example #9
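The most general revision: the dataframe is no longer assumed to be cuDF (`type(df)`, `df._constructor_sliced`, and `_read_parquet_dispatch` keep the code backend-agnostic), `max_size` forces the frequency-hashing logic by setting `freq_threshold = 1`, and an optional `dtype` casts the resulting labels.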
def _encode(
    name,
    storage_name,
    path,
    df,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
    max_size=0,
    dtype=None,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    # this is to apply freq_hashing logic
    if max_size:
        freq_threshold = 1
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, df)
    if path:
        read_pq_func = _read_parquet_dispatch(df)
        if cat_cache is not None:
            cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            if len(df):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache,
                        path,
                        columns=selection_r,
                        cache=cat_cache,
                        cats_only=True,
                        reader=read_pq_func,
                    )
        else:
            value = read_pq_func(path, columns=selection_r)  # pylint: disable=unexpected-keyword-arg
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = type(df)()
        for c in selection_r:
            typ = df[selection_l[0]].dtype if len(selection_l) == 1 else df[c].dtype
            value[c] = df._constructor_sliced([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = _flatten_list_column(df[selection_l[0]])
            codes["order"] = _arange(len(codes), like_df=df)
        else:
            codes = type(df)({"order": _arange(len(df), like_df=df)}, index=df.index)
            for c in selection_l:
                codes[c] = df[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(df,
                                       buckets,
                                       selection_l,
                                       encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(value,
                                    left_on=selection_l,
                                    right_on=selection_r,
                                    how="left").sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(df._constructor_sliced(na_sentinel +
                                                              max_id + 1),
                                       inplace=True)
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(value,
                                 left_on=selection_l,
                                 right_on=selection_r,
                                 how="left").sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                df[selection_l[0]].list.leaves,
                side="left",
                na_position="first")
        else:
            labels = value[selection_r].searchsorted(df[selection_l],
                                                     side="left",
                                                     na_position="first")
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(df[selection_l[0]], labels, dtype=dtype)
    elif dtype:
        labels = labels.astype(dtype, copy=False)

    return labels
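Across all of these revisions the "full" encoding path is the same idea: the category table is written in sorted order, so a binary search yields the label directly. A pandas approximation (hypothetical demo; the `na_position` argument used in the snippets is a cuDF extension and is omitted here):

import pandas as pd

# Sorted category table; row position doubles as the label.
cats = pd.Series(["apple", "banana", "cherry"])

vals = pd.Series(["banana", "apple", "durian"])
labels = cats.searchsorted(vals, side="left")

# Anything that lands past the end of the table was not found,
# so it is mapped to the NA sentinel.
labels[labels >= len(cats)] = -1
print(labels)  # [ 1  0 -1]

Note that this trick only flags unseen values that sort past the last category, which is why the snippets reserve it for "full" encodings, where every observed value is expected to be present in the table.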