def _write_output_partition(
    gdf,
    processed_path,
    shuffle,
    out_files_per_proc,
    fs,
    cat_names,
    cont_names,
    label_names,
    output_format,
):
    gdf_size = len(gdf)
    out_files_per_proc = out_files_per_proc or 1

    # Get cached writer (or create/cache a new one)
    with get_worker_cache("writer") as writer_cache:
        writer = writer_cache.get(processed_path, None)
        if writer is None:
            writer = writer_factory(
                output_format,
                processed_path,
                out_files_per_proc,
                shuffle,
                use_guid=True,
                bytes_io=(shuffle == "full"),
            )
            writer.set_col_names(labels=label_names, cats=cat_names, conts=cont_names)
            writer_cache[processed_path] = writer

        # Add data
        writer.add_data(gdf)

    return gdf_size
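# `get_worker_cache` above is used as a context manager that hands back a
# per-worker dict, so one writer is created and then reused for every
# partition routed to the same output path. A minimal sketch of that
# pattern (hypothetical helper names, not the actual implementation):
import threading
from contextlib import contextmanager

_CACHES = {}
_CACHES_LOCK = threading.Lock()

@contextmanager
def _worker_cache_sketch(kind):
    # One dict per cache kind ("writer", "stats", "cats"), shared by all
    # tasks running in this worker process.
    with _CACHES_LOCK:
        cache = _CACHES.setdefault(kind, {})
    yield cache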
def _read_groupby_stat_df(path, name, cat_cache):
    if cat_cache is not None:
        cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(name, "disk")
        with get_worker_cache("stats") as cache:
            if cache:
                return fetch_table_data(cache, path, cache=cat_cache)
    return cudf.io.read_parquet(path, index=False)
def _ext(self):
    # Define _ext, depending on `kind_ext`
    if self.kind_ext == "cudf":
        _ext = self.df_ext
    elif self.kind_ext == "pandas":
        _ext = cudf.DataFrame.from_pandas(self.df_ext)
    elif self.kind_ext == "arrow":
        _ext = cudf.DataFrame.from_arrow(self.df_ext)
    else:
        if self.kind_ext == "parquet":
            reader = cudf.read_parquet
        elif self.kind_ext == "csv":
            reader = cudf.read_csv
        else:
            raise ValueError("Disk format not yet supported")
        with get_worker_cache(self.df_ext) as cached_table:
            _ext = fetch_table_data(
                cached_table,
                self.df_ext,
                cache=self.cache,
                columns=self.columns_ext,
                reader=reader,
                **self.kwargs,
            )

    # Take subset of columns if a list is specified
    if self.columns_ext:
        _ext = _ext[self.columns_ext]

    # Drop duplicates if requested
    if self.drop_duplicates_ext:
        _ext.drop_duplicates(ignore_index=True, inplace=True)

    return _ext
def _worker_finish(processed_path):
    general_md, special_md = {}, {}
    with get_worker_cache("writer") as writer_cache:
        writer = writer_cache.get(processed_path, None)
        if writer:
            general_md, special_md = writer.close()
    return general_md, special_md
def _read_groupby_stat_df(path, name, cat_cache, read_pq_func):
    if cat_cache is not None:
        cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(name, "disk")
        with get_worker_cache("stats") as cache:
            if cache:
                return fetch_table_data(cache, path, cache=cat_cache, reader=read_pq_func)
    return read_pq_func(path)
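# In both versions above, `cat_cache` may be a single policy string or a
# per-column mapping of column name to cache level; the
# `isinstance(cat_cache, str)` branch resolves it to one policy for the
# current column, defaulting to "disk". For example:
_example_cache = {"user_id": "device", "item_id": "host"}
_policy = _example_cache if isinstance(_example_cache, str) else _example_cache.get("session_id", "disk")
assert _policy == "disk"  # columns without an explicit policy fall back to "disk"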
def _encode(name, path, gdf, cat_cache, na_sentinel=-1, freq_threshold=0):
    value = None
    if path:
        if cat_cache is not None:
            cat_cache = cat_cache if isinstance(cat_cache, str) else cat_cache.get(name, "disk")
            cache = get_worker_cache("cats") if len(gdf) else None
            if cache is not None:
                value = fetch_data(cache, name, path, cache=cat_cache, kind="cats")
        else:
            value = cudf.io.read_parquet(path, index=False, columns=[name])
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    vals = gdf[name].copy(deep=False)
    if value is None:
        value = cudf.DataFrame({name: [None]})
        value[name] = value[name].astype(vals.dtype)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if freq_threshold > 0:
        codes = cudf.DataFrame({name: vals.copy(), "order": cp.arange(len(vals))})
        codes = codes.merge(value, on=name, how="left").sort_values("order")["labels"]
        codes.fillna(na_sentinel, inplace=True)
        return codes.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        labels = value[name].searchsorted(vals, side="left", na_position="first")
        labels[labels >= len(value[name])] = na_sentinel
        return labels
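# The `searchsorted` branch depends on the category table (`value`) being
# stored in sorted order: each lookup returns the row index of the matching
# category, and any index past the end of the table marks a value that sorts
# after everything seen during fitting. An illustrative NumPy sketch:
import numpy as np

categories = np.array(["apple", "banana", "cherry"])  # sorted unique values
vals = np.array(["banana", "cherry", "durian"])
labels = np.searchsorted(categories, vals, side="left")
labels[labels >= len(categories)] = -1  # na_sentinel for values past the table
# labels -> [1, 2, -1]; this is only safe for a "full" encoding, where every
# observed value is guaranteed to be present in the table.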
def _encode(name, storage_name, path, gdf, cat_cache, na_sentinel=-1, freq_threshold=0):
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    if path:
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(gdf):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache, path, columns=selection_r, cache=cat_cache, cats_only=True
                    )
        else:
            value = cudf.io.read_parquet(path, index=False, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = cudf.DataFrame()
        for c in selection_r:
            typ = gdf[selection_l[0]].dtype if len(selection_l) == 1 else gdf[c].dtype
            value[c] = cudf.Series([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if freq_threshold > 0:
        codes = cudf.DataFrame({"order": cp.arange(len(gdf))})
        for c in selection_l:
            codes[c] = gdf[c].copy()
        codes = codes.merge(
            value, left_on=selection_l, right_on=selection_r, how="left"
        ).sort_values("order")["labels"]
        codes.fillna(na_sentinel, inplace=True)
        return codes.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        labels = value[selection_r].searchsorted(gdf[selection_l], side="left", na_position="first")
        labels[labels >= len(value[selection_r])] = na_sentinel
        return labels
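# The "order" column exists because the GPU hash join behind `merge` does
# not promise to preserve row order; sorting on "order" afterwards restores
# the original row ordering of `gdf`. The same mechanics in a pandas sketch:
import pandas as pd

codes = pd.DataFrame({"order": range(3), "k": ["c", "a", "b"]})
value = pd.DataFrame({"k": ["a", "b", "c"], "labels": [0, 1, 2]})
out = codes.merge(value, on="k", how="left").sort_values("order")["labels"]
# out.values -> [2, 0, 1], aligned with the original rows of `codes`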
def _encode(
    name,
    storage_name,
    path,
    gdf,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, gdf)
    if path:
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(gdf):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache, path, columns=selection_r, cache=cat_cache, cats_only=True
                    )
        else:
            value = cudf.io.read_parquet(path, index=False, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = cudf.DataFrame()
        for c in selection_r:
            typ = gdf[selection_l[0]].dtype if len(selection_l) == 1 else gdf[c].dtype
            value[c] = cudf.Series([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = cudf.DataFrame({selection_l[0]: gdf[selection_l[0]].list.leaves})
            codes["order"] = cp.arange(len(codes))
        else:
            codes = cudf.DataFrame({"order": cp.arange(len(gdf))}, index=gdf.index)
            for c in selection_l:
                codes[c] = gdf[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(gdf, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(cudf.Series(na_sentinel + max_id + 1), inplace=True)
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                gdf[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                gdf[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(gdf[selection_l[0]], labels)

    return labels
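# With `buckets`, out-of-vocabulary values are not all collapsed onto one
# sentinel: they are hashed into a fixed number of buckets, and under
# frequency hashing the bucket ids are shifted past the largest learned
# label so the two id ranges cannot collide. A rough sketch of that shift
# (illustrative only; the real `_hash_bucket` uses a stable GPU hash,
# whereas Python's `hash` is salted per process):
import pandas as pd

num_buckets = 4
vocab = {"a": 0, "b": 1, "c": 2}              # frequent categories
vals = pd.Series(["b", "zzz", "c", "qqq"])    # "zzz"/"qqq" are out of vocab

bucket_ids = vals.map(hash).astype("int64") % num_buckets
labels = vals.map(vocab)                      # NaN for out-of-vocab values
max_id = int(labels.max())
labels = labels.fillna(bucket_ids + max_id + 1).astype("int64")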
def _encode(
    name,
    storage_name,
    path,
    df,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
    max_size=0,
    dtype=None,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    # this is to apply freq_hashing logic
    if max_size:
        freq_threshold = 1
    value = None
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, df)
    if path:
        read_pq_func = _read_parquet_dispatch(df)
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(df):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache,
                        path,
                        columns=selection_r,
                        cache=cat_cache,
                        cats_only=True,
                        reader=read_pq_func,
                    )
        else:
            value = read_pq_func(  # pylint: disable=unexpected-keyword-arg
                path, columns=selection_r
            )
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        value = type(df)()
        for c in selection_r:
            typ = df[selection_l[0]].dtype if len(selection_l) == 1 else df[c].dtype
            value[c] = df._constructor_sliced([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = _flatten_list_column(df[selection_l[0]])
            codes["order"] = _arange(len(codes), like_df=df)
        else:
            codes = type(df)({"order": _arange(len(df), like_df=df)}, index=df.index)
            for c in selection_l:
                codes[c] = df[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(df, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(
                df._constructor_sliced(na_sentinel + max_id + 1), inplace=True
            )
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                df[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                df[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(df[selection_l[0]], labels, dtype=dtype)
    elif dtype:
        labels = labels.astype(dtype, copy=False)

    return labels
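# This last revision is backend-agnostic: `type(df)()` constructs an empty
# frame of whatever library `df` came from, and `df._constructor_sliced`
# yields the matching Series type, so one body serves both cudf and pandas
# (`_arange`, `_flatten_list_column`, and `_read_parquet_dispatch` handle
# the same per-backend dispatch for the remaining primitives). With pandas,
# for instance (note `_constructor_sliced` is a private pandas attribute):
import pandas as pd

df = pd.DataFrame({"x": [1, 2]})
empty = type(df)()                        # an empty pd.DataFrame
nulls = df._constructor_sliced([None])    # a pd.Series([None])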