def _create_tensors(self, gdf):
    """
    Breaks a dataframe down into the relevant categorical, continuous,
    and label tensors. Can be overridden.
    """
    workflow_nodes = (self.cat_names, self.cont_names, self.label_names)
    dtypes = (self._LONG_DTYPE, self._FLOAT32_DTYPE, self._FLOAT32_DTYPE)
    tensors = []
    offsets = _make_df(device=self.device)
    for column_names, dtype in zip(workflow_nodes, dtypes):
        if len(column_names) == 0:
            tensors.append(None)
            continue

        gdf_i = gdf[column_names]
        gdf.drop(columns=column_names, inplace=True)

        scalars, lists = self._separate_list_columns(gdf_i)

        x = None
        if scalars:
            # should always return dict column_name: values, offsets (optional)
            x = self._to_tensor(gdf_i[scalars], dtype)
        if lists:
            list_tensors = OrderedDict()
            for column_name in lists:
                column = gdf_i.pop(column_name)
                leaves, col_offsets = _pull_apart_list(column)
                if isinstance(leaves[0], list):
                    # doubly nested list column: flatten one more level and
                    # compose the two offset arrays
                    leaves, nest_offsets = _pull_apart_list(leaves)
                    col_offsets = nest_offsets.iloc[col_offsets[:]]
                offsets[column_name] = col_offsets.reset_index(drop=True)
                list_tensors[column_name] = self._to_tensor(leaves, dtype)
            x = x, list_tensors

        tensors.append(x)

    if not offsets.empty:
        offsets_tensor = self._to_tensor(offsets, self._LONG_DTYPE)
        if len(offsets_tensor.shape) == 1:
            offsets_tensor = offsets_tensor[:, None]
        tensors.append(offsets_tensor)
    del gdf, offsets
    return tensors

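# A minimal sketch, assuming pandas-like inputs, of what _pull_apart_list
# does for a Series of lists: flatten the row values into one "leaves"
# array and record the row boundaries as "offsets" of length nrows + 1,
# mirroring cuDF's list-column layout. The helper name below is
# hypothetical, not the library implementation.
import numpy as np
import pandas as pd

def pull_apart_list_sketch(ser):
    # offsets[i]:offsets[i + 1] delimits row i's values in the flat array
    lengths = ser.map(len).to_numpy()
    offsets = np.concatenate([[0], np.cumsum(lengths)])
    leaves = pd.Series(np.concatenate(ser.to_list()))
    return leaves, pd.Series(offsets)

# e.g. pd.Series([[1, 2], [3], [4, 5, 6]]) -> leaves [1, 2, 3, 4, 5, 6]
# and offsets [0, 2, 3, 6]; _create_tensors keeps one offsets column per
# list column so the flat tensor can be re-segmented downstream.
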
def merge_cats_encoding(self, ser, cats):
    # ser and cats are both series; look up each encoded value in cats
    # to recover the original category names
    offs = None
    if _is_list_dtype(ser.dtype) or _is_list_dtype(ser):
        ser, offs = _pull_apart_list(ser)
    ser = _make_df({"vals": ser})
    cats = _make_df({"names": cats})
    cats["vals"] = cats.index
    ser = ser.merge(cats, on=["vals"], how="left")
    return ser["names"], offs

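# A hedged usage sketch of the same decode pattern in plain pandas: map
# integer codes back to category names with a left merge against a lookup
# frame whose index is the code. All column names and values here are
# illustrative.
import pandas as pd

codes = pd.DataFrame({"vals": [2, 0, 1, 0]})
lookup = pd.DataFrame({"names": ["apple", "banana", "cherry"]})
lookup["vals"] = lookup.index
decoded = codes.merge(lookup, on=["vals"], how="left")["names"]
# decoded -> ["cherry", "apple", "banana", "apple"]
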
def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame) -> Any:
    stats = {}
    for col in col_selector.names:
        series = ddf[col]
        if _is_list_dtype(series.compute()):
            stats[col] = stats[col] if col in stats else {}
            stats[col]["value_count"] = (
                {} if "value_count" not in stats[col] else stats[col]["value_count"]
            )
            offs = _pull_apart_list(series.compute())[1]
            # per-row list lengths are the differences between consecutive offsets
            lh, rh = offs[1:], offs[:-1]
            rh = rh.reset_index(drop=True)
            lh = lh.reset_index(drop=True)
            deltas = lh - rh
            # must be plain Python ints, otherwise protobuf serialization fails
            stats[col]["value_count"]["min"] = int(deltas.min())
            stats[col]["value_count"]["max"] = int(deltas.max())
    return stats

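# A worked example, in plain pandas with illustrative values, of the
# offsets -> deltas step in fit above. For rows [[1, 2], [3], [4, 5, 6]]
# the offsets are [0, 2, 3, 6], and the consecutive differences recover
# the per-row list lengths.
import pandas as pd

offs = pd.Series([0, 2, 3, 6])
lh = offs[1:].reset_index(drop=True)
rh = offs[:-1].reset_index(drop=True)
deltas = lh - rh  # -> [2, 1, 3]
value_count = {"min": int(deltas.min()), "max": int(deltas.max())}
# value_count -> {"min": 1, "max": 3}
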
def test_full_df(num_rows, tmpdir, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    for idx, cat in enumerate(cats[1:]):
        dist = cats_rep[idx + 1].distro or df_gen.dist
        if HAS_GPU:
            if not _is_string_dtype(full_df[cat]._column):
                sts, ps = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        else:
            if not _is_string_dtype(full_df[cat]):
                sts, ps = dist.verify(full_df[cat])
                assert all(s > 0.9 for s in sts)
        # these are not multi-hot (list) series, so check them directly
        assert full_df[cat].nunique() == cats_rep[idx + 1].cardinality
        assert full_df[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert full_df[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    # check the multi-hot list column here (cat 0 only)
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size

def test_cat_rep(num_rows, distro):
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = df_gen.create_df(num_rows, cols, entries=True)
    df_cats = df_uni[cats]
    assert df_cats.shape[1] == len(cats)
    assert df_cats.shape[0] == num_rows
    cats_rep = cols["cats"]
    for idx, cat in enumerate(cats[1:]):
        assert df_uni[cat].nunique() == cats_rep[idx + 1].cardinality
        assert df_uni[cat].str.len().min() == cats_rep[idx + 1].min_entry_size
        assert df_uni[cat].str.len().max() == cats_rep[idx + 1].max_entry_size
    if HAS_GPU:
        check_ser = _make_df(list(df_uni[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = df_uni[cats[0]]
    if isinstance(check_ser[0], (list, np.ndarray)):
        check_ser = _pull_apart_list(check_ser)[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size