Ejemplo n.º 1
0
    def _create_tensors(self, gdf):
        """
        Convert a dataframe into categorical, continuous, and label tensors.

        Columns are handled in three groups, in this order: ``self.cat_names``
        (converted with ``self._LONG_DTYPE``), then ``self.cont_names`` and
        ``self.label_names`` (both converted with ``self._FLOAT32_DTYPE``).
        Subclasses may override this method.

        ``gdf`` is modified in place: the columns of each group are dropped
        from it as soon as they have been extracted.

        Returns a list with one entry per group:

        * ``None`` when the group has no column names;
        * the result of ``self._to_tensor`` when the group has only scalar
          columns;
        * a ``(scalars, lists)`` tuple when the group has list columns, where
          ``scalars`` is the scalar tensor (or ``None``) and ``lists`` is an
          ``OrderedDict`` mapping each list column name to the tensor of its
          flattened values.

        When at least one list column was seen, one more entry is appended:
        the offsets of all list columns converted with ``self._LONG_DTYPE``.
        """
        name_groups = (self.cat_names, self.cont_names, self.label_names)
        group_dtypes = (self._LONG_DTYPE, self._FLOAT32_DTYPE, self._FLOAT32_DTYPE)
        tensors = []
        # One column of offsets per list column, filled in across all groups.
        offsets = _make_df(device=self.device)
        for column_names, dtype in zip(name_groups, group_dtypes):
            if len(column_names) == 0:
                tensors.append(None)
                continue

            # Move this group's columns out of the source frame.
            subset = gdf[column_names]
            gdf.drop(columns=column_names, inplace=True)

            scalar_cols, list_cols = self._separate_list_columns(subset)

            group = self._to_tensor(subset[scalar_cols], dtype) if scalar_cols else None
            if list_cols:
                ragged = OrderedDict()
                for name in list_cols:
                    values, row_offsets = _pull_apart_list(subset.pop(name))
                    if isinstance(values[0], list):
                        # The values are lists themselves: flatten one more
                        # level and look up the inner offsets at the
                        # positions given by the outer offsets.
                        values, inner_offsets = _pull_apart_list(values)
                        row_offsets = inner_offsets.iloc[row_offsets[:]]
                    offsets[name] = row_offsets.reset_index(drop=True)
                    ragged[name] = self._to_tensor(values, dtype)
                group = (group, ragged)
            tensors.append(group)

        if not offsets.empty:
            packed_offsets = self._to_tensor(offsets, self._LONG_DTYPE)
            # A one-dimensional result gets a trailing axis added.
            if len(packed_offsets.shape) == 1:
                packed_offsets = packed_offsets[:, None]
            tensors.append(packed_offsets)
        del gdf, offsets

        return tensors
Ejemplo n.º 2
0
    def merge_cats_encoding(self, ser, cats):
        """
        Replace the values of ``ser`` with the matching entries of ``cats``.

        Each value of ``ser`` is matched against the index of ``cats`` with a
        left merge, so values without a match produce a missing entry.  When
        ``ser`` is a list column it is first flattened with
        ``_pull_apart_list`` and the resulting offsets are returned as well.

        Returns a ``(names, offsets)`` tuple; ``offsets`` is ``None`` when
        ``ser`` was not a list column.
        """
        if _is_list_dtype(ser.dtype) or _is_list_dtype(ser):
            ser, offs = _pull_apart_list(ser)
        else:
            offs = None

        # Both inputs become single-column frames so they can be merged.
        values_df = _make_df({"vals": ser})
        lookup_df = _make_df({"names": cats})
        # The position of a category in ``cats`` is the key it is merged on.
        lookup_df["vals"] = lookup_df.index
        merged = values_df.merge(lookup_df, on=["vals"], how="left")

        return merged["names"], offs
Ejemplo n.º 3
0
 def fit(self, col_selector: ColumnSelector, ddf: dd.DataFrame) -> Any:
     """Collect the smallest and largest offset delta of every list column.

     For each column named in ``col_selector.names`` that
     ``_is_list_dtype`` reports as a list column, the offsets returned by
     ``_pull_apart_list`` are differenced pairwise (presumably giving the
     number of elements per row, assuming the offsets are cumulative row
     boundaries) and the minimum and maximum difference are recorded.

     Args:
         col_selector: Selector whose ``names`` are the columns to inspect.
         ddf: Dask dataframe holding those columns.

     Returns:
         A dict mapping each list column name to
         ``{"value_count": {"min": int, "max": int}}``.  Columns that are
         not list columns do not appear in the result.
     """
     stats = {}
     for col in col_selector.names:
         # Materialize the column once and reuse it for both the dtype check
         # and the offset extraction instead of computing it twice.
         series = ddf[col].compute()
         if not _is_list_dtype(series):
             continue
         value_count = stats.setdefault(col, {}).setdefault("value_count", {})
         offs = _pull_apart_list(series)[1]
         # Align both slices on a fresh 0..n-1 index so the subtraction is
         # positional rather than label-based.
         lh = offs[1:].reset_index(drop=True)
         rh = offs[:-1].reset_index(drop=True)
         deltas = lh - rh
         # must be regular python class otherwise protobuf fails
         value_count["min"] = int(deltas.min())
         value_count["max"] = int(deltas.max())
     return stats
Ejemplo n.º 4
0
def test_full_df(num_rows, tmpdir, distro):
    """Check the files written by ``DatasetGen.full_df_create`` against the schema.

    The generated files are read back and concatenated, then checked for:

    * total row count equal to ``num_rows``;
    * column count equal to the number of continuous, categorical and label
      columns in the schema;
    * for every categorical column after the first: distribution fit (for
      non-string columns), cardinality, and min/max entry length, each
      compared with that column's own spec;
    * for the first categorical column (the multi-hot list column, per the
      original comments): the same cardinality and entry-length checks on
      its flattened values.
    """
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(num_rows, cols, entries=True, output=tmpdir)
    test_size = 0
    full_df = _make_df()
    for fi in df_files:
        df = Dataset(fi).to_ddf().compute()
        test_size = test_size + df.shape[0]
        full_df = _concat([full_df, df])
    assert test_size == num_rows
    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)
    # Start at 1 so ``idx`` indexes ``cats_rep`` directly: ``cats[idx]`` is
    # described by ``cats_rep[idx]``.
    for idx, cat in enumerate(cats[1:], start=1):
        cat_rep = cats_rep[idx]
        dist = cat_rep.distro or df_gen.dist
        if HAS_GPU:
            if not _is_string_dtype(full_df[cat]._column):
                sts, _ = dist.verify(full_df[cat].to_pandas())
                assert all(s > 0.9 for s in sts)
        elif not _is_string_dtype(full_df[cat]):
            sts, _ = dist.verify(full_df[cat])
            assert all(s > 0.9 for s in sts)
        # these are not mh series
        # Compare with this column's own spec, not the first column's.
        assert full_df[cat].nunique() == cat_rep.cardinality
        assert full_df[cat].str.len().min() == cat_rep.min_entry_size
        assert full_df[cat].str.len().max() == cat_rep.max_entry_size
    # check the mh list here cat 0 only
    if HAS_GPU:
        check_ser = _make_df(list(full_df[cats[0]]._column.elements.values_host))[0]
    else:
        check_ser = _pull_apart_list(full_df[cats[0]])[0]
    assert check_ser.nunique() == cats_rep[0].cardinality
    assert check_ser.str.len().min() == cats_rep[0].min_entry_size
    assert check_ser.str.len().max() == cats_rep[0].max_entry_size
Ejemplo n.º 5
0
def test_cat_rep(num_rows, distro):
    """Check that ``DatasetGen.create_df`` honours each categorical column spec.

    Every categorical column after the first is compared with its own spec
    for cardinality and min/max entry length.  The first categorical column
    is flattened first when its entries are lists or arrays, then checked
    the same way against the first spec.
    """
    json_sample["num_rows"] = num_rows
    cat_names = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    generator = datagen.DatasetGen(datagen.UniformDistro())
    df_uni = generator.create_df(num_rows, cols, entries=True)

    # The categorical slice must keep every row and every categorical column.
    df_cats = df_uni[cat_names]
    assert df_cats.shape[1] == len(cat_names)
    assert df_cats.shape[0] == num_rows

    cats_rep = cols["cats"]
    for pos, name in enumerate(cat_names[1:], start=1):
        assert df_uni[name].nunique() == cats_rep[pos].cardinality
        assert df_uni[name].str.len().min() == cats_rep[pos].min_entry_size
        assert df_uni[name].str.len().max() == cats_rep[pos].max_entry_size

    first_name = cat_names[0]
    if HAS_GPU:
        check_ser = _make_df(list(df_uni[first_name]._column.elements.values_host))[0]
    else:
        check_ser = df_uni[first_name]
    # Flatten list-valued entries so the checks run on individual values.
    if isinstance(check_ser[0], (list, np.ndarray)):
        check_ser = _pull_apart_list(check_ser)[0]

    first_rep = cats_rep[0]
    assert check_ser.nunique() == first_rep.cardinality
    assert check_ser.str.len().min() == first_rep.min_entry_size
    assert check_ser.str.len().max() == first_rep.max_entry_size