def test_make_meta_backends(index):
    """Check that metadata derived from a cudf DataFrame keeps its dtypes.

    A small frame with integer, float, string, categorical and datetime
    columns is built, ``index`` is handed to ``DataFrame.set_index``, and
    the dtypes reported by ``make_meta`` / ``meta_nonempty`` are compared
    against the frame's own dtypes.

    ``index`` is presumably supplied by a pytest parametrization that is
    not visible here -- TODO confirm.
    """
    numeric_names = ["int8", "int32", "int64", "float64"]
    numeric_columns = {
        name: np.arange(start=0, stop=3, dtype=name) for name in numeric_names
    }
    df = cudf.DataFrame(numeric_columns)

    df["strings"] = ["cat", "dog", "fish"]
    df["cats"] = df["strings"].astype("category")

    dates = ["2018-10-07", "2018-10-08", "2018-10-09"]
    df["time_s"] = np.array(dates, dtype="datetime64[s]")
    df["time_ms"] = df["time_s"].astype("datetime64[ms]")
    df["time_ns"] = df["time_s"].astype("datetime64[ns]")

    df = df.set_index(index)

    # The "empty" metadata must report the same dtypes as the real frame.
    empty_meta = make_meta(df)
    dd.assert_eq(empty_meta.dtypes, df.dtypes)

    # The "non-empty" metadata must agree with the "empty" metadata.
    nonempty_meta = meta_nonempty(df)
    dd.assert_eq(empty_meta.dtypes, nonempty_meta.dtypes)

    # The dask collection path is only exercised for non-MultiIndex frames.
    if isinstance(df.index, cudf.MultiIndex):
        return

    ddf = dgd.from_cudf(df, npartitions=1)
    collection_meta_dtypes = ddf._meta.dtypes
    # "Empty" metadata held by the collection.
    dd.assert_eq(collection_meta_dtypes, df.dtypes)
    # "Non-empty" metadata held by the collection.
    dd.assert_eq(ddf._meta.dtypes, ddf._meta_nonempty.dtypes)
def _nonempty_index(idx):
    """Return a two-element index of the same kind as ``idx``.

    The result reuses ``idx.name`` (``idx.names`` for a MultiIndex) and,
    where a dtype is involved, ``idx.dtype``.

    Raises:
        AssertionError: if a CategoricalIndex does not hold exactly one
            entry in ``idx._data``.
        TypeError: if ``idx`` is none of the handled index types.
    """
    index_types = cudf.core.index

    # NOTE(review): the checks run in the original order; presumably the
    # specialised index classes subclass GenericIndex, so they must be
    # tested first -- confirm before reordering.
    if isinstance(idx, index_types.RangeIndex):
        return index_types.RangeIndex(2, name=idx.name)

    if isinstance(idx, index_types.DatetimeIndex):
        stamps = np.array(["1970-01-01", "1970-01-02"], dtype=idx.dtype)
        stamp_column = cudf.core.column.as_column(stamps)
        return index_types.DatetimeIndex(stamp_column, name=idx.name)

    if isinstance(idx, index_types.StringIndex):
        return index_types.StringIndex(["cat", "dog"], name=idx.name)

    if isinstance(idx, index_types.CategoricalIndex):
        keys = tuple(idx._data.keys())
        assert len(keys) == 1
        source_column = idx._data[keys[0]]
        # Both placeholder rows point at the first category (code 0).
        categorical = cudf.core.column.build_categorical_column(
            categories=source_column.categories,
            codes=[0, 0],
            ordered=source_column.ordered,
        )
        return index_types.CategoricalIndex(categorical, name=idx.name)

    if isinstance(idx, index_types.GenericIndex):
        placeholder = np.arange(2, dtype=idx.dtype)
        return index_types.GenericIndex(placeholder, name=idx.name)

    if not isinstance(idx, cudf.core.MultiIndex):
        raise TypeError(f"Don't know how to handle index of type {type(idx)}")

    # MultiIndex: make every level non-empty and give each level two rows
    # that both use code 0.
    nonempty_levels = [meta_nonempty(level) for level in idx.levels]
    level_codes = [[0, 0] for _ in idx.levels]
    return cudf.core.MultiIndex(
        levels=nonempty_levels, codes=level_codes, names=idx.names
    )
def meta_nonempty_cudf(x):
    """Build a non-empty counterpart of the cudf DataFrame ``x``.

    The result is indexed by ``meta_nonempty(x.index)`` and has one column
    per name in ``x._data.names``.  Placeholder data is generated once per
    distinct dtype string and the resulting column object is reused for
    every column that reports that dtype.
    """
    nonempty_index = meta_nonempty(x.index)
    result = cudf.DataFrame(index=nonempty_index)

    # Cache keyed by the dtype's string form.
    column_for_dtype = {}
    for name in x._data.names:
        dtype_key = str(x._data[name].dtype)
        if dtype_key not in column_for_dtype:
            placeholder = _get_non_empty_data(x[name])
            column_for_dtype[dtype_key] = cudf.core.column.as_column(
                placeholder
            )
        result._data[name] = column_for_dtype[dtype_key]

    return result
def meta_nonempty_cudf(x):
    """Build a non-empty counterpart of the cudf DataFrame ``x``.

    A placeholder series is produced by ``_nonempty_series`` once per
    distinct dtype string and shared by every column with that dtype.  The
    frame is assembled under positional integer labels and then relabelled
    with ``x.columns``.
    """
    nonempty_index = meta_nonempty(x.index)

    series_for_dtype = {}
    by_position = {}
    for position, label in enumerate(x.columns):
        column = x[label]
        dtype_key = str(column.dtype)
        if dtype_key not in series_for_dtype:
            series_for_dtype[dtype_key] = _nonempty_series(
                column, idx=nonempty_index
            )
        by_position[position] = series_for_dtype[dtype_key]

    positional_labels = np.arange(len(x.columns))
    result = cudf.DataFrame(
        by_position, index=nonempty_index, columns=positional_labels
    )
    # Swap the positional labels for the original column labels.
    result.columns = x.columns
    return result
def meta_nonempty_cudf(x, index=None):
    """Build non-empty metadata for ``x`` by round-tripping through pandas.

    ``x`` is converted with ``to_pandas()``, passed to ``meta_nonempty``,
    and the result is converted back with ``cudf.from_pandas``.

    ``index`` is accepted but not used by this implementation.
    """
    # TODO: add iloc[:5]
    as_pandas = x.to_pandas()
    nonempty_pandas = meta_nonempty(as_pandas)
    return cudf.from_pandas(nonempty_pandas)
def _(x):
    """Build non-empty metadata for ``x`` by round-tripping through pandas.

    ``x`` is converted with ``to_pandas()``, passed to ``meta_nonempty``,
    and the result is converted back with ``cudf.from_pandas``.

    NOTE(review): the throwaway name suggests this is registered through a
    dispatch decorator that is not visible here -- confirm.
    """
    # TODO: add iloc[:5]
    as_pandas = x.to_pandas()
    nonempty_pandas = meta_nonempty(as_pandas)
    return cudf.from_pandas(nonempty_pandas)