def test_select_by_label_multiindex():
    """
    Test getting column(s) by label with MultiIndex.

    Selecting on a leading subset of the key levels is expected to drop
    those levels from the resulting keys; when only a single level is left
    the expected result is a non-multiindex ColumnAccessor.
    """
    ca = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("b", "x", ""): [4, 5, 6],
            ("a", "d", "e"): [3, 4, 5],
        },
        multiindex=True,
    )

    # Selecting on the first level only: two levels remain in the keys.
    expect = ColumnAccessor(
        {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]},
        multiindex=True,
    )
    got = ca.select_by_label("a")
    check_ca_equal(expect, got)

    # Selecting on the first two levels: a single level remains.
    expect = ColumnAccessor({"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False)
    got = ca.select_by_label(("a", "b"))
    check_ca_equal(expect, got)
def test_select_by_label_simple():
    """
    Selecting a single label from a flat ColumnAccessor should give back
    an accessor holding only that column.
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]})

    got_a = ca.select_by_label("a")
    check_ca_equal(got_a, ColumnAccessor({"a": [1, 2, 3]}))

    got_b = ca.select_by_label("b")
    check_ca_equal(got_b, ColumnAccessor({"b": [2, 3, 4]}))
def test_all_columns(simple_data):
    """
    Every value stored in a ColumnAccessor must be a ColumnBase instance.
    """
    ca = ColumnAccessor(simple_data)
    assert all(
        isinstance(col, cudf.core.column.ColumnBase) for col in ca.values()
    )
def test_to_pandas_multiindex_names():
    """
    Level names given to a multiindex ColumnAccessor should appear as the
    names of the pandas MultiIndex it converts to.
    """
    ca = ColumnAccessor(
        {("a", "b"): [1, 2, 3], ("c", "d"): [3, 4, 5]},
        multiindex=True,
        level_names=("foo", "bar"),
    )

    got = ca.to_pandas_index()
    expect = pd.MultiIndex.from_tuples(
        (("a", "b"), ("c", "d")), names=("foo", "bar")
    )
    assert_eq(got, expect)
def test_column_size_mismatch():
    """
    Building a ColumnAccessor from columns whose lengths differ must raise
    a ValueError.
    """
    with pytest.raises(ValueError):
        ColumnAccessor({"a": [1], "b": [1, 2]})
def test_iter(simple_data):
    """
    Test that iterating over the CA yields column names.

    The full key sequences are compared rather than zipped pairwise:
    ``zip`` stops at the shorter iterable, so a ColumnAccessor that yielded
    too few keys (or none at all) would otherwise pass unnoticed.
    """
    ca = ColumnAccessor(simple_data)
    assert list(ca) == list(simple_data)
def test_select_by_index_empty():
    """
    Selecting no columns by index (empty slice or empty list) from a
    three-level multiindex ColumnAccessor should give an empty multiindex
    accessor with three unnamed levels.
    """
    ca = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("b", "x", ""): [4, 5, 6],
            ("a", "d", "e"): [3, 4, 5],
        },
        multiindex=True,
    )
    expect = ColumnAccessor(
        {}, multiindex=True, level_names=(None, None, None)
    )

    for empty_selection in (slice(None, 0), []):
        got = ca.select_by_index(empty_selection)
        check_ca_equal(expect, got)
def test_select_by_index_simple():
    """
    Test getting column(s) by integer position, list of positions and slice
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]})

    # Single positions select a single column each.
    first = ca.select_by_index(0)
    check_ca_equal(first, ColumnAccessor({"a": [1, 2, 3]}))
    second = ca.select_by_index(1)
    check_ca_equal(second, ColumnAccessor({"b": [2, 3, 4]}))

    # Selecting every position should reproduce the original accessor.
    everything_by_list = ca.select_by_index([0, 1])
    check_ca_equal(everything_by_list, ca)
    everything_by_slice = ca.select_by_index(slice(0, None))
    check_ca_equal(everything_by_slice, ca)
def test_replace_level_values_MultiColumn():
    """
    Renaming "a" to "f" on level 0 of a multiindex ColumnAccessor should
    rewrite only the matching first-level labels.
    """
    original = ColumnAccessor(
        {
            ("a", 1): [1, 2, 3],
            ("a", 2): [2, 3, 4],
            ("b", 1): [3, 4, 5],
        },
        multiindex=True,
    )
    renamed = ColumnAccessor(
        {
            ("f", 1): [1, 2, 3],
            ("f", 2): [2, 3, 4],
            ("b", 1): [3, 4, 5],
        },
        multiindex=True,
    )

    result = original.rename_levels(mapper={"a": "f"}, level=0)
    check_ca_equal(renamed, result)
def test_replace_level_values_RangeIndex():
    """
    Renaming "a" to "f" on level 0 of a flat (non-multiindex)
    ColumnAccessor should rewrite only the matching label.
    """
    original = ColumnAccessor(
        {"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]},
        multiindex=False,
    )
    renamed = ColumnAccessor(
        {"f": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]},
        multiindex=False,
    )

    result = original.rename_levels(mapper={"a": "f"}, level=0)
    check_ca_equal(renamed, result)
def test_select_by_index_multiindex():
    """
    Test getting column(s) by position (slice and list) with MultiIndex
    """
    ca = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("b", "x", ""): [4, 5, 6],
            ("a", "d", "e"): [3, 4, 5],
        },
        multiindex=True,
    )

    # Slice covering the first three columns.
    first_three = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("b", "x", ""): [4, 5, 6],
        },
        multiindex=True,
    )
    check_ca_equal(first_three, ca.select_by_index(slice(0, 3)))

    # Explicit positions, skipping the third column.
    picked = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("a", "d", "e"): [3, 4, 5],
        },
        multiindex=True,
    )
    check_ca_equal(picked, ca.select_by_index([0, 1, 3]))
def test_select_by_label_multiindex_slice():
    """
    Label slices on a multiindex ColumnAccessor: an open slice should
    return everything, and a slice between two full keys is expected to
    include both endpoints.
    """
    # pandas needs columns to be sorted to do slicing with multiindex
    ca = ColumnAccessor(
        {
            ("a", "b", "c"): [1, 2, 3],
            ("a", "b", "e"): [2, 3, 4],
            ("a", "d", "e"): [3, 4, 5],
            ("b", "x", ""): [4, 5, 6],
        },
        multiindex=True,
    )

    everything = ca.select_by_label(slice(None, None))
    check_ca_equal(ca, everything)

    tail = ColumnAccessor(
        {
            ("a", "b", "e"): [2, 3, 4],
            ("a", "d", "e"): [3, 4, 5],
            ("b", "x", ""): [4, 5, 6],
        },
        multiindex=True,
    )
    bounded = ca.select_by_label(slice(("a", "b", "e"), ("b", "x", "")))
    check_ca_equal(tail, bounded)
def test_to_pandas_simple(simple_data):
    """
    The pandas index produced by a ColumnAccessor should match the columns
    of a pandas DataFrame built from the same data.
    """
    ca = ColumnAccessor(simple_data)
    got = ca.to_pandas_index()
    expect = pd.DataFrame(simple_data).columns
    assert_eq(got, expect)
def test_by_label_list():
    """
    Selecting with a list of labels should return exactly those columns.
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
    wanted = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]})
    check_ca_equal(wanted, ca.select_by_label(["b", "c"]))
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    dtypes=None,
    nulls_frequency=0,
    seed=None,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    dtypes : dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        If none is provided, this defaults to
        ``{"name": "category", "id": int, "x": float, "y": float}``
    nulls_frequency : float
        Fill the series with the specified proportion of nulls. Default is 0.
    seed : int (optional)
        Randomstate seed

    Returns
    -------
    The result of ``cudf.from_pandas`` on the generated frame, indexed by
    ``timestamp``, with columns in sorted name order and a random null mask
    applied to every column.

    Examples
    --------
    >>> import cudf as gd
    >>> gdf = gd.datasets.timeseries()
    >>> gdf.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    """
    if dtypes is None:
        dtypes = {"name": "category", "id": int, "x": float, "y": float}

    index = pd.DatetimeIndex(
        pd.date_range(start, end, freq=freq, name="timestamp")
    )
    state = np.random.RandomState(seed)
    # Each dtype is looked up in the module-level ``make`` mapping, whose
    # entries are called with the number of rows and the random state.
    columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()}
    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
    if df.index[-1] == end:
        df = df.iloc[:-1]

    gdf = cudf.from_pandas(df)

    # Size the masks from the frame itself: ``index`` was measured before
    # the optional trailing-row drop above and can be one element longer.
    n_rows = len(gdf)
    for col in gdf:
        # True marks a valid row; False (probability ``nulls_frequency``)
        # marks a null.
        mask = state.choice(
            [True, False],
            size=n_rows,
            p=[1 - nulls_frequency, nulls_frequency],
        )
        mask_buf = bools_to_mask(cudf.core.column.as_column(mask))
        masked_col = gdf[col]._column.set_mask(mask_buf)
        gdf[col] = cudf.Series._from_data(
            ColumnAccessor({None: masked_col}), index=gdf.index
        )

    return gdf
def test_to_pandas_multiindex(mi_data):
    """
    The pandas index produced by a multiindex ColumnAccessor should match
    the columns of a pandas DataFrame built from the same data.
    """
    ca = ColumnAccessor(mi_data, multiindex=True)
    got = ca.to_pandas_index()
    expect = pd.DataFrame(mi_data).columns
    assert_eq(got, expect)
def test_select_by_label_simple_slice():
    """
    A label slice from "b" to "c" on a flat ColumnAccessor is expected to
    return both the "b" and "c" columns.
    """
    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
    wanted = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]})
    check_ca_equal(wanted, ca.select_by_label(slice("b", "c")))
def _data(self):
    """
    Wrap this object's values in a single-column ColumnAccessor keyed by
    its name.
    """
    # NOTE(review): the import is kept function-local as in the original,
    # presumably to avoid a circular import -- confirm before hoisting.
    from cudf.core.column_accessor import ColumnAccessor

    accessor = ColumnAccessor({self.name: self._values})
    return accessor