def test_factorize_index_obj(ncats, nelem):
    """Check ``Index.factorize`` against a hand-built label encoding.

    ``ncats`` and ``nelem`` are accepted but never read by the body
    (presumably supplied by a parametrization that is not visible here).
    """
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32)
    df = df.set_index("cats")
    uvals, labels = df.index.factorize()

    # The unique labels must be exactly the sorted distinct input values.
    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
    # Use the public ``cp.ndarray`` name; the private ``cp.core.core`` module
    # path is not available in every CuPy release.
    assert isinstance(uvals, cp.ndarray)
    assert isinstance(labels, Index)

    # Map every label to its position, then encode the raw values by hand.
    encoder = {labels[idx]: idx for idx in range(len(labels))}
    handcoded = [encoder[v] for v in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)
def test_groupby_column_numeral():
    """Group by numeric column labels (ints, then floats) and compare sums."""
    # Each pair is (label of the summed column, label of the grouping column).
    label_pairs = ((0, 1), (0.5, 1.5))
    for value_label, key_label in label_pairs:
        pdf = pd.DataFrame(
            {value_label: [1.0, 2.0, 3.0], key_label: [1, 2, 3]}
        )
        gdf = DataFrame.from_pandas(pdf)

        pandas_groups = pdf.groupby(key_label)
        cudf_groups = gdf.groupby(key_label)

        expected = pandas_groups[value_label].sum()
        actual = cudf_groups[value_label].sum()
        assert_groupby_results_equal(expected, actual)
def test_index_join(lhs, rhs, how, level):
    """Join two indexes built from small frames and compare with pandas.

    ``lhs`` and ``rhs`` are passed to ``set_index`` to build the left and
    right indexes; ``how`` and ``level`` are forwarded to ``Index.join``.
    """
    l_pdf = pd.DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    r_pdf = pd.DataFrame({"a": [1, 5, 4, 0], "b": [3, 9, 8, 4]})
    l_df = DataFrame.from_pandas(l_pdf)
    r_df = DataFrame.from_pandas(r_pdf)

    def _as_sorted_frame(joined):
        # Flatten the joined index to a frame, sort by the left key(s) and
        # renumber the rows so both libraries are compared in one order.
        frame = joined.to_frame(index=False)
        return frame.sort_values(by=lhs).reset_index(drop=True)

    p_lhs = l_pdf.set_index(lhs).index
    p_rhs = r_pdf.set_index(rhs).index
    g_lhs = l_df.set_index(lhs).index
    g_rhs = r_df.set_index(rhs).index

    expected = _as_sorted_frame(p_lhs.join(p_rhs, level=level, how=how))
    got = _as_sorted_frame(g_lhs.join(g_rhs, level=level, how=how))

    assert_eq(expected, got)
def test_cat_series_binop_error():
    """Adding a categorical and a numeric column must fail in either order."""
    df = DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    categorical = df["a"]
    numeric = df["b"]

    # (left operand, right operand, expected error message)
    cases = (
        # lhs is a categorical
        (
            categorical,
            numeric,
            "Series of dtype `category` cannot "
            "perform the operation: add",
        ),
        # if lhs is a numerical
        (numeric, categorical, "'add' operator not supported"),
    )

    for left, right, message in cases:
        assert_exceptions_equal(
            lfunc=operator.add,
            rfunc=operator.add,
            lfunc_args_and_kwargs=([left, right],),
            rfunc_args_and_kwargs=([left, right],),
            check_exception_type=False,
            expected_error_message=message,
        )
def test_groupby_column_name():
    """Run several aggregations on a column selected by name after a groupby.

    NOTE(review): another ``test_groupby_column_name`` is defined further
    down in this file; the later definition rebinds the name.
    """
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    g = gdf.groupby("yy")
    p = pdf.groupby("yy")

    # Aggregation name paired with the extra keyword arguments handed to the
    # comparison helper for that aggregation.
    strict = {}
    relaxed = {"check_dtype": False}
    checks = (
        ("sum", strict),
        ("count", relaxed),
        ("min", strict),
        ("max", strict),
        ("idxmin", relaxed),
        ("idxmax", relaxed),
        ("mean", strict),
    )

    for agg_name, compare_kwargs in checks:
        gxx = getattr(g["xx"], agg_name)()
        pxx = getattr(p["xx"], agg_name)()
        assert_groupby_results_equal(pxx, gxx, **compare_kwargs)
def test_tile(nulls, num_cols, num_rows, dtype, count):
    """Compare ``DataFrame.tile(count)`` with ``count`` concatenated copies.

    The pandas frame has ``num_cols`` columns of ``num_rows`` random integers
    cast to ``dtype``. When ``nulls == "some"``, half of the positions in
    each column are overwritten with NaN.
    """
    if dtype not in ("float32", "float64") and nulls == "some":
        # Pass the reason positionally: the ``msg`` keyword of ``pytest.skip``
        # was deprecated and is rejected by newer pytest releases.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame(dtype=dtype)
    for i in range(num_cols):
        colname = str(i)
        data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype(
            dtype
        )
        if nulls == "some":
            idx = np.random.choice(
                num_rows, size=int(num_rows / 2), replace=False
            )
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.tile(count)
    expect = pd.DataFrame(pd.concat([pdf] * count))
    assert_eq(expect, got)
def test_groupby_iterate_groups():
    """Iterate a two-key groupby and check each group has constant keys."""
    np.random.seed(0)
    nelem = 20
    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    key_columns = ("key1", "key2")
    for _, group in df.groupby(["key1", "key2"]):
        group_pdf = group.to_pandas()
        for key in key_columns:
            key_values = group_pdf[key].values
            # Every entry must equal the first one within a single group.
            np.testing.assert_array_equal(key_values[0], key_values)
def test_dataframe_multi_column_nulls(num_cols, num_rows, dtype, nulls,
                                      ascending, na_position):
    """Sort by the first ``num_cols`` columns with NaNs and compare to pandas.

    Three columns ("a", "b", "c") are always generated. ``nulls`` picks how
    many NaNs are injected: "some" overwrites a quarter of the rows, "all"
    overwrites every row, and any other value leaves the data untouched.
    """
    from string import ascii_lowercase

    np.random.seed(0)
    by = list(ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    for colname in ascii_lowercase[:3]:
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            if num_rows > 0:
                idx = np.random.choice(num_rows, size=int(num_rows / 4),
                                       replace=False)
            else:
                # Nothing to sample from: use an empty index array.
                idx = np.array([], dtype="int64")
            data[idx] = np.nan
        elif nulls == "all":
            data[:] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.sort_values(by, ascending=ascending, na_position=na_position)
    expect = pdf.sort_values(by, ascending=ascending,
                             na_position=na_position)

    got_keys = got[by].reset_index(drop=True)
    expect_keys = expect[by].reset_index(drop=True)
    assert_eq(got_keys, expect_keys)
def test_interleave_columns(nulls, num_cols, num_rows, dtype):
    """Compare ``interleave_columns`` with a row-major flattening of pandas data.

    For ``dtype == "category"`` the call is expected to raise ``ValueError``.
    Otherwise the result must equal the pandas values flattened row by row
    and cast to ``dtype``.
    """
    if dtype not in ("float32", "float64") and nulls == "some":
        # Pass the reason positionally: the ``msg`` keyword of ``pytest.skip``
        # was deprecated and is rejected by newer pytest releases.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame(dtype=dtype)
    for i in range(num_cols):
        colname = str(i)
        data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype)
        if nulls == "some":
            idx = np.random.choice(num_rows, size=int(num_rows / 2),
                                   replace=False)
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    if dtype == "category":
        with pytest.raises(ValueError):
            assert gdf.interleave_columns()
    else:
        got = gdf.interleave_columns()
        expect = pd.Series(np.vstack(pdf.to_numpy()).reshape(
            (-1, ))).astype(dtype)
        assert_eq(expect, got)
def test_df_stack(nulls, num_cols, num_rows, dtype):
    """Compare ``DataFrame.stack`` against pandas on random data.

    When ``nulls == "some"``, half of the positions in each column are
    overwritten with NaN before the frame is converted.
    """
    if dtype not in ("float32", "float64") and nulls == "some":
        # Pass the reason positionally: the ``msg`` keyword of ``pytest.skip``
        # was deprecated and is rejected by newer pytest releases.
        pytest.skip("nulls not supported in dtype: " + dtype)

    pdf = pd.DataFrame()
    for i in range(num_cols):
        colname = str(i)
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            idx = np.random.choice(num_rows, size=int(num_rows / 2),
                                   replace=False)
            data[idx] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.stack()
    expect = pdf.stack()

    # When all pandas index levels are unnamed, rename them to 0..n-1
    # before comparing the two results.
    if {None} == set(expect.index.names):
        expect.rename_axis(list(range(0, len(expect.index.names))),
                           inplace=True)

    assert_eq(expect, got)
def test_onehot_random():
    """One-hot encode random integers and check each output column."""
    low, high, size = 10, 17, 10

    df = DataFrame()
    src = np.random.randint(low=low, high=high, size=size)
    df["src"] = src
    df2 = df.one_hot_encoding(column="src", prefix="out_",
                              cats=tuple(range(low, high)))

    # Skip the first column (the source) and keep only the encoded ones.
    encoded = df2.as_matrix(columns=df2.columns[1:])

    for column_pos, category in enumerate(range(low, high)):
        expected_mask = src == category
        np.testing.assert_equal(encoded[:, column_pos], expected_mask)
def test_fillna_dataframe(df, value, inplace):
    """Compare ``DataFrame.fillna`` with pandas for several fill value kinds.

    ``value`` may be a pandas Series/DataFrame, a dict (whose Series values
    are converted individually) or anything else, which is passed through
    unchanged to both libraries.

    NOTE(review): another ``test_fillna_dataframe`` is defined further down
    in this file; the later definition rebinds the name.
    """
    pdf = df.copy(deep=True)
    gdf = DataFrame.from_pandas(pdf)

    fill_value_pd = value
    if isinstance(fill_value_pd, (pd.Series, pd.DataFrame)):
        fill_value_cudf = cudf.from_pandas(fill_value_pd)
    elif isinstance(fill_value_pd, dict):
        # Only Series entries need converting; the rest is reused as-is.
        fill_value_cudf = {
            key: (
                cudf.from_pandas(entry)
                if isinstance(entry, pd.Series)
                else entry
            )
            for key, entry in fill_value_pd.items()
        }
    else:
        fill_value_cudf = value

    expect = pdf.fillna(fill_value_pd, inplace=inplace)
    got = gdf.fillna(fill_value_cudf, inplace=inplace)

    if inplace:
        # In-place mode: compare the frames that were filled.
        got, expect = gdf, pdf

    assert_eq(expect, got)
def test_onehot_generic_index():
    """One-hot encode a column whose Series carries a ``GenericIndex``."""
    np.random.seed(0)
    size = 33
    indices = np.random.randint(low=0, high=100, size=size)
    values = np.random.randint(low=0, high=4, size=size)

    df = DataFrame()
    df["fo"] = Series(values, index=GenericIndex(indices))

    out = df.one_hot_encoding("fo", cats=df.fo.unique(), prefix="fo",
                              dtype=np.int32)

    assert set(out.columns) == {"fo", "fo_0", "fo_1", "fo_2", "fo_3"}

    # Each encoded column must flag exactly the rows holding its category.
    for category in range(4):
        encoded = getattr(out, "fo_{}".format(category))
        np.testing.assert_array_equal(values == category, encoded.to_array())
def test_fillna_dataframe(fill_type, inplace):
    """Fill a small frame with a scalar, a Series or a dict and compare.

    ``fill_type`` is "scalar", "series" or anything else (treated as the
    dict case). ``inplace`` is forwarded to the cuDF ``fillna`` call only.
    """
    pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    gdf = DataFrame.from_pandas(pdf)

    if fill_type == "scalar":
        pandas_fill = 5
        cudf_fill = pandas_fill
    elif fill_type == "series":
        pandas_fill = pd.Series([3, 4, 5])
        cudf_fill = Series.from_pandas(pandas_fill)
    else:
        pandas_fill = {"a": 5, "b": pd.Series([3, 4, 5])}
        cudf_fill = {
            "a": pandas_fill["a"],
            "b": Series.from_pandas(pandas_fill["b"]),
        }

    if not isinstance(pandas_fill, pd.Series):
        expect = pdf.fillna(pandas_fill)
    else:
        # https://github.com/pandas-dev/pandas/issues/27197
        # pandas df.fill_value with series is not working
        expect = pd.DataFrame()
        for col in pdf.columns:
            expect[col] = pdf[col].fillna(pandas_fill)

    got = gdf.fillna(cudf_fill, inplace=inplace)

    if inplace:
        # In-place mode: compare the frame that was filled.
        got = gdf

    assert_eq(expect, got)
def test_datetime_array_timeunit_cast(dtype):
    """Build Series and DataFrames from datetimes cast to ``dtype``."""
    dates = ("2016-11-20", "2020-11-20", "2019-11-20", "1918-11-20",
             "2118-11-20")
    testdata = np.array([np.datetime64(day) for day in dates], dtype=dtype)

    # Series round trip.
    gs = Series(testdata)
    ps = pd.Series(testdata)
    assert_eq(ps, gs)

    # Same data as a column next to an integer column.
    gdf = DataFrame()
    gdf["a"] = np.arange(5)
    gdf["b"] = testdata

    pdf = pd.DataFrame()
    pdf["a"] = np.arange(5)
    pdf["b"] = testdata

    assert_eq(pdf, gdf)
def test_groupby_quantile(interpolation, q):
    """Compare grouped quantiles of column "y" (grouped by "x") with pandas."""
    raw_data = {
        "y": [None, 1, 2, 3, 4, None, 6, 7, 8, 9],
        "x": [1, 2, 3, 1, 2, 2, 1, None, 3, 2],
    }
    # Pandas>0.25 now casts NaN in quantile operations as a float64
    # so we are filling with zeros.
    pdf = pd.DataFrame(raw_data).fillna(0)
    gdf = DataFrame.from_pandas(pdf)

    pandas_groups = pdf.groupby("x")
    cudf_groups = gdf.groupby("x")

    pdresult = pandas_groups.quantile(q, interpolation=interpolation)
    gdresult = cudf_groups.quantile(q, interpolation=interpolation)

    # There's a lot left to add to python bindings like index name
    # so this is a temporary workaround
    pdresult = pdresult["y"].reset_index(drop=True)
    gdresult = gdresult["y"].reset_index(drop=True)

    is_known_mismatch = q == 0.5 and interpolation == "nearest"
    if is_known_mismatch:
        pytest.xfail(
            "Pandas NaN Rounding will fail nearest interpolation at 0.5")

    assert_groupby_results_equal(pdresult, gdresult)
def test_categorical_dataframe_slice_copy():
    """Slicing then copying a categorical frame must match pandas."""
    categories = pd.Series(["a", "b", "z"], dtype="category")
    pdf = pd.DataFrame({"g": categories})
    gdf = DataFrame.from_pandas(pdf)

    expected = pdf[1:].copy()
    actual = gdf[1:].copy()

    assert_eq(expected, actual)
def test_groupby_column_name():
    """Sum a column selected by name after a groupby and compare to pandas."""
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)

    cudf_groups = gdf.groupby("yy")
    pandas_groups = pdf.groupby("yy")

    actual = cudf_groups["xx"].sum()
    expected = pandas_groups["xx"].sum()

    assert_eq(expected, actual)
def test_pickle_dataframe_categorical():
    """Pass a frame with a categorical column to ``check_serialization``."""
    np.random.seed(0)

    frame = DataFrame()
    frame["keys"] = pd.Categorical("aaabababac")
    row_count = len(frame)
    frame["vals"] = np.random.random(row_count)

    check_serialization(frame)
def test_pickle_dataframe_numeric():
    """Pass a frame with two float columns to ``check_serialization``."""
    np.random.seed(0)
    nelem = 10

    frame = DataFrame()
    frame["keys"] = np.arange(nelem, dtype=np.float64)
    frame["vals"] = np.random.random(nelem)

    check_serialization(frame)
def test_string_slice():
    """``str.slice(0, 2)`` must return a ``Series`` matching pandas."""
    words = ["hello", "world"]
    gdf = DataFrame({"a": ["hello", "world"]})
    pdf = pd.DataFrame({"a": words})

    got = gdf.a.str.slice(0, 2)
    expected = pdf.a.str.slice(0, 2)

    assert isinstance(got, Series)
    assert_eq(expected, got)
def test_groupby_cats():
    """Group by a categorical column and check each mean by hand.

    NOTE(review): another ``test_groupby_cats`` is defined further down in
    this file; the later definition rebinds the name.
    """
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host-side copies used to recompute every group mean.
    host_cats = df["cats"].values_host
    host_vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], as_index=False).mean()
    got_vals = grouped["vals"]
    got_cats = grouped["cats"]

    for row in range(len(got_vals)):
        in_group = host_cats == got_cats[row]
        expected_mean = host_vals[in_group].mean()
        np.testing.assert_almost_equal(got_vals[row], expected_mean)
def test_factorize_series_index():
    """Factorize a string column before and after setting a float index."""
    df = DataFrame()
    df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    df["col2"] = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    def _check_factorize(frame):
        # Codes (element 0) and uniques (element 1) must both match pandas.
        assert_eq(
            frame.col1.factorize()[0].get(),
            frame.to_pandas().col1.factorize()[0],
        )
        assert_eq(
            frame.col1.factorize()[1].to_pandas().values,
            frame.to_pandas().col1.factorize()[1].values,
        )

    _check_factorize(df)

    df = df.set_index("col2")

    _check_factorize(df)
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """Sort column "a" of a sliced frame and compare with pandas.

    ``nelem`` is the number of random rows and ``sliceobj`` is applied to
    both frames before the column is selected.
    """
    np.random.seed(0)
    pdf = pd.DataFrame()
    pdf["a"] = np.random.random(nelem)

    expected = pdf[sliceobj]["a"].sort_values()

    gdf = DataFrame.from_pandas(pdf)
    actual = gdf[sliceobj]["a"].sort_values()

    matches = actual.to_pandas() == expected
    assert matches.all()
def test_groupby_level_zero(agg):
    """Group by index level 0 and apply the aggregation named ``agg``.

    The dtype check is relaxed only for ``agg == "count"``.

    NOTE(review): another ``test_groupby_level_zero`` is defined further
    down in this file; the later definition rebinds the name.
    """
    pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[0, 1, 1])
    gdf = DataFrame.from_pandas(pdf)

    pandas_groups = pdf.groupby(level=0)
    cudf_groups = gdf.groupby(level=0)

    expected = getattr(pandas_groups, agg)()
    actual = getattr(cudf_groups, agg)()

    strict_dtype = agg != "count"
    assert_eq(expected, actual, check_dtype=strict_dtype)
def test_groupby_cats(method):
    """Group by a categorical column using ``method`` and check each mean."""
    df = DataFrame()
    df["cats"] = pd.Categorical(list("aabaacaab"))
    df["vals"] = np.random.random(len(df))

    # Host-side copies used to recompute every group mean.
    host_cats = np.asarray(list(df["cats"]))
    host_vals = df["vals"].to_array()

    grouped = df.groupby(["cats"], method=method, as_index=False).mean()
    group_means = grouped["vals"]
    group_keys = grouped["cats"]

    for key, mean in zip(group_keys, group_means):
        print(key, mean)
        in_group = host_cats == key
        expected_mean = host_vals[in_group].mean()
        np.testing.assert_almost_equal(mean, expected_mean)
def test_groupby_as_df():
    """Slice the ``as_df`` result by its offsets and check keys are constant."""
    np.random.seed(0)
    nelem = 20
    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    df, segs = df.groupby(["key1", "key2"], method="cudf").as_df()

    # Pair each start offset with the next one; the final slice is open.
    stops = list(segs[1:]) + [None]
    key_columns = ("key1", "key2")
    for start, stop in zip(segs, stops):
        segment_pdf = df[start:stop].to_pandas()
        for key in key_columns:
            key_values = segment_pdf[key].values
            # Every entry must equal the first one within a single segment.
            np.testing.assert_array_equal(key_values[0], key_values)
def test_groupby_level_zero(agg):
    """Group by index level 0 and apply the aggregation named ``agg``.

    The dtype check is relaxed for aggregations in ``_index_type_aggs``.
    """
    pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[2, 5, 5])
    gdf = DataFrame.from_pandas(pdf)

    pandas_groups = pdf.groupby(level=0)
    cudf_groups = gdf.groupby(level=0)

    expected = getattr(pandas_groups, agg)()
    actual = getattr(cudf_groups, agg)()

    strict_dtype = agg not in _index_type_aggs
    assert_groupby_results_equal(expected, actual, check_dtype=strict_dtype)
def test_numpy_non_contiguious():
    """``from_records`` must accept a record array with strided fields."""
    record_dtype = np.dtype([("index", np.int64), ("a", np.int32)])
    records = np.recarray(10, dtype=record_dtype)
    records.index = np.arange(30, 40)

    # Every second element of 0..19.
    aa = np.arange(20, dtype=np.int32)[::2]
    records.a = aa

    # Guard: the field view must not be C-contiguous for this test to matter.
    assert records.a.flags["C_CONTIGUOUS"] is False

    gdf = DataFrame.from_records(records, index="index")
    assert_eq(aa, gdf["a"].values)
def test_string_groupby_key(str_data, num_keys):
    """Group by ``num_keys`` identical string columns and compare counts."""
    # One value per input string, taken from the fixed list 1..5.
    other_data = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for key in range(num_keys):
        pdf[key] = pd.Series(str_data, dtype="str")
        gdf[key] = Series(str_data, dtype="str")
    pdf["a"] = other_data
    gdf["a"] = other_data

    expect = pdf.groupby(list(range(num_keys)), as_index=False).count()
    got = gdf.groupby(list(range(num_keys)), as_index=False).count()

    # Sort by the first key column so both results share one row order.
    expect = expect.sort_values([0]).reset_index(drop=True)
    got = got.sort_values([0]).reset_index(drop=True)

    assert_eq(expect, got, check_dtype=False)