def test_fillna_dataframe(fill_type, inplace):
    """Compare ``DataFrame.fillna`` with pandas for scalar, Series and dict fills."""
    pandas_frame = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    cudf_frame = DataFrame.from_pandas(pandas_frame)

    if fill_type == "scalar":
        pandas_fill = 5
        cudf_fill = pandas_fill
    elif fill_type == "series":
        pandas_fill = pd.Series([3, 4, 5])
        cudf_fill = Series.from_pandas(pandas_fill)
    else:
        # Any other fill_type exercises a per-column mapping.
        pandas_fill = {"a": 5, "b": pd.Series([3, 4, 5])}
        cudf_fill = {
            "a": pandas_fill["a"],
            "b": Series.from_pandas(pandas_fill["b"]),
        }

    # https://github.com/pandas-dev/pandas/issues/27197
    # pandas DataFrame.fillna with a Series fill value is not working, so in
    # that case the expected frame is assembled one column at a time.
    if not isinstance(pandas_fill, pd.Series):
        expect = pandas_frame.fillna(pandas_fill)
    else:
        expect = pd.DataFrame()
        for column in pandas_frame.columns:
            expect[column] = pandas_frame[column].fillna(pandas_fill)

    result = cudf_frame.fillna(cudf_fill, inplace=inplace)
    # When filling in place the frame itself is what gets compared.
    got = cudf_frame if inplace else result

    assert_eq(expect, got)
def test_string_series_compare(obj, cmpop, cmp_obj):
    """Apply ``cmpop`` to the cudf operands and to the originals, then compare."""
    # Only pandas Series operands are converted; anything else is used as-is.
    left = Series.from_pandas(obj) if isinstance(obj, pd.Series) else obj
    right = (
        Series.from_pandas(cmp_obj)
        if isinstance(cmp_obj, pd.Series)
        else cmp_obj
    )

    got = cmpop(left, right)
    expected = cmpop(obj, cmp_obj)

    utils.assert_eq(expected, got)
def test_series_where(data_dtype, fill_value):
    """Check ``Series.where`` against pandas for ``> 0``, ``< 0`` and ``== 0`` masks.

    When ``fill_value`` does not survive a round trip through the series'
    dtype, ``where`` is expected to raise ``TypeError`` instead.
    """
    psr = pd.Series(list(range(10)), dtype=data_dtype)
    sr = Series.from_pandas(psr)

    # True when casting fill_value to the series dtype changes its value.
    changes_on_cast = sr.dtype.type(fill_value) != fill_value

    mask_builders = (
        lambda s: s > 0,
        lambda s: s < 0,
        lambda s: s == 0,
    )
    for build_mask in mask_builders:
        if changes_on_cast:
            with pytest.raises(TypeError):
                sr.where(build_mask(sr), fill_value)
        else:
            # Cast back to original dtype as pandas automatically upcasts
            expect = psr.where(build_mask(psr), fill_value).astype(psr.dtype)
            got = sr.where(build_mask(sr), fill_value)
            assert_eq(expect, got)
def test_string_numeric_astype(dtype):
    """Cast bool/int/float/datetime series to ``str`` and compare with pandas."""
    is_datetime = dtype.startswith("datetime64")

    if dtype.startswith("bool"):
        data = [1, 0, 1, 0, 1]
    elif dtype.startswith("int"):
        data = [1, 2, 3, 4, 5]
    elif dtype.startswith("float"):
        data = [1.0, 2.0, 3.0, 4.0, 5.0]
    elif is_datetime:
        data = [1000000000, 2000000000, 3000000000, 4000000000, 5000000000]

    if is_datetime:
        ps = pd.Series(data, dtype="datetime64[ns]")
        gs = Series.from_pandas(ps)
        # Pandas datetime64 --> str typecasting returns arbitrary format
        # depending on the data, so making it consistent unless we choose to
        # match the behavior
        expect = ps.dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    else:
        ps = pd.Series(data, dtype=dtype)
        gs = Series(data, dtype=dtype)
        expect = ps.astype("str")

    got = gs.astype("str")

    assert_eq(expect, got)
def test_fillna_categorical(psr_data, fill_value, inplace):
    """Compare ``fillna`` results, or the raised exceptions on a dtype mismatch."""
    # Work on a deep copy so the incoming data is never modified.
    psr = psr_data.copy(deep=True)
    gsr = Series.from_pandas(psr)

    fill_value_cudf = (
        cudf.from_pandas(fill_value)
        if isinstance(fill_value, pd.Series)
        else fill_value
    )

    dtypes_differ = (
        isinstance(fill_value_cudf, cudf.Series)
        and gsr.dtype != fill_value_cudf.dtype
    )
    if not dtypes_differ:
        expected = psr.fillna(fill_value, inplace=inplace)
        got = gsr.fillna(fill_value_cudf, inplace=inplace)
        if inplace:
            # In-place calls are checked through the objects themselves.
            expected, got = psr, gsr
        assert_eq(expected, got)
        return

    # A Series fill value with a different dtype: both libraries are run
    # through assert_exceptions_equal instead of comparing results.
    assert_exceptions_equal(
        lfunc=psr.fillna,
        rfunc=gsr.fillna,
        lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}),
        rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}),
    )
def test_groupby_series_level_zero(agg):
    """Group a Series by index level 0 and compare aggregation ``agg`` with pandas."""
    pandas_series = pd.Series([1, 2, 3], index=[0, 1, 1])
    cudf_series = Series.from_pandas(pandas_series)

    pandas_grouped = pandas_series.groupby(level=0)
    cudf_grouped = cudf_series.groupby(level=0)

    expect = getattr(pandas_grouped, agg)()
    got = getattr(cudf_grouped, agg)()

    # The dtype comparison is skipped only for "count".
    assert_eq(expect, got, check_dtype=agg != "count")
def test_groupby_series_level_zero(agg):
    """Group a Series by index level 0 and compare aggregation ``agg`` with pandas."""
    pandas_series = pd.Series([1, 2, 3], index=[2, 5, 5])
    cudf_series = Series.from_pandas(pandas_series)

    pandas_grouped = pandas_series.groupby(level=0)
    cudf_grouped = cudf_series.groupby(level=0)

    expect = getattr(pandas_grouped, agg)()
    got = getattr(cudf_grouped, agg)()

    # The dtype comparison is skipped for aggregations in _index_type_aggs.
    assert_groupby_results_equal(
        expect, got, check_dtype=agg not in _index_type_aggs
    )
def test_string_table_view_creation():
    """Slice the first row of a mostly-null string series and compare with pandas."""
    values = ["hi"] * 25 + [None] * 2027
    psr = pd.Series(values)
    gsr = Series.from_pandas(psr)

    assert_eq(psr[:1], gsr[:1])
def test_fillna_string(fill_type, inplace):
    """Fill nulls in a string series with a scalar or a Series and compare with pandas."""
    psr = pd.Series(["z", None, "z", None])

    if fill_type == "scalar":
        fill_value_pd = "a"
        fill_value_cudf = fill_value_pd
    elif fill_type == "series":
        fill_value_pd = pd.Series(["a", "b", "c", "d"])
        fill_value_cudf = Series.from_pandas(fill_value_pd)

    sr = Series.from_pandas(psr)

    expect = psr.fillna(fill_value_pd)
    result = sr.fillna(fill_value_cudf, inplace=inplace)

    # When filling in place the series itself is what gets compared.
    got = sr if inplace else result

    assert_eq(expect, got)
def test_series_clip(data, lower, upper, inplace):
    """Compare ``Series.clip`` with pandas, for both in-place and copying calls.

    Parameters
    ----------
    data
        Values used to build the pandas series under test.
    lower, upper
        Clip bounds forwarded unchanged to both libraries.
    inplace
        Forwarded to the cudf ``clip`` call; when it is ``True`` the clipped
        cudf series itself is compared instead of the return value.
    """
    psr = pd.Series(data)
    # Convert the pandas Series rather than the raw ``data``: ``from_pandas``
    # is meant to receive a pandas object, and this keeps both sides built
    # from the same source.
    gsr = Series.from_pandas(psr)

    expect = psr.clip(lower=lower, upper=upper)
    got = gsr.clip(lower=lower, upper=upper, inplace=inplace)

    if inplace is True:
        assert_eq(expect, gsr)
    else:
        assert_eq(expect, got)
def test_series_replace():
    """Exercise ``Series.replace`` on numeric, categorical, list and large inputs."""
    # Numerical
    numbers = Series(np.array([0, 1, 2, 3, 4]))
    swapped = numbers.replace(0, 5)
    assert_eq(np.array([5, 1, 2, 3, 4]), swapped.to_array())

    # Categorical
    pandas_cat = pd.Series(["one", "two", "three"], dtype="category")
    expect_existing = pandas_cat.replace("one", "two")
    cudf_cat = Series.from_pandas(pandas_cat)
    got_existing = cudf_cat.replace("one", "two")
    assert_eq(expect_existing, got_existing)

    expect_new = pandas_cat.replace("one", "five")
    got_new = cudf_cat.replace("one", "five")
    assert_eq(expect_new, got_new)

    # List input
    from_lists = numbers.replace([0, 1], [5, 6])
    assert_eq(np.array([5, 6, 2, 3, 4]), from_lists.to_array())

    with pytest.raises(TypeError):
        numbers.replace([0, 1], [5.5, 6.5])

    # Series input
    from_array = numbers.replace(numbers[:3].to_array(), 5)
    assert_eq(np.array([5, 5, 5, 3, 4]), from_array.to_array())

    targets = (22, 323, 27, 0)

    # large input containing null
    large = Series(list(range(400)) + [None])
    for new_value, null_total in ((None, 5), (-1, 1)):
        replaced = large.replace(list(targets), new_value)
        assert replaced.null_count == null_total
        assert len(replaced.to_array()) == (401 - null_total)

    # large input not containing nulls
    large = large.fillna(-11)
    for new_value, null_total in ((None, 4), (-1, 0)):
        replaced = large.replace(list(targets), new_value)
        assert replaced.null_count == null_total
        assert len(replaced.to_array()) == (401 - null_total)
def test_string_replace_multi():
    """Compare list-based ``str.replace`` in cudf with the pandas equivalent."""
    # Two patterns in one cudf call versus two chained pandas calls.
    pandas_words = pd.Series(["hello", "goodbye"])
    cudf_words = Series(["hello", "goodbye"])

    expect = pandas_words.str.replace("e", "E").str.replace("o", "O")
    got = cudf_words.str.replace(["e", "o"], ["E", "O"])
    assert_eq(expect, got)

    # A single-element pattern list, once with regex=True and once with
    # regex=False.
    cases = (
        (["foo", "fuz", np.nan], True),
        (["f.o", "fuz", np.nan], False),
    )
    for values, use_regex in cases:
        ps = pd.Series(values)
        gs = Series.from_pandas(ps)

        expect = ps.str.replace("f.", "ba", regex=use_regex)
        got = gs.str.replace(["f."], ["ba"], regex=use_regex)
        assert_eq(expect, got)
def test_to_from_pandas_nulls(data, nulls):
    """Round-trip a pandas series with NaT entries through ``from_pandas``/``to_pandas``."""
    pd_data = pd.Series(data.copy())
    not_a_time = np.datetime64("nat", "ns")

    if nulls == "some":
        # Fill half the values with NaT
        every_other = list(range(0, len(pd_data), 2))
        pd_data[every_other] = not_a_time
    elif nulls == "all":
        # Fill all the values with NaT
        pd_data[:] = not_a_time

    round_tripped = Series.from_pandas(pd_data).to_pandas()

    assert_eq(pd_data, round_tripped)
def test_series_replace():
    """Exercise ``Series.replace`` on numeric, categorical, list and large inputs."""
    # Numerical
    numbers = Series(np.array([0, 1, 2, 3, 4]))
    swapped = numbers.replace(0, 5)
    np.testing.assert_equal(swapped.to_array(), np.array([5, 1, 2, 3, 4]))

    # Categorical
    pandas_cat = pd.Series(["one", "two", "three"], dtype="category")
    expect_cat = pandas_cat.replace("one", "two")
    cudf_cat = Series.from_pandas(pandas_cat)
    got_cat = cudf_cat.replace("one", "two")
    pd.testing.assert_series_equal(got_cat.to_pandas(), expect_cat)

    # List input
    int_lists = numbers.replace([0, 1], [5, 6])
    np.testing.assert_equal(int_lists.to_array(), np.array([5, 6, 2, 3, 4]))

    float_lists = numbers.replace([0, 1], [5.5, 6.5])
    np.testing.assert_equal(
        float_lists.to_array(), np.array([5.5, 6.5, 2, 3, 4])
    )

    # Series input
    from_series = numbers.replace(numbers[:3], 5)
    np.testing.assert_equal(
        from_series.to_array(), np.array([5, 5, 5, 3, 4])
    )

    targets = (22, 323, 27, 0)

    # large input containing null
    large = Series(list(range(400)) + [None])
    for new_value, null_total in ((None, 5), (-1, 1)):
        replaced = large.replace(list(targets), new_value)
        assert replaced.null_count == null_total
        assert len(replaced.to_array()) == (401 - null_total)

    # large input not containing nulls
    large = large.fillna(-11)
    for new_value, null_total in ((None, 4), (-1, 0)):
        replaced = large.replace(list(targets), new_value)
        assert replaced.null_count == null_total
        assert len(replaced.to_array()) == (401 - null_total)
def test_series_where(data_dtype, fill_value):
    """Check ``Series.where`` against pandas for ``> 0``, ``< 0`` and ``== 0`` masks."""
    psr = pd.Series(list(range(10)), dtype=data_dtype)
    sr = Series.from_pandas(psr)

    mask_builders = (
        lambda s: s > 0,
        lambda s: s < 0,
        lambda s: s == 0,
    )
    for build_mask in mask_builders:
        expect = psr.where(build_mask(psr), fill_value)
        got = sr.where(build_mask(sr), fill_value)
        assert_eq(expect, got)
def test_series_with_nulls_where(fill_value):
    """Check ``Series.where`` against pandas on a series that starts with nulls."""
    psr = pd.Series([None] * 3 + list(range(5)))
    sr = Series.from_pandas(psr)

    mask_builders = (
        lambda s: s > 0,
        lambda s: s < 0,
        lambda s: s == 0,
    )
    for build_mask in mask_builders:
        expect = psr.where(build_mask(psr), fill_value)
        got = sr.where(build_mask(sr), fill_value)
        assert_eq(expect, got)
def test_fillna_categorical(psr, fill_value, inplace):
    """Compare categorical ``fillna`` between cudf and pandas.

    Parameters
    ----------
    psr
        pandas series containing the values to fill.
    fill_value
        Scalar or pandas Series used as the fill value; a pandas Series is
        converted with ``cudf.from_pandas`` for the cudf call.
    inplace
        Forwarded to both ``fillna`` calls; when truthy the filled objects
        themselves are compared instead of the return values.
    """
    # Work on a private deep copy: with ``inplace=True`` pandas ``fillna``
    # mutates the object it is called on, and the object passed in by the
    # caller must not be modified (it may be reused by other test cases).
    psr = psr.copy(deep=True)
    gsr = Series.from_pandas(psr)

    if isinstance(fill_value, pd.Series):
        fill_value_cudf = cudf.from_pandas(fill_value)
    else:
        fill_value_cudf = fill_value

    expected = psr.fillna(fill_value, inplace=inplace)
    got = gsr.fillna(fill_value_cudf, inplace=inplace)

    if inplace:
        expected = psr
        got = gsr

    assert_eq(expected, got)
def test_fillna_datetime(fill_type, inplace):
    """Fill nulls in a datetime series with a scalar or a Series and compare with pandas."""
    dates = pd.date_range("2010-01-01", "2020-01-10", freq="1y")
    psr = pd.Series(dates)

    # The Series fill value is derived before any entries are nulled out.
    if fill_type == "scalar":
        fill_value = pd.Timestamp("2010-01-02")
    elif fill_type == "series":
        fill_value = psr + pd.Timedelta("1d")

    psr[[5, 9]] = None
    sr = Series.from_pandas(psr)

    expect = psr.fillna(fill_value)
    result = sr.fillna(fill_value, inplace=inplace)

    # When filling in place the series itself is what gets compared.
    got = sr if inplace else result

    assert_eq(expect, got)
def test_fillna_categorical(fill_type, null_value, inplace):
    """Fill nulls in a categorical series and compare with hand-written expectations."""

    def as_category(values):
        # Build a pandas categorical series from a plain list.
        return pd.Series(values, dtype="category")

    data = as_category(["a", "b", "a", null_value, "c", null_value])
    sr = Series.from_pandas(data)

    if fill_type == "scalar":
        fill_value = "c"
        expect = as_category(["a", "b", "a", "c", "c", "c"])
    elif fill_type == "series":
        fill_value = as_category(["c", "c", "c", "c", "c", "a"])
        expect = as_category(["a", "b", "a", "c", "c", "a"])

    result = sr.fillna(fill_value, inplace=inplace)

    # When filling in place the series itself is what gets compared.
    got = sr if inplace else result

    assert_eq(expect, got)
def test_numeric_series_replace_dtype(series_dtype, replacement):
    """Check ``Series.replace`` results and errors for a given dtype/replacement pair."""
    psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
    sr = Series.from_pandas(psr)

    # True when casting the replacement to the series dtype changes its value.
    changes_on_cast = sr.dtype.type(replacement) != replacement

    # Both Scalar
    if changes_on_cast:
        with pytest.raises(TypeError):
            sr.replace(1, replacement)
    else:
        expect = psr.replace(1, replacement).astype(psr.dtype)
        got = sr.replace(1, replacement)
        assert_eq(expect, got)

    # to_replace is a list, replacement is a scalar
    if changes_on_cast:
        with pytest.raises(TypeError):
            sr.replace([2, 3], replacement)
    else:
        expect = psr.replace([2, 3], replacement).astype(psr.dtype)
        got = sr.replace([2, 3], replacement)
        assert_eq(expect, got)

    # If to_replace is a scalar and replacement is a list
    with pytest.raises(TypeError):
        sr.replace(0, [replacement, 2])

    # Both list of unequal length
    with pytest.raises(ValueError):
        sr.replace([0, 1], [replacement])

    # Both lists of equal length
    float_into_integer = (
        np.dtype(type(replacement)).kind == "f"
        and sr.dtype.kind in {"i", "u"}
    )
    if float_into_integer or changes_on_cast:
        with pytest.raises(TypeError):
            sr.replace([2, 3], [replacement, replacement])
    else:
        pandas_result = psr.replace([2, 3], [replacement, replacement])
        expect = pandas_result.astype(psr.dtype)
        got = sr.replace([2, 3], [replacement, replacement])
        assert_eq(expect, got)
def test_replace_inplace():
    """Compare in-place and copying ``replace`` between cudf and pandas objects."""
    data = np.array([5, 1, 2, 3, 4])

    # In-place scalar replacement on a numeric series; the copies taken
    # beforehand are compared as well.
    cu_sr = Series(data)
    pd_sr = pd.Series(data)
    cu_sr_copy = cu_sr.copy()
    pd_sr_copy = pd_sr.copy()
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)
    cu_sr.replace(5, 0, inplace=True)
    pd_sr.replace(5, 0, inplace=True)
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)

    # Dict replacement without inplace: the results are discarded and the
    # series are compared again.
    cu_sr = Series(data)
    pd_sr = pd.Series(data)
    cu_sr_copy = cu_sr.copy()
    pd_sr_copy = pd_sr.copy()
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)
    cu_sr.replace({5: 0, 3: -5})
    pd_sr.replace({5: 0, 3: -5})
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)

    # replace() called with no arguments at all.
    cu_no_args = cu_sr.replace()
    pd_no_args = pd_sr.replace()
    assert_eq(cu_no_args, pd_no_args)

    # In-place replacement on a categorical series.
    pd_sr = pd.Series(["one", "two", "three"], dtype="category")
    cu_sr = Series.from_pandas(pd_sr)
    cu_sr_copy = cu_sr.copy()
    pd_sr_copy = pd_sr.copy()
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)
    cu_sr.replace("one", "two", inplace=True)
    pd_sr.replace("one", "two", inplace=True)
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)

    # In-place replacement on a two-column frame.
    pd_frame = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]})
    cu_frame = DataFrame.from_pandas(pd_frame)
    pd_frame_copy = pd_frame.copy()
    cu_frame_copy = cu_frame.copy()
    assert_eq(pd_frame, cu_frame)
    assert_eq(pd_frame_copy, cu_frame_copy)
    pd_frame.replace(5, 0, inplace=True)
    cu_frame.replace(5, 0, inplace=True)
    assert_eq(pd_frame, cu_frame)
    assert_eq(pd_frame_copy, cu_frame_copy)

    # An empty array of values to replace, both copying and in place.
    pd_ints = pd.Series([1, 2, 3, 45])
    cu_ints = Series.from_pandas(pd_ints)
    no_values = np.array([]).astype(int)

    assert_eq(pd_ints.replace(no_values, -1), cu_ints.replace(no_values, -1))

    pd_ints.replace(no_values, 77, inplace=True)
    cu_ints.replace(no_values, 77, inplace=True)
    assert_eq(pd_ints, cu_ints)

    # Per-column dict arguments and empty lists on a single-column frame.
    pd_frame = pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]})
    cu_frame = DataFrame.from_pandas(pd_frame)

    assert_eq(
        pd_frame.replace({"a": 2}, {"a": -33}),
        cu_frame.replace({"a": 2}, {"a": -33}),
    )
    assert_eq(
        pd_frame.replace({"a": [2, 5]}, {"a": [9, 10]}),
        cu_frame.replace({"a": [2, 5]}, {"a": [9, 10]}),
    )
    assert_eq(
        pd_frame.replace([], []),
        cu_frame.replace([], []),
    )

    # Scalar to_replace with an empty list value: both libraries go through
    # assert_exceptions_equal, with error-message comparison turned off.
    assert_exceptions_equal(
        lfunc=pd_frame.replace,
        rfunc=cu_frame.replace,
        lfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}),
        rfunc_args_and_kwargs=([], {"to_replace": -1, "value": []}),
        compare_error_message=False,
    )
def test_replace_inplace():
    """Compare in-place and copying ``replace`` between cudf and pandas objects."""
    data = np.array([5, 1, 2, 3, 4])

    # In-place scalar replacement on a numeric series; the copies taken
    # beforehand are compared as well.
    cu_sr = Series(data)
    pd_sr = pd.Series(data)
    cu_sr_copy = cu_sr.copy()
    pd_sr_copy = pd_sr.copy()
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)
    cu_sr.replace(5, 0, inplace=True)
    pd_sr.replace(5, 0, inplace=True)
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)

    # Dict replacement without inplace: the results are discarded and the
    # series are compared again.
    cu_sr = Series(data)
    pd_sr = pd.Series(data)
    cu_sr_copy = cu_sr.copy()
    pd_sr_copy = pd_sr.copy()
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)
    cu_sr.replace({5: 0, 3: -5})
    pd_sr.replace({5: 0, 3: -5})
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)

    # replace() called with no arguments at all.
    cu_no_args = cu_sr.replace()
    pd_no_args = pd_sr.replace()
    assert_eq(cu_no_args, pd_no_args)

    # In-place replacement on a categorical series.
    pd_sr = pd.Series(["one", "two", "three"], dtype="category")
    cu_sr = Series.from_pandas(pd_sr)
    cu_sr_copy = cu_sr.copy()
    pd_sr_copy = pd_sr.copy()
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)
    cu_sr.replace("one", "two", inplace=True)
    pd_sr.replace("one", "two", inplace=True)
    assert_eq(cu_sr, pd_sr)
    assert_eq(cu_sr_copy, pd_sr_copy)

    # In-place replacement on a two-column frame.
    pd_frame = pd.DataFrame({"A": [0, 1, 2, 3, 4], "B": [5, 6, 7, 8, 9]})
    cu_frame = DataFrame.from_pandas(pd_frame)
    pd_frame_copy = pd_frame.copy()
    cu_frame_copy = cu_frame.copy()
    assert_eq(pd_frame, cu_frame)
    assert_eq(pd_frame_copy, cu_frame_copy)
    pd_frame.replace(5, 0, inplace=True)
    cu_frame.replace(5, 0, inplace=True)
    assert_eq(pd_frame, cu_frame)
    assert_eq(pd_frame_copy, cu_frame_copy)

    # An empty array of values to replace, both copying and in place.
    pd_ints = pd.Series([1, 2, 3, 45])
    cu_ints = Series.from_pandas(pd_ints)
    no_values = np.array([]).astype(int)

    assert_eq(pd_ints.replace(no_values, -1), cu_ints.replace(no_values, -1))

    pd_ints.replace(no_values, 77, inplace=True)
    cu_ints.replace(no_values, 77, inplace=True)
    assert_eq(pd_ints, cu_ints)

    # Per-column dict arguments and empty lists on a single-column frame.
    pd_frame = pd.DataFrame({"a": [1, 2, 3, 4, 5, 666]})
    cu_frame = DataFrame.from_pandas(pd_frame)

    assert_eq(
        pd_frame.replace({"a": 2}, {"a": -33}),
        cu_frame.replace({"a": 2}, {"a": -33}),
    )
    assert_eq(
        pd_frame.replace({"a": [2, 5]}, {"a": [9, 10]}),
        cu_frame.replace({"a": [2, 5]}, {"a": [9, 10]}),
    )
    assert_eq(
        pd_frame.replace([], []),
        cu_frame.replace([], []),
    )

    # Scalar to_replace with an empty list value must raise TypeError in
    # both libraries.
    with pytest.raises(TypeError):
        pd_frame.replace(-1, [])

    with pytest.raises(TypeError):
        cu_frame.replace(-1, [])