def test_cat_series_binop_error():
    """Binary ops mixing categorical and numerical series fail like pandas."""
    df = cudf.DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    cat_sr = df["a"]
    num_sr = df["b"]

    # categorical operand on the left-hand side
    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([cat_sr, num_sr],),
        rfunc_args_and_kwargs=([cat_sr, num_sr],),
        check_exception_type=False,
        expected_error_message="Series of dtype `category` cannot "
        "perform the operation: add",
    )

    # numerical operand on the left-hand side
    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([num_sr, cat_sr],),
        rfunc_args_and_kwargs=([num_sr, cat_sr],),
        check_exception_type=False,
        expected_error_message="'add' operator not supported",
    )
def test_series_drop_raises():
    """Series.drop on a label absent from the index raises like pandas."""
    gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c")
    ps = gs.to_pandas()

    # a string label and an integer label, neither present in the index
    for label in ["p", 3]:
        assert_exceptions_equal(
            lfunc=ps.drop,
            rfunc=gs.drop,
            lfunc_args_and_kwargs=([label],),
            rfunc_args_and_kwargs=([label],),
            expected_error_message="One or more values not found in axis",
        )

    # errors="ignore" suppresses the lookup failure on both sides
    expect = ps.drop("p", errors="ignore")
    actual = gs.drop("p", errors="ignore")
    assert_eq(actual, expect)
def test_categorical_compare_unordered():
    """Equality comparisons work on unordered categoricals; ordered ops raise.

    Fix: replaced the brittle ``type(out[0]) == np.bool_`` comparison with
    ``isinstance`` — comparing types with ``==`` is an anti-pattern and the
    intent here is only that a boolean scalar comes back.
    """
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    pdsr = pd.Series(cat)
    sr = cudf.Series(cat)

    # equality: every element compares equal to itself
    out = sr == sr
    assert out.dtype == np.bool_
    assert isinstance(out[0], np.bool_)
    assert np.all(out.to_numpy())
    assert np.all(pdsr == pdsr)

    # inequality: nothing differs from itself
    out = sr != sr
    assert not np.any(out.to_numpy())
    assert not np.any(pdsr != pdsr)

    # both series are unordered
    assert not pdsr.cat.ordered
    assert not sr.cat.ordered

    # ordered comparison operators must raise on unordered categoricals
    assert_exceptions_equal(
        lfunc=operator.lt,
        rfunc=operator.lt,
        lfunc_args_and_kwargs=([pdsr, pdsr],),
        rfunc_args_and_kwargs=([sr, sr],),
    )
def test_timedelta_datetime_cast_invalid():
    """Casting between timedelta64[ns] and datetime64[ns] must raise.

    NOTE(review): a function with this exact name is defined again later in
    this file; under pytest the later definition shadows this one, so this
    version (which also checks the error messages) never runs — confirm and
    rename one of them.
    """
    # timedelta -> datetime astype is invalid in both libraries
    sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
    psr = sr.to_pandas()
    assert_exceptions_equal(
        psr.astype,
        sr.astype,
        (["datetime64[ns]"], ),
        (["datetime64[ns]"], ),
        expected_error_message=re.escape(
            "cannot astype a timedelta from timedelta64[ns] to datetime64[ns]"
        ),
    )
    # datetime -> timedelta astype is equally invalid
    sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
    psr = sr.to_pandas()
    assert_exceptions_equal(
        psr.astype,
        sr.astype,
        (["timedelta64[ns]"], ),
        (["timedelta64[ns]"], ),
        expected_error_message=re.escape("cannot astype a datetimelike from "
                                         "datetime64[ns] to timedelta64[ns]"),
    )
def test_fillna_categorical(psr_data, fill_value, inplace):
    """fillna on categorical series matches pandas, including the error path
    when the fill Series carries a mismatched dtype."""
    psr = psr_data.copy(deep=True)
    gsr = cudf.from_pandas(psr)

    fill_value_cudf = (
        cudf.from_pandas(fill_value)
        if isinstance(fill_value, pd.Series)
        else fill_value
    )

    dtype_mismatch = (
        isinstance(fill_value_cudf, cudf.Series)
        and gsr.dtype != fill_value_cudf.dtype
    )

    if dtype_mismatch:
        # a fill Series of a different dtype must raise on both sides
        assert_exceptions_equal(
            lfunc=psr.fillna,
            rfunc=gsr.fillna,
            lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}),
            rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}),
        )
    else:
        expected = psr.fillna(fill_value, inplace=inplace)
        got = gsr.fillna(fill_value_cudf, inplace=inplace)
        if inplace:
            # in-place fill returns None; compare the mutated series instead
            expected = psr
            got = gsr
        assert_eq(expected, got)
def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op):
    """Datetime series +/- scalar matches pandas; scalar - datetime raises.

    Fix: an unrecognized ``op`` previously fell through the if/elif chain and
    surfaced as a confusing NameError on ``expected``; it now fails with an
    explicit ValueError.
    """
    gsr = cudf.Series(data=data, dtype=dtype)
    psr = gsr.to_pandas()

    # series <op> scalar
    if op == "add":
        expected = psr + other_scalars
        actual = gsr + other_scalars
    elif op == "sub":
        expected = psr - other_scalars
        actual = gsr - other_scalars
    else:
        raise ValueError(f"unsupported op: {op}")

    assert_eq(expected, actual)

    # scalar <op> series
    if op == "add":
        expected = other_scalars + psr
        actual = other_scalars + gsr
        assert_eq(expected, actual)
    elif op == "sub":
        # scalar - datetime is invalid; both libraries must raise
        assert_exceptions_equal(
            lfunc=operator.sub,
            rfunc=operator.sub,
            lfunc_args_and_kwargs=([other_scalars, psr],),
            rfunc_args_and_kwargs=([other_scalars, gsr],),
            compare_error_message=False,
        )
def test_categorical_reductions(op):
    """Reductions on a categorical series raise in both pandas and cudf."""
    gsr = cudf.Series([1, 2, 3, None], dtype="category")
    psr = gsr.to_pandas()

    utils.assert_exceptions_equal(
        lfunc=getattr(psr, op),
        rfunc=getattr(gsr, op),
        compare_error_message=False,
    )
def test_column_set_unequal_length_object_by_mask():
    """Mask assignment with a replacement of the wrong length must raise."""
    data = [1, 2, 3, 4, 5]
    mask = [True, True, False, True, False]

    # one replacement shorter and one longer than the original attempts;
    # fresh series for each attempt so failures cannot leak state
    for replacement in ([8, 9], [8, 9, 10, 11]):
        psr = pd.Series(data)
        gsr = cudf.Series(data)
        assert_exceptions_equal(
            psr.__setitem__,
            gsr.__setitem__,
            ([mask, replacement], {}),
            ([mask, replacement], {}),
            compare_error_message=False,
        )
def test_timedelta_ops_datetime_inputs(
    datetime_data, timedelta_data, datetime_dtype, timedelta_dtype, ops
):
    """datetime <op> timedelta matches pandas; timedelta - datetime raises."""
    gsr_datetime = cudf.Series(datetime_data, dtype=datetime_dtype)
    gsr_timedelta = cudf.Series(timedelta_data, dtype=timedelta_dtype)
    psr_datetime = gsr_datetime.to_pandas()
    psr_timedelta = gsr_timedelta.to_pandas()

    # datetime <ops> timedelta is always valid
    assert_eq(
        getattr(psr_datetime, ops)(psr_timedelta),
        getattr(gsr_datetime, ops)(gsr_timedelta),
    )

    if ops == "add":
        # addition commutes, so timedelta + datetime also works
        assert_eq(
            getattr(psr_timedelta, ops)(psr_datetime),
            getattr(gsr_timedelta, ops)(gsr_datetime),
        )
    elif ops == "sub":
        # timedelta - datetime is undefined and must raise on both sides
        assert_exceptions_equal(
            lfunc=operator.sub,
            rfunc=operator.sub,
            lfunc_args_and_kwargs=([psr_timedelta, psr_datetime],),
            rfunc_args_and_kwargs=([gsr_timedelta, gsr_datetime],),
            expected_error_message=re.escape(
                f"Subtraction of {gsr_timedelta.dtype} with "
                f"{gsr_datetime.dtype} cannot be performed."
            ),
        )
def test_categorical_remove_categories(pd_str_cat, inplace):
    """remove_categories mirrors pandas, including failure on unknown labels."""
    pd_sr = pd.Series(pd_str_cat.copy())
    cd_sr = cudf.Series(pd_str_cat.copy())

    assert_eq(pd_sr, cd_sr)
    assert str(pd_sr) == str(cd_sr)

    pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace)
    cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace)

    # in-place calls return None; fall back to the mutated series
    if pd_sr_1 is None:
        pd_sr_1 = pd_sr
    if cd_sr_1 is None:
        cd_sr_1 = cd_sr

    assert "a" not in pd_sr_1.cat.categories.to_list()
    assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list()
    assert_eq(pd_sr_1, cd_sr_1)

    # removing a category that never existed ("d") must raise on both sides
    assert_exceptions_equal(
        lfunc=cd_sr.to_pandas().cat.remove_categories,
        rfunc=cd_sr.cat.remove_categories,
        lfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}),
        rfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}),
        expected_error_message="removals must all be in old categories",
    )
def test_to_datetime_errors(data):
    """to_datetime failures match between equivalent pandas and cudf inputs."""
    pd_data = data
    # mirror pandas containers into cudf; pass scalars/other types through
    gd_data = (
        cudf.from_pandas(pd_data)
        if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index))
        else pd_data
    )

    assert_exceptions_equal(
        pd.to_datetime,
        cudf.to_datetime,
        ([pd_data],),
        ([gd_data],),
    )
def test_multiindex_rename_error(names):
    """MultiIndex.rename with an invalid `names` argument raises identically."""
    pidx = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]])
    gidx = cudf.from_pandas(pidx)

    assert_exceptions_equal(
        lfunc=pidx.rename,
        rfunc=gidx.rename,
        lfunc_args_and_kwargs=([], {"names": names}),
        rfunc_args_and_kwargs=([], {"names": names}),
    )
def test_series_fillna_error():
    """Filling a Series with a DataFrame value is rejected by both libraries."""
    psr = pd.Series([1, 2, None, 3, None])
    gsr = cudf.from_pandas(psr)

    assert_exceptions_equal(
        lfunc=psr.fillna,
        rfunc=gsr.fillna,
        lfunc_args_and_kwargs=([pd.DataFrame({"a": [1, 2, 3]})],),
        rfunc_args_and_kwargs=([cudf.DataFrame({"a": [1, 2, 3]})],),
    )
def test_categorical_setitem_invalid():
    """Assigning a value outside the existing categories raises on both sides."""
    ps = pd.Series([1, 2, 3], dtype="category")
    gs = cudf.Series([1, 2, 3], dtype="category")

    # 5 is not one of the categories, so position 0 cannot take it
    args_and_kwargs = ([0, 5], {})
    assert_exceptions_equal(
        lfunc=ps.__setitem__,
        rfunc=gs.__setitem__,
        lfunc_args_and_kwargs=args_and_kwargs,
        rfunc_args_and_kwargs=args_and_kwargs,
    )
def test_interpolate_dataframe_error_cases(data, kwargs):
    """Invalid DataFrame.interpolate arguments raise the same way as pandas.

    Fix: the locals were named ``gsr``/``psr`` (series) even though they hold
    DataFrames; renamed to ``gdf``/``pdf`` for clarity.
    """
    gdf = cudf.DataFrame(data)
    pdf = gdf.to_pandas()

    assert_exceptions_equal(
        lfunc=pdf.interpolate,
        rfunc=gdf.interpolate,
        lfunc_args_and_kwargs=([], kwargs),
        rfunc_args_and_kwargs=([], kwargs),
    )
def test_datetime_to_datetime_error():
    """A format string passed positionally lands in the `errors` slot and is
    rejected identically by pandas and cudf."""
    assert_exceptions_equal(
        lfunc=pd.to_datetime,
        rfunc=cudf.to_datetime,
        lfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],),
        rfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],),
        check_exception_type=False,
        expected_error_message=re.escape(
            "errors parameter has to be either one of: ['ignore', 'raise', "
            "'coerce', 'warn'], found: %d-%B-%Y %H:%M"
        ),
    )
def test_duplicated_with_misspelled_column_name(subset):
    """drop_duplicates with a nonexistent column name errors on both sides."""
    pdf = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
    gdf = cudf.DataFrame.from_pandas(pdf)

    assert_exceptions_equal(
        lfunc=pdf.drop_duplicates,
        rfunc=gdf.drop_duplicates,
        lfunc_args_and_kwargs=([subset],),
        rfunc_args_and_kwargs=([subset],),
        compare_error_message=False,
    )
def test_add_categories_error(data, add):
    """cat.add_categories with invalid additions raises like pandas."""
    pds = pd.Series(data, dtype="category")
    gds = cudf.Series(data, dtype="category")

    add_args = ([add],)
    assert_exceptions_equal(
        pds.cat.add_categories,
        gds.cat.add_categories,
        add_args,
        add_args,
        compare_error_message=False,
    )
def test_multiindex_loc_rows_0(pdf, gdf, pdfIndex):
    """.loc lookup of a key not resolvable in the MultiIndex raises identically."""
    gdfIndex = cudf.from_pandas(pdfIndex)
    pdf.index = pdfIndex
    gdf.index = gdfIndex

    # tuple key plus full-row slice, expected to fail the lookup on both sides
    key = (("d",), slice(None, None, None))
    assert_exceptions_equal(
        lfunc=pdf.loc.__getitem__,
        rfunc=gdf.loc.__getitem__,
        lfunc_args_and_kwargs=([key],),
        rfunc_args_and_kwargs=([key],),
    )
def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype):
    """An invalid numeric-to-numeric view raises matching 'Can not divide'
    errors on the CPU (numpy) and GPU (cudf column) sides."""
    cpu_data = np.asarray(data, dtype=from_dtype)
    gpu_data = as_column(data, dtype=from_dtype)

    view_args = ([to_dtype],)
    assert_exceptions_equal(
        lfunc=cpu_data.view,
        rfunc=gpu_data.view,
        lfunc_args_and_kwargs=view_args,
        rfunc_args_and_kwargs=view_args,
        expected_error_message="Can not divide",
    )
def test_dataframe_loc_outbound():
    """.loc with a label beyond the index raises on both cudf and pandas."""
    size = 10
    ha = np.random.randint(low=0, high=100, size=size).astype(np.int32)
    hb = np.random.random(size).astype(np.float32)

    df = cudf.DataFrame()
    df["a"] = ha
    df["b"] = hb

    pdf = pd.DataFrame()
    pdf["a"] = ha
    pdf["b"] = hb

    # label 11 is past the 10-row frame
    assert_exceptions_equal(lambda: pdf.loc[11], lambda: df.loc[11])
def test_multiindex_set_names_error(level, names):
    """set_names with inconsistent level/names arguments raises identically."""
    pidx = pd.MultiIndex.from_product(
        [["python", "cobra"], [2018, 2019], ["aab", "bcd"]]
    )
    gidx = cudf.from_pandas(pidx)

    kwargs = {"names": names, "level": level}
    assert_exceptions_equal(
        lfunc=pidx.set_names,
        rfunc=gidx.set_names,
        lfunc_args_and_kwargs=([], kwargs),
        rfunc_args_and_kwargs=([], kwargs),
    )
def test_timedelta_datetime_cast_invalid():
    """Casting between timedelta64[ns] and datetime64[ns] must raise.

    NOTE(review): this redefines a function of the same name that appears
    earlier in the file; under pytest only this later definition runs, and it
    drops the earlier version's expected_error_message checks — confirm which
    one is intended and rename the other.
    """
    # timedelta -> datetime astype is invalid in both libraries
    sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
    psr = sr.to_pandas()
    assert_exceptions_equal(psr.astype, sr.astype, (["datetime64[ns]"], ),
                            (["datetime64[ns]"], ))
    # datetime -> timedelta astype is equally invalid
    sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
    psr = sr.to_pandas()
    assert_exceptions_equal(psr.astype, sr.astype, (["timedelta64[ns]"], ),
                            (["timedelta64[ns]"], ))
def test_dataframe_drop_duplicates_method():
    """drop_duplicates matches pandas across subsets, keep=, inplace and
    error cases."""
    pdf = DataFrame(
        [(1, 2, "a"), (2, 3, "b"), (3, 4, "c"), (2, 3, "d"), (3, 5, "c")],
        columns=["n1", "n2", "s1"],
    )
    gdf = cudf.DataFrame.from_pandas(pdf)

    # full-row dedup
    assert_df(gdf.drop_duplicates(), pdf.drop_duplicates())

    # single-column dedup, compared as a plain column
    for col in ("n1", "n2", "s1"):
        assert_eq(
            gdf.drop_duplicates(col)[col].reset_index(drop=True),
            pdf.drop_duplicates(col)[col].reset_index(drop=True),
        )

    # keep="last": the cudf side is sorted to line up row order with pandas
    assert_eq(
        gdf.drop_duplicates("s1", keep="last")["s1"]
        .sort_index()
        .reset_index(drop=True),
        pdf.drop_duplicates("s1", keep="last")["s1"].reset_index(drop=True),
    )

    # inplace=True mutates the frame and returns None
    assert gdf.drop_duplicates("s1", inplace=True) is None

    # rebuild after the in-place mutation, then compare whole frames
    gdf = cudf.DataFrame.from_pandas(pdf)
    for subset in ("n1", "n2", "s1", ["n1", "n2"], ["n1", "s1"]):
        assert_df(gdf.drop_duplicates(subset), pdf.drop_duplicates(subset))

    # unknown columns must raise, with a message naming the missing ones
    assert_exceptions_equal(
        lfunc=pdf.drop_duplicates,
        rfunc=gdf.drop_duplicates,
        lfunc_args_and_kwargs=(["n3"],),
        rfunc_args_and_kwargs=(["n3"],),
        expected_error_message="columns {'n3'} do not exist",
    )
    assert_exceptions_equal(
        lfunc=pdf.drop_duplicates,
        rfunc=gdf.drop_duplicates,
        lfunc_args_and_kwargs=([["n1", "n4", "n3"]],),
        rfunc_args_and_kwargs=([["n1", "n4", "n3"]],),
        expected_error_message="columns {'n[34]', 'n[34]'} do not exist",
    )
def test_str_to_datetime_error():
    """astype to datetime64 on an unparseable string fails consistently."""
    values = ["2001-01-01", "2002-02-02", "2000-01-05", "None"]
    psr = pd.Series(values)
    gsr = Series(values)

    # the literal string "None" cannot be parsed as a date
    assert_exceptions_equal(
        lfunc=psr.astype,
        rfunc=gsr.astype,
        lfunc_args_and_kwargs=(["datetime64[s]"],),
        rfunc_args_and_kwargs=(["datetime64[s]"],),
        check_exception_type=False,
        expected_error_message=re.escape(
            "Could not convert `None` value to datetime"
        ),
    )
def test_categorical_unary_ceil():
    """ceil on a categorical series is unsupported by both libraries."""
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    pdsr = pd.Series(cat)
    sr = cudf.Series(cat)

    # the pandas side is exercised via getattr(pdsr, "ceil"), which is
    # expected to fail; no rfunc args means sr.ceil() is called directly
    assert_exceptions_equal(
        lfunc=getattr,
        rfunc=sr.ceil,
        lfunc_args_and_kwargs=([pdsr, "ceil"],),
        check_exception_type=False,
        expected_error_message="Series of dtype `category` cannot "
        "perform the operation: ceil",
    )
def test_categorical_binary_add():
    """Adding two categorical series raises matching errors in both libraries."""
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    pdsr = pd.Series(cat)
    sr = cudf.Series(cat)

    expected_msg = ("Series of dtype `category` cannot perform "
                    "the operation: add")
    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([pdsr, pdsr],),
        rfunc_args_and_kwargs=([sr, sr],),
        expected_error_message=expected_msg,
    )
def test_timedelta_unsupported_reductions(op):
    """Unsupported reductions on timedelta series raise matching errors."""
    gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]")
    psr = gsr.to_pandas()

    # the error message spells the "kurt" reduction out as "kurtosis"
    op_name = "kurtosis" if op == "kurt" else op
    utils.assert_exceptions_equal(
        lfunc=getattr(psr, op),
        rfunc=getattr(gsr, op),
        expected_error_message=re.escape(
            f"cannot perform {op_name} with type timedelta64[ns]"
        ),
    )
def test_multiindex_column_shape():
    """Assigning a 3-label MultiIndex as columns of a 0-column frame must
    raise identically for pandas and cudf.

    NOTE(review): ``operator.setitem`` is invoked with keyword arguments
    (a/b/c) only; builtin operator functions take positional-only
    parameters, so both sides likely raise TypeError from the call itself
    rather than from the column assignment — confirm the intended call
    shape (possibly ``setattr``/positional args was meant).
    """
    pdf = pd.DataFrame(np.random.rand(5, 0))
    gdf = cudf.from_pandas(pdf)
    # a single-level MultiIndex with three labels but one code
    pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]])
    pdfIndex.names = ["alpha"]
    gdfIndex = cudf.from_pandas(pdfIndex)
    assert_eq(pdfIndex, gdfIndex)
    assert_exceptions_equal(
        lfunc=operator.setitem,
        rfunc=operator.setitem,
        lfunc_args_and_kwargs=([], {"a": pdf, "b": "columns", "c": pdfIndex}),
        rfunc_args_and_kwargs=([], {"a": gdf, "b": "columns", "c": gdfIndex}),
    )
def test_series_replace_errors():
    """Series.replace rejects mismatched types and unequal list lengths."""
    gsr = cudf.Series([1, 2, None, 3, None])
    psr = gsr.to_pandas()

    # both dtype-mismatch cases report the same message
    type_mismatch = re.escape(
        "to_replace and value should be of same types,"
        "got to_replace dtype: int64 and "
        "value dtype: object"
    )

    # scalar replacement with a different dtype
    with pytest.raises(TypeError, match=type_mismatch):
        gsr.replace(1, "a")

    # list replacement with a different dtype
    # (NOTE: gsr is rebound here; psr below still reflects the numeric series)
    gsr = cudf.Series(["a", "b", "c"])
    with pytest.raises(TypeError, match=type_mismatch):
        gsr.replace([1, 2], ["a", "b"])

    # dict to_replace combined with a scalar value
    assert_exceptions_equal(
        psr.replace,
        gsr.replace,
        ([{"a": 1}, 1],),
        ([{"a": 1}, 1],),
    )

    # replacement lists of unequal length
    assert_exceptions_equal(
        lfunc=psr.replace,
        rfunc=gsr.replace,
        lfunc_args_and_kwargs=([[1, 2], [1]],),
        rfunc_args_and_kwargs=([[1, 2], [1]],),
        expected_error_message=re.escape(
            "Replacement lists must be of same length. "
            "Expected 2, got 1."
        ),
    )

    # arbitrary objects on either side of the replacement
    assert_exceptions_equal(
        lfunc=psr.replace,
        rfunc=gsr.replace,
        lfunc_args_and_kwargs=([object(), [1]],),
        rfunc_args_and_kwargs=([object(), [1]],),
    )
    assert_exceptions_equal(
        lfunc=psr.replace,
        rfunc=gsr.replace,
        lfunc_args_and_kwargs=([{"a": 1}, object()],),
        rfunc_args_and_kwargs=([{"a": 1}, object()],),
    )