def test_index_append_error(data, other):
    gd_data = cudf.core.index.as_index(data)
    gd_other = cudf.core.index.as_index(other)

    got_dtype = (
        gd_other.dtype
        if gd_data.dtype == np.dtype("object")
        else gd_data.dtype
    )
    with pytest.raises(
        TypeError,
        match=re.escape(
            f"cudf does not support appending an Index of "
            f"dtype `{np.dtype('object')}` with an Index "
            f"of dtype `{got_dtype}`, please type-cast "
            f"either one of them to same dtypes."
        ),
    ):
        gd_data.append(gd_other)

    with pytest.raises(
        TypeError,
        match=re.escape(
            f"cudf does not support appending an Index of "
            f"dtype `{np.dtype('object')}` with an Index "
            f"of dtype `{got_dtype}`, please type-cast "
            f"either one of them to same dtypes."
        ),
    ):
        gd_other.append(gd_data)

    sr = gd_other.to_series()

    assert_exceptions_equal(
        lfunc=gd_data.to_pandas().append,
        rfunc=gd_data.append,
        lfunc_args_and_kwargs=([[sr.to_pandas()]],),
        rfunc_args_and_kwargs=([[sr]],),
        expected_error_message=r"all inputs must be Index",
    )

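# Every test in this module funnels its error case through
# ``assert_exceptions_equal`` from cudf's testing utilities. The function
# below is only an illustrative sketch of that helper's contract, not the
# cudf implementation, and it is not called by the tests; the name
# ``_sketch_assert_exceptions_equal`` and its internals are hypothetical.
# The real helper accepts further options (e.g. ``compare_error_message``)
# that are omitted here.
def _sketch_assert_exceptions_equal(
    lfunc,
    rfunc,
    lfunc_args_and_kwargs=None,
    rfunc_args_and_kwargs=None,
    check_exception_type=True,
    expected_error_message=None,
):
    def _split(args_and_kwargs):
        # Accepts ``([args], {kwargs})``, ``([args],)``, or ``None``.
        if args_and_kwargs is None:
            return (), {}
        args = args_and_kwargs[0] if len(args_and_kwargs) >= 1 else ()
        kwargs = args_and_kwargs[1] if len(args_and_kwargs) >= 2 else {}
        return args, kwargs

    largs, lkwargs = _split(lfunc_args_and_kwargs)
    rargs, rkwargs = _split(rfunc_args_and_kwargs)

    # The pandas-side call is expected to raise; the cudf-side call must then
    # raise as well, optionally with a matching type and error message.
    try:
        lfunc(*largs, **lkwargs)
    except Exception as expected:
        with pytest.raises(
            type(expected) if check_exception_type else Exception,
            match=expected_error_message,
        ):
            rfunc(*rargs, **rkwargs)
    else:
        raise AssertionError("expected the pandas-side call to raise")
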
def test_fillna_categorical(psr_data, fill_value, inplace):
    psr = psr_data.copy(deep=True)
    gsr = Series.from_pandas(psr)

    if isinstance(fill_value, pd.Series):
        fill_value_cudf = cudf.from_pandas(fill_value)
    else:
        fill_value_cudf = fill_value

    if (
        isinstance(fill_value_cudf, cudf.Series)
        and gsr.dtype != fill_value_cudf.dtype
    ):
        assert_exceptions_equal(
            lfunc=psr.fillna,
            rfunc=gsr.fillna,
            lfunc_args_and_kwargs=([fill_value], {"inplace": inplace}),
            rfunc_args_and_kwargs=([fill_value_cudf], {"inplace": inplace}),
        )
    else:
        expected = psr.fillna(fill_value, inplace=inplace)
        got = gsr.fillna(fill_value_cudf, inplace=inplace)

        if inplace:
            expected = psr
            got = gsr

        assert_eq(expected, got)

def test_categorical_remove_categories(pd_str_cat, inplace):
    pd_sr = pd.Series(pd_str_cat.copy())
    cd_sr = cudf.Series(pd_str_cat.copy())

    assert_eq(pd_sr, cd_sr)
    assert str(pd_sr) == str(cd_sr)

    pd_sr_1 = pd_sr.cat.remove_categories(["a"], inplace=inplace)
    cd_sr_1 = cd_sr.cat.remove_categories(["a"], inplace=inplace)
    pd_sr_1 = pd_sr if pd_sr_1 is None else pd_sr_1
    cd_sr_1 = cd_sr if cd_sr_1 is None else cd_sr_1

    assert "a" not in pd_sr_1.cat.categories.to_list()
    assert "a" not in cd_sr_1.cat.categories.to_pandas().to_list()

    assert_eq(pd_sr_1, cd_sr_1)

    # removing a category that is not present must fail identically
    assert_exceptions_equal(
        lfunc=cd_sr.to_pandas().cat.remove_categories,
        rfunc=cd_sr.cat.remove_categories,
        lfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}),
        rfunc_args_and_kwargs=([["a", "d"]], {"inplace": inplace}),
        expected_error_message="removals must all be in old categories",
    )

def test_datetime_series_ops_with_scalars(data, other_scalars, dtype, op):
    gsr = cudf.Series(data=data, dtype=dtype)
    psr = gsr.to_pandas()

    if op == "add":
        expected = psr + other_scalars
        actual = gsr + other_scalars
    elif op == "sub":
        expected = psr - other_scalars
        actual = gsr - other_scalars

    assert_eq(expected, actual)

    if op == "add":
        expected = other_scalars + psr
        actual = other_scalars + gsr

        assert_eq(expected, actual)
    elif op == "sub":
        assert_exceptions_equal(
            lfunc=operator.sub,
            rfunc=operator.sub,
            lfunc_args_and_kwargs=([other_scalars, psr],),
            rfunc_args_and_kwargs=([other_scalars, gsr],),
            compare_error_message=False,
        )

def test_categorical_compare_unordered():
    cat = pd.Categorical(["a", "a", "b", "c", "a"], categories=["a", "b", "c"])
    pdsr = pd.Series(cat)

    sr = cudf.Series(cat)

    # test equality
    out = sr == sr
    assert out.dtype == np.bool_
    assert type(out[0]) == np.bool_
    assert np.all(out.to_array())
    assert np.all(pdsr == pdsr)

    # test inequality
    out = sr != sr
    assert not np.any(out.to_array())
    assert not np.any(pdsr != pdsr)

    assert not pdsr.cat.ordered
    assert not sr.cat.ordered

    # test using ordered operators
    assert_exceptions_equal(
        lfunc=operator.lt,
        rfunc=operator.lt,
        lfunc_args_and_kwargs=([pdsr, pdsr],),
        rfunc_args_and_kwargs=([sr, sr],),
    )

def test_series_drop_raises():
    gs = cudf.Series([10, 20, 30], index=["x", "y", "z"], name="c")
    ps = gs.to_pandas()

    assert_exceptions_equal(
        lfunc=ps.drop,
        rfunc=gs.drop,
        lfunc_args_and_kwargs=(["p"],),
        rfunc_args_and_kwargs=(["p"],),
        expected_error_message="One or more values not found in axis",
    )

    # label dtype does not match the index dtype
    assert_exceptions_equal(
        lfunc=ps.drop,
        rfunc=gs.drop,
        lfunc_args_and_kwargs=([3],),
        rfunc_args_and_kwargs=([3],),
        expected_error_message="One or more values not found in axis",
    )

    expect = ps.drop("p", errors="ignore")
    actual = gs.drop("p", errors="ignore")

    assert_eq(actual, expect)

def test_categorical_reductions(op):
    gsr = cudf.Series([1, 2, 3, None], dtype="category")
    psr = gsr.to_pandas()

    utils.assert_exceptions_equal(
        getattr(psr, op), getattr(gsr, op), compare_error_message=False
    )

def test_cat_series_binop_error():
    df = cudf.DataFrame()
    df["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    df["b"] = np.arange(len(df))

    dfa = df["a"]
    dfb = df["b"]

    # lhs is categorical
    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([dfa, dfb],),
        rfunc_args_and_kwargs=([dfa, dfb],),
        check_exception_type=False,
        expected_error_message="Series of dtype `category` cannot "
        "perform the operation: add",
    )

    # lhs is numerical
    assert_exceptions_equal(
        lfunc=operator.add,
        rfunc=operator.add,
        lfunc_args_and_kwargs=([dfb, dfa],),
        rfunc_args_and_kwargs=([dfb, dfa],),
        check_exception_type=False,
        expected_error_message="'add' operator not supported",
    )

def test_groupby_nonempty_no_keys(pdf):
    gdf = cudf.from_pandas(pdf)
    assert_exceptions_equal(
        lambda: pdf.groupby([]),
        lambda: gdf.groupby([]),
        compare_error_message=False,
    )

def test_timedelta_ops_datetime_inputs(
    datetime_data, timedelta_data, datetime_dtype, timedelta_dtype, ops
):
    gsr_datetime = cudf.Series(datetime_data, dtype=datetime_dtype)
    gsr_timedelta = cudf.Series(timedelta_data, dtype=timedelta_dtype)

    psr_datetime = gsr_datetime.to_pandas()
    psr_timedelta = gsr_timedelta.to_pandas()

    expected = getattr(psr_datetime, ops)(psr_timedelta)
    actual = getattr(gsr_datetime, ops)(gsr_timedelta)

    assert_eq(expected, actual)

    if ops == "add":
        expected = getattr(psr_timedelta, ops)(psr_datetime)
        actual = getattr(gsr_timedelta, ops)(gsr_datetime)

        assert_eq(expected, actual)
    elif ops == "sub":
        assert_exceptions_equal(
            lfunc=operator.sub,
            rfunc=operator.sub,
            lfunc_args_and_kwargs=([psr_timedelta, psr_datetime],),
            rfunc_args_and_kwargs=([gsr_timedelta, gsr_datetime],),
            expected_error_message=re.escape(
                f"Subtraction of {gsr_timedelta.dtype} with "
                f"{gsr_datetime.dtype} cannot be performed."
            ),
        )

def test_datetime_unsupported_reductions(op):
    gsr = cudf.Series([1, 2, 3, None], dtype="datetime64[ns]")
    psr = gsr.to_pandas()

    utils.assert_exceptions_equal(
        lfunc=getattr(psr, op),
        rfunc=getattr(gsr, op),
    )

def test_to_datetime_errors(data):
    pd_data = data
    if isinstance(pd_data, (pd.Series, pd.DataFrame, pd.Index)):
        gd_data = cudf.from_pandas(pd_data)
    else:
        gd_data = pd_data

    assert_exceptions_equal(
        pd.to_datetime,
        cudf.to_datetime,
        ([pd_data],),
        ([gd_data],),
    )

def test_multiindex_sample_basic(n, frac, replace, axis):
    # sampling along the column axis with replacement can produce duplicate
    # column names, which is not currently supported
    if axis == 1 and replace:
        return
    pdf = pd.DataFrame(
        {
            "a": [1, 2, 3, 4, 5],
            "float": [0.05, 0.2, 0.3, 0.2, 0.25],
            "int": [1, 3, 5, 4, 2],
        },
    )
    mul_index = cudf.Index(DataFrame.from_pandas(pdf))
    random_state = 0

    try:
        pout = pdf.sample(
            n=n,
            frac=frac,
            replace=replace,
            random_state=random_state,
            axis=axis,
        )
    except BaseException:
        assert_exceptions_equal(
            lfunc=pdf.sample,
            rfunc=mul_index.sample,
            lfunc_args_and_kwargs=(
                [],
                {
                    "n": n,
                    "frac": frac,
                    "replace": replace,
                    "random_state": random_state,
                    "axis": axis,
                },
            ),
            rfunc_args_and_kwargs=(
                [],
                {
                    "n": n,
                    "frac": frac,
                    "replace": replace,
                    "random_state": random_state,
                    "axis": axis,
                },
            ),
        )
    else:
        gout = mul_index.sample(
            n=n,
            frac=frac,
            replace=replace,
            random_state=random_state,
            axis=axis,
        )
        assert pout.shape == gout.shape

def test_index_difference_sort_error():
    pdi = pd.Index([1, 2, 3])
    gdi = cudf.Index([1, 2, 3])

    assert_exceptions_equal(
        pdi.difference,
        gdi.difference,
        ([pdi], {"sort": True}),
        ([gdi], {"sort": True}),
    )

def test_categorical_setitem_invalid():
    ps = pd.Series([1, 2, 3], dtype="category")
    gs = cudf.Series([1, 2, 3], dtype="category")

    assert_exceptions_equal(
        lfunc=ps.__setitem__,
        rfunc=gs.__setitem__,
        lfunc_args_and_kwargs=([0, 5], {}),
        rfunc_args_and_kwargs=([0, 5], {}),
    )

def test_multiindex_rename_error(names):
    pi = pd.MultiIndex.from_product([["python", "cobra"], [2018, 2019]])
    gi = cudf.from_pandas(pi)

    assert_exceptions_equal(
        lfunc=pi.rename,
        rfunc=gi.rename,
        lfunc_args_and_kwargs=([], {"names": names}),
        rfunc_args_and_kwargs=([], {"names": names}),
    )

def test_series_fillna_error():
    psr = pd.Series([1, 2, None, 3, None])
    gsr = cudf.from_pandas(psr)

    assert_exceptions_equal(
        psr.fillna,
        gsr.fillna,
        ([pd.DataFrame({"a": [1, 2, 3]})],),
        ([cudf.DataFrame({"a": [1, 2, 3]})],),
    )

def test_index_set_names_error(idx, level, names):
    pi = idx.copy()
    gi = cudf.from_pandas(idx)

    assert_exceptions_equal(
        lfunc=pi.set_names,
        rfunc=gi.set_names,
        lfunc_args_and_kwargs=([], {"names": names, "level": level}),
        rfunc_args_and_kwargs=([], {"names": names, "level": level}),
    )

def test_raise_data_error():
    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
    gdf = cudf.from_pandas(pdf)

    assert_exceptions_equal(
        pdf.groupby("a").mean,
        gdf.groupby("a").mean,
        compare_error_message=False,
    )

def test_timedelta_unsupported_reductions(op):
    gsr = cudf.Series([1, 2, 3, None], dtype="timedelta64[ns]")
    psr = gsr.to_pandas()

    utils.assert_exceptions_equal(
        lfunc=getattr(psr, op),
        rfunc=getattr(gsr, op),
        expected_error_message=re.escape(
            "cannot perform "
            + ("kurtosis" if op == "kurt" else op)
            + " with type timedelta64[ns]"
        ),
    )

def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype):
    cpu_data = np.asarray(data, dtype=from_dtype)
    gpu_data = as_column(data, dtype=from_dtype)

    assert_exceptions_equal(
        lfunc=cpu_data.view,
        rfunc=gpu_data.view,
        lfunc_args_and_kwargs=([to_dtype],),
        rfunc_args_and_kwargs=([to_dtype],),
        expected_error_message="Can not divide",
    )

def test_multiindex_loc_rows_0(pdf, gdf, pdfIndex):
    gdfIndex = cudf.from_pandas(pdfIndex)
    pdf.index = pdfIndex
    gdf.index = gdfIndex

    assert_exceptions_equal(
        lfunc=pdf.loc.__getitem__,
        rfunc=gdf.loc.__getitem__,
        lfunc_args_and_kwargs=([(("d",), slice(None, None, None))],),
        rfunc_args_and_kwargs=([(("d",), slice(None, None, None))],),
    )

def test_duplicated_with_misspelled_column_name(subset):
    df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]})
    gdf = cudf.DataFrame.from_pandas(df)

    assert_exceptions_equal(
        lfunc=df.drop_duplicates,
        rfunc=gdf.drop_duplicates,
        lfunc_args_and_kwargs=([subset],),
        rfunc_args_and_kwargs=([subset],),
        compare_error_message=False,
    )

def test_add_categories_error(data, add):
    pds = pd.Series(data, dtype="category")
    gds = cudf.Series(data, dtype="category")

    assert_exceptions_equal(
        pds.cat.add_categories,
        gds.cat.add_categories,
        ([add],),
        ([add],),
        compare_error_message=False,
    )

def test_datetime_to_datetime_error():
    assert_exceptions_equal(
        lfunc=pd.to_datetime,
        rfunc=cudf.to_datetime,
        lfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],),
        rfunc_args_and_kwargs=(["02-Oct-2017 09:30", "%d-%B-%Y %H:%M"],),
        check_exception_type=False,
        expected_error_message=re.escape(
            "errors parameter has to be either one of: ['ignore', 'raise', "
            "'coerce', 'warn'], found: %d-%B-%Y %H:%M"
        ),
    )

def test_dataframe_loc_outbound():
    df = DataFrame()
    size = 10
    df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype(
        np.int32
    )
    df["b"] = hb = np.random.random(size).astype(np.float32)

    pdf = pd.DataFrame()
    pdf["a"] = ha
    pdf["b"] = hb

    assert_exceptions_equal(lambda: pdf.loc[11], lambda: df.loc[11])

def test_df_different_index_shape(df2, binop):
    df1 = cudf.DataFrame([1, 2, 3], index=[1, 2, 3])

    pdf1 = df1.to_pandas()
    pdf2 = df2.to_pandas()

    utils.assert_exceptions_equal(
        lfunc=binop,
        rfunc=binop,
        lfunc_args_and_kwargs=([pdf1, pdf2],),
        rfunc_args_and_kwargs=([df1, df2],),
    )

def test_pandas_concat_compatibility_axis1_eq_index():
    s1 = gd.Series(["a", "b", "c"], index=[0, 1, 2])
    s2 = gd.Series(["a", "b", "c"], index=[1, 1, 1])

    ps1 = s1.to_pandas()
    ps2 = s2.to_pandas()

    assert_exceptions_equal(
        lfunc=pd.concat,
        rfunc=gd.concat,
        lfunc_args_and_kwargs=([], {"objs": [ps1, ps2], "axis": 1}),
        rfunc_args_and_kwargs=([], {"objs": [s1, s2], "axis": 1}),
    )

def test_multiindex_set_names_error(level, names):
    pi = pd.MultiIndex.from_product(
        [["python", "cobra"], [2018, 2019], ["aab", "bcd"]]
    )
    gi = cudf.from_pandas(pi)

    assert_exceptions_equal(
        lfunc=pi.set_names,
        rfunc=gi.set_names,
        lfunc_args_and_kwargs=([], {"names": names, "level": level}),
        rfunc_args_and_kwargs=([], {"names": names, "level": level}),
    )

def test_timedelta_datetime_cast_invalid():
    sr = cudf.Series([1, 2, 3], dtype="timedelta64[ns]")
    psr = sr.to_pandas()

    assert_exceptions_equal(
        psr.astype,
        sr.astype,
        (["datetime64[ns]"],),
        (["datetime64[ns]"],),
    )

    sr = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
    psr = sr.to_pandas()

    assert_exceptions_equal(
        psr.astype,
        sr.astype,
        (["timedelta64[ns]"],),
        (["timedelta64[ns]"],),
    )