def test_cdt_eq(data, ordered):
    """Check the equality comparisons of ``cudf.CategoricalDtype``.

    The dtype built from ``data``/``ordered`` is compared against the
    ``"category"`` string, itself, a dtype without categories, an identical
    dtype, and a dtype whose ``ordered`` flag is flipped.
    """
    base = cudf.CategoricalDtype(categories=data, ordered=ordered)

    # Comparisons against the string alias and against itself.
    assert base == "category"
    assert base == base

    # Same ordering, but no categories given.
    without_categories = cudf.CategoricalDtype(
        categories=None, ordered=ordered
    )
    assert base == without_categories

    # Same categories and same ordering.
    identical = cudf.CategoricalDtype(categories=data, ordered=ordered)
    assert base == identical

    # Same categories, opposite ordering: equality must be falsy.
    flipped = cudf.CategoricalDtype(categories=data, ordered=not ordered)
    assert not base == flipped
def test_is_categorical_dispatch():
    """Check ``is_categorical_dtype`` accepts pandas and cudf categoricals.

    Covers dtype instances, Series and Index objects from both libraries.
    """
    values = [1, 2, 3]
    candidates = (
        pd.CategoricalDtype(values),
        cudf.CategoricalDtype(values),
        cudf.Series(values, dtype="category"),
        pd.Series(values, dtype="category"),
        pd.Index(values, dtype="category"),
        cudf.Index(values, dtype="category"),
    )
    for candidate in candidates:
        assert is_categorical_dtype(candidate)
def _match_categorical_dtypes_both(
    lcol: CategoricalColumn, rcol: CategoricalColumn, how: str
) -> Tuple[ColumnBase, ColumnBase]:
    """Cast two categorical join-key columns so they can be merged.

    Parameters
    ----------
    lcol, rcol : CategoricalColumn
        Left and right key columns.
    how : str
        Join type; it selects the casting strategy when the two
        (unordered) dtypes differ.

    Returns
    -------
    Tuple[ColumnBase, ColumnBase]
        The (possibly cast) left and right columns.

    Raises
    ------
    TypeError
        If the ``ordered`` flags differ, or if both sides are ordered but
        their dtypes are not equal.
    """
    left_dtype = lcol.dtype
    right_dtype = rcol.dtype

    # Equal dtypes need no casting at all.
    if left_dtype == right_dtype:
        return lcol, rcol

    # Exactly one ordered side is ambiguous and therefore rejected.
    if left_dtype.ordered != right_dtype.ordered:
        raise TypeError(
            "Merging on categorical variables with mismatched"
            " ordering is ambiguous"
        )

    # Both ordered, yet the dtypes differ: the categories must be what
    # differs, and ordered categoricals with different categories can
    # never be merged.
    if left_dtype.ordered and right_dtype.ordered:
        raise TypeError(
            f"{how} merge between categoricals with "
            "different categories is only valid when "
            "neither side is ordered"
        )

    # From here on both sides are expected to be unordered.
    assert not left_dtype.ordered and not right_dtype.ordered

    if how == "inner":
        # Drop down to the category types -- they must be cast back later.
        left_plain = lcol.cat()._decategorize()
        right_plain = rcol.cat()._decategorize()
        return _match_join_keys(left_plain, right_plain, how)

    if how in {"left", "leftanti", "leftsemi"}:
        # The left dtype always wins for left-style joins.
        return lcol, rcol.astype(left_dtype)

    # Any other join: use the union of both category sets.
    all_categories = cudf.concat(
        [left_dtype.categories, right_dtype.categories]
    ).unique()
    union_dtype = cudf.CategoricalDtype(
        categories=all_categories, ordered=False
    )
    return lcol.astype(union_dtype), rcol.astype(union_dtype)
def _libcudf_to_output_castrules(lcol, rcol, how): """ Determine what dtype an output merge key column should be cast to after it has been processed by libcudf. Determine if a column should be promoted to a categorical datatype. For inner merges between unordered categoricals, we get a new categorical variable containing the intersection of the two source variables. For left or right joins, we get the original categorical variable from whichever was the major operand of the join, e.g. left for a left join or right for a right join. In the case of an outer join, the result will be a new categorical variable with both sets of categories. """ merge_return_type = None ltype = lcol.dtype rtype = rcol.dtype if pd.api.types.is_dtype_equal(ltype, rtype): return ltype l_is_cat = isinstance(ltype, CategoricalDtype) r_is_cat = isinstance(rtype, CategoricalDtype) # we currently only need to do this for categorical variables if how == "inner": if l_is_cat and r_is_cat: merge_return_type = "category" elif how == "left": if l_is_cat: merge_return_type = ltype elif how == "right": if r_is_cat: merge_return_type = rtype elif how == "outer": if l_is_cat and r_is_cat: new_cats = cudf.concat([ltype.categories, rtype.categories]).unique() merge_return_type = cudf.CategoricalDtype(categories=new_cats, ordered=ltype.ordered) return merge_return_type
def test_merging_categorical_columns():
    """Merge two dask-cudf frames on a categorized ``cat_col`` column.

    The test is skipped when the installed dask does not provide
    ``union_categoricals_dispatch``. The expected frame keeps only the
    ``"f"`` rows and carries a categorical dtype with the categories of
    both inputs.
    """
    try:
        from dask.dataframe.dispatch import (  # noqa: F401
            union_categoricals_dispatch,
        )
    except ImportError:
        pytest.skip(
            "need a version of dask that has union_categoricals_dispatch"
        )

    left_gdf = cudf.DataFrame(
        {"id_1": [0, 1, 2, 3], "cat_col": ["a", "b", "f", "f"]}
    )
    left_ddf = dgd.from_cudf(left_gdf, npartitions=2)
    left_ddf = dd.categorical.categorize(left_ddf, columns=["cat_col"])

    right_gdf = cudf.DataFrame(
        {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]}
    )
    right_ddf = dgd.from_cudf(right_gdf, npartitions=2)
    right_ddf = dd.categorical.categorize(right_ddf, columns=["cat_col"])

    # The expected key column holds the categories from both sides.
    union_dtype = cudf.CategoricalDtype(
        categories=["a", "b", "f", "g", "h"], ordered=False
    )
    expected = cudf.DataFrame(
        {
            "id_1": [2, 3],
            "cat_col": cudf.Series(["f", "f"], dtype=union_dtype),
            "id_2": [113, 113],
        }
    )

    merged = left_ddf.merge(right_ddf)
    dd.assert_eq(merged, expected)
def test_cdf_to_pandas(data, ordered):
    """Check ``cudf.CategoricalDtype.to_pandas`` against the pandas dtype.

    The pandas dtype built from the same ``data`` and ``ordered`` must
    compare equal to the converted cudf dtype.
    """
    expected = pd.CategoricalDtype(data, ordered)
    converted = cudf.CategoricalDtype(
        categories=data, ordered=ordered
    ).to_pandas()
    assert expected == converted
def test_categorical_dtype(categories, ordered):
    """Compare pandas and cudf ``CategoricalDtype`` built from the same
    ``categories`` and ``ordered`` arguments via ``assert_eq``."""
    dtype_kwargs = {"categories": categories, "ordered": ordered}
    assert_eq(
        pd.CategoricalDtype(**dtype_kwargs),
        cudf.CategoricalDtype(**dtype_kwargs),
    )
def categorical_dtype_cudf(categories=None, ordered=None):
    """Build a ``cudf.CategoricalDtype`` from ``categories`` and ``ordered``.

    Both arguments are forwarded unchanged as keyword arguments.
    """
    dtype = cudf.CategoricalDtype(categories=categories, ordered=ordered)
    return dtype
def _input_to_libcudf_castrules_both_cat(lcol, rcol, how):
    """
    Based off the left and right operands, determine the libcudf merge
    dtype or error for corner cases where the merge cannot proceed. This
    function handles categorical variables.

    Categorical variable typecasting logic depends on both `how` and the
    specifics of the categorical variables to be merged. Merging
    categorical variables when only one side is ordered is ambiguous and
    not allowed. Merging when both categoricals are ordered is allowed,
    but only when the categories are exactly equal and have equal
    ordering, and will result in the common dtype.

    When both sides are unordered, the result categorical depends on the
    kind of join:

    - For inner joins, the categoricals are demoted to their underlying
      category types and the result of
      `_input_to_libcudf_castrules_any` on those is returned.
    - For left or right joins, the result will be the left or right
      dtype respectively.
    - For outer joins, the result will be the union of categories from
      both sides.

    Any other value of `how` is not matched by a branch below, so the
    function falls through and returns None.

    Parameters
    ----------
    lcol, rcol
        Left and right key columns; only their ``dtype`` is read.
    how : str
        The kind of join.

    Raises
    ------
    TypeError
        If either operand is not a CategoricalDtype, if the ``ordered``
        flags differ, or if both sides are ordered but the dtypes are
        not equal.
    """
    ltype = lcol.dtype
    rtype = rcol.dtype

    # This function is only to be used to resolve the result when both
    # sides are categorical. The whole conjunction must be negated:
    # ``not a and b`` parses as ``(not a) and b`` and would only reject
    # the "left is not categorical, right is" combination.
    if not (
        isinstance(ltype, CategoricalDtype)
        and isinstance(rtype, CategoricalDtype)
    ):
        raise TypeError("Both operands must be CategoricalDtype")

    # true for every configuration
    if ltype == rtype:
        return ltype

    # raise for any join where ordering doesn't match
    if ltype.ordered != rtype.ordered:
        raise TypeError(
            "Merging on categorical variables with mismatched"
            " ordering is ambiguous"
        )
    elif ltype.ordered and rtype.ordered:
        # if we get to here, categories must be what causes the
        # dtype equality check to fail. And we can never merge
        # two ordered categoricals with different categories
        raise TypeError(
            f"{how} merge between categoricals with "
            "different categories is only valid when "
            "neither side is ordered"
        )
    elif how == "inner":
        # neither ordered, so categories must be different
        # demote to underlying types
        return _input_to_libcudf_castrules_any(
            ltype.categories, rtype.categories, how
        )
    elif how == "left":
        return ltype
    elif how == "right":
        return rtype
    elif how == "outer":
        new_cats = cudf.concat(
            [ltype.categories, rtype.categories]
        ).unique()
        return cudf.CategoricalDtype(categories=new_cats, ordered=False)
(pd.Series(dtype="str"), False), (pd.Series(dtype="unicode"), False), (pd.Series(dtype="datetime64[s]"), False), (pd.Series(dtype="timedelta64[s]"), False), (pd.Series(dtype="category"), True), (pd.Series(dtype="object"), False), # cuDF dtypes. (cudf.CategoricalDtype, True), (cudf.ListDtype, False), (cudf.StructDtype, False), (cudf.Decimal128Dtype, False), (cudf.Decimal64Dtype, False), (cudf.Decimal32Dtype, False), (cudf.IntervalDtype, False), # cuDF dtype instances. (cudf.CategoricalDtype("a"), True), (cudf.ListDtype(int), False), (cudf.StructDtype({"a": int}), False), (cudf.Decimal128Dtype(5, 2), False), (cudf.Decimal64Dtype(5, 2), False), (cudf.Decimal32Dtype(5, 2), False), (cudf.IntervalDtype(int), False), # cuDF objects (cudf.Series(dtype="bool"), False), (cudf.Series(dtype="int"), False), (cudf.Series(dtype="float"), False), (cudf.Series(dtype="str"), False), (cudf.Series(dtype="datetime64[s]"), False), (cudf.Series(dtype="timedelta64[s]"), False), (cudf.Series(dtype="category"), True), (cudf.Series(dtype=cudf.Decimal128Dtype(5, 2)), False),
def find_common_type(dtypes):
    """
    Wrapper over np.find_common_type to handle special cases

    Corner cases:
    1. "M8", "M8" -> "M8" | "m8", "m8" -> "m8"

    Parameters
    ----------
    dtypes : iterable, sequence of dtypes to find common types

    Returns
    -------
    dtype : np.dtype optional, the result from np.find_common_type,
    None if input is empty

    Raises
    ------
    ValueError
        If every input is an unordered categorical but their categories
        do not all share one underlying dtype.
    """
    if len(dtypes) == 0:
        return None

    api_types = cudf.api.types

    # Categoricals are handled up front since they're not hashable and
    # therefore can't be put in a set.
    if any(api_types.is_categorical_dtype(dt) for dt in dtypes):

        def _is_unordered_categorical(dt):
            # Categorical, and either lacking an ``ordered`` attribute or
            # having a falsy one.
            if not api_types.is_categorical_dtype(dt):
                return False
            if hasattr(dt, "ordered"):
                return not dt.ordered
            return True

        if not all(_is_unordered_categorical(dt) for dt in dtypes):
            # TODO: Should this be an error case (mixing categorical with
            # other dtypes) or should this return object? Unclear if we
            # have enough information to decide right now, may have to
            # come back to this as usage of find_common_type increases.
            return cudf.dtype("O")

        if len({dt._categories.dtype for dt in dtypes}) != 1:
            raise ValueError(
                "Only unordered categories of the same underlying type "
                "may be coerced to a common type."
            )

        all_categories = cudf.core.column.concat_columns(
            [dt._categories for dt in dtypes]
        )
        return cudf.CategoricalDtype(all_categories.unique())

    # Collapse duplicate dtypes.
    unique_dtypes = set(dtypes)

    decimal_dtypes = [
        dt for dt in unique_dtypes if api_types.is_decimal_dtype(dt)
    ]
    if decimal_dtypes:
        only_decimal_or_numeric = all(
            api_types.is_decimal_dtype(dt) or api_types.is_numeric_dtype(dt)
            for dt in unique_dtypes
        )
        if not only_decimal_or_numeric:
            return cudf.dtype("O")
        return _find_common_type_decimal(decimal_dtypes)

    # Corner case 1:
    # Resort to np.result_type to handle "M" and "m" types separately
    datetime_dtypes = {
        dt for dt in unique_dtypes if api_types.is_datetime_dtype(dt)
    }
    if datetime_dtypes:
        unique_dtypes = unique_dtypes - datetime_dtypes
        unique_dtypes.add(np.result_type(*datetime_dtypes))

    timedelta_dtypes = {
        dt for dt in unique_dtypes if pd.api.types.is_timedelta64_dtype(dt)
    }
    if timedelta_dtypes:
        unique_dtypes = unique_dtypes - timedelta_dtypes
        unique_dtypes.add(np.result_type(*timedelta_dtypes))

    # NOTE(review): np.find_common_type is deprecated in recent NumPy
    # releases -- confirm the supported NumPy range before upgrading.
    resolved = np.find_common_type(list(unique_dtypes), [])

    # float16 is widened to float32 rather than returned as-is.
    if resolved == np.dtype("float16"):
        return cudf.dtype("float32")
    return cudf.dtype(resolved)
(float, np.dtype("float64")), (cudf.ListDtype("int64"), cudf.ListDtype("int64")), ("float16", np.dtype("float32")), (np.dtype("U"), np.dtype("object")), ("timedelta64", np.dtype("<m8")), ("timedelta64[ns]", np.dtype("<m8[ns]")), ("timedelta64[ms]", np.dtype("<m8[ms]")), ("timedelta64[D]", np.dtype("<m8[D]")), ("<m8[s]", np.dtype("<m8[s]")), ("datetime64", np.dtype("<M8")), ("datetime64[ns]", np.dtype("<M8[ns]")), ("datetime64[ms]", np.dtype("<M8[ms]")), ("datetime64[D]", np.dtype("<M8[D]")), ("<M8[s]", np.dtype("<M8[s]")), (cudf.ListDtype("int64"), cudf.ListDtype("int64")), ("category", cudf.CategoricalDtype()), ( cudf.CategoricalDtype(categories=("a", "b", "c")), cudf.CategoricalDtype(categories=("a", "b", "c")), ), ( pd.CategoricalDtype(categories=("a", "b", "c")), cudf.CategoricalDtype(categories=("a", "b", "c")), ), ( # this is a pandas.core.arrays.numpy_.PandasDtype... pd.array([1], dtype="int16").dtype, np.dtype("int16"), ), (pd.IntervalDtype("int"), cudf.IntervalDtype("int64")), (cudf.IntervalDtype("int"), cudf.IntervalDtype("int64")),