Esempio n. 1
0
def _check_and_cast_columns_with_other(
    source_col: ColumnBase,
    other: Union[ScalarLike, ColumnBase],
    inplace: bool,
) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]:
    """
    Reconcile the dtypes of `source_col` and `other`.

    A categorical `source_col` is returned together with `other`
    unchanged, and a ``None`` `other` is returned without any casting.
    When `inplace` is true, `other` is cast to the dtype of `source_col`
    (a warning is emitted if `_can_cast` reports the cast as unsafe).
    Otherwise both operands are cast to a common dtype.

    Raises
    ------
    TypeError
        If `is_mixed_with_object_dtype` flags the pair of operands.
    """
    dtype_utils = cudf.utils.dtypes

    # Categorical columns are handed back as-is.
    if dtype_utils.is_categorical_dtype(source_col.dtype):
        return source_col, other

    other_is_scalar = dtype_utils.is_scalar(other)
    device_obj = (
        _normalize_scalars(source_col, other) if other_is_scalar else other
    )

    if other is None:
        return source_col, device_obj
    if dtype_utils.is_mixed_with_object_dtype(device_obj, source_col):
        raise TypeError(
            "cudf does not support mixed types, please type-cast "
            "the column of dataframe/series and other "
            "to same dtypes."
        )

    if inplace:
        # The source column keeps its dtype; only `other` is cast.
        cast_is_safe = dtype_utils._can_cast(
            device_obj.dtype, source_col.dtype
        )
        if not cast_is_safe:
            warnings.warn(
                f"Type-casting from {device_obj.dtype} "
                f"to {source_col.dtype}, there could be potential data loss"
            )
        return source_col, device_obj.astype(source_col.dtype)

    # Scalar that fits a non-decimal numeric column: reuse the column dtype.
    if (
        other_is_scalar
        and dtype_utils._is_non_decimal_numeric_dtype(source_col.dtype)
        and dtype_utils._can_cast(other, source_col.dtype)
    ):
        common_dtype = source_col.dtype
        casted_scalar = cudf.Scalar(other, dtype=common_dtype)
        return source_col.astype(common_dtype), casted_scalar

    # General case: promote both operands to a shared dtype.
    if other_is_scalar:
        other_dtype = np.min_scalar_type(other)
    else:
        other_dtype = other.dtype
    common_dtype = dtype_utils.find_common_type(
        [source_col.dtype, other_dtype]
    )

    if dtype_utils.is_scalar(device_obj):
        device_obj = cudf.Scalar(other, dtype=common_dtype)
    else:
        device_obj = device_obj.astype(common_dtype)
    return source_col.astype(common_dtype), device_obj
Esempio n. 2
0
def _one_hot_encode_column(
    column: ColumnBase,
    categories: ColumnBase,
    prefix: Optional[str],
    prefix_sep: Optional[str],
    dtype: Optional[Dtype],
) -> Dict[str, ColumnBase]:
    """One-hot encode a single column.

    The result maps each category to its encoding column. When both
    `prefix` and `prefix_sep` are given, every key is rewritten as
    ``{prefix}{prefix_sep}{category}``. When `dtype` is truthy, every
    encoding column is cast to it.

    Raises
    ------
    ValueError
        If ``column.size * categories.size`` reaches the int32 maximum.
    """
    if isinstance(column, CategoricalColumn):
        all_null = column.size == column.null_count
        if all_null:
            # Nothing to decategorize: use an empty column shaped like
            # `categories` with the same number of rows.
            column = column_empty_like(categories, newsize=column.size)
        else:
            column = column._get_decategorized_column()

    int32_max = np.iinfo("int32").max
    if not column.size * categories.size < int32_max:
        raise ValueError(
            "Size limitation exceeded: column.size * category.size < "
            "np.iinfo('int32').max. Consider reducing size of category")

    encoded = one_hot_encode(column, categories)

    add_prefix = prefix is not None and prefix_sep is not None
    if add_prefix:
        encoded = {
            f"{prefix}{prefix_sep}{col}": enc for col, enc in encoded.items()
        }
    if dtype:
        encoded = {key: enc.astype(dtype) for key, enc in encoded.items()}
    return encoded
Esempio n. 3
0
    def from_arrow(cls, array):
        """Build an instance of this class from a PyArrow array.

        Parameters
        ----------
        array : PyArrow Array/ChunkedArray
            The PyArrow object to convert.

        Returns
        -------
        SingleColumnFrame

        Raises
        ------
        TypeError
            For invalid input type.

        Examples
        --------
        >>> import cudf
        >>> import pyarrow as pa
        >>> cudf.Index.from_arrow(pa.array(["a", "b", None]))
        StringIndex(['a' 'b' None], dtype='object')
        >>> cudf.Series.from_arrow(pa.array(["a", "b", None]))
        0       a
        1       b
        2    <NA>
        dtype: object
        """
        # Convert to a column first, then wrap it in the requesting class.
        converted = ColumnBase.from_arrow(array)
        return cls(converted)
Esempio n. 4
0
    def cov(self, other: ColumnBase) -> float:
        """Return the sample covariance between this column and `other`.

        A NaN value for this column's dtype is returned when either
        operand is empty or when both hold exactly one element. Otherwise
        the sum of the products of the mean-centred values is divided by
        ``len(self) - 1``.
        """
        both_non_empty = len(self) > 0 and len(other) > 0
        if not both_non_empty or (len(self) == 1 and len(other) == 1):
            return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

        self_centered = self - self.mean()
        other_centered = other - other.mean()
        products = self_centered * other_centered
        return products.sum() / (len(self) - 1)
Esempio n. 5
0
def _match_categorical_dtypes(lcol: ColumnBase, rcol: ColumnBase,
                              how: str) -> Tuple[ColumnBase, ColumnBase]:
    """Cast join keys to a common dtype when at least one is categorical.

    Two categorical columns are delegated to
    `_match_categorical_dtypes_both`. With a categorical dtype on the left
    only, left-style joins cast the right key to the left dtype; in every
    other case both keys are cast to the dtype of the categories.
    """
    categorical_column_type = cudf.core.column.CategoricalColumn
    if (isinstance(lcol, categorical_column_type)
            and isinstance(rcol, categorical_column_type)):
        # Both sides categorical: handled by the dedicated helper.
        return _match_categorical_dtypes_both(lcol, rcol, how)

    left_dtype = lcol.dtype
    right_dtype = rcol.dtype

    if isinstance(left_dtype, CategoricalDtype):
        keeps_left_dtype = how in {"left", "leftsemi", "leftanti"}
        if keeps_left_dtype:
            return lcol, rcol.astype(left_dtype)
        common_type = left_dtype.categories.dtype
    elif isinstance(right_dtype, CategoricalDtype):
        common_type = right_dtype.categories.dtype

    left_casted = lcol.astype(common_type)
    right_casted = rcol.astype(common_type)
    return left_casted, right_casted
Esempio n. 6
0
    def corr(self, other: ColumnBase) -> float:
        """Return the correlation between this column and `other`.

        The covariance from `self.cov(other)` is divided by both standard
        deviations. A NaN value for this column's dtype is returned when
        either operand is empty, when the covariance is falsy, or when
        either standard deviation equals zero.
        """
        if not len(self) or not len(other):
            return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

        covariance = self.cov(other)
        self_std = self.std()
        other_std = other.std()

        # A falsy covariance or a zero spread makes the ratio meaningless.
        if not covariance or self_std == 0 or other_std == 0:
            return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

        return covariance / self_std / other_std
Esempio n. 7
0
 def _concat(cls, objs):
     """Concatenate the values of `objs` into a single index.

     The result keeps the shared name when every object in `objs` has the
     same name; otherwise its name is set to None.
     """
     data = ColumnBase._concat([obj._values for obj in objs])
     distinct_names = {obj.name for obj in objs}

     result = as_index(data)
     # Only a single distinct name is unambiguous enough to carry over.
     result.name = (
         distinct_names.pop() if len(distinct_names) == 1 else None
     )
     return result
Esempio n. 8
0
def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase,
                     how: str) -> Tuple[ColumnBase, ColumnBase]:
    """Cast `lcol` and `rcol` to a common dtype for use as join keys.

    Parameters
    ----------
    lcol, rcol : ColumnBase
        The left and right key columns.
    how : str
        The join type. Only "left" is special-cased here; categorical
        keys forward `how` to `_match_categorical_dtypes`.

    Returns
    -------
    Tuple[ColumnBase, ColumnBase]
        The (possibly cast) left and right key columns. When the dtypes
        are already equal the inputs are returned unchanged.

    Raises
    ------
    TypeError
        If exactly one side is a `Decimal64Dtype`, or both are but the
        dtypes are not equal.
    """
    # Stays None when the keys are neither both numeric nor both
    # datetime64; in that case `astype(None)` is what gets called below.
    common_type = None

    ltype = lcol.dtype
    rtype = rcol.dtype

    # if either side is categorical, different logic
    if isinstance(ltype, CategoricalDtype) or isinstance(
            rtype, CategoricalDtype):
        return _match_categorical_dtypes(lcol, rcol, how)

    if pd.api.types.is_dtype_equal(ltype, rtype):
        return lcol, rcol

    if isinstance(ltype, cudf.Decimal64Dtype) or isinstance(
            rtype, cudf.Decimal64Dtype):
        raise TypeError(
            "Decimal columns can only be merged with decimal columns "
            "of the same precision and scale")

    if (np.issubdtype(ltype, np.number)) and (np.issubdtype(rtype, np.number)):
        # Same kind: take the larger dtype; mixed kinds: let NumPy promote.
        common_type = (max(ltype, rtype) if ltype.kind == rtype.kind else
                       np.find_common_type([], (ltype, rtype)))

    elif np.issubdtype(ltype, np.datetime64) and np.issubdtype(
            rtype, np.datetime64):
        common_type = max(ltype, rtype)

    if how == "left":
        # For a left join, prefer keeping the left dtype when the right
        # key (with nulls filled by 0) can be cast to it safely.
        if rcol.fillna(0).can_cast_safely(ltype):
            return lcol, rcol.astype(ltype)
        else:
            # Both halves must be f-strings, otherwise the literal text
            # "{common_type}" is emitted instead of the dtype.
            warnings.warn(f"Can't safely cast column from {rtype} to {ltype}, "
                          f"upcasting to {common_type}.")

    return lcol.astype(common_type), rcol.astype(common_type)
Esempio n. 9
0
def _match_join_keys(lcol: ColumnBase, rcol: ColumnBase,
                     how: str) -> Tuple[ColumnBase, ColumnBase]:
    """Cast `lcol` and `rcol` to a common dtype for use as join keys.

    The columns are returned unchanged when their dtypes are already
    equal. Categorical keys, decimal keys, numeric keys and datetime64
    keys each follow their own rule below.

    Raises
    ------
    TypeError
        If the dtypes differ and at least one of them is a decimal dtype.
    """
    ltype = lcol.dtype
    rtype = rcol.dtype

    # --- categorical keys -------------------------------------------------
    left_is_categorical = isinstance(ltype, CategoricalDtype)
    right_is_categorical = isinstance(rtype, CategoricalDtype)

    if left_is_categorical and right_is_categorical:
        return _match_categorical_dtypes_both(
            cast(CategoricalColumn, lcol),
            cast(CategoricalColumn, rcol),
            how,
        )
    if left_is_categorical:
        # Left-style joins keep the left categorical dtype.
        if how in {"left", "leftsemi", "leftanti"}:
            return lcol, rcol.astype(ltype)
        common_type = ltype.categories.dtype
        return lcol.astype(common_type), rcol.astype(common_type)
    if right_is_categorical:
        common_type = rtype.categories.dtype
        return lcol.astype(common_type), rcol.astype(common_type)

    # --- non-categorical keys ---------------------------------------------
    if is_dtype_equal(ltype, rtype):
        return lcol, rcol

    if is_decimal_dtype(ltype) or is_decimal_dtype(rtype):
        raise TypeError(
            "Decimal columns can only be merged with decimal columns "
            "of the same precision and scale")

    both_numeric = (np.issubdtype(ltype, np.number)
                    and np.issubdtype(rtype, np.number))
    common_type = None
    if both_numeric:
        if ltype.kind == rtype.kind:
            common_type = max(ltype, rtype)
        else:
            common_type = np.find_common_type([], (ltype, rtype))
    elif (np.issubdtype(ltype, np.datetime64)
            and np.issubdtype(rtype, np.datetime64)):
        common_type = max(ltype, rtype)

    if how == "left":
        # Keep the left dtype when the right key can be cast to it safely.
        if rcol.fillna(0).can_cast_safely(ltype):
            return lcol, rcol.astype(ltype)
        warnings.warn(f"Can't safely cast column from {rtype} to {ltype}, "
                      f"upcasting to {common_type}.")

    return lcol.astype(common_type), rcol.astype(common_type)
Esempio n. 10
0
def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase:
    """
    Cast given NumericalColumn to given integer dtype safely.
    """
    assert is_integer_dtype(dtype)

    if col.dtype == dtype:
        return col

    new_col = col.astype(dtype)
    if (new_col == col).all():
        return new_col
    else:
        raise TypeError(
            f"Cannot safely cast non-equivalent "
            f"{col.dtype.type.__name__} to {np.dtype(dtype).type.__name__}")
Esempio n. 11
0
    def fillna(self, fill_value=None, method=None):
        """Fill null values with `fill_value`, or by `method` if it is None.

        Parameters
        ----------
        fill_value : scalar or column-like, optional
            Replacement for nulls. A NaT-like value (per
            ``cudf.utils.utils.isnat``) is delegated to `_fillna_natwise`.
            Other scalars are wrapped in a ``cudf.Scalar``; non-scalars
            are converted with ``column.as_column``.
        method : optional
            Forwarded to the parent class ``fillna`` when `fill_value` is
            None.

        Returns
        -------
        The result of ``ColumnBase.fillna`` (or of the parent ``fillna``
        when only `method` is used).
        """
        if fill_value is not None:
            if cudf.utils.utils.isnat(fill_value):
                return _fillna_natwise(self)
            col = self
            if is_scalar(fill_value):
                # `dtype` used to be assigned only in the np.timedelta64
                # branch, so any other plain scalar raised
                # UnboundLocalError below. Default to None so cudf.Scalar
                # receives no explicit dtype in that case.
                dtype = None
                if isinstance(fill_value, np.timedelta64):
                    # Cast both the fill value and the column to the
                    # dtype chosen by `determine_out_dtype`.
                    dtype = determine_out_dtype(self.dtype, fill_value.dtype)
                    fill_value = fill_value.astype(dtype)
                    col = col.astype(dtype)
                if not isinstance(fill_value, cudf.Scalar):
                    fill_value = cudf.Scalar(fill_value, dtype=dtype)
            else:
                fill_value = column.as_column(fill_value, nan_as_null=False)

            return ColumnBase.fillna(col, fill_value)
        else:
            return super().fillna(method=method)