Beispiel #1
def dtype_can_compare_equal_to_other(dtype):
    """Return whether values of ``dtype`` can equal values of another dtype.

    Parameters
    ----------
    dtype
        The dtype to classify; it is passed unchanged to each ``is_*_dtype``
        predicate below.

    Returns
    -------
    bool
        ``False`` when ``dtype`` is a string, list, struct, decimal or
        interval dtype, ``True`` otherwise.
    """
    # NOTE(review): the first operand of this chain was missing from the
    # extracted snippet (the expression began with a dangling ``or``);
    # ``is_string_dtype`` is restored here -- confirm against upstream.
    return not (
        is_string_dtype(dtype)
        or is_list_dtype(dtype)
        or is_struct_dtype(dtype)
        or is_decimal_dtype(dtype)
        or is_interval_dtype(dtype)
    )
Beispiel #2
def _convert_str_col(col, errors, _downcast=None):
    """
    Converts a string column to numeric column

    Converts to integer column if all strings are integer-like
    (``is_integer.all()``). Otherwise, converts to float column if all
    strings are float-like (``is_float.all()``).

    If ``errors == 'coerce'``, fill non-numeric strings with null.

    Looks ahead to ``downcast`` parameter, if the float may be casted to
    integer, then only process in float32 pipeline.

    Parameters
    ----------
    col : The string column to convert, must be string dtype
    errors : {'raise', 'ignore', 'coerce'}, same as ``to_numeric``
    _downcast : Same as ``to_numeric``, see description for use

    Returns
    -------
    Converted numeric column

    Raises
    ------
    TypeError
        If ``col`` is not of string dtype.
    ValueError
        If some strings are not float-like and ``errors`` is not
        ``'coerce'``.
    """
    # Imported locally so the block does not depend on a module-level import.
    import warnings

    if not is_string_dtype(col):
        raise TypeError("col must be string dtype.")

    is_integer = libstrings.is_integer(col)
    if is_integer.all():
        return col.as_numerical_column(dtype=cudf.dtype("i8"))

    col = _proc_inf_empty_strings(col)

    is_float = libstrings.is_float(col)
    if is_float.all():
        if _downcast in {"unsigned", "signed", "integer"}:
            # The caller intends to downcast to an integer type afterwards,
            # so the strings are parsed as float32 rather than float64.
            warnings.warn(
                UserWarning("Downcasting from float to int will be "
                            "limited by float32 precision."))
            return col.as_numerical_column(dtype=cudf.dtype("f"))
        return col.as_numerical_column(dtype=cudf.dtype("d"))

    # At least one string is not float-like.
    if errors == "coerce":
        col = libcudf.string_casting.stod(col)
        non_numerics = is_float.unary_operator("not")
        # Null out every position whose source string was not float-like.
        col[non_numerics] = None
        return col
    raise ValueError("Unable to convert some strings to numerics.")
Beispiel #3
def to_cudf_compatible_scalar(val, dtype=None):
    """
    Converts the value `val` to a numpy/Pandas scalar,
    optionally casting to `dtype`.

    If `val` is None, returns None.

    Parameters
    ----------
    val
        A scalar or zero-dimensional array. Null host scalars,
        ``cudf.Scalar`` instances and ``Decimal`` values are returned
        unchanged.
    dtype : optional
        Target dtype. Forced to ``"str"`` when it is a string dtype, or when
        it is ``None`` and ``val`` is a ``str``.

    Returns
    -------
    The converted scalar, or ``val`` itself for the pass-through cases above.

    Raises
    ------
    ValueError
        If ``val`` is neither a scalar nor a zero-dimensional array.
    """
    if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
            val, cudf.Scalar):
        return val

    if not _is_scalar_or_zero_d_array(val):
        raise ValueError(f"Cannot convert value of type {type(val).__name__} "
                         "to cudf scalar")

    if isinstance(val, Decimal):
        return val

    if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0:
        val = val.item()

    if ((dtype is None) and isinstance(val, str)) or is_string_dtype(dtype):
        dtype = "str"

    # pd.Timestamp / pd.Timedelta subclass datetime.datetime /
    # datetime.timedelta, so they must be tested first; otherwise their
    # branches are unreachable and the value is routed through
    # np.datetime64(...) / np.timedelta64(...) instead of the pandas
    # converters.
    if isinstance(val, pd.Timestamp):
        val = val.to_datetime64()
    elif isinstance(val, pd.Timedelta):
        val = val.to_timedelta64()
    elif isinstance(val, dt.datetime):
        val = np.datetime64(val)
    elif isinstance(val, dt.timedelta):
        val = np.timedelta64(val)

    # Wrap the Python value in the matching numpy scalar type.
    val = pandas_dtype(type(val)).type(val)

    if dtype is not None:
        val = val.astype(dtype)

    # Day-or-coarser units are converted to a finer fixed unit:
    # seconds for datetimes, nanoseconds for timedeltas.
    if val.dtype.type is np.datetime64:
        time_unit, _ = np.datetime_data(val.dtype)
        if time_unit in ("D", "W", "M", "Y"):
            val = val.astype("datetime64[s]")
    elif val.dtype.type is np.timedelta64:
        time_unit, _ = np.datetime_data(val.dtype)
        if time_unit in ("D", "W", "M", "Y"):
            val = val.astype("timedelta64[ns]")

    return val
Beispiel #4
def _get_non_empty_data(s):
    """Build a two-element placeholder column matching the dtype of ``s``.

    Parameters
    ----------
    s
        Object exposing ``_column`` and ``dtype``
        (presumably a ``cudf.Series`` -- TODO confirm).

    Returns
    -------
    A categorical column, a two-element pyarrow string array, or a numeric
    column cast to ``s.dtype``, depending on the dtype of ``s``.
    """
    if isinstance(s._column, cudf.core.column.CategoricalColumn):
        # Fall back to a single placeholder category when none are known.
        categories = (s._column.categories
                      if len(s._column.categories) else [UNKNOWN_CATEGORIES])
        codes = cudf.core.column.full(size=2, fill_value=0, dtype="int32")
        ordered = s._column.ordered
        data = cudf.core.column.build_categorical_column(
            categories=categories, codes=codes, ordered=ordered)
    elif is_string_dtype(s.dtype):
        data = pa.array(["cat", "dog"])
    elif pd.api.types.is_numeric_dtype(s.dtype):
        data = cudf.core.column.as_column(
            cp.arange(start=0, stop=2, dtype=s.dtype))
    else:
        # Non-numeric dtypes are built as int64 first and then cast.
        data = cudf.core.column.as_column(
            cp.arange(start=0, stop=2, dtype="int64")).astype(s.dtype)
    return data
Beispiel #5
def test_is_string_dtype(obj, expect):
    """Check that ``types.is_string_dtype(obj)`` equals ``expect``."""
    actual = types.is_string_dtype(obj)
    assert actual == expect
Beispiel #6
def test_pandas_agreement(obj):
    """Check that ``types`` and ``ptypes`` classify ``obj`` identically."""
    # Predicates are compared in the same order as the original assertions.
    predicate_names = (
        "is_categorical_dtype",
        "is_numeric_dtype",
        "is_integer_dtype",
        "is_integer",
        "is_string_dtype",
    )
    for predicate_name in predicate_names:
        ours = getattr(types, predicate_name)
        theirs = getattr(ptypes, predicate_name)
        assert ours(obj) == theirs(obj)
Beispiel #7
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument into numerical types.

    Parameters
    ----------
    arg : column-convertible
        The object to convert to numeric types
    errors : {'raise', 'ignore', 'coerce'}, defaults 'raise'
        Policy to handle errors during parsing.

        * 'raise' will notify user all errors encountered.
        * 'ignore' will skip error and returns ``arg``.
        * 'coerce' will leave invalid values as nulls.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, defaults None
        If set, will try to down-convert the datatype of the
        parsed results to smallest possible type. For each `downcast`
        type, this method will determine the smallest possible
        dtype from the following sets:

        * {'integer', 'signed'}: all integer types greater or equal to
          `np.int8`
        * {'unsigned'}: all unsigned types greater or equal to `np.uint8`
        * {'float'}: all floating types greater or equal to `np.float32`

        Note that downcast behavior is decoupled from parsing. Errors
        encountered during downcast is raised regardless of ``errors``
        parameter.

    Returns
    -------
    Series or ndarray
        Depending on the input, if series is passed in, series is returned,
        otherwise ndarray

    Raises
    ------
    ValueError
        If ``errors`` or ``downcast`` is not one of the accepted values, if
        ``arg`` is not column convertible or has more than one dimension,
        if the input has a nested or unrecognized dtype, or if string
        parsing fails and ``errors`` is ``'raise'``.

    Notes
    -----
    An important difference from pandas is that this function does not accept
    mixed numeric/non-numeric type sequences. For example ``[1, 'a']``.
    A ``TypeError`` will be raised when such input is received, regardless of
    ``errors`` parameter.

    Examples
    --------
    >>> s = cudf.Series(['1', '2.0', '3e3'])
    >>> cudf.to_numeric(s)
    0       1.0
    1       2.0
    2    3000.0
    dtype: float64
    >>> cudf.to_numeric(s, downcast='float')
    0       1.0
    1       2.0
    2    3000.0
    dtype: float32
    >>> cudf.to_numeric(s, downcast='signed')
    0       1
    1       2
    2    3000
    dtype: int16
    >>> s = cudf.Series(['apple', '1.0', '3e3'])
    >>> cudf.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2      3e3
    dtype: object
    >>> cudf.to_numeric(s, errors='coerce')
    0      <NA>
    1       1.0
    2    3000.0
    dtype: float64
    """
    if errors not in {"raise", "ignore", "coerce"}:
        raise ValueError("invalid error value specified")

    if downcast not in {None, "integer", "signed", "unsigned", "float"}:
        raise ValueError("invalid downcasting method provided")

    if not can_convert_to_column(arg) or (hasattr(arg, "ndim")
                                          and arg.ndim > 1):
        raise ValueError("arg must be column convertible")

    col = as_column(arg)
    dtype = col.dtype

    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
        col = col.as_numerical_column(cudf.dtype("int64"))
    elif is_categorical_dtype(dtype):
        cat_dtype = col.dtype.type
        if _is_non_decimal_numeric_dtype(cat_dtype):
            col = col.as_numerical_column(cat_dtype)
        else:
            try:
                col = _convert_str_col(col._get_decategorized_column(), errors,
                                       downcast)
            except ValueError:
                # 'ignore' hands the untouched input back to the caller.
                if errors == "ignore":
                    return arg
                raise
    elif is_string_dtype(dtype):
        try:
            col = _convert_str_col(col, errors, downcast)
        except ValueError:
            if errors == "ignore":
                return arg
            raise
    elif is_list_dtype(dtype) or is_struct_dtype(dtype):
        raise ValueError("Input does not support nested datatypes")
    elif _is_non_decimal_numeric_dtype(dtype):
        # Already numeric: nothing to parse.
        pass
    else:
        raise ValueError("Unrecognized datatype")

    # A float32 column (e.g. produced by _convert_str_col when an integer
    # downcast was requested) is widened to float64 before downcasting.
    if col.dtype == cudf.dtype("f"):
        col = col.as_numerical_column("d")

    if downcast:
        downcast_type_map = {
            "integer": list(np.typecodes["Integer"]),
            "signed": list(np.typecodes["Integer"]),
            "unsigned": list(np.typecodes["UnsignedInteger"]),
        }
        # Float candidates start at float32; anything listed before it in
        # np.typecodes["Float"] is skipped.
        float_types = list(np.typecodes["Float"])
        idx = float_types.index(cudf.dtype(np.float32).char)
        downcast_type_map["float"] = float_types[idx:]

        type_set = downcast_type_map[downcast]

        for t in type_set:
            downcast_dtype = cudf.dtype(t)
            if downcast_dtype.itemsize <= col.dtype.itemsize:
                if col.can_cast_safely(downcast_dtype):
                    col = libcudf.unary.cast(col, downcast_dtype)
                    # Stop at the first (smallest) type that fits; later
                    # candidates could only re-cast to the same width.
                    break

    if isinstance(arg, (cudf.Series, pd.Series)):
        return cudf.Series(col)

    if col.has_nulls():
        # To match pandas, always return a floating type filled with nan.
        col = col.astype(float).fillna(np.nan)
    return col.values
Beispiel #8
def _is_string_dtype(obj):
    """Return whether ``obj`` is a string dtype.

    Uses ``pd.api.types.is_string_dtype`` when ``HAS_GPU`` is falsy and the
    ``is_string_dtype`` already in scope otherwise.
    """
    if not HAS_GPU:
        return pd.api.types.is_string_dtype(obj)
    return is_string_dtype(obj)