Beispiel #1
0
def dtype_can_compare_equal_to_other(dtype):
    """Report whether values of ``dtype`` may equal values of another dtype.

    String, list, struct, decimal and interval dtypes are treated as
    non-comparable across dtypes: the function returns ``False`` for them
    and ``True`` for every other dtype.
    """
    incomparable_checks = (
        is_string_dtype,
        is_list_dtype,
        is_struct_dtype,
        is_decimal_dtype,
        is_interval_dtype,
    )
    # any() stops at the first matching predicate, like the `or` chain.
    return not any(check(dtype) for check in incomparable_checks)
Beispiel #2
0
def _convert_str_col(col, errors, _downcast=None):
    """
    Convert a string column into a numeric column.

    The column becomes int64 when every string is integer-like. Otherwise,
    after ``_proc_inf_empty_strings`` has been applied, it becomes a float
    column when every string is float-like.

    When some strings are not float-like and ``errors == 'coerce'``, the
    column is parsed with ``stod`` and the non-float-like positions are set
    to null.

    ``_downcast`` is a look-ahead at the ``downcast`` argument of
    ``to_numeric``: when an integer downcast was requested, float-like
    input is parsed as float32 instead of float64 and a ``UserWarning``
    about the limited precision is emitted.

    Parameters
    ----------
    col : The string column to convert, must be string dtype
    errors : {'raise', 'ignore', 'coerce'}, same as ``to_numeric``
    _downcast : Same as ``to_numeric``, see description for use

    Returns
    -------
    Converted numeric column

    Raises
    ------
    TypeError
        If ``col`` is not of string dtype.
    ValueError
        If some strings are not numeric and ``errors`` is not 'coerce'.
    """
    if not is_string_dtype(col):
        raise TypeError("col must be string dtype.")

    # Fast path: every entry parses as an integer.
    if libstrings.is_integer(col).all():
        return col.as_numerical_column(dtype=cudf.dtype("i8"))

    col = _proc_inf_empty_strings(col)
    float_mask = libstrings.is_float(col)

    if not float_mask.all():
        if errors != "coerce":
            raise ValueError("Unable to convert some strings to numerics.")
        # Parse everything, then null out the entries that were not floats.
        col = libcudf.string_casting.stod(col)
        not_float = float_mask.unary_operator("not")
        col[not_float] = None
        return col

    if _downcast in {"unsigned", "signed", "integer"}:
        warnings.warn(
            UserWarning("Downcasting from float to int will be "
                        "limited by float32 precision."))
        return col.as_numerical_column(dtype=cudf.dtype("f"))

    return col.as_numerical_column(dtype=cudf.dtype("d"))
Beispiel #3
0
def to_cudf_compatible_scalar(val, dtype=None):
    """
    Converts the value `val` to a numpy/Pandas scalar,
    optionally casting to `dtype`.

    Values recognised by ``_is_null_host_scalar`` and ``cudf.Scalar``
    instances are returned unchanged, as are ``Decimal`` values. In
    particular, if `val` is None, returns None.

    Parameters
    ----------
    val : scalar or zero-dimensional array
        The value to convert.
    dtype : optional
        Target dtype. A string dtype, or a ``str`` value with no dtype
        given, results in a cast to ``"str"``.

    Returns
    -------
    The converted scalar. datetime64 values with a ``D``/``W``/``M``/``Y``
    unit are converted to ``datetime64[s]``; timedelta64 values with one of
    those units are converted to ``timedelta64[ns]``.

    Raises
    ------
    ValueError
        If `val` is neither a scalar nor a zero-dimensional array.
    """

    if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
            val, cudf.Scalar):
        return val

    if not _is_scalar_or_zero_d_array(val):
        raise ValueError(f"Cannot convert value of type {type(val).__name__} "
                         "to cudf scalar")

    if isinstance(val, Decimal):
        return val

    # Unwrap zero-dimensional arrays into a plain Python scalar.
    if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0:
        val = val.item()

    if ((dtype is None) and isinstance(val, str)) or is_string_dtype(dtype):
        dtype = "str"

    # pd.Timestamp subclasses datetime.datetime and pd.Timedelta subclasses
    # datetime.timedelta, so the pandas types must be tested first.
    # Otherwise they are converted through the stdlib path, which only
    # carries microsecond resolution and drops nanoseconds.
    if isinstance(val, pd.Timestamp):
        val = val.to_datetime64()
    elif isinstance(val, pd.Timedelta):
        val = val.to_timedelta64()
    elif isinstance(val, dt.datetime):
        val = np.datetime64(val)
    elif isinstance(val, dt.timedelta):
        val = np.timedelta64(val)

    # Wrap the value in the numpy/pandas scalar type matching its Python type.
    val = pandas_dtype(type(val)).type(val)

    if dtype is not None:
        val = val.astype(dtype)

    # Replace day-or-coarser units with a finer fixed unit.
    if val.dtype.type is np.datetime64:
        time_unit, _ = np.datetime_data(val.dtype)
        if time_unit in ("D", "W", "M", "Y"):
            val = val.astype("datetime64[s]")
    elif val.dtype.type is np.timedelta64:
        time_unit, _ = np.datetime_data(val.dtype)
        if time_unit in ("D", "W", "M", "Y"):
            val = val.astype("timedelta64[ns]")

    return val
Beispiel #4
0
def _get_non_empty_data(s):
    """Build two-element placeholder data matching the dtype of ``s``.

    * Categorical column: a categorical column with two zero codes that
      reuses the column's categories (or ``[UNKNOWN_CATEGORIES]`` when it
      has none) and its ``ordered`` flag.
    * String dtype: the result of ``pa.array(["cat", "dog"])``.
    * Numeric dtype: a column holding ``0, 1`` created directly in
      ``s.dtype``.
    * Anything else: an int64 column holding ``0, 1`` cast to ``s.dtype``.
    """
    column = s._column

    if isinstance(column, cudf.core.column.CategoricalColumn):
        if len(column.categories):
            categories = column.categories
        else:
            categories = [UNKNOWN_CATEGORIES]
        return cudf.core.column.build_categorical_column(
            categories=categories,
            codes=cudf.core.column.full(size=2, fill_value=0, dtype="int32"),
            ordered=column.ordered,
        )

    if is_string_dtype(s.dtype):
        return pa.array(["cat", "dog"])

    if pd.api.types.is_numeric_dtype(s.dtype):
        return cudf.core.column.as_column(
            cp.arange(start=0, stop=2, dtype=s.dtype))

    # Non-numeric dtypes are created as int64 first and then cast.
    int_column = cudf.core.column.as_column(
        cp.arange(start=0, stop=2, dtype="int64"))
    return int_column.astype(s.dtype)
Beispiel #5
0
def test_is_string_dtype(obj, expect):
    """Check that ``types.is_string_dtype(obj)`` equals ``expect``."""
    result = types.is_string_dtype(obj)
    assert result == expect
Beispiel #6
0
def test_pandas_agreement(obj):
    """Check that each predicate in ``types`` matches its ``ptypes`` twin.

    The predicates are compared in a fixed order and the test stops at the
    first disagreement.
    """
    predicate_names = (
        "is_categorical_dtype",
        "is_numeric_dtype",
        "is_integer_dtype",
        "is_integer",
        "is_string_dtype",
    )
    for name in predicate_names:
        assert getattr(types, name)(obj) == getattr(ptypes, name)(obj)
Beispiel #7
0
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument into numerical types.

    Parameters
    ----------
    arg : column-convertible
        The object to convert to numeric types. It must have at most one
        dimension.
    errors : {'raise', 'ignore', 'coerce'}, defaults 'raise'
        Policy for strings that cannot be parsed as numbers.

        * 'raise' propagates the parsing error to the caller.
        * 'ignore' gives up on the conversion and returns ``arg`` as is.
        * 'coerce' replaces the invalid values with nulls.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, defaults None
        If set, the parsed result is cast to the first dtype of the
        matching set below that is no wider than the parsed dtype and that
        the values can be cast to safely:

        * {'integer', 'signed'}: all integer types greater or equal to
          `np.int8`
        * {'unsigned'}: all unsigned types greater or equal to `np.uint8`
        * {'float'}: all floating types greater or equal to `np.float32`

        Downcasting happens after parsing and is independent of it:
        errors raised while downcasting are not affected by ``errors``.

    Returns
    -------
    Series or ndarray
        A Series when a Series is passed in, otherwise an ndarray. When
        the ndarray result contains nulls it is returned as a floating
        type with the nulls replaced by nan.

    Raises
    ------
    ValueError
        If ``errors`` or ``downcast`` is not one of the accepted values,
        if ``arg`` is not column convertible or has more than one
        dimension, if the data is of a nested or unrecognized dtype, or if
        string parsing fails and ``errors`` is not 'ignore'.

    Notes
    -----
    An important difference from pandas is that this function does not accept
    mixed numeric/non-numeric type sequences. For example ``[1, 'a']``.
    A ``TypeError`` will be raised when such input is received, regardless of
    ``errors`` parameter.

    Examples
    --------
    >>> s = cudf.Series(['1', '2.0', '3e3'])
    >>> cudf.to_numeric(s)
    0       1.0
    1       2.0
    2    3000.0
    dtype: float64
    >>> cudf.to_numeric(s, downcast='float')
    0       1.0
    1       2.0
    2    3000.0
    dtype: float32
    >>> cudf.to_numeric(s, downcast='signed')
    0       1
    1       2
    2    3000
    dtype: int16
    >>> s = cudf.Series(['apple', '1.0', '3e3'])
    >>> cudf.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2      3e3
    dtype: object
    >>> cudf.to_numeric(s, errors='coerce')
    0      <NA>
    1       1.0
    2    3000.0
    dtype: float64
    """

    # --- argument validation -------------------------------------------
    if errors not in {"raise", "ignore", "coerce"}:
        raise ValueError("invalid error value specified")

    if downcast not in {None, "integer", "signed", "unsigned", "float"}:
        raise ValueError("invalid downcasting method provided")

    if (not can_convert_to_column(arg)
            or (hasattr(arg, "ndim") and arg.ndim > 1)):
        raise ValueError("arg must be column convertible")

    col = as_column(arg)
    dtype = col.dtype

    # --- convert to a numeric column, depending on the input dtype ------
    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
        col = col.as_numerical_column(cudf.dtype("int64"))
    elif is_categorical_dtype(dtype):
        cat_dtype = col.dtype.type
        if _is_non_decimal_numeric_dtype(cat_dtype):
            col = col.as_numerical_column(cat_dtype)
        else:
            # Other categories are decategorized and parsed as strings.
            try:
                col = _convert_str_col(col._get_decategorized_column(),
                                       errors, downcast)
            except ValueError:
                if errors != "ignore":
                    raise
                return arg
    elif is_string_dtype(dtype):
        try:
            col = _convert_str_col(col, errors, downcast)
        except ValueError:
            if errors != "ignore":
                raise
            return arg
    elif is_list_dtype(dtype) or is_struct_dtype(dtype):
        raise ValueError("Input does not support nested datatypes")
    elif not _is_non_decimal_numeric_dtype(dtype):
        raise ValueError("Unrecognized datatype")

    # A float32 result (produced by the string conversion when an integer
    # downcast was requested) is widened to float64 before downcasting.
    if col.dtype == cudf.dtype("f"):
        col = col.as_numerical_column("d")

    # --- optional downcast -----------------------------------------------
    if downcast:
        float_codes = list(np.typecodes["Float"])
        first_float = float_codes.index(cudf.dtype(np.float32).char)
        candidates_by_kind = {
            "integer": list(np.typecodes["Integer"]),
            "signed": list(np.typecodes["Integer"]),
            "unsigned": list(np.typecodes["UnsignedInteger"]),
            # Only float32 and the codes listed after it are candidates.
            "float": float_codes[first_float:],
        }

        for code in candidates_by_kind[downcast]:
            target = cudf.dtype(code)
            not_wider = target.itemsize <= col.dtype.itemsize
            if not_wider and col.can_cast_safely(target):
                col = libcudf.unary.cast(col, target)
                break

    # --- build the return value ------------------------------------------
    if isinstance(arg, (cudf.Series, pd.Series)):
        return cudf.Series(col)

    if col.has_nulls():
        # To match pandas, always return a floating type filled with nan.
        col = col.astype(float).fillna(np.nan)
    return col.values
Beispiel #8
0
def _is_string_dtype(obj):
    """Return whether ``obj`` is a string dtype.

    Delegates to ``is_string_dtype`` when ``HAS_GPU`` is truthy and to
    ``pd.api.types.is_string_dtype`` otherwise.
    """
    checker = is_string_dtype if HAS_GPU else pd.api.types.is_string_dtype
    return checker(obj)