Beispiel #1
0
def scalar_broadcast_to(scalar, size, dtype=None):
    """Repeat a single value ``size`` times on the device.

    Parameters
    ----------
    scalar : object
        Value to repeat. ``None`` yields a masked empty column; a
        ``pd.Categorical`` is handled by broadcasting its first category and
        casting the result to ``dtype``.
    size : int, tuple or list
        Number of elements. For a tuple or list only the first entry is used.
    dtype : optional
        Requested dtype. A ``str`` scalar with no dtype (or a string dtype)
        is treated as ``"object"``; otherwise the dtype is taken from the
        converted scalar.

    Returns
    -------
    For object dtype, the result of indexing a one-element strings column
    with an all-zero ``int32`` gather map; otherwise an ``rmm`` device array
    filled with the scalar.
    """
    from cudf.utils.cudautils import fill_value
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    # A shape-like size contributes only its leading dimension.
    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        null_dtype = "object" if dtype is None else dtype
        return column_empty(size, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        first_category = scalar.categories[0]
        return scalar_broadcast_to(first_category, size).astype(dtype)

    treat_as_string = isinstance(scalar, str) and (
        is_string_dtype(dtype) or dtype is None
    )
    if treat_as_string:
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) != np.dtype("object"):
        device_buf = rmm.device_array((size, ), dtype=dtype)
        # Nothing to fill for a zero-length buffer.
        if device_buf.size != 0:
            fill_value(device_buf, scalar)
        return device_buf

    import nvstrings
    from cudf.core.column import as_column
    from cudf.utils.cudautils import zeros

    # Gather element 0 of a one-element strings column ``size`` times.
    zero_indices = zeros(size, dtype="int32")
    single_str_col = as_column(nvstrings.to_device([scalar]))
    return single_str_col[zero_indices]
Beispiel #2
0
def test_full_df(num_rows, tmpdir, distro):
    """Generate a full dataset and validate its size, width and categoricals.

    The files written by ``DatasetGen.full_df_create`` are read back and
    concatenated. The test then checks the total row count, the column count
    of the last file read, and per categorical column the cardinality and the
    minimum/maximum entry length. The first categorical column is checked
    through the host values of its column's ``elements``.
    """
    json_sample["num_rows"] = num_rows
    cats = list(json_sample["cats"].keys())
    cols = datagen._get_cols_from_schema(json_sample, distros=distro)

    df_gen = datagen.DatasetGen(datagen.UniformDistro(), gpu_frac=0.00001)
    df_files = df_gen.full_df_create(
        num_rows, cols, entries=True, output=tmpdir
    )

    rows_seen = 0
    full_df = cudf.DataFrame()
    for path in df_files:
        df = cudf.read_parquet(path)
        rows_seen += df.shape[0]
        full_df = cudf.concat([full_df, df])
    assert rows_seen == num_rows

    conts_rep = cols["conts"]
    cats_rep = cols["cats"]
    labels_rep = cols["labels"]
    assert df.shape[1] == len(conts_rep) + len(cats_rep) + len(labels_rep)

    # Every categorical column after the first is checked directly.
    for pos, cat in enumerate(cats[1:], start=1):
        rep = cats_rep[pos]
        dist = rep.distro or df_gen.dist
        series = full_df[cat]
        if not is_string_dtype(series._column):
            sts, _ = dist.verify(series.to_pandas())
            assert all(s > 0.9 for s in sts)
        assert series.nunique() == rep.cardinality
        assert series.str.len().min() == rep.min_entry_size
        assert series.str.len().max() == rep.max_entry_size

    # The first categorical column is inspected via its elements' host values.
    first_rep = cats_rep[0]
    check_ser = cudf.Series(full_df[cats[0]]._column.elements.values_host)
    assert check_ser.nunique() == first_rep.cardinality
    assert check_ser.str.len().min() == first_rep.min_entry_size
    assert check_ser.str.len().max() == first_rep.max_entry_size
Beispiel #3
0
def scalar_broadcast_to(scalar, size, dtype=None):
    """Build a column holding ``scalar`` repeated ``size`` times.

    Parameters
    ----------
    scalar : object
        Value to repeat. ``None`` yields a masked empty column; a
        ``pd.Categorical`` is handled by broadcasting its first category and
        casting the result to ``dtype``.
    size : int, tuple or list
        Number of elements. For a tuple or list only the first entry is used.
    dtype : optional
        Requested dtype. A ``str`` scalar with no dtype (or a string dtype)
        is treated as ``"object"``; otherwise the dtype is taken from the
        converted scalar.

    Returns
    -------
    For object dtype, the result of indexing a one-element ``"str"`` column
    with an all-zero ``int32`` gather map; otherwise a column from
    ``column_empty`` whose data view has been assigned the scalar.
    """
    from cudf.utils.dtypes import to_cudf_compatible_scalar, is_string_dtype
    from cudf.core.column import column_empty

    # A shape-like size contributes only its leading dimension.
    if isinstance(size, (tuple, list)):
        size = size[0]

    if scalar is None:
        null_dtype = "object" if dtype is None else dtype
        return column_empty(size, dtype=null_dtype, masked=True)

    if isinstance(scalar, pd.Categorical):
        first_category = scalar.categories[0]
        return scalar_broadcast_to(first_category, size).astype(dtype)

    treat_as_string = isinstance(scalar, str) and (
        is_string_dtype(dtype) or dtype is None
    )
    if treat_as_string:
        dtype = "object"
    else:
        scalar = to_cudf_compatible_scalar(scalar, dtype=dtype)
        dtype = scalar.dtype

    if np.dtype(dtype) != np.dtype("object"):
        filled = column_empty(size, dtype=dtype)
        # Nothing to assign for a zero-length column.
        if filled.size != 0:
            filled.data_array_view[:] = scalar
        return filled

    from cudf.core.column import as_column

    # Gather element 0 of a one-element strings column ``size`` times.
    zero_indices = cupy.zeros(size, dtype="int32")
    single_str_col = as_column([scalar], dtype="str")
    return single_str_col[zero_indices]
Beispiel #4
0
    def _fill(self, fill_value, begin=0, end=-1, inplace=False):
        if end <= begin or begin >= self.size:
            return self if inplace else self.copy()

        if is_categorical_dtype(self.dtype):
            return self._fill_categorical(fill_value, begin, end, inplace)

        fill_scalar = Scalar(fill_value, self.dtype)

        if not inplace:
            return libcudfxx.filling.fill(self, begin, end, fill_scalar)

        if is_string_dtype(self.dtype):
            return self._mimic_inplace(
                libcudfxx.filling.fill(self, begin, end, fill_scalar),
                inplace=True,
            )

        if fill_value is None and not self.nullable:
            mask = create_null_mask(self.size, state=MaskState.ALL_VALID)
            self.set_base_mask(mask)

        libcudfxx.filling.fill_in_place(self, begin, end, fill_scalar)

        return self
Beispiel #5
0
def _convert_str_col(col, errors, _downcast=None):
    """
    Convert a string column to a numeric column.

    If every string is integer-like the column is converted to ``int64``.
    Otherwise, after ``_proc_inf_empty_strings`` has been applied, a column
    whose strings are all float-like is converted to a float column:
    ``float32`` when ``_downcast`` requests an integer downcast (a
    ``UserWarning`` is issued about the precision limit), ``float64``
    otherwise.

    When some strings are not float-like and ``errors == 'coerce'``, the
    column is converted with ``stod`` and the non-float-like positions are
    set to null.

    Parameters
    ----------
    col : The string column to convert, must be string dtype
    errors : {'raise', 'ignore', 'coerce'}, same as ``to_numeric``
    _downcast : Same as ``to_numeric``; only consulted to pick the float
        width as described above

    Returns
    -------
    Converted numeric column

    Raises
    ------
    TypeError
        If ``col`` is not of string dtype.
    ValueError
        If some strings are not numeric and ``errors`` is not ``'coerce'``.
    """
    if not is_string_dtype(col):
        raise TypeError("col must be string dtype.")

    if col.str().isinteger().all():
        return col.as_numerical_column(dtype=np.dtype("i8"))

    col = _proc_inf_empty_strings(col)
    float_like = col.str().isfloat()

    if not float_like.all():
        if errors != "coerce":
            raise ValueError("Unable to convert some strings to numerics.")
        converted = libcudf.string_casting.stod(col)
        not_numeric = float_like.unary_operator("not")
        converted[not_numeric] = None
        return converted

    if _downcast in {"unsigned", "signed", "integer"}:
        warnings.warn(
            UserWarning("Downcasting from float to int will be "
                        "limited by float32 precision."))
        return col.as_numerical_column(dtype=np.dtype("f"))

    return col.as_numerical_column(dtype=np.dtype("d"))
Beispiel #6
0
def _get_non_empty_data(s):
    """Return two placeholder elements matching the dtype of ``s``.

    * Categorical column: a categorical column with codes ``[0, 0]`` that
      reuses the existing categories (or ``[UNKNOWN_CATEGORIES]`` when there
      are none) and the existing ``ordered`` flag.
    * String dtype: a pyarrow array ``["cat", "dog"]``.
    * Numeric dtype (per pandas): a column of ``arange(0, 2)`` in that dtype.
    * Anything else: an ``int64`` ``arange(0, 2)`` column cast to the dtype.
    """
    if isinstance(s._column, cudf.core.column.CategoricalColumn):
        cat_col = s._column
        if len(cat_col.categories):
            categories = cat_col.categories
        else:
            categories = [UNKNOWN_CATEGORIES]
        codes = column.full(size=2, fill_value=0, dtype="int32")
        return column.build_categorical_column(
            categories=categories, codes=codes, ordered=cat_col.ordered
        )

    if is_string_dtype(s.dtype):
        return pa.array(["cat", "dog"])

    if pd.api.types.is_numeric_dtype(s.dtype):
        return column.as_column(cp.arange(start=0, stop=2, dtype=s.dtype))

    as_int64 = column.as_column(cp.arange(start=0, stop=2, dtype="int64"))
    return as_int64.astype(s.dtype)
Beispiel #7
0
def _nonempty_series(s, idx=None):
    """Build a two-element ``cudf.Series`` with the same dtype and name as ``s``.

    Parameters
    ----------
    s : series-like
        Source of the dtype, name and (for categoricals) categories/ordering.
    idx : optional
        Index for the result; when ``None`` it is derived from ``s.index``
        via ``_nonempty_index``.

    Returns
    -------
    cudf.Series
        Categorical data with codes ``[0, 0]`` (categories fall back to
        ``["a"]`` when empty), ``["cat", "dog"]`` for string dtype, or
        ``np.arange(0, 2)`` in the source dtype otherwise.
    """
    index = _nonempty_index(s.index) if idx is None else idx
    dtype = s.dtype

    if is_categorical_dtype(dtype):
        cat_col = s._column
        data = column.build_categorical_column(
            categories=cat_col.categories if len(cat_col.categories) else ["a"],
            codes=[0, 0],
            ordered=cat_col.ordered,
        )
    elif is_string_dtype(dtype):
        data = ["cat", "dog"]
    else:
        data = np.arange(start=0, stop=2, dtype=dtype)

    return cudf.Series(data, name=s.name, index=index)
Beispiel #8
0
    def _mimic_inplace(self, result, inplace=False):
        """
        If `inplace=True`, used to mimic an inplace operation
        by replacing data in ``self`` with data in ``result``.

        Otherwise, returns ``result`` unchanged.
        """
        if inplace:
            self.data = result.data
            self.mask = result.mask
            self.dtype = result.dtype
            self.size = result.size
            self.offset = result.offset
            if hasattr(result, "children"):
                self.children = result.children
                if is_string_dtype(self):
                    # force recomputation of nvstrings/nvcategory
                    self._nvstrings = None
                    self._nvcategory = None
            self.__class__ = result.__class__
        else:
            return result
Beispiel #9
0
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert the argument to a numeric type.

    Parameters
    ----------
    arg : column-convertible
        The object to convert to numeric types
    errors : {'raise', 'ignore', 'coerce'}, defaults 'raise'
        How parsing failures are handled.

        * 'raise' propagates the error to the caller.
        * 'ignore' returns ``arg`` unchanged.
        * 'coerce' replaces invalid values with nulls.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, defaults None
        When given, the parsed result is cast to the first dtype in the
        matching candidate list that is no wider than the current dtype and
        that the column can be cast to safely:

        * {'integer', 'signed'}: integer types starting at `np.int8`
        * {'unsigned'}: unsigned types starting at `np.uint8`
        * {'float'}: floating types starting at `np.float32`

        Downcasting is independent of parsing: an error raised while
        downcasting is not affected by the ``errors`` parameter.

    Returns
    -------
    Series or ndarray
        A series when a series is passed in, otherwise an ndarray.

    Raises
    ------
    ValueError
        If ``errors`` or ``downcast`` is not one of the accepted values, if
        ``arg`` is not column convertible or has more than one dimension, if
        the input has a nested (list/struct) or unrecognized dtype, or if
        string parsing fails and ``errors`` is not ``'ignore'``.

    Notes
    -----
    An important difference from pandas is that this function does not accept
    mixed numeric/non-numeric type sequences. For example ``[1, 'a']``.
    A ``TypeError`` will be raised when such input is received, regardless of
    ``errors`` parameter.

    Examples
    --------
    >>> s = cudf.Series(['1', '2.0', '3e3'])
    >>> cudf.to_numeric(s)
    0       1.0
    1       2.0
    2    3000.0
    dtype: float64
    >>> cudf.to_numeric(s, downcast='float')
    0       1.0
    1       2.0
    2    3000.0
    dtype: float32
    >>> cudf.to_numeric(s, downcast='signed')
    0       1
    1       2
    2    3000
    dtype: int16
    >>> s = cudf.Series(['apple', '1.0', '3e3'])
    >>> cudf.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2      3e3
    dtype: object
    >>> cudf.to_numeric(s, errors='coerce')
    0      <NA>
    1       1.0
    2    3000.0
    dtype: float64
    """

    if errors not in {"raise", "ignore", "coerce"}:
        raise ValueError("invalid error value specified")

    if downcast not in {None, "integer", "signed", "unsigned", "float"}:
        raise ValueError("invalid downcasting method provided")

    # Only one-dimensional, column-convertible input is accepted.
    acceptable = can_convert_to_column(arg) and not (
        hasattr(arg, "ndim") and arg.ndim > 1
    )
    if not acceptable:
        raise ValueError("arg must be column convertible")

    col = as_column(arg)
    dtype = col.dtype

    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
        col = col.as_numerical_column(np.dtype("int64"))
    elif is_categorical_dtype(dtype):
        cat_dtype = col.dtype.type
        if is_numerical_dtype(cat_dtype):
            col = col.as_numerical_column(cat_dtype)
        else:
            try:
                decategorized = col._get_decategorized_column()
                col = _convert_str_col(decategorized, errors, downcast)
            except ValueError:
                if errors == "ignore":
                    return arg
                raise
    elif is_string_dtype(dtype):
        try:
            col = _convert_str_col(col, errors, downcast)
        except ValueError:
            if errors == "ignore":
                return arg
            raise
    elif is_list_dtype(dtype) or is_struct_dtype(dtype):
        raise ValueError("Input does not support nested datatypes")
    elif not is_numerical_dtype(dtype):
        raise ValueError("Unrecognized datatype")

    # Any float32 column is widened to float64 before the downcast step.
    if col.dtype == np.dtype("f"):
        col = col.as_numerical_column("d")

    if downcast:
        float_codes = list(np.typecodes["Float"])
        first_float = float_codes.index(np.dtype(np.float32).char)
        candidates_by_mode = {
            "integer": list(np.typecodes["Integer"]),
            "signed": list(np.typecodes["Integer"]),
            "unsigned": list(np.typecodes["UnsignedInteger"]),
            "float": float_codes[first_float:],
        }

        # Take the first candidate that is no wider than the current dtype
        # and that the column can be cast to safely.
        for code in candidates_by_mode[downcast]:
            target = np.dtype(code)
            fits = target.itemsize <= col.dtype.itemsize
            if fits and col.can_cast_safely(target):
                col = libcudf.unary.cast(col, target)
                break

    if isinstance(arg, (cudf.Series, pd.Series)):
        return cudf.Series(col)

    # Non-series input: nulls are replaced by the column's default NA value
    # before the values are returned.
    col = col.fillna(col.default_na_value())
    return col.values