Exemple #1
0
    def run(self, df, **launch_params):
        """
        Launch the kernel against ``df`` and return a copy with the outputs.

        Parameters
        ----------
        df
            Frame supplying the input columns; it is copied, not modified.
        **launch_params
            Forwarded unchanged to ``self.launch_kernel``.

        Returns
        -------
        A copy of ``df`` with one column added (or replaced) for every key
        of ``self.outcols``.
        """
        # Device views of the inputs, keyed by kernel argument name.  When
        # ``incols`` is a dict it maps frame column name -> argument name.
        if isinstance(self.incols, dict):
            in_views = {
                arg_name: df[col_name]._column.data_array_view
                for col_name, arg_name in self.incols.items()
            }
        else:
            in_views = {
                col_name: df[col_name]._column.data_array_view
                for col_name in self.incols
            }

        # One freshly allocated buffer per declared output column.
        out_views = {
            name: column.column_empty(len(df), dtype, False).data_array_view
            for name, dtype in self.outcols.items()
        }

        # Later sources win on a name clash: inputs < outputs < kwargs.
        call_args = dict(in_views)
        call_args.update(out_views)
        call_args.update(self.kwargs)
        bound = self.sig.bind(**call_args)

        self.launch_kernel(df, bound.args, **launch_params)

        # With pessimistic nulls, every output column gets the aggregate
        # null mask computed from the input columns.
        null_mask = (
            make_aggregate_nullmask(df, columns=self.incols)
            if self.pessimistic_nulls
            else None
        )

        result = df.copy()
        for name in sorted(self.outcols):
            result[name] = Series(
                out_views[name], index=result.index, nan_as_null=False
            )
            if null_mask is not None:
                result[name] = result[name].set_mask(
                    null_mask.data_array_view
                )

        return result
Exemple #2
0
    def from_sequences(
            cls,
            arbitrary: Sequence[ColumnLike]) -> "cudf.core.column.ListColumn":
        """
        Build a list column from a sequence of column-like items.

        Each item of ``arbitrary`` becomes one row.  Items recognised as
        null host scalars become null rows that contribute no elements;
        every other item is converted with ``as_column`` and appended to
        the shared child data column.
        """
        values = column.column_empty(0)
        validity = []
        boundaries = [0]
        running_total = 0

        # Accumulate child data, per-row validity and row offsets.
        for item in arbitrary:
            is_null = cudf._lib.scalar._is_null_host_scalar(item)
            validity.append(not is_null)
            if not is_null:
                values = values.append(as_column(item))
                running_total += len(item)
            # A null row repeats the previous offset (zero-length row).
            boundaries.append(running_total)

        offsets = column.as_column(boundaries, dtype="int32")
        row_mask = cudf._lib.transform.bools_to_mask(as_column(validity))

        # Assemble the list column from its offsets and data children.
        return cls(
            size=len(arbitrary),
            dtype=cudf.ListDtype(values.dtype),
            mask=row_mask,
            offset=0,
            null_count=0,
            children=(offsets, values),
        )
Exemple #3
0
def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
    """
    Apply the binary operation ``op`` to ``lhs`` and ``rhs``.

    Parameters
    ----------
    lhs, rhs
        Operands.  Either may be a scalar (per ``np.isscalar``) or ``None``;
        the other one then determines the output length.
    op : str
        Operation name passed to ``libcudf.binops.apply_op``.
    out_dtype
        dtype of the allocated output column.
    reflect : bool, default False
        If true, the operands are swapped before the operation is applied.

    Returns
    -------
    The output column.  For comparison operations, nulls in the result are
    filled with ``True`` for ``"ne"`` and ``False`` otherwise.
    """
    if reflect:
        lhs, rhs = rhs, lhs
    libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange")
    # The range pushed above must be popped even if allocation or the
    # kernel call raises, otherwise the push/pop pairs become unbalanced.
    try:
        # Decide output length and whether the output needs a null mask.
        if np.isscalar(lhs):
            masked = rhs.nullable
            row_count = len(rhs)
        elif np.isscalar(rhs):
            masked = lhs.nullable
            row_count = len(lhs)
        elif rhs is None:
            masked = True
            row_count = len(lhs)
        elif lhs is None:
            masked = True
            row_count = len(rhs)
        else:
            masked = lhs.nullable or rhs.nullable
            row_count = len(lhs)

        is_op_comparison = op in ("lt", "gt", "le", "ge", "eq", "ne")

        out = column.column_empty(row_count, dtype=out_dtype, masked=masked)

        # The value returned by apply_op is not needed here.
        libcudf.binops.apply_op(lhs, rhs, out, op)

        if is_op_comparison:
            out = out.fillna(op == "ne")

        return out
    finally:
        libcudf.nvtx.nvtx_range_pop()
Exemple #4
0
def _numeric_column_binop(lhs, rhs, op, out_dtype, reflect=False):
    """
    Apply the binary operation ``op`` to ``lhs`` and ``rhs``.

    Parameters
    ----------
    lhs, rhs
        Operands.  Either may be a scalar (per ``np.isscalar``); the other
        one then determines the output length and the result name.
    op : str
        Operation name passed to ``libcudf.binops.apply_op``.
    out_dtype
        dtype of the allocated output column and of the returned view.
    reflect : bool, default False
        If true, the operands are swapped before the operation is applied.

    Returns
    -------
    A ``NumericalColumn`` view of the output.  For comparison operations,
    nulls are filled with ``True`` for ``"ne"`` and ``False`` otherwise;
    for all other operations the null count reported by ``apply_op`` is
    recorded on the output.
    """
    if reflect:
        lhs, rhs = rhs, lhs
    libcudf.nvtx.nvtx_range_push("CUDF_BINARY_OP", "orange")
    # The range pushed above must be popped even if allocation or the
    # kernel call raises, otherwise the push/pop pairs become unbalanced.
    try:
        # Decide output length, null-mask need and the name to carry over.
        name = None
        if np.isscalar(lhs):
            masked = rhs.has_null_mask
            row_count = len(rhs)
            name = rhs.name
        elif np.isscalar(rhs):
            masked = lhs.has_null_mask
            row_count = len(lhs)
            name = lhs.name
        else:
            masked = lhs.has_null_mask or rhs.has_null_mask
            row_count = len(lhs)

        is_op_comparison = op in ("lt", "gt", "le", "ge", "eq", "ne")

        out = column.column_empty(row_count, dtype=out_dtype, masked=masked)
        # Call and fix null_count
        null_count = libcudf.binops.apply_op(lhs, rhs, out, op)

        if is_op_comparison:
            out.fillna(op == "ne", inplace=True)
        else:
            out = out.replace(null_count=null_count)

        return out.view(NumericalColumn, dtype=out_dtype, name=name)
    finally:
        libcudf.nvtx.nvtx_range_pop()
Exemple #5
0
    def __init__(self, **kwargs):
        """
        Parameters
        ----------
        data : Buffer
            The code values
        mask : Buffer; optional
            The validity mask
        null_count : int; optional
            The number of null values in the mask.
        categories : iterable
            The categories; defaults to an empty list.
        ordered : bool
            whether the categorical has a logical ordering (e.g. less than).
            This key is required: it is popped without a default.

        ``categories`` and ``ordered`` are consumed here; every other
        keyword argument, plus the derived ``dtype``, is forwarded to the
        parent constructor.
        """

        ordered = bool(kwargs.pop("ordered"))
        raw_categories = kwargs.pop("categories", [])

        if len(raw_categories) > 0:
            categories = column.as_column(raw_categories)
        else:
            # Default to String dtype if len(categories) == 0, like pandas
            # does
            categories = column.column_empty(
                0, np.dtype("object"), masked=False
            )

        kwargs["dtype"] = CategoricalDtype(
            categories=column.as_column(categories), ordered=ordered
        )
        super(CategoricalColumn, self).__init__(**kwargs)
        self._categories = categories
        self._ordered = ordered
Exemple #6
0
    def as_string_column(self, dtype, **kwargs):
        """
        Convert this column to a string column.

        ``dtype`` is not used by this implementation.  An empty column
        yields an empty ``object`` column; otherwise the converter
        registered for this column's dtype is called with ``kwargs``.
        """
        from cudf.core.column import string

        if len(self) == 0:
            return column.column_empty(0, dtype="object", masked=False)

        to_str = string._numeric_to_str_typecast_functions[
            np.dtype(self.dtype)
        ]
        return to_str(self, **kwargs)
Exemple #7
0
    def as_string_column(self, dtype, **kwargs):
        """
        Convert this column to a string column.

        If no truthy ``format`` keyword is supplied, one is looked up from
        this column's dtype name, falling back to ``"%Y-%m-%d %H:%M:%S"``.
        ``dtype`` is not used by this implementation.
        """

        if not kwargs.get("format"):
            kwargs["format"] = _dtype_to_format_conversion.get(
                self.dtype.name, "%Y-%m-%d %H:%M:%S"
            )

        if len(self) == 0:
            return column.column_empty(0, dtype="object", masked=False)

        to_str = string._numeric_to_str_typecast_functions[
            np.dtype(self.dtype)
        ]
        return to_str(self, **kwargs)
Exemple #8
0
 def _string_safe_hash(df):
     """
     Hash the columns of ``df`` after replacing string columns by hashes.

     Works on a shallow copy of ``df``.  Every column backed by a
     ``StringColumn`` is replaced with an ``int32`` column that the string
     data's ``hash`` method fills in; the resulting frame's
     ``hash_columns()`` is returned.
     """
     hashed = df.copy(deep=False)
     for name in hashed.columns:
         source = hashed[name]._column
         if not isinstance(source, StringColumn):
             continue
         codes = column.column_empty(
             len(hashed), dtype="int32", masked=False
         )
         # The hash is written directly into the new column's buffer.
         source.data_array_view.hash(devptr=codes.data.ptr)
         hashed[name] = codes
     return hashed.hash_columns()
Exemple #9
0
 def as_string_column(self, dtype, format=None):
     """
     Convert this column to a string column.

     When ``format`` is ``None`` it is looked up from this column's dtype
     name, falling back to ``"%Y-%m-%d %H:%M:%S"``.  ``dtype`` is not used
     by this implementation.
     """
     if format is None:
         format = _dtype_to_format_conversion.get(
             self.dtype.name, "%Y-%m-%d %H:%M:%S"
         )

     if len(self) == 0:
         return column.column_empty(0, dtype="object", masked=False)

     to_str = string._datetime_to_str_typecast_functions[
         np.dtype(self.dtype)
     ]
     return to_str(self, format)
Exemple #10
0
 def as_string_column(self,
                      dtype: Dtype,
                      format=None) -> "cudf.core.column.StringColumn":
     """
     Convert this column to a string column.

     When ``format`` is ``None`` it is looked up from this column's dtype
     name, falling back to ``"%D days %H:%M:%S"``.  ``dtype`` is not used
     by this implementation.
     """
     if format is None:
         format = _dtype_to_format_conversion.get(self.dtype.name,
                                                  "%D days %H:%M:%S")

     if len(self) == 0:
         empty = column.column_empty(0, dtype="object", masked=False)
         return cast("cudf.core.column.StringColumn", empty)

     to_str = string._timedelta_to_str_typecast_functions[
         np.dtype(self.dtype)
     ]
     return to_str(self, format=format)
Exemple #11
0
    def as_string_column(self, dtype, **kwargs):
        """
        Convert this column to a string column.

        For a non-empty column the converter registered for this column's
        dtype is called with the data pointer; ``count``, ``nulls``,
        ``bdevmem`` and ``units`` are added to ``kwargs`` first.  ``dtype``
        is not used by this implementation.
        """
        from cudf.core.column import string

        if len(self) == 0:
            return column.column_empty(0, dtype="object", masked=False)

        dev_ptr = self.data_ptr
        # The mask pointer is only passed on when the column is nullable.
        null_ptr = self.mask_ptr if self.nullable else None
        kwargs.update(
            count=len(self),
            nulls=null_ptr,
            bdevmem=True,
            units=self.time_unit,
        )
        to_str = string._numeric_to_str_typecast_functions[
            np.dtype(self.dtype)
        ]
        return as_column(to_str(dev_ptr, **kwargs))
Exemple #12
0
    def as_numerical_column(self, dtype, **kwargs):
        """
        Convert this string column to a column of ``dtype``.

        Parameters
        ----------
        dtype
            Target dtype; normalised with ``np.dtype``.
        **kwargs
            Forwarded to the string-to-numeric typecast function.  This
            method adds ``devptr`` and, for datetime targets, ``units`` and
            possibly ``format``.

        Returns
        -------
        The converted column, cast to the requested ``dtype``.
        """

        # mem_dtype: dtype of the output buffer; str_dtype: key used to pick
        # the typecast function; out_dtype: dtype of the returned column.
        mem_dtype = np.dtype(dtype)
        str_dtype = mem_dtype
        out_dtype = mem_dtype

        if mem_dtype.type in (np.int8, np.int16):
            # int8/int16 targets are converted as int32; the final
            # ``astype(out_dtype)`` brings the result to the requested dtype.
            mem_dtype = np.dtype(np.int32)
            str_dtype = mem_dtype
        elif mem_dtype.type is np.datetime64:
            # Pass the datetime unit along and store the values as int64.
            # str_dtype stays the datetime dtype for the function lookup.
            kwargs.update(units=np.datetime_data(mem_dtype)[0])
            mem_dtype = np.dtype(np.int64)
            if "format" not in kwargs:
                # No format is set when the column has no strings.
                if len(self.nvstrings) > 0:
                    # infer on host from the first not na element
                    fmt = pd.core.tools.datetimes._guess_datetime_format(
                        self[self.notna()][0]
                    )
                    kwargs.update(format=fmt)
            else:
                # NOTE(review): ``fmt`` is not read again in this method, so
                # this assignment has no effect here.
                fmt = None

        # Allocate the output buffer and hand its pointer to the converter.
        out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
        out_ptr = libcudf.cudf.get_ctype_ptr(out_arr)
        kwargs.update({"devptr": out_ptr})

        # The converter writes into ``out_arr`` through ``devptr``.
        _str_to_numeric_typecast_functions[str_dtype](self.nvstrings, **kwargs)

        out_col = column.as_column(out_arr)

        if self.has_nulls:
            # Allocate an int8 buffer for the mask and let nvstrings fill it
            # with the null bitmask, then attach it to the output column.
            # presumably calc_chunk_size yields the number of mask elements
            # needed for this many rows -- TODO confirm
            mask_size = utils.calc_chunk_size(
                len(self.nvstrings), utils.mask_bitsize
            )
            out_mask = column.column_empty(
                mask_size, dtype="int8", masked=False
            ).data
            out_mask_ptr = out_mask.ptr
            self.nvstrings.set_null_bitmask(out_mask_ptr, bdevmem=True)
            out_col.mask = out_mask

        return out_col.astype(out_dtype)
Exemple #13
0
 def as_string_column(self,
                      dtype: Dtype,
                      format=None,
                      **kwargs) -> "cudf.core.column.StringColumn":
     """
     Convert this column to a string column.

     When ``format`` is ``None`` it is looked up from this column's dtype
     name, falling back to ``"%Y-%m-%d %H:%M:%S"``.  A names column is
     passed to the converter: the datetime names for formats listed in
     ``_DATETIME_SPECIAL_FORMATS``, an empty ``object`` column otherwise.
     ``dtype`` and ``kwargs`` are not used by this implementation.
     """
     if format is None:
         format = _dtype_to_format_conversion.get(self.dtype.name,
                                                  "%Y-%m-%d %H:%M:%S")

     # The names column is built before the emptiness check.
     names = (
         as_column(_DATETIME_NAMES)
         if format in _DATETIME_SPECIAL_FORMATS
         else cudf.core.column.column_empty(0, dtype="object", masked=False)
     )

     if len(self) == 0:
         empty = column.column_empty(0, dtype="object", masked=False)
         return cast("cudf.core.column.StringColumn", empty)

     to_str = string._datetime_to_str_typecast_functions[
         cudf.dtype(self.dtype)
     ]
     return to_str(self, format, names)
Exemple #14
0
 def _values(self):
     """
     Return the values from ``self._start`` to ``self._stop`` as a column.

     An empty object yields an empty column of ``self.dtype``.
     """
     if len(self) == 0:
         return column.column_empty(0, masked=False, dtype=self.dtype)

     materialized = cudautils.arange(
         self._start, self._stop, dtype=self.dtype
     )
     return column.as_column(materialized)
Exemple #15
0
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype

        Parameters
        ----------
        key
            A slice with stride 1, or anything ``column.as_column`` accepts.
            A boolean key is treated as a mask and must have the same
            length as this column.
        value
            A scalar (broadcast to the selection size), ``None``, or a
            column-like whose length equals the selection size.

        Raises
        ------
        NotImplementedError
            If ``key`` is a slice whose stride is not 1.
        ValueError
            If a boolean mask has a different length than this column, or
            if ``value`` does not have as many elements as the selection.
        IndexError
            If scattering fails with an "out of bounds" runtime error.
        """
        from cudf.core import column

        # Work out how many elements the key selects.
        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            # NOTE(review): for a slice whose stop precedes its start this is
            # positive although the slice selects nothing -- confirm intended.
            nelem = abs(key_stop - key_start)
        else:
            key = column.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column"
                    )
                # Turn the boolean mask into the selected row positions.
                key = column.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        # Expand a scalar value into a column of ``nelem`` elements.
        if is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.utils.cudautils import fill_value

                # Fill a codes buffer with the code of ``value`` and wrap it
                # in a categorical column using this column's categories.
                data = rmm.device_array(nelem, dtype=self.codes.dtype)
                fill_value(data, self._encode(value))
                value = build_categorical_column(
                    categories=self.dtype.categories,
                    codes=as_column(data),
                    ordered=self.dtype.ordered,
                )
            elif value is None:
                # A masked empty column stands in for ``nelem`` null values.
                value = column.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        # Coerce the value to this column's dtype.
        value = column.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (
                f"Size mismatch: cannot set value "
                f"of size {len(value)} to indexing result of size "
                f"{nelem}"
            )
            raise ValueError(msg)

        if is_categorical_dtype(value.dtype):
            # Re-express the value in terms of this column's categories.
            value = value.cat().set_categories(self.categories)
            assert self.dtype == value.dtype

        # Slices are written with copy_range; other keys are scattered.
        if isinstance(key, slice):
            out = libcudf.copying.copy_range(
                self, value, key_start, key_stop, 0
            )
        else:
            try:
                out = libcudf.copying.scatter(value, key, self)
            except RuntimeError as e:
                # Report out-of-bounds failures as IndexError; any other
                # runtime error is re-raised unchanged.
                if "out of bounds" in str(e):
                    raise IndexError(
                        f"index out of bounds for column of size {len(self)}"
                    )
                raise

        # Replace this column's contents with the updated result.
        self._mimic_inplace(out, inplace=True)