Example #1
 def normalize_binop_value(
     self, other: ScalarLike
 ) -> Union[ColumnBase, ScalarLike]:
     if other is None:
         return other
     if isinstance(other, cudf.Scalar):
         if self.dtype == other.dtype:
             return other
         # expensive device-host transfer just to
         # adjust the dtype
         other = other.value
     elif isinstance(other, np.ndarray) and other.ndim == 0:
         other = other.item()
     other_dtype = np.min_scalar_type(other)
     if other_dtype.kind in {"b", "i", "u", "f"}:
         if isinstance(other, cudf.Scalar):
             return other
         other_dtype = np.promote_types(self.dtype, other_dtype)
         if other_dtype == np.dtype("float16"):
             other_dtype = np.dtype("float32")
             other = other_dtype.type(other)
         if self.dtype.kind == "b":
             other_dtype = min_signed_type(other)
         if np.isscalar(other):
             other = np.dtype(other_dtype).type(other)
             return other
         else:
             ary = utils.scalar_broadcast_to(
                 other, size=len(self), dtype=other_dtype
             )
             return column.build_column(
                 data=Buffer(ary), dtype=ary.dtype, mask=self.mask,
             )
     else:
         raise TypeError(f"cannot broadcast {type(other)}")
Example #2
    def fillna(self, fill_value):
        col = self
        if is_scalar(fill_value):
            if isinstance(fill_value, np.timedelta64):
                dtype = determine_out_dtype(self.dtype, fill_value.dtype)
                fill_value = fill_value.astype(dtype)
                col = col.astype(dtype)
            elif not isinstance(fill_value, Scalar):
                fill_value = np.timedelta64(fill_value)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)

        result = libcudf.replace.replace_nulls(col, fill_value)
        if isinstance(fill_value, np.timedelta64) and np.isnat(fill_value):
            # If the fill value is np.timedelta64("NaT"), keep the same mask
            # as the current column. Wherever the column contains "<NA>", the
            # corresponding locations in base_data hold min(int64) sentinel
            # values.

            return column.build_column(
                data=result.base_data,
                dtype=result.dtype,
                mask=self.base_mask,
                size=result.size,
                offset=result.offset,
                children=result.base_children,
            )
        return result
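
As an aside on the NaT check above, a small NumPy-only illustration (not from the library) of why the masked slots end up holding min(int64): NaT is encoded as the smallest 64-bit integer.

    # Standalone illustration (not cuDF code):
    import numpy as np

    nat = np.timedelta64("NaT")
    print(np.isnat(nat))           # True
    print(nat.astype("int64"))     # -9223372036854775808, i.e. min(int64)
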
Example #3
 def sort_by_values(self, ascending=True, na_position="last"):
     # Compute the sorted row order, then gather the keys and wrap the
     # indices in their own column so both can be returned together.
     sort_inds = get_sorted_inds(self, ascending, na_position)
     col_keys = self[sort_inds]
     col_inds = column.build_column(sort_inds.data,
                                    dtype=sort_inds.dtype,
                                    mask=sort_inds.mask)
     return col_keys, col_inds
Example #4
 def normalize_binop_value(self, other):
     if other is None:
         return other
     other_dtype = np.min_scalar_type(other)
     if other_dtype.kind in {"b", "i", "u", "f"}:
         other_dtype = np.promote_types(self.dtype, other_dtype)
         if other_dtype == np.dtype("float16"):
             other = np.dtype("float32").type(other)
             other_dtype = other.dtype
         if self.dtype.kind == "b":
             other_dtype = min_signed_type(other)
         if np.isscalar(other):
             other = np.dtype(other_dtype).type(other)
             return other
         else:
             ary = utils.scalar_broadcast_to(other,
                                             size=len(self),
                                             dtype=other_dtype)
             return column.build_column(
                 data=Buffer(ary),
                 dtype=ary.dtype,
                 mask=self.mask,
             )
     else:
         raise TypeError("cannot broadcast {}".format(type(other)))
Example #5
    def fillna(self, fill_value):
        """
        Fill null values with *fill_value*
        """
        if np.isscalar(fill_value):
            # cast safely to the same dtype as self
            fill_value_casted = self.dtype.type(fill_value)
            if not np.isnan(fill_value) and (fill_value_casted != fill_value):
                raise TypeError(
                    "Cannot safely cast non-equivalent {} to {}".format(
                        type(fill_value).__name__, self.dtype.name
                    )
                )
            fill_value = fill_value_casted
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
            # cast safely to the same dtype as self
            if is_integer_dtype(self.dtype):
                fill_value = _safe_cast_to_int(fill_value, self.dtype)
            else:
                fill_value = fill_value.astype(self.dtype)
        result = libcudfxx.replace.replace_nulls(self, fill_value)
        result = column.build_column(
            result.base_data,
            result.dtype,
            mask=None,
            offset=result.offset,
            size=result.size,
        )

        return result
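
For context, a minimal usage sketch of the safe-cast behaviour this method implements (assumes a working cudf installation; the exact error text varies between versions):

    # Hypothetical usage example:
    import cudf

    s = cudf.Series([1, None, 3], dtype="int64")
    print(s.fillna(2.0))    # fine: 2.0 round-trips to int64 without loss
    # s.fillna(2.5)         # would raise: 2.5 cannot be cast to int64 safely
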
Example #6
    def deserialize(cls, header: dict, frames: list) -> CategoricalColumn:
        n_dtype_frames = header["dtype_frames_count"]
        dtype = CategoricalDtype.deserialize(header["dtype"],
                                             frames[:n_dtype_frames])
        n_data_frames = header["data_frames_count"]

        column_type = pickle.loads(header["data"]["type-serialized"])
        data = column_type.deserialize(
            header["data"],
            frames[n_dtype_frames:n_dtype_frames + n_data_frames],
        )
        mask = None
        if "mask" in header:
            mask = Buffer.deserialize(header["mask"],
                                      [frames[n_dtype_frames + n_data_frames]])
        return cast(
            CategoricalColumn,
            column.build_column(
                data=None,
                dtype=dtype,
                mask=mask,
                children=(column.as_column(data.base_data,
                                           dtype=data.dtype), ),
            ),
        )
Example #7
 def as_numerical(self) -> NumericalColumn:
     return cast(
         cudf.core.column.NumericalColumn,
         column.build_column(data=self.codes.data,
                             dtype=self.codes.dtype,
                             mask=self.mask),
     )
Example #8
    def deserialize(cls, header, frames):

        # Get null mask
        if header["null_count"] > 0:
            mask = Buffer(frames[-1])
        else:
            mask = None

        # Deserialize child columns
        children = []
        f = 0
        for h in header["subheaders"]:
            fcount = h["frame_count"]
            child_frames = frames[f:f + fcount]
            column_type = pickle.loads(h["type-serialized"])
            children.append(column_type.deserialize(h, child_frames))
            f += fcount

        # Materialize list column
        return column.build_column(
            data=None,
            dtype=pickle.loads(header["dtype"]),
            mask=mask,
            children=tuple(children),
            size=header["size"],
        )
Example #9
 def as_numerical(self):
     # Reinterpret the underlying data buffer as int64, preserving the
     # existing mask, offset and size.
     return column.build_column(
         data=self.base_data,
         dtype=np.int64,
         mask=self.base_mask,
         offset=self.offset,
         size=self.size,
     )
Example #10
 def get_dt_field(self, field):
     out_column = self._values.get_dt_field(field)
     # column.column_empty_like always returns a Column object, but we need
     # a NumericalColumn for GenericIndex. How should this be handled?
     out_column = column.build_column(
         data=out_column.data, dtype=out_column.dtype, mask=out_column.mask
     )
     return as_index(out_column, name=self.name)
Example #11
 def as_numerical_column(self, dtype, **kwargs):
     casted = libcudf.typecast.cast(self, dtype)
     return column.build_column(
         data=casted.data,
         dtype=casted.dtype,
         mask=casted.mask,
         size=casted.size,
         offset=casted.offset,
     )
Example #12
    def fillna(self, fill_value):
        if is_scalar(fill_value):
            fill_value = np.datetime64(fill_value, self.time_unit)
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)

        result = libcudf.replace.replace_nulls(self, fill_value)
        result = column.build_column(result.data, result.dtype, mask=None)

        return result
Example #13
 def __init__(self, values, **kwargs):
     kwargs = _setdefault_name(values, kwargs)
     if isinstance(values, StringColumn):
         values = values.copy()
     elif isinstance(values, StringIndex):
         values = values._values.copy()
     else:
         values = column.build_column(nvstrings.to_device(values),
                                      dtype="object")
     super(StringIndex, self).__init__(values, **kwargs)
Example #14
    def round(self, decimals=0):
        if decimals < 0:
            msg = "Decimal values < 0 are not yet supported."
            raise NotImplementedError(msg)

        if np.issubdtype(self.dtype, np.integer):
            return self

        data = Buffer(cudautils.apply_round(self.data_array_view, decimals))
        return column.build_column(data=data, dtype=self.dtype, mask=self.mask)
Example #15
    def _set_categories(self, new_categories, **kwargs):
        """Returns a new CategoricalColumn with the categories set to the
        specified *new_categories*.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        cur_cats = self._parent.categories
        new_cats = column.as_column(new_categories)

        # Join the old and new categories to build a map from
        # old to new codes, inserting na_sentinel for any old
        # categories that don't exist in the new categories

        # Ensure new_categories is unique first
        if not (kwargs.get("is_unique", False) or new_cats.is_unique):
            # drop_duplicates() instead of unique() to preserve order
            new_cats = Series(new_cats).drop_duplicates()._column

        cur_codes = self.codes
        cur_order = cudautils.arange(len(cur_codes))
        old_codes = cudautils.arange(len(cur_cats), dtype=cur_codes.dtype)
        new_codes = cudautils.arange(len(new_cats), dtype=cur_codes.dtype)

        new_df = DataFrame({"new_codes": new_codes, "cats": new_cats})
        old_df = DataFrame({"old_codes": old_codes, "cats": cur_cats})
        cur_df = DataFrame({"old_codes": cur_codes, "order": cur_order})

        # Join the old and new categories and line up their codes
        df = old_df.merge(new_df, on="cats", how="left")
        # Join the old and new codes to "recode" the codes data buffer
        df = cur_df.merge(df, on="old_codes", how="left")
        df = df.sort_values(by="order").reset_index(drop=True)

        ordered = kwargs.get("ordered", self.ordered)
        new_codes = df["new_codes"]._column
        new_dtype = CategoricalDtype(categories=new_cats, ordered=ordered)

        if kwargs.get("inplace", False):
            self._parent.data = None
            self._parent.mask = new_codes.mask
            self._parent.dtype = new_dtype
            self._parent.children = (new_codes, )
            return None

        return column.build_column(
            data=None,
            dtype=new_dtype,
            mask=new_codes.mask,
            children=(new_codes, ),
        )
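
The comments above describe a recode-by-merge: build a table mapping old codes to new codes via the category values, then join it against the existing codes. A CPU-only sketch of the same idea, using pandas and made-up data purely for illustration:

    # Illustrative data, not taken from cuDF:
    import pandas as pd

    cur_cats = pd.Series(["a", "b", "c"])
    new_cats = pd.Series(["b", "c", "d"])
    cur_codes = pd.Series([0, 2, 1, 0])      # codes into cur_cats: a, c, b, a

    old_df = pd.DataFrame({"old_codes": range(len(cur_cats)), "cats": cur_cats})
    new_df = pd.DataFrame({"new_codes": range(len(new_cats)), "cats": new_cats})
    cur_df = pd.DataFrame({"old_codes": cur_codes, "order": range(len(cur_codes))})

    # Old categories missing from new_cats fall out as NaN (the na sentinel).
    mapping = old_df.merge(new_df, on="cats", how="left")
    recoded = cur_df.merge(mapping, on="old_codes", how="left").sort_values("order")
    print(recoded["new_codes"].tolist())     # [nan, 1.0, 0.0, nan]
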
Example #16
 def as_numerical(self) -> "cudf.core.column.NumericalColumn":
     return cast(
         "cudf.core.column.NumericalColumn",
         column.build_column(
             data=self.base_data,
             dtype=np.int64,
             mask=self.base_mask,
             offset=self.offset,
             size=self.size,
         ),
     )
Example #17
 def children(self):
     if self._children is None:
         codes_column = self.base_children[0]
         codes_column = column.build_column(
             data=codes_column.base_data,
             dtype=codes_column.dtype,
             mask=codes_column.base_mask,
             size=self.size,
             offset=self.offset,
         )
         self._children = (codes_column, )
     return self._children
Example #18
    def children(self):
        if self._children is None:
            if self.base_children is None or (self.offset == 0
                                              and self.base_children[0].size
                                              == (self.size + 1)):
                self._children = self.base_children
            else:
                # First get the base columns for chars and offsets
                chars_column = self.base_children[1]
                offsets_column = self.base_children[0]

                # Shift offsets column by the parent offset.
                offsets_column = column.build_column(
                    data=offsets_column.base_data,
                    dtype=offsets_column.dtype,
                    mask=offsets_column.base_mask,
                    size=self.size + 1,
                    offset=self.offset,
                )

                # Now run a subtraction binary op to shift all of the offsets
                # by the respective number of characters relative to the
                # parent offset
                chars_offset = offsets_column[0]
                offsets_column = offsets_column.binary_operator(
                    "sub", offsets_column.dtype.type(chars_offset))

                # Shift the chars offset by the new first element of the
                # offsets column
                chars_size = offsets_column[self.size]
                chars_column = column.build_column(
                    data=chars_column.base_data,
                    dtype=chars_column.dtype,
                    mask=chars_column.base_mask,
                    size=chars_size,
                    offset=chars_offset,
                )

                self._children = (offsets_column, chars_column)
        return self._children
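
A NumPy-only sketch (illustrative numbers, not taken from the library) of the offset re-basing described in those comments: slicing rows out of a strings column means taking a window of the base offsets and subtracting its first element so the child offsets start at zero again.

    # Standalone illustration with made-up offsets:
    import numpy as np

    base_offsets = np.array([0, 2, 5, 9, 10], dtype="int32")    # 4 strings
    row_offset, nrows = 1, 2                                     # slice rows 1..2

    window = base_offsets[row_offset : row_offset + nrows + 1]   # [2, 5, 9]
    rebased = window - window[0]                                 # [0, 3, 7]
    chars_size = rebased[nrows]                                  # 7 chars in the slice
    print(rebased, chars_size)
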
Example #19
    def children(self):
        if self._children is None:
            codes_column = self.base_children[0]

            buf = Buffer(codes_column.base_data)
            buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize)
            buf.size = self.size * codes_column.dtype.itemsize

            codes_column = column.build_column(
                data=buf, dtype=codes_column.dtype, size=self.size,
            )
            self._children = (codes_column,)
        return self._children
Example #20
def test_dataframe_apply_rows(dtype, has_nulls, pessimistic):
    count = 1000
    gdf_series_a = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_b = gen_rand_series(dtype, count, has_nulls=has_nulls)
    gdf_series_c = gen_rand_series(dtype, count, has_nulls=has_nulls)

    if pessimistic:
        # pessimistically combine the null masks
        gdf_series_expected = gdf_series_a * gdf_series_b
    else:
        # optimistically ignore the null masks
        a = cudf.Series(column.build_column(gdf_series_a.data, dtype))
        b = cudf.Series(column.build_column(gdf_series_b.data, dtype))
        gdf_series_expected = a * b

    df_expected = cudf.DataFrame(
        {
            "a": gdf_series_a,
            "b": gdf_series_b,
            "c": gdf_series_c,
            "out": gdf_series_expected,
        }
    )

    df_original = cudf.DataFrame(
        {"a": gdf_series_a, "b": gdf_series_b, "c": gdf_series_c}
    )

    df_actual = df_original.apply_rows(
        _kernel_multiply,
        ["a", "b"],
        {"out": dtype},
        {},
        pessimistic_nulls=pessimistic,
    )

    assert_eq(df_expected, df_actual)
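
The "pessimistic" combination exercised by the test is ordinary null propagation: a row that is null in either operand is null in the product. A short illustration (assumes cudf is available):

    # Hypothetical usage example:
    import cudf

    a = cudf.Series([1, None, 3])
    b = cudf.Series([10, 20, None])
    print(a * b)    # row 0 is 10; rows 1 and 2 are <NA> since either input is null
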
Example #21
    def children(self) -> Tuple[NumericalColumn]:
        if self._children is None:
            codes_column = self.base_children[0]

            buf = Buffer(codes_column.base_data)
            buf.ptr = buf.ptr + (self.offset * codes_column.dtype.itemsize)
            buf.size = self.size * codes_column.dtype.itemsize

            codes_column = cast(
                cudf.core.column.NumericalColumn,
                column.build_column(
                    data=buf, dtype=codes_column.dtype, size=self.size,
                ),
            )
            self._children = (codes_column,)
        return self._children
Example #22
def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function deep
    copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series, depending on whether the input DLPack tensor
    is 1D or 2D.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size"
            )
        else:
            raise err
    cols = []
    for idx in range(len(valids)):
        mask = None
        if valids[idx]:
            mask = Buffer(valids[idx])
        cols.append(
            column.build_column(
                Buffer(res[idx]), dtype=res[idx].dtype, mask=mask
            )
        )
    if len(cols) == 1:
        return Series(cols[0])
    else:
        df = DataFrame()
        for idx, col in enumerate(cols):
            df[idx] = col
        return df
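
A minimal round-trip sketch for the DLPack interchange described in the docstring (assumes a cudf build where Series.to_dlpack and cudf.from_dlpack are available):

    # Hypothetical usage example:
    import cudf

    ser = cudf.Series([1, 2, 3])
    capsule = ser.to_dlpack()                # PyCapsule wrapping a DLPack tensor
    roundtrip = cudf.from_dlpack(capsule)    # deep-copies back into a cuDF object
    print(roundtrip)
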
Example #23
    def sort_by_values(self, ascending=True, na_position="last"):
        if na_position == "last":
            nullfirst = False
        elif na_position == "first":
            nullfirst = True
        else:
            raise ValueError("na_position must be 'first' or 'last'")

        idx_dev_arr = rmm.device_array(len(self), dtype="int32")
        dev_ptr = libcudf.cudf.get_ctype_ptr(idx_dev_arr)
        self.data.order(2, asc=ascending, nullfirst=nullfirst, devptr=dev_ptr)

        col_inds = column.build_column(Buffer(idx_dev_arr),
                                       idx_dev_arr.dtype,
                                       mask=None)

        col_keys = self[col_inds.data.mem]

        return col_keys, col_inds
Example #24
    def normalize_binop_value(self, other):
        if isinstance(other, dt.datetime):
            other = np.datetime64(other)

        if isinstance(other, pd.Timestamp):
            m = _numpy_to_pandas_conversion[self.time_unit]
            ary = utils.scalar_broadcast_to(other.value * m,
                                        size=len(self),
                                            dtype=self.dtype)
        elif isinstance(other, np.datetime64):
            other = other.astype(self.dtype)
            ary = utils.scalar_broadcast_to(other,
                                            size=len(self),
                                            dtype=self.dtype)
        else:
            raise TypeError("cannot broadcast {}".format(type(other)))

        return column.build_column(data=Buffer(ary), dtype=self.dtype)
Example #25
    def deserialize(cls, header, frames):
        n_dtype_frames = header["dtype_frames_count"]
        dtype = CategoricalDtype.deserialize(header["dtype"],
                                             frames[:n_dtype_frames])
        n_data_frames = header["data_frames_count"]

        column_type = pickle.loads(header["data"]["type"])
        data = column_type.deserialize(
            header["data"],
            frames[n_dtype_frames:n_dtype_frames + n_data_frames],
        )
        mask = None
        if header["frame_count"] > n_dtype_frames + n_data_frames:
            mask = Buffer(frames[n_dtype_frames + n_data_frames])
        return column.build_column(data=None,
                                   dtype=dtype,
                                   mask=mask,
                                   children=(data, ))
Example #26
    def deserialize(cls, header, frames):
        # Deserialize the mask, value, and offset frames
        buffers = [Buffer(each_frame) for each_frame in frames]

        if header["null_count"] > 0:
            nbuf = buffers[2]
        else:
            nbuf = None

        children = []
        for h, b in zip(header["subheaders"], buffers[:2]):
            column_type = pickle.loads(h["type"])
            children.append(column_type.deserialize(h, [b]))

        col = column.build_column(data=None,
                                  dtype="str",
                                  mask=nbuf,
                                  children=tuple(children))
        return col
Example #27
    def len(self):
        """
        Computes the length of each element in the Series/Index.

        Returns
        -------
          Series or Index of int: A Series or Index of integer values
            indicating the length of each element in the Series or Index.
        """
        from cudf.core.series import Series

        out_dev_arr = rmm.device_array(len(self._parent), dtype="int32")
        ptr = libcudf.cudf.get_ctype_ptr(out_dev_arr)
        self._parent.nvstrings.len(ptr)

        mask = None
        if self._parent.has_nulls:
            mask = self._parent.mask

        col = column.build_column(
            Buffer(out_dev_arr), np.dtype("int32"), mask=mask
        )
        return Series(col, index=self._index, name=self._name)
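
This backs the public .str.len() accessor; a short usage example (assumes cudf is installed; null rows stay null in the result):

    # Hypothetical usage example:
    import cudf

    s = cudf.Series(["a", "abc", None])
    print(s.str.len())    # 1, 3, <NA>
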
Example #28
    def fillna(self, fill_value, inplace=False):
        """
        Fill null values with *fill_value*
        """
        if not self.has_null_mask:
            return self

        fill_is_scalar = np.isscalar(fill_value)

        if fill_is_scalar:
            if fill_value == self.default_na_value():
                fill_value = self.data.dtype.type(fill_value)
            else:
                try:
                    fill_value = self._encode(fill_value)
                    fill_value = self.data.dtype.type(fill_value)
                except ValueError as err:
                    err_msg = "fill value must be in categories"
                    raise ValueError(err_msg) from err
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
            # TODO: only required if fill_value has a subset of the categories:
            fill_value = fill_value.cat()._set_categories(
                self._categories, is_unique=True
            )
            fill_value = column.as_column(fill_value.data).astype(
                self.data.dtype
            )

        result = libcudf.replace.replace_nulls(self, fill_value)

        result = column.build_column(
            result.data, "category", result.mask, categories=self._categories
        )

        return self._mimic_inplace(result.replace(mask=None), inplace)
Example #29
 def as_numerical(self):
     return column.build_column(data=self.codes.data,
                                dtype=self.codes.dtype,
                                mask=self.mask)
Example #30
 def _find_segments(self):
     seg, markers = cudautils.find_segments(self.gpu_values)
     return (
         column.build_column(data=Buffer(seg), dtype=seg.dtype),
         markers,
     )