Example 1
    def __getitem__(self, arg):
        from cudf.dataframe import columnops

        if isinstance(arg, Number):
            arg = int(arg)
            return self.element_indexing(arg)
        elif isinstance(arg, slice):
            # compute mask slice
            if self.null_count > 0:
                if arg.step is not None and arg.step != 1:
                    raise NotImplementedError(arg)

                # slicing data
                subdata = self.data[arg]
                # slicing mask
                if self.dtype == "object":
                    data_size = self.data.size()
                else:
                    data_size = self.data.size
                bytemask = cudautils.expand_mask_bits(data_size,
                                                      self.mask.to_gpu_array())
                submask = Buffer(cudautils.compact_mask_bytes(bytemask[arg]))
                col = self.replace(data=subdata, mask=submask)
                return col
            else:
                newbuffer = self.data[arg]
                return self.replace(data=newbuffer)
        else:
            arg = columnops.as_column(arg)
            if len(arg) == 0:
                arg = columnops.as_column([], dtype="int32")
            if pd.api.types.is_integer_dtype(arg.dtype):
                return self.take(arg.data.mem)
            if pd.api.types.is_bool_dtype(arg.dtype):
                return self.apply_boolean_mask(arg)
            raise NotImplementedError(type(arg))
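
The null-aware slice path above expands the packed validity mask to one byte per row, slices it alongside the data, then re-packs it. A CPU sketch of that round trip with NumPy 1.17+ (not cuDF code; the little-endian bit order matches the mask layout used here, and the data is illustrative):

import numpy as np

data = np.array([10, 20, 30, 40, 50], dtype=np.int32)
valid = np.array([1, 0, 1, 1, 0], dtype=np.uint8)        # 1 = valid, 0 = null
mask = np.packbits(valid, bitorder="little")             # packed bitmask

arg = slice(1, 4)                                        # requested slice
bytemask = np.unpackbits(mask, count=len(data),
                         bitorder="little")              # expand bits to bytes
subdata = data[arg]                                      # slice the values
submask = np.packbits(bytemask[arg], bitorder="little")  # re-pack sliced bits

print(subdata)        # [20 30 40]
print(bytemask[arg])  # [0 1 1]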
Example 2
    def reorder_categories(self, new_categories, **kwargs):
        from cudf.dataframe.series import Series

        new_categories = columnops.as_column(new_categories)
        # Compare new_categories against current categories.
        # Ignore order for comparison because we're only interested
        # in whether new_categories has all the same values as the
        # current set of categories.
        if not self._categories_equal(new_categories, ordered=False):
            raise ValueError(
                "items in new_categories are not the same as in "
                "old categories"
            )
        data = self._set_categories(new_categories, **kwargs)
        if data is not None:
            return Series(data=data)
Example 3
    def __getattr__(self, attr, *args, **kwargs):
        from cudf.dataframe.series import Series
        if hasattr(self._parent._data, attr):
            passed_attr = getattr(self._parent._data, attr)
            if callable(passed_attr):

                def wrapper(*args, **kwargs):
                    ret = getattr(self._parent._data, attr)(*args, **kwargs)
                    # wrap raw nvstrings results back into a Series
                    if isinstance(ret, nvstrings.nvstrings):
                        ret = Series(columnops.as_column(ret),
                                     index=self._index)
                    return ret

                return wrapper
            else:
                return passed_attr
        else:
            raise AttributeError(attr)
Example 4
    def __init__(self, levels, codes=None, labels=None, names=None):
        self.names = names
        column_names = []
        if labels:
            warnings.warn("the 'labels' keyword is deprecated, use 'codes' "
                          "instead", FutureWarning)
        if labels and not codes:
            codes = labels
        if isinstance(names, (Sequence,
                              pd.core.indexes.frozen.FrozenNDArray,
                              pd.core.indexes.frozen.FrozenList)):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError('Must pass non-zero number of levels/codes')

        import cudf
        if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and\
                not isinstance(codes[0], (Sequence,
                               pd.core.indexes.frozen.FrozenNDArray)):
            raise TypeError('Codes is not a Sequence of sequences')
        if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
            self.codes = cudf.dataframe.dataframe.DataFrame()
            for idx, code in enumerate(codes):
                code = np.array(code)
                self.codes.add_column(column_names[idx],
                                      columnops.as_column(code))
        else:
            self.codes = codes

        # Converting levels to a numpy array produces a Float64Index
        # (on empty levels), mimicking the behavior of pandas
        self.levels = np.array([Series(level).to_array() for level in levels])
        self._validate_levels_and_codes(self.levels, self.codes)
        self.name = None
        self.names = names
Example 5
    def __init__(self, values, name=None):
        if isinstance(values, pd.Series) and \
                pd.api.types.is_categorical_dtype(values.dtype):
            values = CategoricalColumn(
                data=Buffer(values.cat.codes.values),
                categories=values.cat.categories.tolist(),
                ordered=values.cat.ordered)
        elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
            values = CategoricalColumn(data=Buffer(values.codes),
                                       categories=values.categories.tolist(),
                                       ordered=values.ordered)
        elif isinstance(values, (list, tuple)):
            values = columnops.as_column(
                pd.Categorical(values, categories=values))

        assert values.null_count == 0
        self._values = values
        self.name = name
        self.names = [name]
Example 6
    def astype(self, dtype):
        if self.dtype == dtype:
            return self
        elif dtype in (np.dtype('int8'), np.dtype('int16')):
            out_dtype = np.dtype(dtype)
            dtype = np.dtype('int32')
        else:
            out_dtype = np.dtype(dtype)

        out_arr = rmm.device_array(shape=len(self), dtype=dtype)
        out_ptr = get_ctype_ptr(out_arr)
        kwargs = {'devptr': out_ptr}
        if dtype == np.dtype('datetime64[ms]'):
            kwargs['units'] = 'ms'
        _str_to_numeric_typecast_functions[np.dtype(dtype)](self.str(),
                                                            **kwargs)

        out_col = columnops.as_column(out_arr)
        return out_col.astype(out_dtype)
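
Note how int8/int16 requests above are parsed through an int32 intermediate and only then downcast. A plain NumPy sketch of that two-step cast (the strings and target dtype are illustrative, not cuDF internals):

import numpy as np

strings = ["1", "2", "127"]
requested = np.dtype("int8")                 # illustrative target dtype

# Parse into a wider dtype first, then downcast to the requested one.
parse_dtype = (np.dtype("int32")
               if requested in (np.dtype("int8"), np.dtype("int16"))
               else requested)
parsed = np.array([int(s) for s in strings], dtype=parse_dtype)
result = parsed.astype(requested)
print(result.dtype, result)                  # int8 [  1   2 127]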
Example 7
    def __getitem__(self, arg):
        rows = []
        len_idx = len(self._sr)

        if isinstance(arg, tuple):
            for idx in arg:
                rows.append(idx)

        elif isinstance(arg, int):
            rows.append(arg)

        elif isinstance(arg, slice):
            start, stop, step, sln = utils.standard_python_slice(len_idx, arg)
            if sln > 0:
                for idx in range(start, stop, step):
                    rows.append(idx)

        else:
            raise TypeError(type(arg))

        # Check that all the indices are valid.
        for idx in rows:
            if abs(idx) > len_idx or idx == len_idx:
                raise IndexError("positional indexers are out-of-bounds")

        for i in range(len(rows)):
            if rows[i] < 0:
                rows[i] = len_idx + rows[i]

        # Return a single element, as pandas does
        if isinstance(arg, int) and len(rows) == 1:
            return self._sr[rows[0]]

        ret_list = []
        for idx in rows:
            ret_list.append(self._sr[idx])

        col_data = columnops.as_column(ret_list,
                                       dtype=self._sr.dtype,
                                       nan_as_null=True)

        return Series(col_data, index=as_index(np.asarray(rows)))
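
The bounds check and negative-index handling above mirror pandas-style positional indexing. A minimal standalone sketch (plain Python; the length and indexers are hypothetical):

len_idx = 5                  # hypothetical series length
rows = [0, -1, 3]

# Reject anything outside the valid range [-len_idx, len_idx - 1].
for idx in rows:
    if abs(idx) > len_idx or idx == len_idx:
        raise IndexError("positional indexers are out-of-bounds")

# Normalize negative positions to non-negative ones.
rows = [idx + len_idx if idx < 0 else idx for idx in rows]
print(rows)                  # [0, 4, 3]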
Example 8
def as_index(arbitrary, **kwargs):
    """Create an Index from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of Index
        - CategoricalIndex for Categorical input.
        - DatetimeIndex for Datetime input.
        - GenericIndex for all other inputs.
    """

    kwargs = _setdefault_name(arbitrary, kwargs)

    if isinstance(arbitrary, Index):
        return arbitrary.rename(**kwargs)
    elif isinstance(arbitrary, NumericalColumn):
        return GenericIndex(arbitrary, **kwargs)
    elif isinstance(arbitrary, StringColumn):
        return StringIndex(arbitrary, **kwargs)
    elif isinstance(arbitrary, DatetimeColumn):
        return DatetimeIndex(arbitrary, **kwargs)
    elif isinstance(arbitrary, CategoricalColumn):
        return CategoricalIndex(arbitrary, **kwargs)
    elif isinstance(arbitrary, cudf.Series):
        return as_index(arbitrary._column, **kwargs)
    elif isinstance(arbitrary, pd.RangeIndex):
        return RangeIndex(start=arbitrary._start,
                          stop=arbitrary._stop,
                          **kwargs)
    else:
        return as_index(columnops.as_column(arbitrary), **kwargs)
Example 9
    def deserialize(cls, header, frames):
        data, mask = super(CategoricalColumn, cls).deserialize(header, frames)

        # Handle categories that were serialized as a cudf.Column
        category_frames = frames[
            len(frames) - header["category_frame_count"] :
        ]
        cat_typ = pickle.loads(header["categories"]["type"])
        _categories = cat_typ.deserialize(
            header["categories"], category_frames
        )

        categories = columnops.as_column(_categories)

        return cls(
            data=data,
            mask=mask,
            categories=categories,
            ordered=header["ordered"],
        )
Example 10
def find_last(arr, val):
    """
    Returns the index of the last occurrence of *val* in *arr*,
    or -1 if *val* is not found.

    Parameters
    ----------
    arr : device array
    val : scalar
    """
    found = rmm.device_array_like(arr)
    if found.size > 0:
        if arr.dtype in ('float32', 'float64'):
            gpu_mark_found_float.forall(found.size)(arr, val, found, -1)
        else:
            gpu_mark_found_int.forall(found.size)(arr, val, found, -1)
    from cudf.dataframe.columnops import as_column
    found_col = as_column(found)
    max_index = found_col.max()
    return max_index
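
A CPU sketch of the mark-and-reduce idea above (NumPy, not the GPU kernels): write each matching index into an output array that defaults to -1, then take the maximum, which is -1 when the value does not occur at all.

import numpy as np

arr = np.array([3, 1, 3, 2, 3, 5])
val = 3

found = np.where(arr == val, np.arange(arr.size), -1)
print(found)         # [ 0 -1  2 -1  4 -1]
print(found.max())   # 4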
Example 11
    def __init__(self, data=None, index=None, name=None, nan_as_null=True):
        if isinstance(data, pd.Series):
            name = data.name
            index = as_index(data.index)
        if isinstance(data, Series):
            index = data._index if index is None else index
            name = data.name
            data = data._column
        if data is None:
            data = {}

        if not isinstance(data, columnops.TypedColumnBase):
            data = columnops.as_column(data, nan_as_null=nan_as_null)

        if index is not None and not isinstance(index, Index):
            raise TypeError('index is not an Index type: got {!r}'.format(index))

        assert isinstance(data, columnops.TypedColumnBase)
        self._column = data
        self._index = RangeIndex(len(data)) if index is None else index
        self.name = name
Example 12
    def from_masked_array(cls, data, mask, null_count=None):
        """Create a Series with null-mask.
        This is equivalent to:

            Series(data).set_mask(mask, null_count=null_count)

        Parameters
        ----------
        data : 1D array-like
            The values.  Null values must not be skipped.  They can appear
            as garbage values.
        mask : 1D array-like of numpy.uint8
            The null-mask.  Valid values are marked as ``1``; otherwise ``0``.
            The mask bit given the data index ``idx`` is computed as::

                (mask[idx // 8] >> (idx % 8)) & 1
        null_count : int, optional
            The number of null values.
            If None, it is calculated automatically.
        """
        col = columnops.as_column(data).set_mask(mask, null_count=null_count)
        return cls(data=col)
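
To make the docstring's bit formula concrete, here is a small check of which rows a one-byte mask marks as valid (the mask value is illustrative):

import numpy as np

# 0b00001101 sets bits 0, 2 and 3, so rows 0, 2 and 3 are valid and row 1 is null.
mask = np.array([0b00001101], dtype=np.uint8)
for idx in range(4):
    valid = (mask[idx // 8] >> (idx % 8)) & 1
    print(idx, bool(valid))   # 0 True, 1 False, 2 True, 3 True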
Example 13
    def __init__(self, values, **kwargs):
        kwargs = _setdefault_name(values, kwargs)
        if isinstance(values, CategoricalColumn):
            values = values
        elif isinstance(values, pd.Series) and (is_categorical_dtype(
                values.dtype)):
            values = CategoricalColumn(
                data=Buffer(values.cat.codes.values),
                categories=values.cat.categories,
                ordered=values.cat.ordered,
            )
        elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
            values = CategoricalColumn(
                data=Buffer(values.codes),
                categories=values.categories,
                ordered=values.ordered,
            )
        elif isinstance(values, (list, tuple)):
            values = columnops.as_column(
                pd.Categorical(values, categories=values))
        super(CategoricalIndex, self).__init__(values, **kwargs)
        assert self._values.null_count == 0
Example 14
def as_index(arbitrary, name=None):
    """Create an Index from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of Index
        - CategoricalIndex for Categorical input.
        - DatetimeIndex for Datetime input.
        - GenericIndex for all other inputs.
    """
    # This function should probably be moved to Index.__new__
    if isinstance(arbitrary, Index):
        return arbitrary
    elif isinstance(arbitrary, NumericalColumn):
        return GenericIndex(arbitrary, name=name)
    elif isinstance(arbitrary, StringColumn):
        return StringIndex(arbitrary, name=name)
    elif isinstance(arbitrary, DatetimeColumn):
        return DatetimeIndex(arbitrary, name=name)
    elif isinstance(arbitrary, CategoricalColumn):
        return CategoricalIndex(arbitrary, name=name)
    else:
        if hasattr(arbitrary, 'name') and name is None:
            name = arbitrary.name
        if len(arbitrary) == 0:
            return RangeIndex(0, 0, name=name)
        return as_index(columnops.as_column(arbitrary), name=name)
Example 15
    def __init__(self, data=None, index=None, name=None, nan_as_null=True,
                 dtype=None):
        if isinstance(data, pd.Series):
            name = data.name
            index = as_index(data.index)
        if isinstance(data, Series):
            index = data._index if index is None else index
            name = data.name
            data = data._column
        if data is None:
            data = {}

        if not isinstance(data, columnops.TypedColumnBase):
            data = columnops.as_column(data, nan_as_null=nan_as_null,
                                       dtype=dtype)

        if index is not None and not isinstance(index, Index):
            index = as_index(index)

        assert isinstance(data, columnops.TypedColumnBase)
        self._column = data
        self._index = RangeIndex(len(data)) if index is None else index
        self.name = name
Example 16
    def fillna(self, fill_value, inplace=False):
        """
        Fill null values with *fill_value*
        """
        result = self.copy()

        if np.isscalar(fill_value):
            if fill_value != self.default_na_value():
                if (fill_value not in self.cat().categories):
                    raise ValueError("fill value must be in categories")
            fill_value = pd.Categorical(fill_value,
                                        categories=self.cat().categories)

        fill_value_col = columnops.as_column(fill_value, nan_as_null=False)

        # TODO: only required if fill_value has a subset of the categories:
        fill_value_col = fill_value_col.cat()._set_categories(
            self.cat().categories)

        cpp_replace.replace_nulls(result, fill_value_col)

        result = result.replace(mask=None)
        return self._mimic_inplace(result, inplace)
Example 17
    def fillna(self, fill_value, inplace=False):
        """
        Fill null values with *fill_value*
        """
        if np.isscalar(fill_value):
            # cast safely to the same dtype as self
            fill_value_casted = self.dtype.type(fill_value)
            if not np.isnan(fill_value) and (fill_value_casted != fill_value):
                raise TypeError(
                    "Cannot safely cast non-equivalent {} to {}".format(
                        type(fill_value).__name__, self.dtype.name
                    )
                )
            fill_value = fill_value_casted
        else:
            fill_value = columnops.as_column(fill_value, nan_as_null=False)
            # cast safely to the same dtype as self
            if is_integer_dtype(self.dtype):
                fill_value = safe_cast_to_int(fill_value, self.dtype)
            else:
                fill_value = fill_value.astype(self.dtype)
        result = cpp_replace.apply_replace_nulls(self, fill_value)
        return self._mimic_inplace(result, inplace)
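
The scalar branch only accepts a fill value that survives a round trip through the column's dtype. A small NumPy sketch of that check (the dtype and values are illustrative):

import numpy as np

col_dtype = np.dtype("int64")            # illustrative column dtype

for fill_value in (3, 0.5):
    # Cast into the column dtype and verify the value is unchanged.
    fill_value_casted = col_dtype.type(fill_value)
    if not np.isnan(fill_value) and fill_value_casted != fill_value:
        print(fill_value, "-> rejected: cannot cast safely to", col_dtype.name)
    else:
        print(fill_value, "-> accepted as", fill_value_casted)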
Example 18
    def __init__(
        self, levels=None, codes=None, labels=None, names=None, **kwargs
    ):
        from cudf.dataframe.series import Series

        self.name = None
        self.names = names
        self._source_data = None
        column_names = []
        if labels:
            warnings.warn(
                "the 'labels' keyword is deprecated, use 'codes' " "instead",
                FutureWarning,
            )
        if labels and not codes:
            codes = labels

        # early termination enables lazy evaluation of codes
        if "source_data" in kwargs:
            self._source_data = kwargs["source_data"].reset_index(drop=True)
            self._codes = codes
            self._levels = levels
            return

        # name setup
        if isinstance(
            names,
            (
                Sequence,
                pd.core.indexes.frozen.FrozenNDArray,
                pd.core.indexes.frozen.FrozenList,
            ),
        ):
            if sum(x is None for x in names) > 1:
                column_names = list(range(len(codes)))
            else:
                column_names = names
        elif names is None:
            column_names = list(range(len(codes)))
        else:
            column_names = names

        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        from cudf import DataFrame

        if not isinstance(codes, DataFrame) and not isinstance(
            codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
        ):
            raise TypeError("Codes is not a Sequence of sequences")

        if isinstance(codes, DataFrame):
            self._codes = codes
        elif len(levels) == len(codes):
            self._codes = DataFrame()
            for i, code in enumerate(codes):
                name = column_names[i] or i
                code = columnops.as_column(code)
                self._codes[name] = code.astype(np.int64)
        else:
            raise ValueError(
                "MultiIndex has unequal number of levels and "
                "codes and is inconsistent!"
            )

        self._levels = [Series(level) for level in levels]
        self._validate_levels_and_codes(self._levels, self._codes)

        self._source_data = DataFrame()
        for i, name in enumerate(self._codes.columns):
            codes = as_index(self._codes[name]._column)
            level = DataFrame({name: self._levels[i]})
            level = DataFrame(index=codes).join(level)
            self._source_data[name] = level[name].reset_index(drop=True)

        self.names = [None] * len(self._levels) if names is None else names
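
The final loop rebuilds _source_data by joining each level on that level's codes, which amounts to gathering the level values. A rough NumPy illustration (not cuDF code) of what one resulting column contains:

import numpy as np

level = np.array(["a", "b", "c"])     # one level of the MultiIndex
codes = np.array([0, 0, 2, 1])        # codes for that level
print(level[codes])                   # ['a' 'a' 'c' 'b']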
Example 19
    def _find_first_and_last(self, value):
        found_indices = self.str().contains(f"^{value}$").data.mem
        found_indices = cudautils.astype(found_indices, "int32")
        first = columnops.as_column(found_indices).find_first_value(1)
        last = columnops.as_column(found_indices).find_last_value(1)
        return first, last
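
A CPU analogue (NumPy, not cuDF) of the anchored-regex lookup above: mark exact matches, then take the first and last matching positions.

import numpy as np

values = np.array(["b", "a", "c", "a", "d"])
target = "a"

found = (values == target).astype("int32")
matches = np.flatnonzero(found)
first, last = int(matches[0]), int(matches[-1])
print(first, last)   # 1 3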
Example 20
    def wrapper(*args, **kwargs):
        ret = getattr(self._parent._data, attr)(*args, **kwargs)
        if isinstance(ret, nvstrings.nvstrings):
            ret = Series(columnops.as_column(ret),
                         index=self._index)
        return ret
Example 21
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
        Sort by the values.

        Parameters
        ----------
        by : Column or list of Column
            Column or list of Column objects to sort by.
        ascending : bool or list of bool, default True
            If True, sort values in ascending order, otherwise descending.
        na_position : {‘first’ or ‘last’}, default ‘last’
            Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs at
            the end.
        Returns
        -------
        col_inds : cuDF Column of indices sorted based on input

        Difference from pandas:
          * Support axis='index' only.
          * Not supporting: inplace, kind
          * Ascending can be a list of bools to control per column
    """
    if isinstance(by, Column):
        by = [by]

    inds = Buffer(cudautils.arange(len(by[0])))
    # This is due to current limitation in libcudf of using int32
    col_inds = columnops.as_column(inds).astype('int32')

    # This needs to be updated to handle list of bools for ascending
    if ascending is True:
        if na_position == "last":
            na_position = 0
        elif na_position == "first":
            na_position = 1
    elif ascending is False:
        if na_position == "last":
            na_position = 1
        elif na_position == "first":
            na_position = 0
    else:
        logging.warning(
            "When using a sequence of booleans for `ascending`, `na_position` "
            "flag is not yet supported and defaults to treating nulls as "
            "greater than all numbers"
        )
        na_position = 0

    # If given a scalar need to construct a sequence of length # of columns
    if np.isscalar(ascending):
        ascending = [ascending] * len(by)
    # If given a list-like need to convert to a numpy array and copy to device
    if isinstance(ascending, collections.abc.Sequence):
        # Need to flip the boolean here since libcudf has 0 as ascending
        ascending = [not val for val in ascending]
        ascending = rmm.to_device(np.array(ascending, dtype='int8'))
    else:
        raise ValueError("Must use a boolean or list of booleans")

    cpp_sort.apply_order_by(by, col_inds, ascending, na_position)

    return col_inds
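
For a single column sorted ascending with nulls last, the permutation this returns behaves like a stable argsort; a NumPy analogue (with NaN standing in for nulls, not the libcudf implementation):

import numpy as np

col = np.array([3.0, np.nan, 1.0, 2.0])

# Ascending, nulls (NaN) last: NumPy's argsort already places NaN at the end.
col_inds = np.argsort(col, kind="stable")
print(col_inds)          # [2 3 0 1]
print(col[col_inds])     # [ 1.  2.  3. nan]

# Descending with NaN still last: stable argsort of the negated values.
desc_inds = np.argsort(-col, kind="stable")
print(col[desc_inds])    # [ 3.  2.  1. nan]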
Example 22
    def append(self, other):
        from cudf.dataframe.columnops import as_column
        return Column._concat([self, as_column(other)])
Example 23
    def searchsorted(self, value, side="left"):
        value_col = columnops.as_column(value)
        return cpp_search.search_sorted(self, value_col, side)
Example 24
    def take(self, indices):
        return columnops.as_column(self._values).element_indexing(indices)
Example 25
    def astype(self, dtype):
        from cudf.dataframe import columnops

        return columnops.as_column(self).astype(dtype).data