Ejemplo n.º 1
0
def test_index_comparision():
    """Check ``equals`` between a RangeIndex and a matching GenericIndex."""
    lo, hi = 10, 34
    range_idx = RangeIndex(lo, hi)
    generic_idx = GenericIndex(np.arange(lo, hi))

    # Identical contents must compare equal in both directions.
    assert range_idx.equals(generic_idx)
    assert generic_idx.equals(range_idx)

    # A truncated range differs from the full index but matches an
    # equally truncated generic index.
    assert not range_idx[:-1].equals(generic_idx)
    assert range_idx[:-1].equals(generic_idx[:-1])
Ejemplo n.º 2
0
    def take(self, indices, ignore_index=False):
        """Gather the elements at positions *indices* into a new Series.

        Parameters
        ----------
        indices : array-like
            Positions to gather; wrapped in a ``Buffer`` and moved to a
            GPU array before use.
        ignore_index : bool, default False
            If true, the result gets a fresh ``RangeIndex``; otherwise the
            index entries at *indices* are gathered as well.
        """
        positions = Buffer(indices).to_gpu_array()

        # Nothing requested: return empty slices of the data and the index.
        if positions.size == 0:
            return self._copy_construct(data=self.data[:0],
                                        index=self.index[:0])

        # Object-dtype series are delegated to ``__getitem__``.
        if self.dtype == np.dtype("object"):
            return self[positions]

        gathered = cudautils.gather(data=self.data.to_gpu_array(),
                                    index=positions)

        # Gather the validity information too when the column has a mask.
        mask = None
        if self._column.mask:
            mask_bits = self._get_mask_as_series().take(positions).as_mask()
            mask = Buffer(mask_bits)

        index = (RangeIndex(positions.size) if ignore_index
                 else self.index.take(positions))

        col = self._column.replace(data=Buffer(gathered), mask=mask)
        return self._copy_construct(data=col, index=index)
Ejemplo n.º 3
0
def test_index_immutable():
    """Item assignment on either index type must raise ``TypeError``."""
    lo, hi = 10, 34

    range_idx = RangeIndex(lo, hi)
    with pytest.raises(TypeError):
        range_idx[1] = 5

    generic_idx = GenericIndex(np.arange(lo, hi))
    with pytest.raises(TypeError):
        generic_idx[1] = 5
Ejemplo n.º 4
0
def test_index_comparision():
    """Check ``==`` / ``!=`` between a RangeIndex and a GenericIndex."""
    lo, hi = 10, 34
    range_idx = RangeIndex(lo, hi)
    generic_idx = GenericIndex(np.arange(lo, hi))

    # Same contents: equal regardless of which operand is on the left.
    assert range_idx == generic_idx
    assert generic_idx == range_idx

    # Dropping the last element breaks equality with the full index but
    # not with an equally truncated one.
    assert range_idx[:-1] != generic_idx
    assert range_idx[:-1] == generic_idx[:-1]
Ejemplo n.º 5
0
def test_range_index(testrange):
    """Compare uniqueness/monotonicity flags with ``pandas.RangeIndex``.

    ``testrange`` is indexed as ``(start, stop, step)``.
    """
    # NOTE(review): ``step`` is only forwarded to the pandas index, not to
    # the RangeIndex under test -- confirm this is intentional.
    index = RangeIndex(start=testrange[0], stop=testrange[1])
    index_pd = pd.RangeIndex(
        start=testrange[0], stop=testrange[1], step=testrange[2]
    )

    flags = (
        "is_unique",
        "is_monotonic",
        "is_monotonic_increasing",
        "is_monotonic_decreasing",
    )
    for flag in flags:
        assert getattr(index, flag) == getattr(index_pd, flag)
Ejemplo n.º 6
0
    def __init__(self, data=None, index=None, name=None, nan_as_null=True):
        """Build a Series from *data*, an optional *index* and a *name*.

        A ``pandas.Series`` input supplies both name and index; a ``Series``
        input supplies its name and column, and its index when *index* is
        not given.  Anything that is not already a typed column is
        converted with ``columnops.as_column``.

        Raises
        ------
        TypeError
            If *index* is given and is not an ``Index`` instance.
        """
        if isinstance(data, pd.Series):
            name, index = data.name, as_index(data.index)
        if isinstance(data, Series):
            if index is None:
                index = data._index
            name = data.name
            data = data._column
        if data is None:
            data = {}

        already_column = isinstance(data, columnops.TypedColumnBase)
        if not already_column:
            data = columnops.as_column(data, nan_as_null=nan_as_null)

        if not (index is None or isinstance(index, Index)):
            raise TypeError('index not a Index type: got {!r}'.format(index))

        assert isinstance(data, columnops.TypedColumnBase)
        self._column = data
        # Default to a positional index spanning the column.
        if index is None:
            index = RangeIndex(len(data))
        self._index = index
        self.name = name
Ejemplo n.º 7
0
    def __init__(self, data=None, index=None, name=None, nan_as_null=True,
                 dtype=None):
        """Build a Series from *data*, an optional *index* and a *name*.

        A ``pandas.Series`` input supplies both name and index; a ``Series``
        input supplies its name and column, and its index when *index* is
        not given.  Anything that is not already a typed column is
        converted with ``columnops.as_column`` (forwarding *nan_as_null*
        and *dtype*).  A non-``Index`` *index* is converted with
        ``as_index``.
        """
        if isinstance(data, pd.Series):
            name, index = data.name, as_index(data.index)
        if isinstance(data, Series):
            if index is None:
                index = data._index
            name = data.name
            data = data._column
        if data is None:
            data = {}

        already_column = isinstance(data, columnops.TypedColumnBase)
        if not already_column:
            data = columnops.as_column(data, nan_as_null=nan_as_null,
                                       dtype=dtype)

        if not (index is None or isinstance(index, Index)):
            index = as_index(index)

        assert isinstance(data, columnops.TypedColumnBase)
        self._column = data
        # Default to a positional index spanning the column.
        if index is None:
            index = RangeIndex(len(data))
        self._index = index
        self.name = name
Ejemplo n.º 8
0
    def _getitem_tuple_arg(self, arg):
        """Resolve a two-element key ``arg`` against ``self._df``.

        ``arg[0]`` selects rows and ``arg[1]`` selects columns.  The columns
        are gathered first, then the rows, then the result is re-indexed
        and, where ``_can_downcast_to_series`` allows it, reduced to a
        Series or a single element.  Otherwise a DataFrame is returned.
        """
        from cudf import MultiIndex
        from cudf.dataframe.dataframe import DataFrame
        from cudf.dataframe.dataframe import Series
        from cudf.dataframe.index import as_index

        # Iloc Step 1:
        # Gather the columns specified by the second tuple arg
        columns = self._get_column_selection(arg[1])
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            # Empty selection with a non-slice row key: return an empty
            # Series named after the row key, indexed by the (empty) columns.
            if (
                len(columns_df) == 0
                and len(columns_df.columns) == 0
                and not isinstance(arg[0], slice)
            ):
                result = Series([], name=arg[0])
                result._index = columns_df.columns.copy(deep=False)
                return result
        else:
            if isinstance(arg[0], slice):
                # Build a new frame column by column, keeping the original
                # index object.
                columns_df = DataFrame()
                for col in columns:
                    columns_df.add_column(name=col, data=self._df[col])
                columns_df._index = self._df._index
            else:
                columns_df = self._df._columns_view(columns)

        # Iloc Step 2:
        # Gather the rows specified by the first tuple arg
        if isinstance(columns_df.index, MultiIndex):
            df = columns_df.index._get_row_major(columns_df, arg[0])
            if (len(df) == 1 and len(columns_df) >= 1) and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                # Pandas returns a numpy scalar in this case
                return df[0]
            if self._can_downcast_to_series(df, arg):
                return self._downcast_to_series(df, arg)
            return df
        else:
            # Apply the row key to every selected column via its ``iloc``.
            df = DataFrame()
            for key, col in columns_df._cols.items():
                df[key] = col.iloc[arg[0]]
            df.columns = columns_df.columns

        # Iloc Step 3:
        # Reindex
        if df.shape[0] == 1:  # we have a single row without an index
            if isinstance(arg[0], slice):
                # A slice without an explicit start begins at position 0.
                start = arg[0].start
                if start is None:
                    start = 0
                df.index = as_index(self._df.index[start])
            else:
                df.index = as_index(self._df.index[arg[0]])

        # Iloc Step 4:
        # Downcast
        if self._can_downcast_to_series(df, arg):
            if isinstance(df.columns, MultiIndex):
                if len(df) > 0 and not (
                    isinstance(arg[0], slice) or isinstance(arg[1], slice)
                ):
                    # Neither key is a slice: return the first element of
                    # the first column.
                    return list(df._cols.values())[0][0]
                elif df.shape[1] > 1:
                    result = self._downcast_to_series(df, arg)
                    result.index = df.columns
                    return result
                elif not isinstance(arg[0], slice):
                    # Single column, non-slice row key: relabel the column
                    # as a Series indexed by the columns and named after
                    # the row key.
                    result_series = list(df._cols.values())[0]
                    result_series.index = df.columns
                    result_series.name = arg[0]
                    return result_series
                else:
                    return list(df._cols.values())[0]
            return self._downcast_to_series(df, arg)
        if df.shape[0] == 0 and df.shape[1] == 0:
            from cudf.dataframe.index import RangeIndex

            # NOTE(review): this branch uses ``arg[0].stop`` and
            # ``arg[0].indices`` and so assumes ``arg[0]`` is a slice --
            # confirm against callers.  ``step`` is computed but not used.
            slice_len = arg[0].stop or len(self._df)
            start, stop, step = arg[0].indices(slice_len)
            df._index = RangeIndex(start, stop)
        return df
Ejemplo n.º 9
0
class Series(object):
    """
    Data and null-masks.

    ``Series`` objects are used as columns of ``DataFrame``.
    """
    @classmethod
    def from_categorical(cls, categorical, codes=None):
        """Build a Series from a ``pandas.Categorical``.

        When ``codes`` is given it is used in place of
        ``categorical.codes``.
        """
        from cudf.dataframe.categorical import pandas_categorical_as_column

        column = pandas_categorical_as_column(categorical, codes=codes)
        # Note: the result is always a plain ``Series``, not ``cls``.
        return Series(data=column)

    @classmethod
    def from_masked_array(cls, data, mask, null_count=None):
        """Build a Series from values plus a null-mask.

        Equivalent to::

            Series(data).set_mask(mask, null_count=null_count)

        Parameters
        ----------
        data : 1D array-like
            The values.  Null slots must still be present; their contents
            may be garbage.
        mask : 1D array-like of numpy.uint8
            The null-mask.  A valid value is marked by a ``1`` bit, a null
            by ``0``.  The bit for data index ``idx`` is::

                (mask[idx // 8] >> (idx % 8)) & 1
        null_count : int, optional
            Number of null values; computed automatically when None.
        """
        column = columnops.as_column(data)
        masked_column = column.set_mask(mask, null_count=null_count)
        return cls(data=masked_column)

    def __init__(self, data=None, index=None, name=None, nan_as_null=True):
        """Build a Series from *data*, an optional *index* and a *name*.

        A ``pandas.Series`` input supplies both name and index; a ``Series``
        input supplies its name and column, and its index when *index* is
        not given.  Anything that is not already a typed column is
        converted with ``columnops.as_column``.

        Raises
        ------
        TypeError
            If *index* is given and is not an ``Index`` instance.
        """
        if isinstance(data, pd.Series):
            name, index = data.name, as_index(data.index)
        if isinstance(data, Series):
            if index is None:
                index = data._index
            name = data.name
            data = data._column
        if data is None:
            data = {}

        already_column = isinstance(data, columnops.TypedColumnBase)
        if not already_column:
            data = columnops.as_column(data, nan_as_null=nan_as_null)

        if not (index is None or isinstance(index, Index)):
            raise TypeError('index not a Index type: got {!r}'.format(index))

        assert isinstance(data, columnops.TypedColumnBase)
        self._column = data
        # Default to a positional index spanning the column.
        if index is None:
            index = RangeIndex(len(data))
        self._index = index
        self.name = name

    @classmethod
    def from_pandas(cls, s, nan_as_null=True):
        """Construct from *s* by passing it straight to the constructor."""
        converted = cls(s, nan_as_null=nan_as_null)
        return converted

    @classmethod
    def from_arrow(cls, s):
        """Construct from *s* by passing it straight to the constructor."""
        converted = cls(s)
        return converted

    def serialize(self, serialize):
        """Serialize the index and the column with the *serialize* callable.

        Returns
        -------
        (header, frames)
            ``header`` records each part's sub-header and frame count;
            ``frames`` holds the index frames followed by the column frames.
        """
        index_header, index_frames = serialize(self._index)
        column_header, column_frames = serialize(self._column)

        header = {
            'index': index_header,
            'index_frame_count': len(index_frames),
            'column': column_header,
            'column_frame_count': len(column_frames),
        }
        frames = []
        for part in (index_frames, column_frames):
            frames.extend(part)
        return header, frames

    @property
    def shape(self):
        """One-element tuple holding the length of the Series."""
        n_rows = len(self)
        return (n_rows,)

    @property
    def dt(self):
        """Datetime accessor; only available for datetime columns.

        Raises
        ------
        AttributeError
            If the underlying column is not a ``DatetimeColumn``.
        """
        if not isinstance(self._column, DatetimeColumn):
            raise AttributeError("Can only use .dt accessor with datetimelike "
                                 "values")
        return DatetimeProperties(self)

    @classmethod
    def deserialize(cls, deserialize, header, frames):
        """Rebuild a Series from the output of ``serialize``.

        The first ``header['index_frame_count']`` frames belong to the
        index; the next ``header['column_frame_count']`` belong to the
        column.  The name is not part of the header, so the result is
        unnamed.
        """
        n_index = header['index_frame_count']
        index = deserialize(header['index'], frames[:n_index])

        remaining = frames[n_index:]
        n_column = header['column_frame_count']
        column = deserialize(header['column'], remaining[:n_column])
        return Series(column, index=index)

    def _copy_construct_defaults(self):
        """Constructor keyword arguments that reproduce this Series."""
        return {
            'data': self._column,
            'index': self._index,
            'name': self.name,
        }

    def _copy_construct(self, **kwargs):
        """Shallow-copy this object, overriding constructor args via kwargs.
        """
        ctor_args = dict(self._copy_construct_defaults(), **kwargs)
        return type(self)(**ctor_args)

    def copy(self, deep=True):
        """Return a copy; with ``deep`` the column itself is copied too."""
        duplicate = self._copy_construct()
        if not deep:
            return duplicate
        duplicate._column = self._column.copy(deep)
        return duplicate

    def __copy__(self, deep=True):
        """Copy hook; forwards to ``copy`` with the given *deep* flag."""
        return self.copy(deep=deep)

    def __deepcopy__(self, memo=None):
        """Deep-copy hook used by ``copy.deepcopy``.

        Parameters
        ----------
        memo : dict, optional
            Memo dictionary that ``copy.deepcopy`` passes to this hook.  It
            is accepted so that ``copy.deepcopy(series)`` works, but is not
            consulted.  It defaults to None so direct calls without an
            argument keep working.
        """
        return self.copy()

    def reset_index(self, drop=False):
        """Reset the index to a ``RangeIndex``.

        With ``drop`` a Series is returned; otherwise the reset is done on
        the frame produced by ``to_frame``.
        """
        if drop:
            return self._copy_construct(index=RangeIndex(len(self)))
        return self.to_frame().reset_index(drop=drop)

    def set_index(self, index):
        """Return a new Series that uses *index* as its index.

        Parameters
        ----------
        index : Index, Series-convertible
            The new index, or values to convert into one with ``as_index``.
        """
        if not isinstance(index, Index):
            index = as_index(index)
        return self._copy_construct(index=index)

    def as_index(self):
        """Return a new Series whose index is a fresh ``RangeIndex``."""
        positional = RangeIndex(len(self))
        return self.set_index(positional)

    def to_frame(self):
        """Convert the Series into a single-column DataFrame.

        The column is labelled with the Series name, or ``0`` when the name
        is falsy.
        """
        from cudf import DataFrame
        column_label = self.name or 0
        return DataFrame({column_label: self}, index=self.index)

    def set_mask(self, mask, null_count=None):
        """Return a new Series with *mask* as its null-mask.

        Any existing mask is overridden.  The returned Series references
        the same data buffer as this one.

        Parameters
        ----------
        mask : 1D array-like of numpy.uint8
            The null-mask.  A valid value is marked by a ``1`` bit, a null
            by ``0``.  The bit for data index ``idx`` is::

                (mask[idx // 8] >> (idx % 8)) & 1
        null_count : int, optional
            Number of null values; computed automatically when None.

        """
        masked_column = self._column.set_mask(mask, null_count=null_count)
        return self._copy_construct(data=masked_column)

    def __sizeof__(self):
        """Sum of the sizes reported by the column and by the index."""
        column_size = self._column.__sizeof__()
        index_size = self._index.__sizeof__()
        return column_size + index_size

    def __len__(self):
        """Number of rows in the ``Series``, null values included.
        """
        n_rows = len(self._column)
        return n_rows

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        """NumPy ufunc hook; only a plain call of ``sqrt`` is handled.

        Every other ufunc or method yields ``NotImplemented``.
        """
        if method != '__call__' or ufunc.__name__ != 'sqrt':
            return NotImplemented
        from cudf import sqrt
        return sqrt(self)

    @property
    def empty(self):
        """True when the Series has no rows."""
        return len(self) == 0

    def __getitem__(self, arg):
        """Select by positions, boolean mask, slice or scalar position.

        Parameters
        ----------
        arg : Series, list, ndarray, pandas.Series, range, Index,
              DeviceNDArray, slice or Number
            Array-likes are first wrapped in a ``Series``.  An integer
            Series selects by position and a boolean Series selects by
            mask.  A slice is applied to both the index and the column.
            A number returns a single element.

        Raises
        ------
        NotImplementedError
            For a selector Series that is neither integer nor boolean, or
            for an unsupported argument type.
        """
        if isinstance(
                arg,
            (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)):
            arg = Series(arg)
        if isinstance(arg, Series):
            if issubclass(arg.dtype.type, np.integer):
                selvals, selinds = columnops.column_select_by_position(
                    self._column, arg)
                index = self.index.take(selinds.to_gpu_array())
            # ``np.bool_`` alone is used here: the ``np.bool`` alias was
            # deprecated in NumPy 1.20 and is missing from some later
            # releases, and it compared equal to the same dtype anyway.
            elif arg.dtype == np.bool_:
                selvals, selinds = columnops.column_select_by_boolmask(
                    self._column, arg)
                index = self.index.take(selinds.to_gpu_array())
            else:
                raise NotImplementedError(arg.dtype)
            return self._copy_construct(data=selvals, index=index)

        elif isinstance(arg, slice):
            index = self.index[arg]  # slice index
            col = self._column[arg]  # slice column
            return self._copy_construct(data=col, index=index)
        elif isinstance(arg, Number):
            # The following triggers a IndexError if out-of-bound
            return self._column.element_indexing(arg)
        else:
            raise NotImplementedError(type(arg))

    def take(self, indices, ignore_index=False):
        """Gather the elements at positions *indices* into a new Series.

        Parameters
        ----------
        indices : array-like
            Positions to gather; wrapped in a ``Buffer`` and moved to a
            GPU array before use.
        ignore_index : bool, default False
            If true, the result gets a fresh ``RangeIndex``; otherwise the
            index entries at *indices* are gathered as well.
        """
        positions = Buffer(indices).to_gpu_array()

        # Nothing requested: return empty slices of the data and the index.
        if positions.size == 0:
            return self._copy_construct(data=self.data[:0],
                                        index=self.index[:0])

        gathered = cudautils.gather(data=self.data.to_gpu_array(),
                                    index=positions)

        # Gather the validity information too when the column has a mask.
        mask = None
        if self._column.mask:
            mask_bits = self._get_mask_as_series().take(positions).as_mask()
            mask = Buffer(mask_bits)

        index = (RangeIndex(positions.size) if ignore_index
                 else self.index.take(positions))

        col = self._column.replace(data=Buffer(gathered), mask=mask)
        return self._copy_construct(data=col, index=index)

    def _get_mask_as_series(self):
        """Return the validity of each row as a boolean Series.

        Starts from an all-ones boolean Series.  When the column has a
        mask, it is applied and the resulting nulls are filled with False.
        """
        # ``np.bool_`` replaces the ``np.bool`` alias, which was deprecated
        # in NumPy 1.20 and is missing from some later releases.  Both name
        # the same boolean dtype.
        mask = Series(cudautils.ones(len(self), dtype=np.bool_))
        if self._column.mask is not None:
            mask = mask.set_mask(self._column.mask).fillna(False)
        return mask

    def __bool__(self):
        """Refuse truth-value conversion: always raises ``TypeError``.
        """
        message = "can't compute boolean for {!r}".format(type(self))
        raise TypeError(message)

    def values_to_string(self, nrows=None):
        """Return the first *nrows* elements as a list of strings.

        ``None`` elements are rendered as the empty string.
        """
        rendered = []
        for value in self[:nrows]:
            rendered.append('' if value is None else str(value))
        return rendered

    def head(self, n=5):
        """Return the leading rows selected by ``iloc[:n]``."""
        leading = slice(None, n)
        return self.iloc[leading]

    def tail(self, n=5):
        """
        Return the last *n* rows as a new Series.

        Examples
        --------
        >>> import cudf
        >>> ser = cudf.Series([4, 3, 2, 1, 0])
        >>> print(ser.tail(2))
        3    1
        4    0
        """
        # ``-0`` would make the slice start at the beginning, so the
        # zero-row request gets an explicit empty slice.
        trailing = slice(0, 0) if n == 0 else slice(-n, None)
        return self.iloc[trailing]

    def to_string(self, nrows=NOTSET):
        """Render the Series as a string.

        Parameters
        ----------
        nrows : int
            Maximum number of rows to show.
            If it is None, all rows are shown.
        """
        if nrows is NOTSET:
            # NOTE(review): the lookup key here is the NOTSET sentinel
            # itself rather than a settings name -- confirm this is the
            # intended key.
            nrows = settings.formatting.get(nrows)

        total_rows = len(self)
        if total_rows == 0:
            return "<empty Series of dtype={}>".format(self.dtype)

        # Never show more rows than the Series has.
        if nrows is None:
            nrows = len(self)
        else:
            nrows = min(nrows, len(self))

        hidden_rows = len(self) - nrows

        # A single unnamed column of pre-rendered cells.
        cells = OrderedDict([('', self.values_to_string(nrows=nrows))])
        table = formatting.format(index=self.index,
                                  cols=cells,
                                  more_rows=hidden_rows,
                                  series_spacing=True)

        # The footer mentions the name only when it is truthy.
        if self.name:
            footer = "\nName: {}, dtype: {}".format(self.name, self.dtype)
        else:
            footer = "\ndtype: {}".format(self.dtype)
        return table + footer

    def __str__(self):
        """String form: ``to_string`` limited to ten rows."""
        max_rows = 10
        return self.to_string(nrows=max_rows)

    def __repr__(self):
        """Short representation that reports only the row count."""
        row_count = len(self)
        return "<cudf.Series nrows={} >".format(row_count)

    def _binaryop(self, other, fn):
        """
        Internal util to call a binary operator *fn* on operands *self*
        and *other*.  Return the output Series.  The output dtype is
        determined by the input operands.

        A ``DataFrame`` operand is handled by delegating to its own
        ``_binaryop``.  The result of the Series path is unnamed.
        """
        from cudf import DataFrame
        if isinstance(other, DataFrame):
            return other._binaryop(self, fn)
        nvtx_range_push("CUDF_BINARY_OP", "orange")
        try:
            other = self._normalize_binop_value(other)
            outcol = self._column.binary_operator(fn, other._column)
            result = self._copy_construct(data=outcol)
            result.name = None
            return result
        finally:
            # Pop the range even when the operation raises, so push and
            # pop stay balanced.
            nvtx_range_pop()

    def _rbinaryop(self, other, fn):
        """
        Internal util to call a binary operator *fn* on operands *self*
        and *other* for reflected operations.  Return the output Series.
        The output dtype is determined by the input operands.

        A ``DataFrame`` operand is handled by delegating to its own
        ``_binaryop``.  Otherwise *other* is the left-hand operand and
        *self* the right-hand one.  The result is unnamed.
        """
        from cudf import DataFrame
        if isinstance(other, DataFrame):
            return other._binaryop(self, fn)
        nvtx_range_push("CUDF_BINARY_OP", "orange")
        try:
            other = self._normalize_binop_value(other)
            outcol = other._column.binary_operator(fn, self._column)
            result = self._copy_construct(data=outcol)
            result.name = None
            return result
        finally:
            # Pop the range even when the operation raises, so push and
            # pop stay balanced.
            nvtx_range_pop()

    def _unaryop(self, fn):
        """
        Apply the unary operator *fn* to this Series' column and wrap the
        resulting column in a new Series.  Index and name are carried over.
        """
        result_column = self._column.unary_operator(fn)
        return self._copy_construct(data=result_column)

    def __add__(self, other):
        """``self + other`` via the ``'add'`` binary operator."""
        return self._binaryop(other=other, fn='add')

    def __radd__(self, other):
        """``other + self`` via the reflected ``'add'`` binary operator."""
        return self._rbinaryop(other=other, fn='add')

    def __sub__(self, other):
        """``self - other`` via the ``'sub'`` binary operator."""
        return self._binaryop(other=other, fn='sub')

    def __rsub__(self, other):
        """``other - self`` via the reflected ``'sub'`` binary operator."""
        return self._rbinaryop(other=other, fn='sub')

    def __mul__(self, other):
        """``self * other`` via the ``'mul'`` binary operator."""
        return self._binaryop(other=other, fn='mul')

    def __rmul__(self, other):
        """``other * self`` via the reflected ``'mul'`` binary operator."""
        return self._rbinaryop(other=other, fn='mul')

    def __mod__(self, other):
        """``self % other`` via the ``'mod'`` binary operator."""
        return self._binaryop(other=other, fn='mod')

    def __rmod__(self, other):
        """``other % self`` via the reflected ``'mod'`` binary operator."""
        return self._rbinaryop(other=other, fn='mod')

    def __pow__(self, other):
        """``self ** other`` via the ``'pow'`` binary operator."""
        return self._binaryop(other=other, fn='pow')

    def __floordiv__(self, other):
        """``self // other`` via the ``'floordiv'`` binary operator."""
        return self._binaryop(other=other, fn='floordiv')

    def __rfloordiv__(self, other):
        """``other // self`` via the reflected ``'floordiv'`` operator."""
        return self._rbinaryop(other=other, fn='floordiv')

    def __truediv__(self, other):
        """``self / other`` via the ``'truediv'`` binary operator.

        If this dtype is listed in ``truediv_int_dtype_corrections``, the
        Series is first cast to the corrected dtype given there.
        """
        operand = self
        if self.dtype in list(truediv_int_dtype_corrections.keys()):
            corrected = truediv_int_dtype_corrections[str(self.dtype)]
            operand = self.astype(corrected)
        return operand._binaryop(other, 'truediv')

    def __rtruediv__(self, other):
        """``other / self`` via the reflected ``'truediv'`` operator.

        If this dtype is listed in ``truediv_int_dtype_corrections``, the
        Series is first cast to the corrected dtype given there.
        """
        operand = self
        if self.dtype in list(truediv_int_dtype_corrections.keys()):
            corrected = truediv_int_dtype_corrections[str(self.dtype)]
            operand = self.astype(corrected)
        return operand._rbinaryop(other, 'truediv')

    __div__ = __truediv__

    def _normalize_binop_value(self, other):
        """Coerce the operand *other* into a Series.

        A Series is returned unchanged and an Index is wrapped in a Series.
        Anything else is normalized by the column and wrapped in a copy of
        this Series.
        """
        if isinstance(other, Series):
            return other
        if isinstance(other, Index):
            return Series(other)
        normalized = self._column.normalize_binop_value(other)
        return self._copy_construct(data=normalized)

    def _unordered_compare(self, other, cmpops):
        """Compare with *other* using the unordered comparison *cmpops*.

        *other* is normalized to a Series first.  The result is an unnamed
        Series built from the column's ``unordered_compare`` output.
        """
        nvtx_range_push("CUDF_UNORDERED_COMP", "orange")
        try:
            other = self._normalize_binop_value(other)
            outcol = self._column.unordered_compare(cmpops, other._column)
            result = self._copy_construct(data=outcol)
            result.name = None
            return result
        finally:
            # Pop the range even when the comparison raises, so push and
            # pop stay balanced.
            nvtx_range_pop()

    def _ordered_compare(self, other, cmpops):
        """Compare with *other* using the ordered comparison *cmpops*.

        *other* is normalized to a Series first.  The result is an unnamed
        Series built from the column's ``ordered_compare`` output.
        """
        nvtx_range_push("CUDF_ORDERED_COMP", "orange")
        try:
            other = self._normalize_binop_value(other)
            outcol = self._column.ordered_compare(cmpops, other._column)
            result = self._copy_construct(data=outcol)
            result.name = None
            return result
        finally:
            # Pop the range even when the comparison raises, so push and
            # pop stay balanced.
            nvtx_range_pop()

    def __eq__(self, other):
        """``self == other`` via the unordered ``'eq'`` comparison."""
        return self._unordered_compare(other=other, cmpops='eq')

    def __ne__(self, other):
        """``self != other`` via the unordered ``'ne'`` comparison."""
        return self._unordered_compare(other=other, cmpops='ne')

    def __lt__(self, other):
        """``self < other`` via the ordered ``'lt'`` comparison."""
        return self._ordered_compare(other=other, cmpops='lt')

    def __le__(self, other):
        """``self <= other`` via the ordered ``'le'`` comparison."""
        return self._ordered_compare(other=other, cmpops='le')

    def __gt__(self, other):
        """``self > other`` via the ordered ``'gt'`` comparison."""
        return self._ordered_compare(other=other, cmpops='gt')

    def __ge__(self, other):
        """``self >= other`` via the ordered ``'ge'`` comparison."""
        return self._ordered_compare(other=other, cmpops='ge')

    @property
    def cat(self):
        """Result of calling ``cat()`` on the underlying column."""
        accessor = self._column.cat()
        return accessor

    @property
    def dtype(self):
        """The dtype of the underlying column."""
        column = self._column
        return column.dtype

    @classmethod
    def _concat(cls, objs, axis=0, index=True):
        """Concatenate the Series in *objs* into one.

        When *index* is ``True`` the indexes of *objs* are concatenated as
        well; any other value is used as the result's index as-is.  The
        result keeps a name only if all inputs share the same one.
        """
        # Concatenate index if not provided
        if index is True:
            index = Index._concat([o.index for o in objs])

        distinct_names = set()
        for obj in objs:
            distinct_names.add(obj.name)
        name = None
        if len(distinct_names) == 1:
            name = next(iter(distinct_names))

        columns = [o._column for o in objs]
        return cls(data=Column._concat(columns), index=index, name=name)

    def append(self, arbitrary):
        """Append values from another ``Series`` or array-like object.

        Returns a new Series built from the combined column only, so the
        index is reset.
        """
        appended_column = Series(arbitrary)._column
        combined = self._column.append(appended_column)
        return Series(combined)

    @property
    def valid_count(self):
        """Number of non-null values, as reported by the column."""
        column = self._column
        return column.valid_count

    @property
    def null_count(self):
        """Number of null values, as reported by the column."""
        column = self._column
        return column.null_count

    @property
    def has_null_mask(self):
        """Boolean telling whether a null-mask is needed (from the column).
        """
        column = self._column
        return column.has_null_mask

    def masked_assign(self, value, mask):
        """Assign a scalar to the positions selected by a boolean mask.

        This backs expressions such as ``df[df < 0] = 0``.

        Parameters
        ----------
        value : scalar
            Scalar value to assign.
        mask : cudf Series
            Boolean Series.

        Returns
        -------
        cudf Series
            Series with *value* set where *mask* is True.
        """
        assigned_column = self._column.masked_assign(value, mask)
        return self._copy_construct(data=assigned_column)

    def fillna(self, value, method=None, axis=None, inplace=False, limit=None):
        """Fill null values with ``value``.

        Parameters
        ----------
        value : scalar or Series-like
            Value to use to fill nulls. If Series-like, null values
            are filled with the values in corresponding indices of the
            given Series.
        method, axis, limit
            Not supported; passing a non-default value raises.
        inplace : bool, default False
            Forwarded to the column's ``fillna``; when true, nothing is
            returned.

        Returns
        -------
        result : Series
            Copy with nulls filled (only when ``inplace`` is false).

        Raises
        ------
        NotImplementedError
            If ``method`` or ``limit`` is not None, or ``axis`` is truthy.
        """
        if method is not None:
            raise NotImplementedError("The method keyword is not supported")
        if limit is not None:
            raise NotImplementedError("The limit keyword is not supported")
        if axis:
            raise NotImplementedError("The axis keyword is not supported")

        filled = self._column.fillna(value, inplace=inplace)
        if inplace:
            return None
        return self._copy_construct(data=filled)

    def to_array(self, fillna=None):
        """Return the data as a dense numpy array.

        Parameters
        ----------
        fillna : str or None
            Defaults to None, which skips null values.
            If it equals "pandas", null values are filled with NaNs and
            non-integral dtypes are promoted to np.float64.

        Notes
        -----

        With ``fillna=None`` null values are skipped, so the output may be
        shorter than the Series.
        """
        column = self._column
        return column.to_array(fillna=fillna)

    def to_gpu_array(self, fillna=None):
        """Return the data as a dense numba device array.

        Parameters
        ----------
        fillna : str or None
            See *fillna* in ``.to_array``.

        Notes
        -----

        With ``fillna=None`` null values are skipped, so the output may be
        shorter than the Series.
        """
        column = self._column
        return column.to_gpu_array(fillna=fillna)

    def to_pandas(self, index=True):
        """Convert via the column's ``to_pandas`` and carry the name over.

        With ``index=True`` this Series' own index is converted and used;
        any other value is forwarded to the column unchanged.
        """
        pandas_index = self.index.to_pandas() if index is True else index
        converted = self._column.to_pandas(index=pandas_index)
        converted.name = self.name
        return converted

    def to_arrow(self):
        """Return the result of the column's ``to_arrow``."""
        column = self._column
        return column.to_arrow()

    @property
    def data(self):
        """The gpu buffer holding the data (the column's ``data``).
        """
        column = self._column
        return column.data

    @property
    def index(self):
        """The index object attached to this Series.
        """
        current_index = self._index
        return current_index

    @property
    def iloc(self):
        """
        Accessor for integer-location based selection.

        Examples
        --------
        >>> import cudf
        >>> sr = cudf.Series(list(range(20)))

        Get the value at position 1

        >>> sr.iloc[1]
        1

        Get the values at positions 0, 2, 9 and 18

        >>> sr.iloc[0,2,9,18]
         0    0
         2    2
         9    9
        18   18

        Get the values selected by a slice

        >>> sr.iloc[3:10:2]
        3    3
        5    5
        7    7
        9    9

        Returns
        -------
        Series containing the elements corresponding to the indices
        """
        accessor = Iloc(self)
        return accessor

    @property
    def nullmask(self):
        """The gpu buffer holding the null-mask (the column's ``nullmask``).
        """
        column = self._column
        return column.nullmask

    def as_mask(self):
        """Convert the boolean values into a bitmask.

        Returns
        -------
        device array
        """
        booleans = self.to_gpu_array()
        return cudautils.compact_mask_bytes(booleans)

    def astype(self, dtype):
        """Cast to the given ``dtype``.

        Returns
        -------
        ``self`` when *dtype* equals the current dtype; otherwise a new
        ``Series`` wrapping the column cast to *dtype*.
        """
        unchanged = dtype == self.dtype
        if unchanged:
            return self

        cast_column = self._column.astype(dtype)
        return self._copy_construct(data=cast_column)

    def argsort(self, ascending=True, na_position="last"):
        """Return the Series of positions that would sort this Series.

        This is the second element of what ``_sort`` returns.

        Returns
        -------
        result: Series
        """
        sort_result = self._sort(ascending=ascending,
                                 na_position=na_position)
        order = sort_result[1]
        return order

    def sort_index(self, ascending=True):
        """Return the Series reordered so that its index is sorted.
        """
        order = self.index.argsort(ascending=ascending)
        positions = order.to_gpu_array()
        return self.take(positions)

    def sort_values(self, ascending=True, na_position="last"):
        """
        Sort by the values.

        The index entries are reordered along with their values.  An empty
        Series is returned as-is.

        Parameters
        ----------
        ascending : bool, default True
            If True, sort values in ascending order, otherwise descending.
        na_position : {'first', 'last'}, default 'last'
            'first' puts nulls at the beginning, 'last' puts nulls at the end.
        Returns
        -------
        sorted_obj : cuDF Series

        Difference from pandas:
          * Not supporting: inplace, kind

        Examples
        --------
        >>> import cudf
        >>> s = cudf.Series([1, 5, 2, 4, 3])
        >>> s.sort_values()
        0    1
        2    2
        4    3
        3    4
        1    5
        """
        if not len(self):
            return self
        sorted_values, order = self._sort(ascending=ascending,
                                          na_position=na_position)
        reordered_index = self.index.take(order.to_gpu_array())
        return sorted_values.set_index(reordered_index)

    def _n_largest_or_smallest(self, largest, n, keep):
        """Shared implementation of ``nlargest`` and ``nsmallest``.

        Parameters
        ----------
        largest : bool
            True selects the largest values, False the smallest.
        n : int
            Number of elements to return; must satisfy ``0 <= n < len``.
        keep : {'first', 'last'}
            ``'first'`` takes the first *n* rows of the sort in the
            requested direction.  ``'last'`` takes the last *n* rows of
            the opposite sort and reverses them.

        Raises
        ------
        ValueError
            If *n* is out of bounds or *keep* is not recognised.
        """
        if not (0 <= n < len(self)):
            raise ValueError("n out-of-bound")
        direction = largest
        if keep == 'first':
            return self.sort_values(ascending=not direction)[:n]
        elif keep == 'last':
            ordered = self.sort_values(ascending=direction)
            if n == 0:
                # ``[-0:]`` is ``[0:]`` and would select every row, so the
                # zero-row request is sliced explicitly.
                return ordered[:0]
            return ordered[-n:].reverse()
        else:
            raise ValueError('keep must be either "first", "last"')

    def nlargest(self, n=5, keep='first'):
        """Return a new Series holding the *n* largest elements.
        """
        return self._n_largest_or_smallest(largest=True, n=n, keep=keep)

    def nsmallest(self, n=5, keep='first'):
        """Return a new Series holding the *n* smallest elements.
        """
        return self._n_largest_or_smallest(largest=False, n=n, keep=keep)

    def _sort(self, ascending=True, na_position="last"):
        """
        Sort by values using the column's ``sort_by_values``.

        Both outputs are wrapped in copies of this Series, so they carry
        its index and name.

        Returns
        -------
        2-tuple of key and index
        """
        sorted_columns = self._column.sort_by_values(
            ascending=ascending, na_position=na_position)
        key_column, order_column = sorted_columns
        keys = self._copy_construct(data=key_column)
        order = self._copy_construct(data=order_column)
        return keys, order

    def replace(self, to_replace, value):
        """
        Replace values given in *to_replace* with *value*.

        Parameters
        ----------
        to_replace : numeric, str or list-like
            Value(s) to replace.

            * numeric or str:

                - values equal to *to_replace* will be replaced with *value*

            * list of numeric or str:

                - If *value* is also list-like, *to_replace* and *value* must
                be of same length.
        value : numeric, str or list-like
            Value(s) to replace `to_replace` with.  A scalar *value* paired
            with a list-like *to_replace* is broadcast to its length.
            Dict-like arguments are not supported.

        See also
        --------
        Series.fillna

        Returns
        -------
        result : Series
            Series after replacement. The mask and index are preserved.

        Raises
        ------
        TypeError
            If *to_replace* is a scalar but *value* is not, or if either
            argument is dict-like.
        ValueError
            If the replacement lists differ in length.
        """
        if not is_scalar(to_replace):
            if is_scalar(value):
                # One replacement value for every entry of ``to_replace``.
                value = utils.scalar_broadcast_to(value, (len(to_replace), ),
                                                  np.dtype(type(value)))
        else:
            if not is_scalar(value):
                raise TypeError("Incompatible types '{}' and '{}' "
                                "for *to_replace* and *value*.".format(
                                    type(to_replace).__name__,
                                    type(value).__name__))
            to_replace = [to_replace]
            value = [value]

        if len(to_replace) != len(value):
            # The literal pieces carry their own separating spaces; without
            # them the adjacent literals ran the words together.
            raise ValueError("Replacement lists must be "
                             "of same length. "
                             "Expected {}, got {}.".format(
                                 len(to_replace), len(value)))

        if is_dict_like(to_replace) or is_dict_like(value):
            raise TypeError("Dict-like args not supported in Series.replace()")

        result = self._column.find_and_replace(to_replace, value)

        return self._copy_construct(data=result)

    def reverse(self):
        """Return a new Series with its values and index reversed.

        Both the values and the index values are passed through
        ``cudautils.reverse_array``; the reversed values are installed in
        a copy of the column via ``replace(data=...)``.
        """
        # NOTE(review): only ``data`` is handed to ``replace``; confirm how
        # the null mask is meant to be handled for series containing nulls.
        flipped_values = cudautils.reverse_array(self.to_gpu_array())
        flipped_index = as_index(
            cudautils.reverse_array(self.index.gpu_values))
        new_column = self._column.replace(data=Buffer(flipped_values))
        return self._copy_construct(data=new_column, index=flipped_index)

    def one_hot_encoding(self, cats, dtype='float64'):
        """Perform one-hot-encoding

        Parameters
        ----------
        cats : sequence of values
                values representing each category.
        dtype : numpy.dtype
                specifies the output dtype.

        Returns
        -------
        A list with one new Series per entry of ``cats`` (so its length
        equals the length of ``cats``).  Each Series shares this series'
        index.

        Raises
        ------
        TypeError
            If this series is not of integer or float dtype.
        """
        if self.dtype.kind not in 'iuf':
            raise TypeError('expecting integer or float dtype')

        out_dtype = np.dtype(dtype)

        def encode(cat):
            # No null mask is passed to the comparison helper.
            flags = cudautils.apply_equal_constant(
                arr=self.data.to_gpu_array(),
                mask=None,
                val=cat,
                dtype=out_dtype)
            return Series(flags, index=self.index)

        return [encode(cat) for cat in cats]

    def label_encoding(self, cats, dtype=None, na_sentinel=-1):
        """Perform label encoding

        Parameters
        ----------
        cats : sequence of input values
            The categories to encode against.
        dtype: numpy.dtype; optional
               Specifies the output dtype.  If `None` is given, the
               smallest possible integer dtype (starting with np.int32)
               is used.
        na_sentinel : number
            Value to indicate missing category.

        Returns
        -------
        A sequence of encoded labels with value between 0 and n-1 classes(cats)
        The result is a new Series built from the labels alone (no index
        is passed to its constructor).

        Raises
        ------
        ValueError
            If this series contains null values.
        TypeError
            If this series is not of integer or float dtype.
        """
        if self.null_count != 0:
            raise ValueError('series contains NULL values')

        if self.dtype.kind not in 'iuf':
            raise TypeError('expecting integer or float dtype')

        values = self.to_gpu_array()
        categories = Series(cats)
        if dtype is None:
            # Smallest width able to hold the category count, but never
            # narrower than 32 bits.
            nbytes = max(4, np.min_scalar_type(len(cats)).itemsize)
            dtype = getattr(np, "int{}".format(nbytes * 8))
        out_dtype = np.dtype(dtype)
        codes = cudautils.apply_label(values, categories.to_gpu_array(),
                                      out_dtype, na_sentinel)

        return Series(codes)

    def factorize(self, na_sentinel=-1):
        """Encode the input values as integer labels

        Parameters
        ----------
        na_sentinel : number
            Value to indicate missing category.  It is forwarded to
            ``label_encoding``.

        Returns
        -------
        (labels, cats) : (Series, Series)
            - *labels* contains the encoded values
            - *cats* contains the categories in order that the N-th
              item corresponds to the (N-1) code.
        """
        cats = self.unique()
        # Forward the caller's sentinel; previously it was silently dropped
        # and the label_encoding default was always used.
        labels = self.label_encoding(cats=cats, na_sentinel=na_sentinel)
        return labels, cats

    # UDF related

    def applymap(self, udf, out_dtype=None):
        """Apply an elementwise function to transform the values in the Column.

        The user function is expected to take one argument and return the
        result, which will be stored to the output Series.  The function
        cannot reference globals except for other simple scalar objects.

        Parameters
        ----------
        udf : function
            Wrapped by ``numba.cuda.jit`` for call on the GPU as a device
            function.
        out_dtype  : numpy.dtype; optional
            The dtype for use in the output.
            By default, the result will have the same dtype as the source.

        Returns
        -------
        result : Series
            The mask and index are preserved.
        """
        return self._copy_construct(
            data=self._column.applymap(udf, out_dtype=out_dtype))

    # Find / Search

    def find_first_value(self, value):
        """Return the offset of the first element matching *value*.

        The lookup is delegated to the underlying column.
        """
        column = self._column
        return column.find_first_value(value)

    def find_last_value(self, value):
        """Return the offset of the last element matching *value*.

        The lookup is delegated to the underlying column.
        """
        column = self._column
        return column.find_last_value(value)

    #
    # Stats
    #
    def count(self, axis=None, skipna=True):
        """Return the number of non-null values.

        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        return self.valid_count

    def min(self, axis=None, skipna=True):
        """Return the minimum of the series, as computed by the column.

        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        column = self._column
        return column.min()

    def max(self, axis=None, skipna=True):
        """Return the maximum of the series, as computed by the column.

        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        column = self._column
        return column.max()

    def sum(self, axis=None, skipna=True):
        """Return the sum of the series, as computed by the column.

        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        column = self._column
        return column.sum()

    def product(self, axis=None, skipna=True):
        """Return the product of the series, as computed by the column.

        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        column = self._column
        return column.product()

    def mean(self, axis=None, skipna=True):
        """Return the mean of the series, as computed by the column.

        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        column = self._column
        return column.mean()

    def std(self, ddof=1, axis=None, skipna=True):
        """Return the standard deviation of the series.

        Computed as the square root of ``self.var(ddof=ddof)``.  Only
        ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        variance = self.var(ddof=ddof)
        return np.sqrt(variance)

    def var(self, ddof=1, axis=None, skipna=True):
        """Return the variance of the series.

        Takes the variance component of ``self.mean_var(ddof=ddof)``.
        Only ``axis`` in ``(None, 0)`` and ``skipna=True`` are accepted;
        anything else fails an assertion.
        """
        assert axis in (None, 0)
        assert skipna is True
        _, variance = self.mean_var(ddof=ddof)
        return variance

    def mean_var(self, ddof=1):
        """Return ``(mean, variance)`` computed together by the column.

        *ddof* is forwarded to the column's ``mean_var``.
        """
        mean_value, variance = self._column.mean_var(ddof=ddof)
        return mean_value, variance

    def sum_of_squares(self):
        """Return the result of the column's ``sum_of_squares``."""
        column = self._column
        return column.sum_of_squares()

    def unique_k(self, k):
        """Deprecated alias of ``unique()``.

        Emits a ``DeprecationWarning`` and returns ``self.unique()``.
        The *k* argument is ignored.
        """
        # stacklevel=2 attributes the warning to the caller's line, which
        # is the code that needs to be updated.
        warnings.warn("Use .unique() instead", DeprecationWarning,
                      stacklevel=2)
        return self.unique()

    def unique(self, method='sort', sort=True):
        """Returns unique values of this Series.
        default='sort' will be changed to 'hash' when implemented.

        Only ``method='sort'`` with ``sort=True`` is implemented.  When
        every element is null (this includes an empty series) an empty
        NumPy array of this series' dtype is returned; otherwise the
        result is a new Series.

        Raises
        ------
        NotImplementedError
            For any other *method*, or when *sort* is false.
        """
        if method != 'sort':
            raise NotImplementedError(
                'non sort based unique() not implemented yet')
        if not sort:
            raise NotImplementedError(
                'not sorted unique not implemented yet.')
        if self.null_count == len(self):
            return np.empty(0, dtype=self.dtype)
        return Series(self._column.unique(method=method))

    def nunique(self, method='sort', dropna=True):
        """Returns the number of unique values of the Series.

        Original author's note: approximate version, and exact version to
        be moved to libgdf.

        Only ``method='sort'`` is implemented.  When every element is
        null (this includes an empty series) the result is ``0``;
        otherwise the count comes from the column's ``unique_count``.

        Raises
        ------
        NotImplementedError
            For any other *method*.
        """
        if method != 'sort':
            raise NotImplementedError(
                'non sort based unique_count() not implemented yet')
        all_null = self.null_count == len(self)
        if all_null:
            return 0
        return self._column.unique_count(method=method, dropna=dropna)

    def value_counts(self, method='sort', sort=True):
        """Return a Series of counts indexed by the distinct values.

        Only ``method='sort'`` is implemented.  When every element is
        null (this includes an empty series) an empty int64 Series is
        returned.  With ``sort=True`` the result is sorted by count in
        descending order.

        Raises
        ------
        NotImplementedError
            For any other *method*.
        """
        if method != 'sort':
            raise NotImplementedError(
                'non sort based value_count() not implemented yet')
        if self.null_count == len(self):
            return Series(np.array([], dtype=np.int64))
        values, counts = self._column.value_counts(method=method)
        result = Series(counts, index=as_index(values))
        return result.sort_values(ascending=False) if sort else result

    def scale(self):
        """Scale values to [0, 1] in float64

        The series minimum and maximum are passed, together with the
        values, to ``cudautils.compute_scale``.

        Raises
        ------
        NotImplementedError
            If the series contains null values.
        """
        if self.null_count != 0:
            raise NotImplementedError(
                'masked series not supported by this operation')
        lowest, highest = self.min(), self.max()
        scaled = cudautils.compute_scale(self.to_gpu_array(), lowest,
                                         highest)
        return self._copy_construct(data=scaled)

    # Rounding
    def ceil(self):
        """Round each value up to the smallest integral value that is not
        less than the original.

        Returns a new Series.
        """
        operation = 'ceil'
        return self._unaryop(operation)

    def floor(self):
        """Round each value down to the largest integral value that is not
        greater than the original.

        Returns a new Series.
        """
        operation = 'floor'
        return self._unaryop(operation)

    # Misc

    def hash_values(self):
        """Compute the hash of values in this column.

        Returns a new Series wrapping the result of
        ``numerical.column_hash_values``.
        """
        from cudf.dataframe import numerical

        hashed = numerical.column_hash_values(self._column)
        return Series(hashed)

    def hash_encode(self, stop, use_name=False):
        """Encode column values as ints in [0, stop) using hash function.

        Parameters
        ----------
        stop : int
            The upper bound on the encoding range.  Must be positive
            (checked with an assertion).
        use_name : bool
            If ``True`` then combine hashed column values
            with hashed column name. This is useful for when the same
            values in different columns should be encoded
            with different hashed values.

        Returns
        -------
        result: Series
            The encoded Series.
        """
        assert stop > 0

        from cudf.dataframe import numerical
        if use_name:
            # NOTE(review): the builtin hash() of a str is salted per
            # interpreter process unless PYTHONHASHSEED is fixed; if the
            # name is a str, confirm run-to-run stability is not required.
            seed = np.asarray(hash(self.name))
        else:
            seed = None
        hashed = numerical.column_hash_values(self._column,
                                              initial_hash_values=seed)

        # TODO: Binary op when https://github.com/rapidsai/cudf/pull/892 merged
        buckets = cudautils.modulo(hashed.data.to_gpu_array(), stop)
        return Series(buckets)

    def quantile(self,
                 q,
                 interpolation='midpoint',
                 exact=True,
                 quant_index=True):
        """
        Return values at the given quantile.

        Parameters
        ----------

        q : float or array-like
            0 <= q <= 1, the quantile(s) to compute
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points i and j.
            Defaults to 'midpoint'.
        exact : boolean
            Whether to use approximate or exact quantile algorithm.
        quant_index : boolean
            Whether to use the list of quantiles as index.

        Returns
        -------

        Series

        """
        values = self._column.quantile(q, interpolation, exact)
        if quant_index:
            return Series(values, index=as_index(np.asarray(q)))
        return Series(values)

    def digitize(self, bins, right=False):
        """Return the indices of the bins to which each value in series belongs.

        Notes
        -----
        Monotonicity of bins is assumed and not checked.

        Parameters
        ----------
        bins : np.array
            1-D monotonically increasing array with same type as this series.
        right : bool
            Indicates whether interval contains the right or left bin edge.

        Returns
        -------
        A new Series containing the indices.
        """
        from cudf.dataframe import numerical

        bin_indices = numerical.digitize(self._column, bins, right)
        return Series(bin_indices)

    def groupby(self, group_series=None, level=None, sort=False):
        """Return a ``SeriesGroupBy`` for this series.

        ``group_series``, ``level`` and ``sort`` are forwarded
        positionally, in that order, after ``self``.
        """
        from cudf.groupby.groupby import SeriesGroupBy
        grouped = SeriesGroupBy(self, group_series, level, sort)
        return grouped

    def to_json(self, path_or_buf=None, *args, **kwargs):
        """
        Convert the cuDF object to a JSON string.
        Note nulls and NaNs will be converted to null and datetime objects
        will be converted to UNIX timestamps.

        All arguments are forwarded to ``cudf.io.json.to_json``.

        Parameters
        ----------
        path_or_buf : string or file handle, optional
            File path or object. If not specified, the result is returned as
            a string.
        orient : string
            Indication of expected JSON string format.
            * Series
                - default is 'index'
                - allowed values are: {'split','records','index','table'}
            * DataFrame
                - default is 'columns'
                - allowed values are:
                {'split','records','index','columns','values','table'}
            * The format of the JSON string
                - 'split' : dict like {'index' -> [index],
                'columns' -> [columns], 'data' -> [values]}
                - 'records' : list like
                [{column -> value}, ... , {column -> value}]
                - 'index' : dict like {index -> {column -> value}}
                - 'columns' : dict like {column -> {index -> value}}
                - 'values' : just the values array
                - 'table' : dict like {'schema': {schema}, 'data': {data}}
                describing the data, and the data component is
                like ``orient='records'``.
        date_format : {None, 'epoch', 'iso'}
            Type of date conversion. 'epoch' = epoch milliseconds,
            'iso' = ISO8601. The default depends on the `orient`. For
            ``orient='table'``, the default is 'iso'. For all other orients,
            the default is 'epoch'.
        double_precision : int, default 10
            The number of decimal places to use when encoding
            floating point values.
        force_ascii : bool, default True
            Force encoded string to be ASCII.
        date_unit : string, default 'ms' (milliseconds)
            The time unit to encode to, governs timestamp and ISO8601
            precision.  One of 's', 'ms', 'us', 'ns' for second, millisecond,
            microsecond, and nanosecond respectively.
        default_handler : callable, default None
            Handler to call if object cannot otherwise be converted to a
            suitable format for JSON. Should receive a single argument which is
            the object to convert and return a serialisable object.
        lines : bool, default False
            If 'orient' is 'records' write out line delimited json format. Will
            throw ValueError if incorrect 'orient' since others are not list
            like.
        compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
            A string representing the compression to use in the output file,
            only used when the first argument is a filename. By default, the
            compression is inferred from the filename.
        index : bool, default True
            Whether to include the index values in the JSON string. Not
            including the index (``index=False``) is only supported when
            orient is 'split' or 'table'.

        Returns
        -------
        Whatever ``cudf.io.json.to_json`` returns; per the ``path_or_buf``
        description above, that is the JSON string when no path or buffer
        is given.
        """
        import cudf.io.json as json
        # Propagate the writer's result; it was previously discarded, so
        # the method always returned None.
        return json.to_json(self, path_or_buf=path_or_buf, *args, **kwargs)

    def to_hdf(self, path_or_buf, key, *args, **kwargs):
        """
        Write the contained data to an HDF5 file using HDFStore.

        Hierarchical Data Format (HDF) is self-describing, allowing an
        application to interpret the structure and contents of a file with
        no outside information. One HDF file can hold a mix of related objects
        which can be accessed as a group or as individual objects.

        In order to add another DataFrame or Series to an existing HDF file
        please use append mode and a different key.

        For more information see the :ref:`user guide
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables>`_.

        All arguments are forwarded to ``cudf.io.hdf.to_hdf``.

        Parameters
        ----------
        path_or_buf : str or pandas.HDFStore
            File path or HDFStore object.
        key : str
            Identifier for the group in the store.
        mode : {'a', 'w', 'r+'}, default 'a'
            Mode to open file:
            - 'w': write, a new file is created (an existing file with
                the same name would be deleted).
            - 'a': append, an existing file is opened for reading and
                writing, and if the file does not exist it is created.
            - 'r+': similar to 'a', but the file must already exist.
        format : {'fixed', 'table'}, default 'fixed'
            Possible values:
            - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
                nor searchable.
            - 'table': Table format. Write as a PyTables Table structure
                which may perform worse but allow more flexible operations
                like searching / selecting subsets of the data.
        append : bool, default False
            For Table formats, append the input data to the existing.
        data_columns :  list of columns or True, optional
            List of columns to create as indexed data columns for on-disk
            queries, or True to use all columns. By default only the axes
            of the object are indexed. `See Query via Data Columns
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#hdf5-pytables>`_.
            Applicable only to format='table'.
        complevel : {0-9}, optional
            Specifies a compression level for data.
            A value of 0 disables compression.
        complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
            Specifies the compression library to be used.
            As of v0.20.2 these additional compressors for Blosc are supported
            (default if no compressor specified: 'blosc:blosclz'):
            {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
            'blosc:zlib', 'blosc:zstd'}.
            Specifying a compression library which is not available issues
            a ValueError.
        fletcher32 : bool, default False
            If applying compression use the fletcher32 checksum.
        dropna : bool, default False
            If true, ALL nan rows will not be written to store.
        errors : str, default 'strict'
            Specifies how encoding and decoding errors are to be handled.
            See the errors argument for :func:`open` for a full list
            of options.
        """
        import cudf.io.hdf as hdf
        write = hdf.to_hdf
        write(path_or_buf, key, self, *args, **kwargs)

    def rename(self, index=None, copy=True):
        """
        Alter Series name.

        Change Series.name with a scalar value.

        Parameters
        ----------
        index : Scalar, optional
            Scalar to alter the Series.name attribute.  ``None`` (the
            default) leaves the name unchanged; any other value, including
            falsy ones such as ``0`` or ``""``, becomes the new name.
        copy : boolean, default True
            Also copy underlying data

        Returns
        -------
        Series

        Difference from pandas:
          * Supports scalar values only for changing name attribute
          * Not supporting: inplace, level
        """
        out = self.copy(deep=False)
        out = out.set_index(self.index)
        # Test against None rather than truthiness so that falsy names
        # (0, "", False) are not silently ignored.
        if index is not None:
            out.name = index

        return out.copy(deep=copy)
Ejemplo n.º 10
0
 def as_index(self):
     """Return ``self.set_index`` applied to a ``RangeIndex`` of this
     object's length."""
     default_index = RangeIndex(len(self))
     return self.set_index(default_index)
Ejemplo n.º 11
0
 def reset_index(self, drop=False):
     """Reset index to RangeIndex.

     With a truthy *drop* the object is copy-constructed with a fresh
     ``RangeIndex``; otherwise the call is delegated to
     ``self.to_frame().reset_index(drop=drop)``.
     """
     if drop:
         return self._copy_construct(index=RangeIndex(len(self)))
     frame = self.to_frame()
     return frame.reset_index(drop=drop)