Example #1
    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
        from cudf.utils.cudautils import arange
        from cudf import Series

        # Rules:
        # - slice with positional (numeric) bounds, or fully open: normalize
        #   with slice.indices() and return those positions directly
        # - slice with label bounds: look up the valid positions for the
        #   start and stop labels and return every position from the lowest
        #   matching start to the highest matching stop
        # - a single number is returned as-is; anything else falls through
        #   to the validity-mask lookup
        if isinstance(row_tuple, slice):
            if (isinstance(row_tuple.start, numbers.Number)
                    or isinstance(row_tuple.stop, numbers.Number)
                    or row_tuple == slice(None)):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return arange(start, stop, step)
            start_values = self._compute_validity_mask(index, row_tuple.start,
                                                       max_length)
            stop_values = self._compute_validity_mask(index, row_tuple.stop,
                                                      max_length)
            return Series(arange(start_values.min(), stop_values.max() + 1))
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)
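
The slice branch above leans on Python's slice.indices() to turn open or partially open bounds into a concrete (start, stop, step) before the positions are built. A minimal CPU sketch of that normalization, with numpy.arange standing in for cudautils.arange and made-up slices as input:

# CPU sketch: normalize a few slices the same way the slice branch does,
# then materialize the positions (numpy.arange stands in for cudautils.arange).
import numpy as np

max_length = 10
for row_slice in (slice(None), slice(2, None), slice(None, 5), slice(1, 8, 2)):
    stop = row_slice.stop or max_length
    start, stop, step = row_slice.indices(stop)
    print(row_slice, "->", np.arange(start, stop, step))
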
Example #2
    def _set_categories(self, new_categories, **kwargs):
        """Returns a new CategoricalColumn with the categories set to the
        specified *new_categories*.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        cur_cats = self._parent.categories
        new_cats = column.as_column(new_categories)

        # Join the old and new categories to build a map from
        # old to new codes, inserting na_sentinel for any old
        # categories that don't exist in the new categories

        # Ensure new_categories is unique first
        if not (kwargs.get("is_unique", False) or new_cats.is_unique):
            # drop_duplicates() instead of unique() to preserve order
            new_cats = Series(new_cats).drop_duplicates()._column

        cur_codes = self.codes
        cur_order = cudautils.arange(len(cur_codes))
        old_codes = cudautils.arange(len(cur_cats), dtype=cur_codes.dtype)
        new_codes = cudautils.arange(len(new_cats), dtype=cur_codes.dtype)

        new_df = DataFrame({"new_codes": new_codes, "cats": new_cats})
        old_df = DataFrame({"old_codes": old_codes, "cats": cur_cats})
        cur_df = DataFrame({"old_codes": cur_codes, "order": cur_order})

        # Join the old and new categories and line up their codes
        df = old_df.merge(new_df, on="cats", how="left")
        # Join the old and new codes to "recode" the codes data buffer
        df = cur_df.merge(df, on="old_codes", how="left")
        df = df.sort_values(by="order").reset_index(True)

        ordered = kwargs.get("ordered", self.ordered)
        new_codes = df["new_codes"]._column
        new_dtype = CategoricalDtype(categories=new_cats, ordered=ordered)

        if kwargs.get("inplace", False):
            self._parent.data = None
            self._parent.mask = new_codes.mask
            self._parent.dtype = new_dtype
            self._parent.children = (new_codes, )
            return None

        return column.build_column(
            data=None,
            dtype=new_dtype,
            mask=new_codes.mask,
            children=(new_codes, ),
        )
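
The recode happens through two joins: old and new categories are joined on the category value to build an old-code-to-new-code mapping (with nulls for categories that disappear), and a second join applies that mapping to the existing codes while an "order" column preserves row order. A rough pandas analogue of the same idea, with made-up categories and codes:

# Illustrative pandas analogue of the recode-by-merge approach (not the
# cuDF implementation); "b" has no new category, so its code becomes null.
import numpy as np
import pandas as pd

cur_cats = pd.Series(["a", "b", "c"])               # current categories
new_cats = pd.Series(["c", "a", "d"])               # desired categories
cur_codes = pd.Series([0, 2, 1, 0])                 # codes into cur_cats

old_df = pd.DataFrame({"old_codes": np.arange(len(cur_cats)), "cats": cur_cats})
new_df = pd.DataFrame({"new_codes": np.arange(len(new_cats)), "cats": new_cats})
cur_df = pd.DataFrame({"old_codes": cur_codes, "order": np.arange(len(cur_codes))})

mapping = old_df.merge(new_df, on="cats", how="left")        # old code -> new code
recoded = cur_df.merge(mapping, on="old_codes", how="left").sort_values("order")
print(recoded["new_codes"].tolist())                         # [1.0, 0.0, nan, 1.0]
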
Example #3
    def _compute_validity_mask(self, index, row_tuple, max_length):
        """ Computes the valid set of indices of values in the lookup
        """
        from cudf import DataFrame
        from cudf import Series
        from cudf import concat
        from cudf.utils.cudautils import arange

        lookup = DataFrame()
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            lookup[index._source_data.columns[idx]] = Series(row)
        data_table = concat(
            [
                index._source_data,
                DataFrame({"idx": Series(arange(len(index._source_data)))}),
            ],
            axis=1,
        )
        result = lookup.merge(data_table)["idx"]
        # Avoid computing levels unless the result of the merge is empty,
        # which suggests that a KeyError should be raised.
        if len(result) == 0:
            for idx, row in enumerate(row_tuple):
                if row == slice(None):
                    continue
                if row not in index.levels[idx]._column:
                    raise KeyError(row)
        return result
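
The validity mask is effectively an inner join between the requested key values and the index columns, with an "idx" column of original row positions carried through the merge; an empty join result signals a KeyError. A rough pandas sketch with hypothetical data and column names:

# Pandas sketch of the lookup-join idea (hypothetical data and column names).
import numpy as np
import pandas as pd

source = pd.DataFrame({"level0": ["a", "a", "b", "b"],
                       "level1": [1, 2, 1, 2]})
lookup = pd.DataFrame({"level0": ["b"]})        # e.g. row_tuple == ("b", slice(None))

data_table = source.assign(idx=np.arange(len(source)))
result = lookup.merge(data_table)["idx"]        # inner join on the shared columns
if len(result) == 0:
    raise KeyError("b")                         # key absent from the index
print(result.tolist())                          # [2, 3]
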
Example #4
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype
        """
        import cudf.bindings.copying as cpp_copying
        from cudf.dataframe import columnops

        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            nelem = abs(key_stop - key_start)
        else:
            key = columnops.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column")
                key = columnops.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        if utils.is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.dataframe.categorical import CategoricalColumn
                from cudf.dataframe.buffer import Buffer
                from cudf.utils.cudautils import fill_value

                data = rmm.device_array(nelem, dtype="int8")
                fill_value(data, self._encode(value))
                value = CategoricalColumn(
                    data=Buffer(data),
                    categories=self._categories,
                    ordered=False,
                )
            elif value is None:
                value = columnops.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        value = columnops.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (f"Size mismatch: cannot set value "
                   f"of size {len(value)} to indexing result of size "
                   f"{nelem}")
            raise ValueError(msg)

        if isinstance(key, slice):
            out = cpp_copying.apply_copy_range(self, value, key_start,
                                               key_stop, 0)
        else:
            out = cpp_copying.apply_scatter(value, key, self)

        self._data = out.data
        self._mask = out.mask
        self._update_null_count()
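
When the key is a boolean mask, it is converted to integer positions by indexing an arange of the column length with the mask, so the scatter only ever sees integer indices. A small NumPy illustration of that conversion:

# NumPy stand-in for turning a boolean mask into scatter positions.
import numpy as np

mask = np.array([True, False, True, False, False, True])
positions = np.arange(len(mask))[mask]
print(positions)    # [0 2 5] -- the rows the scatter will write to
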
Example #5
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
        Sort by the values.

        Parameters
        ----------
        by : Column or list of Column
            Column or list of Column objects to sort by.
        ascending : bool or list of bool, default True
            If True, sort values in ascending order, otherwise descending.
        na_position : {‘first’ or ‘last’}, default ‘last’
            Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs at
            the end.
        Returns
        -------
        col_inds : cuDF Column of indices sorted based on input

        Difference from pandas:
          * Support axis='index' only.
          * Not supporting: inplace, kind
          * Ascending can be a list of bools to control per column
    """
    if isinstance(by, (ColumnBase)):
        by = [by]

    col_inds = column.as_column(cudautils.arange(len(by[0]), dtype="int32"))

    # This needs to be updated to handle list of bools for ascending
    if ascending is True:
        if na_position == "last":
            na_position = 0
        elif na_position == "first":
            na_position = 1
    elif ascending is False:
        if na_position == "last":
            na_position = 1
        elif na_position == "first":
            na_position = 0
    else:
        logging.warning(
            "When using a sequence of booleans for `ascending`, `na_position` "
            "flag is not yet supported and defaults to treating nulls as "
            "greater than all numbers")
        na_position = 0

    # If given a scalar need to construct a sequence of length # of columns
    if np.isscalar(ascending):
        ascending = [ascending] * len(by)
    # If given a list-like need to convert to a numpy array and copy to device
    if isinstance(ascending, collections.abc.Sequence):
        # Need to flip the boolean here since libcudf has 0 as ascending
        ascending = [not val for val in ascending]
        ascending = rmm.to_device(np.array(ascending, dtype="int8"))
    else:
        raise ValueError("Must use a boolean or list of booleans")

    libcudf.sort.order_by(by, col_inds, ascending, na_position)

    return col_inds
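
Two details above are easy to miss: na_position is translated into a 0/1 flag whose meaning flips with the sort direction, and a scalar ascending is broadcast per column and then inverted because, per the comment, libcudf treats 0 as ascending. A sketch of just that flag handling, wrapped in a hypothetical helper:

# Sketch of the flag normalization only (no GPU sort); normalize_flags is a
# hypothetical helper, not part of cuDF.
import numpy as np

def normalize_flags(ascending, na_position, num_columns):
    if ascending is True:
        na_flag = 0 if na_position == "last" else 1
    elif ascending is False:
        na_flag = 1 if na_position == "last" else 0
    else:
        na_flag = 0                  # per-column bools: nulls treated as largest
    if np.isscalar(ascending):
        ascending = [ascending] * num_columns
    flipped = np.array([not a for a in ascending], dtype="int8")  # 0 means ascending
    return flipped, na_flag

print(normalize_flags(True, "last", 3))     # (array([0, 0, 0], dtype=int8), 0)
print(normalize_flags(False, "first", 2))   # (array([1, 1], dtype=int8), 0)
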
Example #6
 def normalize_chunks(self, size, chunks):
     if isinstance(chunks, six.integer_types):
         # *chunks* is the chunksize
         return cudautils.arange(0, size, chunks)
     else:
         # *chunks* is an array of chunk leading offset
         chunks = column.as_column(chunks)
         return chunks.data_array_view
Example #7
 def normalize_chunks(self, size, chunks):
     if isinstance(chunks, six.integer_types):
         # *chunks* is the chunksize
         return cudautils.arange(0, size, chunks)
     else:
         # *chunks* is an array of chunk leading offset
         chunks = Series(chunks)
         return chunks.to_gpu_array()
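
Both versions of normalize_chunks accept either an integer chunksize or an explicit sequence of chunk offsets; the integer case is just an arange with the chunksize as the step. A CPU illustration of that case:

# CPU illustration of expanding an integer chunksize into chunk offsets.
import numpy as np

size, chunksize = 10, 3
offsets = np.arange(0, size, chunksize)
print(offsets)    # [0 3 6 9] -- each value is where a chunk begins
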
Example #8
 def as_column(self):
     if len(self) > 0:
         vals = cudautils.arange(self._start, self._stop, dtype=self.dtype)
     else:
         vals = rmm.device_array(0, dtype=self.dtype)
     return NumericalColumn(data=Buffer(vals),
                            dtype=vals.dtype,
                            name=self.name)
Example #9
 def sort_by_values(self, ascending):
     if self.null_count > 0:
         raise ValueError('nulls not yet supported')
     # Clone data buffer as the key
     col_keys = self.replace(data=self.data.copy(), dtype=self._data.dtype)
     # Create new array for the positions
     inds = Buffer(cudautils.arange(len(self)))
     col_inds = self.replace(data=inds, dtype=inds.dtype)
     cpp_sort.apply_sort(col_keys, col_inds, ascending=ascending)
     return col_keys, col_inds
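
sort_by_values sorts a copy of the data together with an arange of positions, so the second output records, for each sorted position, where that value came from. The NumPy equivalent of that pairing is a stable argsort plus a take:

# NumPy analogue: sorted values plus the permutation that produced them.
import numpy as np

values = np.array([30, 10, 20])
inds = np.argsort(values, kind="stable")    # positions in sorted order
print(values[inds], inds)                   # [10 20 30] [1 2 0]
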
Example #10
    def _getitem_tuple_arg(self, arg):
        from cudf.core.dataframe import Series, DataFrame
        from cudf.core.column import column
        from cudf.core.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        # Step 1: Gather columns
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
            if isinstance(columns_df, Series):
                return columns_df
        else:
            columns = self._get_column_selection(arg[1])
            columns_df = DataFrame(index=self._df.index)
            for i, col in enumerate(columns):
                columns_df.insert(i, col, self._df[col])

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            if isinstance(self._df.columns, MultiIndex):
                if isinstance(arg[0], slice):
                    start, stop, step = arg[0].indices(len(columns_df))
                    indices = arange(start, stop, step)
                    df = columns_df.take(indices)
                else:
                    df = columns_df.take(arg[0])
            else:
                df = DataFrame()
                for col in columns_df.columns:
                    # need Series() in case a scalar is returned
                    df[col] = Series(columns_df[col].loc[arg[0]])
                df.columns = columns_df.columns

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #11
    def _group_inner_levels(self, columns, rowidcol, segs, markers):
        """Group the second and onwards level.

        Parameters
        ----------
        columns : sequence[str]
            Group keys.  The order is important.
        rowid_column : str
            The name of the special column with the original rowid.
            It's internally used to determine the shuffling order.
        df : DataFrame
            The dataframe being grouped.
        segs : Series
            First level group begin offsets.

        Returns
        -------
        (sorted_keys, reordering_indices, segments)
            - sorted_keys : list[Series]
                List of sorted key columns.
                Column order is same as arg *columns*.
            - reordering_indices : device array
                The indices to gather on to shuffle the dataframe
                into the grouped seqence.
            - segments : Series
                Group begin offsets.
        """
        dsegs = segs.astype(dtype=np.int32).data.mem
        sorted_keys = []
        plan_cache = {}
        for col in columns:
            # Shuffle the key column according to the previous groups
            srkeys = self._df[col].take(rowidcol.to_gpu_array(),
                                        ignore_index=True)
            # Segmented sort on the key
            shuf = Column(Buffer(cudautils.arange(len(srkeys))))

            cache_key = (len(srkeys), srkeys.dtype, shuf.dtype)
            plan = plan_cache.get(cache_key)
            plan = apply_segsort(srkeys._column, shuf, dsegs, plan=plan)
            plan_cache[cache_key] = plan

            sorted_keys.append(srkeys)  # keep sorted key cols
            # Determine segments
            dsegs, markers = cudautils.find_segments(srkeys.to_gpu_array(),
                                                     dsegs,
                                                     markers=markers)
            # Shuffle
            rowidcol = rowidcol.take(shuf.to_gpu_array(), ignore_index=True)

        reordering_indices = rowidcol.to_gpu_array()
        return sorted_keys, reordering_indices, Series(dsegs)
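
Each pass above shuffles the next key into the current group order, runs a segmented sort, and then recomputes segment boundaries wherever the sorted key changes within a segment. The boundary detection alone can be sketched on the CPU with NumPy (the real cudautils.find_segments also threads a markers array between passes):

# NumPy sketch of the "find segments" step for keys already sorted within
# their segments: a new segment starts at 0 and wherever the key changes.
import numpy as np

keys = np.array([1, 1, 2, 2, 2, 3])
changes = np.nonzero(np.diff(keys) != 0)[0] + 1
segment_begins = np.concatenate(([0], changes))
print(segment_begins)    # [0 2 5]
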
Example #12
 def _to_frame(self):
     from cudf import DataFrame
     # for each column of codes
     # replace column with mapping from integers to levels
     df = self.codes.copy(deep=False)
     for idx, column in enumerate(df.columns):
         # use merge as a replace fn
         level = DataFrame({'idx': Series(cudautils.arange(len(
                                                     self.levels[idx]),
                                          dtype=df[column].dtype)),
                            'level': self.levels[idx]})
         code = DataFrame({'idx': df[column]})
         df[column] = code.merge(level).level
     return df
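
_to_frame uses merge() as a lookup table: for each level, a small frame mapping integer codes to level values is joined against the column of codes, replacing every code with its label. A pandas sketch of that "merge as replace" trick with made-up data (how="left" is used here just to keep row order):

# Pandas sketch of using merge() as a code -> label lookup.
import numpy as np
import pandas as pd

levels = pd.Series(["x", "y", "z"])
codes = pd.Series([2, 0, 0, 1])

level_map = pd.DataFrame({"idx": np.arange(len(levels)), "level": levels})
code_df = pd.DataFrame({"idx": codes})
print(code_df.merge(level_map, how="left")["level"].tolist())   # ['z', 'x', 'x', 'y']
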
Example #13
def indices_from_labels(obj, labels):
    from cudf.dataframe import columnops

    labels = columnops.as_column(labels)

    if is_categorical_dtype(obj.index):
        labels = labels.astype("category")
        labels._data = labels.data.astype(obj.index._values.data.dtype)
    else:
        labels = labels.astype(obj.index.dtype)

    lhs = cudf.DataFrame({}, index=labels)
    rhs = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index)
    return lhs.join(rhs)["_"]
Example #14
def column_select_by_boolmask(column, boolmask):
    """Select by a boolean mask to a column.

    Returns (selected_column, selected_positions)
    """
    from cudf.dataframe.numerical import NumericalColumn
    assert column.null_count == 0  # We don't properly handle the boolmask yet
    boolbits = cudautils.compact_mask_bytes(boolmask.to_gpu_array())
    indices = cudautils.arange(len(boolmask))
    _, selinds = cudautils.copy_to_dense(indices, mask=boolbits)
    _, selvals = cudautils.copy_to_dense(column.data.to_gpu_array(),
                                         mask=boolbits)

    selected_values = column.replace(data=Buffer(selvals))
    selected_index = Buffer(selinds)
    return selected_values, NumericalColumn(data=selected_index,
                                            dtype=selected_index.dtype)
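
The helper applies the same mask to both the data and an arange of positions, so the caller gets the selected values along with where they came from. A NumPy sketch of that pairing:

# NumPy sketch: one boolean mask selects both the values and their positions.
import numpy as np

values = np.array([10.0, 11.0, 12.0, 13.0])
mask = np.array([True, False, True, True])

selected_values = values[mask]
selected_positions = np.arange(len(mask))[mask]
print(selected_values, selected_positions)    # [10. 12. 13.] [0 2 3]
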
Example #15
 def take(self, indices):
     from collections.abc import Sequence
     from cudf import Series
     from numbers import Integral
     if isinstance(indices, (Integral, Sequence)):
         indices = np.array(indices)
     elif isinstance(indices, Series):
         indices = indices.to_gpu_array()
     elif isinstance(indices, slice):
         start, stop, step, sln = utils.standard_python_slice(len(self),
                                                              indices)
         indices = cudautils.arange(start, stop, step)
     if hasattr(self, '_source_data'):
         result = MultiIndex(source_data=self._source_data.take(indices))
     else:
         codes = self.codes.take(indices)
         result = MultiIndex(self.levels, codes)
     result.names = self.names
     return result
Example #16
def indices_from_labels(obj, labels):
    from cudf.core.column import column

    labels = column.as_column(labels)

    if is_categorical_dtype(obj.index):
        labels = labels.astype("category")
        codes = labels.codes.astype(obj.index._values.codes.dtype)
        labels = column.build_categorical_column(
            categories=labels.dtype.categories,
            codes=codes,
            ordered=labels.dtype.ordered,
        )
    else:
        labels = labels.astype(obj.index.dtype)

    lhs = cudf.DataFrame({}, index=labels)
    rhs = cudf.DataFrame({"_": arange(len(obj))}, index=obj.index)
    return lhs.join(rhs)["_"]
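
Both versions of indices_from_labels phrase label lookup as a join: an empty frame indexed by the requested labels is joined against a frame indexed by the object's index that carries the row positions, so the join output is exactly those positions. A pandas analogue with made-up labels:

# Pandas analogue: join against an index-aligned column of positions.
import numpy as np
import pandas as pd

obj_index = pd.Index(["a", "b", "c", "d"])
labels = pd.Index(["d", "b"])

lhs = pd.DataFrame(index=labels)
rhs = pd.DataFrame({"_": np.arange(len(obj_index))}, index=obj_index)
print(lhs.join(rhs)["_"].tolist())    # [3, 1] -- positions of the requested labels
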
Example #17
    def take(self, indices):
        from collections.abc import Sequence
        from cudf import Series
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, Series):
            indices = indices.to_gpu_array()
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = cudautils.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result
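
take() first normalizes its argument: integers and sequences become host arrays, Series become device arrays, and slices become an explicit arange of positions. A CPU sketch of that normalization using a hypothetical helper:

# CPU sketch of the index normalization in take(); normalize_take_indices is
# a hypothetical helper, not cuDF API.
import numpy as np

def normalize_take_indices(indices, length):
    if isinstance(indices, (int, list, tuple)):
        return np.asarray(indices)
    if isinstance(indices, slice):
        start, stop, step = indices.indices(length)
        return np.arange(start, stop, step)
    return indices

print(normalize_take_indices([3, 0, 2], 8))          # [3 0 2]
print(normalize_take_indices(slice(1, None, 2), 8))  # [1 3 5 7]
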
Example #18
    def _getitem_tuple_arg(self, arg):
        from cudf.dataframe.dataframe import DataFrame
        from cudf.dataframe.index import as_index
        from cudf.utils.cudautils import arange
        from cudf import MultiIndex

        # Step 1: Gather columns
        if isinstance(self._df.columns, MultiIndex):
            columns_df = self._df.columns._get_column_major(self._df, arg[1])
        else:
            columns = self._get_column_selection(arg[1])
            columns_df = DataFrame()
            for col in columns:
                columns_df.add_column(name=col, data=self._df[col])
        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            return columns_df.index._get_row_major(columns_df, arg[0])
        else:
            if isinstance(self._df.columns, MultiIndex):
                if isinstance(arg[0], slice):
                    start, stop, step = arg[0].indices(len(columns_df))
                    indices = arange(start, stop, step)
                    df = columns_df.take(indices)
                else:
                    df = columns_df.take(arg[0])
            else:
                df = DataFrame()
                for col in columns_df.columns:
                    df[col] = columns_df[col].loc[arg[0]]
        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                df.index = as_index(arg[0])
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
Example #19
    def take(self, indices):
        from collections.abc import Sequence
        from cudf import Series
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, Series):
            if indices.null_count != 0:
                raise ValueError("Column must have no nulls.")
            indices = indices.data.mem
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = cudautils.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result
Example #20
    def _compute_levels_and_codes(self):
        levels = []
        from cudf import DataFrame
        codes = DataFrame()
        names = []
        # Note: This is an O(N^2) solution using gpu masking
        # to compute new codes for the MultiIndex. There may be
        # a faster solution that could be executed on gpu at the same
        # time the groupby is calculated.
        for by in self._source_data.columns:
            if len(self._source_data[by]) > 0:
                level = self._source_data[by].unique()
                replaced = self._source_data[by].replace(
                        level, Series(cudautils.arange(len(level))))
            else:
                level = np.array([])
                replaced = np.array([])
            levels.append(level)
            codes[by] = Series(replaced, dtype="int32")
            names.append(by)

        self._levels = levels
        self._codes = codes
        self.names = names
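
Per column, the loop computes the unique values (the level) and then rewrites each row's value as its position within that level, which is conceptually the factorization pandas exposes directly (the level ordering may differ, since unique() need not preserve order of appearance):

# Pandas sketch: factorize() yields per-row codes and the level values that
# the unique() + replace() combination above builds on the GPU.
import pandas as pd

col = pd.Series(["b", "a", "b", "c"])
codes, level = pd.factorize(col)
print(codes.tolist(), level.tolist())    # [0, 1, 0, 2] ['b', 'a', 'c']
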
Example #21
def index_from_range(start, stop=None, step=None):
    vals = cudautils.arange(start, stop, step, dtype=np.int64)
    return GenericIndex(NumericalColumn(data=Buffer(vals), dtype=vals.dtype))
Example #22
def index_from_range(start, stop=None, step=None):
    vals = cudautils.arange(start, stop, step, dtype=np.int64)
    return as_index(vals)
Example #23
 def _values(self):
     if len(self) > 0:
         vals = cudautils.arange(self._start, self._stop, dtype=self.dtype)
         return column.as_column(vals)
     else:
         return column.column_empty(0, masked=False, dtype=self.dtype)
Example #24
    def __setitem__(self, key, value):
        """
        Set the value of self[key] to value.

        If value and self are of different types,
        value is coerced to self.dtype
        """
        from cudf.core import column

        if isinstance(key, slice):
            key_start, key_stop, key_stride = key.indices(len(self))
            if key_stride != 1:
                raise NotImplementedError("Stride not supported in slice")
            nelem = abs(key_stop - key_start)
        else:
            key = column.as_column(key)
            if pd.api.types.is_bool_dtype(key.dtype):
                if not len(key) == len(self):
                    raise ValueError(
                        "Boolean mask must be of same length as column"
                    )
                key = column.as_column(cudautils.arange(len(self)))[key]
            nelem = len(key)

        if is_scalar(value):
            if is_categorical_dtype(self.dtype):
                from cudf.utils.cudautils import fill_value

                data = rmm.device_array(nelem, dtype=self.codes.dtype)
                fill_value(data, self._encode(value))
                value = build_categorical_column(
                    categories=self.dtype.categories,
                    codes=as_column(data),
                    ordered=self.dtype.ordered,
                )
            elif value is None:
                value = column.column_empty(nelem, self.dtype, masked=True)
            else:
                to_dtype = pd.api.types.pandas_dtype(self.dtype)
                value = utils.scalar_broadcast_to(value, nelem, to_dtype)

        value = column.as_column(value).astype(self.dtype)

        if len(value) != nelem:
            msg = (
                f"Size mismatch: cannot set value "
                f"of size {len(value)} to indexing result of size "
                f"{nelem}"
            )
            raise ValueError(msg)

        if is_categorical_dtype(value.dtype):
            value = value.cat().set_categories(self.categories)
            assert self.dtype == value.dtype

        if isinstance(key, slice):
            out = libcudf.copying.copy_range(
                self, value, key_start, key_stop, 0
            )
        else:
            try:
                out = libcudf.copying.scatter(value, key, self)
            except RuntimeError as e:
                if "out of bounds" in str(e):
                    raise IndexError(
                        f"index out of bounds for column of size {len(self)}"
                    )
                raise

        self._mimic_inplace(out, inplace=True)
Example #25
 def _get_row_major(self, df, row_tuple):
     slice_access = False
     if isinstance(row_tuple[0], numbers.Number):
         valid_indices = row_tuple[0]
     elif isinstance(row_tuple[0], slice):
         # 1. empty slice compute
         if row_tuple[0].stop == 0:
             valid_indices = []
         else:
             slice_access = True
             start = row_tuple[0].start or 0
             stop = row_tuple[0].stop or len(df)
             step = row_tuple[0].step or 1
             valid_indices = cudautils.arange(start, stop, step)
     else:
         valid_indices = self._compute_validity_mask(df, row_tuple)
     from cudf import Series
     result = df.take(Series(valid_indices))
     # Build new index - INDEX based MultiIndex
     # ---------------
     from cudf import DataFrame
     out_index = DataFrame()
     # Select the last n-k columns where n is the number of source
     # levels and k is the length of the indexing tuple
     size = 0
     if not isinstance(row_tuple[0], (numbers.Number, slice)):
         size = len(row_tuple)
     for k in range(size, len(df.index.levels)):
         out_index.add_column(df.index.names[k],
                              df.index.codes[df.index.codes.columns[k]])
     # If there's only one column remaining in the output index, convert
     # it into an Index and name the final index values according
     # to the proper codes.
     if len(out_index.columns) == 1:
         out_index = []
         for val in result.index.codes[result.index.codes.columns[len(result.index.codes.columns)-1]]:  # noqa: E501
             out_index.append(result.index.levels[
                     len(result.index.codes.columns)-1][val])
         out_index = as_index(out_index)
         out_index.name = result.index.names[len(result.index.names)-1]
         result.index = out_index
     else:
         if len(result) == 1 and size == 0 and slice_access is False:
             # If the final result is one row and it was not mapped into
             # directly
             result = result.T
             result = result[result.columns[0]]
             # convert to Series
             series_name = []
             for idx, code in enumerate(result.columns.codes):
                 series_name.append(result.columns.levels[idx][
                         result.columns.codes[code][0]])
             result = Series(list(result._cols.values())[0],
                             name=series_name)
             result.name = tuple(series_name)
         elif(len(out_index.columns)) > 0:
             # Otherwise pop the leftmost levels, names, and codes from the
             # source index until it has the correct number of columns (n-k)
             result.reset_index(drop=True)
             result.index = result.index._popn(size)
     return result