Example #1
    def _set_categories(
        self,
        current_categories: Any,
        new_categories: Any,
        is_unique: bool = False,
        ordered: Optional[bool] = None,
    ) -> CategoricalColumn:
        """Returns a new CategoricalColumn with the categories set to the
        specified *new_categories*.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        cur_cats = column.as_column(current_categories)
        new_cats = column.as_column(new_categories)

        # Join the old and new categories to build a map from
        # old to new codes, inserting na_sentinel for any old
        # categories that don't exist in the new categories

        # Ensure new_categories is unique first
        if not (is_unique or new_cats.is_unique):
            # drop_duplicates() instead of unique() to preserve order
            new_cats = (
                cudf.Series(new_cats)
                .drop_duplicates(ignore_index=True)
                ._column
            )

        cur_codes = self.codes
        max_cat_size = max(len(cur_cats), len(new_cats))
        out_code_dtype = min_unsigned_type(max_cat_size)

        cur_order = column.arange(len(cur_codes))
        old_codes = column.arange(len(cur_cats), dtype=out_code_dtype)
        new_codes = column.arange(len(new_cats), dtype=out_code_dtype)

        new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats})
        old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats})
        cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order})

        # Join the old and new categories and line up their codes
        df = old_df.merge(new_df, on="cats", how="left")
        # Join the old and new codes to "recode" the codes data buffer
        df = cur_df.merge(df, on="old_codes", how="left")
        df = df.sort_values(by="order")
        df.reset_index(drop=True, inplace=True)

        ordered = ordered if ordered is not None else self.ordered
        new_codes = df["new_codes"]._column

        # Codes columns can't carry a null mask, so split the mask out
        # and pass it to build_categorical_column separately
        return column.build_categorical_column(
            categories=new_cats,
            codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype),
            mask=new_codes.base_mask,
            size=new_codes.size,
            offset=new_codes.offset,
            ordered=ordered,
        )
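
The heart of this example is the merge-based recode: join on the category values to build an old-code to new-code mapping, then join the codes buffer through that mapping. A minimal sketch of the same idea, written against pandas (whose DataFrame API cudf mirrors) with made-up categories:

    import pandas as pd

    # Old categories ["a", "b"]; new categories ["b", "c"] ("a" is dropped).
    old_df = pd.DataFrame({"old_codes": [0, 1], "cats": ["a", "b"]})
    new_df = pd.DataFrame({"new_codes": [0, 1], "cats": ["b", "c"]})

    # Left-join on the category values: old code -> new code, with NaN
    # standing in for categories absent from the new set.
    mapping = old_df.merge(new_df, on="cats", how="left")

    # Recode a codes buffer through the mapping, restoring row order.
    cur_df = pd.DataFrame({"old_codes": [1, 0, 1], "order": range(3)})
    out = cur_df.merge(mapping, on="old_codes", how="left").sort_values("order")
    print(out["new_codes"].tolist())  # [0.0, nan, 0.0]
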
Example #2
    def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):

        # Slicing rules:
        # - numeric bounds (or an empty slice): resolve them with
        #   slice.indices() and return that positional range
        # - open start: positions 0 through the highest position
        #   matching the stop label
        # - open stop: the lowest position matching the start label
        #   through the end
        # - both labels given: the lowest position matching the start
        #   label through the highest position matching the stop label
        if isinstance(row_tuple, slice):
            if (
                isinstance(row_tuple.start, numbers.Number)
                or isinstance(row_tuple.stop, numbers.Number)
                or row_tuple == slice(None)
            ):
                stop = row_tuple.stop or max_length
                start, stop, step = row_tuple.indices(stop)
                return column.arange(start, stop, step)
            start_values = self._compute_validity_mask(
                index, row_tuple.start, max_length
            )
            stop_values = self._compute_validity_mask(
                index, row_tuple.stop, max_length
            )
            return column.arange(start_values.min(), stop_values.max() + 1)
        elif isinstance(row_tuple, numbers.Number):
            return row_tuple
        return self._compute_validity_mask(index, row_tuple, max_length)
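
The numeric branch leans on Python's built-in slice.indices(), which fills in open bounds and clamps them against a concrete length; a quick standalone illustration:

    # slice.indices() resolves open-ended bounds against a length:
    print(slice(2, None).indices(10))    # (2, 10, 1)
    print(slice(None).indices(5))        # (0, 5, 1)
    print(slice(1, 100, 3).indices(10))  # (1, 10, 3)
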
Example #3
 def _compute_validity_mask(self, index, row_tuple, max_length):
     """ Computes the valid set of indices of values in the lookup
     """
     lookup = cudf.DataFrame()
     for idx, row in enumerate(row_tuple):
         if isinstance(row, slice) and row == slice(None):
             continue
         lookup[index._source_data.columns[idx]] = cudf.Series(row)
     data_table = cudf.concat(
         [
             index._source_data,
             cudf.DataFrame(
                 {
                     "idx": cudf.Series(
                         column.arange(len(index._source_data))
                     )
                 }
             ),
         ],
         axis=1,
     )
     result = lookup.merge(data_table)["idx"]
     # Avoid computing levels unless the result of the merge is empty,
     # which suggests that a KeyError should be raised.
     if len(result) == 0:
         for idx, row in enumerate(row_tuple):
             if row == slice(None):
                 continue
             if row not in index.levels[idx]._column:
                 raise KeyError(row)
     return result
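
The trick here is that an inner merge against a table tagged with its own row positions returns exactly the positions of the matching rows. A small sketch of the same pattern in pandas (the data is illustrative):

    import pandas as pd

    data_table = pd.DataFrame({"key": ["a", "b", "a", "c"]})
    data_table["idx"] = range(len(data_table))

    lookup = pd.DataFrame({"key": ["a"]})
    # The default inner merge keeps only matching rows, so "idx" holds
    # the positions of those rows in the original table.
    print(lookup.merge(data_table)["idx"].tolist())  # [0, 2]
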
Example #4
 def normalize_chunks(self, size, chunks):
     if isinstance(chunks, int):
         # *chunks* is the chunksize
         return column.arange(0, size, chunks).data_array_view
     else:
         # *chunks* is an array of chunk leading offsets
         chunks = column.as_column(chunks)
         return chunks.data_array_view
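
With an integer chunksize, the leading offsets are simply a strided range. A NumPy equivalent of the first branch, with illustrative values:

    import numpy as np

    size, chunksize = 10, 4
    # Leading offsets 0, 4, 8 describe chunks [0:4], [4:8], [8:10].
    print(np.arange(0, size, chunksize))  # [0 4 8]
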
Example #5
def indices_from_labels(obj, labels):
    from cudf.core.column import column

    if not isinstance(labels, cudf.MultiIndex):
        labels = column.as_column(labels)

        if is_categorical_dtype(obj.index):
            labels = labels.astype("category")
            codes = labels.codes.astype(obj.index._values.codes.dtype)
            labels = column.build_categorical_column(
                categories=labels.dtype.categories,
                codes=codes,
                ordered=labels.dtype.ordered,
            )
        else:
            labels = labels.astype(obj.index.dtype)

    # join is not guaranteed to maintain the index ordering, so sort
    # the result back into its original order, which is stored in
    # column "__"
    lhs = cudf.DataFrame({"__": column.arange(len(labels))}, index=labels)
    rhs = cudf.DataFrame({"_": column.arange(len(obj))}, index=obj.index)
    return lhs.join(rhs).sort_values("__")["_"]
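
Unlike pandas, cudf joins make no ordering guarantee, hence the "__" column of original positions and the final sort. A pandas sketch of the pattern with made-up labels (pandas happens to preserve left order, but the sort makes the intent explicit):

    import pandas as pd

    labels = ["c", "a"]  # requested label order
    obj = pd.DataFrame({"x": [1, 2, 3]}, index=["a", "b", "c"])

    lhs = pd.DataFrame({"__": range(len(labels))}, index=labels)
    rhs = pd.DataFrame({"_": range(len(obj))}, index=obj.index)
    # "_" ends up holding the positions of the requested labels,
    # in the order they were requested.
    print(lhs.join(rhs).sort_values("__")["_"].tolist())  # [2, 0]
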
Example #6
    def _to_frame(self):

        # For each column of codes, replace the integer codes with the
        # corresponding level values.
        df = self.codes.copy(deep=False)
        for idx, col in enumerate(df.columns):
            # Use merge as a vectorized replace.
            level = cudf.DataFrame(
                {
                    "idx": column.arange(
                        len(self.levels[idx]), dtype=df[col].dtype
                    ),
                    "level": self.levels[idx],
                }
            )
            code = cudf.DataFrame({"idx": df[col]})
            df[col] = code.merge(level).level
        return df
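
Merge-as-replace is a recurring idiom in this codebase: rather than mapping elements one by one, join against a two-column mapping table. A pandas sketch with made-up codes and levels:

    import pandas as pd

    level = pd.DataFrame({"idx": [0, 1, 2], "level": ["x", "y", "z"]})
    code = pd.DataFrame({"idx": [2, 0, 2, 1]})
    # The inner merge pairs each code with its level value, acting as
    # a vectorized replace (pandas preserves the left-key order here).
    print(code.merge(level).level.tolist())  # ['z', 'x', 'z', 'y']
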
Example #7
    def take(self, indices):
        from collections.abc import Sequence
        from numbers import Integral

        if isinstance(indices, (Integral, Sequence)):
            indices = np.array(indices)
        elif isinstance(indices, cudf.Series):
            if indices.has_nulls:
                raise ValueError("Column must have no nulls.")
        elif isinstance(indices, slice):
            start, stop, step = indices.indices(len(self))
            indices = column.arange(start, stop, step)
        result = MultiIndex(source_data=self._source_data.take(indices))
        if self._codes is not None:
            result._codes = self._codes.take(indices)
        if self._levels is not None:
            result._levels = self._levels
        result.names = self.names
        return result
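
The slice branch normalizes to explicit positions before gathering, again via slice.indices(); a standalone NumPy illustration:

    import numpy as np

    indices = slice(1, None, 2)
    start, stop, step = indices.indices(6)  # (1, 6, 2)
    print(np.arange(start, stop, step))     # [1 3 5]
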
Example #8
    def _getitem_tuple_arg(self, arg):
        from uuid import uuid4

        from cudf import MultiIndex
        from cudf.core.column import column
        from cudf.core.dataframe import DataFrame
        from cudf.core.index import as_index

        # Step 1: Gather columns
        if isinstance(arg, tuple):
            columns_df = self._get_column_selection(arg[1])
            columns_df._index = self._df._index
        else:
            columns_df = self._df

        # Step 2: Gather rows
        if isinstance(columns_df.index, MultiIndex):
            if isinstance(arg, (MultiIndex, pd.MultiIndex)):
                if isinstance(arg, pd.MultiIndex):
                    arg = MultiIndex.from_pandas(arg)

                indices = indices_from_labels(columns_df, arg)
                return columns_df.take(indices)

            else:
                if isinstance(arg, tuple):
                    return columns_df.index._get_row_major(columns_df, arg[0])
                else:
                    return columns_df.index._get_row_major(columns_df, arg)
        else:
            if isinstance(arg[0], slice):
                out = get_label_range_or_mask(
                    columns_df.index, arg[0].start, arg[0].stop, arg[0].step
                )
                if isinstance(out, slice):
                    df = columns_df._slice(out)
                else:
                    df = columns_df._apply_boolean_mask(out)
            else:
                tmp_arg = arg
                if is_scalar(arg[0]):
                    # A scalar label can match duplicate index entries,
                    # and the join below would return all of them, so
                    # convert it to an array-like first.
                    tmp_arg = ([tmp_arg[0]], tmp_arg[1])
                if len(tmp_arg[0]) == 0:
                    return columns_df._empty_like(keep_index=True)
                tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

                if pd.api.types.is_bool_dtype(tmp_arg[0]):
                    df = columns_df._apply_boolean_mask(tmp_arg[0])
                else:
                    tmp_col_name = str(uuid4())
                    other_df = DataFrame(
                        {tmp_col_name: column.arange(len(tmp_arg[0]))},
                        index=as_index(tmp_arg[0]),
                    )
                    df = other_df.join(columns_df, how="inner")
                    # the join does not assign a name to the index,
                    # so set it here
                    df.index.name = columns_df.index.name
                    df = df.sort_values(tmp_col_name)
                    df.drop(columns=[tmp_col_name], inplace=True)
                    # raise if no matching labels were found
                    if len(df) == 0:
                        raise KeyError(arg)

        # Step 3: Gather index
        if df.shape[0] == 1:  # we have a single row
            if isinstance(arg[0], slice):
                start = arg[0].start
                if start is None:
                    start = self._df.index[0]
                df.index = as_index(start)
            else:
                row_selection = column.as_column(arg[0])
                if pd.api.types.is_bool_dtype(row_selection.dtype):
                    df.index = self._df.index.take(row_selection)
                else:
                    df.index = as_index(row_selection)
        # Step 4: Downcast
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
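
The label branch uses the same join-plus-order-column pattern as indices_from_labels above: a throwaway UUID-named column records the requested positions so the joined result can be re-sorted and the helper column dropped. A pandas sketch with made-up data:

    import pandas as pd
    from uuid import uuid4

    df = pd.DataFrame({"v": [10, 20, 30]}, index=["a", "b", "c"])

    tmp_col_name = str(uuid4())
    other = pd.DataFrame({tmp_col_name: range(2)}, index=["c", "a"])

    out = (
        other.join(df, how="inner")
        .sort_values(tmp_col_name)
        .drop(columns=[tmp_col_name])
    )
    print(out)  # rows "c" then "a", matching the requested order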