def _set_categories(
    self,
    current_categories: Any,
    new_categories: Any,
    is_unique: bool = False,
    ordered: bool = False,
) -> CategoricalColumn:
    """Returns a new CategoricalColumn with the categories set to the
    specified *new_categories*.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories
    """
    cur_cats = column.as_column(current_categories)
    new_cats = column.as_column(new_categories)

    # Join the old and new categories to build a map from
    # old to new codes, inserting na_sentinel for any old
    # categories that don't exist in the new categories

    # Ensure new_categories is unique first
    if not (is_unique or new_cats.is_unique):
        # drop_duplicates() instead of unique() to preserve order
        new_cats = cudf.Series(new_cats).drop_duplicates(
            ignore_index=True
        )._column

    cur_codes = self.codes
    max_cat_size = (
        len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats)
    )
    out_code_dtype = min_unsigned_type(max_cat_size)

    cur_order = column.arange(len(cur_codes))
    old_codes = column.arange(len(cur_cats), dtype=out_code_dtype)
    new_codes = column.arange(len(new_cats), dtype=out_code_dtype)

    new_df = cudf.DataFrame({"new_codes": new_codes, "cats": new_cats})
    old_df = cudf.DataFrame({"old_codes": old_codes, "cats": cur_cats})
    cur_df = cudf.DataFrame({"old_codes": cur_codes, "order": cur_order})

    # Join the old and new categories and line up their codes
    df = old_df.merge(new_df, on="cats", how="left")

    # Join the old and new codes to "recode" the codes data buffer
    df = cur_df.merge(df, on="old_codes", how="left")
    df = df.sort_values(by="order")
    df.reset_index(drop=True, inplace=True)

    ordered = ordered if ordered is not None else self.ordered
    new_codes = df["new_codes"]._column

    # codes can't have masks, so take mask out before moving in
    return column.build_categorical_column(
        categories=new_cats,
        codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype),
        mask=new_codes.base_mask,
        size=new_codes.size,
        offset=new_codes.offset,
        ordered=ordered,
    )
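# A minimal, self-contained sketch of the recode-via-merge trick above, using
# pandas as a stand-in for cuDF (cuDF mirrors this API); all names here are
# illustrative, not part of the library. Old and new categories are joined to
# build an old-code -> new-code map, categories absent from the new set map to
# null, and a final sort on "order" restores the original row order.
import pandas as pd

old_cats = pd.Series(["a", "b", "c"])
new_cats = pd.Series(["b", "c", "d"])
cur_codes = pd.Series([0, 2, 1, 0])  # rows encoded against old_cats

old_map = pd.DataFrame({"cats": old_cats, "old_codes": range(len(old_cats))})
new_map = pd.DataFrame({"cats": new_cats, "new_codes": range(len(new_cats))})
cur = pd.DataFrame({"old_codes": cur_codes, "order": range(len(cur_codes))})

mapping = old_map.merge(new_map, on="cats", how="left")  # "a" -> NaN
recoded = cur.merge(mapping, on="old_codes", how="left").sort_values("order")
print(recoded["new_codes"].tolist())  # [nan, 1.0, 0.0, nan]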
def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
    # Instructions for Slicing
    # if tuple, get first and last elements of tuple
    # if open beginning tuple, get 0 to highest valid_index
    # if open ending tuple, get highest valid_index to len()
    # if not open end or beginning, get range lowest beginning index
    # to highest ending index
    if isinstance(row_tuple, slice):
        if (
            isinstance(row_tuple.start, numbers.Number)
            or isinstance(row_tuple.stop, numbers.Number)
            or row_tuple == slice(None)
        ):
            stop = row_tuple.stop or max_length
            start, stop, step = row_tuple.indices(stop)
            return column.arange(start, stop, step)
        start_values = self._compute_validity_mask(
            index, row_tuple.start, max_length
        )
        stop_values = self._compute_validity_mask(
            index, row_tuple.stop, max_length
        )
        return column.arange(start_values.min(), stop_values.max() + 1)
    elif isinstance(row_tuple, numbers.Number):
        return row_tuple
    return self._compute_validity_mask(index, row_tuple, max_length)
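# The numeric branch above leans on slice.indices(), which clamps start, stop,
# and step against a length. A quick standalone illustration:
s = slice(None, 3)
stop = s.stop or 10  # fall back to max_length when stop is None
print(list(range(*s.indices(stop))))  # [0, 1, 2]

s = slice(2, None)
print(list(range(*s.indices(s.stop or 10))))  # [2, 3, ..., 9]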
def _compute_validity_mask(self, index, row_tuple, max_length):
    """Computes the valid set of indices of values in the lookup"""
    lookup = cudf.DataFrame()
    for idx, row in enumerate(row_tuple):
        if isinstance(row, slice) and row == slice(None):
            continue
        lookup[index._source_data.columns[idx]] = cudf.Series(row)
    data_table = cudf.concat(
        [
            index._source_data,
            cudf.DataFrame(
                {
                    "idx": cudf.Series(
                        column.arange(len(index._source_data))
                    )
                }
            ),
        ],
        axis=1,
    )
    result = lookup.merge(data_table)["idx"]
    # Avoid computing levels unless the result of the merge is empty,
    # which suggests that a KeyError should be raised.
    if len(result) == 0:
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            if row not in index.levels[idx]._column:
                raise KeyError(row)
    return result
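# A sketch of the lookup-via-merge idea above with pandas as a stand-in for
# cuDF (column names are illustrative): tag each source row with its position
# in an "idx" column, inner-merge against the requested labels, and the
# surviving "idx" values are exactly the matching row positions.
import pandas as pd

source = pd.DataFrame({"lvl0": ["a", "a", "b"], "lvl1": [1, 2, 1]})
data_table = source.assign(idx=range(len(source)))
lookup = pd.DataFrame({"lvl0": ["a"], "lvl1": [2]})
print(lookup.merge(data_table)["idx"].tolist())  # [1]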
def normalize_chunks(self, size, chunks):
    if isinstance(chunks, int):
        # *chunks* is the chunk size
        return column.arange(0, size, chunks).data_array_view
    else:
        # *chunks* is an array of chunk leading offsets
        chunks = column.as_column(chunks)
        return chunks.data_array_view
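# The integer branch above expands a chunk size into leading offsets with an
# arange; e.g. size=10 with chunk size 4 yields offsets [0, 4, 8]:
import numpy as np

print(np.arange(0, 10, 4))  # [0 4 8]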
def indices_from_labels(obj, labels):
    from cudf.core.column import column

    if not isinstance(labels, cudf.MultiIndex):
        labels = column.as_column(labels)

        if is_categorical_dtype(obj.index):
            labels = labels.astype("category")
            codes = labels.codes.astype(obj.index._values.codes.dtype)
            labels = column.build_categorical_column(
                categories=labels.dtype.categories,
                codes=codes,
                ordered=labels.dtype.ordered,
            )
        else:
            labels = labels.astype(obj.index.dtype)

    # join is not guaranteed to maintain the index ordering,
    # so we sort by the initial ordering, which is stored in
    # the "__" column
    lhs = cudf.DataFrame({"__": column.arange(len(labels))}, index=labels)
    rhs = cudf.DataFrame({"_": column.arange(len(obj))}, index=obj.index)
    return lhs.join(rhs).sort_values("__")["_"]
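# A sketch of the join-then-restore-order trick above (pandas stand-in,
# illustrative names): "__" records the order the labels were requested in, so
# sorting on it after the index join undoes any reordering the join caused,
# and "_" carries back each label's position in obj.index.
import pandas as pd

labels = pd.Index(["y", "x"])
obj_index = pd.Index(["x", "y", "z"])
lhs = pd.DataFrame({"__": range(len(labels))}, index=labels)
rhs = pd.DataFrame({"_": range(len(obj_index))}, index=obj_index)
print(lhs.join(rhs).sort_values("__")["_"].tolist())  # [1, 0]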
def _to_frame(self):
    # for each column of codes, replace the column with a
    # mapping from integers to levels
    df = self.codes.copy(deep=False)
    for idx, col in enumerate(df.columns):
        # use merge as a replace fn
        level = cudf.DataFrame(
            {
                "idx": column.arange(
                    len(self.levels[idx]), dtype=df[col].dtype
                ),
                "level": self.levels[idx],
            }
        )
        code = cudf.DataFrame({"idx": df[col]})
        df[col] = code.merge(level).level
    return df
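# "Merge as a replace fn" in miniature (pandas stand-in, illustrative names):
# joining integer codes against an (idx, level) table maps each code back to
# its level value. An inner merge preserves left-key order in pandas, which is
# what keeps the decoded column aligned with the codes.
import pandas as pd

level = pd.DataFrame({"idx": [0, 1, 2], "level": ["lo", "mid", "hi"]})
code = pd.DataFrame({"idx": [2, 0, 2]})
print(code.merge(level)["level"].tolist())  # ['hi', 'lo', 'hi']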
def take(self, indices):
    from collections.abc import Sequence
    from numbers import Integral

    if isinstance(indices, (Integral, Sequence)):
        indices = np.array(indices)
    elif isinstance(indices, cudf.Series):
        if indices.has_nulls:
            raise ValueError("Column must have no nulls.")
    elif isinstance(indices, slice):
        start, stop, step = indices.indices(len(self))
        indices = column.arange(start, stop, step)

    result = MultiIndex(source_data=self._source_data.take(indices))
    if self._codes is not None:
        result._codes = self._codes.take(indices)
    if self._levels is not None:
        result._levels = self._levels
    result.names = self.names
    return result
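# The slice branch above expands a slice into explicit positions before the
# gather; the same shape with the pandas MultiIndex API:
import pandas as pd

mi = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)])
start, stop, step = slice(1, None, 2).indices(len(mi))
print(mi.take(list(range(start, stop, step))))  # [('a', 2), ('b', 2)]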
def _getitem_tuple_arg(self, arg):
    from uuid import uuid4

    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.dataframe import DataFrame
    from cudf.core.index import as_index

    # Step 1: Gather columns
    if isinstance(arg, tuple):
        columns_df = self._get_column_selection(arg[1])
        columns_df._index = self._df._index
    else:
        columns_df = self._df

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg, (MultiIndex, pd.MultiIndex)):
            if isinstance(arg, pd.MultiIndex):
                arg = MultiIndex.from_pandas(arg)
            indices = indices_from_labels(columns_df, arg)
            return columns_df.take(indices)
        else:
            if isinstance(arg, tuple):
                return columns_df.index._get_row_major(columns_df, arg[0])
            else:
                return columns_df.index._get_row_major(columns_df, arg)
    else:
        if isinstance(arg[0], slice):
            out = get_label_range_or_mask(
                columns_df.index, arg[0].start, arg[0].stop, arg[0].step
            )
            if isinstance(out, slice):
                df = columns_df._slice(out)
            else:
                df = columns_df._apply_boolean_mask(out)
        else:
            tmp_arg = arg
            if is_scalar(arg[0]):
                # If a scalar, there is a possibility of duplicates.
                # A join would pick up all of them, so convert the
                # scalar to an array-like first.
                tmp_arg = ([tmp_arg[0]], tmp_arg[1])
            if len(tmp_arg[0]) == 0:
                return columns_df._empty_like(keep_index=True)
            tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1])

            if pd.api.types.is_bool_dtype(tmp_arg[0]):
                df = columns_df._apply_boolean_mask(tmp_arg[0])
            else:
                tmp_col_name = str(uuid4())
                other_df = DataFrame(
                    {tmp_col_name: column.arange(len(tmp_arg[0]))},
                    index=as_index(tmp_arg[0]),
                )
                df = other_df.join(columns_df, how="inner")
                # as join does not assign any name to the index,
                # update it here
                df.index.name = columns_df.index.name
                df = df.sort_values(tmp_col_name)
                df.drop(columns=[tmp_col_name], inplace=True)
                # There were no indices found
                if len(df) == 0:
                    raise KeyError(arg)

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df
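# A sketch of the duplicate-safe row gather from Step 2 (pandas stand-in,
# hypothetical names): joining a temporary order column against the target
# index lets repeated labels fan out to every matching row, and sorting on
# that column afterwards restores the requested label order.
import pandas as pd
from uuid import uuid4

columns_df = pd.DataFrame({"v": [10, 20, 30]}, index=["a", "b", "a"])
wanted = pd.Index(["a", "b"])
tmp_col = str(uuid4())
other = pd.DataFrame({tmp_col: range(len(wanted))}, index=wanted)
out = other.join(columns_df, how="inner").sort_values(tmp_col)
print(out.drop(columns=[tmp_col]))  # both "a" rows, then "b"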