def _set_categories(self, new_categories, **kwargs):
    """Returns a new CategoricalColumn with the categories set to the
    specified *new_categories*.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories

    Recognized ``kwargs`` (read via ``kwargs.get`` below):
    ``is_unique`` — caller asserts *new_categories* has no duplicates;
    ``ordered`` — ordered flag for the resulting dtype (defaults to
    ``self.ordered``); ``inplace`` — mutate ``self._parent`` instead of
    returning a new column.
    """
    from cudf import DataFrame, Series

    cur_cats = self._parent.categories
    new_cats = column.as_column(new_categories)

    # Join the old and new categories to build a map from
    # old to new codes, inserting na_sentinel for any old
    # categories that don't exist in the new categories

    # Ensure new_categories is unique first
    if not (kwargs.get("is_unique", False) or new_cats.is_unique):
        # drop_duplicates() instead of unique() to preserve order
        new_cats = Series(new_cats).drop_duplicates()._column

    # Integer code sequences: one per existing row (to restore row order
    # after the merges) and one per category on each side of the mapping.
    cur_codes = self.codes
    cur_order = cudautils.arange(len(cur_codes))
    old_codes = cudautils.arange(len(cur_cats), dtype=cur_codes.dtype)
    new_codes = cudautils.arange(len(new_cats), dtype=cur_codes.dtype)

    new_df = DataFrame({"new_codes": new_codes, "cats": new_cats})
    old_df = DataFrame({"old_codes": old_codes, "cats": cur_cats})
    cur_df = DataFrame({"old_codes": cur_codes, "order": cur_order})

    # Join the old and new categories and line up their codes
    df = old_df.merge(new_df, on="cats", how="left")
    # Join the old and new codes to "recode" the codes data buffer
    df = cur_df.merge(df, on="old_codes", how="left")
    # NOTE(review): positional True here is presumably the legacy cudf
    # reset_index(drop=...) — confirm against this cudf version's signature.
    df = df.sort_values(by="order").reset_index(True)

    ordered = kwargs.get("ordered", self.ordered)
    new_codes = df["new_codes"]._column  # rebinds new_codes to the recoded column
    new_dtype = CategoricalDtype(categories=new_cats, ordered=ordered)

    if kwargs.get("inplace", False):
        # Mutate the parent column's buffers directly; the codes child
        # carries the data, so the parent's own data buffer is cleared.
        self._parent.data = None
        self._parent.mask = new_codes.mask
        self._parent.dtype = new_dtype
        self._parent.children = (new_codes, )
        return None

    return column.build_column(
        data=None,
        dtype=new_dtype,
        mask=new_codes.mask,
        children=(new_codes, ),
    )
def _set_categories(self, new_categories, **kwargs):
    """Return a new CategoricalColumn whose categories are replaced by
    *new_categories*.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories.

    Recognized ``kwargs``: ``is_unique`` (caller asserts no duplicates) and
    ``ordered`` (ordered flag for the result; defaults to ``self.ordered``).
    """
    from cudf import DataFrame, Series

    prior_categories = self._column.categories
    fresh_categories = column.as_column(new_categories)

    # The merges below build an old-code -> new-code map, leaving nulls
    # (na_sentinel) for old categories missing from the new set.

    # Deduplicate unless uniqueness is guaranteed; drop_duplicates() is
    # used instead of unique() so the supplied ordering is preserved.
    if not (kwargs.get("is_unique", False) or fresh_categories.is_unique):
        fresh_categories = (
            Series(fresh_categories).drop_duplicates(ignore_index=True)._column
        )

    existing_codes = self.codes
    row_order = cupy.arange(len(existing_codes))
    prior_code_seq = cupy.arange(len(prior_categories), dtype=existing_codes.dtype)
    fresh_code_seq = cupy.arange(len(fresh_categories), dtype=existing_codes.dtype)

    fresh_frame = DataFrame({"new_codes": fresh_code_seq, "cats": fresh_categories})
    prior_frame = DataFrame({"old_codes": prior_code_seq, "cats": prior_categories})
    current_frame = DataFrame({"old_codes": existing_codes, "order": row_order})

    # Align old and new codes through their shared category values, then
    # recode the existing codes buffer through that mapping; finally sort
    # by the saved row order so the result lines up with the input rows.
    mapping = prior_frame.merge(fresh_frame, on="cats", how="left")
    recoded = current_frame.merge(mapping, on="old_codes", how="left")
    recoded = recoded.sort_values(by="order").reset_index(drop=True)

    ordered = kwargs.get("ordered", self.ordered)
    result_codes = recoded["new_codes"]._column

    # codes can't have masks, so take mask out before moving in
    return column.build_categorical_column(
        categories=fresh_categories,
        codes=column.as_column(result_codes.base_data, dtype=result_codes.dtype),
        mask=result_codes.base_mask,
        size=result_codes.size,
        offset=result_codes.offset,
        ordered=ordered,
    )
def _compute_validity_mask(self, index, row_tuple, max_length):
    """ Computes the valid set of indices of values in the lookup

    Builds a lookup frame from the non-slice entries of ``row_tuple`` and
    inner-joins it against the index's source data (augmented with a row
    counter column ``"idx"``); the surviving ``"idx"`` values are the
    positions that match the lookup.

    NOTE(review): ``max_length`` is accepted but never read in this body —
    confirm whether callers still need it or it is vestigial.
    """
    from cudf import DataFrame
    from cudf import Series
    from cudf import concat
    from cudf.utils.cudautils import arange

    lookup = DataFrame()
    for idx, row in enumerate(row_tuple):
        # slice(None) means "match everything" for that level, so it
        # contributes no lookup column.
        if row == slice(None):
            continue
        lookup[index._source_data.columns[idx]] = Series(row)
    # Attach the original row positions so they survive the merge.
    data_table = concat(
        [
            index._source_data,
            DataFrame({"idx": Series(arange(len(index._source_data)))}),
        ],
        axis=1,
    )
    result = lookup.merge(data_table)["idx"]
    # Avoid computing levels unless the result of the merge is empty,
    # which suggests that a KeyError should be raised.
    if len(result) == 0:
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            if row not in index.levels[idx]._column:
                raise KeyError(row)
    return result
def transform(self, columns, gdf: cudf.DataFrame) -> cudf.DataFrame:
    """Left/right/outer-join ``gdf`` against the external table
    ``self._ext`` (on ``self.on`` / ``self.on_ext``, with ``self.how``)
    while preserving the input row order.

    Parameters
    ----------
    columns : unused here; part of the operator interface.
    gdf : cudf.DataFrame
        Input frame. Temporarily mutated with a ``__tmp__`` order column,
        which is always removed before returning.

    Returns
    -------
    cudf.DataFrame
        The merged frame, sorted back to the input row order with a fresh
        RangeIndex.
    """
    tmp = "__tmp__"  # Temporary column for sorting
    gdf[tmp] = cupy.arange(len(gdf), dtype="int32")
    try:
        new_gdf = gdf.merge(
            self._ext, left_on=self.on, right_on=self.on_ext, how=self.how
        )
        # Restore the caller-visible row order, then drop the helper column.
        new_gdf = new_gdf.sort_values(tmp)
        new_gdf.drop(columns=[tmp], inplace=True)
        new_gdf.reset_index(drop=True, inplace=True)
    finally:
        # Fix: the original leaked __tmp__ into the caller's frame if the
        # merge or sort raised; always undo the temporary mutation.
        gdf.drop(columns=[tmp], inplace=True)
    return new_gdf
def _to_frame(self):
    """Materialize this multi-level structure as a DataFrame by replacing
    each column of integer codes with the corresponding level values.

    Returns
    -------
    cudf.DataFrame
        A frame with one column per codes column, values gathered from
        ``self.levels``.
    """
    # Fix: import Series explicitly (it was used without a local import)
    # and avoid shadowing the module-level name `column` (used elsewhere
    # in this file as the cudf column module) with a loop variable.
    from cudf import DataFrame, Series

    df = self.codes.copy(deep=False)
    for idx, col in enumerate(df.columns):
        # use merge as a replace fn: join codes against an
        # (idx -> level) lookup table to gather the level values
        level = DataFrame({
            'idx': Series(cudautils.arange(len(self.levels[idx]),
                                           dtype=df[col].dtype)),
            'level': self.levels[idx],
        })
        code = DataFrame({'idx': df[col]})
        df[col] = code.merge(level).level
    return df
def _to_frame(self):
    """Build a DataFrame from this object's codes, substituting each
    column of integer codes with its level values.

    Returns
    -------
    cudf.DataFrame
    """
    from cudf import DataFrame, Series

    out = self.codes.copy(deep=False)
    for pos, name in enumerate(out.columns):
        # Merging against an (idx, level) table acts as a vectorized
        # replace: each code gathers its category value.
        index_codes = Series(
            cupy.arange(len(self.levels[pos]), dtype=out[name].dtype)
        )
        lookup = DataFrame({"idx": index_codes, "level": self.levels[pos]})
        codes_frame = DataFrame({"idx": out[name]})
        out[name] = codes_frame.merge(lookup).level
    return out
def apply_op(
    self,
    gdf: cudf.DataFrame,
    columns_ctx: dict,
    input_cols,
    target_cols=None,
    stats_context=None,
):
    """Join ``gdf`` with the external table ``self._ext`` (on ``self.on``
    / ``self.on_ext``, with ``self.how``), preserving the input row order,
    and record the resulting columns in ``columns_ctx``.

    Parameters
    ----------
    gdf : cudf.DataFrame
        Input frame. Temporarily mutated with a ``__tmp__`` order column,
        which is always removed before returning.
    columns_ctx : dict
        Column-tracking context updated via ``self.update_columns_ctx``.
    input_cols, target_cols, stats_context
        Operator-interface arguments; ``target_cols`` defaults to
        ``["base"]``.

    Returns
    -------
    cudf.DataFrame
        The merged frame, restored to the input row order with a fresh
        RangeIndex.
    """
    # Fix: avoid a shared mutable default argument (was target_cols=["base"]).
    if target_cols is None:
        target_cols = ["base"]
    target_columns = self.get_columns(columns_ctx, input_cols, target_cols)
    tmp = "__tmp__"  # Temporary column for sorting
    gdf[tmp] = cupy.arange(len(gdf), dtype="int32")
    try:
        new_gdf = gdf.merge(
            self._ext, left_on=self.on, right_on=self.on_ext, how=self.how
        )
        new_gdf = new_gdf.sort_values(tmp)
        new_gdf.drop(columns=[tmp], inplace=True)
        new_gdf.reset_index(drop=True, inplace=True)
    finally:
        # Fix: the original leaked __tmp__ into the caller's frame if the
        # merge or sort raised; always undo the temporary mutation.
        gdf.drop(columns=[tmp], inplace=True)
    self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns)
    return new_gdf