コード例 #1
0
    def _set_categories(self, new_categories, **kwargs):
        """Recode this column's codes against *new_categories*.

        A mapping from the current category codes to the codes of the new
        categories is built with two left merges and then applied to the
        existing codes, keeping the original row order.

        Parameters
        ----------
        new_categories
            Values of the new categories; converted with
            ``column.as_column``.
        **kwargs
            ``is_unique`` (default ``False``): skip the uniqueness check on
            ``new_categories``.
            ``ordered`` (default ``self.ordered``): forwarded to the new
            ``CategoricalDtype``.
            ``inplace`` (default ``False``): write the result into
            ``self._parent`` and return ``None``.

        Returns
        -------
        The column produced by ``column.build_column``, or ``None`` when
        ``inplace`` is true.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        old_cats = self._parent.categories
        target_cats = column.as_column(new_categories)

        # The new categories act as a merge key, so they have to be unique.
        assume_unique = kwargs.get("is_unique", False)
        if not assume_unique and not target_cats.is_unique:
            # drop_duplicates() rather than unique() so order is preserved
            deduped = Series(target_cats).drop_duplicates()
            target_cats = deduped._column

        codes_now = self.codes
        code_dtype = codes_now.dtype
        row_order = cudautils.arange(len(codes_now))
        old_ids = cudautils.arange(len(old_cats), dtype=code_dtype)
        new_ids = cudautils.arange(len(target_cats), dtype=code_dtype)

        new_frame = DataFrame({"new_codes": new_ids, "cats": target_cats})
        old_frame = DataFrame({"old_codes": old_ids, "cats": old_cats})
        row_frame = DataFrame({"old_codes": codes_now, "order": row_order})

        # Old category -> new code. The left merge keeps every old category,
        # including those that are absent from the new categories.
        cat_map = old_frame.merge(new_frame, on="cats", how="left")
        # Old code -> new code for every row of the codes buffer.
        recoded = row_frame.merge(cat_map, on="old_codes", how="left")
        # Restore the original row order, which the merges do not guarantee.
        recoded = recoded.sort_values(by="order")
        recoded = recoded.reset_index(True)

        ordered = kwargs.get("ordered", self.ordered)
        recoded_codes = recoded["new_codes"]._column
        new_dtype = CategoricalDtype(categories=target_cats, ordered=ordered)

        if not kwargs.get("inplace", False):
            return column.build_column(
                data=None,
                dtype=new_dtype,
                mask=recoded_codes.mask,
                children=(recoded_codes, ),
            )

        parent = self._parent
        parent.data = None
        parent.mask = recoded_codes.mask
        parent.dtype = new_dtype
        parent.children = (recoded_codes, )
        return None
コード例 #2
0
ファイル: categorical.py プロジェクト: zhuohuwu0603/cudf
    def _set_categories(self, new_categories, **kwargs):
        """Build a categorical column recoded against *new_categories*.

        A mapping from the current category codes to the codes of the new
        categories is built with two left merges and then applied to the
        existing codes, keeping the original row order.

        Parameters
        ----------
        new_categories
            Values of the new categories; converted with
            ``column.as_column``.
        **kwargs
            ``is_unique`` (default ``False``): skip the uniqueness check on
            ``new_categories``.
            ``ordered`` (default ``self.ordered``): forwarded to
            ``column.build_categorical_column``.

        Returns
        -------
        The column produced by ``column.build_categorical_column``.

        Notes
        -----
        Assumes ``new_categories`` is the same dtype as the current categories
        """

        from cudf import DataFrame, Series

        old_cats = self._column.categories
        target_cats = column.as_column(new_categories)

        # The new categories act as a merge key, so they have to be unique.
        assume_unique = kwargs.get("is_unique", False)
        if not assume_unique and not target_cats.is_unique:
            # drop_duplicates() rather than unique() so order is preserved
            deduped = Series(target_cats).drop_duplicates(ignore_index=True)
            target_cats = deduped._column

        codes_now = self.codes
        code_dtype = codes_now.dtype
        row_order = cupy.arange(len(codes_now))
        old_ids = cupy.arange(len(old_cats), dtype=code_dtype)
        new_ids = cupy.arange(len(target_cats), dtype=code_dtype)

        new_frame = DataFrame({"new_codes": new_ids, "cats": target_cats})
        old_frame = DataFrame({"old_codes": old_ids, "cats": old_cats})
        row_frame = DataFrame({"old_codes": codes_now, "order": row_order})

        # Old category -> new code. The left merge keeps every old category,
        # including those that are absent from the new categories.
        cat_map = old_frame.merge(new_frame, on="cats", how="left")
        # Old code -> new code for every row of the codes buffer.
        recoded = row_frame.merge(cat_map, on="old_codes", how="left")
        # Restore the original row order, which the merges do not guarantee.
        recoded = recoded.sort_values(by="order")
        recoded.reset_index(drop=True, inplace=True)

        ordered = kwargs.get("ordered", self.ordered)
        recoded_codes = recoded["new_codes"]._column

        # codes can't have masks, so the mask is split off and handed to the
        # categorical column itself
        bare_codes = column.as_column(
            recoded_codes.base_data, dtype=recoded_codes.dtype
        )
        return column.build_categorical_column(
            categories=target_cats,
            codes=bare_codes,
            mask=recoded_codes.base_mask,
            size=recoded_codes.size,
            offset=recoded_codes.offset,
            ordered=ordered,
        )
コード例 #3
0
ファイル: multiindex.py プロジェクト: williamBlazing/cudf
    def _compute_validity_mask(self, index, row_tuple, max_length):
        """Return the row positions of *index* that match *row_tuple*.

        Every entry of ``row_tuple`` that is not ``slice(None)`` becomes a
        column of a lookup frame, which is merged against
        ``index._source_data`` extended with a positional ``"idx"`` column.

        Parameters
        ----------
        index
            Object exposing ``_source_data`` and ``levels``.
        row_tuple
            Per-level keys; ``slice(None)`` entries are skipped.
        max_length
            Not used by this implementation.

        Returns
        -------
        The ``"idx"`` column of the merge result.

        Raises
        ------
        KeyError
            If the merge result is empty and a key is not contained in the
            matching level of ``index``.
        """
        from cudf import DataFrame, Series, concat
        from cudf.utils.cudautils import arange

        # Positions and keys of the levels that are actually constrained.
        constrained = [
            (pos, key)
            for pos, key in enumerate(row_tuple)
            if not (key == slice(None))
        ]

        lookup = DataFrame()
        for pos, key in constrained:
            lookup[index._source_data.columns[pos]] = Series(key)

        positions = DataFrame(
            {"idx": Series(arange(len(index._source_data)))}
        )
        data_table = concat([index._source_data, positions], axis=1)
        result = lookup.merge(data_table)["idx"]

        # Levels are only computed when the merge came back empty, which
        # suggests that a KeyError should be raised.
        if len(result) == 0:
            for pos, key in constrained:
                if key not in index.levels[pos]._column:
                    raise KeyError(key)
        return result
コード例 #4
0
 def transform(self, columns, gdf: cudf.DataFrame) -> cudf.DataFrame:
     """Merge ``gdf`` with ``self._ext`` and return rows in ``gdf``'s order.

     A temporary ``"__tmp__"`` column holding the row positions is added to
     ``gdf`` before the merge, used to sort the merged frame, and removed
     from both frames afterwards.

     Parameters
     ----------
     columns
         Not used by this implementation.
     gdf
         Left-hand frame. It is modified in place only for the duration of
         the merge; the helper column is removed even if the merge raises.

     Returns
     -------
     cudf.DataFrame
         The merged frame, sorted by the original row position, without the
         helper column and with a fresh index.
     """
     tmp = "__tmp__"  # Temporary column for sorting
     gdf[tmp] = cupy.arange(len(gdf), dtype="int32")
     try:
         new_gdf = gdf.merge(self._ext, left_on=self.on, right_on=self.on_ext, how=self.how)
     finally:
         # Never leave the helper column on the caller's frame, even when
         # the merge fails.
         gdf.drop(columns=[tmp], inplace=True)
     new_gdf = new_gdf.sort_values(tmp)
     new_gdf.drop(columns=[tmp], inplace=True)
     new_gdf.reset_index(drop=True, inplace=True)
     return new_gdf
コード例 #5
0
 def _to_frame(self):
     """Return a frame in which every code column is replaced by its levels.

     Starts from a shallow copy of ``self.codes`` and, per column, merges
     the codes with an ``idx`` -> ``level`` lookup frame built from
     ``self.levels``.
     """
     from cudf import DataFrame

     frame = self.codes.copy(deep=False)
     for pos, name in enumerate(frame.columns):
         level_values = self.levels[pos]
         # Lookup frame: integer code -> level value; the merge below acts
         # as a replace function.
         positions = cudautils.arange(len(level_values),
                                      dtype=frame[name].dtype)
         lookup = DataFrame({'idx': Series(positions),
                             'level': level_values})
         codes = DataFrame({'idx': frame[name]})
         frame[name] = codes.merge(lookup).level
     return frame
コード例 #6
0
ファイル: multiindex.py プロジェクト: zhuohuwu0603/cudf
    def _to_frame(self):
        """Return a frame in which every code column is replaced by its levels.

        Starts from a shallow copy of ``self.codes`` and, per column, merges
        the codes with an ``idx`` -> ``level`` lookup frame built from
        ``self.levels``.
        """
        from cudf import DataFrame, Series

        frame = self.codes.copy(deep=False)
        for pos, name in enumerate(frame.columns):
            level_values = self.levels[pos]
            # Lookup frame: integer code -> level value; the merge below
            # acts as a replace function.
            positions = cupy.arange(len(level_values), dtype=frame[name].dtype)
            lookup = DataFrame(
                {"idx": Series(positions), "level": level_values}
            )
            codes = DataFrame({"idx": frame[name]})
            frame[name] = codes.merge(lookup).level
        return frame
コード例 #7
0
ファイル: join_external.py プロジェクト: vslyu/NVTabular
 def apply_op(
     self,
     gdf: cudf.DataFrame,
     columns_ctx: dict,
     input_cols,
     target_cols=["base"],
     stats_context=None,
 ):
     """Merge ``gdf`` with ``self._ext`` and return rows in ``gdf``'s order.

     A temporary ``"__tmp__"`` column holding the row positions is added to
     ``gdf`` before the merge, used to sort the merged frame, and removed
     from both frames afterwards. The column context is then updated with
     the columns of the merged frame.

     Parameters
     ----------
     gdf
         Left-hand frame. It is modified in place only for the duration of
         the merge; the helper column is removed even if the merge raises.
     columns_ctx
         Passed to ``self.get_columns`` and ``self.update_columns_ctx``.
     input_cols
         Passed to ``self.get_columns`` and ``self.update_columns_ctx``.
     target_cols
         Passed to ``self.get_columns``. NOTE: the default list is shared
         between calls; this method itself does not mutate it.
     stats_context
         Not used by this implementation.

     Returns
     -------
     The merged frame, sorted by the original row position, without the
     helper column and with a fresh index.
     """
     target_columns = self.get_columns(columns_ctx, input_cols, target_cols)
     tmp = "__tmp__"  # Temporary column for sorting
     gdf[tmp] = cupy.arange(len(gdf), dtype="int32")
     try:
         new_gdf = gdf.merge(self._ext, left_on=self.on, right_on=self.on_ext, how=self.how)
     finally:
         # Never leave the helper column on the caller's frame, even when
         # the merge fails.
         gdf.drop(columns=[tmp], inplace=True)
     new_gdf = new_gdf.sort_values(tmp)
     new_gdf.drop(columns=[tmp], inplace=True)
     new_gdf.reset_index(drop=True, inplace=True)
     self.update_columns_ctx(columns_ctx, input_cols, new_gdf.columns, target_columns)
     return new_gdf