Beispiel #1
0
    def inverse_transform(self, X):
        """Inverse ordinal-encode the columns in `X`

        Every column named in ``self.categorical_columns_`` is converted from
        integer codes back to a categorical column, using the categories and
        orderedness recorded in ``self.dtypes_``. The work is done on a copy
        of `X`.

        Parameters
        ----------
        X : array or dataframe
            Either the NumPy, dask, or pandas version

        Returns
        -------
        data : DataFrame
            Dask array or dataframe will return a Dask DataFrame.
            Numpy array or pandas dataframe will return a pandas DataFrame

        Notes
        -----
        A dask array whose row chunk sizes are unknown triggers a computation
        to learn the length of every block.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.columns_)

        elif isinstance(X, da.Array):
            # The per-partition row counts are needed further down, which
            # requires known chunk sizes. Suboptimal, but I think unavoidable.
            unknown = np.isnan(X.chunks[0]).any()
            if unknown:
                lengths = blockwise(len, "i", X[:, 0], "i",
                                    dtype="i8").compute()
                # Copy first so the caller's array keeps its own chunks.
                X = X.copy()
                chunks = (tuple(lengths), X.chunks[1])
                X._chunks = chunks

            X = dd.from_dask_array(X, columns=self.columns_)

        big = isinstance(X, dd.DataFrame)

        if big:
            # Row count of each partition, computed as the difference between
            # consecutive divisions (the last division gets + 1 first).
            # NOTE(review): assumes the divisions are known integer row
            # positions -- confirm for frames with other indexes.
            chunks = np.array(X.divisions)
            chunks[-1] = chunks[-1] + 1
            chunks = tuple(chunks[1:] - chunks[:-1])

        X = X.copy()
        for col in self.categorical_columns_:
            if _HAS_CTD:
                dtype = self.dtypes_[col]
                categories, ordered = dtype.categories, dtype.ordered
            else:
                categories, ordered = self.dtypes_[col]

            # use .values to avoid warning from pandas
            codes = X[col].values

            if big:
                # dask
                codes._chunks = (chunks, )
                # Need a Categorical.from_codes for dask
                series = (dd.from_dask_array(
                    codes, columns=col).astype("category").cat.set_categories(
                        np.arange(len(categories)),
                        ordered=ordered).cat.rename_categories(categories))
                # Bug in pandas <= 0.20.3 lost name
                if series.name is None:
                    series.name = col
                series.divisions = X.divisions
            else:
                # pandas
                # Reuse X's index: ``X[col] = series`` aligns on index labels,
                # so a default RangeIndex here would produce NaN / mismatched
                # values whenever X carries a non-default index.
                series = pd.Series(
                    pd.Categorical.from_codes(codes,
                                              categories,
                                              ordered=ordered),
                    index=X.index,
                    name=col,
                )

            X[col] = series

        return X
Beispiel #2
0
    def inverse_transform(self, X):
        """Inverse dummy-encode the columns in `X`

        For every column named in ``self.categorical_columns_`` the block of
        dummy columns selected by ``self.categorical_blocks_[col]`` is
        collapsed back into one categorical column: the position of the
        row-wise maximum is the category code. When ``self.drop_first`` is
        set, the codes are shifted by one and a row whose dummies are all
        zero is decoded as the first category. The non-categorical columns
        are passed through and the result is ordered like ``self.columns_``.

        Parameters
        ----------
        X : array or dataframe
            Either the NumPy, dask, or pandas version

        Returns
        -------
        data : DataFrame
            Dask array or dataframe will return a Dask DataFrame.
            Numpy array or pandas dataframe will return a pandas DataFrame

        Notes
        -----
        A dask array whose row chunk sizes are unknown triggers a computation
        to learn the length of every block.
        """
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self.transformed_columns_)

        elif isinstance(X, da.Array):
            # later on we concat(..., axis=1), which requires
            # known divisions. Suboptimal, but I think unavoidable.
            unknown = np.isnan(X.chunks[0]).any()
            if unknown:
                lengths = blockwise(len, "i", X[:, 0], "i",
                                    dtype="i8").compute()
                # Copy first so the caller's array keeps its own chunks.
                X = X.copy()
                chunks = (tuple(lengths), X.chunks[1])
                X._chunks = chunks

            X = dd.from_dask_array(X, columns=self.transformed_columns_)

        big = isinstance(X, dd.DataFrame)

        if big:
            # Row count of each partition, computed as the difference between
            # consecutive divisions (the last division gets + 1 first).
            # NOTE(review): assumes the divisions are known integer row
            # positions -- confirm for frames with other indexes.
            chunks = np.array(X.divisions)
            chunks[-1] = chunks[-1] + 1
            chunks = tuple(chunks[1:] - chunks[:-1])

        non_cat = X[list(self.non_categorical_columns_)]

        cats = []
        for col in self.categorical_columns_:
            slice_ = self.categorical_blocks_[col]
            if _HAS_CTD:
                dtype = self.dtypes_[col]
                categories, ordered = dtype.categories, dtype.ordered
            else:
                categories, ordered = self.dtypes_[col]

            cols_slice = list(X.columns[slice_])
            if big:
                inds = X[cols_slice].to_dask_array(lengths=chunks)
            else:
                # use .values to avoid warning from pandas
                inds = X[cols_slice].values
            # Position of the largest dummy within the block is the code.
            codes = inds.argmax(1)

            if self.drop_first:
                # The first category has no dummy column: shift the codes by
                # one and decode all-zero rows as code 0.
                codes += 1
                codes[(inds == 0).all(1)] = 0

            if big:
                # dask
                codes._chunks = (chunks, )
                # Need a Categorical.from_codes for dask
                series = (dd.from_dask_array(
                    codes, columns=col).astype("category").cat.set_categories(
                        np.arange(len(categories)),
                        ordered=ordered).cat.rename_categories(categories))
                # Bug in pandas <= 0.20.3 lost name
                if series.name is None:
                    series.name = col
                series.divisions = X.divisions
            else:
                # pandas
                # Reuse X's index: pd.concat(axis=1) below aligns on index
                # labels, so a default RangeIndex here would misalign the
                # decoded values with ``non_cat`` (and add NaN rows) whenever
                # X carries a non-default index.
                series = pd.Series(
                    pd.Categorical.from_codes(codes,
                                              categories,
                                              ordered=ordered),
                    index=X.index,
                    name=col,
                )

            cats.append(series)
        if big:
            df = dd.concat([non_cat] + cats, axis=1)[list(self.columns_)]
        else:
            # list(...) mirrors the dask branch.
            df = pd.concat([non_cat] + cats, axis=1)[list(self.columns_)]
        return df