Ejemplo n.º 1
0
    def fit_transform(self, X, y, copy_data=True):
        """Validate ``(X, y)``, fit the configured preprocessing steps and
        return the transformed pair.

        The instance is reset first, then the inputs are validated, the
        target is passed through ``fit_transform_y`` and the features go
        through ``prepare_X``, feature preparation and whichever optional
        steps (imputation, categorical encoding, discretization, GBM
        features) are enabled on ``self.config``.

        Args:
            X: 2-D feature table. It must expose ``shape`` and ``dtypes``
                (presumably a ``pandas.DataFrame`` -- confirm with callers).
            y: 1-D target with the same number of rows as ``X`` and no
                missing values.
            copy_data: When true (the default) ``X`` and ``y`` are
                deep-copied before any transformation so the caller's
                objects are not handed to the transformers. When false the
                original objects are passed on as-is.

        Returns:
            A tuple ``(X, y)`` holding the transformed features and target.

        Raises:
            ValueError: If ``X`` or ``y`` is ``None``, has the wrong number
                of dimensions, the sample counts differ, or ``y`` contains
                missing values.
        """
        start = time.time()
        self.reset()
        if X is None:
            raise ValueError('X cannot be none.')
        if y is None:
            raise ValueError('y cannot be none.')
        if len(X.shape) != 2:
            raise ValueError('X must be a 2D datasets.')
        if len(y.shape) != 1:
            raise ValueError('y must be a 1D datasets.')
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"The number of samples of X and y must be the same. X.shape:{X.shape}, y.shape{y.shape}"
            )
        y_series = pd.Series(y)
        if y_series.isnull().any():
            raise ValueError("Missing values in y.")

        # Remember the incoming dtypes before anything is transformed.
        self.X_types = X.dtypes
        self.y_type = y_series.dtype

        # Honour the caller's flag. The previous `if copy:` tested the
        # imported `copy` module, which is always truthy, so the data was
        # deep-copied even when copy_data=False was requested.
        if copy_data:
            X = copy.deepcopy(X)
            y = copy.deepcopy(y)

        y = self.fit_transform_y(y)

        X = self.prepare_X(X)
        X = self.__prepare_features(X)
        if self.config.auto_imputation:
            X = self._imputation(X)
        if self.config.auto_encode_label:
            X = self._categorical_encoding(X)
        if self.config.auto_discrete:
            X = self._discretization(X)
        if self.config.apply_gbm_features and y is not None:
            X = self._apply_gbm_features(X, y)

        # Registered under the 'last' key after all other steps have run.
        self.X_transformers['last'] = PassThroughEstimator()

        print(f'fit_transform cost:{time.time() - start}')
        return X, y
Ejemplo n.º 2
0
    def fit_transform(self, X, y, copy_data=True):
        """Validate ``(X, y)``, fit the configured preprocessing steps and
        return the transformed pair, using the cache when it is enabled.

        When ``self.use_cache`` is set, a signature of ``(X, y)`` is used to
        look up a previously transformed result; if both the data and the
        fitted transformers load successfully they are returned immediately.
        Otherwise the instance is reset, the inputs are validated, and the
        target and features are run through the enabled steps on
        ``self.config``. Finally the categorical / continuous columns are
        cast to ``'category'`` / ``'float'`` and, with caching enabled, the
        result and transformers are written back to the cache.

        Args:
            X: 2-D feature table exposing ``shape`` (presumably a
                ``pandas.DataFrame`` -- confirm with callers).
            y: Target with the same number of rows as ``X`` and no missing
                values. It is not required to be 1-D here.
            copy_data: When true (the default) ``X`` and ``y`` are
                deep-copied before any transformation. When false the
                original objects are passed on as-is.

        Returns:
            A tuple ``(X, y)`` holding the transformed features and target.

        Raises:
            ValueError: If ``X`` or ``y`` is ``None``, ``X`` is not 2-D, the
                sample counts differ, or ``y`` contains missing values.
        """
        sign = self.get_X_y_signature(X, y)
        if self.use_cache:
            logger.info('Try to load (X, y) from cache')
            X_t, y_t = self.get_transformed_X_y_from_cache(sign)
            if (X_t is not None and y_t is not None
                    and self.load_transformers_from_cache()):
                return X_t, y_t
            # Reached when either the cached data or the cached transformers
            # could not be loaded; previously the latter case was silent.
            logger.info('Load failed')

        start = time.time()
        self.reset()
        if X is None:
            raise ValueError('X cannot be none.')
        if y is None:
            raise ValueError('y cannot be none.')
        if len(X.shape) != 2:
            raise ValueError('X must be a 2D datasets.')
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"The number of samples of X and y must be the same. X.shape:{X.shape}, y.shape{y.shape}"
            )

        # A DataFrame wrapper lets the null check cover multi-column targets.
        y_df = pd.DataFrame(y)
        if y_df.isnull().values.any():
            raise ValueError("Missing values in y.")

        # Honour the caller's flag. The previous `if copy:` tested the
        # imported `copy` module, which is always truthy, so the data was
        # deep-copied even when copy_data=False was requested.
        if copy_data:
            X = copy.deepcopy(X)
            y = copy.deepcopy(y)

        y = self.fit_transform_y(y)

        X = self.prepare_X(X)
        X = self.__prepare_features(X)
        if self.config.auto_imputation:
            X = self._imputation(X)
        if self.config.auto_encode_label:
            X = self._categorical_encoding(X)
        if self.config.auto_discrete:
            X = self._discretization(X)
        if self.config.apply_gbm_features and y is not None:
            X = self._apply_gbm_features(X, y)
        var_len_categorical_columns = self.config.var_len_categorical_columns
        if var_len_categorical_columns is not None and len(
                var_len_categorical_columns) > 0:
            X = self._var_len_encoder(X, var_len_categorical_columns)

        # Registered under the 'last' key after all other steps have run.
        self.X_transformers['last'] = PassThroughEstimator()

        # Normalise output dtypes: categorical columns become pandas
        # 'category', continuous columns become float.
        cat_cols = self.get_categorical_columns()
        cont_cols = self.get_continuous_columns()
        if len(cat_cols) > 0:
            X[cat_cols] = X[cat_cols].astype('category')
        if len(cont_cols) > 0:
            X[cont_cols] = X[cont_cols].astype('float')

        logger.info(f'fit_transform taken {time.time() - start}s')

        if self.use_cache:
            logger.info('Put (X, y) into cache')
            self.save_transformed_X_y_to_cache(sign, X, y)
            self.save_transformers_to_cache()
        return X, y