def transformer(self, value):
    """Validate transformer initialization.

    Args:
        value (object): The selected transformer that SmartTransformer
            should use.

    Raises:
        ValueError: If input is neither a valid foreshadow wrapped
            transformer, scikit-learn Pipeline, scikit-learn FeatureUnion,
            nor None.

    """
    if isinstance(value, str):
        value = get_transformer(value)(**self.kwargs)
        self.unset_resolve()
    # Check transformer type
    is_trans = is_transformer(value)
    trans_wrapped = (
        is_wrapped(value) if getattr(self, "check_wrapped", True) else True
    )  # True by default passes this check if we don't want it.
    is_pipe = isinstance(value, Pipeline)
    is_none = value is None
    checks = [is_trans, is_pipe, is_none, trans_wrapped]
    # Check the transformer inheritance status
    if not any(checks):
        logging.error(
            "transformer: {} failed checks: {}".format(value, checks)
        )
        raise ValueError(
            "{} is neither a scikit-learn Pipeline, FeatureUnion, a "
            "wrapped foreshadow transformer, nor None.".format(value)
        )
    self._transformer = value
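# Usage sketch (illustrative, not part of the module): assuming a concrete
# SmartTransformer subclass, here a hypothetical `Scaler`, the setter above
# resolves string names through get_transformer and rejects anything that is
# not a transformer, Pipeline, wrapped transformer, or None:
#
#     smart = Scaler()
#     smart.transformer = "StandardScaler"  # resolved and instantiated
#     smart.transformer = None              # accepted
#     smart.transformer = object()          # raises ValueError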
def _get_modules(classes, globals_, mname):  # TODO auto import all
    # TODO sklearn transformers and test each one generically.
    """Import sklearn transformers from transformers directory.

    Searches the transformers directory for classes implementing
    BaseEstimator and TransformerMixin, duplicates them, wraps their init
    methods and public functions to support pandas DataFrames, and exposes
    them as foreshadow.transformers.[name].

    Args:
        classes: A list of classes
        globals_: The globals in the caller's context
        mname: The module name

    Returns:
        The list of the wrapped transformers' class names.

    """
    transformers = [
        cls
        for cls in classes
        if is_transformer(cls, method="issubclass")  # noqa: F821
    ]  # flake does not detect due to del.

    for t in transformers:
        copied_t = type(t.__name__, (t, *t.__bases__), dict(t.__dict__))
        copied_t.__module__ = mname
        globals_[copied_t.__name__] = pandas_wrap(  # noqa: F821
            copied_t  # noqa: F821
        )  # flake does not detect due to del.

    return [t.__name__ for t in transformers]
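# Expected call pattern (an assumption based on the docstring, not verbatim
# from the library): a foreshadow.transformers submodule would collect the
# sklearn classes it wants wrapped and hand over its own globals so the
# pandas-aware copies become importable under that module:
#
#     import inspect
#     import sklearn.preprocessing
#
#     classes = [
#         cls
#         for _, cls in inspect.getmembers(sklearn.preprocessing, inspect.isclass)
#     ]
#     _get_modules(classes, globals(), __name__)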
def inverse_transform(self, X, *args, **kwargs):
    """Give back the original inputs using the fitted transformer. Pandas enabled.

    See transformer

    Args:
        X: transformed inputs
        *args: arguments to transformer
        **kwargs: keyword arguments to transformer

    Returns:
        original inputs

    Raises:
        ValueError: If not a valid output type from transformer.

    """
    df = check_df(X)
    init_cols = [str(col) for col in df]
    func = super(DFTransformer, self).inverse_transform
    out = func(df, *args, **kwargs)
    # determine name of new columns
    name = getattr(self, "name", type(self).__name__)
    out_is_transformer = hasattr(out, "__class__") and is_transformer(
        out.__class__, method="issubclass"
    )  # noqa: E127
    # check if the output returned by the scikit-learn public function is a
    # transformer or not. It will be a transformer in fit calls.
    if not out_is_transformer:
        # if the output is a transformer, we do nothing.
        if isinstance(out, pd.DataFrame):
            # custom handling based on the type returned by the sklearn
            # transformer function call
            out, graph = _df_post_process(out, init_cols, name)
        elif isinstance(out, np.ndarray):
            out, graph = _ndarray_post_process(out, df, init_cols, name)
        elif scipy.sparse.issparse(out):
            out = out.toarray()
            out, graph = _ndarray_post_process(out, df, init_cols, name)
        elif isinstance(out, pd.Series):
            graph = []  # just return the series
        else:
            raise ValueError("undefined output {0}".format(type(out)))
        if getattr(self, "keep_columns", False):
            out = _keep_columns_process(out, df, name, graph)
        if getattr(self, "cache_manager", None) is not None:
            # only used when part of the Foreshadow flow.
            for column in X:
                self.cache_manager["graph", column] = graph
        else:
            logging.debug("cache_manager is not set for: {}".format(self))
    return out  # TODO output is a DataFrame, make it detect based
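# Round-trip sketch (illustrative only, assumes StandardScaler has been
# wrapped by pandas_wrap): inverse_transform mirrors transform's
# post-processing, so a DataFrame goes in and a DataFrame comes back out:
#
#     scaler = StandardScaler()          # the wrapped, DataFrame-aware copy
#     scaled = scaler.fit_transform(df)  # pd.DataFrame -> pd.DataFrame
#     restored = scaler.inverse_transform(scaled)
#     assert isinstance(restored, pd.DataFrame)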
def fit_transform(self, X, *args, **kwargs):
    """Fit to the input, then transform it. Pandas enabled.

    Args:
        X: input observations
        *args: arguments to transformer
        **kwargs: keyword arguments to transformer

    Returns:
        transformed inputs

    Raises:
        ValueError: If not a valid output type from transformer.

    """
    df = check_df(X)
    kwargs.pop("full_df", None)
    init_cols = [str(col) for col in df]
    func = super(DFTransformer, self).fit_transform
    out = func(df, *args, **kwargs)
    # determine name of new columns
    name = getattr(self, "name", type(self).__name__)
    out_is_transformer = hasattr(out, "__class__") and is_transformer(
        out.__class__, method="issubclass"
    )  # noqa: E127
    # check if the output returned by the scikit-learn public function is a
    # transformer or not. It will be a transformer in fit calls.
    if not out_is_transformer and not isinstance(out, pd.DataFrame):
        # out_is_transformer: if the output is a transformer, we do nothing.
        # pd.DataFrame: fit_transform will likely be passed to the
        # TransformerMixin fit_transform, which just calls .fit and
        # .transform. Processing will be handled there.
        if isinstance(out, np.ndarray):
            # output was not yet transformed to DataFrame
            out, graph = _ndarray_post_process(out, df, init_cols, name)
        elif scipy.sparse.issparse(out):
            out = out.toarray()
            out, graph = _ndarray_post_process(out, df, init_cols, name)
        elif isinstance(out, pd.Series):
            graph = []  # just return the series
        else:
            raise ValueError("undefined output {0}".format(type(out)))
        if getattr(self, "keep_columns", False):
            out = _keep_columns_process(out, df, name, graph)
        if getattr(self, "cache_manager", None) is not None:
            # only used when part of the Foreshadow flow.
            for column in X:
                self.cache_manager["graph", column] = graph
        else:
            logging.debug("cache_manager is not set for: {}".format(self))
    return out
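# Behaviour sketch (illustrative only, assumes OneHotEncoder has been wrapped
# by pandas_wrap): the underlying sklearn call returns a sparse matrix, which
# the sparse branch above densifies and names before returning a DataFrame:
#
#     encoder = OneHotEncoder()
#     out = encoder.fit_transform(df[["color"]])
#     assert isinstance(out, pd.DataFrame)  # densified, not scipy.sparse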