Code example #1
0
    def transformer(self, value):
        """Validate transformer initialization.

        Args:
            value (object): The selected transformer that SmartTransformer
                should use.

        Raises:
            ValueError: If input is neither a valid foreshadow wrapped
                transformer, scikit-learn Pipeline, scikit-learn FeatureUnion,
                nor None.

        """
        # A string names a registered transformer; instantiate it with the
        # stored kwargs and clear any prior resolution.
        if isinstance(value, str):
            value = get_transformer(value)(**self.kwargs)
            self.unset_resolve()

        # The wrapped check is only enforced when check_wrapped is truthy;
        # otherwise it trivially passes (True by default skips it).
        if getattr(self, "check_wrapped", True):
            wrapped_ok = is_wrapped(value)
        else:
            wrapped_ok = True

        # Gather acceptance criteria; any single one is sufficient.
        accepted = [
            is_transformer(value),
            isinstance(value, Pipeline),
            value is None,
            wrapped_ok,
        ]
        if not any(accepted):
            logging.error("transformer: {} failed checks: {}".format(
                value, accepted))
            raise ValueError(
                "{} is neither a scikit-learn Pipeline, FeatureUnion, a "
                "wrapped foreshadow transformer, nor None.".format(value))

        self._transformer = value
Code example #2
0
def _get_modules(classes, globals_, mname):  # TODO auto import all
    # TODO sklearn transformers and test each one generically.
    """Import sklearn transformers from transformers directory.

    Searches transformers directory for classes implementing BaseEstimator and
    TransformerMixin and duplicates them, wraps their init methods and public
    functions to support pandas dataframes, and exposes them as
    foreshadow.transformers.[name]

    Args:
        classes: A list of classes
        globals_: The globals in the callee's context
        mname: The module name

    Returns:
        The list of wrapped transformers.

    """
    # flake does not detect is_transformer/pandas_wrap due to del.
    transformers = []
    for cls in classes:
        if is_transformer(cls, method="issubclass"):  # noqa: F821
            transformers.append(cls)

    exported = []
    for original in transformers:
        # Clone the class so the wrapped version can claim this module
        # without mutating the sklearn original.
        clone = type(
            original.__name__,
            (original, *original.__bases__),
            dict(original.__dict__),
        )
        clone.__module__ = mname
        globals_[clone.__name__] = pandas_wrap(clone)  # noqa: F821
        exported.append(original.__name__)

    return exported
Code example #3
0
        def inverse_transform(self, X, *args, **kwargs):
            """Give original inputs using fitted transformer. Pandas enabled.

            See transformer

            Args:
                X: transformed inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer

            Returns:
                original inputs

            Raises:
                ValueError: If not a valid output type from transformer.

            """
            df = check_df(X)
            original_cols = [str(col) for col in df]

            result = super(DFTransformer, self).inverse_transform(
                df, *args, **kwargs)

            # Label applied to any newly generated columns.
            label = getattr(self, "name", type(self).__name__)

            # The sklearn public function returns the transformer itself in
            # fit calls; those need no post-processing.
            returned_transformer = hasattr(result, "__class__") and \
                is_transformer(result.__class__, method="issubclass")
            if returned_transformer:
                return result

            # Normalize the sklearn return type back to pandas.
            if isinstance(result, pd.DataFrame):
                result, graph = _df_post_process(result, original_cols, label)
            elif isinstance(result, np.ndarray):
                result, graph = _ndarray_post_process(
                    result, df, original_cols, label)
            elif scipy.sparse.issparse(result):
                result, graph = _ndarray_post_process(
                    result.toarray(), df, original_cols, label)
            elif isinstance(result, pd.Series):
                graph = []  # just return the series
            else:
                raise ValueError("undefined input {0}".format(type(result)))

            if getattr(self, "keep_columns", False):
                result = _keep_columns_process(result, df, label, graph)

            if getattr(self, "cache_manager", None) is not None:
                # Only set when running as part of the Foreshadow flow.
                for column in X:
                    self.cache_manager["graph", column] = graph
            else:
                logging.debug("cache_manager is not set for: "
                              "{}".format(self))
            return result  # TODO output is a DataFrame, make it detect based
Code example #4
0
        def fit_transform(self, X, *args, **kwargs):
            """Fit to and transform inputs. Pandas enabled.

            Args:
                X: inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer

            Returns:
                transformed inputs

            Raises:
                ValueError: If not a valid output type from transformer.

            """
            df = check_df(X)
            kwargs.pop("full_df", None)  # not forwarded to sklearn
            original_cols = [str(col) for col in df]

            result = super(DFTransformer, self).fit_transform(
                df, *args, **kwargs)

            # Label applied to any newly generated columns.
            label = getattr(self, "name", type(self).__name__)

            # The sklearn public function returns the transformer itself in
            # fit calls — nothing to do. A DataFrame means the call was
            # routed through TransformerMixin's fit + transform, where the
            # pandas post-processing is already handled.
            returned_transformer = hasattr(result, "__class__") and \
                is_transformer(result.__class__, method="issubclass")
            if returned_transformer or isinstance(result, pd.DataFrame):
                return result

            # Output was not yet converted back to pandas; do it here.
            if isinstance(result, np.ndarray):
                result, graph = _ndarray_post_process(
                    result, df, original_cols, label)
            elif scipy.sparse.issparse(result):
                result, graph = _ndarray_post_process(
                    result.toarray(), df, original_cols, label)
            elif isinstance(result, pd.Series):
                graph = []  # just return the series
            else:
                raise ValueError("undefined input {0}".format(type(result)))

            if getattr(self, "keep_columns", False):
                result = _keep_columns_process(result, df, label, graph)
            if getattr(self, "cache_manager", None) is not None:
                # Only set when running as part of the Foreshadow flow.
                for column in X:
                    self.cache_manager["graph", column] = graph
            else:
                logging.debug("cache_manager is not set for: "
                              "{}".format(self))
            return result