Ejemplo n.º 1
0
    def fit(self, data_df, y_df):
        """Train the Foreshadow pipeline on the provided data.

        The instance is reset, both inputs are normalized with ``check_df``,
        the feature column names are recorded and a two-step pipeline
        (``X_preparer`` followed by ``estimator_wrapper``) is built and fit.

        Args:
            data_df (:obj:`DataFrame <pandas.DataFrame>`): The input feature(s)
            y_df (:obj:`DataFrame <pandas.DataFrame>`): The response feature(s)

        Returns:
            :obj:`Foreshadow`: The fitted instance.

        """
        self._reset()

        features = check_df(data_df)
        response = check_df(y_df)
        self.data_columns = features.columns.values.tolist()

        steps = [
            ("X_preparer", self.X_preparer),
            ("estimator_wrapper", self.estimator_wrapper),
        ]
        self.pipeline = Pipeline(steps)
        self.pipeline.fit(features, response)

        # The flag is only raised once the pipeline fit has returned.
        self.has_fitted = True
        return self
Ejemplo n.º 2
0
def test_check_df_rename_cols():
    """check_df makes duplicated column names unique with a suffix."""
    import pandas as pd
    from foreshadow.utils import check_df

    duplicated = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "A"])
    renamed_columns = check_df(duplicated).columns.tolist()
    assert renamed_columns == ["A", "A.1"]
Ejemplo n.º 3
0
def test_check_df_convert_series_to_df():
    """check_df converts a pandas Series into a DataFrame."""
    import pandas as pd
    from foreshadow.utils import check_df

    series = pd.Series([1, 2, 3, 4])
    converted = check_df(series)
    assert isinstance(converted, pd.DataFrame)
Ejemplo n.º 4
0
        def inverse_transform(self, X, *args, **kwargs):
            """Give original inputs using fitted transformer. Pandas enabled.

            The input is normalized with ``check_df`` and handed to the parent
            class' ``inverse_transform``. A data-like result is post-processed
            according to its type; a result that is itself a transformer is
            returned untouched.

            Args:
                X: transformed inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer

            Returns:
                original inputs

            Raises:
                ValueError: If not a valid output type from transformer.

            """
            df = check_df(X)

            init_cols = [str(col) for col in df]
            func = super(DFTransformer, self).inverse_transform

            out = func(df, *args, **kwargs)

            # determine name of new columns
            name = getattr(self, "name", type(self).__name__)
            # Check whether the output returned by the scikit-learn public
            # function is a transformer or not (per the original note, it
            # will be a transformer in fit calls).
            out_is_transformer = hasattr(out, "__class__") and is_transformer(
                out.__class__, method="issubclass")  # noqa: E127

            if not out_is_transformer:
                # Only data-like outputs are post-processed; the handling
                # depends on the type returned by the wrapped call.
                if isinstance(out, pd.DataFrame):
                    out, graph = _df_post_process(out, init_cols, name)
                elif isinstance(out, np.ndarray):
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif scipy.sparse.issparse(out):
                    out = out.toarray()
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif isinstance(out, pd.Series):
                    graph = []  # just return the series
                else:
                    raise ValueError("undefined input {0}".format(type(out)))

                if getattr(self, "keep_columns", False):
                    out = _keep_columns_process(out, df, name, graph)
                if getattr(self, "cache_manager", None) is not None:
                    # Only used when part of the Foreshadow flow.
                    # Iterate the checked frame rather than the raw input:
                    # iterating a DataFrame yields column labels, whereas a
                    # raw Series yields its values and an ndarray/list yields
                    # rows, none of which are column keys.
                    for column in df:
                        self.cache_manager["graph", column] = graph
                else:
                    logging.debug("cache_manager is not set for: "
                                  "{}".format(self))
            return out  # TODO output is a DataFrame, make it detect based
Ejemplo n.º 5
0
def test_check_df_convert_to_df():
    """check_df converts a numpy array into a DataFrame."""
    import numpy as np
    import pandas as pd
    from foreshadow.utils import check_df

    array = np.array([1, 2, 3, 4])
    converted = check_df(array)
    assert isinstance(converted, pd.DataFrame)
Ejemplo n.º 6
0
    def fit(self, X, y=None):
        """Fit the response preprocessor and then the wrapped estimator.

        Both inputs are normalized with ``check_df``; the response is passed
        through ``self.preprocessor.fit_transform`` before the estimator is
        fit on it.

        NOTE(review): ``y`` defaults to None but is checked without
        ``ignore_none``; confirm whether a missing response is supported.

        Args:
            X (:obj:`pandas.DataFrame` or :obj:`numpy.ndarray` or list): The
                input feature(s)
            y (:obj:`pandas.DataFrame` or :obj:`numpy.ndarray` or list): The
                response feature(s)

        Returns:
            self

        """
        features = check_df(X)
        response = self.preprocessor.fit_transform(check_df(y))
        self.estimator.fit(features, response)
        return self
Ejemplo n.º 7
0
    def score(self, X, y, sample_weight=None):
        """Score the trained estimator on the given data.

        Note: sample weights are not supported

        Args:
            X (pandas.DataFrame or numpy.ndarray or list): The input feature(s)
            y (pandas.DataFrame or numpy.ndarray or list): The response
                feature(s)
            sample_weight: sample weighting. Accepted but not forwarded to
                the estimator.

        Returns:
            float: A computed prediction fitness score

        """
        features = check_df(X)
        response = check_df(y)
        return self.estimator.score(features, response)
Ejemplo n.º 8
0
def test_check_df_single_column():
    """check_df rejects multi-column input when single_column=True."""
    import numpy as np
    from foreshadow.utils import check_df

    two_columns = np.arange(8).reshape((4, 2))

    with pytest.raises(ValueError) as excinfo:
        check_df(two_columns, single_column=True)

    expected_message = "Input Dataframe must have only one column"
    assert str(excinfo.value) == expected_message
Ejemplo n.º 9
0
    def score(self, data_df, y_df=None, sample_weight=None):
        """Compute the evaluation score of the fitted pipeline.

        The scoring method is defined by the selected estimator.

        NOTE(review): ``y_df`` defaults to None but is checked without
        ``ignore_none``; confirm whether a missing response is supported.

        Args:
            data_df (:obj:`DataFrame <pandas.DataFrame>`): The input feature(s)
            y_df (:obj:`DataFrame <pandas.DataFrame>`, optional): The response
                feature(s)
            sample_weight (:obj:`numpy.ndarray`, optional): The weights to be
                used when scoring each sample

        Returns:
            float: A computed prediction fitness score

        """
        features = check_df(data_df)
        response = check_df(y_df)
        self._prepare_predict(features.columns)
        return self.pipeline.score(features, response, sample_weight)
Ejemplo n.º 10
0
    def score(self, X, y):
        """Score the trained estimator against a preprocessed response.

        The response is run through the already-fitted ``self.preprocessor``
        (``transform``) before being passed to the estimator's ``score``.

        Note: sample weights are not supported

        Args:
            X (:obj:`pandas.DataFrame` or :obj:`numpy.ndarray` or list): The
                input feature(s)
            y (:obj:`pandas.DataFrame` or :obj:`numpy.ndarray` or list): The
                response feature(s)

        Returns:
            float: A computed prediction fitness score

        """
        features = check_df(X)
        response = self.preprocessor.transform(check_df(y))
        return self.estimator.score(features, response)
Ejemplo n.º 11
0
    def fit(self, X, y):
        """Fit the AutoEstimator instance.

        Inputs are normalized with ``check_df`` and the actual work is
        delegated to ``self._fit``.

        Args:
            X (pandas.DataFrame or numpy.ndarray or list): The input
                feature(s)
            y (pandas.DataFrame or numpy.ndarray or list): The response
                feature(s)

        Returns:
            The selected estimator (``self.estimator``), not ``self``

        """
        features, response = check_df(X), check_df(y)
        self._fit(features, response)
        return self.estimator
Ejemplo n.º 12
0
def test_check_df_raises_on_invalid():
    """check_df raises a descriptive ValueError for unsupported input."""
    from foreshadow.utils import check_df
    import re

    expected_pattern = (
        "Invalid input type: (.+) is not pd.DataFrame, "
        "pd.Series, np.ndarray, nor list"
    )
    with pytest.raises(ValueError) as excinfo:
        check_df(None)
    assert re.match(expected_pattern, str(excinfo.value))
Ejemplo n.º 13
0
    def predict(self, X):
        """Predict the response and map it back to its original form.

        The estimator's prediction is passed through
        ``self.preprocessor.inverse_transform`` before being returned.

        Args:
            X (pandas.DataFrame or :obj:`numpy.ndarray` or list): The input
                feature(s)

        Returns:
            The inverse-transformed prediction of the estimator

        """
        features = check_df(X)
        raw_prediction = self.estimator.predict(features)
        return self.preprocessor.inverse_transform(raw_prediction)
Ejemplo n.º 14
0
    def inverse_transform(self, X):
        """Invert transform if possible.

        The input is normalized with ``check_df``, ``self.resolve`` is called
        on it, and the work is delegated to the resolved transformer.

        Args:
            X: transformed input observations using selected best transformer

        Returns:
            original input observations

        """
        frame = check_df(X)
        self.resolve(frame)
        return self.transformer.inverse_transform(frame)
Ejemplo n.º 15
0
    def predict(self, X):
        """Predict the response with the trained estimator.

        Args:
            X (pandas.DataFrame or numpy.ndarray or list): The input
                feature(s)

        Returns:
            The value returned by ``self.estimator.predict``

        """
        features = check_df(X)
        return self.estimator.predict(features)
Ejemplo n.º 16
0
    def predict_proba(self, X):
        """Predict the response probabilities with the trained estimator.

        Args:
            X (:obj:`pandas.DataFrame` or :obj:`numpy.ndarray` or list): The
                input feature(s)

        Returns:
            The value returned by ``self.estimator.predict_proba``

        """
        features = check_df(X)
        return self.estimator.predict_proba(features)
Ejemplo n.º 17
0
    def predict(self, data_df):
        """Predict the response variable with the fitted pipeline.

        ``self._prepare_predict`` is called with the input's columns before
        the prediction is delegated to ``self.pipeline``.

        Args:
            data_df (:obj:`DataFrame <pandas.DataFrame>`): The input feature(s)

        Returns:
            The value returned by ``self.pipeline.predict``

        """
        features = check_df(data_df)
        self._prepare_predict(features.columns)
        return self.pipeline.predict(features)
Ejemplo n.º 18
0
    def predict_proba(self, X):  # pragma: no cover
        """Predict the response probabilities with the trained estimator.

        Args:
            X (pandas.DataFrame or numpy.ndarray or list): The input
                feature(s)

        Returns:
            The value returned by ``self.estimator.predict_proba``

        """
        features = check_df(X)
        return self.estimator.predict_proba(features)
Ejemplo n.º 19
0
    def transform(self, X, y=None):
        """Remove HTML tags from passed in strings.

        Args:
            X: input observations; must contain exactly one column
            y: input labels (unused)

        Returns:
            :obj:`pandas.Series`: the single input column with every match of
            ``HTML_REGEX`` replaced by the empty string

        """
        column = check_df(X, single_column=True).iloc[:, 0]
        # Request regex matching explicitly: since pandas 2.0 the default of
        # ``Series.str.replace`` is literal matching, which would leave the
        # tags in place. ``regex=True`` matches the older default behaviour.
        return column.str.replace(HTML_REGEX, "", regex=True)
Ejemplo n.º 20
0
    def fit(self, X, y=None, **fit_params):
        """See base class.

        Returns ``self`` rather than the result of ``self.transformer.fit``.
        Otherwise a chain such as ``SmartTransformer().fit().transform()``
        would continue on the underlying transformer instead of on this
        object, which changes how columns are named when this class is
        wrapped.

        Args:
            X: see base class
            y: see base class
            **fit_params: see base class

        Returns:
            see base class

        """
        features = check_df(X)
        labels = check_df(y, ignore_none=True)  # y may legitimately be None

        self.resolve(features, labels, **fit_params)

        # "full_df" is attached to the transformer and removed from the
        # keyword arguments so it is not forwarded to the underlying fit.
        self.transformer.full_df = fit_params.pop("full_df", None)
        self.transformer.fit(features, labels, **fit_params)
        return self
Ejemplo n.º 21
0
    def predict_proba(self, data_df):
        """Predict response probabilities with the fitted pipeline.

        Uses the predicted confidences instead of binary predictions.
        ``self._prepare_predict`` is called with the input's columns first.

        Args:
            data_df (:obj:`DataFrame <pandas.DataFrame>`): The input feature(s)

        Returns:
            The value returned by ``self.pipeline.predict_proba``

        """
        features = check_df(data_df)
        self._prepare_predict(features.columns)
        return self.pipeline.predict_proba(features)
Ejemplo n.º 22
0
    def transform(self, X, y=None):
        """Apply the computed transform to the passed in data.

        Every value that is listed in ``merge_values_`` or that is absent
        from ``values_`` is replaced by ``self.replacement``.

        Args:
            X (:obj:`pandas.DataFrame`): input DataFrame with one column
            y: input labels (unused)

        Returns:
            :obj:`pandas.DataFrame`: transformed dataframe

        """
        # Work on a copy of the column: the boolean-mask assignment below
        # would otherwise write through the slice into the checked frame,
        # and possibly into the caller's data.
        X = check_df(X, single_column=True).iloc[:, 0].copy()
        check_is_fitted(self, ["values_", "merge_values_"])

        to_replace = X.isin(self.merge_values_) | ~X.isin(self.values_)
        X[to_replace] = self.replacement

        return X.to_frame()
Ejemplo n.º 23
0
    def fit(self, X, y=None):
        """Record the observed values and those rare enough to be merged.

        ``values_`` holds every value seen in the column. ``merge_values_``
        holds the values whose count is at most ``threshold`` times the
        column size.

        Args:
            X (:obj:`pandas.DataFrame`): input dataframe with one column
            y: input labels (unused)

        Returns:
            self

        """
        column = check_df(X, single_column=True).iloc[:, 0]
        counts = column.value_counts()

        self.values_ = counts.index.values.tolist()

        cutoff = self.threshold * column.size
        self.merge_values_ = counts[counts <= cutoff].index.values.tolist()

        return self
Ejemplo n.º 24
0
        def fit(self, X, *args, **kwargs):
            """Fit the estimator or transformer, pandas enabled.

            The input is normalized with ``check_df`` and forwarded to the
            parent class' ``fit``.

            Args:
                X: inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer

            Returns:
                The value returned by the parent class' ``fit``

            """
            frame = check_df(X)
            parent_fit = super(DFTransformer, self).fit
            return parent_fit(frame, *args, **kwargs)
Ejemplo n.º 25
0
        def fit_transform(self, X, *args, **kwargs):
            """Fit and transform using the wrapped transformer. Pandas enabled.

            The input is normalized with ``check_df`` and forwarded to the
            parent class' ``fit_transform``. ndarray, sparse and Series
            results are post-processed here; DataFrame and transformer
            results are returned as they are.

            Args:
                X: inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer; a ``full_df``
                    entry is discarded and not forwarded

            Returns:
                transformed inputs

            Raises:
                ValueError: If not a valid output type from transformer.

            """
            df = check_df(X)
            kwargs.pop("full_df", None)
            init_cols = [str(col) for col in df]
            func = super(DFTransformer, self).fit_transform
            out = func(df, *args, **kwargs)

            # determine name of new columns
            name = getattr(self, "name", type(self).__name__)
            # Check whether the output returned by the scikit-learn public
            # function is a transformer or not (per the original note, it
            # will be a transformer in fit calls).
            out_is_transformer = hasattr(out, "__class__") and is_transformer(
                out.__class__, method="issubclass")  # noqa: E127

            if not out_is_transformer and not isinstance(out, pd.DataFrame):
                # A transformer output needs no processing. A DataFrame
                # output likely came through TransformerMixin.fit_transform,
                # which just calls .fit and .transform, so processing is
                # handled there.
                if isinstance(out, np.ndarray):
                    # output was not yet transformed to DataFrame
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif scipy.sparse.issparse(out):
                    out = out.toarray()
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif isinstance(out, pd.Series):
                    graph = []  # just return the series
                else:
                    raise ValueError("undefined input {0}".format(type(out)))
                if getattr(self, "keep_columns", False):
                    out = _keep_columns_process(out, df, name, graph)
                if getattr(self, "cache_manager", None) is not None:
                    # Only used when part of the Foreshadow flow.
                    # Iterate the checked frame rather than the raw input:
                    # iterating a DataFrame yields column labels, whereas a
                    # raw Series yields its values and an ndarray/list yields
                    # rows, none of which are column keys.
                    for column in df:
                        self.cache_manager["graph", column] = graph
                else:
                    logging.debug("cache_manager is not set for: "
                                  "{}".format(self))
            return out
Ejemplo n.º 26
0
def will_remove_uncommon(X, temp_uncommon_remover):
    """Check if the transformer will modify the data.

    Uses current settings. The column is run through
    ``temp_uncommon_remover.fit_transform`` and compared element by element
    with the result; positions where both sides are null count as unchanged.

    Args:
        X: input observations column
        temp_uncommon_remover: transformer

    Returns:
        (tuple) bool and category counts: whether any value was changed, and
        the number of unique values in the transformed output

    """
    X = check_df(X, single_column=True).iloc[:, 0].values
    # Hand the transformer its own copy so that an in-place modification of
    # its input cannot alter the values being compared against.
    out = temp_uncommon_remover.fit_transform(X.copy()).values.ravel()

    # Element-wise comparison: ``np.array_equal`` returns a single bool, so
    # with it one NaN made the arrays "unequal" and the check then required
    # every element to be null.
    unchanged = (X == out) | (pd.isnull(X) & pd.isnull(out))

    return (
        not np.all(unchanged),
        pd.unique(out).size,
    )
Ejemplo n.º 27
0
    def transform(self, X):
        """See base class.

        No resolution happens here: the transformer must already have been
        selected by a previous call to fit.

        Args:
            X: transform

        Returns:
            transformed X using selected best transformer.

        Raises:
            ValueError: Transformer should be fitted first.

        """
        frame = check_df(X)
        # fit is expected to run before transform, so a missing transformer
        # is reported as an error instead of being resolved a second time.
        if self.transformer is None:
            raise ValueError("The transformer has not been fitted. Please "
                             "call fit first.")
        return self.transformer.transform(frame)
Ejemplo n.º 28
0
    def pick_transformer(self, X, y=None, **fit_params):
        """Determine the appropriate scaling method for an input dataset.

        The first column is tested against a fitted normal and a fitted
        uniform distribution with a KS test. The scaler paired with the
        better-fitting distribution is returned when its p-value reaches
        ``self.p_val``; otherwise a power-transform + robust-scale pipeline
        is returned.

        Args:
            X (:obj:`pandas.DataFrame`): Input X data
            y (:obj: 'pandas.DataFrame'): labels Y for data
            **fit_params (dict): Parameters to apply to transformers when
                fitting

        Returns:
            An initialized scaling transformer

        """
        column = check_df(X).iloc[:, 0]
        candidates = {"norm": StandardScaler(), "uniform": MinMaxScaler()}

        # statistically invalid but good enough measure of relative closeness
        # ks-test does not allow estimated parameters
        p_values = {
            dist_name: ss.kstest(
                column,
                dist_name,
                args=getattr(ss.distributions, dist_name).fit(column),
            ).pvalue
            for dist_name in candidates
        }

        winner = max(p_values, key=p_values.get)
        if p_values[winner] >= self.p_val:
            return candidates[winner]

        # The BoxCox step is deliberately left out: if the test dataset has
        # an even smaller negative min, it will break the pipeline.
        # TODO add a different transformer if necessary
        return Pipeline([
            ("power_transformer", PowerTransformer()),
            ("robust_scaler", RobustScaler()),
        ])
Ejemplo n.º 29
0
def test_check_df_passthrough():
    """A DataFrame handed to check_df comes back with equal contents."""
    import pandas as pd
    from foreshadow.utils import check_df

    frame = pd.DataFrame([1, 2, 3, 4])
    checked = check_df(frame)
    assert frame.equals(checked)
Ejemplo n.º 30
0
def test_check_df_passthrough_none():
    """With ignore_none=True, check_df returns None for a None input."""
    from foreshadow.utils import check_df

    checked = check_df(None, ignore_none=True)
    assert checked is None