Example #1
    def calculate(self, feature, **kwargs):
        """Use the metric function passed at initialization.

        Note:
            If default_return was set, the wrapper will suppress any errors
            raised by the wrapped function.

        Args:
            feature: feature/column of the pandas dataset.
            **kwargs: any keyword arguments to metric function

        Returns:
            The metric computation defined by the metric function.

        Raises:
            re_raise: The original exception, re-raised when default_return
                is not set.

        """
        try:
            self._last_call = self.fn(feature, **kwargs)
        except Exception as re_raise:
            logging.debug("There was an exception when calling {}".format(
                self.fn))
            if self.default_return is not None:
                return self.default_return
            else:
                raise re_raise

        return self._last_call if not self.invert else (1.0 - self._last_call)
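
A minimal, self-contained sketch of the same wrap/suppress/invert pattern; the SimpleMetricWrapper class and the lambda metric below are illustrative stand-ins, not part of Foreshadow:

import logging


class SimpleMetricWrapper:
    """Hypothetical stand-in mirroring the calculate() behavior above."""

    def __init__(self, fn, default_return=None, invert=False):
        self.fn = fn
        self.default_return = default_return
        self.invert = invert
        self._last_call = None

    def calculate(self, feature, **kwargs):
        try:
            self._last_call = self.fn(feature, **kwargs)
        except Exception:
            logging.debug("There was an exception when calling %s", self.fn)
            if self.default_return is not None:
                return self.default_return
            raise
        return self._last_call if not self.invert else 1.0 - self._last_call


# A failing metric falls back to default_return instead of raising.
wrapper = SimpleMetricWrapper(lambda col: 1.0 / len(col), default_return=0.0)
print(wrapper.calculate([]))      # ZeroDivisionError suppressed -> 0.0
print(wrapper.calculate([1, 2]))  # -> 0.5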
Example #2
        def inverse_transform(self, X, *args, **kwargs):
            """Give original inputs using fitted transformer. Pandas enabled.

            See transformer

            Args:
                X: transformed inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer

            Returns:
                original inputs

            Raises:
                ValueError: If not a valid output type from transformer.

            """
            df = check_df(X)

            init_cols = [str(col) for col in df]
            func = super(DFTransformer, self).inverse_transform

            out = func(df, *args, **kwargs)

            # determine name of new columns
            name = getattr(self, "name", type(self).__name__)
            out_is_transformer = hasattr(out, "__class__") and is_transformer(
                out.__class__, method="issubclass")  # noqa: E127
            # check if the output returned by the scikit-learn public function
            # is a transformer or not. It will be a transformer in fit calls.

            if not out_is_transformer:
                # if the output is a transformer, we do nothing; otherwise
                # apply custom handling based on the type returned by the
                # sklearn transformer function call.
                if isinstance(out, pd.DataFrame):
                    out, graph = _df_post_process(out, init_cols, name)
                elif isinstance(out, np.ndarray):
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif scipy.sparse.issparse(out):
                    out = out.toarray()
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif isinstance(out, pd.Series):
                    graph = []  # just return the series
                else:
                    raise ValueError("undefined input {0}".format(type(out)))

                if getattr(self, "keep_columns", False):
                    out = _keep_columns_process(out, df, name, graph)
                if getattr(self, "cache_manager", None) is not None:  # only
                    # used when part of the Foreshadow flow.
                    for column in X:
                        self.cache_manager["graph", column] = graph
                else:
                    logging.debug("cache_manager is not set for: "
                                  "{}".format(self))
            return out  # TODO output is a DataFrame, make it detect based
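
The type dispatch above can be shown in isolation. The to_dataframe helper below is hypothetical (it is not one of Foreshadow's private _*_post_process functions); it only illustrates rebuilding a pandas object from whatever the wrapped scikit-learn call returns:

import numpy as np
import pandas as pd
import scipy.sparse
from sklearn.preprocessing import StandardScaler


def to_dataframe(out, index, name):
    # DataFrames/Series pass through; sparse matrices are densified; ndarrays
    # are rebuilt with generated "<name>_<i>" column names.
    if isinstance(out, (pd.DataFrame, pd.Series)):
        return out
    if scipy.sparse.issparse(out):
        out = out.toarray()
    if isinstance(out, np.ndarray):
        out = out.reshape(-1, 1) if out.ndim == 1 else out
        cols = ["{}_{}".format(name, i) for i in range(out.shape[1])]
        return pd.DataFrame(out, index=index, columns=cols)
    raise ValueError("undefined output {0}".format(type(out)))


df = pd.DataFrame({"a": [1.0, 2.0, 3.0]})
scaler = StandardScaler().fit(df)
restored = to_dataframe(scaler.inverse_transform(scaler.transform(df)),
                        df.index, "StandardScaler")
print(restored)  # DataFrame with column "StandardScaler_0" holding 1.0..3.0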
Example #3
        def fit_transform(self, X, *args, **kwargs):
            """Fit and transform inputs using the wrapped transformer.

            Pandas enabled; see transformer.

            Args:
                X: inputs
                *args: arguments to transformer
                **kwargs: keyword arguments to transformer

            Returns:
                transformed inputs

            Raises:
                ValueError: If not a valid output type from transformer.

            """
            df = check_df(X)
            kwargs.pop("full_df", None)
            init_cols = [str(col) for col in df]
            func = super(DFTransformer, self).fit_transform
            out = func(df, *args, **kwargs)

            # determine name of new columns
            name = getattr(self, "name", type(self).__name__)
            out_is_transformer = hasattr(out, "__class__") and is_transformer(
                out.__class__, method="issubclass")  # noqa: E127
            # check if the output returned by the scikit-learn public function
            # is a transformer or not. It will be a transformer in fit calls.

            if not out_is_transformer and not isinstance(out, pd.DataFrame):
                # out_is_transformer: if the output is a transformer,
                # we do nothing.
                # pd.DataFrame: fit_transform will likely be
                # passed to the TransformerMixin fit_transform, which just
                # calls .fit and .transform. Processing will be handled
                # there
                if isinstance(out, np.ndarray):  # output was not yet
                    # transformed to DataFrame
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif scipy.sparse.issparse(out):
                    out = out.toarray()
                    out, graph = _ndarray_post_process(out, df, init_cols,
                                                       name)
                elif isinstance(out, pd.Series):
                    graph = []  # just return the series
                else:
                    raise ValueError("undefined input {0}".format(type(out)))
                if getattr(self, "keep_columns", False):
                    out = _keep_columns_process(out, df, name, graph)
                if getattr(self, "cache_manager", None) is not None:  # only
                    # used when part of the Foreshadow flow.
                    for column in X:
                        self.cache_manager["graph", column] = graph
                else:
                    logging.debug("cache_manager is not set for: "
                                  "{}".format(self))
            return out
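
The comment about fit_transform delegating to TransformerMixin can be illustrated on its own. AddOne below is a made-up transformer: because it defines no fit_transform of its own, TransformerMixin.fit_transform simply chains fit and transform, so a transform that already returns a DataFrame skips the ndarray/sparse branches above:

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class AddOne(BaseEstimator, TransformerMixin):
    """Illustrative transformer whose transform already returns a DataFrame."""

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(np.asarray(X) + 1, columns=["plus_one"])


# TransformerMixin.fit_transform == fit(X).transform(X), so the result is
# already a DataFrame by the time the wrapper sees it.
out = AddOne().fit_transform(pd.DataFrame({"a": [1, 2, 3]}))
print(type(out).__name__)  # DataFrame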
Example #4
    def pick_transformer(self, X, y=None, **fit_params):
        """Get best transformer for a given column.

        Args:
            X: input DataFrame
            y: input labels
            **fit_params: fit_params

        Returns:
            Best data cleaning transformer.

        """
        # TODO do we want to parallelize this step?
        cleaners = config.get_cleaners(cleaners=True)

        user_provided_cleaners = self.cache_manager[
            AcceptedKey.CUSTOMIZED_TRANSFORMERS][ConfigKey.CUSTOMIZED_CLEANERS]
        if len(user_provided_cleaners) > 0:
            cleaners.extend(user_provided_cleaners)

        best_score = 0
        best_cleaner = None
        logging.debug("Picking cleaners...")

        # The sampling is to speed up the metric score calculation as it may
        # not be necessary to scan every row in the data frame to generate a
        # score.
        sampled_df = self.sample_data_frame(df=X)

        # TODO if this improvement is not sufficient, we can try using
        #  multiprocessing to get the scores instead of doing it sequentially.
        for cleaner in cleaners:
            cleaner = cleaner()
            score = cleaner.metric_score(sampled_df)
            if score > best_score:
                best_score = score
                best_cleaner = cleaner
        if best_cleaner is None:
            return NoTransform()
        logging.debug("Picked...")
        return best_cleaner
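
A stripped-down version of the same selection loop; UppercaseCleaner and its metric_score heuristic are invented for illustration and are not real Foreshadow cleaners:

import pandas as pd


class UppercaseCleaner:
    """Toy cleaner scored by the fraction of fully uppercase string cells."""

    def metric_score(self, df):
        col = df.iloc[:, 0].astype(str)
        return float((col == col.str.upper()).mean())


class NoTransform:
    """Fallback when no cleaner scores above the threshold."""

    def metric_score(self, df):
        return 0.0


def pick_cleaner(df, cleaner_classes, threshold=0.0):
    best_score, best_cleaner = threshold, None
    for cls in cleaner_classes:
        cleaner = cls()                      # instantiate, as above
        score = cleaner.metric_score(df)
        if score > best_score:
            best_score, best_cleaner = score, cleaner
    return best_cleaner if best_cleaner is not None else NoTransform()


df = pd.DataFrame({"code": ["US", "GB", "fr"]})
print(type(pick_cleaner(df, [UppercaseCleaner])).__name__)  # UppercaseCleaner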
Example #5
    def resolve(self, X, *args, **kwargs):
        """Resolve the underlying concrete transformer.

        Sets self.cache_manager with the domain tag.

        Args:
            X: input DataFrame
            *args: args to super
            **kwargs: kwargs to super

        Returns:
            Return from super.

        """
        ret = super().resolve(X, *args, **kwargs)
        if self.cache_manager is not None:
            self.cache_manager[
                "domain", X.columns[0]] = self.transformer.__class__.__name__
        else:
            logging.debug("cache_manager was None")
        return ret
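
A plain dict can stand in for the tuple-keyed cache_manager to show the tagging that resolve() performs; the column and class names below are placeholders:

cache_manager = {}

# Mimics: self.cache_manager["domain", X.columns[0]] = type(transformer).__name__
column = "price"                        # placeholder column name
resolved_class_name = "StandardScaler"  # placeholder transformer class name
cache_manager["domain", column] = resolved_class_name

print(cache_manager[("domain", "price")])  # StandardScaler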
Example #6
    def metric_score(self, X):
        """Compute the score for this cleaner using confidence_computation.

        confidence_computation is passed through init for each subclass.
        The confidence determines which cleaner/flattener is picked in an
        OVR fashion.

        Args:
            X: input DataFrame.

        Returns:
            float: confidence value.

        """
        # TODO can we also do a sampling here?
        logging.debug("Calculating scores....")
        scores = []
        for metric_wrapper, weight in self.confidence_computation.items():
            scores.append(
                metric_wrapper.calculate(X, cleaner=self.transform_row)
                * weight
            )
        logging.debug("End calculating scores...")
        return sum(scores)
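
The weighted-sum confidence can be reproduced with plain functions standing in for the MetricWrapper objects; both metrics below are invented for the sake of the example:

def fraction_numeric(values):
    return sum(str(v).isdigit() for v in values) / len(values)


def fraction_nonempty(values):
    return sum(bool(str(v).strip()) for v in values) / len(values)


confidence_computation = {fraction_numeric: 0.7, fraction_nonempty: 0.3}
column = ["1", "2", "x", ""]

score = sum(metric(column) * weight
            for metric, weight in confidence_computation.items())
print(score)  # 0.5 * 0.7 + 0.75 * 0.3 ≈ 0.575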