Code example #1
File: pipeline.py Project: zhigangmei/automatminer
    def __init__(self,
                 autofeaturizer=None,
                 cleaner=None,
                 reducer=None,
                 learner=None):
        transformers = [autofeaturizer, cleaner, reducer, learner]
        if not all(transformers):
            if any(transformers):
                raise AutomatminerError(
                    "Please specify all dataframe "
                    "transformers (autofeaturizer, learner, "
                    "reducer, and cleaner), or none (to use "
                    "default).")
            else:
                config = get_preset_config("express")
                autofeaturizer = config["autofeaturizer"]
                cleaner = config["cleaner"]
                reducer = config["reducer"]
                learner = config["learner"]

        self.autofeaturizer = autofeaturizer
        self.cleaner = cleaner
        self.reducer = reducer
        self.learner = learner
        self.pre_fit_df = None
        self.post_fit_df = None
        self.ml_type = None
        self.target = None
        self.version = get_version()
        super(MatPipe, self).__init__()
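A minimal construction sketch; it assumes MatPipe and get_preset_config are exported at the automatminer package top level, which may differ by version:

# Assumed top-level exports.
from automatminer import MatPipe, get_preset_config

pipe = MatPipe()  # all four transformers None -> "express" preset defaults

# Passing only some transformers raises AutomatminerError; pass all four or
# none. To customize one stage, pull a preset and override the rest:
config = get_preset_config("express")
pipe = MatPipe(autofeaturizer=config["autofeaturizer"],
               cleaner=config["cleaner"],
               reducer=config["reducer"],
               learner=config["learner"])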
Code example #2
    def __init__(self, autofeaturizer=None, cleaner=None, reducer=None,
                 learner=None, logger=True, log_level=None):
        transformers = [autofeaturizer, cleaner, reducer, learner]
        if not all(transformers):
            if any(transformers):
                raise AutomatminerError("Please specify all dataframe"
                                        "transformers (autofeaturizer, learner,"
                                        "reducer, and cleaner), or none (to use"
                                        "default).")
            else:
                config = get_preset_config("production")
                autofeaturizer = config["autofeaturizer"]
                cleaner = config["cleaner"]
                reducer = config["reducer"]
                learner = config["learner"]

        self._logger = self.get_logger(logger, level=log_level)
        self.autofeaturizer = autofeaturizer
        self.cleaner = cleaner
        self.reducer = reducer
        self.learner = learner
        self.autofeaturizer._logger = self.get_logger(logger)
        self.cleaner._logger = self.get_logger(logger)
        self.reducer._logger = self.get_logger(logger)
        self.learner._logger = self.get_logger(logger)
        self.pre_fit_df = None
        self.post_fit_df = None
        self.is_fit = False
        self.ml_type = None
Code example #3
File: base.py Project: zhigangmei/automatminer
    def predict(self,
                df: pd.DataFrame,
                target: str,
                output_col=None) -> pd.DataFrame:
        """
        Predict the target property of materials given a df of features. This
        base method is widely applicable across different AutoML backends.

        The predictions are appended to the dataframe in a column named
        according to output_col; the default is "{target} predicted".

        Args:
            df (pandas.DataFrame): Contains all features needed for ML (i.e.,
                all features contained in the training dataframe).
            target (str): The property to be predicted. Should match the target
                used for fitting. May or may not be present in the argument
                dataframe.
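            output_col (str, None): The name of the column where predictions
                are stored. If None, defaults to "{target} predicted".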

        Returns:
            (pandas.DataFrame): The argument dataframe plus a column containing
                the predictions of the target.

        """
        if target != self.fitted_target:
            raise AutomatminerError(
                "Argument dataframe target ({}) is different from the fitted "
                "dataframe target! ({})".format(target, self.fitted_target))
        elif not all([f in df.columns for f in self.features]):
            in_model_not_in_df = [f for f in self.features if f not in df.columns]
            in_df_not_in_model = [f for f in df.columns if f not in self.features]
            raise AutomatminerError(
                "Features used to build model are different from df columns! "
                "Features located in model not located in df: \n{} \n "
                "Features located in df not in model: \n{}"
                "".format(in_model_not_in_df, in_df_not_in_model))
        else:
            X = df[self.features].values  # rectify feature order
            y_pred = self.best_pipeline.predict(X)
            df[output_col or (target + " predicted")] = y_pred

            log_msg = "Prediction finished successfully."
            try:
                logger.info(self._log_prefix + log_msg)
            except AttributeError:
                pass
            return df
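A hedged usage sketch for predict; learner, train_df, test_df, and the target name "gap" are hypothetical placeholders, and fit(df, target) is assumed to follow the signature used elsewhere in automatminer:

# Hypothetical fitted adaptor; fit is assumed to accept (df, target).
learner.fit(train_df, target="gap")
predicted = learner.predict(test_df, target="gap", output_col="gap_pred")
print(predicted["gap_pred"].head())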
Code example #4
File: adaptors.py Project: shizhe1/automatminer
    def predict(self, df, target):
        """
        Predict the target property of materials given a df of features.

        The predictions are appended to the dataframe in a column called:
            "{target} predicted"

        Args:
            df (pandas.DataFrame): Contains all features needed for ML (i.e.,
                all features contained in the training dataframe).
            target (str): The property to be predicted. Should match the target
                used for fitting. May or may not be present in the argument
                dataframe.

        Returns:
            (pandas.DataFrame): The argument dataframe plus a column containing
                the predictions of the target.

        """
        if target != self.fitted_target:
            raise AutomatminerError(
                "Argument dataframe target {} is different "
                "from the fitted dataframe target! {}"
                "".format(target, self.fitted_target))
        elif not all([f in df.columns for f in self._features]):
            in_model_not_in_df = [f for f in self._features if f not in df.columns]
            in_df_not_in_model = [f for f in df.columns if f not in self._features]
            raise AutomatminerError(
                "Features used to build model are different"
                " from df columns! Features located in "
                "model not located in df: \n{} \n Features "
                "located in df not in model: \n{}"
                "".format(in_model_not_in_df, in_df_not_in_model))
        else:
            X = df[self._features].values  # rectify feature order
            y_pred = self._backend.predict(X)
            df[target + " predicted"] = y_pred
            self.logger.info(self._log_prefix +
                             "Prediction finished successfully.")
            return df
Code example #5
    def transform(self, X, y=None):
        """
        Transforms the data with the subset of features determined after
            calling the fit method on the data.

        Args:
            X (pandas.DataFrame): input data; note that a numpy array is NOT
                accepted, since X.columns is used for feature names
            y (placeholder): ignored input (for consistency in notation)

        Returns (pandas.DataFrame): the data with reduced number of features.
        """
        if self.selected_features is None:
            raise AutomatminerError('The fit method should be called first!')
        return X[self.selected_features]
Code example #6
    def fit(self, X, y, tree="rf", recursive=True, cv=5):
        """
        Fits to the data (X) and target (y) to determine the selected_features.

        Args:
            X (pandas.DataFrame): input data; note that a numpy array is NOT
                accepted, since X.columns is used for feature names
            y (pandas.Series or np.ndarray): list of outputs used for fitting
                the tree model
            tree (str or instantiated sklearn tree-based model): if a model is
                directly fed, it must have the .feature_importances_ attribute
            recursive (bool): whether to recursively reduce the features (True)
                or just do it once (False)
            cv (int or CrossValidation): sklearn's cross-validation with the
                same options (int or actual instantiated CrossValidation)

        Returns (self):
            sets the class attribute .selected_features
        """
        m0 = len(X.columns)
        if isinstance(tree, str):
            if tree.lower() in ["rf", "random forest", "randomforest"]:
                if self.mode.lower() in ["classification", "classifier"]:
                    tree = RandomForestClassifier(random_state=self.rs)
                else:
                    tree = RandomForestRegressor(random_state=self.rs)
            elif tree.lower() in ["gb", "gbt", "gradient boosting"]:
                if self.mode.lower() in ["classification", "classifier"]:
                    tree = GradientBoostingClassifier(random_state=self.rs)
                else:
                    tree = GradientBoostingRegressor(random_state=self.rs)
            else:
                raise AutomatminerError(
                    "Unsupported tree_type {}!".format(tree))

        cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree))
        all_feats = []
        for train, _ in cv.split(X, y, groups=None):
            Xtrn = X.iloc[train]
            ytrn = y.iloc[train]
            all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive)
        # take the union of selected features of each fold
        self.selected_features = list(set(all_feats))
        logger.info(
            self._log_prefix +
            "Finished tree-based feature reduction of {} initial features to "
            "{}".format(m0, len(self.selected_features)))
        return self
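A toy fit/transform sketch for the reducer above; the class name TreeFeatureReducer, its import path, and the mode keyword are assumptions:

import numpy as np
import pandas as pd
from automatminer.preprocessing.feature_selection import TreeFeatureReducer  # assumed path

X = pd.DataFrame(np.random.rand(50, 10),
                 columns=["f{}".format(i) for i in range(10)])
y = pd.Series(2 * X["f0"] + X["f1"])  # target driven mostly by f0 and f1

reducer = TreeFeatureReducer(mode="regression")
reducer.fit(X, y, tree="rf", recursive=False, cv=3)
X_reduced = reducer.transform(X)  # keeps only reducer.selected_features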
Code example #7
    def transform(self, df, target):
        """
        Apply the sequence of preprocessing steps determined by fit, with the
        option to change the na_method for samples.

        Args:
            df (pandas.DataFrame): Contains features and the target_key
            target (str): The name of the target in the dataframe

        Returns (pandas.DataFrame)
        """
        self.logger.info(self._log_prefix +
                         "Cleaning with respect to samples with sample "
                         "na_method '{}'".format(self.na_method_transform))

        if target != self.fitted_target:
            raise AutomatminerError(
                "The transformation target {} is not the same as the fitted "
                "target {}".format(target, self.fitted_target))

        # We assume the two targets are the same from here on out
        df = self.to_numerical(df, target)
        df = self.handle_na(df,
                            target,
                            self.na_method_transform,
                            coerce_mismatch=True)

        # Ensure the order of columns is identical
        if target in df.columns:
            self.logger.info(self._log_prefix + "Reordering columns...")
            df = df[self.fitted_df.columns]
        else:
            self.logger.info(self._log_prefix +
                             "Target not found in df columns. Ignoring...")
            reordered_cols = self.fitted_df.drop(columns=[target]).columns
            df = df[reordered_cols]
        return df
Code example #8
File: ml.py Project: shizhe1/automatminer
def is_greater_better(scoring_function) -> bool:
    """
    Determines whether scoring_function being greater is more favorable/better.
    Args:
        scoring_function (str): the name of the scoring function supported by
            TPOT and sklearn. Please see below for more information.
    Returns (bool): True if a larger value of the scoring metric is better,
        False if a smaller value is better.
    """
    desired_high_metrics = {
        'accuracy', 'adjusted_rand_score', 'average_precision',
        'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples',
        'f1_weighted', 'precision', 'precision_macro', 'precision_micro',
        'precision_samples', 'precision_weighted', 'recall', 'recall_macro',
        'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'r2',
        'r2_score', 'neg_median_absolute_error', 'neg_mean_absolute_error',
        'neg_mean_squared_error'
    }

    desired_low_metrics = {
        'median_absolute_error', 'mean_absolute_error', 'mean_squared_error'
    }

    # Check to ensure no metrics are accidentally placed in both sets
    if desired_high_metrics.intersection(desired_low_metrics):
        raise AutomatminerError("Error, there is a metric in both desired"
                                " high and desired low metrics")

    if scoring_function not in desired_high_metrics \
            and scoring_function not in desired_low_metrics:
        warnings.warn(
            'The scoring_function: "{}" not found; continuing assuming'
            ' greater score is better'.format(scoring_function))

    # True if not in either set or only in desired_high,
    # False if in desired_low or both sets
    return scoring_function not in desired_low_metrics
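The behavior implied by the sets above, as a few assertions:

assert is_greater_better("r2") is True
assert is_greater_better("neg_mean_squared_error") is True  # negated metric: higher is better
assert is_greater_better("mean_squared_error") is False
is_greater_better("my_custom_metric")  # unknown metric: warns, then returns True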
Code example #9
    def fit(self, df, target):
        """
        Determine a sequence of preprocessing steps to clean a dataframe.

        Args:
            df (pandas.DataFrame): Contains features and the target_key
            target (str): The name of the target in the dataframe

        Returns: self
        """

        self.logger.info(self._log_prefix +
                         "Cleaning with respect to samples with sample "
                         "na_method '{}'".format(self.na_method_fit))
        if target not in df.columns:
            raise AutomatminerError(
                "Target {} must be contained in df.".format(target))

        self._reset_attrs()
        df = self.to_numerical(df, target)
        df = self.handle_na(df, target, self.na_method_fit)
        self.fitted_df = df
        self.fitted_target = target
        return self
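A fit-then-transform sketch tying examples #7 and #9 together; the import path, train_df/test_df, and the target name are hypothetical:

from automatminer.preprocessing import DataCleaner  # assumed import path

cleaner = DataCleaner()
cleaner.fit(train_df, "gap")                    # learn cleaning steps on train
clean_test = cleaner.transform(test_df, "gap")  # replay the same steps on test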
Code example #10
    def handle_na(self, df, target, na_method, coerce_mismatch=True):
        """
        First pass for handling cells without values (null or nan). Additional
        preprocessing may be necessary as one column may be filled with
        median while the other with mean or mode, etc.

        Args:
            df (pandas.DataFrame): The dataframe containing features
            target (str): The key defining the ML target.
            na_method (str): How to deal with samples still containing nans
                after troublesome columns are already dropped. Default is
                'drop'. Other options are from pandas.DataFrame.fillna:
                {'bfill', 'pad', 'ffill'}, or 'ignore' to ignore nans.
                Alternatively, specify a value to replace the nans, e.g. 0.
            coerce_mismatch (bool): If there is a mismatch between the fitted
                dataframe columns and the argument dataframe columns, create
                and drop mismatch columns so the dataframes are matching. If
                False, raises an error. New columns are instantiated as all
                zeros, as most of the time this is a onehot encoding issue.

        Returns:
            (pandas.DataFrame) The cleaned df
        """
        self.logger.info(self._log_prefix +
                         "Before handling na: {} samples, {} features"
                         "".format(*df.shape))

        # Drop targets containing na before further processing
        if self.drop_na_targets and target in df.columns:
            clean_df = df.dropna(axis=0, how='any', subset=[target])
            self.dropped_samples = df[~df.index.isin(clean_df.index)]
            self.logger.info(
                self._log_prefix +
                "{} samples did not have target values. They were "
                "dropped.".format(len(self.dropped_samples)))
            df = clean_df

        # Remove features failing the max_na_frac limit
        feats0 = set(df.columns)
        if not self.is_fit:
            self.logger.info(self._log_prefix +
                             "Handling feature na by max na threshold of {} "
                             "with method '{}'.".format(
                                 self.max_na_frac, self.feature_na_method))
            threshold = int((1 - self.max_na_frac) * len(df))
            if self.feature_na_method == "drop":
                df = df.dropna(axis=1, thresh=threshold)
            else:
                df = df.dropna(axis=1, thresh=1)
                problem_cols = df.columns[
                    df.isnull().mean() > self.max_na_frac]
                dfp = df[problem_cols]
                if self.feature_na_method == "fill":
                    dfp = dfp.fillna(method="ffill")
                    dfp = dfp.fillna(method="bfill")
                elif self.feature_na_method == "mean":
                    # Take the mean of all numeric columns
                    dfpn = dfp[[
                        ncol for ncol in dfp.columns
                        if ncol in self.number_cols
                    ]]
                    dfpn = dfpn.fillna(value=dfpn.mean())
                    dfp[dfpn.columns] = dfpn

                    # Simply fill one hot encoded columns
                    dfp = dfp.fillna(method="ffill")
                    dfp = dfp.fillna(method="bfill")
                else:
                    dfp = dfp.fillna(value=self.feature_na_method)
                df[problem_cols] = dfp

            if len(df.columns) < len(feats0):
                feats = set(df.columns)
                n_feats = len(feats0) - len(feats)
                napercent = self.max_na_frac * 100
                feat_names = feats0 - feats
                self.logger.info(
                    self._log_prefix +
                    'These {} features were removed as they had more '
                    'than {}% missing values: {}'.format(
                        n_feats, napercent, feat_names))
        else:
            mismatch = compare_columns(self.fitted_df, df, ignore=target)
            if mismatch["mismatch"]:
                self.logger.warning(self._log_prefix +
                                    "Mismatched columns found in dataframe "
                                    "used for fitting and argument dataframe.")
                if coerce_mismatch:
                    self.logger.warning(self._log_prefix +
                                        "Coercing mismatched columns...")
                    if mismatch["df1_not_in_df2"]:  # in fitted, not in arg
                        self.logger.warning(
                            self._log_prefix +
                            "Assuming missing columns in argument df are "
                            "one-hot encoding issues. Setting to zero the "
                            "following new columns:\n{}".format(
                                mismatch["df1_not_in_df2"]))
                        for c in self.fitted_df.columns:
                            if c not in df.columns and c != target:
                                # Interpret as one-hot problems...
                                df[c] = np.zeros((df.shape[0]))
                    if mismatch["df2_not_in_df1"]:  # arg cols not in fitted
                        self.logger.warning(
                            self._log_prefix +
                            "Following columns are being dropped:\n{}".format(
                                mismatch["df2_not_in_df1"]))
                        df = df.drop(columns=mismatch["df2_not_in_df1"])
                else:
                    raise AutomatminerError(
                        "Mismatch between columns found in "
                        "arg dataframe and dataframe used "
                        "for fitting!")

            # handle the case where all samples of the transformed df are nan
            # but the feature is required by the fitted input df, and there is
            # no way to impute by samples or drop...
            nan_cols = [c for c in df.columns if df[c].isna().all()]
            if nan_cols:
                self.logger.error(
                    self._log_prefix + "Columns {} are all nan "
                    "in transform df but are required by the fit "
                    "df. Using mean values of fitted df to "
                    "impute transformed df. This may result in "
                    "highly erroenous imputed values!"
                    "".format(nan_cols))
                for col in nan_cols:
                    mean_val = self.fitted_df[col].mean()
                    df[col] = [mean_val] * df.shape[0]

        self.dropped_features = [
            c for c in feats0 if c not in df.columns.values
        ]

        # Handle all rows that still contain any nans
        if na_method == "drop":
            clean_df = df.dropna(axis=0, how='any')
            self.dropped_samples = pd.concat(
                (df[~df.index.isin(clean_df.index)], self.dropped_samples),
                axis=0,
                sort=True)
            df = clean_df
        elif na_method == "ignore":
            pass
        elif na_method == "fill":
            df = df.fillna(method="ffill")
            df = df.fillna(method="bfill")
        elif na_method == "mean":
            # na values in numeric columns are replaced with the column mean
            dfn = df[[ncol for ncol in df.columns if ncol in self.number_cols]]
            dfn = dfn.fillna(value=dfn.mean())
            df[dfn.columns] = dfn

            # the rest are simply filled
            df = df.fillna(method="ffill")
            df = df.fillna(method="bfill")
        else:
            df = df.fillna(value=na_method)
        self.logger.info(self._log_prefix +
                         "After handling na: {} samples, {} features".format(
                             *df.shape))
        return df
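The feature na budget and row filling above, restated as a standalone pandas sketch with toy data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, np.nan, 4.0],
                   "b": [np.nan, np.nan, np.nan, 1.0],  # 75% na
                   "c": [1.0, 2.0, 3.0, 4.0]})
max_na_frac = 0.5
threshold = int((1 - max_na_frac) * len(df))  # min non-na values per feature
df = df.dropna(axis=1, thresh=threshold)      # drops "b" (too many na)
df = df.fillna(method="ffill").fillna(method="bfill")  # na_method="fill" path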
Code example #11
File: core.py Project: kmu/automatminer
    def transform(self, df, target):
        """
        Decorate a dataframe containing composition, structure, bandstructure,
        and/or DOS objects with descriptors.

        Args:
            df (pandas.DataFrame): The dataframe, not yet containing features.
            target (str): The ML-target property contained in the df.

        Returns:
            df (pandas.DataFrame): Transformed dataframe containing features.
        """
        if self.cache_src and os.path.exists(self.cache_src):
            self.logger.debug(self._log_prefix +
                              "Reading cache_src {}".format(self.cache_src))
            cached_df = pd.read_json(self.cache_src)
            if not all([loc in cached_df.index for loc in df.index]):
                raise AutomatminerError("Feature cache does not contain all "
                                        "entries (by DataFrame index) needed "
                                        "to transform the input df.")
            else:
                cached_subdf = cached_df.loc[df.index]
                if target in cached_subdf.columns:
                    if target not in df.columns:
                        self.logger.warning(
                            self._log_prefix +
                            "Target not present in both cached df and input df."
                            " Cannot perform comparison to ensure index match."
                        )
                    else:
                        cached_targets = cached_subdf[target]
                        input_targets = df[target]
                        cached_type = regression_or_classification(
                            cached_targets)
                        input_type = regression_or_classification(
                            input_targets)
                        if cached_type != input_type:
                            raise AutomatminerError(
                                "Cached targets appear to be '{}' type, while "
                                "input targets appear to be '{}'."
                                "".format(cached_type, input_type))

                        problems = {}
                        for ix in input_targets.index:
                            iv = input_targets[ix]
                            cv = cached_targets[ix]
                            if iv != cv:
                                try:
                                    if not math.isclose(iv, cv):
                                        problems[ix] = [iv, cv]
                                except TypeError:
                                    pass
                        if problems:
                            self.logger.warning(
                                self._log_prefix +
                                "Mismatch between cached targets and input "
                                "targets: \n{}".format(problems))

                self.logger.info(
                    self._log_prefix +
                    "Restored {} features on {} samples from "
                    "cache {}".format(len(cached_subdf.columns), len(df.index),
                                      self.cache_src))
                return cached_subdf
        else:
            transforming_on_fitted = df is self.fitted_input_df
            df = self._prescreen_df(df, inplace=True)

            if transforming_on_fitted:
                df = self.converted_input_df
            else:
                df = self._add_composition_from_structure(df)

            for featurizer_type, featurizers in self.featurizers.items():
                if featurizer_type in df.columns:
                    if not transforming_on_fitted:
                        df = self._tidy_column(df, featurizer_type)

                    for f in featurizers:
                        self.logger.info(self._log_prefix +
                                         "Featurizing with {}."
                                         "".format(f.__class__.__name__))
                        df = f.featurize_dataframe(
                            df,
                            featurizer_type,
                            ignore_errors=self.ignore_errors,
                            multiindex=self.multiindex,
                            inplace=False)
                    if self.drop_inputs:
                        df = df.drop(columns=[featurizer_type])
                else:
                    self.logger.info(
                        self._log_prefix +
                        "Featurizer type {} not in the dataframe. "
                        "Skipping...".format(featurizer_type))
            if self.functionalize:
                ff = FunctionFeaturizer()
                cols = df.columns.tolist()
                for ft in self.featurizers.keys():
                    if ft in cols:
                        cols.remove(ft)
                df = ff.fit_featurize_dataframe(
                    df,
                    cols,
                    ignore_errors=self.ignore_errors,
                    multiindex=self.multiindex,
                    inplace=False)
            if self.cache_src and not os.path.exists(self.cache_src):
                df.to_json(self.cache_src)
            return df
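A usage sketch; the import path, the fit-before-transform requirement, and the toy data are assumptions consistent with the code above:

import pandas as pd
from automatminer.featurization import AutoFeaturizer  # assumed import path

df = pd.DataFrame({"composition": ["Fe2O3", "SiO2"], "gap": [2.2, 8.9]})
af = AutoFeaturizer(preset="express", n_jobs=1)
af.fit(df, target="gap")           # fit before transform
df_feat = af.transform(df, "gap")  # "composition" replaced by feature columns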
Code example #12
File: core.py Project: kmu/automatminer
    def __init__(self,
                 cache_src=None,
                 preset=None,
                 featurizers=None,
                 exclude=None,
                 functionalize=False,
                 ignore_cols=None,
                 ignore_errors=True,
                 drop_inputs=True,
                 guess_oxistates=True,
                 multiindex=False,
                 do_precheck=True,
                 n_jobs=None,
                 logger=True,
                 composition_col="composition",
                 structure_col="structure",
                 bandstructure_col="bandstructure",
                 dos_col="dos"):

        if featurizers and preset:
            raise AutomatminerError("Featurizers and preset were both set. "
                                    "Please either use a preset ('express', "
                                    "'all', 'debug', 'heavy') or set "
                                    "featurizers manually.")
        if not featurizers and not preset:
            raise AutomatminerError("Please specify set(s) of featurizers to "
                                    "use either through the featurizers"
                                    "argument or through the preset argument.")

        self.cache_src = cache_src
        self.preset = "express" if preset is None else preset
        self._logger = self.get_logger(logger)
        self.featurizers = featurizers
        self.exclude = exclude if exclude else []
        self.functionalize = functionalize
        self.ignore_cols = ignore_cols or []
        self.is_fit = False
        self.fitted_input_df = None
        self.converted_input_df = None
        self.ignore_errors = ignore_errors
        self.drop_inputs = drop_inputs
        self.multiindex = multiindex
        self.do_precheck = do_precheck
        self.n_jobs = n_jobs
        self.guess_oxistates = guess_oxistates
        self.features = []
        self.auto_featurizer = True if self.featurizers is None else False
        self.removed_featurizers = None
        self.composition_col = composition_col
        self.structure_col = structure_col
        self.bandstruct_col = bandstructure_col
        self.dos_col = dos_col

        _supported_featurizers = {
            composition_col: CompositionFeaturizers,
            structure_col: StructureFeaturizers,
            bandstructure_col: BSFeaturizers,
            dos_col: DOSFeaturizers
        }

        # user-set featurizers
        if self.featurizers:
            if not isinstance(self.featurizers, dict):
                raise TypeError("Featurizers must be a dictionary with keys"
                                "matching your {}".format(_COMMON_COL_ERR_STR))

            invalid_ftypes = [
                f for f in self.featurizers.keys()
                if f not in _supported_featurizers.keys()
            ]
            if invalid_ftypes:
                raise KeyError(
                    "The following keys were specified as featurizer"
                    " types but were not set in {}: {}"
                    "".format(_COMMON_COL_ERR_STR, invalid_ftypes))

            for ftype, fset in self.featurizers.items():
                _allowed = [
                    f.__class__.__name__
                    for f in _supported_featurizers[ftype]().all
                ]
                for f in fset:
                    if f.__class__.__name__ not in _allowed:
                        raise ValueError(
                            "The {} featurizer {} is not supported by "
                            "AutoFeaturizer. Try updating your version of "
                            "automatminer and matminer.".format(ftype, f))

        # auto-set featurizers
        else:
            featurizers = dict()
            for featurizer_type in _supported_featurizers.keys():
                featurizer_set = _supported_featurizers[featurizer_type]
                featurizers[featurizer_type] = getattr(
                    featurizer_set(exclude=self.exclude), self.preset)
            self.featurizers = featurizers

        # Check if any featurizers need fitting (useful for MatPipe)
        needs_fit = False
        fittable_fs = StructureFeaturizers().need_fit
        self.fittable_fcls = set([f.__class__.__name__ for f in fittable_fs])

        # Currently structure featurizers are the only featurizer types which
        # can be fittable
        for f in self.featurizers[self.structure_col]:
            if f.__class__.__name__ in self.fittable_fcls:
                needs_fit = True
                break
        self.needs_fit = needs_fit

        if self.needs_fit and self.cache_src:
            self.logger.warning(self._log_prefix +
                                "Using cached features on fittable featurizers! "
                                "Please make sure you are not benchmarking with "
                                "these options enabled; it is likely you will be "
                                "leaking data (i.e., features) from the testing "
                                "sets into the training.")

        if self.cache_src and "json" not in self.cache_src.lower():
            raise ValueError("The cache_src filename does not contain json."
                             "JSON is the required file type for featurizer"
                             "caching.")

        self.min_precheck_frac = 0.9
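Construction rules implied by the checks above, as a short sketch (import path assumed as before):

from automatminer.featurization import AutoFeaturizer  # assumed import path

af = AutoFeaturizer(preset="express")  # ok: featurizer sets chosen by preset
# AutoFeaturizer()                                     -> AutomatminerError (neither set)
# AutoFeaturizer(preset="debug", featurizers={...})    -> AutomatminerError (both set)
# AutoFeaturizer(preset="express", cache_src="f.txt")  -> ValueError (cache must be json)
af = AutoFeaturizer(preset="express", cache_src="feats.json")  # caching enabled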
Code example #13
File: pipeline.py Project: zhigangmei/automatminer
    def benchmark(self,
                  df,
                  target,
                  kfold,
                  fold_subset=None,
                  cache=False,
                  ignore=None):
        """
        If the target property is known for all data, perform an ML benchmark
        using MatPipe. Used for getting an idea of how well AutoML can predict
        a certain target property.

        MatPipe benchmarks with a nested cross validation, meaning it makes
        k validation/test splits, where all model selection is done on the
        train/validation set (a typical CV). When the model is done validating,
        it is used to predict the previously unseen test set data. This process
        is repeated for each of the k folds, which (1) prevents the benchmark
        from being biased by the choice of a single test set and (2) better
        estimates the generalization error than a single validation/test split.

        tl;dr: Put in a dataset and kfold scheme for nested CV, get out the
        predicted test sets.

        Note: MatPipes after benchmarking have been fit on the last fold, not
        the entire dataset. To use your entire dataset for prediction, use the
        MatPipe fit and predict methods.

        Args:
            df (pandas.DataFrame): The dataframe for benchmarking. Must contain
                the target property.
            target (str): The column name to use as the ml target property.
            kfold (sklearn KFold or StratifiedKFold): The cross validation split
                object to use for nested cross validation. Used to index the
                dataframe with .iloc, NOT .loc.
            fold_subset ([int]): A subset of the folds in kfold to evaluate (by
                index). For example, to run only the 3rd train/validation/test
                split of the kfold, set fold_subset = [2]. To use the first and
                fourth, set fold_subset = [0, 3].
            cache (bool): If True, pre-featurizes the entire dataframe
                (including test data!) and caches it before iterating over
                folds. Do NOT use if you are using fittable featurizers whose
                feature labels are based on their input! Doing so may "leak"
                information from the testing set to the training set and will
                over-represent your benchmark. Enabling this for featurizers
                which are not fittable is completely safe. Note that your
                autofeaturizer must have a cache_src defined if caching is
                enabled (do this either through the AutoFeaturizer class or
                using the cache_src argument to get_preset_config).
            ignore ([str], None): Ignore columns during prediction for each
                outer fold. See .predict --> ignore argument for more details.

        Returns:
            results ([pd.DataFrame]): Dataframes containing each fold's
                known targets, as well as their independently predicted targets.
        """
        cache_src = self.autofeaturizer.cache_src
        if cache_src and cache:
            if os.path.exists(cache_src):
                logger.warning(
                    "Cache src {} already found! Ensure this featurized data "
                    "matches the df being benchmarked.".format(cache_src))
            logger.warning("Running pre-featurization for caching.")
            self.autofeaturizer.fit_transform(df, target)
        elif cache_src and not cache:
            raise AutomatminerError(
                "Caching was enabled in AutoFeaturizer but not in benchmark. "
                "Either disable caching in AutoFeaturizer or enable it by "
                "passing cache=True to benchmark.")
        elif cache and not cache_src:
            raise AutomatminerError(
                "MatPipe cache is enabled, but no cache_src was defined in "
                "autofeaturizer. Pass the cache_src argument to AutoFeaturizer "
                "or use the cache_src get_preset_config powerup.")
        else:
            logger.debug("No caching being used in AutoFeaturizer or "
                         "benchmark.")

        if not fold_subset:
            fold_subset = list(range(kfold.n_splits))

        logger.warning("Beginning benchmark.")
        results = []
        fold = 0
        for _, test_ix in kfold.split(X=df, y=df[target]):
            if fold in fold_subset:
                logger.info("Training on fold index {}".format(fold))
                # Split, identify, and randomize test set
                test = df.iloc[test_ix].sample(frac=1)
                train = df[~df.index.isin(test.index)].sample(frac=1)
                self.fit(train, target)
                logger.info("Predicting fold index {}".format(fold))
                test = self.predict(test, ignore=ignore)
                results.append(test)
            fold += 1
        return results
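A nested-CV benchmark sketch; df, the target "gap", and the top-level MatPipe import are hypothetical, and the predicted-column name follows example #4:

from sklearn.model_selection import KFold
from automatminer import MatPipe  # assumed top-level export

kf = KFold(n_splits=5, shuffle=True, random_state=0)
pipe = MatPipe()
# Evaluate only the first two of the five outer folds:
fold_dfs = pipe.benchmark(df, target="gap", kfold=kf, fold_subset=[0, 1])
for fold_df in fold_dfs:
    print(fold_df[["gap", "gap predicted"]].head())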