Example #1
    def fit(self, df, target):
        """
        Fit a MatPipe to a dataframe. Once fit, it can be used to predict
        out-of-sample data.

        The dataframe should contain columns having some materials data:
            - compositions
            - structures
            - bandstructures
            - density of states
            - user-defined features

        Any combination of these data types is acceptable.

        Args:
            df (pandas.DataFrame): Pipe will be fit to this dataframe.
            target (str): The column in the dataframe containing the target
                property of interest.

        Returns:
            MatPipe (self)
        """
        self.pre_fit_df = df
        self.ml_type = regression_or_classification(df[target])
        self.logger.info("Problem type is: {}".format(self.ml_type))

        # Fit transformers on training data
        self.logger.info("Fitting MatPipe pipeline to data.")
        df = self.autofeaturizer.fit_transform(df, target)
        df = self.cleaner.fit_transform(df, target)
        df = self.reducer.fit_transform(df, target)
        self.learner.fit(df, target)
        self.logger.info("MatPipe successfully fit.")
        self.post_fit_df = df
        return self
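
A hypothetical usage sketch of the method above (the column names, the tiny
dataset, and the default MatPipe construction are illustrative assumptions;
a real run needs a realistically sized dataset):

    import pandas as pd
    from automatminer import MatPipe

    df = pd.DataFrame({
        "composition": ["Fe2O3", "SiO2", "NaCl"],   # materials data column
        "bulk_modulus": [220.0, 98.0, 24.0],        # target property
    })
    pipe = MatPipe()                     # assumed default construction
    pipe.fit(df, target="bulk_modulus")  # returns the fitted pipe (self)
    # Once fit, out-of-sample prediction is available via pipe.predict(...)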
Example #2
    def rm_correlated(self, df, target, r_max=0.95):
        """
        A feature selection method that removes features which are
        cross-correlated with one another by more than a threshold (r_max).

        Args:
            df (pandas.DataFrame): The dataframe containing the features and
                the target column.
            target (str): the name of the target column/feature
            r_max (0<float<=1): If the correlation between two features
                exceeds this value, the feature with the lower correlation
                to the target is removed.

        Returns (pandas.DataFrame):
            the dataframe with the highly cross-correlated features removed.
        """
        mode = regression_or_classification(df[target])
        corr = abs(df.corr())
        if mode == AMM_REG_NAME:
            corr = corr.sort_values(by=target)
        rm_feats = []
        for feat in corr.columns:
            if feat == target:
                continue
            for idx, corval in zip(corr.index, corr[feat]):
                if np.isnan(corval):
                    break
                if idx == feat or idx in rm_feats:
                    continue
                else:
                    if corval >= r_max:
                        if mode == AMM_REG_NAME:
                            if corr.loc[idx, target] > corr.loc[feat, target]:
                                removed_feat = feat
                            else:
                                removed_feat = idx
                        else:  # mode is classification
                            removed_feat = lower_corr_clf(df, target, feat, idx)
                        if removed_feat not in rm_feats:
                            rm_feats.append(removed_feat)
                            self.logger.debug(
                                self._log_prefix +
                                '"{}" correlates strongly with '
                                '"{}"'.format(feat, idx))
                            self.logger.debug(
                                self._log_prefix +
                                'removing "{}"...'.format(removed_feat))
                        if removed_feat == feat:
                            break
        if len(rm_feats) > 0:
            df = df.drop(rm_feats, axis=1)
            self.logger.info(
                self._log_prefix +
                "{} features removed due to cross correlation more than {}"
                "".format(len(rm_feats), r_max))
            self.logger.debug(
                self._log_prefix +
                "Features removed by cross-correlation were: {}"
                "".format(rm_feats))
        return df
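
The pairwise logic above can be isolated in a simplified standalone sketch
(an illustration of the technique, not the class method itself):

    import pandas as pd

    def rm_correlated_sketch(df, target, r_max=0.95):
        # Of each feature pair correlated above r_max, drop the one that
        # correlates less strongly with the target.
        corr = df.corr().abs()
        feats = [c for c in df.columns if c != target]
        dropped = set()
        for i, f1 in enumerate(feats):
            for f2 in feats[i + 1:]:
                if f1 in dropped or f2 in dropped:
                    continue
                if corr.loc[f1, f2] >= r_max:
                    weaker = (f1 if corr.loc[f1, target] < corr.loc[f2, target]
                              else f2)
                    dropped.add(weaker)
        return df.drop(columns=list(dropped))

    df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 4, 6, 8],  # r(a, b) = 1.0
                       "c": [1, 3, 2, 4], "y": [1.1, 2.0, 2.9, 4.2]})
    reduced = rm_correlated_sketch(df, "y")  # one of a/b is dropped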
Example #3
    def fit(self, df, target, **fit_kwargs):
        """
        Train a TPOTRegressor or TPOTClassifier by fitting on a dataframe.

        Args:
            df (pandas.DataFrame): The df to be used for training.
            target (str): The key used to identify the machine learning target.
            **fit_kwargs: Keyword arguments to be passed to the TPOT backend.
                These arguments must be valid arguments to the TPOTBase class.

        Returns:
            TPOTAdaptor (self)

        """
        # Prevent goofy pandas casting by casting to native
        y = df[target].values
        X = df.drop(columns=target).values

        # Determine learning type based on whether classification or regression
        self.mode = regression_or_classification(df[target])

        mltype_str = "Classifier" if self.mode == AMM_CLF_NAME else "Regressor"
        self.tpot_kwargs["template"] = self.tpot_kwargs.get(
            "template", "Selector-Transformer-{}".format(mltype_str))

        if self.mode == AMM_CLF_NAME:
            self.tpot_kwargs["config_dict"] = self.tpot_kwargs.get(
                "config_dict", TPOT_CLASSIFIER_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "balanced_accuracy"
            self._backend = TPOTClassifier(**self.tpot_kwargs)
        elif self.mode == AMM_REG_NAME:
            self.tpot_kwargs["config_dict"] = self.tpot_kwargs.get(
                "config_dict", TPOT_REGRESSOR_CONFIG)
            if "scoring" not in self.tpot_kwargs:
                self.tpot_kwargs["scoring"] = "neg_mean_absolute_error"
            self._backend = TPOTRegressor(**self.tpot_kwargs)
        else:
            raise ValueError("Learning type {} not recognized as a valid mode "
                             "for {}".format(self.mode,
                                             self.__class__.__name__))
        self._features = df.drop(columns=target).columns.tolist()
        self._fitted_target = target
        self._backend = self._backend.fit(X, y, **fit_kwargs)
        return self
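
The method leans on a "set a default only if the caller did not supply one"
pattern for the TPOT keyword arguments; a minimal standalone illustration
(the values here are illustrative, not defaults from the library):

    # Sketch of the kwargs-defaulting pattern used for "template",
    # "config_dict", and "scoring" above.
    tpot_kwargs = {"generations": 5}
    tpot_kwargs["template"] = tpot_kwargs.get(
        "template", "Selector-Transformer-Regressor")
    if "scoring" not in tpot_kwargs:
        tpot_kwargs["scoring"] = "neg_mean_absolute_error"
    print(tpot_kwargs)  # caller-supplied keys are never overwritten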
Example #4
    def fit(self, df, target, **fit_kwargs):

        # Determine learning type based on whether classification or regression
        self.mode = regression_or_classification(df[target])

        if self.mode == AMM_CLF_NAME:
            self._best_pipeline = self._classifier
        elif self.mode == AMM_REG_NAME:
            self._best_pipeline = self._regressor
        else:
            raise ValueError("Learning type {} not recognized as a valid mode "
                             "for {}".format(self.mode,
                                             self.__class__.__name__))

        # Prevent goofy pandas casting by casting to native
        y = df[target].values.tolist()
        X = df.drop(columns=target).values.tolist()
        self._features = df.drop(columns=target).columns.tolist()
        self._fitted_target = target
        self._best_pipeline.fit(X, y)
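
A minimal standalone sketch of the same classification/regression dispatch,
using scikit-learn estimators as stand-ins (the crude type heuristic and the
estimator choices are simplified assumptions, not the snippet's
regression_or_classification):

    import pandas as pd
    from sklearn.linear_model import LinearRegression, LogisticRegression

    def fit_by_mode(df, target):
        # Pick an estimator based on a rough guess at the problem type.
        y = df[target]
        is_clf = y.dtype == object or set(y.unique()) <= {0, 1}
        model = LogisticRegression() if is_clf else LinearRegression()
        return model.fit(df.drop(columns=target).values, y.values)

    df = pd.DataFrame({"x1": [0.1, 0.4, 0.7, 1.0], "y": [1.0, 2.1, 2.9, 4.2]})
    model = fit_by_mode(df, "y")  # regression branch is selected here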
Example #5
    def test_regression_or_classification(self):
        s = pd.Series(data=["4", "5", "6"])
        self.assertTrue(regression_or_classification(s) == AMM_REG_NAME)

        s = pd.Series(data=[1, 2, 3])
        self.assertTrue(regression_or_classification(s) == AMM_REG_NAME)

        s = pd.Series(data=["a", "b", "c"])
        self.assertTrue(regression_or_classification(s) == AMM_CLF_NAME)

        s = pd.Series(data=["a1", "b", "c"])
        self.assertTrue(regression_or_classification(s) == AMM_CLF_NAME)

        # binary classification
        s = pd.Series(data=[0, 1, 0, 0, 1])
        self.assertTrue(regression_or_classification(s) == AMM_CLF_NAME)

        s = pd.Series(data=[0.0, 1.0, 0.0, 0.0, 1.0])
        self.assertTrue(regression_or_classification(s) == AMM_CLF_NAME)

        s = pd.Series(data=[0, 1, 0, 0, 2])
        self.assertTrue(regression_or_classification(s) == AMM_REG_NAME)
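
These assertions pin down the heuristic: targets convertible to numbers are
regression unless they form a binary 0/1 set, and non-numeric strings are
classification. A simplified re-implementation consistent with the tests
(not the library's actual function; the constant values are assumptions):

    import pandas as pd

    AMM_REG_NAME = "regression"
    AMM_CLF_NAME = "classification"

    def regression_or_classification_sketch(series):
        try:
            numeric = pd.to_numeric(series)
        except (ValueError, TypeError):
            return AMM_CLF_NAME       # non-numeric strings -> classification
        if set(numeric.unique()) <= {0, 1}:
            return AMM_CLF_NAME       # binary 0/1 targets -> classification
        return AMM_REG_NAME

    assert regression_or_classification_sketch(
        pd.Series(["4", "5", "6"])) == AMM_REG_NAME
    assert regression_or_classification_sketch(
        pd.Series([0, 1, 0, 0, 2])) == AMM_REG_NAME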
Example #6
    def fit(self, df, target):
        missing_remove_features = [
            c for c in self._remove_features if c not in df.columns
        ]
        missing_keep_features = [
            c for c in self._keep_features if c not in df.columns
        ]
        for features, name in [(missing_remove_features, 'remove'),
                               (missing_keep_features, 'keep')]:
            if features:
                self.logger.warning(
                    self._log_prefix +
                    "Asked to {} some features that do not exist in the "
                    "dataframe. Skipping the following features:\n{}".format(
                        name, features))

        reduced_df = df
        for r in self.reducers:
            X = df.drop(columns=[target])
            y = df[target]
            if r == "corr":
                reduced_df = self.rm_correlated(df, target,
                                                self.corr_threshold)
                reduced_df = reduced_df.drop(columns=[target])
            elif r == "tree":
                tbfr = TreeFeatureReducer(
                    importance_percentile=self.tree_importance_percentile,
                    mode=regression_or_classification(y),
                    logger=self.logger)
                reduced_df = tbfr.fit_transform(X, y).copy(deep=True)
                self.reducer_params[r] = {
                    "importance_percentile": tbfr.importance_percentile,
                    "mode": tbfr.mode,
                    "random_state": tbfr.rs
                }
            elif r == "rebate":
                if isinstance(self.n_rebate_features, float):
                    self.logger.info(
                        self._log_prefix +
                        "Retaining fraction {} of current {} features.".format(
                            self.n_rebate_features, df.shape[1] - 1))
                    self.n_rebate_features = int(df.shape[1] *
                                                 self.n_rebate_features)
                self.logger.info(
                    self._log_prefix +
                    "ReBATE MultiSURF running: retaining {} numerical "
                    "features.".format(self.n_rebate_features))
                reduced_df = rebate(df,
                                    target,
                                    n_features=self.n_rebate_features)
                reduced_df = reduced_df.copy(deep=True)
                self.logger.info(
                    self._log_prefix +
                    "ReBATE MultiSURF completed: retained {} numerical "
                    "features.".format(len(reduced_df.columns)))
                self.logger.debug(
                    self._log_prefix + "ReBATE MultiSURF gave the following "
                    "features: {}".format(reduced_df.columns.tolist()))
                self.reducer_params[r] = {"algo": "MultiSURF Algorithm"}
            elif r == "pca":
                n_samples, n_features = X.shape
                if self.n_pca_features == "auto":
                    if n_samples < n_features:
                        self.logger.warning(
                            self._log_prefix +
                            "Number of samples ({}) is less than number of "
                            "features ({}). Setting n_pca_features equal to "
                            "n_samples.".format(n_samples, n_features))
                        self._pca = PCA(n_components=n_samples,
                                        svd_solver="full")
                    else:
                        self.logger.info(
                            self._log_prefix +
                            "PCA automatically determining optimal number of "
                            "features using Minka's MLE.")
                        self._pca = PCA(n_components="mle", svd_solver="auto")
                else:
                    if isinstance(self.n_pca_features, float):
                        self.logger.info(
                            self._log_prefix +
                            "Retaining fraction {} of current {} features."
                            "".format(self.n_pca_features, df.shape[1]))
                        self.n_pca_features = int(df.shape[1] *
                                                  self.n_pca_features)
                    if self.n_pca_features > n_samples:
                        self.logger.warning(
                            self._log_prefix +
                            "Number of PCA features interpreted as {}, which is"
                            " more than the number of samples ({}). "
                            "n_pca_features coerced to equal n_samples."
                            "".format(self.n_pca_features, n_samples))
                        self.n_pca_features = n_samples
                    self.logger.info(
                        self._log_prefix +
                        "PCA running: retaining {} numerical features."
                        "".format(self.n_pca_features))
                    self._pca = PCA(n_components=self.n_pca_features,
                                    svd_solver="auto")
                self._pca.fit(X.values, y.values)
                matrix = self._pca.transform(X.values)
                pca_feats = [
                    "PCA {}".format(i) for i in range(matrix.shape[1])
                ]
                self._pca_feats = pca_feats
                reduced_df = pd.DataFrame(columns=pca_feats,
                                          data=matrix,
                                          index=X.index)
                self.logger.info(self._log_prefix +
                                 "PCA completed: retained {} numerical "
                                 "features.".format(len(reduced_df.columns)))

            retained = reduced_df.columns.values.tolist()
            removed = [
                c for c in df.columns.values
                if c not in retained and c != target
            ]

            self.removed_features[r] = removed
            if target not in reduced_df:
                reduced_df.loc[:, target] = y.tolist()
            df = reduced_df

        all_removed = [
            c for r, rf in self.removed_features.items() for c in rf
        ]
        all_kept = [c for c in df.columns.tolist() if c != target]
        save_from_removal = [
            c for c in self._keep_features if c in all_removed
        ]
        for_force_removal = [c for c in self._remove_features if c in all_kept]

        if save_from_removal:
            self.logger.info(self._log_prefix +
                             "Saving features from removal. "
                             "Saved features:\n{}".format(save_from_removal))

        if for_force_removal:
            self.logger.info(
                self._log_prefix + "Forcing removal of features. "
                "Removed features: \n{}".format(for_force_removal))
            self.removed_features['manual'] = for_force_removal

        self.retained_features = [
            c for c in all_kept
            if c not in self._remove_features and c != target
        ]
        return self
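
The PCA branch can be exercised in isolation (a minimal sketch using
scikit-learn directly; the fractional n_pca_features handling and the
"PCA {i}" column naming mirror the method above):

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.rand(10, 5),
                     columns=["f{}".format(i) for i in range(5)])

    n_pca_features = 0.5                      # fraction of features to keep
    n_components = int(X.shape[1] * n_pca_features)
    pca = PCA(n_components=n_components, svd_solver="auto")
    matrix = pca.fit_transform(X.values)
    reduced_df = pd.DataFrame(matrix, index=X.index,
                              columns=["PCA {}".format(i)
                                       for i in range(matrix.shape[1])])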
Example #7
    def transform(self, df, target):
        """
        Decorate a dataframe containing composition, structure, bandstructure,
        and/or DOS objects with descriptors.

        Args:
            df (pandas.DataFrame): The input dataframe, not yet containing
                features.
            target (str): The ML-target property contained in the df.

        Returns:
            df (pandas.DataFrame): Transformed dataframe containing features.
        """
        if self.cache_src and os.path.exists(self.cache_src):
            self.logger.debug(self._log_prefix +
                              "Reading cache_src {}".format(self.cache_src))
            cached_df = pd.read_json(self.cache_src)
            if not all([loc in cached_df.index for loc in df.index]):
                raise AutomatminerError("Feature cache does not contain all "
                                        "entries (by DataFrame index) needed "
                                        "to transform the input df.")
            else:
                cached_subdf = cached_df.loc[df.index]
                if target in cached_subdf.columns:
                    if target not in df.columns:
                        self.logger.warning(
                            self._log_prefix +
                            "Target not present in both cached df and input df."
                            " Cannot perform comparison to ensure index match."
                        )
                    else:
                        cached_targets = cached_subdf[target]
                        input_targets = df[target]
                        cached_type = regression_or_classification(
                            cached_targets)
                        input_type = regression_or_classification(
                            input_targets)
                        if cached_type != input_type:
                            raise AutomatminerError(
                                "Cached targets appear to be '{}' type, while "
                                "input targets appear to be '{}'."
                                "".format(cached_type, input_type))

                        problems = {}
                        for ix in input_targets.index:
                            iv = input_targets[ix]
                            cv = cached_targets[ix]
                            if iv != cv:
                                try:
                                    if not math.isclose(iv, cv):
                                        problems[ix] = [iv, cv]
                                except TypeError:
                                    pass
                        if problems:
                            self.logger.warning(
                                self._log_prefix +
                                "Mismatch between cached targets and input "
                                "targets: \n{}".format(problems))

                self.logger.info(
                    self._log_prefix +
                    "Restored {} features on {} samples from "
                    "cache {}".format(len(cached_subdf.columns), len(df.index),
                                      self.cache_src))
                return cached_subdf
        else:
            transforming_on_fitted = df is self.fitted_input_df
            df = self._prescreen_df(df, inplace=True)

            if transforming_on_fitted:
                df = self.converted_input_df
            else:
                df = self._add_composition_from_structure(df)

            for featurizer_type, featurizers in self.featurizers.items():
                if featurizer_type in df.columns:
                    if not transforming_on_fitted:
                        df = self._tidy_column(df, featurizer_type)

                    for f in featurizers:
                        self.logger.info(self._log_prefix +
                                         "Featurizing with {}."
                                         "".format(f.__class__.__name__))
                        df = f.featurize_dataframe(
                            df,
                            featurizer_type,
                            ignore_errors=self.ignore_errors,
                            multiindex=self.multiindex,
                            inplace=False)
                    if self.drop_inputs:
                        df = df.drop(columns=[featurizer_type])
                else:
                    self.logger.info(
                        self._log_prefix +
                        "Featurizer type {} not in the dataframe. "
                        "Skipping...".format(featurizer_type))
            if self.functionalize:
                ff = FunctionFeaturizer()
                cols = df.columns.tolist()
                for ft in self.featurizers.keys():
                    if ft in cols:
                        cols.remove(ft)
                df = ff.fit_featurize_dataframe(
                    df,
                    cols,
                    ignore_errors=self.ignore_errors,
                    multiindex=self.multiindex,
                    inplace=False)
            if self.cache_src and not os.path.exists(self.cache_src):
                df.to_json(self.cache_src)
            return df
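
The cache short-circuit at the top of transform can be sketched standalone
(a simplified illustration; the function name and error type here are
hypothetical):

    import os
    import pandas as pd

    def load_cached_features(df, cache_src):
        # Return cached features only if every input index exists in the
        # cache; otherwise the cache cannot stand in for featurization.
        if cache_src and os.path.exists(cache_src):
            cached_df = pd.read_json(cache_src)
            if not all(loc in cached_df.index for loc in df.index):
                raise ValueError("Feature cache does not contain all entries "
                                 "needed to transform the input df.")
            return cached_df.loc[df.index]
        return None  # caller falls through to full featurization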