def fit(self, df, target):
    """
    Fit a matpipe to a dataframe. Once fit, it can be used to predict
    out-of-sample data.

    The dataframe should contain columns having some materials data:
        - compositions
        - structures
        - bandstructures
        - density of states
        - user-defined features

    Any combination of these data types is ok.

    Args:
        df (pandas.DataFrame): Pipe will be fit to this dataframe.
        target (str): The column in the dataframe containing the target
            property of interest.

    Returns:
        MatPipe (self)
    """
    self.pre_fit_df = df
    self.ml_type = regression_or_classification(df[target])
    self.logger.info("Problem type is: {}".format(self.ml_type))

    # Fit transformers on training data
    self.logger.info("Fitting MatPipe pipeline to data.")
    df = self.autofeaturizer.fit_transform(df, target)
    df = self.cleaner.fit_transform(df, target)
    df = self.reducer.fit_transform(df, target)
    self.learner.fit(df, target)
    self.logger.info("MatPipe successfully fit.")
    self.post_fit_df = df
    return self
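# Usage sketch (not from this source; assumes the automatminer and matminer
# packages; dataset and column names come from matminer's elastic_tensor_2015
# dataset, and the train/test split is arbitrary):
from automatminer import MatPipe
from matminer.datasets import load_dataset

df = load_dataset("elastic_tensor_2015")[["structure", "K_VRH"]]
pipe = MatPipe.from_preset("express")
pipe.fit(df.iloc[:100], "K_VRH")  # fits featurizer, cleaner, reducer, learner
# Once fit, the pipe can predict on out-of-sample data (the exact predict
# signature varies across automatminer versions):
predictions = pipe.predict(df.iloc[100:150])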
def rm_correlated(self, df, target, r_max=0.95):
    """
    A feature selection method that removes features which are
    cross-correlated with one another by more than a threshold.

    Args:
        df (pandas.DataFrame): The dataframe containing the features and
            the target.
        target (str): The name of the target column/feature.
        r_max (0 < float <= 1): If R is greater than this value, the
            feature of the pair having the lower correlation with the
            target is removed.

    Returns:
        (pandas.DataFrame): The dataframe with the highly cross-correlated
            features removed.
    """
    mode = regression_or_classification(df[target])
    corr = abs(df.corr())
    if mode == AMM_REG_NAME:
        corr = corr.sort_values(by=target)
    rm_feats = []
    for feat in corr.columns:
        if feat == target:
            continue
        for idx, corval in zip(corr.index, corr[feat]):
            if np.isnan(corval):
                break
            if idx == feat or idx in rm_feats:
                continue
            if corval >= r_max:
                if mode == AMM_REG_NAME:
                    if corr.loc[idx, target] > corr.loc[feat, target]:
                        removed_feat = feat
                    else:
                        removed_feat = idx
                else:  # mode is classification
                    removed_feat = lower_corr_clf(df, target, feat, idx)
                if removed_feat not in rm_feats:
                    rm_feats.append(removed_feat)
                    self.logger.debug(
                        self._log_prefix +
                        '"{}" correlates strongly with '
                        '"{}"'.format(feat, idx))
                    self.logger.debug(
                        self._log_prefix +
                        'removing "{}"...'.format(removed_feat))
                if removed_feat == feat:
                    break
    if len(rm_feats) > 0:
        df = df.drop(rm_feats, axis=1)
        self.logger.info(
            self._log_prefix +
            "{} features removed due to cross correlation greater than {}"
            "".format(len(rm_feats), r_max))
        self.logger.debug(
            self._log_prefix +
            "Features removed by cross-correlation were: {}"
            "".format(rm_feats))
    return df
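# Toy demonstration of the selection rule above (a sketch, not from this
# source; assumes automatminer's FeatureReducer exposes rm_correlated):
import numpy as np
import pandas as pd
from automatminer.preprocessing import FeatureReducer

rng = np.random.RandomState(0)
x = rng.rand(100)
toy = pd.DataFrame({
    "feat_a": x,
    "feat_b": x + 0.001 * rng.rand(100),  # nearly identical to feat_a
    "my_target": 2.0 * x + 0.05 * rng.rand(100),
})
reducer = FeatureReducer(reducers=("corr",))
slim = reducer.rm_correlated(toy, "my_target", r_max=0.95)
# Of feat_a/feat_b, only the one with higher |R| against my_target survives.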
def fit(self, df, target, **fit_kwargs):
    """
    Train a TPOTRegressor or TPOTClassifier by fitting on a dataframe.

    Args:
        df (pandas.DataFrame): The df to be used for training.
        target (str): The key used to identify the machine learning target.
        **fit_kwargs: Keyword arguments to be passed to the TPOT backend.
            These arguments must be valid arguments to the TPOTBase class.

    Returns:
        TPOTAdaptor (self)
    """
    # Prevent goofy pandas casting by casting to native
    y = df[target].values
    X = df.drop(columns=target).values

    # Determine learning type based on whether classification or regression
    self.mode = regression_or_classification(df[target])

    mltype_str = "Classifier" if self.mode == AMM_CLF_NAME else "Regressor"
    self.tpot_kwargs["template"] = self.tpot_kwargs.get(
        "template", "Selector-Transformer-{}".format(mltype_str))

    if self.mode == AMM_CLF_NAME:
        self.tpot_kwargs["config_dict"] = self.tpot_kwargs.get(
            "config_dict", TPOT_CLASSIFIER_CONFIG)
        if "scoring" not in self.tpot_kwargs:
            self.tpot_kwargs["scoring"] = "balanced_accuracy"
        self._backend = TPOTClassifier(**self.tpot_kwargs)
    elif self.mode == AMM_REG_NAME:
        self.tpot_kwargs["config_dict"] = self.tpot_kwargs.get(
            "config_dict", TPOT_REGRESSOR_CONFIG)
        if "scoring" not in self.tpot_kwargs:
            self.tpot_kwargs["scoring"] = "neg_mean_absolute_error"
        self._backend = TPOTRegressor(**self.tpot_kwargs)
    else:
        raise ValueError("Learning type {} not recognized as a valid mode "
                         "for {}".format(self.mode,
                                         self.__class__.__name__))
    self._features = df.drop(columns=target).columns.tolist()
    self._fitted_target = target
    self._backend = self._backend.fit(X, y, **fit_kwargs)
    return self
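# Usage sketch (assumes automatminer's TPOTAdaptor; constructor kwargs such
# as max_time_mins and population_size are forwarded to TPOT; data here is
# synthetic):
import pandas as pd
from sklearn.datasets import make_regression
from automatminer.automl.adaptors import TPOTAdaptor

X, y = make_regression(n_samples=100, n_features=5, random_state=0)
df = pd.DataFrame(X, columns=["f{}".format(i) for i in range(5)])
df["target"] = y
learner = TPOTAdaptor(max_time_mins=2, population_size=10)
learner.fit(df, "target")  # infers regression, builds a TPOTRegressor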
def fit(self, df, target, **fit_kwargs):
    """
    Fit the pre-chosen pipeline (classifier or regressor, selected by the
    inferred problem type) on a dataframe.

    Args:
        df (pandas.DataFrame): The df to be used for training.
        target (str): The key used to identify the machine learning target.

    Returns:
        self
    """
    # Determine learning type based on whether classification or regression
    self.mode = regression_or_classification(df[target])
    if self.mode == AMM_CLF_NAME:
        self._best_pipeline = self._classifier
    elif self.mode == AMM_REG_NAME:
        self._best_pipeline = self._regressor
    else:
        raise ValueError("Learning type {} not recognized as a valid mode "
                         "for {}".format(self.mode,
                                         self.__class__.__name__))

    # Prevent goofy pandas casting by casting to native
    y = df[target].values.tolist()
    X = df.drop(columns=target).values.tolist()
    self._features = df.drop(columns=target).columns.tolist()
    self._fitted_target = target
    self._best_pipeline.fit(X, y)
    # Return self for consistency with the other adaptors' fit methods
    return self
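# Usage sketch (assumes automatminer's SinglePipelineAdaptor takes one
# regressor and one classifier and selects between them by the inferred
# problem type, as in the fit above; `df` is a featurized dataframe such as
# the synthetic one in the previous sketch):
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from automatminer.automl.adaptors import SinglePipelineAdaptor

learner = SinglePipelineAdaptor(
    regressor=RandomForestRegressor(n_estimators=200),
    classifier=RandomForestClassifier(n_estimators=200),
)
learner.fit(df, "target")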
def test_regression_or_classification(self):
    # Numeric (or numeric-castable) targets are treated as regression
    s = pd.Series(data=["4", "5", "6"])
    self.assertEqual(regression_or_classification(s), AMM_REG_NAME)

    s = pd.Series(data=[1, 2, 3])
    self.assertEqual(regression_or_classification(s), AMM_REG_NAME)

    # Non-numeric targets are treated as classification
    s = pd.Series(data=["a", "b", "c"])
    self.assertEqual(regression_or_classification(s), AMM_CLF_NAME)

    s = pd.Series(data=["a1", "b", "c"])
    self.assertEqual(regression_or_classification(s), AMM_CLF_NAME)

    # Binary numeric targets are treated as classification
    s = pd.Series(data=[0, 1, 0, 0, 1])
    self.assertEqual(regression_or_classification(s), AMM_CLF_NAME)

    s = pd.Series(data=[0.0, 1.0, 0.0, 0.0, 1.0])
    self.assertEqual(regression_or_classification(s), AMM_CLF_NAME)

    # ...but numeric targets with more than two values are regression
    s = pd.Series(data=[0, 1, 0, 0, 2])
    self.assertEqual(regression_or_classification(s), AMM_REG_NAME)
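# The tests above pin down the heuristic: targets castable to numbers are
# regression, unless they take exactly two distinct values (binary labels);
# anything non-numeric is classification. A minimal sketch consistent with
# these tests (not the library's actual implementation):
import pandas as pd

def guess_problem_type(s: pd.Series) -> str:
    try:
        numeric = pd.to_numeric(s)
    except (ValueError, TypeError):
        return "classification"  # non-numeric labels
    # exactly two distinct numeric values -> binary classification
    return "classification" if numeric.nunique() == 2 else "regression"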
def fit(self, df, target):
    """
    Fit the feature reducers on a featurized dataframe, applying each
    reducer in self.reducers in order and recording removed features.

    Args:
        df (pandas.DataFrame): The featurized dataframe, including target.
        target (str): The name of the target column.

    Returns:
        self
    """
    missing_remove_features = [
        c for c in self._remove_features if c not in df.columns
    ]
    missing_keep_features = [
        c for c in self._keep_features if c not in df.columns
    ]
    for features, name in [(missing_remove_features, 'remove'),
                           (missing_keep_features, 'keep')]:
        if features:
            self.logger.warning(
                self._log_prefix +
                "Asked to {} some features that do not exist in the "
                "dataframe. Skipping the following features:\n{}".format(
                    name, features))

    reduced_df = df
    for r in self.reducers:
        X = df.drop(columns=[target])
        y = df[target]
        if r == "corr":
            reduced_df = self.rm_correlated(df, target, self.corr_threshold)
            reduced_df = reduced_df.drop(columns=[target])
        elif r == "tree":
            tbfr = TreeFeatureReducer(
                importance_percentile=self.tree_importance_percentile,
                mode=regression_or_classification(y),
                logger=self.logger)
            reduced_df = tbfr.fit_transform(X, y).copy(deep=True)
            self.reducer_params[r] = {
                "importance_percentile": tbfr.importance_percentile,
                "mode": tbfr.mode,
                "random_state": tbfr.rs
            }
        elif r == "rebate":
            if isinstance(self.n_rebate_features, float):
                self.logger.info(
                    self._log_prefix +
                    "Retaining fraction {} of current {} features.".format(
                        self.n_rebate_features, df.shape[1] - 1))
                self.n_rebate_features = int(df.shape[1] *
                                             self.n_rebate_features)
            self.logger.info(
                self._log_prefix +
                "ReBATE MultiSURF running: retaining {} numerical "
                "features.".format(self.n_rebate_features))
            reduced_df = rebate(df, target,
                                n_features=self.n_rebate_features)
            reduced_df = reduced_df.copy(deep=True)
            self.logger.info(
                self._log_prefix +
                "ReBATE MultiSURF completed: retained {} numerical "
                "features.".format(len(reduced_df.columns)))
            self.logger.debug(
                self._log_prefix +
                "ReBATE MultiSURF gave the following "
                "features: {}".format(reduced_df.columns.tolist()))
            self.reducer_params[r] = {"algo": "MultiSURF Algorithm"}
        elif r == "pca":
            n_samples, n_features = X.shape
            if self.n_pca_features == "auto":
                if n_samples < n_features:
                    self.logger.warning(
                        self._log_prefix +
                        "Number of samples ({}) is less than number of "
                        "features ({}). Setting n_pca_features equal to "
                        "n_samples.".format(n_samples, n_features))
                    self._pca = PCA(n_components=n_samples,
                                    svd_solver="full")
                else:
                    self.logger.info(
                        self._log_prefix +
                        "PCA automatically determining optimal number of "
                        "features using Minka's MLE.")
                    self._pca = PCA(n_components="mle", svd_solver="auto")
            else:
                if isinstance(self.n_pca_features, float):
                    self.logger.info(
                        self._log_prefix +
                        "Retaining fraction {} of current {} features."
                        "".format(self.n_pca_features, df.shape[1]))
                    self.n_pca_features = int(df.shape[1] *
                                              self.n_pca_features)
                if self.n_pca_features > n_samples:
                    self.logger.warning(
                        self._log_prefix +
                        "Number of PCA features interpreted as {}, which is"
                        " more than the number of samples ({}). "
                        "n_pca_features coerced to equal n_samples."
                        "".format(self.n_pca_features, n_samples))
                    self.n_pca_features = n_samples
                self.logger.info(
                    self._log_prefix +
                    "PCA running: retaining {} numerical features."
                    "".format(self.n_pca_features))
                self._pca = PCA(n_components=self.n_pca_features,
                                svd_solver="auto")
            self._pca.fit(X.values, y.values)
            matrix = self._pca.transform(X.values)
            pca_feats = [
                "PCA {}".format(i) for i in range(matrix.shape[1])
            ]
            self._pca_feats = pca_feats
            reduced_df = pd.DataFrame(columns=pca_feats, data=matrix,
                                      index=X.index)
            self.logger.info(self._log_prefix +
                             "PCA completed: retained {} numerical "
                             "features.".format(len(reduced_df.columns)))

        retained = reduced_df.columns.values.tolist()
        removed = [
            c for c in df.columns.values
            if c not in retained and c != target
        ]
        self.removed_features[r] = removed
        if target not in reduced_df:
            reduced_df.loc[:, target] = y.tolist()
        df = reduced_df

    all_removed = [
        c for r, rf in self.removed_features.items() for c in rf
    ]
    all_kept = [c for c in df.columns.tolist() if c != target]
    save_from_removal = [
        c for c in self._keep_features if c in all_removed
    ]
    for_force_removal = [c for c in self._remove_features if c in all_kept]

    if save_from_removal:
        self.logger.info(self._log_prefix +
                         "Saving features from removal. "
                         "Saved features:\n{}".format(save_from_removal))

    if for_force_removal:
        self.logger.info(
            self._log_prefix + "Forcing removal of features. "
            "Removed features: \n{}".format(for_force_removal))
        self.removed_features['manual'] = for_force_removal

    # "and" (not "or") so manually removed features are excluded here
    self.retained_features = [
        c for c in all_kept
        if c not in self._remove_features and c != target
    ]
    return self
def transform(self, df, target):
    """
    Decorate a dataframe containing composition, structure, bandstructure,
    and/or DOS objects with descriptors.

    Args:
        df (pandas.DataFrame): The input dataframe, not yet containing
            features.
        target (str): The ML-target property contained in the df.

    Returns:
        df (pandas.DataFrame): Transformed dataframe containing features.
    """
    if self.cache_src and os.path.exists(self.cache_src):
        self.logger.debug(self._log_prefix +
                          "Reading cache_src {}".format(self.cache_src))
        cached_df = pd.read_json(self.cache_src)
        if not all([loc in cached_df.index for loc in df.index]):
            raise AutomatminerError("Feature cache does not contain all "
                                    "entries (by DataFrame index) needed "
                                    "to transform the input df.")
        else:
            cached_subdf = cached_df.loc[df.index]
            if target in cached_subdf.columns:
                if target not in df.columns:
                    self.logger.warning(
                        self._log_prefix +
                        "Target is present in the cached df but not in "
                        "the input df. Cannot perform comparison to "
                        "ensure index match.")
                else:
                    cached_targets = cached_subdf[target]
                    input_targets = df[target]
                    cached_type = regression_or_classification(
                        cached_targets)
                    input_type = regression_or_classification(
                        input_targets)
                    if cached_type != input_type:
                        raise AutomatminerError(
                            "Cached targets appear to be '{}' type, while "
                            "input targets appear to be '{}'."
                            "".format(cached_type, input_type))

                    problems = {}
                    for ix in input_targets.index:
                        iv = input_targets[ix]
                        cv = cached_targets[ix]
                        if iv != cv:
                            try:
                                if not math.isclose(iv, cv):
                                    problems[ix] = [iv, cv]
                            except TypeError:
                                pass
                    if problems:
                        self.logger.warning(
                            self._log_prefix +
                            "Mismatch between cached targets and input "
                            "targets: \n{}".format(problems))

            self.logger.info(
                self._log_prefix +
                "Restored {} features on {} samples from "
                "cache {}".format(len(cached_subdf.columns),
                                  len(df.index), self.cache_src))
            return cached_subdf
    else:
        transforming_on_fitted = df is self.fitted_input_df
        df = self._prescreen_df(df, inplace=True)

        if transforming_on_fitted:
            df = self.converted_input_df
        else:
            df = self._add_composition_from_structure(df)

        for featurizer_type, featurizers in self.featurizers.items():
            if featurizer_type in df.columns:
                if not transforming_on_fitted:
                    df = self._tidy_column(df, featurizer_type)
                for f in featurizers:
                    self.logger.info(self._log_prefix +
                                     "Featurizing with {}."
                                     "".format(f.__class__.__name__))
                    df = f.featurize_dataframe(
                        df, featurizer_type,
                        ignore_errors=self.ignore_errors,
                        multiindex=self.multiindex,
                        inplace=False)
                if self.drop_inputs:
                    df = df.drop(columns=[featurizer_type])
            else:
                self.logger.info(
                    self._log_prefix +
                    "Featurizer type {} not in the dataframe. "
                    "Skipping...".format(featurizer_type))
        if self.functionalize:
            ff = FunctionFeaturizer()
            cols = df.columns.tolist()
            for ft in self.featurizers.keys():
                if ft in cols:
                    cols.remove(ft)  # list.pop takes an index, not a value
            df = ff.fit_featurize_dataframe(
                df, cols,
                ignore_errors=self.ignore_errors,
                multiindex=self.multiindex,
                inplace=False)

        if self.cache_src and not os.path.exists(self.cache_src):
            df.to_json(self.cache_src)
        return df
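# Usage sketch for transform with caching (assumes automatminer's
# AutoFeaturizer; the cache path and "K_VRH" target are hypothetical):
from automatminer.featurization import AutoFeaturizer

af = AutoFeaturizer(preset="express", cache_src="features_cache.json")
af.fit(df, "K_VRH")
feat_df = af.transform(df, "K_VRH")   # first call computes and writes cache
feat_df2 = af.transform(df, "K_VRH")  # later calls restore from the cache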