def __init__(self, autofeaturizer=None, cleaner=None, reducer=None,
             learner=None):
    transformers = [autofeaturizer, cleaner, reducer, learner]
    if not all(transformers):
        if any(transformers):
            raise AutomatminerError(
                "Please specify all dataframe transformers "
                "(autofeaturizer, learner, reducer, and cleaner), or "
                "none (to use default).")
        else:
            config = get_preset_config("express")
            autofeaturizer = config["autofeaturizer"]
            cleaner = config["cleaner"]
            reducer = config["reducer"]
            learner = config["learner"]
    self.autofeaturizer = autofeaturizer
    self.cleaner = cleaner
    self.reducer = reducer
    self.learner = learner
    self.pre_fit_df = None
    self.post_fit_df = None
    self.ml_type = None
    self.target = None
    self.version = get_version()
    super(MatPipe, self).__init__()
def __init__(self, autofeaturizer=None, cleaner=None, reducer=None,
             learner=None, logger=True, log_level=None):
    transformers = [autofeaturizer, cleaner, reducer, learner]
    if not all(transformers):
        if any(transformers):
            raise AutomatminerError(
                "Please specify all dataframe transformers "
                "(autofeaturizer, learner, reducer, and cleaner), or "
                "none (to use default).")
        else:
            config = get_preset_config("production")
            autofeaturizer = config["autofeaturizer"]
            cleaner = config["cleaner"]
            reducer = config["reducer"]
            learner = config["learner"]
    self._logger = self.get_logger(logger, level=log_level)
    self.autofeaturizer = autofeaturizer
    self.cleaner = cleaner
    self.reducer = reducer
    self.learner = learner
    self.autofeaturizer._logger = self.get_logger(logger)
    self.cleaner._logger = self.get_logger(logger)
    self.reducer._logger = self.get_logger(logger)
    self.learner._logger = self.get_logger(logger)
    self.pre_fit_df = None
    self.post_fit_df = None
    self.is_fit = False
    self.ml_type = None
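# Usage sketch (illustrative, not part of the library source): construct a
# MatPipe with all defaults, or pass all four transformers explicitly. The
# preset names and the four config keys are taken from the constructors above.
from automatminer import MatPipe, get_preset_config

pipe_default = MatPipe()  # falls back to a preset config

config = get_preset_config("production")
pipe_custom = MatPipe(autofeaturizer=config["autofeaturizer"],
                      cleaner=config["cleaner"],
                      reducer=config["reducer"],
                      learner=config["learner"])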
def predict(self, df: pd.DataFrame, target: str,
            output_col=None) -> pd.DataFrame:
    """
    Predict the target property of materials given a df of features. This
    base method is widely applicable across different AutoML backends.

    The predictions are appended to the dataframe in a column named
    according to output_col. The default is "{target} predicted".

    Args:
        df (pandas.DataFrame): Contains all features needed for ML
            (i.e., all features contained in the training dataframe).
        target (str): The property to be predicted. Should match the
            target used for fitting. May or may not be present in the
            argument dataframe.
        output_col (str, None): The column in which to store the
            predictions. If None, defaults to "{target} predicted".

    Returns:
        (pandas.DataFrame): The argument dataframe plus a column
            containing the predictions of the target.
    """
    if target != self.fitted_target:
        raise AutomatminerError(
            "Argument dataframe target ({}) is different from the fitted "
            "dataframe target! ({})".format(target, self.fitted_target))
    elif not all([f in df.columns for f in self.features]):
        not_in_model = [f for f in self.features if f not in df.columns]
        not_in_df = [f for f in df.columns if f not in self.features]
        raise AutomatminerError(
            "Features used to build model are different from df columns! "
            "Features in model not in df: \n{} \n"
            "Features in df not in model: \n{}"
            "".format(not_in_model, not_in_df))
    else:
        X = df[self.features].values  # rectify feature order
        y_pred = self.best_pipeline.predict(X)
        df[output_col or (target + " predicted")] = y_pred
        log_msg = "Prediction finished successfully."
        try:
            logger.info(self._log_prefix + log_msg)
        except AttributeError:
            pass
        return df
def predict(self, df, target):
    """
    Predict the target property of materials given a df of features.

    The predictions are appended to the dataframe in a column called
    "{target} predicted".

    Args:
        df (pandas.DataFrame): Contains all features needed for ML
            (i.e., all features contained in the training dataframe).
        target (str): The property to be predicted. Should match the
            target used for fitting. May or may not be present in the
            argument dataframe.

    Returns:
        (pandas.DataFrame): The argument dataframe plus a column
            containing the predictions of the target.
    """
    if target != self.fitted_target:
        raise AutomatminerError(
            "Argument dataframe target {} is different from the fitted "
            "dataframe target! {}".format(target, self.fitted_target))
    elif not all([f in df.columns for f in self._features]):
        not_in_model = [f for f in self._features if f not in df.columns]
        not_in_df = [f for f in df.columns if f not in self._features]
        raise AutomatminerError(
            "Features used to build model are different from df columns! "
            "Features in model not in df: \n{} \n"
            "Features in df not in model: \n{}"
            "".format(not_in_model, not_in_df))
    else:
        X = df[self._features].values  # rectify feature order
        y_pred = self._backend.predict(X)
        df[target + " predicted"] = y_pred
        self.logger.info(self._log_prefix +
                         "Prediction finished successfully.")
        return df
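# Usage sketch (illustrative) of the predict contract documented above, at
# the MatPipe level. train_df and unknown_df are hypothetical dataframes
# sharing the same input columns; "gap expt" is a hypothetical target name.
from automatminer import MatPipe

pipe = MatPipe()
pipe.fit(train_df, "gap expt")
predicted = pipe.predict(unknown_df, "gap expt")
print(predicted["gap expt predicted"].head())  # default output column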
def transform(self, X, y=None):
    """
    Transform the data using the subset of features determined by calling
    the fit method on the data.

    Args:
        X (pandas.DataFrame): Input data. Note that a numpy array is NOT
            accepted, since X.columns is used for feature names.
        y (placeholder): Ignored input (for consistency in notation).

    Returns:
        (pandas.DataFrame): The data with a reduced number of features.
    """
    if self.selected_features is None:
        raise AutomatminerError("The fit method should be called first!")
    return X[self.selected_features]
def fit(self, X, y, tree="rf", recursive=True, cv=5): """ Fits to the data (X) and target (y) to determine the selected_features. Args: X (pandas.DataFrame): input data, note that numpy matrix is NOT accepted since the X.columns is used for feature names y (pandas.Series or np.ndarray): list of outputs used for fitting the tree model tree (str or instantiated sklearn tree-based model): if a model is directly fed, it must have the .feature_importances_ attribute recursive (bool): whether to recursively reduce the features (True) or just do it once (False) cv (int or CrossValidation): sklearn's cross-validation with the same options (int or actual instantiated CrossValidation) Returns (None): sets the class attribute .selected_features """ m0 = len(X.columns) if isinstance(tree, str): if tree.lower() in ["rf", "random forest", "randomforest"]: if self.mode.lower() in ["classification", "classifier"]: tree = RandomForestClassifier(random_state=self.rs) else: tree = RandomForestRegressor(random_state=self.rs) elif tree.lower() in ["gb", "gbt", "gradiet boosting"]: if self.mode.lower() in ["classification", "classifier"]: tree = GradientBoostingClassifier(random_state=self.rs) else: tree = GradientBoostingRegressor(random_state=self.rs) else: raise AutomatminerError( "Unsupported tree_type {}!".format(tree)) cv = check_cv(cv=cv, y=y, classifier=is_classifier(tree)) all_feats = [] for train, _ in cv.split(X, y, groups=None): Xtrn = X.iloc[train] ytrn = y.iloc[train] all_feats += self.get_reduced_features(tree, Xtrn, ytrn, recursive) # take the union of selected features of each fold self.selected_features = list(set(all_feats)) logger.info( self._log_prefix + "Finished tree-based feature reduction of {} initial features to " "{}".format(m0, len(self.selected_features))) return self
def transform(self, df, target):
    """
    Apply the sequence of preprocessing steps determined by fit, with the
    option to change the na_method for samples.

    Args:
        df (pandas.DataFrame): Contains features and the target.
        target (str): The name of the target in the dataframe.

    Returns:
        (pandas.DataFrame): The cleaned dataframe.
    """
    self.logger.info(self._log_prefix +
                     "Cleaning with respect to samples with sample "
                     "na_method '{}'".format(self.na_method_transform))
    if target != self.fitted_target:
        raise AutomatminerError(
            "The transformation target {} is not the same as the fitted "
            "target {}".format(target, self.fitted_target))

    # We assume the two targets are the same from here on out
    df = self.to_numerical(df, target)
    df = self.handle_na(df, target, self.na_method_transform,
                        coerce_mismatch=True)

    # Ensure the order of columns is identical
    if target in df.columns:
        self.logger.info(self._log_prefix + "Reordering columns...")
        df = df[self.fitted_df.columns]
    else:
        self.logger.info(self._log_prefix +
                         "Target not found in df columns. Ignoring...")
        reordered_cols = self.fitted_df.drop(columns=[target]).columns
        df = df[reordered_cols]
    return df
def is_greater_better(scoring_function) -> bool:
    """
    Determines whether a greater value of scoring_function is more
    favorable/better.

    Args:
        scoring_function (str): The name of the scoring function,
            supported by TPOT and sklearn. Please see below for more
            information.

    Returns:
        (bool): Whether the scoring metric should be considered better if
            it is larger, or better if it is smaller.
    """
    desired_high_metrics = {
        'accuracy', 'adjusted_rand_score', 'average_precision',
        'balanced_accuracy', 'f1', 'f1_macro', 'f1_micro', 'f1_samples',
        'f1_weighted', 'precision', 'precision_macro', 'precision_micro',
        'precision_samples', 'precision_weighted', 'recall',
        'recall_macro', 'recall_micro', 'recall_samples',
        'recall_weighted', 'roc_auc', 'r2', 'r2_score',
        'neg_median_absolute_error', 'neg_mean_absolute_error',
        'neg_mean_squared_error'
    }

    desired_low_metrics = {
        'median_absolute_error', 'mean_absolute_error',
        'mean_squared_error'
    }

    # Check to ensure no metrics are accidentally placed in both sets
    if desired_high_metrics.intersection(desired_low_metrics):
        raise AutomatminerError("Error: there is a metric in both the "
                                "desired-high and desired-low sets.")

    if scoring_function not in desired_high_metrics \
            and scoring_function not in desired_low_metrics:
        warnings.warn(
            'The scoring_function "{}" was not found; continuing, '
            'assuming a greater score is better.'.format(scoring_function))

    # True if in desired_high or in neither set; False if in desired_low
    return scoring_function not in desired_low_metrics
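# Illustrative calls, following directly from the metric sets above:
assert is_greater_better("r2")                       # in desired_high
assert is_greater_better("neg_mean_absolute_error")  # in desired_high
assert not is_greater_better("mean_squared_error")   # in desired_low
is_greater_better("my_custom_metric")  # unknown: warns, returns True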
def fit(self, df, target):
    """
    Determine a sequence of preprocessing steps to clean a dataframe.

    Args:
        df (pandas.DataFrame): Contains features and the target.
        target (str): The name of the target in the dataframe.

    Returns:
        self
    """
    self.logger.info(self._log_prefix +
                     "Cleaning with respect to samples with sample "
                     "na_method '{}'".format(self.na_method_fit))
    if target not in df.columns:
        raise AutomatminerError(
            "Target {} must be contained in df.".format(target))

    self._reset_attrs()
    df = self.to_numerical(df, target)
    df = self.handle_na(df, target, self.na_method_fit)
    self.fitted_df = df
    self.fitted_target = target
    return self
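# Usage sketch (illustrative). These fit/transform methods are assumed to
# belong to automatminer's DataCleaner; the import path and constructor
# keywords are assumptions. The nan target row is dropped during fit.
import numpy as np
import pandas as pd
from automatminer.preprocessing import DataCleaner

df = pd.DataFrame({
    "feat_num": [1.0, 2.0, np.nan, 4.0],
    "feat_cat": ["x", "y", "x", "z"],      # non-numeric: one-hot encoded
    "my_target": [0.1, 0.2, 0.3, np.nan],  # nan target -> sample dropped
})
cleaner = DataCleaner(na_method_fit="drop", na_method_transform="fill")
df_clean = cleaner.fit_transform(df, "my_target")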
def handle_na(self, df, target, na_method, coerce_mismatch=True):
    """
    First pass for handling cells without values (null or nan). Additional
    preprocessing may be necessary, as one column may be filled with the
    median while another is filled with the mean or mode, etc.

    Args:
        df (pandas.DataFrame): The dataframe containing features.
        target (str): The key defining the ML target.
        na_method (str): How to deal with samples still containing nans
            after troublesome columns are already dropped. Default is
            'drop'. Other options are from pandas.DataFrame.fillna:
            {'bfill', 'pad', 'ffill'}, or 'ignore' to ignore nans.
            Alternatively, specify a value to replace the nans, e.g. 0.
        coerce_mismatch (bool): If there is a mismatch between the fitted
            dataframe columns and the argument dataframe columns, create
            and drop mismatch columns so the dataframes are matching. If
            False, raises an error. New columns are instantiated as all
            zeros, as most of the time this is a one-hot encoding issue.

    Returns:
        (pandas.DataFrame): The cleaned df.
    """
    self.logger.info(self._log_prefix +
                     "Before handling na: {} samples, {} features"
                     "".format(*df.shape))

    # Drop targets containing na before further processing
    if self.drop_na_targets and target in df.columns:
        clean_df = df.dropna(axis=0, how='any', subset=[target])
        self.dropped_samples = df[~df.index.isin(clean_df.index)]
        self.logger.info(
            self._log_prefix +
            "{} samples did not have target values. They were "
            "dropped.".format(len(self.dropped_samples)))
        df = clean_df

    # Remove features failing the max_na_frac limit
    feats0 = set(df.columns)
    if not self.is_fit:
        self.logger.info(self._log_prefix +
                         "Handling feature na by max na threshold of {} "
                         "with method '{}'.".format(
                             self.max_na_frac, self.feature_na_method))
        threshold = int((1 - self.max_na_frac) * len(df))
        if self.feature_na_method == "drop":
            df = df.dropna(axis=1, thresh=threshold)
        else:
            df = df.dropna(axis=1, thresh=1)
            problem_cols = df.columns[
                df.isnull().mean() > self.max_na_frac]
            dfp = df[problem_cols]
            if self.feature_na_method == "fill":
                dfp = dfp.fillna(method="ffill")
                dfp = dfp.fillna(method="bfill")
            elif self.feature_na_method == "mean":
                # Take the mean of all numeric columns
                dfpn = dfp[[ncol for ncol in dfp.columns
                            if ncol in self.number_cols]]
                dfpn = dfpn.fillna(value=dfpn.mean())
                dfp[dfpn.columns] = dfpn

                # Simply fill one-hot encoded columns
                dfp = dfp.fillna(method="ffill")
                dfp = dfp.fillna(method="bfill")
            else:
                dfp = dfp.fillna(value=self.feature_na_method)
            df[problem_cols] = dfp

        if len(df.columns) < len(feats0):
            feats = set(df.columns)
            n_feats = len(feats0) - len(feats)
            napercent = self.max_na_frac * 100
            feat_names = feats0 - feats
            self.logger.info(
                self._log_prefix +
                'These {} features were removed as they had more '
                'than {}% missing values: {}'.format(
                    n_feats, napercent, feat_names))
    else:
        mismatch = compare_columns(self.fitted_df, df, ignore=target)
        if mismatch["mismatch"]:
            self.logger.warning(self._log_prefix +
                                "Mismatched columns found in dataframe "
                                "used for fitting and argument dataframe.")
            if coerce_mismatch:
                self.logger.warning(self._log_prefix +
                                    "Coercing mismatched columns...")
                if mismatch["df1_not_in_df2"]:  # in fitted, not in arg
                    self.logger.warning(
                        self._log_prefix +
                        "Assuming missing columns in argument df are "
                        "one-hot encoding issues. Setting to zero the "
                        "following new columns:\n{}".format(
                            mismatch["df1_not_in_df2"]))
                    for c in self.fitted_df.columns:
                        if c not in df.columns and c != target:
                            # Interpret as one-hot problems...
                            df[c] = np.zeros((df.shape[0]))
                if mismatch["df2_not_in_df1"]:  # arg cols not in fitted
                    self.logger.warning(
                        self._log_prefix +
                        "The following columns are being dropped:\n{}"
                        "".format(mismatch["df2_not_in_df1"]))
                    df = df.drop(columns=mismatch["df2_not_in_df1"])
            else:
                raise AutomatminerError("Mismatch between columns found "
                                        "in arg dataframe and dataframe "
                                        "used for fitting!")

        # Handle the case where all samples of the transformed df are nan
        # but the feature is required by the fitted input df, and there is
        # no way to impute by samples or drop...
        nan_cols = [c for c in df.columns if df[c].isna().all()]
        if nan_cols:
            self.logger.error(
                self._log_prefix + "Columns {} are all nan "
                "in transform df but are required by the fit "
                "df. Using mean values of fitted df to "
                "impute transformed df. This may result in "
                "highly erroneous imputed values!"
                "".format(nan_cols))
            for col in nan_cols:
                mean_val = self.fitted_df[col].mean()
                df[col] = [mean_val] * df.shape[0]

    self.dropped_features = [c for c in feats0
                             if c not in df.columns.values]

    # Handle all rows that still contain any nans
    if na_method == "drop":
        clean_df = df.dropna(axis=0, how='any')
        self.dropped_samples = pd.concat(
            (df[~df.index.isin(clean_df.index)], self.dropped_samples),
            axis=0, sort=True)
        df = clean_df
    elif na_method == "ignore":
        pass
    elif na_method == "fill":
        df = df.fillna(method="ffill")
        df = df.fillna(method="bfill")
    elif na_method == "mean":
        # Samples in numeric columns are imputed with the column mean
        dfn = df[[ncol for ncol in df.columns
                  if ncol in self.number_cols]]
        dfn = dfn.fillna(value=dfn.mean())
        df[dfn.columns] = dfn

        # the rest are simply filled
        df = df.fillna(method="ffill")
        df = df.fillna(method="bfill")
    else:
        df = df.fillna(value=na_method)
    self.logger.info(self._log_prefix +
                     "After handling na: {} samples, {} features"
                     "".format(*df.shape))
    return df
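# A minimal pandas sketch of the "mean" branch above: numeric columns are
# imputed with their column means, then remaining (non-numeric) nans are
# forward- and back-filled, mirroring the fillna calls in handle_na.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0], "b": [np.nan, "x", "y"]})
number_cols = ["a"]
dfn = df[number_cols].fillna(value=df[number_cols].mean())
df[dfn.columns] = dfn
df = df.fillna(method="ffill").fillna(method="bfill")
# df["a"] is now [1.0, 2.0, 3.0]; df["b"] is ["x", "x", "y"]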
def transform(self, df, target):
    """
    Decorate a dataframe containing composition, structure, bandstructure,
    and/or DOS objects with descriptors.

    Args:
        df (pandas.DataFrame): The dataframe not containing features.
        target (str): The ML-target property contained in the df.

    Returns:
        df (pandas.DataFrame): Transformed dataframe containing features.
    """
    if self.cache_src and os.path.exists(self.cache_src):
        self.logger.debug(self._log_prefix +
                          "Reading cache_src {}".format(self.cache_src))
        cached_df = pd.read_json(self.cache_src)
        if not all([loc in cached_df.index for loc in df.index]):
            raise AutomatminerError("Feature cache does not contain all "
                                    "entries (by DataFrame index) needed "
                                    "to transform the input df.")
        else:
            cached_subdf = cached_df.loc[df.index]
            if target in cached_subdf.columns:
                if target not in df.columns:
                    self.logger.warning(
                        self._log_prefix +
                        "Target not present in both cached df and input "
                        "df. Cannot perform comparison to ensure index "
                        "match.")
                else:
                    cached_targets = cached_subdf[target]
                    input_targets = df[target]
                    cached_type = regression_or_classification(
                        cached_targets)
                    input_type = regression_or_classification(
                        input_targets)
                    if cached_type != input_type:
                        raise AutomatminerError(
                            "Cached targets appear to be '{}' type, while "
                            "input targets appear to be '{}'."
                            "".format(cached_type, input_type))

                    problems = {}
                    for ix in input_targets.index:
                        iv = input_targets[ix]
                        cv = cached_targets[ix]
                        if iv != cv:
                            try:
                                if not math.isclose(iv, cv):
                                    problems[ix] = [iv, cv]
                            except TypeError:
                                pass
                    if problems:
                        self.logger.warning(
                            self._log_prefix +
                            "Mismatch between cached targets and input "
                            "targets: \n{}".format(problems))

            self.logger.info(
                self._log_prefix +
                "Restored {} features on {} samples from cache {}"
                "".format(len(cached_subdf.columns), len(df.index),
                          self.cache_src))
            return cached_subdf
    else:
        transforming_on_fitted = df is self.fitted_input_df
        df = self._prescreen_df(df, inplace=True)

        if transforming_on_fitted:
            df = self.converted_input_df
        else:
            df = self._add_composition_from_structure(df)

        for featurizer_type, featurizers in self.featurizers.items():
            if featurizer_type in df.columns:
                if not transforming_on_fitted:
                    df = self._tidy_column(df, featurizer_type)
                for f in featurizers:
                    self.logger.info(self._log_prefix +
                                     "Featurizing with {}."
                                     "".format(f.__class__.__name__))
                    df = f.featurize_dataframe(
                        df, featurizer_type,
                        ignore_errors=self.ignore_errors,
                        multiindex=self.multiindex, inplace=False)
                if self.drop_inputs:
                    df = df.drop(columns=[featurizer_type])
            else:
                self.logger.info(
                    self._log_prefix +
                    "Featurizer type {} not in the dataframe. "
                    "Skipping...".format(featurizer_type))
        if self.functionalize:
            ff = FunctionFeaturizer()
            cols = df.columns.tolist()
            for ft in self.featurizers.keys():
                if ft in cols:
                    cols.remove(ft)
            df = ff.fit_featurize_dataframe(
                df, cols, ignore_errors=self.ignore_errors,
                multiindex=self.multiindex, inplace=False)
        if self.cache_src and not os.path.exists(self.cache_src):
            df.to_json(self.cache_src)
        return df
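# Usage sketch (illustrative): featurize a composition dataframe with the
# "express" preset. load_dataset is matminer's dataset loader; the dataset
# name and target are examples only.
from matminer.datasets import load_dataset
from automatminer import AutoFeaturizer

df = load_dataset("matbench_expt_gap")  # "composition" and "gap expt" cols
af = AutoFeaturizer(preset="express")
df_featurized = af.fit_transform(df, "gap expt")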
def __init__(self, cache_src=None, preset=None, featurizers=None,
             exclude=None, functionalize=False, ignore_cols=None,
             ignore_errors=True, drop_inputs=True, guess_oxistates=True,
             multiindex=False, do_precheck=True, n_jobs=None, logger=True,
             composition_col="composition", structure_col="structure",
             bandstructure_col="bandstructure", dos_col="dos"):
    if featurizers and preset:
        raise AutomatminerError("Featurizers and preset were both set. "
                                "Please either use a preset ('express', "
                                "'all', 'debug', 'heavy') or set "
                                "featurizers manually.")
    if not featurizers and not preset:
        raise AutomatminerError("Please specify set(s) of featurizers to "
                                "use, either through the featurizers "
                                "argument or through the preset argument.")

    self.cache_src = cache_src
    self.preset = "express" if preset is None else preset
    self._logger = self.get_logger(logger)
    self.featurizers = featurizers
    self.exclude = exclude if exclude else []
    self.functionalize = functionalize
    self.ignore_cols = ignore_cols or []
    self.is_fit = False
    self.fitted_input_df = None
    self.converted_input_df = None
    self.ignore_errors = ignore_errors
    self.drop_inputs = drop_inputs
    self.multiindex = multiindex
    self.do_precheck = do_precheck
    self.n_jobs = n_jobs
    self.guess_oxistates = guess_oxistates
    self.features = []
    self.auto_featurizer = True if self.featurizers is None else False
    self.removed_featurizers = None
    self.composition_col = composition_col
    self.structure_col = structure_col
    self.bandstruct_col = bandstructure_col
    self.dos_col = dos_col

    _supported_featurizers = {
        composition_col: CompositionFeaturizers,
        structure_col: StructureFeaturizers,
        bandstructure_col: BSFeaturizers,
        dos_col: DOSFeaturizers
    }

    # user-set featurizers
    if self.featurizers:
        if not isinstance(self.featurizers, dict):
            raise TypeError("Featurizers must be a dictionary with keys "
                            "matching your {}".format(_COMMON_COL_ERR_STR))

        invalid_ftypes = [f for f in self.featurizers.keys()
                          if f not in _supported_featurizers.keys()]
        if invalid_ftypes:
            raise KeyError("The following keys were specified as "
                           "featurizer types but were not set in {}: {}"
                           "".format(_COMMON_COL_ERR_STR, invalid_ftypes))

        for ftype, fset in self.featurizers.items():
            _allowed = [f.__class__.__name__ for f in
                        _supported_featurizers[ftype]().all]
            for f in fset:
                if f.__class__.__name__ not in _allowed:
                    raise ValueError(
                        "The {} featurizer {} is not supported by "
                        "AutoFeaturizer. Try updating your version of "
                        "automatminer and matminer.".format(ftype, f))

    # auto-set featurizers
    else:
        featurizers = dict()
        for featurizer_type in _supported_featurizers.keys():
            featurizer_set = _supported_featurizers[featurizer_type]
            featurizers[featurizer_type] = getattr(
                featurizer_set(exclude=self.exclude), self.preset)
        self.featurizers = featurizers

    # Check if any featurizers need fitting (useful for MatPipe)
    needs_fit = False
    fittable_fs = StructureFeaturizers().need_fit
    self.fittable_fcls = set([f.__class__.__name__ for f in fittable_fs])
    # Currently structure featurizers are the only featurizer types which
    # can be fittable
    for f in self.featurizers[self.structure_col]:
        if f.__class__.__name__ in self.fittable_fcls:
            needs_fit = True
            break
    self.needs_fit = needs_fit

    if self.needs_fit and self.cache_src:
        self.logger.warning(
            self._log_prefix +
            "Using cached features on fittable featurizers! "
            "Please make sure you are not benchmarking with "
            "these options enabled; it is likely you will be "
            "leaking data (i.e., features) from the testing "
            "sets into the training.")

    if self.cache_src and "json" not in self.cache_src.lower():
        raise ValueError("The cache_src filename does not contain json. "
                         "JSON is the required file type for featurizer "
                         "caching.")

    self.min_precheck_frac = 0.9
def benchmark(self, df, target, kfold, fold_subset=None, cache=False,
              ignore=None):
    """
    If the target property is known for all data, perform an ML benchmark
    using MatPipe. Used for getting an idea of how well AutoML can predict
    a certain target property.

    MatPipe benchmarks with a nested cross validation, meaning it makes k
    validation/test splits, where all model selection is done on the
    train/validation set (a typical CV). When the model is done
    validating, it is used to predict the previously unseen test set data.
    This process is repeated for each of the k folds, which (1) mitigates
    the benchmark from biasing the model based on the selection of the
    test set and (2) better estimates the generalization error than a
    single validation/test split.

    tl;dr: Put in a dataset and kfold scheme for nested CV, get out the
    predicted test sets.

    Note: MatPipes after benchmarking have been fit on the last fold, not
    the entire dataset. To use your entire dataset for prediction, use the
    MatPipe fit and predict methods.

    Args:
        df (pandas.DataFrame): The dataframe for benchmarking. Must
            contain the target.
        target (str): The column name to use as the ml target property.
        kfold (sklearn KFold or StratifiedKFold): The cross validation
            split object to use for nested cross validation. Used to
            index the dataframe with .iloc, NOT .loc.
        fold_subset ([int]): A subset of the folds in kfold to evaluate
            (by index). For example, to run only the 3rd
            train/validation/test split of the kfold, set
            fold_subset = [2]. To use the first and fourth, set
            fold_subset = [0, 3].
        cache (bool): If True, pre-featurizes the entire dataframe
            (including test data!) and caches it before iterating over
            folds. Do NOT use if you are using fittable featurizers whose
            feature labels are based on their input! Doing so may "leak"
            information from the testing set to the training set and will
            over-represent your benchmark. Enabling this for featurizers
            which are not fittable is completely safe. Note that your
            autofeaturizer must have a cache_src defined if caching is
            enabled (do this either through the AutoFeaturizer class or
            using the cache_src argument to get_preset_config).
        ignore ([str], None): Ignore columns during prediction for each
            outer fold. See the ignore argument of .predict for more
            details.

    Returns:
        results ([pd.DataFrame]): Dataframes containing each fold's known
            targets, as well as their independently predicted targets.
    """
    cache_src = self.autofeaturizer.cache_src
    if cache_src and cache:
        if os.path.exists(cache_src):
            logger.warning(
                "Cache src {} already found! Ensure this featurized data "
                "matches the df being benchmarked.".format(cache_src))
        logger.warning("Running pre-featurization for caching.")
        self.autofeaturizer.fit_transform(df, target)
    elif cache_src and not cache:
        raise AutomatminerError(
            "Caching was enabled in AutoFeaturizer but not in benchmark. "
            "Either disable caching in AutoFeaturizer or enable it by "
            "passing cache=True to benchmark.")
    elif cache and not cache_src:
        raise AutomatminerError(
            "MatPipe cache is enabled, but no cache_src was defined in "
            "autofeaturizer. Pass the cache_src argument to AutoFeaturizer "
            "or use the cache_src get_preset_config powerup.")
    else:
        logger.debug("No caching being used in AutoFeaturizer or "
                     "benchmark.")

    if not fold_subset:
        fold_subset = list(range(kfold.n_splits))

    logger.warning("Beginning benchmark.")
    results = []
    fold = 0
    for _, test_ix in kfold.split(X=df, y=df[target]):
        if fold in fold_subset:
            logger.info("Training on fold index {}".format(fold))
            # Split, identify, and randomize test set
            test = df.iloc[test_ix].sample(frac=1)
            train = df[~df.index.isin(test.index)].sample(frac=1)
            self.fit(train, target)

            logger.info("Predicting fold index {}".format(fold))
            test = self.predict(test, ignore=ignore)
            results.append(test)
        fold += 1
    return results
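# Usage sketch (illustrative): a nested-CV benchmark over the first two of
# five folds. df is a hypothetical dataframe containing the "gap expt"
# target; the preset and split parameters are examples.
from sklearn.model_selection import KFold
from automatminer import MatPipe

pipe = MatPipe()
kf = KFold(n_splits=5, shuffle=True, random_state=0)
results = pipe.benchmark(df, "gap expt", kf, fold_subset=[0, 1])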