def from_json(self, json_desc):
    """Restore this object's state from a JSON description.

    Metadata fields (``library_version``, ``algorithm_name``,
    ``algorithm_short_name``, ``uid``) fall back to their current values
    when the corresponding key is absent. ``selected_models`` is rebuilt
    as a list of ``{"model": ModelFramework, "repeat": <repeat>}`` dicts.

    :param json_desc: dict produced by the matching ``to_json`` —
        assumed to carry optional metadata keys plus a "models" list of
        ``{"model": ..., "repeat": ...}`` entries (TODO: confirm against
        the serializer).
    """
    self.library_version = json_desc.get("library_version", self.library_version)
    self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
    self.algorithm_short_name = json_desc.get(
        "algorithm_short_name", self.algorithm_short_name
    )
    self.uid = json_desc.get("uid", self.uid)
    self.selected_models = []
    # Default to an empty list: a description without a "models" key
    # previously yielded None and the loop raised TypeError.
    for selected in json_desc.get("models", []):
        model = selected["model"]
        repeat = selected["repeat"]
        il = ModelFramework(model.get("params"))
        il.from_json(model)
        self.selected_models += [{"model": il, "repeat": repeat}]
class BaseAutoML(BaseEstimator, ABC):
    """
    Automated Machine Learning for supervised tasks (binary classification, multiclass classification, regression).
    Warning: This class should not be used directly. Use derived classes instead.
    """

    def __init__(self):
        logger.debug("BaseAutoML.__init__")
        # Internal state; the public configuration attributes (mode, ml_task,
        # results_path, ...) are expected to be set by derived classes.
        self._results_path = None
        self._models = []  # instances of iterative learner framework or ensemble
        self._best_model = None
        self._verbose = True
        self._threshold = None  # used only in classification
        self._metrics_details = None
        self._max_metrics = None
        self._confusion_matrix = None
        self._X_path, self._y_path = None, None  # parquet files with training data
        self._data_info = None
        self._model_paths = []
        self._stacked_models = None
        self._fit_level = None  # current training step; "finished" when done
        self._start_time = time.time()
        self._time_ctrl = None
        self._all_params = {}  # generated params per step, for resume support
        # https://scikit-learn.org/stable/developers/develop.html#universal-attributes
        self.n_features_in_ = None  # for scikit-learn api

    def _get_tuner_params(self, start_random_models, hill_climbing_steps,
                          top_models_to_improve):
        """Pack tuner settings into the dict shape expected by MljarTuner."""
        return {
            "start_random_models": start_random_models,
            "hill_climbing_steps": hill_climbing_steps,
            "top_models_to_improve": top_models_to_improve,
        }

    def _check_can_load(self):
        """ Checks if AutoML can be loaded from a folder"""
        if self.results_path is not None:
            # Dir exists and can be loaded
            if os.path.exists(self.results_path) and os.path.exists(
                    os.path.join(self.results_path, "params.json")):
                self.load(self.results_path)
                self._results_path = self.results_path

    def load(self, path):
        """Load a previously fitted AutoML state (models, data info, best model)
        from `path`.

        Raises AutoMLException if any part of the directory cannot be read.
        """
        logger.info("Loading AutoML models ...")
        try:
            # NOTE(review): json.load(open(...)) leaves the file handle to the
            # garbage collector; a `with` block would close it deterministically.
            params = json.load(open(os.path.join(path, "params.json")))
            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._eval_metric = params["eval_metric"]
            stacked_models = params.get("stacked")
            models_map = {}
            for model_path in self._model_paths:
                # Ensembles need the already-loaded base models (models_map).
                if model_path.endswith("Ensemble") or model_path.endswith(
                        "Ensemble_Stacked"):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m
            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]
            best_model_name = None
            with open(os.path.join(path, "best_model.txt"), "r") as fin:
                best_model_name = fin.read()
            self._best_model = models_map[best_model_name]
            data_info_path = os.path.join(path, "data_info.json")
            self._data_info = json.load(open(data_info_path))
            self.n_features_in_ = self._data_info["n_features"]
            if "n_classes" in self._data_info:
                self.n_classes = self._data_info["n_classes"]
            self._fit_level = "finished"
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")

    def get_leaderboard(self):
        """Return a DataFrame with one row per trained model:
        name, model_type, metric_type, metric_value, train_time."""
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._eval_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)

    def keep_model(self, model, model_path):
        """Register a trained model: store it, refresh the best-model report,
        print a summary line, and log its training time."""
        if model is None:
            return
        self._models += [model]
        self._model_paths += [model_path]
        self.select_and_save_best()
        self.verbose_print("{} {} {} trained in {} seconds".format(
            model.get_name(),
            self._eval_metric,
            np.round(model.get_final_loss(), 6),
            np.round(model.get_train_time(), 2),
        ))
        self._time_ctrl.log_time(model.get_name(), model.get_type(),
                                 self._fit_level, model.get_train_time())

    def create_dir(self, model_path):
        """Create `model_path` if missing; wrap OS errors in AutoMLException."""
        if not os.path.exists(model_path):
            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {model_path}. {str(e)}")

    def train_model(self, params):
        """Train a single model described by `params`.

        Returns True when the model was trained and kept, False when skipped
        because the time budget does not allow it.
        """
        # do we have enough time to train?
        # if not, skip
        if not self._time_ctrl.enough_time(params["learner"]["model_type"],
                                           self._fit_level):
            logger.info(
                f"Cannot train {params['name']} because of the time constraint"
            )
            return False
        # let's create directory to log all training artifacts
        model_path = os.path.join(self._results_path, params["name"])
        self.create_dir(model_path)
        # prepare callbacks
        early_stop = EarlyStopping({
            "metric": {
                "name": self._eval_metric
            },
            "log_to_dir": model_path
        })
        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit":
            self._time_ctrl.learner_time_limit(
                params["learner"]["model_type"],
                self._fit_level,
                self._validation_strategy.get("k_folds", 1.0),
            ),
            "min_steps":
            params["additional"].get("min_steps"),
        })
        # total_time_limit is ignored when a per-model limit is set
        total_time_constraint = TotalTimeConstraint({
            "total_time_limit":
            self._total_time_limit if self._model_time_limit is None else None,
            "total_time_start":
            self._start_time,
        })
        # create model framework
        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )
        # start training
        logger.info(
            f"Train model #{len(self._models)+1} / Model name: {params['name']}"
        )
        mf.train(model_path)
        # save the model
        mf.save(model_path)
        # and keep info about the model
        self.keep_model(mf, model_path)
        return True

    def verbose_print(self, msg):
        """Print `msg` only when verbosity is enabled."""
        if self._verbose > 0:
            # self._progress_bar.write(msg)
            print(msg)

    def ensemble_step(self, is_stacked=False):
        """Build, fit and save an Ensemble over all current models.

        Returns True when an ensemble was created, False when ensembling is
        disabled or there are not enough models (needs at least 2).
        """
        if self._train_ensemble and len(self._models) > 1:
            ensemble_path = os.path.join(
                self._results_path,
                "Ensemble_Stacked" if is_stacked else "Ensemble")
            self.create_dir(ensemble_path)
            self.ensemble = Ensemble(self._eval_metric, self._ml_task,
                                     is_stacked=is_stacked)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.ensemble.save(ensemble_path)
            self.keep_model(self.ensemble, ensemble_path)
            return True
        return False

    def can_we_stack_them(self, y):
        """Placeholder guard for stacking eligibility — currently always True."""
        # if multiclass and too many classes then No
        return True

    def get_stacked_data(self, X, mode="training"):
        """Return X extended with predictions of the stacked models.

        In "training" mode out-of-fold predictions are used; otherwise the
        models predict on X directly. Returns X unchanged when there are no
        stacked models.
        """
        # mode can be `training` or `predict`
        if self._stacked_models is None:
            return X
        all_oofs = []
        for m in self._stacked_models:
            oof = None
            if mode == "training":
                oof = m.get_out_of_folds()
            else:
                oof = m.predict(X)
                if self._ml_task == BINARY_CLASSIFICATION:
                    # for binary task, keep only the positive-class column
                    cols = [f for f in oof.columns if "prediction" in f]
                    if len(cols) == 2:
                        oof = pd.DataFrame({"prediction": oof[cols[1]]})
            cols = [f for f in oof.columns if "prediction" in f]
            oof = oof[cols]
            # prefix columns with the model name to keep them unique
            oof.columns = [f"{m.get_name()}_{c}" for c in cols]
            all_oofs += [oof]
        # align on a fresh RangeIndex for concat, then restore the original index
        org_index = X.index.copy()
        X.reset_index(drop=True, inplace=True)
        X_stacked = pd.concat(all_oofs + [X], axis=1)
        X_stacked.index = org_index.copy()
        X.index = org_index.copy()
        return X_stacked

    def _perform_model_stacking(self):
        """Select up to 10 best models per algorithm type (Baseline excluded)
        and store them, sorted by loss, in self._stacked_models. No-op if the
        selection was already made."""
        if self._stacked_models is not None:
            return
        ldb = self.get_leaderboard()
        ldb = ldb.sort_values(by="metric_value", ascending=True)
        models_map = {
            m.get_name(): m
            for m in self._models if not m._is_stacked
        }
        self._stacked_models = []
        models_limit = 10
        for model_type in np.unique(ldb.model_type):
            if model_type in ["Baseline"]:
                continue
            ds = ldb[ldb.model_type == model_type].copy()
            ds.sort_values(by="metric_value", inplace=True)
            for n in list(ds.name.iloc[:models_limit].values):
                self._stacked_models += [models_map[n]]
        # final ordering: best (lowest) loss first
        scores = [m.get_final_loss() for m in self._stacked_models]
        self._stacked_models = [
            self._stacked_models[i] for i in np.argsort(scores).tolist()
        ]

    def prepare_for_stacking(self):
        """Create the stacked training matrix (X + OOF predictions) and save
        it as X_stacked.parquet. Skipped when there are fewer than 5 models,
        less than 60 seconds of budget left, or the file already exists."""
        # print("Stacked models ....")
        # do we have enough models?
        if len(self._models) < 5:
            return
        # do we have time?
        if self._total_time_limit is not None:
            time_left = self._total_time_limit - (time.time() -
                                                  self._start_time)
            # we need at least 60 seconds to do anything
            if time_left < 60:
                return
        self._perform_model_stacking()
        X_stacked_path = os.path.join(self._results_path, "X_stacked.parquet")
        if os.path.exists(X_stacked_path):
            return
        X = pd.read_parquet(self._X_path)
        org_columns = X.columns.tolist()
        X_stacked = self.get_stacked_data(X)
        new_columns = X_stacked.columns.tolist()
        added_columns = [c for c in new_columns if c not in org_columns]
        # save stacked train data
        X_stacked.to_parquet(X_stacked_path, index=False)
        # NOTE(review): the block below is dead code kept as a bare string.
        """
        # resue old params
        for m in self._stacked_models:
            # print(m.get_type())
            # use only Xgboost, LightGBM and CatBoost as stacked models
            if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
                continue
            params = copy.deepcopy(m.params)
            params["validation"]["X_train_path"] = X_train_stacked_path
            params["name"] = params["name"] + "_Stacked"
            params["is_stacked"] = True
            # print(params)
            if "model_architecture_json" in params["learner"]:
                # the new model will be created with wider input size
                del params["learner"]["model_architecture_json"]
            if self._ml_task == REGRESSION:
                # scale added predictions in regression if the target was scaled (in the case of NN)
                target_preprocessing = params["preprocessing"]["target_preprocessing"]
                scale = None
                if "scale_log_and_normal" in target_preprocessing:
                    scale = "scale_log_and_normal"
                elif "scale_normal" in target_preprocessing:
                    scale = "scale_normal"
                if scale is not None:
                    for col in added_columns:
                        params["preprocessing"]["columns_preprocessing"][col] = [
                            scale]
            self.train_model(params)
        """

    def _save_data(self, X, y):
        """Persist X and y as parquet, compute and save data_info.json, and
        wire the file paths into the validation strategy. Drops X's columns
        afterwards (restored later by _load_data_variables)."""
        self._X_path = os.path.join(self._results_path, "X.parquet")
        self._y_path = os.path.join(self._results_path, "y.parquet")
        X.to_parquet(self._X_path, index=False)
        # let's check before any conversions
        target_is_numeric = pd.api.types.is_numeric_dtype(y)
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            y = y.astype(str)
        pd.DataFrame({"target": y}).to_parquet(self._y_path, index=False)
        self._validation_strategy["X_path"] = self._X_path
        self._validation_strategy["y_path"] = self._y_path
        self._validation_strategy["results_path"] = self._results_path
        columns_and_target_info = DataInfo.compute(X, y, self._ml_task)
        self.n_features_in_ = X.shape[1]
        # count distinct non-null target values
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))
        self._data_info = {
            "columns": X.columns.tolist(),
            "rows": y.shape[0],
            "cols": X.shape[1],
            "target_is_numeric": target_is_numeric,
            "columns_info": columns_and_target_info["columns_info"],
            "target_info": columns_and_target_info["target_info"],
            "n_features": self.n_features_in_,
        }
        # Add n_classes if not regression
        if self._ml_task != REGRESSION:
            self._data_info["n_classes"] = self.n_classes
        if columns_and_target_info.get("num_class") is not None:
            self._data_info["num_class"] = columns_and_target_info["num_class"]
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))
        self._drop_data_variables(X)

    def _drop_data_variables(self, X):
        """Empty X in place (all columns dropped) to free memory during fit."""
        X.drop(X.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        """Restore X_train's columns from the saved parquet file, then delete
        the temporary X/y parquet files."""
        if X_train.shape[1] == 0:
            X = pd.read_parquet(self._X_path)
            for c in X.columns:
                X_train.insert(loc=X_train.shape[1], column=c, value=X[c])
        os.remove(self._X_path)
        os.remove(self._y_path)

    def save_progress(self, step=None, generated_params=None):
        """Persist fit progress (fit level, time controller, generated params)
        to progress.json, so an interrupted fit can be resumed."""
        if step is not None and generated_params is not None:
            self._all_params[step] = generated_params
        state = {}
        state["fit_level"] = self._fit_level
        # assumes self._time_ctrl is already set — TODO confirm all call sites
        state["time_controller"] = self._time_ctrl.to_json()
        state["all_params"] = self._all_params
        fname = os.path.join(self._results_path, "progress.json")
        with open(fname, "w") as fout:
            fout.write(json.dumps(state, indent=4))

    def load_progress(self):
        """Restore fit progress from progress.json if present (resume support)."""
        state = {}
        fname = os.path.join(self._results_path, "progress.json")
        if not os.path.exists(fname):
            return
        state = json.load(open(fname, "r"))
        self._fit_level = state.get("fit_level", self._fit_level)
        self._all_params = state.get("all_params", self._all_params)
        self._time_ctrl = TimeController.from_json(
            state.get("time_controller"))

    def _validate_X_predict(self, X):
        """Validate X whenever one tries to predict, apply, predict_proba"""
        # X = check_array(X, ensure_2d=False)
        X = np.atleast_2d(X)
        n_features = X.shape[1]
        if self.n_features_in_ != n_features:
            raise ValueError(
                f"Number of features of the model must match the input. Model n_features_in_ is {self.n_features_in_} and input n_features is {n_features}. Reshape your data."
            )

    # This method builds pandas.Dataframe from input. The input can be numpy.ndarray, matrix, or pandas.Dataframe
    # This method is used to build dataframes in `fit()` and in `predict`. That's the reason y can be None (`predict()` method)
    def _build_dataframe(self, X, y=None):
        """Normalize inputs: returns X (DataFrame with string columns) when y
        is None, otherwise (X, y) with rows having missing targets removed."""
        # If Inputs are not pandas dataframes use scikit-learn validation for X array
        if not isinstance(X, pd.DataFrame):
            # Validate X as array
            X = check_array(X, ensure_2d=False)
            # Force X to be 2D
            X = np.atleast_2d(X)
            # Create Pandas dataframe from np.arrays, columns get names with the schema: feature_{index}
            X = pd.DataFrame(
                X,
                columns=["feature_" + str(i) for i in range(1, len(X[0]) + 1)])
        # Enforce column names
        # Enforce X_train columns to be string
        X.columns = X.columns.astype(str)
        X.reset_index(drop=True, inplace=True)
        if y is None:
            return X
        # Check if y is np.ndarray, transform to pd.Series
        if isinstance(y, np.ndarray):
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")
        # if pd.DataFrame, slice first column
        elif isinstance(y, pd.DataFrame):
            y = np.array(y.iloc[:, 0])
            y = check_array(y, ensure_2d=False)
            y = pd.Series(np.array(y), name="target")
        X, y = ExcludeRowsMissingTarget.transform(X, y, warn=True)
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)
        return X, y

    def _fit(self, X, y):
        """Fits the AutoML model with data"""
        if self._fit_level == "finished":
            print(
                "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new a 'fit()'."
            )
            return
        # Validate input and build dataframes
        X, y = self._build_dataframe(X, y)
        self.n_features_in_ = X.shape[1]
        self.n_classes = len(np.unique(y[~pd.isnull(y)]))
        # Get attributes (__init__ params)
        self._mode = self._get_mode()
        self._ml_task = self._get_ml_task()
        self._results_path = self._get_results_path()
        self._total_time_limit = self._get_total_time_limit()
        self._model_time_limit = self._get_model_time_limit()
        self._algorithms = self._get_algorithms()
        self._train_ensemble = self._get_train_ensemble()
        self._stack_models = self._get_stack_models()
        self._eval_metric = self._get_eval_metric()
        self._validation_strategy = self._get_validation_strategy()
        self._verbose = self._get_verbose()
        self._explain_level = self._get_explain_level()
        self._golden_features = self._get_golden_features()
        self._feature_selection = self._get_feature_selection()
        self._start_random_models = self._get_start_random_models()
        self._hill_climbing_steps = self._get_hill_climbing_steps()
        self._top_models_to_improve = self._get_top_models_to_improve()
        self._random_state = self._get_random_state()
        try:
            self.load_progress()
            if self._fit_level == "finished":
                print(
                    "This model has already been fitted. You can use predict methods or select a new 'results_path' for a new 'fit()'."
                )
                return
            self._check_can_load()
            self.verbose_print(f"AutoML directory: {self._results_path}")
            self.verbose_print(
                f"The task is {self._ml_task} with evaluation metric {self._eval_metric}"
            )
            self.verbose_print(
                f"AutoML will use algorithms: {self._algorithms}")
            if self._stack_models:
                self.verbose_print("AutoML will stack models")
            if self._train_ensemble:
                self.verbose_print("AutoML will ensemble availabe models")
            self._start_time = time.time()
            # shift the start time backwards when resuming a fit
            if self._time_ctrl is not None:
                self._start_time -= self._time_ctrl.already_spend()
            # Automatic Exploratory Data Analysis
            if self._explain_level == 2:
                EDA.compute(X, y, os.path.join(self._results_path, "EDA"))
            # Save data
            self._save_data(X.copy(deep=False), y)
            tuner = MljarTuner(
                self._get_tuner_params(
                    self._start_random_models,
                    self._hill_climbing_steps,
                    self._top_models_to_improve,
                ),
                self._algorithms,
                self._ml_task,
                self._validation_strategy,
                self._explain_level,
                self._data_info,
                self._golden_features,
                self._feature_selection,
                self._train_ensemble,
                self._stack_models,
                self._random_state,
            )
            self.tuner = tuner
            steps = tuner.steps()
            self.verbose_print(f"AutoML steps: {steps}")
            if self._time_ctrl is None:
                self._time_ctrl = TimeController(
                    self._start_time,
                    self._total_time_limit,
                    self._model_time_limit,
                    steps,
                    self._algorithms,
                )
            self._time_ctrl.log_time(
                "prepare_data",
                "prepare_data",
                "prepare_data",
                time.time() - self._start_time,
            )
            # Main training loop: one pass per tuner step.
            for step in steps:
                self._fit_level = step
                start = time.time()
                # self._time_start[step] = start
                if step == "stack":
                    self.prepare_for_stacking()
                # reuse params from a previous (resumed) run when available
                generated_params = []
                if step in self._all_params:
                    generated_params = self._all_params[step]
                else:
                    generated_params = tuner.generate_params(
                        step, self._models, self._results_path,
                        self._stacked_models)
                if generated_params is None or not generated_params:
                    self.verbose_print(
                        f"Skip {step} because no parameters were generated.")
                    continue
                if generated_params:
                    if "learner" in generated_params[
                            0] and not self._time_ctrl.enough_time(
                                generated_params[0]["learner"]["model_type"],
                                self._fit_level):
                        self.verbose_print(
                            f"Skip {step} because of the time limit.")
                    else:
                        model_str = "models" if len(
                            generated_params) > 1 else "model"
                        self.verbose_print(
                            f"* Step {step} will try to check up to {len(generated_params)} {model_str}"
                        )
                for params in generated_params:
                    # already handled in a previous run — just report the status
                    if params.get("status", "") in [
                            "trained", "skipped", "error"
                    ]:
                        self.verbose_print(
                            f"{params['name']}: {params['status']}.")
                        continue
                    try:
                        trained = False
                        if "ensemble" in step:
                            trained = self.ensemble_step(
                                is_stacked=params["is_stacked"])
                        else:
                            trained = self.train_model(params)
                        params["status"] = "trained" if trained else "skipped"
                        params["final_loss"] = self._models[-1].get_final_loss(
                        )
                        params["train_time"] = self._models[-1].get_train_time(
                        )
                    except Exception as e:
                        # a failing model must not abort the whole fit
                        self._update_errors_report(params.get("name"), str(e))
                        params["status"] = "error"
                    # checkpoint after every model so the fit can be resumed
                    self.save_progress(step, generated_params)
            self._fit_level = "finished"
            self.save_progress()
            self.verbose_print(
                f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds"
            )
        except Exception as e:
            raise e
        finally:
            # restore the columns dropped by _save_data and remove temp files
            if self._X_path is not None:
                self._load_data_variables(X)
        return self

    def _update_errors_report(self, model_name, error_msg):
        """Append error message to errors.md file.
        """
        errors_filename = os.path.join(self._get_results_path(), "errors.md")
        with open(errors_filename, "a") as fout:
            self.verbose_print(
                f"There was an error during {model_name} training.")
            self.verbose_print(f"Please check {errors_filename} for details.")
            fout.write(f"## Error for {model_name}\n\n")
            fout.write(error_msg)
            link = "https://github.com/mljar/mljar-supervised/issues/new"
            fout.write(
                f"\n\nPlease set a GitHub issue with above error message at: {link}"
            )
            fout.write("\n\n")

    def select_and_save_best(self):
        """Pick the model with the lowest loss and refresh the on-disk report
        files: best_model.txt, params.json, leaderboard.csv, README.md."""
        # Select best model (lowest loss)
        self._best_model = min(self._models, key=lambda x: x.get_final_loss())
        with open(os.path.join(self._results_path, "best_model.txt"),
                  "w") as fout:
            fout.write(f"{self._best_model.get_name()}")
        with open(os.path.join(self._results_path, "params.json"),
                  "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "eval_metric": self._eval_metric,
                "saved": self._model_paths,
            }
            if self._stacked_models is not None:
                params["stacked"] = [
                    m.get_name() for m in self._stacked_models
                ]
            fout.write(json.dumps(params, indent=4))
        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"),
                   index=False)
        # save report
        ldb["Link"] = [
            f"[Results link]({m}/README.md)" for m in ldb["name"].values
        ]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[ldb.name == self._best_model.get_name(),
                "Best model"] = "**the best**"
        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
            LeaderboardPlots.compute(ldb, self._results_path, fout)

    def _check_is_fitted(self):
        """Raise AutoMLException unless the model is fitted (or loadable)."""
        # First check if model can be loaded
        self._check_can_load()
        # Check if fitted
        if self._fit_level != "finished":
            raise AutoMLException(
                "This model has not been fitted yet. Please call `fit()` first."
            )

    def _base_predict(self, X):
        """Common prediction path: validates columns, routes through stacked
        models when needed, and adds a "label" column for classification."""
        self._check_is_fitted()
        X = self._build_dataframe(X)
        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]
        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")
        # reorder columns to match training order
        X = X[self._data_info["columns"]]
        self._validate_X_predict(X)
        # is stacked model
        if self._best_model._is_stacked:
            self._perform_model_stacking()
            X_stacked = self.get_stacked_data(X, mode="predict")
            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)
        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            # column names look like "prediction_<label>"; strip the prefix
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                # NOTE(review): int() is always 0 — this looks like it should
                # be int(neg_label); confirm against the intended behavior.
                neg_label = int()
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions[
                "label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map({
                True: pos_label,
                False: neg_label
            })
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(np.int32)
            return predictions
        # Regression
        else:
            return predictions

    def _predict(self, X):
        predictions = self._base_predict(X)
        # Return predictions
        # If classification task the result is in column 'label'
        # If regression task the result is in column 'prediction'
        return (predictions["label"].to_numpy()
                if self._ml_task != REGRESSION else
                predictions["prediction"].to_numpy())

    def _predict_proba(self, X):
        # Check is task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_proba()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
            )
        # Make and return predictions
        # If classification task the result is in column 'label'
        # Need to drop `label` column.
        return self._base_predict(X).drop(["label"], axis=1).to_numpy()

    def _predict_all(self, X):
        # Check is task type is correct
        if self._ml_task == REGRESSION:
            raise AutoMLException(
                f"Method `predict_all()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
            )
        # Make and return predictions
        return self._base_predict(X)

    def _score(self, X, y=None):
        """R^2 for regression, accuracy for classification."""
        # y default must be None for scikit-learn compatibility
        # Check if y is None
        if y is None:
            raise AutoMLException("y must be specified.")
        predictions = self._predict(X)
        return (r2_score(y, predictions) if self._ml_task == REGRESSION else
                accuracy_score(y, predictions))

    def _get_mode(self):
        """ Gets the current mode"""
        self._validate_mode()
        return deepcopy(self.mode)

    def _get_ml_task(self):
        """ Gets the current ml_task. If "auto" it is determined"""
        self._validate_ml_task()
        if self.ml_task == "auto":
            # heuristic: 2 classes -> binary, <=20 -> multiclass, else regression
            classes_number = self.n_classes
            if classes_number == 2:
                self._estimator_type = "classifier"  # for sk-learn api
                return BINARY_CLASSIFICATION
            elif classes_number <= 20:
                self._estimator_type = "classifier"  # for sk-learn api
                return MULTICLASS_CLASSIFICATION
            else:
                self._estimator_type = "regressor"  # for sk-learn api
                return REGRESSION
        else:
            return deepcopy(self.ml_task)

    def _get_results_path(self):
        """ Gets the current results_path"""
        # if we already have the results path set, please return it
        if self._results_path is not None:
            return self._results_path
        self._validate_results_path()
        path = self.results_path
        if path is None:
            # auto-generate AutoML_1 .. AutoML_10000
            for i in range(1, 10001):
                name = f"AutoML_{i}"
                if not os.path.exists(name):
                    self.create_dir(name)
                    self._results_path = name
                    return name
            # If it got here, could not create, raise expection
            raise AutoMLException("Cannot create directory for AutoML results")
        elif os.path.exists(self.results_path) and os.path.exists(
                os.path.join(self.results_path, "params.json")):
            # AutoML already loaded, return path
            self._results_path = path
            return path
        # Dir does not exist, create it
        elif not os.path.exists(path):
            self.create_dir(path)
            self._results_path = path
            return path
        # Dir exists and is empty, use it
        elif os.path.exists(path) and not len(os.listdir(path)):
            self._results_path = path
            return path
        elif os.path.exists(path) and len(os.listdir(path)):
            raise AutoMLException(
                f"Cannot set directory for AutoML. Directory '{path}' is not empty."
            )
        raise AutoMLException("Cannot set directory for AutoML results")

    def _get_total_time_limit(self):
        """ Gets the current total_time_limit"""
        self._validate_total_time_limit()
        return deepcopy(self.total_time_limit)

    def _get_model_time_limit(self):
        """ Gets the current model_time_limit"""
        self._validate_model_time_limit()
        return deepcopy(self.model_time_limit)

    def _get_algorithms(self):
        """ Gets the current algorithms. If "auto" it is determined"""
        self._validate_algorithms()
        if self.algorithms == "auto":
            if self._get_mode() == "Explain":
                return [
                    "Baseline",
                    "Linear",
                    "Decision Tree",
                    "Random Forest",
                    "Xgboost",
                    "Neural Network",
                ]
            if self._get_mode() == "Perform":
                return [
                    "Linear",
                    "Random Forest",
                    "LightGBM",
                    "Xgboost",
                    "CatBoost",
                    "Neural Network",
                ]
            if self._get_mode() == "Compete":
                return [
                    "Linear",
                    "Decision Tree",
                    "Random Forest",
                    "Extra Trees",
                    "LightGBM",
                    "Xgboost",
                    "CatBoost",
                    "Neural Network",
                    "Nearest Neighbors",
                ]
        else:
            return deepcopy(self.algorithms)

    def _get_train_ensemble(self):
        """ Gets the current train_ensemble"""
        self._validate_train_ensemble()
        return deepcopy(self.train_ensemble)

    def _get_stack_models(self):
        """ Gets the current stack_models"""
        self._validate_stack_models()
        if self.stack_models == "auto":
            # stacking is enabled by default only in Compete mode
            return True if self.mode == "Compete" else False
        else:
            return deepcopy(self.stack_models)

    def _get_eval_metric(self):
        """ Gets the current eval_metric"""
        self._validate_eval_metric()
        if self.eval_metric == "auto":
            if self._get_ml_task() == BINARY_CLASSIFICATION:
                return "logloss"
            elif self._get_ml_task() == MULTICLASS_CLASSIFICATION:
                return "logloss"
            elif self._get_ml_task() == REGRESSION:
                return "rmse"
        else:
            return deepcopy(self.eval_metric)

    def _get_validation_strategy(self):
        """ Gets the current validation_strategy"""
        strat = {}
        self._validate_validation_strategy()
        if self.validation_strategy == "auto":
            if self._get_mode() == "Explain":
                strat = {
                    "validation_type": "split",
                    "train_ratio": 0.75,
                    "shuffle": True,
                    "stratify": True,
                }
            elif self._get_mode() == "Perform":
                strat = {
                    "validation_type": "kfold",
                    "k_folds": 5,
                    "shuffle": True,
                    "stratify": True,
                }
            elif self._get_mode() == "Compete":
                strat = {
                    "validation_type": "kfold",
                    "k_folds": 10,
                    "shuffle": True,
                    "stratify": True,
                }
            if self._get_ml_task() == REGRESSION:
                if "stratify" in strat:
                    # it's better to always check
                    # before delete (trust me)
                    del strat["stratify"]
            return strat
        else:
            strat = deepcopy(self.validation_strategy)
            # NOTE(review): unlike the "auto" branch, "stratify" is dropped
            # here regardless of ml_task — confirm this is intended.
            if "stratify" in strat:
                del strat["stratify"]
            return strat

    def _get_verbose(self):
        """Gets the current verbose"""
        self._validate_verbose()
        return deepcopy(self.verbose)

    def _get_explain_level(self):
        """ Gets the current explain_level"""
        self._validate_explain_level()
        if self.explain_level == "auto":
            if self._get_mode() == "Explain":
                return 2
            if self._get_mode() == "Perform":
                return 1
            if self._get_mode() == "Compete":
                return 0
        else:
            return deepcopy(self.explain_level)

    def _get_golden_features(self):
        """Gets the current golden_features setting; resolves "auto" by mode."""
        self._validate_golden_features()
        if self.golden_features == "auto":
            if self._get_mode() == "Explain":
                return False
            if self._get_mode() == "Perform":
                return True
            if self._get_mode() == "Compete":
                return True
        else:
            return deepcopy(self.golden_features)

    def _get_feature_selection(self):
        """ Gets the current feature_selection"""
        self._validate_feature_selection()
        if self.feature_selection == "auto":
            if self._get_mode() == "Explain":
                return False
            if self._get_mode() == "Perform":
                return True
            if self._get_mode() == "Compete":
                return True
        else:
            return deepcopy(self.feature_selection)

    def _get_start_random_models(self):
        """ Gets the current start_random_models"""
        self._validate_start_random_models()
        if self.start_random_models == "auto":
            if self._get_mode() == "Explain":
                return 1
            if self._get_mode() == "Perform":
                return 5
            if self._get_mode() == "Compete":
                return 10
        else:
            return deepcopy(self.start_random_models)

    def _get_hill_climbing_steps(self):
        """ Gets the current hill_climbing_steps"""
        self._validate_hill_climbing_steps()
        if self.hill_climbing_steps == "auto":
            if self._get_mode() == "Explain":
                return 0
            if self._get_mode() == "Perform":
                return 2
            if self._get_mode() == "Compete":
                return 2
        else:
            return deepcopy(self.hill_climbing_steps)

    def _get_top_models_to_improve(self):
        """ Gets the current top_models_to_improve"""
        self._validate_top_models_to_improve()
        if self.top_models_to_improve == "auto":
            if self._get_mode() == "Explain":
                return 0
            if self._get_mode() == "Perform":
                return 2
            if self._get_mode() == "Compete":
                return 3
        else:
            return deepcopy(self.top_models_to_improve)

    def _get_random_state(self):
        """ Gets the current random_state"""
        self._validate_random_state()
        return deepcopy(self.random_state)

    def _validate_mode(self):
        """ Validates mode parameter"""
        valid_modes = ["Explain", "Perform", "Compete"]
        if self.mode not in valid_modes:
            raise ValueError(
                f"Expected 'mode' to be {' or '.join(valid_modes)}, got '{self.mode}'"
            )

    def _validate_ml_task(self):
        """ Validates ml_task parameter"""
        if isinstance(self.ml_task, str) and self.ml_task == "auto":
            return
        if self.ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise ValueError(
                f"Expected 'ml_task' to be {' or '.join(AlgorithmsRegistry.get_supported_ml_tasks())}, got '{self.ml_task}''"
            )

    def _validate_results_path(self):
        """ Validates path parameter"""
        if self.results_path is None or isinstance(self.results_path, str):
            return
        raise ValueError(
            f"Expected 'results_path' to be of type string, got '{type(self.results_path)}''"
        )

    def _validate_total_time_limit(self):
        """ Validates total_time_limit parameter"""
        check_greater_than_zero_integer(self.total_time_limit,
                                        "total_time_limit")

    def _validate_model_time_limit(self):
        """ Validates model_time_limit parameter"""
        if self.model_time_limit is not None:
            check_greater_than_zero_integer(self.model_time_limit,
                                            "model_time_limit")

    def _validate_algorithms(self):
        """ Validates algorithms parameter"""
        if isinstance(self.algorithms, str) and self.algorithms == "auto":
            return
        for algo in self.algorithms:
            if algo not in list(
                    AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise ValueError(
                    f"The algorithm {algo} is not allowed to use for ML task: {self._ml_task}. Allowed algorithms: {list(AlgorithmsRegistry.registry[self._ml_task].keys())}"
                )

    def _validate_train_ensemble(self):
        """ Validates train_ensemble parameter"""
        # `train_ensemble` defaults to True, no further checking required
        check_bool(self.train_ensemble, "train_ensemble")

    def _validate_stack_models(self):
        """ Validates stack_models parameter"""
        # `stack_models` defaults to "auto". If "auto" return, else check if is valid bool
        if isinstance(self.stack_models, str) and self.stack_models == "auto":
            return
        check_bool(self.stack_models, "stack_models")

    def _validate_eval_metric(self):
        """ Validates eval_metric parameter"""
        # `eval_metric` defaults to "auto". If not "auto", check that it matches the task
        if isinstance(self.eval_metric, str) and self.eval_metric == "auto":
            return
        if (self._get_ml_task() == BINARY_CLASSIFICATION
                or self._get_ml_task() == MULTICLASS_CLASSIFICATION
                ) and self.eval_metric != "logloss":
            raise ValueError(
                f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
                Use 'log_loss'")
        elif self._get_ml_task() == REGRESSION and self.eval_metric != "rmse":
            raise ValueError(
                f"Metric {self.eval_metric} is not allowed in ML task: {self._get_ml_task()}. \
                Use 'rmse'")

    def _validate_validation_strategy(self):
        """ Validates validation parameter"""
        if (isinstance(self.validation_strategy, str)
                and self.validation_strategy == "auto"):
            return
        # only validation_type is mandatory
        # other parameters of validations
        # have defaults set in their constructors
        required_keys = ["validation_type"]
        if type(self.validation_strategy) is not dict:
            raise ValueError(
                f"Expected 'validation_strategy' to be a dict, got '{type(self.validation_strategy)}'"
            )
        if not all(key in self.validation_strategy for key in required_keys):
            raise ValueError(
                f"Expected dict with keys: {' , '.join(required_keys)}")

    def _validate_verbose(self):
        """ Validates verbose parameter"""
        check_positive_integer(self.verbose, "verbose")

    def _validate_explain_level(self):
        """ Validates explain_level parameter"""
        if isinstance(self.explain_level,
                      str) and self.explain_level == "auto":
            return
        valid_explain_levels = [0, 1, 2]
        # Check if explain level is 0 or greater integer
        if not (isinstance(self.explain_level, int)
                and self.explain_level in valid_explain_levels):
            raise ValueError(
                f"Expected 'explain_level' to be {' or '.join([str(x) for x in valid_explain_levels])}, got '{self.explain_level}'"
            )

    def _validate_golden_features(self):
        """ Validates golden_features parameter"""
        if isinstance(self.golden_features,
                      str) and self.golden_features == "auto":
            return
        check_bool(self.golden_features, "golden_features")

    def _validate_feature_selection(self):
        """ Validates feature_selection parameter"""
        if isinstance(self.feature_selection,
                      str) and self.feature_selection == "auto":
            return
        check_bool(self.feature_selection, "feature_selection")

    def _validate_start_random_models(self):
        """ Validates start_random_models parameter"""
        if (isinstance(self.start_random_models, str)
                and self.start_random_models == "auto"):
            return
        check_greater_than_zero_integer(self.start_random_models,
                                        "start_random_models")

    def _validate_hill_climbing_steps(self):
        """ Validates hill_climbing_steps parameter"""
        if (isinstance(self.hill_climbing_steps, str)
                and self.hill_climbing_steps == "auto"):
            return
        check_positive_integer(self.hill_climbing_steps,
                               "hill_climbing_steps")

    def _validate_top_models_to_improve(self):
        """ Validates top_models_to_improve parameter"""
        if (isinstance(self.top_models_to_improve, str)
                and self.top_models_to_improve == "auto"):
            return
        check_positive_integer(self.top_models_to_improve,
                               "top_models_to_improve")

    def _validate_random_state(self):
        """ Validates random_state parameter"""
        check_positive_integer(self.random_state, "random_state")

    def to_json(self):
        """Serialize the fitted state (best model, threshold, task) to a dict;
        returns None when no model has been trained."""
        if self._best_model is None:
            return None
        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):
        """Restore the best model (Ensemble or ModelFramework), threshold and
        task type from a dict produced by to_json."""
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(
                json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")
        self._ml_task = json_data.get("ml_task")
class AutoML:
    """
    Automated Machine Learning for supervised tasks
    (binary classification, multiclass classification, regression).
    """

    def __init__(
        self,
        results_path=None,
        total_time_limit=60 * 60,
        model_time_limit=None,
        algorithms=[
            "Baseline",
            "Linear",
            "Decision Tree",
            "Random Forest",
            "Extra Trees",
            "LightGBM",
            "Xgboost",
            "CatBoost",
            "Neural Network",
            "Nearest Neighbors",
        ],
        tuning_mode="Normal",
        train_ensemble=True,
        stack=True,
        optimize_metric=None,
        validation={
            "validation_type": "kfold",
            "k_folds": 10,
            "shuffle": True,
            "stratify": True,
        },
        verbose=True,
        ml_task=None,
        explain_level=2,
        seed=1,
    ):
        """
        Create the AutoML object. Initialize the directory for results.

        :param results_path: The path where all results will be saved.
            If left `None` a directory name is generated with schema
            AutoML_{number}, picking the first available number.
            If `results_path` points to a directory with existing AutoML
            results, then all models are loaded from it.
        :param total_time_limit: The time limit in seconds for AutoML
            training. It is not used when `model_time_limit` is not `None`.
        :param model_time_limit: The time limit in seconds for training a
            single model. If set, `total_time_limit` is not respected.
            A single model can contain several learners (e.g. 10 learners
            for 10-fold cross-validation); the per-learner limit is computed
            from `model_time_limit`.
        :param algorithms: The list of algorithms that will be used in the
            training.
        :param tuning_mode: The tuning mode: `Normal`, `Sport`, `Insane`, or
            `Perfect` (names kept the same as in the https://mljar.com
            application). Each mode describes how many models are checked:
            - `Normal`  : about 5-10 models of each algorithm,
            - `Sport`   : about 10-15 models of each algorithm,
            - `Insane`  : about 15-20 models of each algorithm,
            - `Perfect` : about 25-35 models of each algorithm.
            Exact counts can also be set with the `set_advanced` method.
        :param train_ensemble: If true an ensemble is created at the end of
            training. (Default is `True`)
        :param stack: If true stacked models are created (stack level is 1).
            (Default is `True`)
        :param optimize_metric: The metric to be optimized. (not implemented
            yet, please leave `None`)
        :param validation: The JSON with validation type. Right now only
            Cross-Validation is supported. Example:
            ```
            {"validation_type": "kfold", "k_folds": 5, "shuffle": True,
             "stratify": True, "random_seed": 123}
            ```
        :param verbose: Not implemented yet.
        :param ml_task: The machine learning task to be solved. Can be:
            `"binary_classification", "multiclass_classification",
            "regression"`. If left `None` AutoML guesses the task from the
            target values: 2 unique values -> binary classification,
            3..20 unique values -> multiclass classification, otherwise
            regression.
        :param explain_level: The level of explanations included for each
            model: 0 = none; 1 = permutation importance plot, tree plots for
            decision trees, coefficients for linear models; 2 = same as 1
            plus SHAP explanations.
        :param seed: The seed for the random generator.
        """
        logger.debug("AutoML.__init__")
        # total_time_limit is the time for computing for all models
        # model_time_limit is the time for computing a single model
        # if model_time_limit is None then its value is computed from total_time_limit
        # if total_time_limit is set and model_time_limit is set, then total_time_limit constraint will be omitted
        self._total_time_limit = total_time_limit
        self._model_time_limit = model_time_limit
        # time limit in seconds for single learner (model consists of learners)
        # the value is computed before fit, initialize with any number
        self._time_limit = 1
        self._train_ensemble = train_ensemble
        self._stack = stack
        self._models = [
        ]  # instances of iterative learner framework or ensemble
        # it is instance of model framework or ensemble
        self._best_model = None
        self._validation = validation
        self.set_tuning_mode(tuning_mode)
        self._algorithms = algorithms
        self._verbose = verbose
        self._fit_time = None
        self._models_train_time = {}
        self._threshold, self._metrics_details, self._max_metrics, self._confusion_matrix = (
            None,
            None,
            None,
            None,
        )
        self._seed = seed
        self._user_set_optimize_metric = optimize_metric
        self._ml_task = ml_task
        self._X_train_path, self._y_train_path = None, None
        self._X_validation_path, self._y_validation_path = None, None
        self._data_info = None
        self._model_paths = []
        self._stacked_models = None
        self._explain_level = explain_level
        self._results_path = results_path
        self._fit_level = None
        self._time_spend = {}
        self._start_time = time.time()  # it will be updated in `fit` method
        if self._validation["validation_type"] != "kfold":
            # stack only available of k-fold validation
            self._stack = False
        # this should be last in the constructor
        # in case there is a dir, it might load models
        self._set_results_dir()

    def set_tuning_mode(self, mode="Normal"):
        """Translate a tuning-mode name into the three tuner knobs.

        Unknown names fall through to the `Normal` preset.
        """
        if mode == "Sport":
            self._start_random_models = 10
            self._hill_climbing_steps = 2
            self._top_models_to_improve = 3
        elif mode == "Insane":
            self._start_random_models = 15
            self._hill_climbing_steps = 3
            self._top_models_to_improve = 4
        elif mode == "Perfect":
            self._start_random_models = 25
            self._hill_climbing_steps = 5
            self._top_models_to_improve = 5
        else:  # Normal
            self._start_random_models = 5
            self._hill_climbing_steps = 1
            self._top_models_to_improve = 2
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }

    def set_advanced(self,
                     start_random_models=1,
                     hill_climbing_steps=0,
                     top_models_to_improve=0):
        """
        Advanced set of tuning parameters.

        :param start_random_models: Number of not-so-random models to check
            for each algorithm.
        :param hill_climbing_steps: Number of hill climbing steps during
            tuning.
        :param top_models_to_improve: Number of top models (of each
            algorithm) which will be considered for improving in hill
            climbing steps.
        """
        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }

    def _set_results_dir(self):
        """Pick/create the results directory, or load models from an existing one.

        Side effects: may mutate `self._results_path`, create a directory on
        disk, or call `self.load()`; raises AutoMLException on failure.
        """
        if self._results_path is None:
            found = False
            # probe AutoML_1 .. AutoML_10000 for a free directory name
            for i in range(1, 10001):
                self._results_path = f"AutoML_{i}"
                if not os.path.exists(self._results_path):
                    found = True
                    break
            if not found:
                raise AutoMLException(
                    "Cannot create directory for AutoML results")
        if os.path.exists(self._results_path) and os.path.exists(
                os.path.join(self._results_path, "params.json")):
            # existing results directory -> load previous run
            print(f"Directory {self._results_path} already exists")
            self.load()
        elif self._results_path is not None:
            if not os.path.exists(self._results_path):
                print(f"Create directory {self._results_path}")
                try:
                    os.mkdir(self._results_path)
                except Exception as e:
                    raise AutoMLException(
                        f"Cannot create directory {self._results_path}")
            elif os.path.exists(self._results_path) and len(
                    os.listdir(self._results_path)):
                # directory exists, has content, but no params.json
                raise AutoMLException(
                    f"Cannot set directory for AutoML. Directory {self._results_path} is not empty."
                )
        else:
            raise AutoMLException("Cannot set directory for AutoML results")

    def load(self):
        """Load previously trained models and metadata from `self._results_path`.

        Restores `_models`, `_model_paths`, `_ml_task`, `_optimize_metric`,
        `_stacked_models`, `_best_model` and `_data_info` from params.json,
        best_model.txt and data_info.json.
        """
        logger.info("Loading AutoML models ...")
        try:
            params = json.load(
                open(os.path.join(self._results_path, "params.json")))
            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._optimize_metric = params["optimize_metric"]
            stacked_models = params.get("stacked")
            models_map = {}
            for model_path in self._model_paths:
                # ensembles reference other models, so they get the map
                # of models loaded so far
                if model_path.endswith("Ensemble") or model_path.endswith(
                        "Ensemble_Stacked"):
                    ens = Ensemble.load(model_path, models_map)
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m
            if stacked_models is not None:
                self._stacked_models = []
                for stacked_model_name in stacked_models:
                    self._stacked_models += [models_map[stacked_model_name]]
            best_model_name = None
            with open(os.path.join(self._results_path, "best_model.txt"),
                      "r") as fin:
                best_model_name = fin.read()
            self._best_model = models_map[best_model_name]
            data_info_path = os.path.join(self._results_path,
                                          "data_info.json")
            self._data_info = json.load(open(data_info_path))
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}")

    def get_leaderboard(self):
        """Return a DataFrame summarizing all trained models (name, type, metric, time)."""
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._optimize_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)

    def keep_model(self, model):
        """Register a trained model: append it, print a summary, record train time."""
        if model is None:
            return
        self._models += [model]
        self.verbose_print("{} final {} {} time {} seconds".format(
            model.get_name(),
            self._optimize_metric,
            model.get_final_loss(),
            np.round(model.get_train_time(), 2),
        ))
        self.log_train_time(model.get_type(), model.get_train_time())

    def _get_learner_time_limit(self, model_type):
        """Compute the per-learner (per-fold) time budget for the current fit level.

        Returns None for unbounded steps (simple/default algorithms, or when
        no tunable algorithms remain). Implicitly returns None for fit levels
        not handled below.
        """
        logger.debug(
            f"Fit level: {self._fit_level}, model type: {model_type}. " +
            f"Time spend: {json.dumps(self._time_spend, indent=4)}")
        if self._model_time_limit is not None:
            # explicit per-model limit: split it across folds
            k = self._validation.get("k_folds", 1.0)
            return self._model_time_limit / k
        if self._fit_level == "simple_algorithms":
            return None
        if self._fit_level == "default_algorithms":
            return None
        # algorithms that take part in tuning (cheap baselines excluded)
        tune_algorithms = [
            a for a in self._algorithms if a not in
            ["Baseline", "Linear", "Decision Tree", "Nearest Neighbors"]
        ]
        tune_algs_cnt = len(tune_algorithms)
        if tune_algs_cnt == 0:
            return None
        time_elapsed = time.time() - self._start_time
        time_left = self._total_time_limit - time_elapsed
        k_folds = self._validation.get("k_folds", 1.0)
        if self._fit_level == "not_so_random":
            tt = (self._total_time_limit -
                  self._time_spend["simple_algorithms"] -
                  self._time_spend["default_algorithms"])
            if self._stack:
                tt *= (
                    0.6
                )  # leave some time for stacking (approx. 40% for stacking of time left)
            tt /= 2.0  # leave some time for hill-climbing
            tt /= tune_algs_cnt  # give time equally for each algorithm
            tt /= k_folds  # time is per learner (per fold)
            return tt
        if self._fit_level == "hill_climbing":
            tt = (self._total_time_limit -
                  self._time_spend["simple_algorithms"] -
                  self._time_spend["default_algorithms"] -
                  self._time_spend["not_so_random"])
            if self._stack:
                tt *= (
                    0.4
                )  # leave some time for stacking (approx. 60% for stacking of time left)
            tt /= tune_algs_cnt  # give time equally for each algorithm
            tt /= k_folds  # time is per learner (per fold)
            return tt
        if self._stack and self._fit_level == "stack":
            tt = time_left
            tt /= tune_algs_cnt  # give time equally for each algorithm
            tt /= k_folds  # time is per learner (per fold)
            return tt

    def train_model(self, params):
        """Train a single model described by `params` if the time budget allows.

        Creates the model directory, trains and saves the ModelFramework,
        then refreshes the best-model selection on disk.
        """
        model_path = os.path.join(self._results_path, params["name"])
        early_stop = EarlyStopping({
            "metric": {
                "name": self._optimize_metric
            },
            "log_to_dir": model_path
        })
        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit":
            self._get_learner_time_limit(
                params["learner"]["model_type"]),  # self._time_limit,
            "min_steps":
            params["additional"].get("min_steps"),
        })
        total_time_constraint = TotalTimeConstraint({
            "total_time_limit":
            self._total_time_limit if self._model_time_limit is None else None,
            "total_time_start":
            self._start_time,
        })
        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )
        if self._enough_time_to_train(mf.get_type()):
            # self.verbose_print(params["name"] + " training start ...")
            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )
            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")
            mf.train(model_path)
            mf.save(model_path)
            self._model_paths += [model_path]
            self.keep_model(mf)
            # save the best one in the case the training will be interrupted
            self.select_and_save_best()
        else:
            logger.info(
                f"Cannot train {mf.get_type()} because of time constraint")
        # self._progress_bar.update(1)

    def verbose_print(self, msg):
        """Print `msg` when verbose mode is on."""
        if self._verbose:
            # self._progress_bar.write(msg)
            print(msg)

    def log_train_time(self, model_type, train_time):
        """Record `train_time` for `model_type` (list of times per type)."""
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def _enough_time_to_train(self, model_type):
        """Decide whether there is enough remaining budget to train `model_type`.

        Heuristic: always True when a per-model limit is set or no total
        limit exists; otherwise compares elapsed time against per-level and
        per-algorithm budgets.
        """
        # if model_time_limit is set, train every model
        # do not apply total_time_limit
        if self._model_time_limit is not None:
            return True
        # no total time limit, just train, dont ask
        if self._total_time_limit is None:
            return True
        total_time_spend = time.time() - self._start_time
        # no time left, do not train more models, sorry ...
        time_left = self._total_time_limit - total_time_spend
        if time_left < 0:
            return False
        # there is still time and model_type was not tested yet
        # we should try it
        if time_left > 0 and model_type not in self._models_train_time:
            return True
        # check the fit level type
        # we dont want to spend too much time on one level
        if self._fit_level == "not_so_random":
            time_should_use = (self._total_time_limit -
                               self._time_spend["simple_algorithms"] -
                               self._time_spend["default_algorithms"])
            if self._stack:
                time_should_use *= 0.6  # leave time for stacking
            if self._hill_climbing_steps > 0:
                time_should_use /= 2.0  # leave time for hill-climbing
            if (total_time_spend > time_should_use +
                    self._time_spend["simple_algorithms"] +
                    self._time_spend["default_algorithms"]):
                return False
        ##################
        # hill climbing check
        if self._fit_level == "hill_climbing":
            time_should_use = (self._total_time_limit -
                               self._time_spend["simple_algorithms"] -
                               self._time_spend["default_algorithms"] -
                               self._time_spend["not_so_random"])
            if self._stack:
                time_should_use *= 0.4  # leave time for stacking
            if (total_time_spend > time_should_use +
                    self._time_spend["simple_algorithms"] +
                    self._time_spend["default_algorithms"] +
                    self._time_spend["not_so_random"]):
                return False
        model_total_time_spend = (0 if model_type
                                  not in self._models_train_time else np.sum(
                                      self._models_train_time[model_type]))
        model_mean_time_spend = (0 if model_type
                                 not in self._models_train_time else np.mean(
                                     self._models_train_time[model_type]))
        # count only the algorithms that compete for tuning time
        algo_cnt = float(len(self._algorithms))
        for a in ["Baseline", "Decision Tree", "Linear", "Nearest Neighbors"]:
            if a in self._algorithms:
                algo_cnt -= 1.0
        if algo_cnt < 1.0:
            algo_cnt = 1.0
        model_time_left = time_left / algo_cnt
        if model_mean_time_spend <= model_time_left:
            return True
        return False

    def ensemble_step(self, is_stacked=False):
        """Build, fit and persist an Ensemble over the trained models.

        No-op unless ensembling is enabled and at least two models exist.
        """
        if self._train_ensemble and len(self._models) > 1:
            self.ensemble = Ensemble(self._optimize_metric,
                                     self._ml_task,
                                     is_stacked=is_stacked)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)
            ensemble_path = os.path.join(
                self._results_path,
                "Ensemble_Stacked" if is_stacked else "Ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {ensemble_path}")
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]
            # save the best one in the case the training will be interrupted
            self.select_and_save_best()

    def can_we_stack_them(self, y):
        """Placeholder check for stackability; currently always True.

        # if multiclass and too many classes then No
        """
        return True

    def get_stacked_data(self, X, mode="training"):
        """Append out-of-fold (or predicted) columns of stacked models to X.

        Returns X unchanged when no stacked models are set.
        """
        # mode can be `training` or `predict`
        if self._stacked_models is None:
            return X
        all_oofs = []
        for m in self._stacked_models:
            oof = None
            if mode == "training":
                oof = m.get_out_of_folds()
            else:
                oof = m.predict(X)
                if self._ml_task == BINARY_CLASSIFICATION:
                    # keep only the positive-class probability column
                    cols = [f for f in oof.columns if "prediction" in f]
                    if len(cols) == 2:
                        oof = pd.DataFrame({"prediction": oof[cols[1]]})
            cols = [f for f in oof.columns if "prediction" in f]
            oof = oof[cols]
            # prefix prediction columns with the producing model name
            oof.columns = [f"{m.get_name()}_{c}" for c in cols]
            all_oofs += [oof]
        # align on a fresh RangeIndex for concat, then restore original index
        org_index = X.index.copy()
        X.reset_index(drop=True, inplace=True)
        X_stacked = pd.concat(all_oofs + [X], axis=1)
        X_stacked.index = org_index.copy()
        X.index = org_index.copy()
        return X_stacked

    def stack_models(self):
        """Select up to 10 best non-stacked models per type (Baseline excluded).

        Result, sorted by final loss, is stored in `self._stacked_models`;
        no-op if already computed.
        """
        if self._stacked_models is not None:
            return
        ldb = self.get_leaderboard()
        ldb = ldb.sort_values(by="metric_value", ascending=True)
        models_map = {
            m.get_name(): m
            for m in self._models if not m._is_stacked
        }
        self._stacked_models = []
        models_limit = 10
        for model_type in np.unique(ldb.model_type):
            if model_type in ["Baseline"]:
                continue
            ds = ldb[ldb.model_type == model_type].copy()
            ds.sort_values(by="metric_value", inplace=True)
            for n in list(ds.name.iloc[:models_limit].values):
                self._stacked_models += [models_map[n]]
        scores = [m.get_final_loss() for m in self._stacked_models]
        self._stacked_models = [
            self._stacked_models[i] for i in np.argsort(scores).tolist()
        ]

    def stacked_ensemble_step(self):
        """Train stacked versions of the best boosted models on extended data.

        Skipped when there are fewer than 5 models or less than 60 seconds of
        budget remain.
        """
        # print("Stacked models ....")
        # do we have enough models?
        if len(self._models) < 5:
            return
        # do we have time?
        if self._total_time_limit is not None:
            time_left = self._total_time_limit - (time.time() -
                                                  self._start_time)
            # we need at least 60 seconds to do anything
            if time_left < 60:
                return
        # read X directly from parquet
        X = pd.read_parquet(self._X_train_path)
        self.stack_models()
        org_columns = X.columns.tolist()
        X_stacked = self.get_stacked_data(X)
        new_columns = X_stacked.columns.tolist()
        added_columns = [c for c in new_columns if c not in org_columns]
        # save stacked data
        X_train_stacked_path = os.path.join(self._results_path,
                                            "X_train_stacked.parquet")
        X_stacked.to_parquet(X_train_stacked_path, index=False)
        # reuse old params
        for m in self._stacked_models:
            # print(m.get_type())
            # use only Xgboost, LightGBM and CatBoost as stacked models
            if m.get_type() not in ["Xgboost", "LightGBM", "CatBoost"]:
                continue
            params = copy.deepcopy(m.params)
            params["validation"]["X_train_path"] = X_train_stacked_path
            params["name"] = params["name"] + "_Stacked"
            params["is_stacked"] = True
            # print(params)
            if "model_architecture_json" in params["learner"]:
                # the new model will be created with wider input size
                del params["learner"]["model_architecture_json"]
            if self._ml_task == REGRESSION:
                # scale added predictions in regression if the target was scaled (in the case of NN)
                target_preprocessing = params["preprocessing"][
                    "target_preprocessing"]
                scale = None
                if "scale_log_and_normal" in target_preprocessing:
                    scale = "scale_log_and_normal"
                elif "scale_normal" in target_preprocessing:
                    scale = "scale_normal"
                if scale is not None:
                    for col in added_columns:
                        params["preprocessing"]["columns_preprocessing"][
                            col] = [scale]
            self.train_model(params)

    def _set_ml_task(self, y):
        """
        Set and validate the ML task.

        If the ML task is not set, it tries to guess it based on the count of
        unique values in the target. Then it performs validation.
        """
        # if not set, guess
        if self._ml_task is None:
            target_unique_cnt = len(np.unique(y[~pd.isnull(y)]))
            if target_unique_cnt == 2:
                self._ml_task = BINARY_CLASSIFICATION
            elif target_unique_cnt <= 20:
                self._ml_task = MULTICLASS_CLASSIFICATION
            else:
                self._ml_task = REGRESSION
        # validation
        if self._ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise Exception("Unknow Machine Learning task {}."
                            " Supported tasks are: {}".format(
                                self._ml_task,
                                AlgorithmsRegistry.get_supported_ml_tasks()))
        if self._ml_task == REGRESSION:
            # stratification makes no sense for continuous targets
            if "stratify" in self._validation:
                del self._validation["stratify"]
        logger.info("AutoML task to be solved: {}".format(self._ml_task))
        print(f"AutoML task to be solved: { self._ml_task}")

    def _set_algorithms(self):
        """
        Set and validate available algorithms.

        If algorithms are not set, all algorithms from the registry for the
        current ML task are used. Then validation of algorithms is performed.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(
                AlgorithmsRegistry.registry[self._ml_task].keys())
        for a in self._algorithms:
            if a not in list(
                    AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}"
                    .format(
                        a,
                        self._ml_task,
                        list(
                            AlgorithmsRegistry.registry[self._ml_task].keys()),
                    ))
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")

    def _set_metric(self):
        """
        Set and validate the metric to be optimized.

        Defaults: logloss for classification tasks, rmse for regression;
        raises AutoMLException for unsupported user-chosen metrics.
        """
        if self._ml_task == BINARY_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss", "auc"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task))
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task))
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == REGRESSION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "rmse"
            elif self._user_set_optimize_metric not in ["rmse"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task))
            else:
                self._optimize_metric = self._user_set_optimize_metric
        logger.info("AutoML will optimize for metric: {0}".format(
            self._optimize_metric))
        print(f"AutoML will optimize for metric: {self._optimize_metric}")

    def _check_imbalanced(self, y):
        """Reject classification targets with rare classes.

        Requires at least 10 samples and at least 1% of all samples per class.
        """
        v = y.value_counts()
        # at least 10 samples of each class
        ii = v < 10
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples"
            )
        # at least 1% of all samples for each class
        v = y.value_counts(normalize=True) * 100.0
        ii = v < 1.0
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples"
            )

    def _initial_prep(self,
                      X_train,
                      y_train,
                      X_validation=None,
                      y_validation=None):
        """Normalize input types and drop rows with missing target.

        X becomes a DataFrame with string column names and a reset index;
        y becomes a Series named "target".
        """
        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)
        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]
        X_train.reset_index(drop=True, inplace=True)
        if isinstance(y_train, pd.DataFrame):
            if "target" not in y_train.columns:
                raise AutoMLException(
                    "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
                )
            else:
                y_train = y_train["target"]
        y_train = pd.Series(np.array(y_train), name="target")
        X_train, y_train = ExcludeRowsMissingTarget.transform(X_train,
                                                              y_train,
                                                              warn=True)
        return X_train, y_train, X_validation, y_validation

    def _save_data(self,
                   X_train,
                   y_train,
                   X_validation=None,
                   y_validation=None):
        """Persist training data to parquet and write data_info.json.

        Also registers paths in the validation config and empties X_train
        in place (see `_drop_data_variables`) to save memory during fit.
        """
        self._X_train_path = os.path.join(self._results_path,
                                          "X_train.parquet")
        self._y_train_path = os.path.join(self._results_path,
                                          "y_train.parquet")
        X_train.to_parquet(self._X_train_path, index=False)
        pd.DataFrame({
            "target": y_train
        }).to_parquet(self._y_train_path, index=False)
        self._validation["X_train_path"] = self._X_train_path
        self._validation["y_train_path"] = self._y_train_path
        self._validation["results_path"] = self._results_path
        columns_and_target_info = DataInfo.compute(X_train, y_train,
                                                   self._ml_task)
        self._data_info = {
            "columns": X_train.columns.tolist(),
            "rows": X_train.shape[0],
            "cols": X_train.shape[1],
            "target_is_numeric": pd.api.types.is_numeric_dtype(y_train),
            "columns_info": columns_and_target_info["columns_info"],
            "target_info": columns_and_target_info["target_info"],
        }
        if columns_and_target_info.get("num_class") is not None:
            self._data_info["num_class"] = columns_and_target_info["num_class"]
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))
        self._drop_data_variables(X_train)

    def _drop_data_variables(self, X_train):
        """Drop all columns of X_train in place (data already saved to parquet)."""
        X_train.drop(X_train.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        """Restore X_train columns from parquet and remove the temp data files."""
        if X_train.shape[1] == 0:
            X = pd.read_parquet(self._X_train_path)
            for c in X.columns:
                X_train.insert(loc=X_train.shape[1], column=c, value=X[c])
        os.remove(self._X_train_path)
        os.remove(self._y_train_path)

    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML

        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        :param X_validation: Pandas DataFrame with validation data.
            (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data.
            (Not implemented yet)
        """
        try:
            if self._best_model is not None:
                print(
                    "Best model is already set, no need to run fit. Skipping ..."
                )
                return
            self._start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame")
            self._set_ml_task(y_train)
            if X_train is not None:
                # shallow copy: data is shared, caller's frame stays usable
                X_train = X_train.copy(deep=False)
            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation)
            self._save_data(X_train, y_train, X_validation, y_validation)
            self._set_algorithms()
            self._set_metric()
            # self._estimate_training_times()
            if self._ml_task in [
                    BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION
            ]:
                self._check_imbalanced(y_train)
            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._explain_level,
                self._data_info,
                self._seed,
            )
            self.tuner = tuner
            self._time_spend = {}
            self._time_start = {}
            # 1. Check simple algorithms
            self._fit_level = "simple_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.simple_algorithms_params():
                self.train_model(params)
            self._time_spend["simple_algorithms"] = np.round(
                time.time() - start, 2)
            # 2. Default parameters
            self._fit_level = "default_algorithms"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.default_params(len(self._models)):
                self.train_model(params)
            self._time_spend["default_algorithms"] = np.round(
                time.time() - start, 2)
            # 3. The not-so-random step
            self._fit_level = "not_so_random"
            start = time.time()
            self._time_start[self._fit_level] = start
            generated_params = tuner.get_not_so_random_params(len(
                self._models))
            for params in generated_params:
                self.train_model(params)
            self._time_spend["not_so_random"] = np.round(
                time.time() - start, 2)
            # 4. The hill-climbing step
            self._fit_level = "hill_climbing"
            start = time.time()
            self._time_start[self._fit_level] = start
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)
            self._time_spend["hill_climbing"] = np.round(
                time.time() - start, 2)
            # 5. Ensemble unstacked models
            self._fit_level = "ensemble_unstacked"
            start = time.time()
            self._time_start[self._fit_level] = start
            self.ensemble_step()
            self._time_spend["ensemble_unstacked"] = np.round(
                time.time() - start, 2)
            if self._stack:
                # 6. Stack best models
                self._fit_level = "stack"
                start = time.time()
                self._time_start[self._fit_level] = start
                self.stacked_ensemble_step()
                self._time_spend["stack"] = np.round(time.time() - start, 2)
                # 7. Ensemble all models (original and stacked)
                any_stacked = False
                for m in self._models:
                    if m._is_stacked:
                        any_stacked = True
                        break
                if any_stacked:
                    self._fit_level = "ensemble_all"
                    start = time.time()
                    self.ensemble_step(is_stacked=True)
                    self._time_spend["ensemble_all"] = np.round(
                        time.time() - start, 2)
            self._fit_time = time.time() - self._start_time
            logger.info(f"AutoML fit time: {self._fit_time}")
        except Exception as e:
            raise e
        finally:
            if self._X_train_path is not None:
                # restore caller's X_train columns and clean up temp parquet
                self._load_data_variables(X_train)

    def select_and_save_best(self):
        """Pick the model with the lowest final loss and persist run artifacts.

        Writes best_model.txt, params.json, leaderboard.csv and README.md
        (with leaderboard table and plots) into the results directory.
        """
        max_loss = 10e14
        for i, m in enumerate(self._models):
            if m.get_final_loss() < max_loss:
                self._best_model = m
                max_loss = m.get_final_loss()
        with open(os.path.join(self._results_path, "best_model.txt"),
                  "w") as fout:
            fout.write(f"{self._best_model.get_name()}")
        with open(os.path.join(self._results_path, "params.json"),
                  "w") as fout:
            params = {
                "ml_task": self._ml_task,
                "optimize_metric": self._optimize_metric,
                "saved": self._model_paths,
            }
            if self._stacked_models is not None:
                params["stacked"] = [
                    m.get_name() for m in self._stacked_models
                ]
            fout.write(json.dumps(params, indent=4))
        ldb = self.get_leaderboard()
        ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"),
                   index=False)
        # save report
        ldb["Link"] = [
            f"[Results link]({m}/README.md)" for m in ldb["name"].values
        ]
        ldb.insert(loc=0, column="Best model", value="")
        ldb.loc[ldb.name == self._best_model.get_name(),
                "Best model"] = "**the best**"
        with open(os.path.join(self._results_path, "README.md"), "w") as fout:
            fout.write(f"# AutoML Leaderboard\n\n")
            fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
            LeaderboardPlots.compute(ldb, self._results_path, fout)

    def predict(self, X):
        """
        Computes predictions from AutoML best model.

        :param X: The Pandas DataFrame with input data. The input data should
            have the same columns as data used for training, otherwise the
            `AutoMLException` will be raised.
        """
        if self._best_model is None:
            return None
        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]
        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict")
        # reorder columns to match the training layout
        X = X[self._data_info["columns"]]
        # is stacked model
        if self._best_model._is_stacked:
            self.stack_models()
            X_stacked = self.get_stacked_data(X, mode="predict")
            if self._best_model.get_type() == "Ensemble":
                # Ensemble is using both original and stacked data
                predictions = self._best_model.predict(X, X_stacked)
            else:
                predictions = self._best_model.predict(X_stacked)
        else:
            predictions = self._best_model.predict(X)
        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            # column names look like "prediction_<label>"; strip the prefix
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric",
                                                    False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions[
                "label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map({
                True: pos_label,
                False: neg_label
            })
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric",
                                                    False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(int)
            return predictions
        else:
            return predictions

    def to_json(self):
        """Serialize the best model, threshold and ML task to a dict (or None)."""
        if self._best_model is None:
            return None
        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):
        """Restore the best model, threshold and ML task from `to_json` output."""
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(
                json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")
        self._ml_task = json_data.get("ml_task")
class AutoML:
    """
    Automated Machine Learning for supervised tasks
    (binary classification, multiclass classification, regression).
    """

    def __init__(
        self,
        results_path=None,
        total_time_limit=60 * 60,
        model_time_limit=None,
        algorithms=None,
        tuning_mode="Sport",
        train_ensemble=True,
        optimize_metric=None,
        validation=None,
        verbose=True,
        ml_task=None,
        seed=1,
    ):
        """
        Create the AutoML object. Initialize directory for results.

        :param results_path: The path where all results will be saved.
            If left `None` then the name of directory will be generated, with schema: AutoML_{number},
            where number can be from 1 to 100 - depends which directory name will be available.
            If the `results_path` will point to directory with AutoML results, then all models will be loaded.
        :param total_time_limit: The time limit in seconds for AutoML training.
            It is not used when `model_time_limit` is not `None`.
        :param model_time_limit: The time limit in seconds for training single model.
            If `model_time_limit` is set, the `total_time_limit` is not respected.
            Single model can contain several learners, for example in the case of 10-fold cross-validation,
            one model will have 10 learners. Based on `model_time_limit` the time limit for single learner is computed.
        :param algorithms: The list of algorithms that will be used in the training.
            Defaults to `["Random Forest", "Xgboost"]`.
        :param tuning_mode: The mode for tuning. It can be: `Normal`, `Sport`, `Insane`, `Perfect`.
            The names are kept the same as in https://mljar.com application.
            Each mode describes how many models will be checked:
            - `Normal` - about 5-10 models of each algorithm will be trained,
            - `Sport` - about 10-15 models of each algorithm will be trained,
            - `Insane` - about 15-20 models of each algorithm will be trained,
            - `Perfect` - about 25-35 models of each algorithm will be trained.
            You can also set how many models will be trained with `set_advanced` method.
        :param train_ensemble: If true then at the end of models training the ensemble will be created.
        :param optimize_metric: The metric to be optimized. (not implemented yet, please leave `None`)
        :param validation: The JSON with validation type. Right now only Cross-Validation is supported.
            Defaults to `{"validation_type": "kfold", "k_folds": 5, "shuffle": True}`.
        :param verbose: Not implemented yet.
        :param ml_task: The machine learning task that will be solved. Can be:
            `"binary_classification", "multiclass_classification", "regression"`.
            If left `None` AutoML will try to guess the task based on target values.
            If there will be only 2 values in the target, then task will be set to `"binary_classification"`.
            If number of values in the target will be between 2 and 20 (included),
            then task will be set to `"multiclass_classification"`.
            In all other cases, the task is set to `"regression"`.
        :param seed: The seed for random generator.
        """
        logger.debug("AutoML.__init__")
        # FIX: `algorithms` and `validation` were mutable default arguments;
        # `_save_data` mutates `self._validation`, so the shared default dict
        # would leak state across AutoML instances. Use None sentinels instead.
        if algorithms is None:
            algorithms = ["Random Forest", "Xgboost"]
        if validation is None:
            validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True}
        # total_time_limit is the time for computing for all models
        # model_time_limit is the time for computing a single model
        # if model_time_limit is None then its value is computed from total_time_limit
        # if total_time_limit is set and model_time_limit is set,
        # then total_time_limit constraint will be omitted
        self._total_time_limit = total_time_limit
        self._model_time_limit = model_time_limit
        # time limit in seconds for single learner (model consists of learners)
        # the value is computed before fit, initialize with any number
        self._time_limit = 1
        self._train_ensemble = train_ensemble
        self._models = []  # instances of iterative learner framework or ensemble
        # it is instance of model framework or ensemble
        self._best_model = None
        self._validation = validation
        # FIX: was hard-coded to "Sport", silently ignoring the `tuning_mode`
        # argument the caller passed in.
        self.set_tuning_mode(tuning_mode)
        self._algorithms = algorithms
        self._verbose = verbose
        self._fit_time = None
        self._models_train_time = {}
        self._threshold, self._metrics_details, self._max_metrics, self._confusion_matrix = (
            None,
            None,
            None,
            None,
        )
        self._seed = seed
        self._user_set_optimize_metric = optimize_metric
        self._ml_task = ml_task
        self._tuner_params = {
            "start_random_models": self._start_random_models,
            "hill_climbing_steps": self._hill_climbing_steps,
            "top_models_to_improve": self._top_models_to_improve,
        }
        self._X_train_path, self._y_train_path = None, None
        self._X_validation_path, self._y_validation_path = None, None
        self._data_info = None
        self._model_paths = []
        self._results_path = results_path
        self._set_results_dir()

    def set_tuning_mode(self, mode="Normal"):
        """Set tuning intensity (number of models checked) for the given mode.

        Any unrecognized mode falls back to the `Normal` settings.
        """
        # FIX: this was an `if/if/if/else` chain, so "Sport" and "Insane"
        # matched their branch and then fell through to the final `else`,
        # getting overwritten with the "Normal" settings. Use `elif`.
        if mode == "Sport":
            self._start_random_models = 10
            self._hill_climbing_steps = 2
            self._top_models_to_improve = 3
        elif mode == "Insane":
            self._start_random_models = 15
            self._hill_climbing_steps = 3
            self._top_models_to_improve = 4
        elif mode == "Perfect":
            self._start_random_models = 25
            self._hill_climbing_steps = 5
            self._top_models_to_improve = 5
        else:  # Normal
            self._start_random_models = 5
            self._hill_climbing_steps = 1
            self._top_models_to_improve = 2

    def set_advanced(
        self, start_random_models=1, hill_climbing_steps=0, top_models_to_improve=0
    ):
        """
        Advanced set of tuning parameters.

        :param start_random_models: Number of not-so-random models to check for each algorithm.
        :param hill_climbing_steps: Number of hill climbing steps during tuning.
        :param top_models_to_improve: Number of top models (of each algorithm)
            which will be considered for improving in hill climbing steps.
        """
        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve

    def _set_results_dir(self):
        """Pick or create the results directory; load previous results if present."""
        if self._results_path is None:
            # auto-generate AutoML_1 ... AutoML_100, first free name wins
            found = False
            for i in range(1, 101):
                self._results_path = f"AutoML_{i}"
                if not os.path.exists(self._results_path):
                    found = True
                    break
            if not found:
                raise AutoMLException("Cannot create directory for AutoML results")
        if os.path.exists(self._results_path) and os.path.exists(
            os.path.join(self._results_path, "params.json")
        ):
            # directory holds a previous AutoML run - load it
            print(f"Directory {self._results_path} already exists")
            self.load()
        elif self._results_path is not None:
            if not os.path.exists(self._results_path):
                print(f"Create directory {self._results_path}")
                try:
                    os.mkdir(self._results_path)
                except Exception as e:
                    raise AutoMLException(
                        f"Cannot create directory {self._results_path}"
                    ) from e
            elif os.path.exists(self._results_path) and len(
                os.listdir(self._results_path)
            ):
                # existing non-empty directory without params.json is unusable
                raise AutoMLException(
                    f"Cannot set directory for AutoML. Directory {self._results_path} is not empty."
                )
        else:
            raise AutoMLException("Cannot set directory for AutoML results")

    def load(self):
        """Load models, params and data info from the results directory."""
        logger.info("Loading AutoML models ...")
        try:
            # FIX: use context managers instead of json.load(open(...)),
            # which leaked file handles.
            with open(os.path.join(self._results_path, "params.json")) as fin:
                params = json.load(fin)
            self._model_paths = params["saved"]
            self._ml_task = params["ml_task"]
            self._optimize_metric = params["optimize_metric"]
            models_map = {}
            for model_path in self._model_paths:
                if model_path.endswith("ensemble"):
                    ens = Ensemble.load(model_path, models_map)
                    # FIX: the ensemble was registered in models_map but never
                    # appended to self._models (BaseAutoML.load does append it),
                    # so it was missing from the leaderboard after a reload.
                    self._models += [ens]
                    models_map[ens.get_name()] = ens
                else:
                    m = ModelFramework.load(model_path)
                    self._models += [m]
                    models_map[m.get_name()] = m
            with open(
                os.path.join(self._results_path, "best_model.txt"), "r"
            ) as fin:
                best_model_name = fin.read()
            self._best_model = models_map[best_model_name]
            data_info_path = os.path.join(self._results_path, "data_info.json")
            with open(data_info_path) as fin:
                self._data_info = json.load(fin)
        except Exception as e:
            raise AutoMLException(f"Cannot load AutoML directory. {str(e)}") from e

    def _estimate_training_times(self):
        """Estimate models count and derive the per-learner time limit."""
        # single models including models in the folds
        self._estimated_models_to_check = (
            len(self._algorithms) * self._start_random_models
            + self._top_models_to_improve * self._hill_climbing_steps * 2
        )
        if self._model_time_limit is not None:
            k = self._validation.get("k_folds", 1.0)
            self._time_limit = self._model_time_limit / k
        elif self._total_time_limit is not None:
            # set time limit for single model training
            # the 0.85 is safe scale factor, to not exceed time limit
            # scaling is added because number of models to be trained are estimate
            k = self._validation.get("k_folds", 1.0)
            self._time_limit = (
                self._total_time_limit * 0.85 / self._estimated_models_to_check / k
            )
        print(
            f"AutoML will try to check about {int(self._estimated_models_to_check)} models"
        )

    def get_leaderboard(self):
        """Return a DataFrame with name, type, metric and train time per model."""
        ldb = {
            "name": [],
            "model_type": [],
            "metric_type": [],
            "metric_value": [],
            "train_time": [],
        }
        for m in self._models:
            ldb["name"] += [m.get_name()]
            ldb["model_type"] += [m.get_type()]
            ldb["metric_type"] += [self._optimize_metric]
            ldb["metric_value"] += [m.get_final_loss()]
            ldb["train_time"] += [np.round(m.get_train_time(), 2)]
        return pd.DataFrame(ldb)

    def get_additional_metrics(self):
        """Compute, log and persist task-specific metrics of the best model."""
        additional_metrics = self._best_model.get_additional_metrics()
        if self._ml_task == BINARY_CLASSIFICATION:
            self._metrics_details = additional_metrics["metric_details"]
            self._max_metrics = additional_metrics["max_metrics"]
            self._confusion_matrix = additional_metrics["confusion_matrix"]
            self._threshold = additional_metrics["threshold"]
            logger.info(
                "Metric details:\n{}\n\nConfusion matrix:\n{}".format(
                    self._max_metrics.transpose(), self._confusion_matrix
                )
            )
            with open(
                os.path.join(self._results_path, "best_model_metrics.txt"), "w"
            ) as fout:
                fout.write(
                    "Metric details:\n{}\n\nConfusion matrix:\n{}".format(
                        self._max_metrics.transpose(), self._confusion_matrix
                    )
                )
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            max_metrics = additional_metrics["max_metrics"]
            confusion_matrix = additional_metrics["confusion_matrix"]
            logger.info(
                "Metric details:\n{}\nConfusion matrix:\n{}".format(
                    max_metrics, confusion_matrix
                )
            )
            with open(
                os.path.join(self._results_path, "best_model_metrics.txt"), "w"
            ) as fout:
                fout.write("Metric details:\n{}\n\n".format(max_metrics.transpose()))
                fout.write("Confusion matrix:\n{}".format(confusion_matrix))

    def keep_model(self, model):
        """Register a trained model and log its final score and train time."""
        if model is None:
            return
        self._models += [model]
        self.verbose_print(
            "{} final {} {} time {} seconds".format(
                model.get_type(),
                self._optimize_metric,
                model.get_final_loss(),
                np.round(model.get_train_time(), 2),
            )
        )
        self.log_train_time(model.get_type(), model.get_train_time())

    def train_model(self, params):
        """Train a single model described by `params` if time budget allows."""
        model_path = os.path.join(self._results_path, params["name"])
        early_stop = EarlyStopping(
            {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
        )
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        mf = ModelFramework(params, callbacks=[early_stop, time_constraint])
        if self._enough_time_to_train(mf.get_type()):
            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )
            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}") from e
            mf.train()  # {"train": {"X": X, "y": y}})
            mf.save(model_path)
            self._model_paths += [model_path]
            self.keep_model(mf)
        else:
            logger.info(
                f"Cannot check more models of {mf.get_type()} because of time constraint"
            )

    def verbose_print(self, msg):
        # Print only when verbose mode is on.
        if self._verbose:
            print(msg)

    def log_train_time(self, model_type, train_time):
        """Accumulate per-algorithm training times (seconds)."""
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def _enough_time_to_train(self, model_type):
        """Return True if another model of `model_type` fits the time budget."""
        # if model_time_limit is set, train every model
        # do not apply total_time_limit
        if self._model_time_limit is not None:
            return True
        # no total time limit, just train, dont ask
        if self._total_time_limit is None:
            return True
        total_time_already_spend = (
            0
            if model_type not in self._models_train_time
            else np.sum(self._models_train_time[model_type])
        )
        mean_time_already_spend = (
            0
            if model_type not in self._models_train_time
            else np.mean(self._models_train_time[model_type])
        )
        # spent time plus one average train must stay within this algorithm's
        # share of 85% of the total budget
        return (
            total_time_already_spend + mean_time_already_spend
            < 0.85 * self._total_time_limit / float(len(self._algorithms))
        )

    def ensemble_step(self):
        """Fit and persist the ensemble of all trained models (if enabled)."""
        if self._train_ensemble:
            self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
            oofs, target = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(oofs, target)
            self.keep_model(self.ensemble)
            ensemble_path = os.path.join(self._results_path, "ensemble")
            try:
                os.mkdir(ensemble_path)
            except Exception as e:
                raise AutoMLException(
                    f"Cannot create directory {ensemble_path}"
                ) from e
            self.ensemble.save(ensemble_path)
            self._model_paths += [ensemble_path]

    def _set_ml_task(self, y):
        """ Set and validate the ML task.

        If ML task is not set, it tries to guess ML task based on count of
        unique values in the target. Then it performs validation.
        """
        # if not set, guess
        if self._ml_task is None:
            target_unique_cnt = len(np.unique(y[~pd.isnull(y)]))
            if target_unique_cnt == 2:
                self._ml_task = BINARY_CLASSIFICATION
            elif target_unique_cnt <= 20:
                self._ml_task = MULTICLASS_CLASSIFICATION
            else:
                self._ml_task = REGRESSION
        # validation
        if self._ml_task not in AlgorithmsRegistry.get_supported_ml_tasks():
            raise Exception(
                "Unknow Machine Learning task {}."
                " Supported tasks are: {}".format(
                    self._ml_task, AlgorithmsRegistry.get_supported_ml_tasks()
                )
            )
        logger.info("AutoML task to be solved: {}".format(self._ml_task))
        print(f"AutoML task to be solved: { self._ml_task}")

    def _set_algorithms(self):
        """ Set and validate available algorithms.

        If algorithms are not set, all algorithms from registry are used.
        Then perform validation of algorithms.
        """
        if len(self._algorithms) == 0:
            self._algorithms = list(AlgorithmsRegistry.registry[self._ml_task].keys())
        for a in self._algorithms:
            if a not in list(AlgorithmsRegistry.registry[self._ml_task].keys()):
                raise AutoMLException(
                    "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}".format(
                        a,
                        self._ml_task,
                        list(AlgorithmsRegistry.registry[self._ml_task].keys()),
                    )
                )
        logger.info("AutoML will use algorithms: {}".format(self._algorithms))
        print(f"AutoML will use algorithms: {self._algorithms}")

    def _set_metric(self):
        """ Set and validate the metric to be optimized. """
        if self._ml_task == BINARY_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss", "auc"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "logloss"
            elif self._user_set_optimize_metric not in ["logloss"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        elif self._ml_task == REGRESSION:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = "mse"
            elif self._user_set_optimize_metric not in ["mse"]:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            else:
                self._optimize_metric = self._user_set_optimize_metric
        logger.info(
            "AutoML will optimize for metric: {0}".format(self._optimize_metric)
        )
        print(f"AutoML will optimize for metric: {self._optimize_metric}")

    def _check_imbalanced(self, y):
        """Raise AutoMLException when a class is too rare (count or percent)."""
        v = y.value_counts()
        # at least 10 samples of each class
        ii = v < 10
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples"
            )
        # at least 1% of all samples for each class
        v = y.value_counts(normalize=True) * 100.0
        ii = v < 1.0
        if np.sum(ii):
            raise AutoMLException(
                f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples"
            )

    def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):
        """Normalize training frames and drop rows with missing target."""
        if not isinstance(X_train, pd.DataFrame):
            X_train = pd.DataFrame(X_train)
        # column names must be strings for consistent column matching
        if not isinstance(X_train.columns[0], str):
            X_train.columns = [str(c) for c in X_train.columns]
        X_train.reset_index(drop=True, inplace=True)
        y_train = pd.Series(np.array(y_train), name="target")
        X_train, y_train = ExcludeRowsMissingTarget.transform(
            X_train, y_train, warn=True
        )
        return X_train, y_train, X_validation, y_validation

    def _save_data(self, X_train, y_train, X_validation=None, y_validation=None):
        """Persist training data to parquet and record data info."""
        self._X_train_path = os.path.join(self._results_path, "X_train.parquet")
        self._y_train_path = os.path.join(self._results_path, "y_train.parquet")
        X_train.to_parquet(self._X_train_path, index=False)
        pd.DataFrame({"target": y_train}).to_parquet(self._y_train_path, index=False)
        self._validation["X_train_path"] = self._X_train_path
        self._validation["y_train_path"] = self._y_train_path
        self._validation["results_path"] = self._results_path
        self._data_info = {
            "columns": X_train.columns.tolist(),
            "rows": X_train.shape[0],
            "cols": X_train.shape[1],
            "target_is_numeric": pd.api.types.is_numeric_dtype(y_train),
        }
        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path, "w") as fout:
            fout.write(json.dumps(self._data_info, indent=4))

    def _del_data_variables(self, X_train, y_train):
        # Drop all columns in place to free memory during training;
        # data is reloaded from parquet afterwards.
        X_train.drop(X_train.columns, axis=1, inplace=True)

    def _load_data_variables(self, X_train):
        """Restore columns into the caller's (emptied) X_train and remove files."""
        X = pd.read_parquet(self._X_train_path)
        for c in X.columns:
            X_train.insert(loc=X_train.shape[1], column=c, value=X[c])
        os.remove(self._X_train_path)
        os.remove(self._y_train_path)

    def fit(self, X_train, y_train, X_validation=None, y_validation=None):
        """
        Fit AutoML

        :param X_train: Pandas DataFrame with training data.
        :param y_train: Numpy Array with target training data.
        :param X_validation: Pandas DataFrame with validation data. (Not implemented yet)
        :param y_validation: Numpy Array with target of validation data. (Not implemented yet)
        """
        # FIX: the original wrapped this in `except Exception as e: raise e`,
        # which is a no-op; the `finally` clause is what matters here.
        try:
            if self._best_model is not None:
                print("Best model is already set, no need to run fit. Skipping ...")
                return
            start_time = time.time()
            if not isinstance(X_train, pd.DataFrame):
                raise AutoMLException(
                    "AutoML needs X_train matrix to be a Pandas DataFrame"
                )
            # shallow copy so the caller's frame object is not replaced
            if X_train is not None:
                X_train = X_train.copy(deep=False)
            X_train, y_train, X_validation, y_validation = self._initial_prep(
                X_train, y_train, X_validation, y_validation
            )
            self._save_data(X_train, y_train, X_validation, y_validation)
            self._set_ml_task(y_train)
            self._set_algorithms()
            self._set_metric()
            self._estimate_training_times()
            if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
                self._check_imbalanced(y_train)
            tuner = MljarTuner(
                self._tuner_params,
                self._algorithms,
                self._ml_task,
                self._validation,
                self._seed,
            )
            # not so random step
            generated_params = tuner.get_not_so_random_params(X_train, y_train)
            self._del_data_variables(X_train, y_train)
            for params in generated_params:
                self.train_model(params)
            # hill climbing
            for params in tuner.get_hill_climbing_params(self._models):
                self.train_model(params)
            self.ensemble_step()
            # pick the model with the lowest final loss
            best_loss = 10e12
            for m in self._models:
                if m.get_final_loss() < best_loss:
                    self._best_model = m
                    best_loss = m.get_final_loss()
            self.get_additional_metrics()
            self._fit_time = time.time() - start_time
            with open(
                os.path.join(self._results_path, "best_model.txt"), "w"
            ) as fout:
                fout.write(f"{self._best_model.get_name()}")
            with open(os.path.join(self._results_path, "params.json"), "w") as fout:
                params = {
                    "ml_task": self._ml_task,
                    "optimize_metric": self._optimize_metric,
                    "saved": self._model_paths,
                }
                fout.write(json.dumps(params, indent=4))
            ldb = self.get_leaderboard()
            ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False)
            # save report
            ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values]
            ldb.insert(loc=0, column="Best model", value="")
            ldb.loc[
                ldb.name == self._best_model.get_name(), "Best model"
            ] = "*** the best ***"
            with open(os.path.join(self._results_path, "README.md"), "w") as fout:
                fout.write(f"# AutoML Leaderboard\n\n")
                fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe"))
        finally:
            # restore the caller's X_train columns that _del_data_variables dropped
            if self._X_train_path is not None:
                self._load_data_variables(X_train)

    def predict(self, X):
        """
        Computes predictions from AutoML best model.

        :param X: The Pandas DataFrame with input data. The input data should
            have the same columns as data used for training, otherwise the
            `AutoMLException` will be raised.
        """
        if self._best_model is None:
            return None
        if not isinstance(X.columns[0], str):
            X.columns = [str(c) for c in X.columns]
        input_columns = X.columns.tolist()
        for column in self._data_info["columns"]:
            if column not in input_columns:
                raise AutoMLException(
                    f"Missing column: {column} in input data. Cannot predict"
                )
        # reorder columns to the training-time layout
        X = X[self._data_info["columns"]]
        predictions = self._best_model.predict(X)
        if self._ml_task == BINARY_CLASSIFICATION:
            # need to predict the label based on predictions and threshold
            neg_label, pos_label = (
                predictions.columns[0][11:],
                predictions.columns[1][11:],
            )
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                neg_label = int(neg_label)
                pos_label = int(pos_label)
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        elif self._ml_task == MULTICLASS_CLASSIFICATION:
            target_is_numeric = self._data_info.get("target_is_numeric", False)
            if target_is_numeric:
                predictions["label"] = predictions["label"].astype(int)
            return predictions
        else:
            return predictions

    def to_json(self):
        """Serialize best model, threshold and task; None when not fitted."""
        if self._best_model is None:
            return None
        return {
            "best_model": self._best_model.to_json(),
            "threshold": self._threshold,
            "ml_task": self._ml_task,
        }

    def from_json(self, json_data):
        """Restore the best model (Ensemble or ModelFramework) from JSON."""
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = ModelFramework(json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")
        self._ml_task = json_data.get("ml_task")
class AutoML: def __init__( self, results_path=None, total_time_limit=60 * 60, algorithms=["Random Forest", "Xgboost"], # , "Random Forest"], start_random_models=10, hill_climbing_steps=3, top_models_to_improve=5, train_ensemble=True, verbose=True, optimize_metric=None, ml_task=None, seed=1, ): logger.debug("AutoML.__init__") self._total_time_limit = total_time_limit # time limit in seconds for single learner self._time_limit = 1 # wtf self._train_ensemble = train_ensemble self._models = [] # instances of iterative learner framework or ensemble # it is instance of model framework or ensemble self._best_model = None # default validation self._validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True} self._start_random_models = start_random_models self._hill_climbing_steps = hill_climbing_steps self._top_models_to_improve = top_models_to_improve self._algorithms = algorithms self._verbose = verbose self._fit_time = None self._models_train_time = {} self._threshold, self._metrics_details, self._max_metrics, self._confusion_matrix = ( None, None, None, None, ) self._seed = seed self._user_set_optimize_metric = optimize_metric self._ml_task = ml_task self._tuner_params = { "start_random_models": self._start_random_models, "hill_climbing_steps": self._hill_climbing_steps, "top_models_to_improve": self._top_models_to_improve, } self._X_train_path, self._y_train_path = None, None self._X_validation_path, self._y_validation_path = None, None self._data_info = None self._model_paths = [] self._results_path = results_path self._set_results_dir() def _set_results_dir(self): if self._results_path is None: found = False for i in range(1, 101): self._results_path = f"AutoML_{i}" if not os.path.exists(self._results_path): found = True break if not found: raise AutoMLException("Cannot create directory for AutoML results") if os.path.exists(self._results_path): print(f"Directory {self._results_path} already exists") self.load() elif self._results_path is not None: 
print(f"Create directory {self._results_path}") try: os.mkdir(self._results_path) except Exception as e: raise AutoMLException(f"Cannot create directory {self._results_path}") def load(self): logger.info("Loading AutoML models ...") params = json.load(open(os.path.join(self._results_path, "params.json"))) self._model_paths = params["saved"] self._ml_task = params["ml_task"] self._optimize_metric = params["optimize_metric"] models_map = {} for model_path in self._model_paths: if model_path.endswith("ensemble"): ens = Ensemble.load(model_path, models_map) models_map[ens.get_name()] = ens else: m = ModelFramework.load(model_path) self._models += [m] models_map[m.get_name()] = m best_model_name = None with open(os.path.join(self._results_path, "best_model.txt"), "r") as fin: best_model_name = fin.read() self._best_model = models_map[best_model_name] data_info_path = os.path.join(self._results_path, "data_info.json") self._data_info = json.load(open(data_info_path)) print("data info", self._data_info) def _estimate_training_times(self): # single models including models in the folds self._estimated_models_to_check = ( len(self._algorithms) * self._start_random_models + self._top_models_to_improve * self._hill_climbing_steps * 2 ) if self._total_time_limit is not None: # set time limit for single model training # the 0.85 is safe scale factor, to not exceed time limit k = self._validation.get("k_folds", 1.0) self._time_limit = ( self._total_time_limit * 0.85 / self._estimated_models_to_check / k ) print( f"AutoML will try to check about {int(self._estimated_models_to_check)} models" ) def get_leaderboard(self): ldb = { "name": [], "model_type": [], "metric_type": [], "metric_value": [], "train_time": [], } for m in self._models: ldb["name"] += [m.get_name()] ldb["model_type"] += [m.get_type()] ldb["metric_type"] += [self._optimize_metric] ldb["metric_value"] += [m.get_final_loss()] ldb["train_time"] += [np.round(m.get_train_time(), 2)] return pd.DataFrame(ldb) def 
get_additional_metrics(self): # 'target' - the target after processing used for model training # 'prediction' - out of folds predictions of the model # oof_predictions = self._best_model.get_out_of_folds() # prediction_cols = [c for c in oof_predictions.columns if "prediction" in c] # target_cols = [c for c in oof_predictions.columns if "target" in c] additional_metrics = self._best_model.get_additional_metrics() # AdditionalMetrics.compute( # oof_predictions[target_cols], # oof_predictions[prediction_cols], # self._ml_task, # ) if self._ml_task == BINARY_CLASSIFICATION: self._metrics_details = additional_metrics["metric_details"] self._max_metrics = additional_metrics["max_metrics"] self._confusion_matrix = additional_metrics["confusion_matrix"] self._threshold = additional_metrics["threshold"] logger.info( "Metric details:\n{}\n\nConfusion matrix:\n{}".format( self._max_metrics.transpose(), self._confusion_matrix ) ) with open( os.path.join(self._results_path, "best_model_metrics.txt"), "w" ) as fout: fout.write( "Metric details:\n{}\n\nConfusion matrix:\n{}".format( self._max_metrics.transpose(), self._confusion_matrix ) ) elif self._ml_task == MULTICLASS_CLASSIFICATION: max_metrics = additional_metrics["max_metrics"] confusion_matrix = additional_metrics["confusion_matrix"] logger.info( "Metric details:\n{}\nConfusion matrix:\n{}".format( max_metrics, confusion_matrix ) ) with open( os.path.join(self._results_path, "best_model_metrics.txt"), "w" ) as fout: fout.write("Metric details:\n{}\n\n".format(max_metrics.transpose())) fout.write("Confusion matrix:\n{}".format(confusion_matrix)) def keep_model(self, model): if model is None: return self._models += [model] self.verbose_print( "{} final {} {} time {} seconds".format( model.get_type(), self._optimize_metric, model.get_final_loss(), np.round(model.get_train_time(), 2), ) ) self.log_train_time(model.get_type(), model.get_train_time()) def train_model(self, params): model_path = os.path.join(self._results_path, 
f"model_{len(self._models)+1}") early_stop = EarlyStopping( {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path} ) time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit}) mf = ModelFramework(params, callbacks=[early_stop, time_constraint]) if self._enough_time_to_train(mf.get_type()): logger.info(f"Train model #{len(self._models)+1}") try: os.mkdir(model_path) except Exception as e: raise AutoMLException(f"Cannot create directory {model_path}") mf.train() # {"train": {"X": X, "y": y}}) mf.save(model_path) self._model_paths += [model_path] self.keep_model(mf) else: logger.info( f"Cannot check more models of {mf.get_type()} because of time constraint" ) # self._progress_bar.update(1) def verbose_print(self, msg): if self._verbose: # self._progress_bar.write(msg) print(msg) def log_train_time(self, model_type, train_time): if model_type in self._models_train_time: self._models_train_time[model_type] += [train_time] else: self._models_train_time[model_type] = [train_time] def _enough_time_to_train(self, model_type): # no time limit, just train, dont ask if self._total_time_limit is None: return True total_time_already_spend = ( 0 if model_type not in self._models_train_time else np.sum(self._models_train_time[model_type]) ) mean_time_already_spend = ( 0 if model_type not in self._models_train_time else np.mean(self._models_train_time[model_type]) ) if ( total_time_already_spend + mean_time_already_spend < 0.85 * self._total_time_limit / float(len(self._algorithms)) ): return True return False def ensemble_step(self): if self._train_ensemble: self.ensemble = Ensemble(self._optimize_metric, self._ml_task) oofs, target = self.ensemble.get_oof_matrix(self._models) self.ensemble.fit(oofs, target) self.keep_model(self.ensemble) ensemble_path = os.path.join(self._results_path, "ensemble") try: os.mkdir(ensemble_path) except Exception as e: raise AutoMLException(f"Cannot create directory {ensemble_path}") 
self.ensemble.save(ensemble_path) self._model_paths += [ensemble_path] def _set_ml_task(self, y): """ Set and validate the ML task. If ML task is not set, it trys to guess ML task based on count of unique values in the target. Then it performs validation. """ # if not set, guess if self._ml_task is None: target_unique_cnt = len(np.unique(y[~pd.isnull(y)])) if target_unique_cnt == 2: self._ml_task = BINARY_CLASSIFICATION elif target_unique_cnt <= 20: self._ml_task = MULTICLASS_CLASSIFICATION else: self._ml_task = REGRESSION # validation if self._ml_task not in AlgorithmsRegistry.get_supported_ml_tasks(): raise Exception( "Unknow Machine Learning task {}." " Supported tasks are: {}".format( self._ml_task, AlgorithmsRegistry.get_supported_ml_tasks() ) ) logger.info("AutoML task to be solved: {}".format(self._ml_task)) print(f"AutoML task to be solved: { self._ml_task}") def _set_algorithms(self): """ Set and validate available algorithms. If algorithms are not set, all algorithms from registry are used. Then perform vadlidation of algorithms. """ if len(self._algorithms) == 0: self._algorithms = list(AlgorithmsRegistry.registry[self._ml_task].keys()) for a in self._algorithms: if a not in list(AlgorithmsRegistry.registry[self._ml_task].keys()): raise AutoMLException( "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}".format( a, self._ml_task, list(AlgorithmsRegistry.registry[self._ml_task].keys()) ) ) logger.info("AutoML will use algorithms: {}".format(self._algorithms)) print(f"AutoML will use algorithms: {self._algorithms}") def _set_metric(self): """ Set and validate the metric to be optimized. 
""" if self._ml_task == BINARY_CLASSIFICATION: if self._user_set_optimize_metric is None: self._optimize_metric = "logloss" elif self._user_set_optimize_metric not in ["logloss", "auc"]: raise AutoMLException( "Metric {} is not allowed in ML task: {}".format( self._user_set_optimize_metric, self._ml_task ) ) else: self._optimize_metric = self._user_set_optimize_metric elif self._ml_task == MULTICLASS_CLASSIFICATION: if self._user_set_optimize_metric is None: self._optimize_metric = "logloss" elif self._user_set_optimize_metric not in ["logloss"]: raise AutoMLException( "Metric {} is not allowed in ML task: {}".format( self._user_set_optimize_metric, self._ml_task ) ) else: self._optimize_metric = self._user_set_optimize_metric elif self._ml_task == REGRESSION: if self._user_set_optimize_metric is None: self._optimize_metric = "mse" elif self._user_set_optimize_metric not in ["mse"]: raise AutoMLException( "Metric {} is not allowed in ML task: {}".format( self._user_set_optimize_metric, self._ml_task ) ) else: self._optimize_metric = self._user_set_optimize_metric logger.info( "AutoML will optimize for metric: {0}".format(self._optimize_metric) ) print(f"AutoML will optimize for metric: {self._optimize_metric}") def _check_imbalanced(self, y): v = y.value_counts() # at least 10 samples of each class ii = v < 10 if np.sum(ii): raise AutoMLException( f"There need to be at least 10 samples of each class, for class {list(v[ii].index)} there is {v[ii].values} samples" ) # at least 1% of all samples for each class v = y.value_counts(normalize=True) * 100.0 ii = v < 1.0 if np.sum(ii): raise AutoMLException( f"There need to be at least 1% of samples of each class, for class {list(v[ii].index)} there is {v[ii].values} % of samples" ) def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None): if not isinstance(X_train, pd.DataFrame): X_train = pd.DataFrame(X_train) if not isinstance(X_train.columns[0], str): X_train.columns = [str(c) for c in 
X_train.columns] X_train.reset_index(drop=True, inplace=True) if not isinstance(y_train, pd.DataFrame): y_train = pd.DataFrame({"target": np.array(y_train)}) else: if "target" not in y_train.columns: raise AutoMLException("There should be target column in y_train") y_train.reset_index(drop=True, inplace=True) return X_train, y_train["target"], X_validation, y_validation def _save_data(self, X_train, y_train, X_validation=None, y_validation=None): self._X_train_path = os.path.join(self._results_path, "X_train.csv") self._y_train_path = os.path.join(self._results_path, "y_train.csv") X_train.to_parquet(self._X_train_path, index=False) pd.DataFrame({"target": y_train}).to_parquet(self._y_train_path, index=False) self._validation["X_train_path"] = self._X_train_path self._validation["y_train_path"] = self._y_train_path self._validation["results_path"] = self._results_path self._data_info = { "columns": X_train.columns.tolist(), "rows": X_train.shape[0], "cols": X_train.shape[1], } data_info_path = os.path.join(self._results_path, "data_info.json") with open(data_info_path, "w") as fout: fout.write(json.dumps(self._data_info, indent=4)) def _del_data_variables(self, X_train, y_train): X_train.drop(X_train.columns, axis=1, inplace=True) def _load_data_variables(self, X_train): X = pd.read_parquet(self._X_train_path) for c in X.columns: X_train.insert(loc=X_train.shape[1], column=c, value=X[c]) os.remove(self._X_train_path) os.remove(self._y_train_path) def fit(self, X_train, y_train, X_validation=None, y_validation=None): if self._best_model is not None: print("Best model is already set, no need to run fit. 
Skipping ...") return start_time = time.time() if not isinstance(X_train, pd.DataFrame): raise AutoMLException( "AutoML needs X_train matrix to be a Pandas DataFrame" ) if X_train is not None: X_train = X_train.copy(deep=False) X_train, y_train, X_validation, y_validation = self._initial_prep( X_train, y_train, X_validation, y_validation ) self._save_data(X_train, y_train, X_validation, y_validation) self._set_ml_task(y_train) self._set_algorithms() self._set_metric() self._estimate_training_times() if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]: self._check_imbalanced(y_train) tuner = MljarTuner( self._tuner_params, self._algorithms, self._ml_task, self._validation, self._seed, ) # not so random step generated_params = tuner.get_not_so_random_params(X_train, y_train) self._del_data_variables(X_train, y_train) for params in generated_params: self.train_model(params) # hill climbing for params in tuner.get_hill_climbing_params(self._models): self.train_model(params) self.ensemble_step() max_loss = 10e12 for i, m in enumerate(self._models): if m.get_final_loss() < max_loss: self._best_model = m max_loss = m.get_final_loss() self.get_additional_metrics() self._fit_time = time.time() - start_time # self._progress_bar.close() with open(os.path.join(self._results_path, "best_model.txt"), "w") as fout: fout.write(f"{self._best_model.get_name()}") with open(os.path.join(self._results_path, "params.json"), "w") as fout: params = { "ml_task": self._ml_task, "optimize_metric": self._optimize_metric, "saved": self._model_paths, } fout.write(json.dumps(params, indent=4)) ldb = self.get_leaderboard() ldb.to_csv(os.path.join(self._results_path, "leaderboard.csv"), index=False) # save report ldb["Link"] = [f"[Results link]({m}/README.md)" for m in ldb["name"].values] ldb.insert(loc=0, column="Best model", value="") ldb["Best model"][ldb.name == self._best_model.get_name()] = "*** the best ***" with open(os.path.join(self._results_path, "README.md"), "w") as 
fout: fout.write(f"# AutoML Leaderboard\n\n") fout.write(tabulate(ldb.values, ldb.columns, tablefmt="pipe")) self._load_data_variables(X_train) def predict(self, X): if self._best_model is None: return None if not isinstance(X.columns[0], str): X.columns = [str(c) for c in X.columns] input_columns = X.columns.tolist() for column in self._data_info["columns"]: if column not in input_columns: raise AutoMLException( f"Missing column: {column} in input data. Cannot predict" ) X = X[self._data_info["columns"]] predictions = self._best_model.predict(X) if self._ml_task == BINARY_CLASSIFICATION: # need to predict the label based on predictions and threshold neg_label, pos_label = ( predictions.columns[0][11:], predictions.columns[1][11:], ) if neg_label == "0" and pos_label == "1": neg_label, pos_label = 0, 1 # assume that it is binary classification predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold predictions["label"] = predictions["label"].map( {True: pos_label, False: neg_label} ) return predictions elif self._ml_task == MULTICLASS_CLASSIFICATION: return predictions else: return predictions def to_json(self): if self._best_model is None: return None return { "best_model": self._best_model.to_json(), "threshold": self._threshold, "ml_task": self._ml_task, } def from_json(self, json_data): # pretty sure that this can be easily refactored if json_data["best_model"]["algorithm_short_name"] == "Ensemble": self._best_model = Ensemble() self._best_model.from_json(json_data["best_model"]) else: self._best_model = ModelFramework(json_data["best_model"].get("params")) self._best_model.from_json(json_data["best_model"]) self._threshold = json_data.get("threshold") self._ml_task = json_data.get("ml_task")