def _check_imbalanced(self, y):
    """Reject targets that contain under-represented classes.

    Two checks are run on ``y.value_counts()``: every class needs at least
    10 samples, and every class needs to cover at least 1% of all samples.

    :param y: Target values; must provide ``value_counts()``.
    :raises AutoMLException: If any class fails one of the two checks.
    """
    # Absolute threshold: at least 10 samples of each class.
    counts = y.value_counts()
    below_minimum = counts < 10
    if np.sum(below_minimum):
        rare = counts[below_minimum]
        raise AutoMLException(
            f"There need to be at least 10 samples of each class, for class {list(rare.index)} there is {rare.values} samples"
        )

    # Relative threshold: at least 1% of all samples for each class.
    shares = y.value_counts(normalize=True) * 100.0
    below_share = shares < 1.0
    if np.sum(below_share):
        rare = shares[below_share]
        raise AutoMLException(
            f"There need to be at least 1% of samples of each class, for class {list(rare.index)} there is {rare.values} % of samples"
        )
def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):
    """Normalise the training inputs before fitting.

    ``X_train`` is wrapped in a DataFrame when needed, its column names are
    converted to strings when the first one is not a string, and its index
    is reset in place. ``y_train`` is turned into a Series named "target".
    Finally ``ExcludeRowsMissingTarget.transform`` is applied to both.

    :param X_train: Training features (DataFrame or DataFrame-convertible).
    :param y_train: Training target; a DataFrame must have a 'target' column.
    :param X_validation: Returned unchanged.
    :param y_validation: Returned unchanged.
    :return: Tuple ``(X_train, y_train, X_validation, y_validation)``.
    :raises AutoMLException: If ``y_train`` is a DataFrame without 'target'.
    """
    if not isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(X_train)

    first_column = X_train.columns[0]
    if not isinstance(first_column, str):
        X_train.columns = list(map(str, X_train.columns))

    # NOTE: the index is reset in place, so a DataFrame passed by the caller
    # is modified as well.
    X_train.reset_index(drop=True, inplace=True)

    if isinstance(y_train, pd.DataFrame):
        if "target" not in y_train.columns:
            raise AutoMLException(
                "y_train should be Numpy array, Pandas Series or DataFrame with column 'target' "
            )
        y_train = y_train["target"]
    y_train = pd.Series(np.array(y_train), name="target")

    X_train, y_train = ExcludeRowsMissingTarget.transform(
        X_train, y_train, warn=True
    )
    return X_train, y_train, X_validation, y_validation
def create_dir(self, model_path):
    """Create the directory ``model_path`` unless the path already exists.

    :param model_path: Path of the directory to create (single level,
        ``os.mkdir`` is used).
    :raises AutoMLException: If the directory cannot be created; the
        original error is attached as the exception cause.
    """
    if os.path.exists(model_path):
        return
    try:
        os.mkdir(model_path)
    except Exception as e:
        # Chain the original error so the root cause stays in the traceback.
        raise AutoMLException(
            f"Cannot create directory {model_path}. {str(e)}"
        ) from e
def train_model(self, params):
    """Train and save one model described by ``params`` if time allows.

    The model directory is ``<results_path>/<params["name"]>``. When
    ``_enough_time_to_train`` rejects the model type, nothing is trained and
    only an info message is logged.

    :param params: Model parameters; ``params["name"]`` is used as the
        directory name and the whole dict is passed to ``ModelFramework``.
    :raises AutoMLException: If the model directory cannot be created; the
        original error is attached as the exception cause.
    """
    model_path = os.path.join(self._results_path, params["name"])

    early_stop = EarlyStopping(
        {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
    )
    time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
    mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

    if not self._enough_time_to_train(mf.get_type()):
        logger.info(
            f"Cannot check more models of {mf.get_type()} because of time constraint"
        )
        return

    logger.info(
        f"Train model #{len(self._models)+1} / Model name: {params['name']}"
    )
    try:
        os.mkdir(model_path)
    except Exception as e:
        # Keep the underlying OS error reachable through the cause chain.
        raise AutoMLException(f"Cannot create directory {model_path}") from e

    mf.train()
    mf.save(model_path)
    self._model_paths += [model_path]
    self.keep_model(mf)
def load(self):
    """Restore AutoML state from the files in ``self._results_path``.

    Reads ``params.json`` (saved model paths, ML task, optimize metric),
    loads every saved model, resolves the best model from ``best_model.txt``
    and reads ``data_info.json``.

    :raises AutoMLException: If anything goes wrong while loading; the
        original error is attached as the exception cause.
    """
    logger.info("Loading AutoML models ...")
    try:
        # Context managers make sure the files are closed after reading.
        with open(os.path.join(self._results_path, "params.json")) as fin:
            params = json.load(fin)

        self._model_paths = params["saved"]
        self._ml_task = params["ml_task"]
        self._optimize_metric = params["optimize_metric"]

        models_map = {}
        for model_path in self._model_paths:
            if model_path.endswith("ensemble"):
                # The ensemble is registered by name only; it is not added
                # to self._models here.
                ens = Ensemble.load(model_path, models_map)
                models_map[ens.get_name()] = ens
            else:
                m = ModelFramework.load(model_path)
                self._models += [m]
                models_map[m.get_name()] = m

        with open(os.path.join(self._results_path, "best_model.txt"), "r") as fin:
            best_model_name = fin.read()
        self._best_model = models_map[best_model_name]

        data_info_path = os.path.join(self._results_path, "data_info.json")
        with open(data_info_path) as fin:
            self._data_info = json.load(fin)
    except Exception as e:
        raise AutoMLException(f"Cannot load AutoML directory. {str(e)}") from e
def predict(self, X):
    """
    Computes predictions from AutoML best model.

    :param X: The Pandas DataFrame with input data. The input data should have
        the same columns as data used for training, otherwise the
        `AutoMLException` will be raised.
    :return: ``None`` when there is no best model, otherwise the predictions
        of the best model. A ``label`` column is added for binary
        classification; for multiclass classification with a numeric target
        the existing ``label`` column is cast to ``int``.
    """
    if self._best_model is None:
        return None

    # NOTE: column names of the passed frame are converted in place.
    if not isinstance(X.columns[0], str):
        X.columns = list(map(str, X.columns))

    available_columns = X.columns.tolist()
    for required in self._data_info["columns"]:
        if required not in available_columns:
            raise AutoMLException(
                f"Missing column: {required} in input data. Cannot predict")
    X = X[self._data_info["columns"]]

    if not self._best_model._is_stacked:
        predictions = self._best_model.predict(X)
    else:
        self.stack_models()
        X_stacked = self.get_stacked_data(X, mode="predict")
        if self._best_model.get_type() == "Ensemble":
            # Ensemble is using both original and stacked data
            predictions = self._best_model.predict(X, X_stacked)
        else:
            predictions = self._best_model.predict(X_stacked)

    if self._ml_task == BINARY_CLASSIFICATION:
        # Label names are the column names from index 11 on (presumably the
        # part after a "prediction_" prefix - confirm with the model output).
        neg_label = predictions.columns[0][11:]
        pos_label = predictions.columns[1][11:]
        if (neg_label, pos_label) == ("0", "1"):
            neg_label, pos_label = 0, 1
        if self._data_info.get("target_is_numeric", False):
            neg_label = int(neg_label)
            pos_label = int(pos_label)
        # The second column is compared against the model threshold.
        above_threshold = predictions.iloc[:, 1] > self._best_model._threshold
        predictions["label"] = above_threshold.map(
            {True: pos_label, False: neg_label})
    elif self._ml_task == MULTICLASS_CLASSIFICATION:
        if self._data_info.get("target_is_numeric", False):
            predictions["label"] = predictions["label"].astype(int)

    return predictions
def get_split(self, k, repeat=0):
    """Build train/validation data for fold ``k`` from stored index files.

    Indices are read from ``<results_path>/folds/fold_<k>_train_indices.npy``
    and ``fold_<k>_validation_indices.npy``; features, target and (when a
    sample weight path is set) sample weights are loaded with ``load_data``.

    :param k: Fold number used in the index file names.
    :param repeat: Not used by this implementation.
    :return: Tuple ``(train_data, validation_data)`` of dicts with keys
        ``"X"``, ``"y"`` and optionally ``"sample_weight"``.
    :raises AutoMLException: On any error; the traceback is printed first.
    """
    try:
        folds_dir = os.path.join(self._results_path, "folds")
        train_path = os.path.join(folds_dir, f"fold_{k}_train_indices.npy")
        validation_path = os.path.join(
            folds_dir, f"fold_{k}_validation_indices.npy")

        train_index = np.load(train_path)
        validation_index = np.load(validation_path)

        X = load_data(self._X_path)
        y = load_data(self._y_path)["target"]

        sample_weight = None
        if self._sample_weight_path is not None:
            weights_frame = load_data(self._sample_weight_path)
            sample_weight = weights_frame["sample_weight"]

        train_data = dict(X=X.iloc[train_index], y=y.iloc[train_index])
        validation_data = dict(
            X=X.iloc[validation_index], y=y.iloc[validation_index])

        if sample_weight is not None:
            train_data["sample_weight"] = sample_weight.iloc[train_index]
            validation_data["sample_weight"] = sample_weight.iloc[validation_index]
    except Exception as e:
        import traceback

        print(traceback.format_exc())
        raise AutoMLException("Problem with custom validation. " + str(e))

    return (train_data, validation_data)
def _check_is_fitted(self):
    """Ensure the model is fitted before it is used.

    ``_check_can_load`` is called first; afterwards ``_fit_level`` must be
    equal to "finished".

    :raises AutoMLException: If ``_fit_level`` is not "finished".
    """
    # The load check runs before the fit level is inspected.
    self._check_can_load()

    if self._fit_level == "finished":
        return
    raise AutoMLException(
        "This model has not been fitted yet. Please call `fit()` first."
    )
def _base_predict(self, X):
    """Compute predictions of the best model for ``X``.

    The input is converted with ``_build_dataframe``, restricted to the
    columns listed in ``self._data_info["columns"]`` and validated with
    ``_validate_X_predict``. For a stacked best model the stacked data is
    built first. For binary classification a ``label`` column is derived by
    comparing the second prediction column with the model threshold; for
    multiclass classification with a numeric target the existing ``label``
    column is cast to ``np.int32``.

    :param X: Input data accepted by ``_build_dataframe``.
    :return: Predictions of the best model, with ``label`` added or cast as
        described above.
    :raises AutoMLException: If a training column is missing from ``X``.
    """
    self._check_is_fitted()

    X = self._build_dataframe(X)
    if not isinstance(X.columns[0], str):
        X.columns = [str(c) for c in X.columns]

    input_columns = X.columns.tolist()
    for column in self._data_info["columns"]:
        if column not in input_columns:
            raise AutoMLException(
                f"Missing column: {column} in input data. Cannot predict")
    X = X[self._data_info["columns"]]
    self._validate_X_predict(X)

    # is stacked model
    if self._best_model._is_stacked:
        self._perform_model_stacking()
        X_stacked = self.get_stacked_data(X, mode="predict")
        if self._best_model.get_type() == "Ensemble":
            # Ensemble is using both original and stacked data
            predictions = self._best_model.predict(X, X_stacked)
        else:
            predictions = self._best_model.predict(X_stacked)
    else:
        predictions = self._best_model.predict(X)

    if self._ml_task == BINARY_CLASSIFICATION:
        # Label names are the column names from index 11 on (presumably the
        # part after a "prediction_" prefix - confirm with the model output).
        neg_label, pos_label = (
            predictions.columns[0][11:],
            predictions.columns[1][11:],
        )
        if neg_label == "0" and pos_label == "1":
            neg_label, pos_label = 0, 1
        if self._data_info.get("target_is_numeric", False):
            # Both labels are parsed from their names. The previous bare
            # `int()` call always produced 0 for the negative label.
            neg_label = int(neg_label)
            pos_label = int(pos_label)
        # The second column is compared against the model threshold.
        predictions["label"] = predictions.iloc[:, 1] > self._best_model._threshold
        predictions["label"] = predictions["label"].map(
            {True: pos_label, False: neg_label})
        return predictions

    if self._ml_task == MULTICLASS_CLASSIFICATION:
        if self._data_info.get("target_is_numeric", False):
            predictions["label"] = predictions["label"].astype(np.int32)
        return predictions

    # Regression
    return predictions
def get_algorithm(cls, params):
    """Instantiate the algorithm selected by ``params``.

    The class is looked up in ``AlgorithmsRegistry`` using
    ``params["ml_task"]`` (default ``BINARY_CLASSIFICATION``) and
    ``params["model_type"]`` (default "Xgboost") and is constructed with
    ``params``.

    :param params: Algorithm parameters.
    :return: The constructed algorithm instance.
    :raises AutoMLException: If the lookup or the construction fails.
    """
    alg_type = params.get("model_type", "Xgboost")
    ml_task = params.get("ml_task", BINARY_CLASSIFICATION)

    try:
        algorithm_class = AlgorithmsRegistry.get_algorithm_class(ml_task, alg_type)
        instance = algorithm_class(params)
    except Exception as e:
        raise AutoMLException(f"Cannot get algorithm class. {str(e)}")
    else:
        return instance
def _predict_all(self, X):
    """Return the full output of ``_base_predict`` for classification tasks.

    :param X: Input data passed on to ``_base_predict``.
    :return: Whatever ``_base_predict(X)`` returns.
    :raises AutoMLException: If the current task is ``REGRESSION``.
    """
    # Only classification tasks are supported here.
    if self._ml_task != REGRESSION:
        return self._base_predict(X)

    raise AutoMLException(
        f"Method `predict_all()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
    )
def train_model(self, params):
    """Train and save one model described by ``params`` if time allows.

    The model gets early stopping plus learner-level and total time
    constraints as callbacks. After a successful training the model is kept
    and the best model is selected and saved again.

    :param params: Model parameters; uses ``params["name"]``,
        ``params["learner"]["model_type"]`` and ``params["additional"]``.
    :raises AutoMLException: If the model directory cannot be created; the
        original error is attached as the exception cause.
    """
    model_path = os.path.join(self._results_path, params["name"])

    early_stop = EarlyStopping(
        {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
    )
    learner_time_constraint = LearnerTimeConstraint(
        {
            "learner_time_limit": self._get_learner_time_limit(
                params["learner"]["model_type"]
            ),
            "min_steps": params["additional"].get("min_steps"),
        }
    )
    # The total limit is only passed on when no per-model limit is set.
    total_time_limit = (
        self._total_time_limit if self._model_time_limit is None else None
    )
    total_time_constraint = TotalTimeConstraint(
        {
            "total_time_limit": total_time_limit,
            "total_time_start": self._start_time,
        }
    )

    mf = ModelFramework(
        params,
        callbacks=[early_stop, learner_time_constraint, total_time_constraint],
    )

    if not self._enough_time_to_train(mf.get_type()):
        logger.info(f"Cannot train {mf.get_type()} because of time constraint")
        return

    logger.info(
        f"Train model #{len(self._models)+1} / Model name: {params['name']}"
    )
    try:
        os.mkdir(model_path)
    except Exception as e:
        # Keep the underlying OS error reachable through the cause chain.
        raise AutoMLException(f"Cannot create directory {model_path}") from e

    mf.train(model_path)
    mf.save(model_path)
    self._model_paths += [model_path]
    self.keep_model(mf)

    # save the best one in the case the training will be interrupted
    self.select_and_save_best()
def _score(self, X, y=None):
    """Score the predictions for ``X`` against ``y``.

    :param X: Input data passed to ``_predict``.
    :param y: True target values. The default is ``None`` only for
        scikit-learn compatibility; a value is required.
    :return: ``r2_score`` for regression tasks, ``accuracy_score`` otherwise.
    :raises AutoMLException: If ``y`` is ``None``.
    """
    if y is None:
        raise AutoMLException("y must be specified.")

    predictions = self._predict(X)
    if self._ml_task == REGRESSION:
        return r2_score(y, predictions)
    return accuracy_score(y, predictions)
def _predict_proba(self, X):
    """Return the prediction columns without ``label`` as a NumPy array.

    :param X: Input data passed on to ``_base_predict``.
    :return: ``_base_predict(X)`` with the ``label`` column dropped,
        converted with ``to_numpy()``.
    :raises AutoMLException: If the current task is ``REGRESSION``.
    """
    if self._ml_task == REGRESSION:
        raise AutoMLException(
            f"Method `predict_proba()` can only be used when in classification tasks. Current task: '{self._ml_task}'."
        )

    # For classification tasks the predicted class is in column 'label';
    # it is removed so that only the remaining columns are returned.
    predictions = self._base_predict(X)
    without_label = predictions.drop(["label"], axis=1)
    return without_label.to_numpy()
def __init__(
    self,
    results_path,
    ml_task,
    eval_metric,
    time_budget=3600,
    init_params=None,
    verbose=True,
    n_jobs=-1,
    random_state=42,
):
    """Set up the tuner and its study directory ``<results_path>/optuna``.

    :param results_path: Directory in which the "optuna" folder is created.
    :param ml_task: ML task identifier, stored as ``self.ml_task``.
    :param eval_metric: Metric object; ``eval_metric.name`` must be one of
        the supported names.
    :param time_budget: Stored as ``self.time_budget``.
    :param init_params: Initial tuning parameters, stored (not copied) as
        ``self.tuning``. ``None`` means a fresh empty dict.
    :param verbose: When false, optuna logging is set to CRITICAL.
    :param n_jobs: Stored as ``self.n_jobs``.
    :param random_state: Stored as ``self.random_state``.
    :raises AutoMLException: If the metric name is not supported.
    """
    supported_metrics = (
        "auc",
        "logloss",
        "rmse",
        "mse",
        "mae",
        "mape",
        "r2",
        "spearman",
        "pearson",
        "f1",
        "average_precision",
        "accuracy",
        "user_defined_metric",
    )
    if eval_metric.name not in supported_metrics:
        raise AutoMLException(f"Metric {eval_metric.name} is not supported")

    self.study_dir = os.path.join(results_path, "optuna")
    if not os.path.exists(self.study_dir):
        try:
            os.mkdir(self.study_dir)
        except Exception as e:
            # Best effort: a failure here is reported but not fatal.
            print("Problem while creating directory for optuna studies.", str(e))
    self.tuning_fname = os.path.join(self.study_dir, "optuna.json")

    # A None sentinel replaces the former `{}` default, which was one dict
    # object shared by every instance created without `init_params`.
    self.tuning = {} if init_params is None else init_params
    self.eval_metric = eval_metric
    self.direction = (
        "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize"
    )
    # set large enough to give small learning rates a chance
    self.n_warmup_steps = 500
    self.time_budget = time_budget
    self.verbose = verbose
    self.ml_task = ml_task
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.cat_features_indices = []

    self.load()
    if not self.verbose:
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)
def _set_results_dir(self):
    """Pick, load or create the directory for AutoML results.

    When ``self._results_path`` is ``None`` the first non-existing name of
    ``AutoML_1`` .. ``AutoML_100`` is chosen. An existing directory is
    loaded with ``self.load()``; otherwise the directory is created.

    :raises AutoMLException: If no free ``AutoML_<i>`` name is found or the
        directory cannot be created (the original error is attached as the
        exception cause).
    """
    if self._results_path is None:
        for i in range(1, 101):
            self._results_path = f"AutoML_{i}"
            if not os.path.exists(self._results_path):
                break
        else:
            # The loop finished without finding a free name.
            raise AutoMLException("Cannot create directory for AutoML results")

    if os.path.exists(self._results_path):
        print(f"Directory {self._results_path} already exists")
        self.load()
        return

    # At this point the path is always set, so no extra None check is needed.
    print(f"Create directory {self._results_path}")
    try:
        os.mkdir(self._results_path)
    except Exception as e:
        raise AutoMLException(
            f"Cannot create directory {self._results_path}"
        ) from e
def on_learner_train_end(self, logs):
    """Stop the training when the total time limit cannot be kept.

    Nothing is checked when ``self.total_time_limit`` is ``None``.

    After the first learner (and only when more than one learner is
    expected) the time of the remaining learners is estimated from the time
    of the first one. For every learner except the last one, the training is
    also stopped when the elapsed time exceeds the limit by more than 600
    seconds.

    :param logs: Not used by this implementation.
    :raises AutoMLException: If the estimate or the elapsed time exceeds
        the limit as described above.
    """
    if self.total_time_limit is None:
        return

    finished_learners = len(self.learners)

    # Just check for the first learner, and only if there is more than one
    # learner expected - otherwise this is already the end of the training.
    if finished_learners == 1 and self.expected_learners_cnt > 1:
        one_fold_time = time.time() - self.train_start_time
        total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

        # Add time for the rest of learners (assuming that the training
        # time is the same for all folds).
        estimate_elapsed_time = total_elapsed_time + one_fold_time * (
            self.expected_learners_cnt - 1
        )
        if estimate_elapsed_time >= self.total_time_limit:
            raise AutoMLException(
                "Stop training after the first fold. "
                f"Time needed to train on the first fold {np.round(one_fold_time)} seconds. "
                "The time estimate for training on all folds is larger than total_time_limit."
            )

    # Don't stop for the last learner, we are finishing anyway.
    if finished_learners < self.expected_learners_cnt:
        total_elapsed_time = np.round(time.time() - self.total_time_start, 2)

        # 10 minutes of margin are added because of unexpected time changes;
        # if every fold takes the same time, the check above already stops
        # the training after the first fold.
        if total_elapsed_time > self.total_time_limit + 600:
            raise AutoMLException(
                "Force to stop the training. "
                "Total time for AutoML training already exceeded."
            )
def _get_results_path(self):
    """Gets the current results_path"""
    # A path that was resolved earlier is returned right away.
    if self._results_path is not None:
        return self._results_path

    self._validate_results_path()
    path = self.results_path

    if path is None:
        # No path requested: take the first free "AutoML_<i>" name.
        for i in range(1, 10001):
            candidate = f"AutoML_{i}"
            if os.path.exists(candidate):
                continue
            self.create_dir(candidate)
            self._results_path = candidate
            return candidate
        # If it got here, could not create, raise exception
        raise AutoMLException("Cannot create directory for AutoML results")

    # Directory with params.json: AutoML already saved there, return path
    if os.path.exists(path) and os.path.exists(os.path.join(path, "params.json")):
        self._results_path = path
        return path

    # Dir does not exist, create it
    if not os.path.exists(path):
        self.create_dir(path)
        self._results_path = path
        return path

    # Dir exists and is empty, use it
    if os.path.exists(path) and not os.listdir(path):
        self._results_path = path
        return path

    if os.path.exists(path) and os.listdir(path):
        raise AutoMLException(
            f"Cannot set directory for AutoML. Directory '{path}' is not empty."
        )

    raise AutoMLException("Cannot set directory for AutoML results")
def _set_metric(self):
    """Set and validate the metric to be optimized.

    Without a user choice the default is "logloss" for both classification
    tasks and "rmse" for regression. A user choice must be in the list of
    metrics allowed for the current task.

    :raises AutoMLException: If the user metric is not allowed for the task.
    """
    # (task, default metric, allowed user metrics)
    rules = (
        (BINARY_CLASSIFICATION, "logloss", ["logloss", "auc"]),
        (MULTICLASS_CLASSIFICATION, "logloss", ["logloss"]),
        (REGRESSION, "rmse", ["rmse"]),
    )

    for task, default_metric, allowed_metrics in rules:
        if self._ml_task == task:
            if self._user_set_optimize_metric is None:
                self._optimize_metric = default_metric
            elif self._user_set_optimize_metric in allowed_metrics:
                self._optimize_metric = self._user_set_optimize_metric
            else:
                raise AutoMLException(
                    "Metric {} is not allowed in ML task: {}".format(
                        self._user_set_optimize_metric, self._ml_task
                    )
                )
            break

    logger.info(
        "AutoML will optimize for metric: {0}".format(self._optimize_metric)
    )
    print(f"AutoML will optimize for metric: {self._optimize_metric}")
def ensemble_step(self):
    """Fit and save an ensemble of the trained models when enabled.

    Nothing happens when ``self._train_ensemble`` is false. Otherwise the
    ensemble is fitted on the out-of-fold matrix of ``self._models``, kept
    via ``keep_model`` and saved in ``<results_path>/ensemble``.

    :raises AutoMLException: If the ensemble directory cannot be created;
        the original error is attached as the exception cause.
    """
    if not self._train_ensemble:
        return

    self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
    oofs, target = self.ensemble.get_oof_matrix(self._models)
    self.ensemble.fit(oofs, target)
    self.keep_model(self.ensemble)

    ensemble_path = os.path.join(self._results_path, "ensemble")
    try:
        os.mkdir(ensemble_path)
    except Exception as e:
        # Keep the underlying OS error reachable through the cause chain.
        raise AutoMLException(f"Cannot create directory {ensemble_path}") from e

    self.ensemble.save(ensemble_path)
    self._model_paths += [ensemble_path]
def __init__(self, params):
    """Create the validator selected by ``params["validation_type"]``.

    Supported values are "kfold" (the default) and "split".

    :param params: Validation parameters; passed on to the validator.
    :raises AutoMLException: If the validation type is not implemented.
    """
    # kfold is default validation technique
    self.validation_type = params.get("validation_type", "kfold")

    if self.validation_type == "kfold":
        self.validator = KFoldValidator(params)
    elif self.validation_type == "split":
        self.validator = SplitValidator(params)
    else:
        raise AutoMLException(
            f"The validation type ({self.validation_type}) is not implemented."
        )
def __init__(self, params):
    """Read the split validation settings from ``params``.

    Defaults: ``train_ratio=0.8``, ``shuffle=True``, ``stratify=False``,
    ``random_seed=1234``. ``results_path``, ``X_path`` and ``y_path`` are
    read without defaults.

    :param params: Validator parameters, passed to ``BaseValidator``.
    :raises AutoMLException: If ``X_path`` or ``y_path`` is missing.
    """
    BaseValidator.__init__(self, params)

    settings = self.params
    self.train_ratio = settings.get("train_ratio", 0.8)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1234)
    log.debug("SplitValidator, train_ratio: {0}".format(self.train_ratio))

    self._results_path = settings.get("results_path")
    self._X_path = settings.get("X_path")
    self._y_path = settings.get("y_path")

    # Both data paths are required.
    if None in (self._X_path, self._y_path):
        raise AutoMLException("No data path set in SplitValidator params")
def transform(self, X):
    """Add the k-means based features to ``X`` and return it.

    The input columns are scaled with ``self._scale``. The output of
    ``self._kmeans.transform`` is written to all new feature columns except
    the last one, and the output of ``self._kmeans.predict`` to the last
    one. ``X`` is modified in place.

    :param X: Data containing ``self._input_columns``.
    :return: The same ``X`` object with the new columns set.
    :raises AutoMLException: If ``self._kmeans`` is ``None``.
    """
    if self._kmeans is None:
        raise AutoMLException("KMeans not fitted")

    # scale
    scaled_inputs = self._scale.transform(X[self._input_columns])

    # kmeans
    kmeans = self._kmeans
    cluster_distances = kmeans.transform(scaled_inputs)
    cluster_ids = kmeans.predict(scaled_inputs)

    feature_names = self._new_features
    X[feature_names[:-1]] = cluster_distances
    X[feature_names[-1]] = cluster_ids
    return X
def ensemble_step(self):
    """Fit and save an ensemble when enabled and more than one model exists.

    Nothing happens unless ``self._train_ensemble`` is true and
    ``self._models`` holds more than one model. Otherwise the ensemble is
    fitted on the out-of-fold matrix, kept via ``keep_model``, saved in
    ``<results_path>/ensemble`` and the best model is selected and saved.

    :raises AutoMLException: If the ensemble directory cannot be created;
        the original error is attached as the exception cause.
    """
    if not (self._train_ensemble and len(self._models) > 1):
        return

    self.ensemble = Ensemble(self._optimize_metric, self._ml_task)
    oofs, target = self.ensemble.get_oof_matrix(self._models)
    self.ensemble.fit(oofs, target)
    self.keep_model(self.ensemble)

    ensemble_path = os.path.join(self._results_path, "ensemble")
    try:
        os.mkdir(ensemble_path)
    except Exception as e:
        # Keep the underlying OS error reachable through the cause chain.
        raise AutoMLException(f"Cannot create directory {ensemble_path}") from e

    self.ensemble.save(ensemble_path)
    self._model_paths += [ensemble_path]

    # save the best one in the case the training will be interrupted
    self.select_and_save_best()
def __init__(
    self,
    results_path,
    ml_task,
    eval_metric,
    time_budget=3600,
    init_params=None,
    verbose=True,
    n_jobs=-1,
    random_state=42,
):
    """Set up the tuner and its study directory ``<results_path>/optuna``.

    When ``<results_path>/data_info.json`` exists, the positions of the
    columns whose info contains "categorical" are collected in
    ``self.cat_features_indices``.

    :param results_path: Directory with "data_info.json" in which the
        "optuna" folder is created.
    :param ml_task: ML task identifier, stored as ``self.ml_task``.
    :param eval_metric: Metric object; ``eval_metric.name`` must be one of
        "auc", "logloss", "rmse", "mae", "mape".
    :param time_budget: Stored as ``self.time_budget``.
    :param init_params: Initial tuning parameters, stored (not copied) as
        ``self.tuning``. ``None`` means a fresh empty dict.
    :param verbose: When false, optuna logging is set to CRITICAL.
    :param n_jobs: Stored as ``self.n_jobs``.
    :param random_state: Stored as ``self.random_state``.
    :raises AutoMLException: If the metric name is not supported.
    """
    if eval_metric.name not in ("auc", "logloss", "rmse", "mae", "mape"):
        raise AutoMLException(f"Metric {eval_metric.name} is not supported")

    self.study_dir = os.path.join(results_path, "optuna")
    if not os.path.exists(self.study_dir):
        try:
            os.mkdir(self.study_dir)
        except Exception as e:
            # Best effort: a failure here is reported but not fatal.
            print("Problem while creating directory for optuna studies.", str(e))
    self.tuning_fname = os.path.join(self.study_dir, "optuna.json")

    # A None sentinel replaces the former `{}` default, which was one dict
    # object shared by every instance created without `init_params`.
    self.tuning = {} if init_params is None else init_params
    self.eval_metric = eval_metric
    self.direction = (
        "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize"
    )
    # set large enough to give small learning rates a chance
    self.n_warmup_steps = 500
    self.time_budget = time_budget
    self.verbose = verbose
    self.ml_task = ml_task
    self.n_jobs = n_jobs
    self.random_state = random_state

    self.cat_features_indices = []
    data_info_fname = os.path.join(results_path, "data_info.json")
    if os.path.exists(data_info_fname):
        # The file is closed again as soon as it has been parsed.
        with open(data_info_fname) as fin:
            data_info = json.load(fin)
        self.cat_features_indices += [
            i
            for i, column_info in enumerate(data_info["columns_info"].values())
            if "categorical" in column_info
        ]

    self.load()
    if not self.verbose:
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)
def _set_algorithms(self):
    """Set and validate available algorithms.

    If algorithms are not set, all algorithms registered for the current ML
    task are used. Then every algorithm is validated against the registry.

    :raises AutoMLException: If an algorithm is not registered for the task.
    """

    def allowed_algorithms():
        # Names registered for the current ML task.
        return list(AlgorithmsRegistry.registry[self._ml_task].keys())

    if len(self._algorithms) == 0:
        self._algorithms = allowed_algorithms()

    for algorithm in self._algorithms:
        if algorithm in allowed_algorithms():
            continue
        raise AutoMLException(
            "The algorithm {} is not allowed to use for ML task: {}. Allowed algorithms: {}".format(
                algorithm, self._ml_task, allowed_algorithms()
            )
        )

    logger.info("AutoML will use algorithms: {}".format(self._algorithms))
    print(f"AutoML will use algorithms: {self._algorithms}")
def _initial_prep(self, X_train, y_train, X_validation=None, y_validation=None):
    """Normalise the training inputs before fitting.

    ``X_train`` is wrapped in a DataFrame when needed, its column names are
    converted to strings when the first one is not a string, and its index
    is reset in place. ``y_train`` is wrapped in a DataFrame with a single
    "target" column unless it already is a DataFrame.

    :param X_train: Training features (DataFrame or DataFrame-convertible).
    :param y_train: Training target; a DataFrame must have a 'target' column.
    :param X_validation: Returned unchanged.
    :param y_validation: Returned unchanged.
    :return: Tuple ``(X_train, y_train["target"], X_validation,
        y_validation)``.
    :raises AutoMLException: If ``y_train`` is a DataFrame without 'target'.
    """
    if not isinstance(X_train, pd.DataFrame):
        X_train = pd.DataFrame(X_train)
    if not isinstance(X_train.columns[0], str):
        X_train.columns = list(map(str, X_train.columns))
    X_train.reset_index(drop=True, inplace=True)

    if isinstance(y_train, pd.DataFrame):
        if "target" not in y_train.columns:
            raise AutoMLException("There should be target column in y_train")
    else:
        y_train = pd.DataFrame({"target": np.array(y_train)})
    # NOTE: for a DataFrame passed by the caller this resets its index in place.
    y_train.reset_index(drop=True, inplace=True)

    target = y_train["target"]
    return X_train, target, X_validation, y_validation
def load(self, path):
    """Restore AutoML state from the files in directory ``path``.

    Reads ``params.json`` (saved model paths, ML task, eval metric, optional
    stacked model names), loads every saved model, resolves the best model
    from ``best_model.txt``, reads ``data_info.json`` and marks the fit
    level as "finished".

    :param path: Directory with the saved AutoML files.
    :raises AutoMLException: If anything goes wrong while loading; the
        original error is attached as the exception cause.
    """
    logger.info("Loading AutoML models ...")
    try:
        # Context managers make sure the files are closed after reading.
        with open(os.path.join(path, "params.json")) as fin:
            params = json.load(fin)

        self._model_paths = params["saved"]
        self._ml_task = params["ml_task"]
        self._eval_metric = params["eval_metric"]
        stacked_models = params.get("stacked")

        models_map = {}
        for model_path in self._model_paths:
            if model_path.endswith(("Ensemble", "Ensemble_Stacked")):
                model = Ensemble.load(model_path, models_map)
            else:
                model = ModelFramework.load(model_path)
            self._models += [model]
            models_map[model.get_name()] = model

        if stacked_models is not None:
            self._stacked_models = [
                models_map[stacked_model_name]
                for stacked_model_name in stacked_models
            ]

        with open(os.path.join(path, "best_model.txt"), "r") as fin:
            best_model_name = fin.read()
        self._best_model = models_map[best_model_name]

        data_info_path = os.path.join(path, "data_info.json")
        with open(data_info_path) as fin:
            self._data_info = json.load(fin)
        self.n_features_in_ = self._data_info["n_features"]

        if "n_classes" in self._data_info:
            self.n_classes = self._data_info["n_classes"]

        self._fit_level = "finished"
    except Exception as e:
        raise AutoMLException(f"Cannot load AutoML directory. {str(e)}") from e
def __init__(self, params):
    """Read the split validation settings from ``params``.

    Defaults: ``train_ratio=0.8``, ``shuffle=True``, ``stratify=False``,
    ``random_seed=1234``, ``repeats=1``. Repeats are reset to 1 (with a
    warning) when shuffling is disabled. ``results_path``, ``X_path`` and
    ``y_path`` are read without defaults.

    :param params: Validator parameters, passed to ``BaseValidator``.
    :raises AutoMLException: If ``X_path`` or ``y_path`` is missing.
    """
    BaseValidator.__init__(self, params)

    settings = self.params
    self.train_ratio = settings.get("train_ratio", 0.8)
    self.shuffle = settings.get("shuffle", True)
    self.stratify = settings.get("stratify", False)
    self.random_seed = settings.get("random_seed", 1234)
    self.repeats = settings.get("repeats", 1)

    if not self.shuffle and self.repeats > 1:
        warnings.warn(
            "Disable repeats in validation because shuffle is disabled")
        self.repeats = 1

    self._results_path = settings.get("results_path")
    self._X_path = settings.get("X_path")
    self._y_path = settings.get("y_path")

    # Both data paths are required.
    if None in (self._X_path, self._y_path):
        raise AutoMLException("No data path set in SplitValidator params")
def predict(self, X):
    """Compute predictions from the best model.

    :param X: DataFrame with input data; it must contain every column listed
        in ``self._data_info["columns"]``.
    :return: ``None`` when there is no best model, otherwise the predictions
        of the best model. For binary classification a ``label`` column is
        added.
    :raises AutoMLException: If a required column is missing in ``X``.
    """
    if self._best_model is None:
        return None

    # NOTE: column names of the passed frame are converted in place.
    if not isinstance(X.columns[0], str):
        X.columns = list(map(str, X.columns))

    available_columns = X.columns.tolist()
    for required in self._data_info["columns"]:
        if required not in available_columns:
            raise AutoMLException(
                f"Missing column: {required} in input data. Cannot predict"
            )

    X = X[self._data_info["columns"]]
    predictions = self._best_model.predict(X)

    if self._ml_task == BINARY_CLASSIFICATION:
        # Label names are the column names from index 11 on (presumably the
        # part after a "prediction_" prefix - confirm with the model output).
        neg_label = predictions.columns[0][11:]
        pos_label = predictions.columns[1][11:]
        if (neg_label, pos_label) == ("0", "1"):
            neg_label, pos_label = 0, 1
        # The second column is compared against the model threshold.
        above_threshold = predictions.iloc[:, 1] > self._best_model._threshold
        predictions["label"] = above_threshold.map(
            {True: pos_label, False: neg_label}
        )

    # Multiclass and regression predictions are returned unchanged.
    return predictions