def __init__(
    self,
    optimize_metric="logloss",
    ml_task=BINARY_CLASSIFICATION,
    is_stacked=False,
    max_single_prediction_time=None,
):
    """Create an empty ensemble optimized for `optimize_metric`.

    Args:
        optimize_metric: Name of the metric used to select models.
        ml_task: Machine-learning task type.
        is_stacked: Whether the ensemble is built on stacked predictions.
        max_single_prediction_time: Optional limit on prediction time
            for a single sample.
    """
    self.library_version = "0.1"
    self.uid = str(uuid.uuid4())

    self.metric = Metric({"name": optimize_metric})
    # Start from the metric's worst possible value; any selected model
    # improves on it. This is the best loss obtained by the ensemble.
    self.best_loss = self.metric.get_maximum()

    self.models_map = None
    self.selected_models = []
    self.train_time = None
    self.total_best_sum = None  # total sum of predictions, the oof of ensemble
    self.target = None
    self.target_columns = None
    self.sample_weight = None

    self._ml_task = ml_task
    self._optimize_metric = optimize_metric
    self._is_stacked = is_stacked
    self._additional_metrics = None
    self._threshold = None
    if is_stacked:
        self._name = "Ensemble_Stacked"
    else:
        self._name = "Ensemble"
    self._scores = []

    self.oof_predictions = None
    self._oof_predictions_fname = None
    self._single_prediction_time = None  # prediction time on single sample
    self._max_single_prediction_time = max_single_prediction_time
    self.model_prediction_time = {}
def test_metric_improvement(self):
    """Logloss of a perfect prediction must improve on an imperfect one."""
    m = Metric({"name": "logloss"})
    score_imperfect = m(np.array([0, 0, 1, 1]), np.array([0, 0, 0, 1]))
    score_perfect = m(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))
    # improvement(previous, current) is True when `current` is better
    self.assertTrue(m.improvement(score_imperfect, score_perfect))
def __init__(self, params):
    """Configure the logging metric, iteration budget and predict function."""
    super(SklearnTreesEnsembleClassifierAlgorithm, self).__init__(params)
    eval_metric_name = self.params.get("eval_metric_name", "logloss")
    self.log_metric = Metric({"name": eval_metric_name})
    # max iters is used by model_framework, max_steps is used internally
    self.max_iters = 1
    # Binary classification uses the binary proba function; anything else
    # falls through to the multiclass variant.
    if params.get("ml_task") == BINARY_CLASSIFICATION:
        self.predict_function = predict_proba_function_binary
    else:
        self.predict_function = predict_proba_function_multiclass
def test_fit_predict(self):
    """Fitted LightGBM should reach a logloss below 0.7 on training data."""
    metric = Metric({"name": "logloss"})
    model = LightgbmAlgorithm(self.params)
    model.fit(self.X, self.y)
    predictions = model.predict(self.X)
    self.assertTrue(metric(self.y, predictions) < 0.7)
def test_mape_metric(self):
    """MAPE of a perfect prediction is exactly zero."""
    m = Metric({"name": "mape"})
    y = np.array([0, 0, 1, 1])
    self.assertEqual(m(y, y.copy()), 0.0)
def __init__(self, params):
    """Initialize with a fixed logloss logging metric and one outer iteration."""
    super(SklearnTreesEnsembleClassifierAlgorithm, self).__init__(params)
    self.log_metric = Metric({"name": "logloss"})
    # max iters is used by model_framework, max_steps is used internally
    self.max_iters = 1
    self.predict_function = predict_proba_function
def __init__(self, params):
    """Create a logger that tracks a set of metrics during training.

    Args:
        params: dict with an optional "name" and a "metric_names" list.
            A missing "metric_names" entry now means no metrics are
            tracked (the original code raised TypeError iterating None).
    """
    super(MetricLogger, self).__init__(params)
    self.name = params.get("name", "metric_logger")
    self.loss_values = {}
    # Fix: default to an empty list so a missing key does not crash.
    self.metrics = [
        Metric({"name": metric_name})
        for metric_name in params.get("metric_names", [])
    ]
def test_fit_predict(self):
    """MLP regressor should fit the training data to an MSE below 1."""
    mse = Metric({"name": "mse"})
    model = MLPRegressorAlgorithm(self.params)
    model.fit(self.X, self.y)
    self.assertLess(mse(self.y, model.predict(self.X)), 1)
def __call__(self, trial):
    """Evaluate one Optuna trial for a KNN model.

    Returns the validation score (negated for minimized metrics so Optuna
    can maximize), or None when training fails; pruning is re-raised.
    """
    try:
        params = {
            "n_neighbors": trial.suggest_int("n_neighbors", 1, 128),
            "weights": trial.suggest_categorical(
                "weights", ["uniform", "distance"]
            ),
            "n_jobs": self.n_jobs,
            "rows_limit": 100000,
            "ml_task": self.ml_task,
        }
        # Pick the regressor or the classifier depending on the task.
        if self.ml_task == REGRESSION:
            Algorithm = KNeighborsRegressorAlgorithm
        else:
            Algorithm = KNeighborsAlgorithm
        model = Algorithm(params)
        model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)

        predictions = model.predict(self.X_validation)
        score = self.eval_metric(self.y_validation, predictions)
        # Minimized metrics are negated so the study always maximizes.
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in KNNObjective", str(e))
        return None
    return score
def test_fit_predict(self):
    """MLP classifier should fit the training data to a logloss below 2."""
    logloss = Metric({"name": "logloss"})
    model = MLPAlgorithm(self.params)
    model.fit(self.X, self.y)
    self.assertLess(logloss(self.y, model.predict_proba(self.X)), 2)
def test_copy(self):
    """Copied model predicts identically and is unaffected by further fits."""
    metric = Metric({"name": "logloss"})

    # train model #1
    model = NeuralNetworkAlgorithm(self.params)
    model.fit(self.X, self.y)
    loss = metric(self.y, model.predict(self.X))

    # model #2 is not initialized in the constructor
    clone = NeuralNetworkAlgorithm(self.params)
    self.assertTrue(clone.model is None)

    # copy model #1 and use the copy for predictions
    clone = model.copy()
    self.assertEqual(type(model), type(clone))
    loss_clone = metric(self.y, clone.predict(self.X))
    self.assertEqual(loss, loss_clone)

    # refitting model #1 should improve its loss ...
    model.fit(self.X, self.y)
    loss_refit = metric(self.y, model.predict(self.X))
    self.assertTrue(loss_refit < loss)

    # ... while the copy's loss stays unchanged
    loss_clone_after = metric(self.y, clone.predict(self.X))
    assert_almost_equal(loss_clone, loss_clone_after)
def test_r2_metric(self):
    """R2 is negated internally (for minimization), so a perfect fit is -1."""
    m = Metric({"name": "r2"})
    y = np.array([0, 0, 1, 1])
    self.assertEqual(m(y, np.array([0, 0, 1, 1])), -1.0)
def test_copy(self):
    """Copied CatBoost model predicts identically and is isolated from refits."""
    metric = Metric({"name": "logloss"})

    # train model #1
    model = CatBoostAlgorithm(self.params)
    model.fit(self.X, self.y)
    loss = metric(self.y, model.predict(self.X))

    # model #2 is initialized in the constructor
    clone = CatBoostAlgorithm(self.params)
    self.assertTrue(clone.model is not None)

    # copy model #1 and predict with the copy
    clone = model.copy()
    self.assertEqual(type(model), type(clone))
    loss_clone = metric(self.y, clone.predict(self.X))
    self.assertEqual(loss, loss_clone)

    # refitting model #1 should improve its loss ...
    model.fit(self.X, self.y)
    loss_refit = metric(self.y, model.predict(self.X))
    self.assertTrue(loss_refit < loss)

    # ... while the copy's loss stays the same
    loss_clone_after = metric(self.y, clone.predict(self.X))
    assert_almost_equal(loss_clone, loss_clone_after)
def test_fit_predict(self):
    """KNN classifier should reach a logloss below 0.6 on training data."""
    metric = Metric({"name": "logloss"})
    model = KNeighborsAlgorithm({"ml_task": "binary_classification"})
    model.fit(self.X, self.y)
    self.assertTrue(metric(self.y, model.predict(self.X)) < 0.6)
def test_fit_predict(self):
    """Random Forest classifier reaches a logloss below 1.0 on training data."""
    metric = Metric({"name": "logloss"})
    model = RandomForestAlgorithm({"ml_task": "binary_classification"})
    model.fit(self.X, self.y)
    self.assertTrue(metric(self.y, model.predict(self.X)) < 1.0)
def test_fit_predict(self):
    """Random Forest with 50 trees per step reaches a logloss below 0.6."""
    metric = Metric({"name": "logloss"})
    model = RandomForestAlgorithm({"trees_in_step": 50})
    model.fit(self.X, self.y)
    self.assertTrue(metric(self.y, model.predict(self.X)) < 0.6)
def test_fit_predict(self):
    """Extra Trees classifier reaches a logloss below 0.6 on training data."""
    metric = Metric({"name": "logloss"})
    model = ExtraTreesAlgorithm(
        {"trees_in_step": 50, "ml_task": "binary_classification"}
    )
    model.fit(self.X, self.y)
    self.assertTrue(metric(self.y, model.predict(self.X)) < 0.6)
def plot_iterations(
    learner_names, metric_name, model_path, colors, trees_in_iteration=None
):
    """Plot train/test learning curves for each learner and save to a file.

    Args:
        learner_names: names of learners whose `<name>_training.log` files
            are read from `model_path`.
        metric_name: y-axis label; also decides whether the best test
            iteration is an argmax (negated metrics) or argmin.
        model_path: directory with training logs; the plot is saved there.
        colors: per-fold color list indexed by fold number.
        trees_in_iteration: if given, the x-axis is labeled "#Trees".
    """
    plt.figure(figsize=(10, 7))
    for ln in learner_names:
        df = pd.read_csv(
            os.path.join(model_path, f"{ln}_training.log"),
            names=["iteration", "train", "test"],
        )
        fold, repeat = learner_name_to_fold_repeat(ln)
        # fix: label typo — was "Reapeat"
        repeat_str = f" Repeat {repeat+1}," if repeat is not None else ""
        # if trees_in_iteration is not None:
        #    df.iteration = df.iteration * trees_in_iteration
        plt.plot(
            df.iteration,
            df.train,
            "--",
            color=colors[fold],
            label=f"Fold {fold+1},{repeat_str} train",
        )
        # plot the test curve only when it has no missing values
        any_none = np.sum(pd.isnull(df.test))
        if any_none == 0:
            plt.plot(
                df.iteration,
                df.test,
                color=colors[fold],
                label=f"Fold {fold+1},{repeat_str} test",
            )
            # mark the best test-score iteration (max for negated metrics)
            best_iter = None
            if Metric.optimize_negative(metric_name):
                best_iter = df.test.argmax()
            else:
                best_iter = df.test.argmin()
            if best_iter is not None and best_iter != -1:
                plt.axvline(best_iter, color=colors[fold], alpha=0.3)
    if trees_in_iteration is not None:
        plt.xlabel("#Trees")
    else:
        plt.xlabel("#Iteration")
    plt.ylabel(metric_name)
    # limit number of learners in the legend
    # too many will raise warnings
    if len(learner_names) <= 15:
        plt.legend(loc="best")
    plt.tight_layout(pad=2.0)
    plot_path = os.path.join(model_path, LearningCurves.output_file_name)
    plt.savefig(plot_path)
    plt.close("all")
def __init__(self, optimize_metric="logloss", ml_task=BINARY_CLASSIFICATION):
    """Create an empty ensemble that will be optimized for `optimize_metric`."""
    self.library_version = "0.1"
    self.uid = str(uuid.uuid4())

    self.metric = Metric({"name": optimize_metric})
    # start from the worst possible value; this is the best loss
    # obtained by the ensemble so far
    self.best_loss = self.metric.get_maximum()

    self.models_map = None
    self.selected_models = []
    self.train_time = None
    # total sum of predictions, the oof of ensemble
    self.total_best_sum = None
    self.target = None
    self.target_columns = None

    self._ml_task = ml_task
    self._optimize_metric = optimize_metric
    self._additional_metrics = None
    self._threshold = None
    self._name = "ensemble"
    self._scores = []
def __call__(self, trial):
    """Train a single XGBoost trial and return its validation score.

    Minimized metrics are negated so Optuna can always maximize.
    Returns None when training fails; pruning exceptions are re-raised.
    """
    # Suggest hyperparameters in the same order as before; the sampling
    # ranges for colsample/subsample overshoot by EPS and are clipped.
    eta = trial.suggest_categorical("eta", [0.0125, 0.025, 0.05, 0.1])
    max_depth = trial.suggest_int("max_depth", 2, 12)
    reg_lambda = trial.suggest_float("lambda", EPS, 10.0, log=True)
    reg_alpha = trial.suggest_float("alpha", EPS, 10.0, log=True)
    colsample_bytree = min(
        trial.suggest_float("colsample_bytree", 0.3, 1.0 + EPS), 1.0
    )
    subsample = min(trial.suggest_float("subsample", 0.3, 1.0 + EPS), 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 100)

    param = {
        "objective": self.objective,
        "eval_metric": self.eval_metric_name,
        "tree_method": "hist",
        "booster": "gbtree",
        "eta": eta,
        "max_depth": max_depth,
        "lambda": reg_lambda,
        "alpha": reg_alpha,
        "colsample_bytree": colsample_bytree,
        "subsample": subsample,
        "min_child_weight": min_child_weight,
        "n_jobs": self.n_jobs,
        "seed": self.seed,
    }
    if self.num_class is not None:
        param["num_class"] = self.num_class

    try:
        pruning_callback = optuna.integration.XGBoostPruningCallback(
            trial, f"validation-{self.eval_metric_name}"
        )
        bst = xgb.train(
            param,
            self.dtrain,
            self.rounds,
            evals=[(self.dvalidation, "validation")],
            early_stopping_rounds=self.early_stopping_rounds,
            callbacks=[pruning_callback],
            verbose_eval=False,
        )
        preds = bst.predict(self.dvalidation, ntree_limit=bst.best_ntree_limit)
        score = self.eval_metric(self.y_validation, preds)
        if Metric.optimize_negative(self.eval_metric.name):
            score *= -1.0
    except optuna.exceptions.TrialPruned as e:
        raise e
    except Exception as e:
        print("Exception in XgboostObjective", str(e))
        return None
    return score
def test_fit_predict(self):
    """Two independently trained CatBoost models give the same loss."""
    metric = Metric({"name": "logloss"})
    previous = None
    for _ in range(2):
        model = CatBoostAlgorithm(self.params)
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            assert_almost_equal(loss, previous)
        previous = loss
def test_fit_predict(self):
    """Each extra fit of the same CatBoost model should lower the loss."""
    metric = Metric({"name": "logloss"})
    model = CatBoostAlgorithm(self.params)
    previous = None
    for _ in range(5):
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            # require a strict improvement beyond a small margin
            self.assertTrue(loss + 0.001 < previous)
        previous = loss
def test_reproduce_fit(self):
    """Retraining CatBoost from scratch reproduces the same loss every time."""
    metric = Metric({"name": "logloss"})
    previous = None
    for _ in range(3):
        model = CatBoostAlgorithm(self.params)
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            assert_almost_equal(previous, loss)
        previous = loss
def test_fit_predict(self):
    """Each extra fit of the same network should lower the loss."""
    metric = Metric({"name": "logloss"})
    model = NeuralNetworkAlgorithm(self.params)
    previous = None
    for _ in range(3):
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            # require a strict improvement beyond a tiny margin
            self.assertTrue(loss + 0.000001 < previous)
        previous = loss
def test_reproduce_fit_regression(self):
    """Baseline regressor is deterministic across repeated trainings."""
    metric = Metric({"name": "rmse"})
    previous = None
    for _ in range(3):
        model = BaselineRegressorAlgorithm({"ml_task": "regression"})
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            assert_almost_equal(previous, loss)
        previous = loss
def test_fit_predict(self):
    """Each extra fit of the same LightGBM model should lower the loss."""
    metric = Metric({"name": "logloss"})
    model = LightgbmAlgorithm(self.params)
    previous = None
    for _ in range(3):
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            # require a strict improvement beyond a small margin
            self.assertTrue(loss + 0.001 < previous)
        previous = loss
def test_create(self):
    """Logloss is small for perfect predictions, large for inverted ones."""
    m = Metric({"name": "logloss"})
    y_true = np.array([0, 0, 1, 1])
    # perfect predictions -> near-zero loss
    self.assertTrue(m(y_true, np.array([0, 0, 1, 1])) < 0.1)
    # fully inverted predictions -> large loss
    self.assertTrue(m(y_true, np.array([1, 1, 0, 0])) > 1.0)
def test_sample_weight(self):
    """Uniform sample weights must not change any metric's value."""
    y_true = np.array([0, 0, 1, 1])
    y_predicted = np.array([0, 0, 0, 1])
    weights = np.array([1, 1, 1, 1])
    for name in ["logloss", "auc", "acc", "rmse", "mse", "mae", "r2", "mape"]:
        metric = Metric({"name": name})
        unweighted = metric(y_true, y_predicted)
        weighted = metric(y_true, y_predicted, weights)
        assert_almost_equal(unweighted, weighted)
def __init__(
    self,
    results_path,
    ml_task,
    eval_metric,
    time_budget=3600,
    init_params=None,
    verbose=True,
    n_jobs=-1,
    random_state=42,
):
    """Set up the Optuna tuner: validate the metric, prepare the study dir.

    Args:
        results_path: AutoML results directory; an "optuna" subdir is used.
        ml_task: machine-learning task type.
        eval_metric: metric object whose name must be supported here.
        time_budget: tuning time budget in seconds.
        init_params: optional dict of previously tuned parameters.
            Fix: default changed from a shared mutable `{}` to None
            (behavior for callers is unchanged).
        verbose: when False, Optuna logging is silenced.
        n_jobs: parallelism passed to trained models.
        random_state: seed for reproducibility.

    Raises:
        AutoMLException: if `eval_metric` is not supported.
    """
    if eval_metric.name not in [
        "auc",
        "logloss",
        "rmse",
        "mse",
        "mae",
        "mape",
        "r2",
        "spearman",
        "pearson",
        "f1",
        "average_precision",
        "accuracy",
        "user_defined_metric",
    ]:
        raise AutoMLException(f"Metric {eval_metric.name} is not supported")

    self.study_dir = os.path.join(results_path, "optuna")
    if not os.path.exists(self.study_dir):
        try:
            os.mkdir(self.study_dir)
        except Exception as e:
            # best-effort: tuning can proceed, studies just won't persist
            print("Problem while creating directory for optuna studies.", str(e))
    self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
    # fix: avoid the mutable-default-argument pitfall
    self.tuning = {} if init_params is None else init_params
    self.eval_metric = eval_metric

    # minimized metrics are stored negated, hence "maximize"
    self.direction = (
        "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize"
    )
    self.n_warmup_steps = (
        500  # set large enough to give small learning rates a chance
    )
    self.time_budget = time_budget
    self.verbose = verbose
    self.ml_task = ml_task
    self.n_jobs = n_jobs
    self.random_state = random_state
    self.cat_features_indices = []
    self.load()
    if not self.verbose:
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)
def test_reproduce_fit(self):
    """Seeded Extra Trees training is reproducible run to run."""
    metric = Metric({"name": "logloss"})
    params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"}
    previous = None
    for _ in range(3):
        model = ExtraTreesAlgorithm(params)
        model.fit(self.X, self.y)
        loss = metric(self.y, model.predict(self.X))
        if previous is not None:
            assert_almost_equal(previous, loss)
        previous = loss