Example #1
    def __init__(
        self,
        optimize_metric="logloss",
        ml_task=BINARY_CLASSIFICATION,
        is_stacked=False,
        max_single_prediction_time=None,
    ):
        self.library_version = "0.1"
        self.uid = str(uuid.uuid4())

        self.metric = Metric({"name": optimize_metric})
        self.best_loss = self.metric.get_maximum()  # the best loss obtained by ensemble
        self.models_map = None
        self.selected_models = []
        self.train_time = None
        self.total_best_sum = None  # total sum of predictions, the oof of ensemble
        self.target = None
        self.target_columns = None
        self.sample_weight = None
        self._ml_task = ml_task
        self._optimize_metric = optimize_metric
        self._is_stacked = is_stacked

        self._additional_metrics = None
        self._threshold = None
        self._name = "Ensemble_Stacked" if is_stacked else "Ensemble"
        self._scores = []
        self.oof_predictions = None
        self._oof_predictions_fname = None
        self._single_prediction_time = None  # prediction time on single sample
        self._max_single_prediction_time = max_single_prediction_time
        self.model_prediction_time = {}
Example #2
 def test_metric_improvement(self):
     params = {"name": "logloss"}
     m = Metric(params)
     y_true = np.array([0, 0, 1, 1])
     y_predicted = np.array([0, 0, 0, 1])
     score_1 = m(y_true, y_predicted)
     y_true = np.array([0, 0, 1, 1])
     y_predicted = np.array([0, 0, 1, 1])
     score_2 = m(y_true, y_predicted)
     self.assertTrue(m.improvement(score_1, score_2))
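A hedged sketch tying Examples #1 and #2 together: get_maximum() seeds the best-loss search with the worst possible score, and improvement(previous, current) appears to reduce to "current is lower", since Metric orders every score so that lower is better (import path assumed):

from supervised.utils.metric import Metric  # import path assumed

m = Metric({"name": "logloss"})
best_loss = m.get_maximum()  # worst possible score, so the first model always wins
for model_loss in (0.9, 0.5, 0.7):
    if m.improvement(best_loss, model_loss):
        best_loss = model_loss
print(best_loss)  # 0.5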
Example #3
 def __init__(self, params):
     super(SklearnTreesEnsembleClassifierAlgorithm, self).__init__(params)
     self.log_metric = Metric(
         {"name": self.params.get("eval_metric_name", "logloss")})
     # max iters is used by model_framework, max_steps is used internally
     self.max_iters = 1
     if params.get("ml_task") == BINARY_CLASSIFICATION:
         self.predict_function = predict_proba_function_binary
     else:
         self.predict_function = predict_proba_function_multiclass
Example #4
 def test_fit_predict(self):
     metric = Metric({"name": "logloss"})
     lgb = LightgbmAlgorithm(self.params)
     lgb.fit(self.X, self.y)
     y_predicted = lgb.predict(self.X)
     loss = metric(self.y, y_predicted)
     self.assertTrue(loss < 0.7)
Example #5
 def test_mape_metric(self):
     params = {"name": "mape"}
     m = Metric(params)
     y_true = np.array([0, 0, 1, 1])
     y_predicted = np.array([0, 0, 1, 1])
     score = m(y_true, y_predicted)
     self.assertEqual(score, 0.0)
Example #6
 def __init__(self, params):
     super(SklearnTreesEnsembleClassifierAlgorithm, self).__init__(params)
     self.log_metric = Metric({"name": "logloss"})
     # max iters is used by model_framework, max_steps is used internally
     self.max_iters = 1
     self.predict_function = predict_proba_function
Example #7
 def __init__(self, params):
     super(MetricLogger, self).__init__(params)
     self.name = params.get("name", "metric_logger")
     self.loss_values = {}
     self.metrics = []
     for metric_name in params.get("metric_names", []):
         self.metrics.append(Metric({"name": metric_name}))
Example #8
 def test_fit_predict(self):
     metric = Metric({"name": "mse"})
     nn = MLPRegressorAlgorithm(self.params)
     nn.fit(self.X, self.y)
     y_predicted = nn.predict(self.X)
     loss = metric(self.y, y_predicted)
     self.assertLess(loss, 1)
Example #9
    def __call__(self, trial):
        try:
            params = {
                "n_neighbors": trial.suggest_int("n_neighbors", 1, 128),
                "weights": trial.suggest_categorical(
                    "weights", ["uniform", "distance"]
                ),
                "n_jobs": self.n_jobs,
                "rows_limit": 100000,
                "ml_task": self.ml_task,
            }
            Algorithm = (
                KNeighborsRegressorAlgorithm
                if self.ml_task == REGRESSION
                else KNeighborsAlgorithm
            )
            model = Algorithm(params)
            model.fit(self.X_train, self.y_train, sample_weight=self.sample_weight)
            preds = model.predict(self.X_validation)

            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0

        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in KNNObjective", str(e))
            return None

        return score
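A minimal wiring sketch for the objective above; the KNNObjective construction is elided because its constructor signature is not shown here, and the study direction mirrors Example #29 (Optuna API calls are standard; Metric import path assumed):

import optuna
from supervised.utils.metric import Metric  # import path assumed

# eval_metric and objective = KNNObjective(...) are assumed to be built already
direction = "maximize" if Metric.optimize_negative(eval_metric.name) else "minimize"
study = optuna.create_study(direction=direction)
study.optimize(objective, n_trials=25, timeout=600)
print(study.best_params)  # e.g. {"n_neighbors": ..., "weights": ...}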
Example #10
 def test_fit_predict(self):
     metric = Metric({"name": "logloss"})
     nn = MLPAlgorithm(self.params)
     nn.fit(self.X, self.y)
     y_predicted = nn.predict_proba(self.X)
     loss = metric(self.y, y_predicted)
     self.assertLess(loss, 2)
Example #11
 def test_copy(self):
     # train model #1
     metric = Metric({"name": "logloss"})
     nn = NeuralNetworkAlgorithm(self.params)
     nn.fit(self.X, self.y)
     y_predicted = nn.predict(self.X)
     loss = metric(self.y, y_predicted)
     # create model #2
     nn2 = NeuralNetworkAlgorithm(self.params)
     # model #2 is not initialized in constructor
     self.assertTrue(nn2.model is None)
     # do a copy and use it for predictions
     nn2 = nn.copy()
     self.assertEqual(type(nn), type(nn2))
     y_predicted = nn2.predict(self.X)
     loss2 = metric(self.y, y_predicted)
     self.assertEqual(loss, loss2)
     # fit model #1, there should be improvement in loss
     nn.fit(self.X, self.y)
     y_predicted = nn.predict(self.X)
     loss3 = metric(self.y, y_predicted)
     self.assertTrue(loss3 < loss)
     # the loss of model #2 should not change
     y_predicted = nn2.predict(self.X)
     loss4 = metric(self.y, y_predicted)
     assert_almost_equal(loss2, loss4)
Example #12
 def test_r2_metric(self):
     params = {"name": "r2"}
     m = Metric(params)
     y_true = np.array([0, 0, 1, 1])
     y_predicted = np.array([0, 0, 1, 1])
     score = m(y_true, y_predicted)
     self.assertEqual(score, -1.0)  # negative r2
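The -1.0 above is not a bug: a perfect R^2 is 1.0, but Metric appears to store maximization metrics negated so that lower is always better. A small sketch of how callers recover the sign, as done in Examples #9, #20, and #29 (import path assumed):

import numpy as np
from supervised.utils.metric import Metric  # import path assumed

m = Metric({"name": "r2"})
score = m(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 1]))
print(score)  # -1.0, i.e. a perfect R^2 of 1.0, negated
if Metric.optimize_negative("r2"):  # expected True for maximization metrics
    print(-score)  # 1.0, the human-readable value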
Example #13
 def test_copy(self):
     # train model #1
     metric = Metric({"name": "logloss"})
     cat = CatBoostAlgorithm(self.params)
     cat.fit(self.X, self.y)
     y_predicted = cat.predict(self.X)
     loss = metric(self.y, y_predicted)
     # create model #2
     cat2 = CatBoostAlgorithm(self.params)
     # model #2 is initialized in constructor
     self.assertTrue(cat2.model is not None)
     # do a copy and use it for predictions
     cat2 = cat.copy()
     self.assertEqual(type(cat), type(cat2))
     y_predicted = cat2.predict(self.X)
     loss2 = metric(self.y, y_predicted)
     self.assertEqual(loss, loss2)
     # fit model #1, there should be improvement in loss
     cat.fit(self.X, self.y)
     y_predicted = cat.predict(self.X)
     loss3 = metric(self.y, y_predicted)
     self.assertTrue(loss3 < loss)
     # the loss of model #2 should not change
     y_predicted = cat2.predict(self.X)
     loss4 = metric(self.y, y_predicted)
     assert_almost_equal(loss2, loss4)
Example #14
    def test_fit_predict(self):
        metric = Metric({"name": "logloss"})
        params = {"ml_task": "binary_classification"}
        la = KNeighborsAlgorithm(params)

        la.fit(self.X, self.y)
        y_predicted = la.predict(self.X)
        self.assertTrue(metric(self.y, y_predicted) < 0.6)
Example #15
    def test_fit_predict(self):
        metric = Metric({"name": "logloss"})
        params = {"ml_task": "binary_classification"}
        rf = RandomForestAlgorithm(params)

        rf.fit(self.X, self.y)
        y_predicted = rf.predict(self.X)
        self.assertTrue(metric(self.y, y_predicted) < 1.0)
Example #16
    def test_fit_predict(self):
        metric = Metric({"name": "logloss"})
        params = {"trees_in_step": 50}
        rf = RandomForestAlgorithm(params)

        rf.fit(self.X, self.y)
        y_predicted = rf.predict(self.X)
        self.assertTrue(metric(self.y, y_predicted) < 0.6)
Example #17
    def test_fit_predict(self):
        metric = Metric({"name": "logloss"})
        params = {"trees_in_step": 50, "ml_task": "binary_classification"}
        rf = ExtraTreesAlgorithm(params)

        rf.fit(self.X, self.y)
        y_predicted = rf.predict(self.X)
        self.assertTrue(metric(self.y, y_predicted) < 0.6)
Example #18
    def plot_iterations(
        learner_names, metric_name, model_path, colors, trees_in_iteration=None
    ):
        plt.figure(figsize=(10, 7))
        for ln in learner_names:
            df = pd.read_csv(
                os.path.join(model_path, f"{ln}_training.log"),
                names=["iteration", "train", "test"],
            )

            fold, repeat = learner_name_to_fold_repeat(ln)
            repeat_str = f" Reapeat {repeat+1}," if repeat is not None else ""
            # if trees_in_iteration is not None:
            #    df.iteration = df.iteration * trees_in_iteration
            plt.plot(
                df.iteration,
                df.train,
                "--",
                color=colors[fold],
                label=f"Fold {fold+1},{repeat_str} train",
            )
            any_none = np.sum(pd.isnull(df.test))
            if any_none == 0:
                plt.plot(
                    df.iteration,
                    df.test,
                    color=colors[fold],
                    label=f"Fold {fold+1},{repeat_str} test",
                )

            if Metric.optimize_negative(metric_name):
                best_iter = df.test.argmax()
            else:
                best_iter = df.test.argmin()

            if best_iter is not None and best_iter != -1:
                plt.axvline(best_iter, color=colors[fold], alpha=0.3)

        if trees_in_iteration is not None:
            plt.xlabel("#Trees")
        else:
            plt.xlabel("#Iteration")
        plt.ylabel(metric_name)

        # limit number of learners in the legend
        # too many will raise warnings
        if len(learner_names) <= 15:
            plt.legend(loc="best")

        plt.tight_layout(pad=2.0)
        plot_path = os.path.join(model_path, LearningCurves.output_file_name)
        plt.savefig(plot_path)
        plt.close("all")
Example #19
    def __init__(self, optimize_metric="logloss", ml_task=BINARY_CLASSIFICATION):
        self.library_version = "0.1"
        self.uid = str(uuid.uuid4())

        self.metric = Metric({"name": optimize_metric})
        self.best_loss = self.metric.get_maximum()  # the best loss obtained by ensemble
        self.models_map = None
        self.selected_models = []
        self.train_time = None
        self.total_best_sum = None  # total sum of predictions, the oof of ensemble
        self.target = None
        self.target_columns = None
        self._ml_task = ml_task
        self._optimize_metric = optimize_metric

        self._additional_metrics = None
        self._threshold = None
        self._name = "ensemble"
        self._scores = []
Example #20
    def __call__(self, trial):
        param = {
            "objective": self.objective,
            "eval_metric": self.eval_metric_name,
            "tree_method": "hist",
            "booster": "gbtree",
            "eta": trial.suggest_categorical("eta", [0.0125, 0.025, 0.05, 0.1]),
            "max_depth": trial.suggest_int("max_depth", 2, 12),
            "lambda": trial.suggest_float("lambda", EPS, 10.0, log=True),
            "alpha": trial.suggest_float("alpha", EPS, 10.0, log=True),
            "colsample_bytree": min(
                trial.suggest_float("colsample_bytree", 0.3, 1.0 + EPS), 1.0
            ),
            "subsample": min(trial.suggest_float("subsample", 0.3, 1.0 + EPS), 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 100),
            "n_jobs": self.n_jobs,
            "seed": self.seed,
        }
        if self.num_class is not None:
            param["num_class"] = self.num_class
        try:
            pruning_callback = optuna.integration.XGBoostPruningCallback(
                trial, f"validation-{self.eval_metric_name}"
            )
            bst = xgb.train(
                param,
                self.dtrain,
                self.rounds,
                evals=[(self.dvalidation, "validation")],
                early_stopping_rounds=self.early_stopping_rounds,
                callbacks=[pruning_callback],
                verbose_eval=False,
            )
            preds = bst.predict(self.dvalidation, ntree_limit=bst.best_ntree_limit)
            score = self.eval_metric(self.y_validation, preds)
            if Metric.optimize_negative(self.eval_metric.name):
                score *= -1.0
        except optuna.exceptions.TrialPruned as e:
            raise e
        except Exception as e:
            print("Exception in XgboostObjective", str(e))
            return None

        return score
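For completeness, a hedged sketch of the study side that makes XGBoostPruningCallback effective; the MedianPruner choice is only an assumption suggested by the n_warmup_steps setting in Example #29:

import optuna

study = optuna.create_study(
    direction="minimize",  # "maximize" when Metric.optimize_negative(...) is True
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=500),  # pruner type assumed
)
# study.optimize(XgboostObjective(...), timeout=time_budget)  # construction elided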
Example #21
 def test_fit_predict(self):
     metric = Metric({"name": "logloss"})
     loss_prev = None
     for _ in range(2):
         cat = CatBoostAlgorithm(self.params)
         cat.fit(self.X, self.y)
         y_predicted = cat.predict(self.X)
         loss = metric(self.y, y_predicted)
         if loss_prev is not None:
             assert_almost_equal(loss, loss_prev)
         loss_prev = loss
Example #22
 def test_fit_predict(self):
     metric = Metric({"name": "logloss"})
     cat = CatBoostAlgorithm(self.params)
     loss_prev = None
     for _ in range(5):
         cat.fit(self.X, self.y)
         y_predicted = cat.predict(self.X)
         loss = metric(self.y, y_predicted)
         if loss_prev is not None:
             self.assertTrue(loss + 0.001 < loss_prev)
         loss_prev = loss
Example #23
 def test_reproduce_fit(self):
     metric = Metric({"name": "logloss"})
     prev_loss = None
     for _ in range(3):
         model = CatBoostAlgorithm(self.params)
         model.fit(self.X, self.y)
         y_predicted = model.predict(self.X)
         loss = metric(self.y, y_predicted)
         if prev_loss is not None:
             assert_almost_equal(prev_loss, loss)
         prev_loss = loss
Example #24
 def test_fit_predict(self):
     metric = Metric({"name": "logloss"})
     nn = NeuralNetworkAlgorithm(self.params)
     loss_prev = None
     for _ in range(3):
         nn.fit(self.X, self.y)
         y_predicted = nn.predict(self.X)
         loss = metric(self.y, y_predicted)
         if loss_prev is not None:
             self.assertTrue(loss + 0.000001 < loss_prev)
         loss_prev = loss
Example #25
 def test_reproduce_fit_regression(self):
     metric = Metric({"name": "rmse"})
     prev_loss = None
     for _ in range(3):
         model = BaselineRegressorAlgorithm({"ml_task": "regression"})
         model.fit(self.X, self.y)
         y_predicted = model.predict(self.X)
         loss = metric(self.y, y_predicted)
         if prev_loss is not None:
             assert_almost_equal(prev_loss, loss)
         prev_loss = loss
Example #26
 def test_fit_predict(self):
     metric = Metric({"name": "logloss"})
     lgb = LightgbmAlgorithm(self.params)
     loss_prev = None
     for _ in range(3):
         lgb.fit(self.X, self.y)
         y_predicted = lgb.predict(self.X)
         loss = metric(self.y, y_predicted)
         if loss_prev is not None:
             self.assertTrue(loss + 0.001 < loss_prev)
         loss_prev = loss
Example #27
 def test_create(self):
     params = {"name": "logloss"}
     m = Metric(params)
     y_true = np.array([0, 0, 1, 1])
     y_predicted = np.array([0, 0, 1, 1])
     score = m(y_true, y_predicted)
     self.assertTrue(score < 0.1)
     y_true = np.array([0, 0, 1, 1])
     y_predicted = np.array([1, 1, 0, 0])
     score = m(y_true, y_predicted)
     self.assertTrue(score > 1.0)
Example #28
    def test_sample_weight(self):
        metrics = ["logloss", "auc", "acc", "rmse", "mse", "mae", "r2", "mape"]
        for m in metrics:
            metric = Metric({"name": m})
            y_true = np.array([0, 0, 1, 1])
            y_predicted = np.array([0, 0, 0, 1])
            sample_weight = np.array([1, 1, 1, 1])

            score_1 = metric(y_true, y_predicted)
            score_2 = metric(y_true, y_predicted, sample_weight)
            assert_almost_equal(score_1, score_2)
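A hedged counterpart sketch: with non-uniform weights the two scores should diverge, assuming Metric forwards sample_weight to the underlying scorer (import path assumed):

import numpy as np
from supervised.utils.metric import Metric  # import path assumed

metric = Metric({"name": "logloss"})
y_true = np.array([0, 0, 1, 1])
y_predicted = np.array([0.1, 0.2, 0.7, 0.9])
unweighted = metric(y_true, y_predicted)
weighted = metric(y_true, y_predicted, np.array([1, 1, 1, 10]))
print(unweighted, weighted)  # expected to differ once weights are non-uniform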
Example #29
    def __init__(
        self,
        results_path,
        ml_task,
        eval_metric,
        time_budget=3600,
        init_params=None,
        verbose=True,
        n_jobs=-1,
        random_state=42,
    ):
        if eval_metric.name not in [
            "auc",
            "logloss",
            "rmse",
            "mse",
            "mae",
            "mape",
            "r2",
            "spearman",
            "pearson",
            "f1",
            "average_precision",
            "accuracy",
            "user_defined_metric",
        ]:
            raise AutoMLException(f"Metric {eval_metric.name} is not supported")

        self.study_dir = os.path.join(results_path, "optuna")
        if not os.path.exists(self.study_dir):
            try:
                os.mkdir(self.study_dir)
            except Exception as e:
                print("Problem while creating directory for optuna studies.",
                      str(e))
        self.tuning_fname = os.path.join(self.study_dir, "optuna.json")
        self.tuning = init_params if init_params is not None else {}
        self.eval_metric = eval_metric

        self.direction = ("maximize" if Metric.optimize_negative(
            eval_metric.name) else "minimize")
        self.n_warmup_steps = (
            500  # set large enough to give small learning rates a chance
        )
        self.time_budget = time_budget
        self.verbose = verbose
        self.ml_task = ml_task
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.cat_features_indices = []
        self.load()
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.CRITICAL)
Example #30
 def test_reproduce_fit(self):
     metric = Metric({"name": "logloss"})
     params = {"trees_in_step": 1, "seed": 1, "ml_task": "binary_classification"}
     prev_loss = None
     for _ in range(3):
         model = ExtraTreesAlgorithm(params)
         model.fit(self.X, self.y)
         y_predicted = model.predict(self.X)
         loss = metric(self.y, y_predicted)
         if prev_loss is not None:
             assert_almost_equal(prev_loss, loss)
         prev_loss = loss