Example #1
    def run(self):
        # update status
        mlmodel = MLModel.objects.get(pk=self.job_params.get("db_id"))
        mlmodel.status = "started"
        mlmodel.save()
        mlexperiment = MLExperiment.objects.get(
            pk=mlmodel.parent_experiment_id)
        print("mlexperiment", mlexperiment.id)
        print(mlexperiment.parent_columns_usage)

        # prepare data
        columns_usage = mlexperiment.parent_columns_usage.columns_usage
        print("cols", columns_usage)
        training_dataframe = mlexperiment.parent_training_dataframe
        print("training", training_dataframe.absolute_path)
        metric_params = mlexperiment.params.get("metric")
        validation_params = mlexperiment.params.get("validation")
        preprocessing_params = mlexperiment.params.get("preprocessing")

        df_train = DataServe.get(training_dataframe.absolute_path)

        training_data = {
            "train": {
                "X": df_train[columns_usage.get("input")],
                "y": df_train[columns_usage.get("target")],
            }
        }

        # prepare model hyper parameters
        learner_params = {
            "learner_type": mlmodel.model_type,
            "max_iters": 3,
            "max_depth": 1,
        }
        for k, v in mlmodel.params.items():
            learner_params[k] = v

        train_params = {
            "preprocessing": preprocessing_params,
            "validation": validation_params,
            "learner": learner_params,
        }
        print(train_params)
        # prepare needed callbacks
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        # run the training
        il = IterativeLearner(train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(training_data)
        # save the model
        save_details = il.save()
        logger.info(save_details)
        # store model details in platform database
        mlmodel.status = "done"
        mlmodel.save_details = save_details
        mlmodel.all_params = train_params  # all parameters are needed later to load the model
        mlmodel.save()
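The `all_params` and `save_details` stored at the end of `run` are exactly what is needed to rebuild the trained learner later; Example #7 below does this inside a batch-prediction job. A minimal sketch of the reload path, assuming an already fetched `mlmodel` row and an input frame `input_df` (placeholder names):

    # rebuild the learner from the stored parameters, then load its saved state
    il = IterativeLearner(mlmodel.all_params)
    il.load(mlmodel.save_details)
    predictions = il.predict(input_df)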
Example #2
    def from_json(self, json_data):
        # pretty sure that this can be easily refactored
        if json_data["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data)
        else:
            self._best_model = IterativeLearner(json_data.get("params"))
            self._best_model.from_json(json_data)
Example #3
    def test_fit_and_predict(self):
        il = IterativeLearner(self.train_params, callbacks=[])
        il.train(self.data)

        y_predicted = il.predict(self.X)
        metric = Metric({"name": "logloss"})
        loss = metric(self.y, y_predicted)
        self.assertTrue(loss < 0.4)
Example #4
    def test_fit_and_predict(self):

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(self.data)

        y_predicted = il.predict(self.X)
        metric = Metric({"name": "logloss"})
        loss = metric(self.y, y_predicted)
        self.assertTrue(loss < 0.4)
Example #5
    def test_fit_and_predict_split(self):
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
        il.train(self.data)

        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
Example #6
    def test_fit_and_predict(self):
        MAX_STEPS = 100
        # `additional` is a module-level params dict for the learner,
        # imported at the top of the test file
        additional["max_steps"] = MAX_STEPS
        iters_cnt = 5
        max_iters = MaxItersConstraint({"max_iters": iters_cnt})
        metric_logger = MetricLogger({"metric_names": ["logloss"]})
        il = IterativeLearner(self.train_params, callbacks=[max_iters, metric_logger])
        il.train(self.data)
        metric_logs = il.get_metric_logs()
        for k in range(self.kfolds):
            self.assertEqual(
                len(metric_logs[il.learners[k].uid]["train"]["logloss"]), iters_cnt
            )
            self.assertNotEqual(
                len(metric_logs[il.learners[k].uid]["train"]["logloss"]), MAX_STEPS
            )
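The assertions above also show how `get_metric_logs` is keyed: per learner `uid`, then per data split, then per metric name. A minimal sketch of walking those logs for a trained learner `il` (same names as in the test):

    metric_logs = il.get_metric_logs()
    for learner in il.learners:
        history = metric_logs[learner.uid]["train"]["logloss"]
        print(learner.uid, len(history), history[-1])  # one entry per training iteration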
Example #7
    def run(self):
        # read data
        logger.info("ComputeBatchPrediction::run")

        batch = MLBatchPrediction.objects.get(pk=self.job_params.get("db_id"))
        logger.info("batch", batch)
        # {'parent_mlmodel': 9, 'parent_dataframe': 1, 'db_id': 1, 'created_by_id': 1, 'parent_organization_id': 1, 'parent_project_id': 1}

        mlmodel = MLModel.objects.get(pk=self.job_params.get("parent_mlmodel"))
        logger.info(mlmodel.save_details)
        logger.info(mlmodel.all_params)
        il = IterativeLearner(mlmodel.all_params)
        il.load(mlmodel.save_details)
        logger.info(batch.parent_dataframe.absolute_path)
        input_df = DataServe.get(batch.parent_dataframe.absolute_path)

        predictions = il.predict(input_df)
        logger.info(predictions)

        filename = "predictions-{0}.csv".format(str(uuid.uuid4())[:8])
        organization_slug = batch.parent_organization.slug
        project_id = batch.parent_project.id
        relative_dir = "org_{0}_proj_{1}".format(organization_slug, project_id)
        relative_path = os.path.join(relative_dir, filename)
        result_absolute_path = Storage().get_path(relative_dir, filename)

        logger.info(result_absolute_path)

        df = pd.DataFrame({"prediction": predictions})
        df.to_csv(result_absolute_path, index=False)

        # create mljar data frame
        result_df = DataFrame(
            source_id=self.job_params.get("parent_dataframe"),  # fix this
            absolute_path=result_absolute_path,
            file_size=1,  # TODO fix the file size
            columns_details="",  # we can describe any data frame (always :-))
            preview_absolute_path="",
            created_by_id=self.job_params["created_by_id"],
            parent_organization_id=self.job_params["parent_organization_id"],
            parent_project_id=self.job_params["parent_project_id"],
        )
        result_df.save()

        batch.result_dataframe = result_df
        batch.status = "done"
        batch.save()
Example #8
    def test_fit_and_predict(self):
        MAX_STEPS = 10
        # `additional` is a module-level params dict for the learner,
        # imported at the top of the test file
        additional["max_steps"] = MAX_STEPS
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params, callbacks=[metric_logger])
        il.train(self.data)
        metric_logs = il.get_metric_logs()
        self.assertEqual(
            len(metric_logs[il.learners[0].uid]["train"]["logloss"]),
            len(metric_logs[il.learners[0].uid]["train"]["auc"]),
        )
        self.assertEqual(
            len(metric_logs[il.learners[0].uid]["train"]["logloss"]),
            len(metric_logs[il.learners[0].uid]["iters"]),
        )
        self.assertEqual(
            len(metric_logs[il.learners[0].uid]["train"]["logloss"]), MAX_STEPS
        )
Example #9
    def test_fit_and_predict_kfold(self):
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        params = copy.deepcopy(self.train_params)
        params["validation"] = {
            "validation_type": "kfold",
            "k_folds": 5,
            "shuffle": True,
        }
        il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
        il.train(self.data)
        oof = il.get_out_of_folds()

        self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
        self.assertTrue(
            np.array_equal(oof.index, self.data["train"]["X"].index))
        self.assertEqual(oof.shape[0], self.data["train"]["X"].shape[0])

        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
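The out-of-folds predictions checked here are also what the ensemble step consumes in Example #15. A minimal sketch of that hand-off, assuming a list `models` of trained k-fold learners and the training target `y` (placeholder names, usage as in `AutoML.ensemble_step`):

    ensemble = Ensemble()
    X_oof = ensemble.get_oof_matrix(models)  # per-model out-of-folds predictions as features
    ensemble.fit(X_oof, y)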
Example #10
    def from_json(self, json_desc):
        self.library_version = json_desc.get("library_version",
                                             self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name",
                                            self.algorithm_name)
        self.algorithm_short_name = json_desc.get("algorithm_short_name",
                                                  self.algorithm_short_name)
        self.uid = json_desc.get("uid", self.uid)
        self.selected_models = []
        models_json = json_desc.get("models")
        for selected in models_json:
            model = selected["model"]
            repeat = selected["repeat"]

            il = IterativeLearner(model.get("params"))
            il.from_json(model)
            self.selected_models += [
                # {"model": LearnerFactory.load(model), "repeat": repeat}
                {
                    "model": il,
                    "repeat": repeat
                }
            ]
Example #11
    def train_model(self, params, X, y):
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        il = IterativeLearner(params, callbacks=[early_stop, time_constraint])
        il_key = il.get_params_key()
        if il_key in self._models_params_keys:
            return None
        self._models_params_keys += [il_key]
        if self.should_train_next(il.get_name()):
            il.train({"train": {"X": X, "y": y}})
            return il
        return None
Example #12
    def train_model(self, params, X, y):
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        early_stop = EarlyStopping({"metric": {"name": self._optimize_metric}})
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        il = IterativeLearner(
            params, callbacks=[early_stop, time_constraint, metric_logger]
        )
        il_key = il.get_params_key()
        if il_key in self._models_params_keys:
            self._progress_bar.update(1)
            return None
        self._models_params_keys += [il_key]
        if self.should_train_next(il.get_name()):
            il.train({"train": {"X": X, "y": y}})
            self._progress_bar.update(1)
            return il
        self._progress_bar.update(1)
        return None
Example #13
    def test_save_and_load(self):
        il = IterativeLearner(self.train_params, callbacks=[])
        il.train(self.data)

        metric = Metric({"name": "logloss"})
        loss = metric(self.y, il.predict(self.X))

        json_desc = il.to_json()
        il2 = IterativeLearner(json_desc.get("params"), callbacks=[])
        self.assertTrue(il.uid != il2.uid)

        il2.from_json(json_desc)
        self.assertTrue(il.uid == il2.uid)
        loss2 = metric(self.y, il2.predict(self.X))
        assert_almost_equal(loss, loss2)

        uids = [i.uid for i in il.learners]
        uids2 = [i.uid for i in il2.learners]
        for u in uids:
            self.assertTrue(u in uids2)
Example #14
    def test_save_and_load(self):
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        il = IterativeLearner(self.train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(self.data)
        y_predicted = il.predict(self.data["train"]["X"])
        metric = Metric({"name": "logloss"})
        loss_1 = metric(self.data["train"]["y"], y_predicted)

        json_desc = il.to_json()

        il2 = IterativeLearner(self.train_params, callbacks=[])
        self.assertTrue(il.uid != il2.uid)
        il2.from_json(json_desc)
        self.assertTrue(il.uid == il2.uid)
        y_predicted_2 = il2.predict(self.data["train"]["X"])
        loss_2 = metric(self.data["train"]["y"], y_predicted_2)

        assert_almost_equal(loss_1, loss_2)

        uids = [i.uid for i in il.learners]
        uids2 = [i.uid for i in il2.learners]
        for u in uids:
            self.assertTrue(u in uids2)
Example #15
class AutoML:
    def __init__(
        self,
        total_time_limit=60 * 60,
        learner_time_limit=120,
        algorithms=["CatBoost", "Xgboost", "RF", "LightGBM", "NN"],
        start_random_models=10,
        hill_climbing_steps=3,
        top_models_to_improve=5,
        train_ensemble=True,
        verbose=True,
    ):
        self._total_time_limit = total_time_limit
        self._time_limit = learner_time_limit  # time limit in seconds for a single learner
        self._train_ensemble = train_ensemble
        self._models = []  # instances of the iterative learner framework or Ensemble
        self._models_params_keys = []
        self._best_model = None  # an instance of the iterative learner framework or Ensemble
        self._validation = {"validation_type": "kfold", "k_folds": 5, "shuffle": True}

        self._start_random_models = start_random_models
        self._hill_climbing_steps = hill_climbing_steps
        self._top_models_to_improve = top_models_to_improve
        self._algorithms = algorithms
        self._verbose = verbose

        if self._total_time_limit is not None:
            # each algorithm trains `start_random_models` random configurations plus
            # 2 * top_models_to_improve * hill_climbing_steps tuned ones; the extra
            # factor of 5 presumably accounts for the 5 folds of the default validation
            estimated_models_to_check = (
                len(self._algorithms)
                * (
                    self._start_random_models
                    + self._top_models_to_improve * self._hill_climbing_steps * 2
                )
                * 5
            )
            # set the time limit for a single model training;
            # 0.85 is a safety factor so the total time limit is not exceeded
            self._time_limit = self._total_time_limit * 0.85 / estimated_models_to_check

        if len(self._algorithms) == 0:
            self._algorithms = list(
                ModelsRegistry.registry[BINARY_CLASSIFICATION].keys()
            )
        self._fit_time = None
        self._models_train_time = {}
        self._threshold = None
        self._metrics_details = None
        self._max_metrics = None
        self._confusion_matrix = None

    def get_additional_metrics(self):
        # 'target' - the target after processing used for model training
        # 'prediction' - out of folds predictions of model
        oof_predictions = self._best_model.get_out_of_folds()
        self._metrics_details, self._max_metrics, self._confusion_matrix = ComputeAdditionalMetrics.compute(
            oof_predictions["target"], oof_predictions["prediction"], BINARY_CLASSIFICATION
        )
        self._threshold = self._max_metrics["f1"]["threshold"]
        # print(self._metrics_details, self._max_metrics, self._confusion_matrix)

    def _get_model_params(self, model_type, X, y):
        model_info = ModelsRegistry.registry[BINARY_CLASSIFICATION][model_type]
        model_params = RandomParameters.get(model_info["params"])
        required_preprocessing = model_info["required_preprocessing"]
        model_additional = model_info["additional"]
        preprocessing_params = PreprocessingTuner.get(
            required_preprocessing, {"train": {"X": X, "y": y}}, BINARY_CLASSIFICATION
        )
        return {
            "additional": model_additional,
            "preprocessing": preprocessing_params,
            "validation": self._validation,
            "learner": {
                "model_type": model_info["class"].algorithm_short_name,
                **model_params,
            },
        }

    def train_model(self, params, X, y):
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        il = IterativeLearner(params, callbacks=[early_stop, time_constraint])
        il_key = il.get_params_key()
        if il_key in self._models_params_keys:
            return None
        self._models_params_keys += [il_key]
        if self.should_train_next(il.get_name()):
            il.train({"train": {"X": X, "y": y}})
            return il
        return None

    def verbose_print(self, msg):
        if self._verbose:
            print(msg)

    def log_train_time(self, model_type, train_time):
        if model_type in self._models_train_time:
            self._models_train_time[model_type] += [train_time]
        else:
            self._models_train_time[model_type] = [train_time]

    def should_train_next(self, model_type):
        # no time limit, just train, don't ask
        if self._total_time_limit is None:
            return True

        total_time_already_spent = (
            0
            if model_type not in self._models_train_time
            else np.sum(self._models_train_time[model_type])
        )
        mean_time_already_spent = (
            0
            if model_type not in self._models_train_time
            else np.mean(self._models_train_time[model_type])
        )

        if (
            total_time_already_spent + mean_time_already_spent
            < 0.85 * self._total_time_limit / float(len(self._algorithms))
        ):
            return True
        return False

    def not_so_random_step(self, X, y):
        for model_type in self._algorithms:
            for i in range(self._start_random_models):
                params = self._get_model_params(model_type, X, y)
                m = self.train_model(params, X, y)
                if m is not None:
                    self._models += [m]
                    self.verbose_print(
                        "Learner {} final loss {} time {}".format(
                            m.get_name(), m.get_final_loss(), m.get_train_time()
                        )
                    )
                    self.log_train_time(m.get_name(), m.get_train_time())

    def hill_climbing_step(self, X, y):
        for hill_climbing in range(self._hill_climbing_steps):
            # get models ordered by loss
            models = []
            for m in self._models:
                models += [(m.callbacks.callbacks[0].final_loss, m)]
            models = sorted(models, key=lambda x: x[0])
            for i in range(min(self._top_models_to_improve, len(models))):
                m = models[i][1]
                for p in HillClimbing.get(m.params.get("learner")):
                    if p is not None:
                        all_params = copy.deepcopy(m.params)
                        all_params["learner"] = p
                        new_model = self.train_model(all_params, X, y)
                        if new_model is not None:
                            self._models += [new_model]
                            self.verbose_print(
                                "Learner {} final loss {} time {}".format(
                                    new_model.get_name(),
                                    new_model.get_final_loss(),
                                    new_model.get_train_time(),
                                )
                            )
                            self.log_train_time(
                                new_model.get_name(), new_model.get_train_time()
                            )

    def ensemble_step(self, y):
        if self._train_ensemble:
            self.ensemble = Ensemble()
            X_oof = self.ensemble.get_oof_matrix(self._models)
            self.ensemble.fit(X_oof, y)
            self._models += [self.ensemble]
            self.verbose_print(
                "Learner {} final loss {} time {}".format(
                    self.ensemble.get_name(),
                    self.ensemble.get_final_loss(),
                    self.ensemble.get_train_time(),
                )
            )
            self.log_train_time(
                self.ensemble.get_name(), self.ensemble.get_train_time()
            )

    def fit(self, X, y):
        start_time = time.time()
        X.reset_index(drop=True, inplace=True)
        # coerce the target to a Series named "target"
        if not isinstance(y, pd.DataFrame):
            y = pd.DataFrame({"target": np.array(y)})
        y.reset_index(drop=True, inplace=True)
        y = y["target"]  # assumes a DataFrame input carries a "target" column

        # drops rows with missing target
        X, y = PreprocessingExcludeMissingValues.transform(X, y)

        # start with not-so-random models
        self.not_so_random_step(X, y)

        # perform hill climbing steps on best models
        self.hill_climbing_step(X, y)

        # train ensemble
        self.ensemble_step(y)

        min_loss = 10e12
        for m in self._models:
            if m.get_final_loss() < min_loss:
                self._best_model = m
                min_loss = m.get_final_loss()

        self.get_additional_metrics()
        self._fit_time = time.time() - start_time

    def predict(self, X):
        if self._best_model is not None:
            predictions = self._best_model.predict(X)
            # prediction columns are named "p_<label>"; strip the "p_" prefix
            neg_label, pos_label = predictions.columns[0][2:], predictions.columns[1][2:]
            if neg_label == "0" and pos_label == "1":
                neg_label, pos_label = 0, 1
            # assume that it is binary classification
            predictions["label"] = predictions.iloc[:, 1] > self._threshold
            predictions["label"] = predictions["label"].map(
                {True: pos_label, False: neg_label}
            )
            return predictions
        return None

    def to_json(self):
        if self._best_model is None:
            return None

        return {"best_model": self._best_model.to_json(), "threshold": self._threshold}

    def from_json(self, json_data):
        # pretty sure that this can be easily refactored
        if json_data["best_model"]["algorithm_short_name"] == "Ensemble":
            self._best_model = Ensemble()
            self._best_model.from_json(json_data["best_model"])
        else:
            self._best_model = IterativeLearner(json_data["best_model"].get("params"))
            self._best_model.from_json(json_data["best_model"])
        self._threshold = json_data.get("threshold")