Beispiel #1
0
    def test_save_and_load(self):
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        il = IterativeLearner(self.train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(self.data)
        y_predicted = il.predict(self.data["train"]["X"])
        metric = Metric({"name": "logloss"})
        loss_1 = metric(self.data["train"]["y"], y_predicted)

        json_desc = il.to_json()

        il2 = IterativeLearner(self.train_params, callbacks=[])
        self.assertTrue(il.uid != il2.uid)
        il2.from_json(json_desc)
        self.assertTrue(il.uid == il2.uid)
        y_predicted_2 = il2.predict(self.data["train"]["X"])
        loss_2 = metric(self.data["train"]["y"], y_predicted_2)

        assert_almost_equal(loss_1, loss_2)

        uids = [i.uid for i in il.learners]
        uids2 = [i.uid for i in il2.learners]
        for u in uids:
            self.assertTrue(u in uids2)
Beispiel #2
0
    def test_fit_and_predict_kfold(self):
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})

        params = copy.deepcopy(self.train_params)
        params["validation"] = {
            "validation_type": "kfold",
            "k_folds": 5,
            "shuffle": True,
        }
        il = IterativeLearner(params, callbacks=[early_stop, metric_logger])
        il.train(self.data)
        oof = il.get_out_of_folds()

        self.assertEqual(len(np.unique(oof.index)), oof.shape[0])
        self.assertTrue(
            np.array_equal(oof.index, self.data["train"]["X"].index))
        self.assertTrue(oof.shape[0], self.data["train"]["X"].shape[0])

        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue(
            "Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
Beispiel #3
0
    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])

        early_stop = EarlyStopping(
            {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
        )
        time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
        mf = ModelFramework(params, callbacks=[early_stop, time_constraint])

        if self._enough_time_to_train(mf.get_type()):

            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train()  # {"train": {"X": X, "y": y}})

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

        else:
            logger.info(
                f"Cannot check more models of {mf.get_type()} because of time constraint"
            )
Beispiel #4
0
    def run(self):
        # update status
        mlmodel = MLModel.objects.get(pk=self.job_params.get("db_id"))
        mlmodel.status = "started"
        mlmodel.save()
        mlexperiment = MLExperiment.objects.get(
            pk=mlmodel.parent_experiment_id)
        print("mlexperiment", mlexperiment.id)
        print(mlexperiment.parent_columns_usage)

        # prepare data
        columns_usage = mlexperiment.parent_columns_usage.columns_usage
        print("cols", columns_usage)
        training_dataframe = mlexperiment.parent_training_dataframe
        print("training", training_dataframe.absolute_path)
        metric_params = mlexperiment.params.get("metric")
        validation_params = mlexperiment.params.get("validation")
        preprocessing_params = mlexperiment.params.get("preprocessing")

        df_train = DataServe.get(training_dataframe.absolute_path)

        training_data = {
            "train": {
                "X": df_train[columns_usage.get("input")],
                "y": df_train[columns_usage.get("target")],
            }
        }

        # prepare model hyper parameters
        learner_params = {
            "learner_type": mlmodel.model_type,
            "max_iters": 3,
            "max_depth": 1,
        }
        for k, v in mlmodel.params.items():
            learner_params[k] = v

        train_params = {
            "preprocessing": preprocessing_params,
            "validation": validation_params,
            "learner": learner_params,
        }
        print(train_params)
        # prepare needed callbacks
        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        # run the training
        il = IterativeLearner(train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(training_data)
        # save the model
        save_details = il.save()
        logger.info(save_details)
        # store model details in platform database
        mlmodel.status = "done"
        mlmodel.save_details = save_details
        mlmodel.all_params = (
            train_params)  # all parameters will be needed for models loading
        mlmodel.save()
Beispiel #5
0
    def train_model(self, params):

        # do we have enough time to train?
        # if not, skip
        if not self._time_ctrl.enough_time(
            params["learner"]["model_type"], self._fit_level
        ):
            logger.info(f"Cannot train {params['name']} because of the time constraint")
            return False

        # let's create directory to log all training artifacts
        model_path = os.path.join(self._results_path, params["name"])
        self.create_dir(model_path)

        # prepare callbacks
        early_stop = EarlyStopping(
            {"metric": {"name": self._optimize_metric}, "log_to_dir": model_path}
        )

        learner_time_constraint = LearnerTimeConstraint(
            {
                "learner_time_limit": self._time_ctrl.learner_time_limit(
                    params["learner"]["model_type"],
                    self._fit_level,
                    self._validation.get("k_folds", 1.0),
                ),
                "min_steps": params["additional"].get("min_steps"),
            }
        )

        total_time_constraint = TotalTimeConstraint(
            {
                "total_time_limit": self._total_time_limit
                if self._model_time_limit is None
                else None,
                "total_time_start": self._start_time,
            }
        )

        # create model framework
        mf = ModelFramework(
            params,
            callbacks=[early_stop, learner_time_constraint, total_time_constraint],
        )

        # start training
        logger.info(
            f"Train model #{len(self._models)+1} / Model name: {params['name']}"
        )
        mf.train(model_path)

        # save the model
        mf.save(model_path)

        # and keep info about the model
        self.keep_model(mf, model_path)
        return True
Beispiel #6
0
    def train_model(self, params):

        model_path = os.path.join(self._results_path, params["name"])
        early_stop = EarlyStopping({
            "metric": {
                "name": self._optimize_metric
            },
            "log_to_dir": model_path
        })

        learner_time_constraint = LearnerTimeConstraint({
            "learner_time_limit":
            self._get_learner_time_limit(
                params["learner"]["model_type"]),  # self._time_limit,
            "min_steps":
            params["additional"].get("min_steps"),
        })

        total_time_constraint = TotalTimeConstraint({
            "total_time_limit":
            self._total_time_limit if self._model_time_limit is None else None,
            "total_time_start":
            self._start_time,
        })

        mf = ModelFramework(
            params,
            callbacks=[
                early_stop, learner_time_constraint, total_time_constraint
            ],
        )

        if self._enough_time_to_train(mf.get_type()):

            # self.verbose_print(params["name"] + " training start ...")
            logger.info(
                f"Train model #{len(self._models)+1} / Model name: {params['name']}"
            )

            try:
                os.mkdir(model_path)
            except Exception as e:
                raise AutoMLException(f"Cannot create directory {model_path}")

            mf.train(model_path)

            mf.save(model_path)
            self._model_paths += [model_path]

            self.keep_model(mf)

            # save the best one in the case the training will be interrupted
            self.select_and_save_best()
        else:
            logger.info(
                f"Cannot train {mf.get_type()} because of time constraint")
Beispiel #7
0
 def train_model(self, params, X, y):
     early_stop = EarlyStopping({"metric": {"name": "logloss"}})
     time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
     il = IterativeLearner(params, callbacks=[early_stop, time_constraint])
     il_key = il.get_params_key()
     if il_key in self._models_params_keys:
         return None
     self._models_params_keys += [il_key]
     if self.should_train_next(il.get_name()):
         il.train({"train": {"X": X, "y": y}})
         return il
     return None
    def test_fit_and_predict(self):

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params,
                              callbacks=[early_stop, metric_logger])
        il.train(self.data)

        y_predicted = il.predict(self.X)
        metric = Metric({"name": "logloss"})
        loss = metric(self.y, y_predicted)
        self.assertTrue(loss < 0.4)
    def test_fit_and_predict_split(self):
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        early_stop = EarlyStopping({"metric": {"name": "logloss"}})
        metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
        il = IterativeLearner(self.train_params, callbacks=[early_stop, metric_logger])
        il.train(self.data)

        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        y_predicted = il.predict(self.data["train"]["X"])
        self.assertTrue("Private" in list(self.data["train"]["X"]["workclass"]))

        metric = Metric({"name": "logloss"})
        loss = metric(self.data["train"]["y"], y_predicted)
        self.assertTrue(loss < 0.6)
Beispiel #10
0
 def train_model(self, params, X, y):
     metric_logger = MetricLogger({"metric_names": ["logloss", "auc"]})
     early_stop = EarlyStopping({"metric": {"name": self._optimize_metric}})
     time_constraint = TimeConstraint({"train_seconds_time_limit": self._time_limit})
     il = IterativeLearner(
         params, callbacks=[early_stop, time_constraint, metric_logger]
     )
     il_key = il.get_params_key()
     if il_key in self._models_params_keys:
         self._progress_bar.update(1)
         return None
     self._models_params_keys += [il_key]
     if self.should_train_next(il.get_name()):
         il.train({"train": {"X": X, "y": y}})
         self._progress_bar.update(1)
         return il
     self._progress_bar.update(1)
     return None