Example #1
class ModelFramework:
    def __init__(self, params, callbacks=[]):
        logger.debug("ModelFramework.__init__")
        self.uid = str(uuid.uuid4())

        for i in ["learner", "validation"]:  # mandatory parameters
            if i not in params:
                msg = "Missing {0} parameter in ModelFramework params".format(
                    i)
                logger.error(msg)
                raise ValueError(msg)

        self.params = params
        self.callbacks = CallbackList(callbacks)

        self._name = params.get("name", "model")
        self.additional_params = params.get("additional")
        self.preprocessing_params = params.get("preprocessing")
        self.validation_params = params.get("validation")
        self.learner_params = params.get("learner")

        self._ml_task = params.get("ml_task")

        self.validation = None
        self.preprocessings = []
        self.learners = []

        self.train_time = None
        self._additional_metrics = None
        self._threshold = None  # used only for binary classifiers

    def get_train_time(self):
        return self.train_time

    def predictions(self, learner, preproces, X_train, y_train, X_validation,
                    y_validation):

        y_train_true = y_train
        y_train_predicted = learner.predict(X_train)
        y_validation_true = y_validation
        y_validation_predicted = learner.predict(X_validation)

        y_train_true = preproces.inverse_scale_target(y_train_true)
        y_train_predicted = preproces.inverse_scale_target(y_train_predicted)
        y_validation_true = preproces.inverse_scale_target(y_validation_true)
        y_validation_predicted = preproces.inverse_scale_target(
            y_validation_predicted)

        y_validation_columns = []
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            # y_train_true = preproces.inverse_categorical_target(y_train_true)
            # y_validation_true = preproces.inverse_categorical_target(y_validation_true)
            # get columns, omit the last one (it is label)
            y_validation_columns = preproces.prepare_target_labels(
                y_validation_predicted).columns.tolist()[:-1]

        return {
            "y_train_true": y_train_true,
            "y_train_predicted": y_train_predicted,
            "y_validation_true": y_validation_true,
            "y_validation_predicted": y_validation_predicted,
            "validation_index": X_validation.index,
            "validation_columns": y_validation_columns,
        }

    def train(self):  # , data):
        logger.debug(
            f"ModelFramework.train {self.learner_params.get('model_type')}")

        start_time = time.time()
        np.random.seed(self.learner_params["seed"])

        self.validation = ValidationStep(self.validation_params)

        for k_fold in range(self.validation.get_n_splits()):
            train_data, validation_data = self.validation.get_split(k_fold)
            logger.debug(
                "Data split, train X:{} y:{}, validation X:{}, y:{}".format(
                    train_data["X"].shape,
                    train_data["y"].shape,
                    validation_data["X"].shape,
                    validation_data["y"].shape,
                ))
            # the preprocessing is done at every validation step
            self.preprocessings += [Preprocessing(self.preprocessing_params)]

            X_train, y_train = self.preprocessings[-1].fit_and_transform(
                train_data["X"], train_data["y"])
            X_validation, y_validation = self.preprocessings[-1].transform(
                validation_data["X"], validation_data["y"])

            self.learners += [
                AlgorithmFactory.get_algorithm(self.learner_params)
            ]
            learner = self.learners[-1]

            self.callbacks.add_and_set_learner(learner)
            self.callbacks.on_learner_train_start()

            for i in range(learner.max_iters):
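                # fit in steps, up to max_iters; the loop breaks as soon as
                # learner.stop_training is set (e.g., by an early-stopping callback)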
                self.callbacks.on_iteration_start()

                learner.fit(X_train, y_train)

                self.callbacks.on_iteration_end(
                    {"iter_cnt": i},
                    self.predictions(
                        learner,
                        self.preprocessings[-1],
                        X_train,
                        y_train,
                        X_validation,
                        y_validation,
                    ),
                )
                if learner.stop_training:
                    break
                learner.update({"step": i})
            # end of learner iters loop
            self.callbacks.on_learner_train_end()

        # end of validation loop
        self.callbacks.on_framework_train_end()
        self.train_time = time.time() - start_time
        self.get_additional_metrics()
        logger.debug("ModelFramework end of training")

    def get_metric_name(self):
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        return early_stopping.metric.name

    def get_out_of_folds(self):
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        return early_stopping.best_y_oof

    def get_final_loss(self):
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        return early_stopping.final_loss

    def get_metric_logs(self):
        metric_logger = self.callbacks.get("metric_logger")
        if metric_logger is None:
            return None
        return metric_logger.loss_values

    def get_type(self):
        return self.learner_params.get("model_type")

    def get_name(self):
        return self._name

    def predict(self, X):
        logger.debug("ModelFramework.predict")

        if self.learners is None or len(self.learners) == 0:
            raise Exception("Learnes are not initialized")
        # run predict on all learners and return the average
        y_predicted = None  # np.zeros((X.shape[0],))
        for ind, learner in enumerate(self.learners):
            # preprocessing goes here
            X_data, _ = self.preprocessings[ind].transform(X, None)
            y_p = learner.predict(X_data)

            y_p = self.preprocessings[ind].inverse_scale_target(y_p)

            y_predicted = y_p if y_predicted is None else y_predicted + y_p

        y_predicted_average = y_predicted / float(len(self.learners))

        y_predicted_final = self.preprocessings[0].prepare_target_labels(
            y_predicted_average)

        return y_predicted_final

    def get_additional_metrics(self):
        if self._additional_metrics is None:
            # 'target' - the target after processing used for model training
            # 'prediction' - out of folds predictions of the model
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [
                c for c in oof_predictions.columns if "prediction" in c
            ]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            target = oof_predictions[target_cols]

            oof_preds = None
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                oof_preds = self.preprocessings[0].prepare_target_labels(
                    oof_predictions[prediction_cols].values)

            else:
                oof_preds = oof_predictions[prediction_cols]

            self._additional_metrics = AdditionalMetrics.compute(
                target, oof_preds, self._ml_task)
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])
        return self._additional_metrics

    def save(self, model_path):
        logger.info(f"Save the model {model_path}")

        saved = []
        for i, l in enumerate(self.learners):
            p = os.path.join(model_path,
                             f"learner_{i+1}.{l.file_extenstion()}")
            l.save(p)
            saved += [p]

        with open(os.path.join(model_path, "framework.json"), "w") as fout:
            preprocessing = [p.to_json() for p in self.preprocessings]
            learners_params = [
                learner.get_params() for learner in self.learners
            ]
            desc = {
                "uid": self.uid,
                "name": self._name,
                "preprocessing": preprocessing,
                "learners": learners_params,
                "params": self.params,
                "saved": saved,
            }
            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        type_of_predictions = ("validation"
                               if "k_folds" not in self.validation_params else
                               "out_of_folds")
        predictions = self.get_out_of_folds()
        predictions.to_csv(
            os.path.join(model_path, f"predictions_{type_of_predictions}.csv"),
            index=False,
        )

        self._additional_metrics = self.get_additional_metrics()

        with open(os.path.join(model_path, "metrics.txt"), "w") as fout:

            if self._ml_task == BINARY_CLASSIFICATION:
                max_metrics = self._additional_metrics["max_metrics"]
                confusion_matrix = self._additional_metrics["confusion_matrix"]
                threshold = self._additional_metrics["threshold"]

                fout.write("Metric details:\n{}\n\n".format(
                    max_metrics.transpose()))
                fout.write("Confusion matrix (at threshold={}):\n{}".format(
                    np.round(threshold, 6), confusion_matrix))
            elif self._ml_task == MULTICLASS_CLASSIFICATION:
                max_metrics = self._additional_metrics["max_metrics"]
                confusion_matrix = self._additional_metrics["confusion_matrix"]

                fout.write("Metric details:\n{}\n\n".format(
                    max_metrics.transpose()))
                fout.write("Confusion matrix:\n{}".format(confusion_matrix))

        with open(os.path.join(model_path, "README.md"), "w") as fout:

            fout.write(f"# Summary of {self.get_name()}\n")

            fout.write(f"\n ## {self.get_type()}\n")

            for k, v in self.learner_params.items():
                if k in ["model_type", "ml_task", "seed"]:
                    continue
                fout.write(f"- **{k}**: {v}\n")

            fout.write("\n# Validation\n")
            #fout.write(f" - validation type: {self.validation.validation_type}\n")
            for k, v in self.validation_params.items():
                if "path" not in k:
                    fout.write(f" - **{k}**: {v}\n")

            if self._ml_task == BINARY_CLASSIFICATION:
                max_metrics = self._additional_metrics["max_metrics"]
                confusion_matrix = self._additional_metrics["confusion_matrix"]
                threshold = self._additional_metrics["threshold"]

                mm = max_metrics.transpose()
                fout.write("\n## Metric details\n{}\n\n".format(
                    mm.to_markdown()))
                fout.write(
                    "\n## Confusion matrix (at threshold={})\n{}".format(
                        np.round(threshold, 6),
                        confusion_matrix.to_markdown()))
            elif self._ml_task == MULTICLASS_CLASSIFICATION:
                max_metrics = self._additional_metrics["max_metrics"]
                confusion_matrix = self._additional_metrics["confusion_matrix"]

                mm = max_metrics.transpose()
                fout.write("\n### Metric details\n{}\n\n".format(
                    mm.to_markdown()))
                fout.write("\n## Confusion matrix\n{}".format(
                    confusion_matrix.to_markdown()))

            plt.figure(figsize=(10, 7))
            for l in range(self.validation.get_n_splits()):
                df = pd.read_csv(
                    os.path.join(model_path, f"./learner_{l+1}_training.log"),
                    names=["iteration", "train", "test", "no_improvement"])
                plt.plot(df.iteration,
                         df.train,
                         "--",
                         color=MY_COLORS[l],
                         label=f"Fold {l}, train")
                plt.plot(df.iteration,
                         df.test,
                         color=MY_COLORS[l],
                         label=f"Fold {l}, test")
            plt.xlabel("#Iteration")
            plt.ylabel(self.get_metric_name())
            plt.legend(loc="best")
            plot_path = os.path.join(model_path, "learning_curves.png")
            plt.savefig(plot_path)

            fout.write("\n\n## Learning curves\n")
            fout.write(f"![Learning curves](learning_curves.png)")

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")

    @staticmethod
    def load(model_path):
        logger.info(f"Loading model framework from {model_path}")

        json_desc = json.load(open(os.path.join(model_path, "framework.json")))
        mf = ModelFramework(json_desc["params"])
        mf.uid = json_desc.get("uid", mf.uid)
        mf._name = json_desc.get("name", mf._name)
        mf._threshold = json_desc.get("threshold")
        mf.learners = []
        for learner_desc, learner_path in zip(json_desc.get("learners"),
                                              json_desc.get("saved")):

            l = AlgorithmFactory.load(learner_desc, learner_path)
            mf.learners += [l]

        mf.preprocessings = []
        for p in json_desc.get("preprocessing"):
            ps = Preprocessing()
            ps.from_json(p)
            mf.preprocessings += [ps]

        return mf
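
A minimal usage sketch for the version above, with hypothetical parameter values. Only the mandatory keys ("learner", "validation"), the constants already used in the class, and the train()/predict()/save()/load() signatures come from the code itself; the model type, the validation settings, and the callback contents are illustrative assumptions. Note that get_out_of_folds() and get_additional_metrics() expect a callback registered under "early_stopping", so a real callbacks list would contain one.

# Hypothetical usage sketch; parameter values below are placeholders, not taken from the source.
params = {
    "name": "model_1",
    "ml_task": BINARY_CLASSIFICATION,
    "learner": {"model_type": "Xgboost", "seed": 42},           # "seed" is read in train()
    "validation": {"validation_type": "kfold", "k_folds": 5},   # consumed by ValidationStep; only
                                                                 # "k_folds" is read in this class
    "preprocessing": {},
    "additional": {},
}
mf = ModelFramework(params, callbacks=[])  # in practice the list includes an early-stopping callback
mf.train()                           # fits one learner per validation split
y_hat = mf.predict(X)                # X: pandas DataFrame; fold predictions are averaged
mf.save("models/model_1")            # writes framework.json, predictions CSV, metrics.txt, README.md
mf_restored = ModelFramework.load("models/model_1")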

Example #2
class ModelFramework:
    def __init__(self, params, callbacks=[]):
        logger.debug("ModelFramework.__init__")
        self.uid = str(uuid.uuid4())

        for i in ["learner", "validation_strategy"]:  # mandatory parameters
            if i not in params:
                msg = "Missing {0} parameter in ModelFramework params".format(
                    i)
                logger.error(msg)
                raise ValueError(msg)

        self.params = params
        self.callbacks = CallbackList(callbacks)

        self._name = params.get("name", "model")
        self.additional_params = params.get("additional")
        self.preprocessing_params = params.get("preprocessing")
        self.validation_params = params.get("validation_strategy")
        self.learner_params = params.get("learner")

        self._ml_task = params.get("ml_task")
        self._explain_level = params.get("explain_level")
        self._is_stacked = params.get("is_stacked", False)

        self.validation = None
        self.preprocessings = []
        self.learners = []

        self.train_time = None
        self.final_loss = None
        self.metric_name = None
        self.oof_predictions = None
        self._additional_metrics = None
        self._threshold = None  # used only for binary classifiers
        self._max_time_for_learner = params.get("max_time_for_learner", 3600)
        self._oof_predictions_fname = None
        self._single_prediction_time = None  # prediction time on single sample
        self._optuna_time_budget = params.get("optuna_time_budget")
        self._optuna_init_params = params.get("optuna_init_params", {})
        self._optuna_verbose = params.get("optuna_verbose", True)

        # the automl random state from AutoML constructor, used in Optuna optimizer
        self._automl_random_state = params.get("automl_random_state", 42)

    def get_train_time(self):
        return self.train_time

    def predictions(
        self,
        learner,
        preproces,
        X_train,
        y_train,
        sample_weight,
        X_validation,
        y_validation,
        sample_weight_validation,
    ):
        y_train_true = y_train
        y_train_predicted = learner.predict(X_train)
        y_validation_true = y_validation
        y_validation_predicted = learner.predict(X_validation)

        y_train_true = preproces.inverse_scale_target(y_train_true)
        y_train_predicted = preproces.inverse_scale_target(y_train_predicted)
        y_validation_true = preproces.inverse_scale_target(y_validation_true)
        y_validation_predicted = preproces.inverse_scale_target(
            y_validation_predicted)

        y_validation_columns = []
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            # y_train_true = preproces.inverse_categorical_target(y_train_true)
            # y_validation_true = preproces.inverse_categorical_target(y_validation_true)
            # get columns, omit the last one (it is label)
            y_validation_columns = preproces.prepare_target_labels(
                y_validation_predicted).columns.tolist()[:-1]
        elif self._ml_task == BINARY_CLASSIFICATION:
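            # name the prediction column; embed the original class labels
            # unless they are literally "0" and "1"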
            class_names = self.preprocessings[-1].get_target_class_names()
            y_validation_columns = "prediction"
            if not ("0" in class_names and "1" in class_names):
                y_validation_columns = (
                    f"prediction_0_for_{class_names[0]}_1_for_{class_names[1]}"
                )
        else:
            y_validation_columns = "prediction"

        return {
            "y_train_true": y_train_true,
            "y_train_predicted": y_train_predicted,
            "sample_weight": sample_weight,
            "y_validation_true": y_validation_true,
            "y_validation_predicted": y_validation_predicted,
            "sample_weight_validation": sample_weight_validation,
            "validation_index": X_validation.index,
            "validation_columns": y_validation_columns,
        }

    def train(self, results_path, model_subpath):
        logger.debug(
            f"ModelFramework.train {self.learner_params.get('model_type')}")

        start_time = time.time()
        np.random.seed(self.learner_params["seed"])

        optuna_tuner = None
        if self._optuna_time_budget is not None and OptunaTuner.is_optimizable(
                self.learner_params.get("model_type", "")):
            optuna_tuner = OptunaTuner(
                results_path,
                ml_task=self._ml_task,
                eval_metric=self.get_metric(),
                time_budget=self._optuna_time_budget,
                init_params=self._optuna_init_params,
                verbose=self._optuna_verbose,
                n_jobs=self.learner_params.get("n_jobs", -1),
                random_state=self._automl_random_state,
            )

        self.validation = ValidationStep(self.validation_params)

        repeats = self.validation.get_repeats()
        for repeat in range(repeats):
            for k_fold in range(self.validation.get_n_splits()):
                train_data, validation_data = self.validation.get_split(
                    k_fold, repeat)
                logger.debug(
                    "Data split, train X:{} y:{}, validation X:{}, y:{}".
                    format(
                        train_data["X"].shape,
                        train_data["y"].shape,
                        validation_data["X"].shape,
                        validation_data["y"].shape,
                    ))
                if "sample_weight" in train_data:
                    logger.debug(
                        "Sample weight available during the training.")

                # the preprocessing is done at every validation step
                self.preprocessings += [
                    Preprocessing(self.preprocessing_params, self.get_name(),
                                  k_fold, repeat)
                ]

                X_train, y_train, sample_weight = self.preprocessings[
                    -1].fit_and_transform(train_data["X"], train_data["y"],
                                          train_data.get("sample_weight"))
                (
                    X_validation,
                    y_validation,
                    sample_weight_validation,
                ) = self.preprocessings[-1].transform(
                    validation_data["X"],
                    validation_data["y"],
                    validation_data.get("sample_weight"),
                )

                if optuna_tuner is not None:
                    optuna_start_time = time.time()
                    self.learner_params = optuna_tuner.optimize(
                        self.learner_params.get("model_type", ""),
                        self.params.get("data_type", ""),
                        X_train,
                        y_train,
                        sample_weight,
                        X_validation,
                        y_validation,
                        sample_weight_validation,
                        self.learner_params,
                    )
                    # exclude optuna optimize time from model training
                    start_time += time.time() - optuna_start_time

                self.learner_params["explain_level"] = self._explain_level
                self.learners += [
                    AlgorithmFactory.get_algorithm(
                        copy.deepcopy(self.learner_params))
                ]
                learner = self.learners[-1]
                learner.set_learner_name(k_fold, repeat, repeats)

                self.callbacks.add_and_set_learner(learner)
                self.callbacks.on_learner_train_start()

                log_to_file = os.path.join(results_path, model_subpath,
                                           f"{learner.name}_training.log")

                for i in range(learner.max_iters):

                    self.callbacks.on_iteration_start()

                    learner.fit(
                        X_train,
                        y_train,
                        sample_weight,
                        X_validation,
                        y_validation,
                        sample_weight_validation,
                        log_to_file,
                        self._max_time_for_learner,
                    )

                    if self.params.get("injected_sample_weight", False):
                        # print("Dont use sample weight in model evaluation")
                        sample_weight = None
                        sample_weight_validation = None

                    self.callbacks.on_iteration_end(
                        {"iter_cnt": i},
                        self.predictions(
                            learner,
                            self.preprocessings[-1],
                            X_train,
                            y_train,
                            sample_weight,
                            X_validation,
                            y_validation,
                            sample_weight_validation,
                        ),
                    )

                    if learner.stop_training:
                        break
                    learner.update({"step": i})

                # end of learner iters loop
                self.callbacks.on_learner_train_end()

                model_path = os.path.join(results_path, model_subpath)
                learner.interpret(
                    X_train,
                    y_train,
                    X_validation,
                    y_validation,
                    model_file_path=model_path,
                    learner_name=learner.name,
                    class_names=self.preprocessings[-1].get_target_class_names(
                    ),
                    metric_name=self.get_metric_name(),
                    ml_task=self._ml_task,
                    explain_level=self._explain_level,
                )

                # save learner and free the memory
                p = os.path.join(model_path, learner.get_fname())
                learner.save(p)
                del learner.model
                learner.model = None
                # end of learner training

        # end of validation loop
        self.callbacks.on_framework_train_end()
        # self.get_additional_metrics()
        self._additional_metrics = self.get_additional_metrics()

        self.train_time = time.time() - start_time
        logger.debug("ModelFramework end of training")

    def release_learners(self):
        for learner in self.learners:
            if learner.model is not None:
                del learner.model
                learner.model = None

    def get_metric_name(self):
        if self.metric_name is not None:
            return self.metric_name
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        self.metric_name = early_stopping.metric.name
        return early_stopping.metric.name

    def get_metric(self):
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping:
            return early_stopping.metric
        return Metric({"name": self.get_metric_name()})

    def get_out_of_folds(self):
        if self.oof_predictions is not None:
            return self.oof_predictions.copy(deep=True)

        if self._oof_predictions_fname is not None:
            self.oof_predictions = pd.read_csv(self._oof_predictions_fname)
            return self.oof_predictions.copy(deep=True)

        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        self.oof_predictions = early_stopping.best_y_oof

        ###############################################################
        # in case of one-hot coded multiclass target
        target_cols = [
            c for c in self.oof_predictions.columns.tolist() if "target" in c
        ]
        if len(target_cols) > 1:
            target = self.oof_predictions[target_cols[0]].copy()
            target.name = "target"
            for i, t in enumerate(target_cols):
                target[self.oof_predictions[t] == 1] = i
            self.oof_predictions.drop(target_cols, axis=1, inplace=True)

            self.oof_predictions.insert(0, "target", np.array(target))

        return early_stopping.best_y_oof

    def get_final_loss(self):
        if self.final_loss is not None:
            return self.final_loss
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        self.final_loss = early_stopping.final_loss
        return early_stopping.final_loss

    """
    def get_metric_logs(self):
        metric_logger = self.callbacks.get("metric_logger")
        if metric_logger is None:
            return None
        return metric_logger.loss_values
    """

    def get_type(self):
        return self.learner_params.get("model_type")

    def get_name(self):
        return self._name

    def is_valid(self):
        """is_valid is used in Ensemble to check if it has more than 1 model in it.
        If Ensemble has only 1 model in it, then Ensemble shouldn't be used as best model"""
        return True

    def is_fast_enough(self, max_single_prediction_time):
        # don't need to check
        if max_single_prediction_time is None:
            return True

        # no information about prediction time
        if self._single_prediction_time is None:
            return True

        return self._single_prediction_time < max_single_prediction_time

    def predict(self, X):
        logger.debug("ModelFramework.predict")

        if self.learners is None or len(self.learners) == 0:
            raise Exception("Learnes are not initialized")
        # run predict on all learners and return the average
        y_predicted = None  # np.zeros((X.shape[0],))
        for ind, learner in enumerate(self.learners):
            # preprocessing goes here
            X_data, _, _ = self.preprocessings[ind].transform(X.copy(), None)
            y_p = learner.predict(X_data)
            y_p = self.preprocessings[ind].inverse_scale_target(y_p)

            y_predicted = y_p if y_predicted is None else y_predicted + y_p

        y_predicted_average = y_predicted / float(len(self.learners))

        y_predicted_final = self.preprocessings[0].prepare_target_labels(
            y_predicted_average)

        return y_predicted_final

    def get_additional_metrics(self):

        if self._additional_metrics is None:
            # 'target' - the target after processing used for model training
            # 'prediction' - out of folds predictions of the model
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [
                c for c in oof_predictions.columns if "prediction" in c
            ]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            target = oof_predictions[target_cols]

            oof_preds = None
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                oof_preds = self.preprocessings[0].prepare_target_labels(
                    oof_predictions[prediction_cols].values)
            else:
                oof_preds = oof_predictions[prediction_cols]

            sample_weight = None
            if "sample_weight" in oof_predictions.columns:
                sample_weight = oof_predictions["sample_weight"]

            self._additional_metrics = AdditionalMetrics.compute(
                target, oof_preds, sample_weight, self._ml_task)
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])
        return self._additional_metrics

    def save(self, results_path, model_subpath):
        start_time = time.time()
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Save the model {model_path}")

        type_of_predictions = ("validation"
                               if "k_folds" not in self.validation_params else
                               "out_of_folds")
        predictions_fname = os.path.join(
            model_subpath, f"predictions_{type_of_predictions}.csv")
        self._oof_predictions_fname = os.path.join(results_path,
                                                   predictions_fname)
        predictions = self.get_out_of_folds()
        predictions.to_csv(self._oof_predictions_fname, index=False)

        saved = [
            os.path.join(model_subpath, l.get_fname()) for l in self.learners
        ]

        with open(os.path.join(model_path, "framework.json"), "w") as fout:
            preprocessing = [p.to_json() for p in self.preprocessings]
            learners_params = [
                learner.get_params() for learner in self.learners
            ]
            desc = {
                "uid": self.uid,
                "name": self._name,
                "preprocessing": preprocessing,
                "learners": learners_params,
                "params": self.params,
                "saved": saved,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }
            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        learning_curve_metric = self.learners[0].get_metric_name()
        if learning_curve_metric is None:
            learning_curve_metric = self.get_metric_name()

        LearningCurves.plot(
            [l.name for l in self.learners],
            learning_curve_metric,
            model_path,
            trees_in_iteration=self.additional_params.get("trees_in_step"),
        )

        # call additional metrics just to be sure they are computed
        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(self._additional_metrics, self._ml_task,
                               self.model_markdown(), model_path)

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
        # I'm adding save time to total train time
        # there is always save after the training
        self.train_time += time.time() - start_time

    def model_markdown(self):
        long_name = AlgorithmsRegistry.get_long_name(
            self._ml_task, self.learner_params["model_type"])
        short_name = self.learner_params["model_type"]
        desc = f"# Summary of {self.get_name()}\n\n"

        desc += "[<< Go back](../README.md)\n\n"

        if long_name == short_name:
            desc += f"\n## {short_name}\n"
        else:
            desc += f"\n## {long_name} ({short_name})\n"
        for k, v in self.learner_params.items():
            if k in ["model_type", "ml_task", "seed"]:
                continue
            desc += f"- **{k}**: {v}\n"
        desc += "\n## Validation\n"
        for k, v in self.validation_params.items():
            if "path" not in k:
                desc += f" - **{k}**: {v}\n"
        desc += "\n## Optimized metric\n"
        desc += f"{self.get_metric_name()}\n"
        desc += "\n## Training time\n"
        desc += f"\n{np.round(self.train_time,1)} seconds\n"
        return desc

    @staticmethod
    def load(results_path, model_subpath, lazy_load=True):
        model_path = os.path.join(results_path, model_subpath)
        logger.info(f"Loading model framework from {model_path}")

        json_desc = json.load(open(os.path.join(model_path, "framework.json")))
        mf = ModelFramework(json_desc["params"])
        mf.uid = json_desc.get("uid", mf.uid)
        mf._name = json_desc.get("name", mf._name)
        mf._threshold = json_desc.get("threshold")
        mf.train_time = json_desc.get("train_time", mf.train_time)
        mf.final_loss = json_desc.get("final_loss", mf.final_loss)
        mf.metric_name = json_desc.get("metric_name", mf.metric_name)
        mf._is_stacked = json_desc.get("is_stacked", mf._is_stacked)
        predictions_fname = json_desc.get("predictions_fname")
        if predictions_fname is not None:
            mf._oof_predictions_fname = os.path.join(results_path,
                                                     predictions_fname)

        mf.learners = []
        for learner_desc, learner_subpath in zip(json_desc.get("learners"),
                                                 json_desc.get("saved")):
            learner_path = os.path.join(results_path, learner_subpath)
            l = AlgorithmFactory.load(learner_desc, learner_path, lazy_load)
            mf.learners += [l]

        mf.preprocessings = []
        for p in json_desc.get("preprocessing"):
            ps = Preprocessing()
            ps.from_json(p, results_path)
            mf.preprocessings += [ps]

        return mf
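
A corresponding usage sketch for this later version, again with placeholder values; only the parameter keys and the method signatures come from the code above, the concrete values and directory layout are assumptions. Compared to the first example, the mandatory key is now "validation_strategy", train()/save()/load() take a results_path plus model_subpath pair, sample weights are threaded through fitting and evaluation, and Optuna tuning is enabled by setting "optuna_time_budget" (for model types that OptunaTuner reports as optimizable).

# Hypothetical usage sketch; values are placeholders.
params = {
    "name": "1_LightGBM",
    "ml_task": MULTICLASS_CLASSIFICATION,
    "learner": {"model_type": "LightGBM", "seed": 42, "n_jobs": -1},
    "validation_strategy": {"validation_type": "kfold", "k_folds": 5},
    "preprocessing": {},
    "additional": {},
    "explain_level": 2,
    "max_time_for_learner": 3600,       # per-learner fit time limit in seconds (default 3600)
    "optuna_time_budget": None,         # set to a number of seconds to enable OptunaTuner
    "automl_random_state": 42,
}
# the directory results_path/model_subpath is assumed to exist before training
mf = ModelFramework(params, callbacks=[])   # in practice the list includes an early-stopping callback
mf.train(results_path="AutoML_1", model_subpath="1_LightGBM")
mf.save("AutoML_1", "1_LightGBM")           # framework.json, out-of-folds CSV, learning curves, markdown
mf = ModelFramework.load("AutoML_1", "1_LightGBM", lazy_load=True)
y_hat = mf.predict(X)                       # X: pandas DataFrame with the training columns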

Example #3
class ModelFramework:
    def __init__(self, params, callbacks=[]):
        logger.debug("ModelFramework.__init__")
        self.uid = str(uuid.uuid4())

        for i in ["learner", "validation"]:  # mandatory parameters
            if i not in params:
                msg = "Missing {0} parameter in ModelFramework params".format(
                    i)
                logger.error(msg)
                raise ValueError(msg)

        self.params = params
        self.callbacks = CallbackList(callbacks)

        self._name = params.get("name", "model")
        self.additional_params = params.get("additional")
        self.preprocessing_params = params.get("preprocessing")
        self.validation_params = params.get("validation")
        self.learner_params = params.get("learner")

        self._ml_task = params.get("ml_task")
        self._explain_level = params.get("explain_level")
        self._is_stacked = params.get("is_stacked", False)

        self.validation = None
        self.preprocessings = []
        self.learners = []

        self.train_time = None
        self.final_loss = None
        self.metric_name = None
        self.oof_predictions = None
        self._additional_metrics = None
        self._threshold = None  # used only for binary classifiers

    def get_train_time(self):
        return self.train_time

    def predictions(self, learner, preproces, X_train, y_train, X_validation,
                    y_validation):
        y_train_true = y_train
        y_train_predicted = learner.predict(X_train)
        y_validation_true = y_validation
        y_validation_predicted = learner.predict(X_validation)

        y_train_true = preproces.inverse_scale_target(y_train_true)
        y_train_predicted = preproces.inverse_scale_target(y_train_predicted)
        y_validation_true = preproces.inverse_scale_target(y_validation_true)
        y_validation_predicted = preproces.inverse_scale_target(
            y_validation_predicted)

        y_validation_columns = []
        if self._ml_task == MULTICLASS_CLASSIFICATION:
            # y_train_true = preproces.inverse_categorical_target(y_train_true)
            # y_validation_true = preproces.inverse_categorical_target(y_validation_true)
            # get columns, omit the last one (it is label)
            y_validation_columns = preproces.prepare_target_labels(
                y_validation_predicted).columns.tolist()[:-1]

        return {
            "y_train_true": y_train_true,
            "y_train_predicted": y_train_predicted,
            "y_validation_true": y_validation_true,
            "y_validation_predicted": y_validation_predicted,
            "validation_index": X_validation.index,
            "validation_columns": y_validation_columns,
        }

    def train(self, model_path):
        logger.debug(
            f"ModelFramework.train {self.learner_params.get('model_type')}")

        start_time = time.time()
        np.random.seed(self.learner_params["seed"])

        self.validation = ValidationStep(self.validation_params)

        for k_fold in range(self.validation.get_n_splits()):
            train_data, validation_data = self.validation.get_split(k_fold)
            logger.debug(
                "Data split, train X:{} y:{}, validation X:{}, y:{}".format(
                    train_data["X"].shape,
                    train_data["y"].shape,
                    validation_data["X"].shape,
                    validation_data["y"].shape,
                ))

            # the preprocessing is done at every validation step
            self.preprocessings += [Preprocessing(self.preprocessing_params)]

            X_train, y_train = self.preprocessings[-1].fit_and_transform(
                train_data["X"], train_data["y"])
            X_validation, y_validation = self.preprocessings[-1].transform(
                validation_data["X"], validation_data["y"])

            self.learner_params["explain_level"] = self._explain_level
            self.learners += [
                AlgorithmFactory.get_algorithm(
                    copy.deepcopy(self.learner_params))
            ]
            learner = self.learners[-1]

            self.callbacks.add_and_set_learner(learner)
            self.callbacks.on_learner_train_start()

            log_to_file = os.path.join(model_path,
                                       f"learner_{k_fold+1}_training.log")

            for i in range(learner.max_iters):

                self.callbacks.on_iteration_start()

                learner.fit(X_train, y_train, X_validation, y_validation,
                            log_to_file)

                self.callbacks.on_iteration_end(
                    {"iter_cnt": i},
                    self.predictions(
                        learner,
                        self.preprocessings[-1],
                        X_train,
                        y_train,
                        X_validation,
                        y_validation,
                    ),
                )

                if learner.stop_training:
                    break
                learner.update({"step": i})

            # end of learner iters loop
            self.callbacks.on_learner_train_end()

            learner.interpret(
                X_train,
                y_train,
                X_validation,
                y_validation,
                model_file_path=model_path,
                learner_name=f"learner_{k_fold+1}",
                class_names=self.preprocessings[-1].get_target_class_names(),
                metric_name=self.get_metric_name(),
                ml_task=self._ml_task,
                explain_level=self._explain_level,
            )

            # save learner and free the memory
            p = os.path.join(model_path,
                             f"learner_{k_fold+1}.{learner.file_extension()}")
            learner.save(p)
            del learner.model
            learner.model = None
            # end of learner training

        # end of validation loop
        self.callbacks.on_framework_train_end()
        self.get_additional_metrics()
        self.train_time = time.time() - start_time
        logger.debug("ModelFramework end of training")

    def get_metric_name(self):
        if self.metric_name is not None:
            return self.metric_name
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        self.metric_name = early_stopping.metric.name
        return early_stopping.metric.name

    def get_out_of_folds(self):
        if self.oof_predictions is not None:
            return self.oof_predictions
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        self.oof_predictions = early_stopping.best_y_oof

        ###############################################################
        # in case of Neural Network and one-hot coded multiclass target
        target_cols = [
            c for c in self.oof_predictions.columns.tolist() if "target" in c
        ]
        if len(target_cols) > 1:
            target = self.oof_predictions[target_cols[0]].copy()
            target.name = "target"
            for i, t in enumerate(target_cols):
                target[self.oof_predictions[t] == 1] = i
            self.oof_predictions.drop(target_cols, axis=1, inplace=True)

            self.oof_predictions.insert(0, "target", np.array(target))

        return early_stopping.best_y_oof

    def get_final_loss(self):
        if self.final_loss is not None:
            return self.final_loss
        early_stopping = self.callbacks.get("early_stopping")
        if early_stopping is None:
            return None
        self.final_loss = early_stopping.final_loss
        return early_stopping.final_loss

    """
    def get_metric_logs(self):
        metric_logger = self.callbacks.get("metric_logger")
        if metric_logger is None:
            return None
        return metric_logger.loss_values
    """

    def get_type(self):
        return self.learner_params.get("model_type")

    def get_name(self):
        return self._name

    def predict(self, X):
        logger.debug("ModelFramework.predict")

        if self.learners is None or len(self.learners) == 0:
            raise Exception("Learnes are not initialized")
        # run predict on all learners and return the average
        y_predicted = None  # np.zeros((X.shape[0],))
        for ind, learner in enumerate(self.learners):
            # preprocessing goes here
            X_data, _ = self.preprocessings[ind].transform(X.copy(), None)
            y_p = learner.predict(X_data)

            y_p = self.preprocessings[ind].inverse_scale_target(y_p)

            y_predicted = y_p if y_predicted is None else y_predicted + y_p

        y_predicted_average = y_predicted / float(len(self.learners))

        y_predicted_final = self.preprocessings[0].prepare_target_labels(
            y_predicted_average)

        return y_predicted_final

    def get_additional_metrics(self):
        if self._additional_metrics is None:
            # 'target' - the target after processing used for model training
            # 'prediction' - out of folds predictions of the model
            oof_predictions = self.get_out_of_folds()
            prediction_cols = [
                c for c in oof_predictions.columns if "prediction" in c
            ]
            target_cols = [c for c in oof_predictions.columns if "target" in c]

            target = oof_predictions[target_cols]

            oof_preds = None
            if self._ml_task == MULTICLASS_CLASSIFICATION:
                oof_preds = self.preprocessings[0].prepare_target_labels(
                    oof_predictions[prediction_cols].values)

            else:
                oof_preds = oof_predictions[prediction_cols]

            self._additional_metrics = AdditionalMetrics.compute(
                target, oof_preds, self._ml_task)
            if self._ml_task == BINARY_CLASSIFICATION:
                self._threshold = float(self._additional_metrics["threshold"])
        return self._additional_metrics

    def save(self, model_path):
        start_time = time.time()
        logger.info(f"Save the model {model_path}")

        type_of_predictions = ("validation"
                               if "k_folds" not in self.validation_params else
                               "out_of_folds")
        predictions_fname = os.path.join(
            model_path, f"predictions_{type_of_predictions}.csv")
        predictions = self.get_out_of_folds()
        predictions.to_csv(predictions_fname, index=False)

        saved = []
        for i, l in enumerate(self.learners):
            p = os.path.join(model_path, f"learner_{i+1}.{l.file_extension()}")
            # l.save(p)
            saved += [p]

        with open(os.path.join(model_path, "framework.json"), "w") as fout:
            preprocessing = [p.to_json() for p in self.preprocessings]
            learners_params = [
                learner.get_params() for learner in self.learners
            ]
            desc = {
                "uid": self.uid,
                "name": self._name,
                "preprocessing": preprocessing,
                "learners": learners_params,
                "params": self.params,
                "saved": saved,
                "predictions_fname": predictions_fname,
                "metric_name": self.get_metric_name(),
                "final_loss": self.get_final_loss(),
                "train_time": self.get_train_time(),
                "is_stacked": self._is_stacked,
            }
            if self._threshold is not None:
                desc["threshold"] = self._threshold
            fout.write(json.dumps(desc, indent=4))

        LearningCurves.plot(
            self.validation.get_n_splits(),
            self.get_metric_name(),
            model_path,
            trees_in_iteration=self.additional_params.get("trees_in_step"),
        )

        self._additional_metrics = self.get_additional_metrics()

        AdditionalMetrics.save(self._additional_metrics, self._ml_task,
                               self.model_markdown(), model_path)

        with open(os.path.join(model_path, "status.txt"), "w") as fout:
            fout.write("ALL OK!")
        # I'm adding save time to total train time
        # there is always save after the training
        self.train_time += time.time() - start_time

    def model_markdown(self):
        long_name = AlgorithmsRegistry.get_long_name(
            self._ml_task, self.learner_params["model_type"])
        short_name = self.learner_params["model_type"]
        desc = f"# Summary of {self.get_name()}\n"
        if long_name == short_name:
            desc += f"\n## {short_name}\n"
        else:
            desc += f"\n## {long_name} ({short_name})\n"
        for k, v in self.learner_params.items():
            if k in ["model_type", "ml_task", "seed"]:
                continue
            desc += f"- **{k}**: {v}\n"
        desc += "\n## Validation\n"
        for k, v in self.validation_params.items():
            if "path" not in k:
                desc += f" - **{k}**: {v}\n"
        desc += "\n## Optimized metric\n"
        desc += f"{self.get_metric_name()}\n"
        desc += "\n## Training time\n"
        desc += f"\n{np.round(self.train_time,1)} seconds\n"
        return desc

    @staticmethod
    def load(model_path):
        logger.info(f"Loading model framework from {model_path}")

        json_desc = json.load(open(os.path.join(model_path, "framework.json")))
        mf = ModelFramework(json_desc["params"])
        mf.uid = json_desc.get("uid", mf.uid)
        mf._name = json_desc.get("name", mf._name)
        mf._threshold = json_desc.get("threshold")
        mf.train_time = json_desc.get("train_time", mf.train_time)
        mf.final_loss = json_desc.get("final_loss", mf.final_loss)
        mf.metric_name = json_desc.get("metric_name", mf.metric_name)
        mf._is_stacked = json_desc.get("is_stacked", mf._is_stacked)
        predictions_fname = json_desc.get("predictions_fname")
        if predictions_fname is not None:
            mf.oof_predictions = pd.read_csv(predictions_fname)

        mf.learners = []
        for learner_desc, learner_path in zip(json_desc.get("learners"),
                                              json_desc.get("saved")):

            l = AlgorithmFactory.load(learner_desc, learner_path)
            mf.learners += [l]

        mf.preprocessings = []
        for p in json_desc.get("preprocessing"):
            ps = Preprocessing()
            ps.from_json(p)
            mf.preprocessings += [ps]

        return mf
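
A usage sketch for this intermediate version, with placeholder values: it still uses the "validation" key and a single model_path argument for train(), save() and load(), but already supports explain_level, per-fold training logs, and learner interpretation. Only the keys and signatures come from the code above.

# Hypothetical usage sketch; values are placeholders.
params = {
    "name": "model_1",
    "ml_task": BINARY_CLASSIFICATION,
    "learner": {"model_type": "Xgboost", "seed": 1234},
    "validation": {"validation_type": "kfold", "k_folds": 5},
    "preprocessing": {},
    "additional": {},
    "explain_level": 2,
}
model_path = "AutoML_1/model_1"             # assumed to exist before training
mf = ModelFramework(params, callbacks=[])   # in practice the list includes an early-stopping callback
mf.train(model_path)                        # also writes learner_<k>_training.log for every fold
mf.save(model_path)                         # framework.json, predictions CSV, learning curves, markdown
mf = ModelFramework.load(model_path)
y_hat = mf.predict(X)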