Example 1
import logging

import pandas as pd

# Callback and Metric come from the surrounding framework; the module paths
# below are assumptions and may need adjusting to the project layout
from supervised.callbacks.callback import Callback
from supervised.utils.metric import Metric

log = logging.getLogger(__name__)


class EarlyStopping(Callback):
    def __init__(self, params):
        super(EarlyStopping, self).__init__(params)
        self.name = params.get("name", "early_stopping")
        self.metric = Metric(params.get("metric"))
        self.max_no_improvement_cnt = params.get("max_no_improvement_cnt", 5)

        self.keep_best_model = params.get("keep_best_model", True)
        self.best_iter = {}
        self.best_loss = {}
        self.loss_values = {}
        self.best_models = {}
        self.best_y_predicted = {}
        # predictions computed on out-of-fold samples or on the validation set
        self.best_y_oof = None
        # final score computed on the combined predictions from all learners
        self.final_loss = None
        # paths to local copies of the best models, used only when a model
        # cannot be deep-copied
        self.best_model_paths = {}

    def add_and_set_learner(self, learner):
        # the base Callback is assumed to initialize self.learners as a list
        self.learners += [learner]
        self.learner = learner
        self.best_iter[learner.uid] = None
        self.best_loss[learner.uid] = self.metric.worst_value()
        self.loss_values[learner.uid] = {
            "train": [],
            "validation": [],
            "iters": []
        }
        self.best_models[learner.uid] = None
        self.best_model_paths[learner.uid] = None
        self.best_y_predicted[learner.uid] = None

    def on_learner_train_start(self, logs):
        # reset the no-improvement counter for each new learner
        self.no_improvement_cnt = 0

    def on_framework_train_end(self, logs):
        # aggregate out-of-fold predictions from all learners into a single
        # frame with two columns: 'prediction' and 'target'

        self.best_y_oof = pd.concat(list(self.best_y_predicted.values()))
        self.best_y_oof.sort_index(inplace=True)
        self.final_loss = self.metric(self.best_y_oof["target"],
                                      self.best_y_oof["prediction"])

    def on_iteration_end(self, logs, predictions):

        train_loss = self.metric(predictions.get("y_train_true"),
                                 predictions.get("y_train_predicted"))
        validation_loss = self.metric(
            predictions.get("y_validation_true"),
            predictions.get("y_validation_predicted"),
        )
        self.loss_values[self.learner.uid]["train"] += [train_loss]
        self.loss_values[self.learner.uid]["validation"] += [validation_loss]
        self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")]

        if self.metric.improvement(previous=self.best_loss[self.learner.uid],
                                   current=validation_loss):

            y_validation_true = predictions.get("y_validation_true")
            self.no_improvement_cnt = 0
            self.best_iter[self.learner.uid] = logs.get("iter_cnt")
            self.best_loss[self.learner.uid] = validation_loss
            self.best_y_predicted[self.learner.uid] = pd.DataFrame(
                {
                    "prediction": predictions.get("y_validation_predicted"),
                    "target": y_validation_true.values.reshape(
                        y_validation_true.shape[0]
                    ),
                },
                index=predictions.get("validation_index"),
            )
            self.best_models[self.learner.uid] = self.learner.copy()

            # if a deep copy is not available, save the model to disk and keep its path
            if self.best_models[self.learner.uid] is None:
                self.best_model_paths[self.learner.uid] = self.learner.save()
        else:
            self.no_improvement_cnt += 1

        if self.no_improvement_cnt > self.max_no_improvement_cnt:
            self.learner.stop_training = True

        log.debug(
            "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, "
            "no improvement cnt {}, iters {}".format(
                train_loss,
                validation_loss,
                self.no_improvement_cnt,
                len(self.loss_values[self.learner.uid]["iters"]),
            ))

    def get_status(self):
        return "Train loss: {}, Validation loss: {} @ iteration {}".format(
            self.loss_values[self.learner.uid]["train"][-1],
            self.loss_values[self.learner.uid]["validation"][-1],
            len(self.loss_values[self.learner.uid]["iters"]),
        )
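
The heart of the callback is the no-improvement counter in on_iteration_end: it resets whenever the validation loss improves and stops the learner once it exceeds max_no_improvement_cnt. Below is a minimal, self-contained sketch of that rule, assuming a lower-is-better metric; the loss values and the patience threshold are made up for illustration.

# standalone sketch of the early-stopping rule used above (lower is better);
# the losses and max_no_improvement_cnt are invented for illustration
validation_losses = [0.70, 0.65, 0.64, 0.66, 0.65, 0.67, 0.68, 0.69, 0.70]
max_no_improvement_cnt = 3

best_loss = float("inf")
best_iter = None
no_improvement_cnt = 0
for it, loss in enumerate(validation_losses):
    if loss < best_loss:
        # improvement: remember the best iteration and reset the counter
        best_loss, best_iter = loss, it
        no_improvement_cnt = 0
    else:
        no_improvement_cnt += 1
    if no_improvement_cnt > max_no_improvement_cnt:
        print("stop at iteration {}, best loss {} @ iteration {}".format(
            it, best_loss, best_iter))
        break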
Example 2
import copy
import os
import time
import uuid

import numpy as np
import pandas as pd

# Metric, IterativeLearner and storage_path come from the surrounding framework;
# the module paths below are assumptions and may need adjusting to the project layout
from supervised.utils.metric import Metric
from supervised.iterative_learner_framework import IterativeLearner
from supervised.config import storage_path


class Ensemble:

    algorithm_name = "Greedy Ensemble"
    algorithm_short_name = "Ensemble"

    def __init__(self, optimize_metric="logloss"):
        self.library_version = "0.1"
        self.uid = str(uuid.uuid4())
        self.model_file = self.uid + ".ensemble.model"
        self.model_file_path = os.path.join(storage_path, self.model_file)
        self.metric = Metric({"name": optimize_metric})
        # the best loss obtained by the ensemble so far
        self.best_loss = self.metric.get_maximum()
        self.models = None
        self.selected_models = []
        self.train_time = None
        self.total_best_sum = None  # running sum of selected predictions, the ensemble's out-of-fold output
        self.target = None

    def get_train_time(self):
        return self.train_time

    def get_final_loss(self):
        return self.best_loss

    def get_name(self):
        return self.algorithm_short_name

    def get_out_of_folds(self):
        return pd.DataFrame({
            "prediction": self.total_best_sum,
            "target": self.target
        })

    def _get_mean(self, X, best_sum, best_count, selected):
        # mean of the running sum plus the newly selected column
        resp = copy.deepcopy(X[selected])
        if best_count > 1:
            resp += best_sum
            resp /= float(best_count)
        return resp

    def get_oof_matrix(self, models):
        oofs = {}
        for i, m in enumerate(models):
            oof = m.get_out_of_folds()
            oofs["model_{}".format(i)] = oof["prediction"]
            if self.target is None:
                # the target is needed later for computing advanced model statistics;
                # this may get messy once the target is transformed per model
                self.target = oof["target"]

        X = pd.DataFrame(oofs)
        self.models = models  # remember the models, they are needed for predictions
        return X

    def fit(self, X, y):
        start_time = time.time()
        selected_algs_cnt = 0  # number of selected algorithms
        self.best_algs = []  # indices of the algorithms selected in each greedy step

        best_sum = None  # running sum of the predictions of the selected algorithms
        for j in range(X.shape[1]):  # at most as many greedy steps as there are models
            min_score = self.metric.get_maximum()
            best_index = -1
            # try adding each algorithm to best_sum and keep the one that improves the metric most
            for i in range(X.shape[1]):
                y_ens = self._get_mean(X, best_sum, j + 1,
                                       "model_{}".format(i))
                score = self.metric(y, y_ens)

                if self.metric.improvement(previous=min_score, current=score):
                    min_score = score
                    best_index = i

            # if this step improved the overall best loss, remember it
            if self.metric.improvement(previous=self.best_loss,
                                       current=min_score):
                self.best_loss = min_score
                selected_algs_cnt = j

            self.best_algs.append(best_index)  # save the best algorithm index
            # update best_sum value
            best_sum = (X["model_{}".format(best_index)] if best_sum is None
                        else best_sum + X["model_{}".format(best_index)])
            if j == selected_algs_cnt:
                self.total_best_sum = copy.deepcopy(best_sum)

        # divide the accumulated sum to obtain the ensemble's out-of-fold predictions
        self.total_best_sum /= float(selected_algs_cnt + 1)
        self.best_algs = self.best_algs[:(selected_algs_cnt + 1)]
        for i in np.unique(self.best_algs):
            self.selected_models += [{
                "model": self.models[i],
                # how many times model i was picked in the greedy loop
                "repeat": np.sum(np.array(self.best_algs) == i),
            }]
        self.train_time = time.time() - start_time

    def predict(self, X):
        y_predicted = None
        total_repeat = 0.0
        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            total_repeat += repeat
            preds = model.predict(X) * repeat
            y_predicted = preds if y_predicted is None else y_predicted + preds
        return y_predicted / total_repeat

    def to_json(self):
        models_json = []
        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            models_json += [{"model": model.to_json(), "repeat": repeat}]

        json_desc = {
            "library_version": self.library_version,
            "algorithm_name": self.algorithm_name,
            "algorithm_short_name": self.algorithm_short_name,
            "uid": self.uid,
            "models": models_json,
        }
        return json_desc

    def from_json(self, json_desc):
        self.library_version = json_desc.get("library_version",
                                             self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name",
                                            self.algorithm_name)
        self.algorithm_short_name = json_desc.get("algorithm_short_name",
                                                  self.algorithm_short_name)
        self.uid = json_desc.get("uid", self.uid)
        self.selected_models = []
        models_json = json_desc.get("models")
        for selected in models_json:
            model = selected["model"]
            repeat = selected["repeat"]

            il = IterativeLearner(model.get("params"))
            il.from_json(model)
            self.selected_models += [{"model": il, "repeat": repeat}]
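
fit() above is greedy forward selection with replacement: at every step it re-tries every model, keeps the one whose addition to the running sum yields the best averaged score, and records the step count up to the last real improvement. Below is a minimal, self-contained sketch of that loop on synthetic data, assuming mean squared error (lower is better) in place of the framework's Metric class; all names and data are invented for illustration.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
y = rng.normal(size=200)
# three toy "models": one good, one mediocre, one uninformative
X = pd.DataFrame({
    "model_0": y + rng.normal(scale=0.1, size=200),
    "model_1": y + rng.normal(scale=0.5, size=200),
    "model_2": rng.normal(size=200),
})

def mse(y_true, y_pred):
    return float(np.mean((y_true - y_pred) ** 2))

best_sum, best_algs = None, []
best_loss, selected_cnt = float("inf"), 0
for j in range(X.shape[1]):
    # try adding each column to the running sum; keep the best averaged score
    scores = [
        mse(y, X[c] if best_sum is None else (best_sum + X[c]) / (j + 1))
        for c in X.columns
    ]
    best_index = int(np.argmin(scores))
    if scores[best_index] < best_loss:
        best_loss, selected_cnt = scores[best_index], j
    best_algs.append(best_index)
    col = X["model_{}".format(best_index)]
    best_sum = col if best_sum is None else best_sum + col

# keep only the steps up to the last improvement; repeats are allowed
best_algs = best_algs[: selected_cnt + 1]
print("selected indices (with repeats):", best_algs, "loss:", best_loss)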