import logging

import pandas as pd

# Callback and Metric come from the project's own modules; the exact import
# paths depend on the package layout.

log = logging.getLogger(__name__)


class EarlyStopping(Callback):
    def __init__(self, params):
        super(EarlyStopping, self).__init__(params)
        self.name = params.get("name", "early_stopping")
        self.metric = Metric(params.get("metric"))
        self.max_no_improvement_cnt = params.get("max_no_improvement_cnt", 5)
        self.keep_best_model = params.get("keep_best_model", True)
        self.best_iter = {}
        self.best_loss = {}
        self.loss_values = {}
        self.best_models = {}
        self.best_y_predicted = {}
        # predictions computed on out-of-folds or on the validation set
        self.best_y_oof = None
        # final score computed on combined predictions from all learners
        self.final_loss = None
        # paths to local copies of the best models, used only when a deep copy
        # of the learner is not available
        self.best_model_paths = {}

    def add_and_set_learner(self, learner):
        self.learners += [learner]
        self.learner = learner
        self.best_iter[learner.uid] = None
        self.best_loss[learner.uid] = self.metric.worst_value()
        self.loss_values[learner.uid] = {"train": [], "validation": [], "iters": []}
        self.best_models[learner.uid] = None
        self.best_model_paths[learner.uid] = None
        self.best_y_predicted[learner.uid] = None

    def on_learner_train_start(self, logs):
        self.no_improvement_cnt = 0

    def on_framework_train_end(self, logs):
        # aggregate predictions from all learners; the resulting frame has
        # two columns: 'prediction' and 'target'
        self.best_y_oof = pd.concat(list(self.best_y_predicted.values()))
        self.best_y_oof.sort_index(inplace=True)
        self.final_loss = self.metric(
            self.best_y_oof["target"], self.best_y_oof["prediction"]
        )

    def on_iteration_end(self, logs, predictions):
        train_loss = self.metric(
            predictions.get("y_train_true"), predictions.get("y_train_predicted")
        )
        validation_loss = self.metric(
            predictions.get("y_validation_true"),
            predictions.get("y_validation_predicted"),
        )
        self.loss_values[self.learner.uid]["train"] += [train_loss]
        self.loss_values[self.learner.uid]["validation"] += [validation_loss]
        self.loss_values[self.learner.uid]["iters"] += [logs.get("iter_cnt")]

        if self.metric.improvement(
            previous=self.best_loss[self.learner.uid], current=validation_loss
        ):
            y_validation_true = predictions.get("y_validation_true")
            self.no_improvement_cnt = 0
            self.best_iter[self.learner.uid] = logs.get("iter_cnt")
            self.best_loss[self.learner.uid] = validation_loss
            self.best_y_predicted[self.learner.uid] = pd.DataFrame(
                {
                    "prediction": predictions.get("y_validation_predicted"),
                    "target": y_validation_true.values.reshape(
                        y_validation_true.shape[0]
                    ),
                },
                index=predictions.get("validation_index"),
            )
            self.best_models[self.learner.uid] = self.learner.copy()
            # if a local copy is not available, save the model and keep its path
            if self.best_models[self.learner.uid] is None:
                self.best_model_paths[self.learner.uid] = self.learner.save()
        else:
            self.no_improvement_cnt += 1

        if self.no_improvement_cnt > self.max_no_improvement_cnt:
            self.learner.stop_training = True

        log.debug(
            "EarlyStopping.on_iteration_end, train loss: {}, validation loss: {}, "
            "no improvement cnt {}, iters {}".format(
                train_loss,
                validation_loss,
                self.no_improvement_cnt,
                len(self.loss_values[self.learner.uid]["iters"]),
            )
        )

    def get_status(self):
        return "Train loss: {}, Validation loss: {} @ iteration {}".format(
            self.loss_values[self.learner.uid]["train"][-1],
            self.loss_values[self.learner.uid]["validation"][-1],
            len(self.loss_values[self.learner.uid]["iters"]),
        )
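
# --- Illustrative sketch, not part of the module ---
# The heart of `on_iteration_end` is a patience counter: reset it whenever the
# validation loss improves, increment it otherwise, and raise the stop flag
# once it exceeds `max_no_improvement_cnt`. The stripped-down, runnable loop
# below reproduces that behaviour for a loss where lower is better; every name
# in it is local to the example and assumes nothing from the project.


def should_stop_at(validation_losses, max_no_improvement_cnt=5):
    best, no_improvement_cnt = float("inf"), 0
    for it, loss in enumerate(validation_losses):
        if loss < best:  # improvement: remember it and reset the counter
            best, no_improvement_cnt = loss, 0
        else:  # no improvement: count it
            no_improvement_cnt += 1
        if no_improvement_cnt > max_no_improvement_cnt:
            return it  # iteration at which training would be stopped
    return None


# Losses improve for four steps, then plateau; the patience of 5 is exhausted
# at iteration 9, so the call below prints 9.
print(should_stop_at([0.9, 0.7, 0.6, 0.55, 0.56, 0.57, 0.58, 0.59, 0.6, 0.61]))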
import copy
import os
import time
import uuid

import numpy as np
import pandas as pd

# Metric, IterativeLearner and storage_path come from the project's own
# modules; the exact import paths depend on the package layout.


class Ensemble:

    algorithm_name = "Greedy Ensemble"
    algorithm_short_name = "Ensemble"

    def __init__(self, optimize_metric="logloss"):
        self.library_version = "0.1"
        self.uid = str(uuid.uuid4())
        self.model_file = self.uid + ".ensemble.model"
        self.model_file_path = os.path.join(storage_path, self.model_file)
        self.metric = Metric({"name": optimize_metric})
        # the best loss obtained by the ensemble
        self.best_loss = self.metric.get_maximum()
        self.models = None
        self.selected_models = []
        self.train_time = None
        # total sum of predictions, the out-of-folds predictions of the ensemble
        self.total_best_sum = None
        self.target = None

    def get_train_time(self):
        return self.train_time

    def get_final_loss(self):
        return self.best_loss

    def get_name(self):
        return self.algorithm_short_name

    def get_out_of_folds(self):
        return pd.DataFrame(
            {"prediction": self.total_best_sum, "target": self.target}
        )

    def _get_mean(self, X, best_sum, best_count, selected):
        resp = copy.deepcopy(X[selected])
        if best_count > 1:
            resp += best_sum
            resp /= float(best_count)
        return resp

    def get_oof_matrix(self, models):
        oofs = {}
        for i, m in enumerate(models):
            oof = m.get_out_of_folds()
            oofs["model_{}".format(i)] = oof["prediction"]
            if self.target is None:
                # the target is needed for computing advanced model statistics;
                # this can get messy in the future if the target is transformed
                # differently for each model
                self.target = oof["target"]
        X = pd.DataFrame(oofs)
        self.models = models  # remember models, they will be needed in predictions
        return X

    def fit(self, X, y):
        start_time = time.time()
        selected_algs_cnt = 0  # number of selected algorithms
        self.best_algs = []  # selected algorithm indices from each loop
        best_sum = None  # sum of the best algorithms' predictions
        for j in range(X.shape[1]):  # iterate over all solutions
            min_score = self.metric.get_maximum()
            best_index = -1
            # try to add an algorithm to best_sum to minimize the metric
            for i in range(X.shape[1]):
                y_ens = self._get_mean(X, best_sum, j + 1, "model_{}".format(i))
                score = self.metric(y, y_ens)
                if self.metric.improvement(previous=min_score, current=score):
                    min_score = score
                    best_index = i
            # there is improvement, save it
            if self.metric.improvement(previous=self.best_loss, current=min_score):
                self.best_loss = min_score
                selected_algs_cnt = j
            self.best_algs.append(best_index)  # save the best algorithm index
            # update the best_sum value
            best_sum = (
                X["model_{}".format(best_index)]
                if best_sum is None
                else best_sum + X["model_{}".format(best_index)]
            )
            if j == selected_algs_cnt:
                self.total_best_sum = copy.deepcopy(best_sum)

        # keep the out-of-folds predictions of the ensemble
        self.total_best_sum /= float(selected_algs_cnt + 1)
        self.best_algs = self.best_algs[: selected_algs_cnt + 1]
        # count repeats with a numpy array; comparing the plain list with `== i`
        # would not broadcast and would always yield 0
        best_algs = np.array(self.best_algs)
        for i in np.unique(best_algs):
            self.selected_models += [
                {"model": self.models[i], "repeat": int(np.sum(best_algs == i))}
            ]
        self.train_time = time.time() - start_time

    def predict(self, X):
        y_predicted = None
        total_repeat = 0.0
        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            total_repeat += repeat
            y_predicted = (
                model.predict(X) * repeat
                if y_predicted is None
                else y_predicted + model.predict(X) * repeat
            )
        return y_predicted / total_repeat

    def to_json(self):
        models_json = []
        for selected in self.selected_models:
            model = selected["model"]
            repeat = selected["repeat"]
            models_json += [{"model": model.to_json(), "repeat": repeat}]
        json_desc = {
            "library_version": self.library_version,
            "algorithm_name": self.algorithm_name,
            "algorithm_short_name": self.algorithm_short_name,
            "uid": self.uid,
            "models": models_json,
        }
        return json_desc
    def from_json(self, json_desc):
        self.library_version = json_desc.get("library_version", self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name", self.algorithm_name)
        self.algorithm_short_name = json_desc.get(
            "algorithm_short_name", self.algorithm_short_name
        )
        self.uid = json_desc.get("uid", self.uid)
        self.selected_models = []
        models_json = json_desc.get("models")
        for selected in models_json:
            model = selected["model"]
            repeat = selected["repeat"]
            il = IterativeLearner(model.get("params"))
            il.from_json(model)
            self.selected_models += [
                # {"model": LearnerFactory.load(model), "repeat": repeat}
                {"model": il, "repeat": repeat}
            ]
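
# --- Self-contained sketch of the greedy selection idea (illustrative) ---
# `Ensemble.fit` implements Caruana-style greedy forward selection with
# replacement over the out-of-fold prediction matrix: at every step the model
# whose addition most improves the running average is added, and the prefix
# with the best score is kept. The runnable snippet below reproduces that
# logic with plain numpy and mean squared error, so it needs none of the
# project's modules; every name in it is local to the example.

import numpy as np


def greedy_select(oof, y, max_steps=10):
    """oof: (n_samples, n_models) out-of-fold predictions; y: targets."""
    mse = lambda p: float(np.mean((p - y) ** 2))
    chosen, best_sum = [], np.zeros(len(y))
    best_score, best_prefix = float("inf"), 0
    for step in range(1, max_steps + 1):
        # score every model as the next addition to the running average
        scores = [mse((best_sum + oof[:, i]) / step) for i in range(oof.shape[1])]
        i_best = int(np.argmin(scores))
        chosen.append(i_best)
        best_sum += oof[:, i_best]
        if scores[i_best] < best_score:  # keep the best-scoring prefix
            best_score, best_prefix = scores[i_best], step
    return chosen[:best_prefix], best_score


# Example: out-of-fold predictions of three models with varying noise levels
# on a toy regression target. The least noisy model dominates the selection.
rng = np.random.default_rng(0)
y = rng.normal(size=100)
oof = np.stack([y + rng.normal(scale=s, size=100) for s in (0.5, 0.8, 1.2)], axis=1)
print(greedy_select(oof, y))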