class LGBM(Model):
    def __init__(self):
        super(LGBM, self).__init__(LGBMClassifier())


    def fit_atstart(self, X_train, y_train, params, file_name=None, save=False):
        self.model = LGBMClassifier(**params)
        print('Starting LGBM model training...')
        self.model.fit(X_train, y_train)
        print('Training finished, saving LGBM model...')
        if save: self.save_model(file_name)

    def fit_continue(self, X_train, y_train, file_name=None, save=False):
        self.model = self.load_model(file_name)
        print('Resuming LGBM model training...')
        self.model.fit(X_train, y_train)
        print('Training finished, saving LGBM model...')
        if save: self.save_model(file_name)


    def save_model(self, file_name):
        # LGBMClassifier has no save_model(); persist the fitted Booster instead.
        self.model.booster_.save_model(file_name)


    def load_model(self, file_name):
        pass
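
A minimal usage sketch for the wrapper above, assuming a `Model` base class that accepts an estimator; the training data, parameters and checkpoint path are placeholders, not part of the original snippet.

# Hypothetical usage of the LGBM wrapper (X_train, y_train, X_test and the
# file name are illustrative placeholders).
clf = LGBM()
clf.fit_atstart(X_train, y_train,
                params={'n_estimators': 200, 'learning_rate': 0.1},
                file_name='checkpoints/lgbm_booster.txt',
                save=True)
preds = clf.model.predict(X_test)
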
Example #2
    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)
    print(selection_x_train.shape)

    selection_model = LGBMClassifier(n_estimators=1000,
                                     max_depth=4,
                                     learning_rate=0.5,
                                     n_jobs=-1)

    selection_model.fit(selection_x_train,
                        y_train,
                        verbose=False,
                        eval_metric=["multi_error", "multi_logloss"],
                        eval_set=[(selection_x_train, y_train),
                                  (selection_x_test, y_test)],
                        early_stopping_rounds=100)

    y_pred = selection_model.predict(selection_x_test)

    # results = selection_model.evals_result()
    # print("evals_result : \n", results)

    score = accuracy_score(y_test, y_pred)
    print("Thresh=%.3f, n=%d, acc: %.2f%%" %
          (thresh, selection_x_train.shape[1], score * 100.0))
# (120, 1)
# Thresh=285.000, n=4, acc: 100.00%

# Assuming `model` is an LGBMClassifier: it has no save_model(), so save the fitted Booster.
model.booster_.save_model("./model/xgb_save/lgbm_iris_acc_100_model")
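
The text dump written above can be reloaded directly as a Booster for inference; a small sketch assuming the same path, noting that for multiclass problems Booster.predict returns per-class probabilities that still need an argmax.

import lightgbm as lgb
import numpy as np

# Reload the Booster text dump saved above and recover class labels.
booster = lgb.Booster(model_file="./model/xgb_save/lgbm_iris_acc_100_model")
proba = booster.predict(selection_x_test)   # shape (n_samples, n_classes)
y_pred_loaded = np.argmax(proba, axis=1)
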
Example #3
    df["good"] = df["good"].astype(int)
    return df


# train
df = getDataSet()
X, y = df[df.columns[:-1]], df["good"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

gbm = LGBMClassifier(
    num_leaves=5,
    max_depth=2,
    learning_rate=0.05,
    min_data_in_leaf=3,
    n_estimators=5,
    max_bin=5,
    min_data_in_bin=2,
    subsample_for_bin=17,
)
# gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=5)
gbm.fit(X_train, y_train)

# LGBMClassifier has no save_model(); save the fitted Booster instead.
gbm.booster_.save_model("model.txt")
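
The commented-out fit call above uses the early_stopping_rounds argument, which newer LightGBM releases (3.3 and later) replace with callbacks; a minimal sketch of the callback form, assuming such a version is installed.

import lightgbm as lgb

# Early stopping through callbacks; evaluation is done on the held-out split.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        callbacks=[lgb.early_stopping(stopping_rounds=5)])
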
Example #4
 0.00346774 0.00359193 0.00407993 0.00500716 0.00536249 0.00552303
 0.00699229 0.00727135 0.00831567 0.00982234 0.01391234 0.01405333
 0.01454801 0.01663779 0.01722644 0.01724869 0.02118052 0.0229155
 0.03107507 0.10279346 0.11432321 0.1687874  0.16907775 0.20711204]
'''
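
The loop below iterates over a `thresholds` array whose construction is not shown; a common setup, given here only as an assumption consistent with the sorted importances printed above, is to sort the prefit model's feature importances.

import numpy as np

# Assumed setup: one threshold per feature, sorted ascending, taken from the
# already-fitted `model` that SelectFromModel(prefit=True) reuses below.
thresholds = np.sort(model.feature_importances_)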

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = LGBMClassifier(n_estimators=300,
                                     learning_rate=0.1,
                                     n_jobs=-1)
    selection_model.fit(select_x_train,
                        y_train,
                        verbose=False,
                        eval_metric=['logloss', 'auc'],
                        eval_set=[(select_x_train, y_train),
                                  (select_x_test, y_test)],
                        early_stopping_rounds=20)

    y_pred = selection_model.predict(select_x_test)

    acc = accuracy_score(y_test, y_pred)

    print('Thresh=%.3f, n=%d, acc: %.2f%%' %
          (thresh, select_x_train.shape[1], acc * 100.0))
    model.save_model('./model/xgb_save/cancer_n=%d_acc=%.3f.model' %
                     (select_x_train.shape[1], acc))
class modelLightBoost(Training, BaseEstimator, ClassifierMixin):
    """
    Multiclass example:
    https://www.kaggle.com/nicapotato/multi-class-lgbm-cv-and-seed-diversification
    """
    def __init__(self,
                 name="LGB",
                 random_state=99,
                 train_dir="",
                 params=None,
                 *args,
                 **kwargs):

        self.name = name
        self.train_dir = train_dir + "/" + "model_" + str(self.name) + "/"
        self.random_state = random_state

        if params is None:
            self.get_params_json()
            self.params.update({
                'model_dir': self.train_dir,
                "seed": self.random_state
            })
        else:
            # if isinstance(params)
            self.params = params

        self.model = LGBMClassifier(**self.params)
        super().__init__(self.model, random_state=self.random_state)

    def get_params_json(self):
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["LightBoost"]

        self.manager_finetune = ParamsManager(param_file, key_read="FineTune")
        self.params_finetune = self.manager_finetune.get_params()["LightBoost"]

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwarg):

        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.dtrain = lgb.Dataset(self.X_train.values,
                                  label=self.y_train.values,
                                  feature_name=self.X_train.columns.tolist())
        self.dvalid = lgb.Dataset(self.X_test.values,
                                  label=self.y_test.values,
                                  feature_name=self.X_test.columns.tolist())
        self.all_train_data = lgb.Dataset(self.X.values,
                                          label=self.y.values,
                                          feature_name=self.X.columns.tolist())

    def set_dataset_nosplit(self,
                            X_train,
                            X_test,
                            y_train,
                            y_test,
                            categorical_columns_indices=None,
                            *args,
                            **kwarg):

        self.categorical_columns_indices = categorical_columns_indices
        self.columns = list(X_train)
        # Map the original class labels to 0..n-1 integers, as dataset() does.
        _ytrain, _ = self.replace_multiclass(y_train)
        _ytest, _ = self.replace_multiclass(y_test)

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = _ytrain
        self.y_test = _ytest

        self.X = pd.concat([X_train, X_test], axis=0)
        self.y = pd.concat([_ytrain, _ytest], axis=0)

        self.dtrain = lgb.Dataset(self.X_train.values,
                                  label=self.y_train.values,
                                  feature_name=self.X_train.columns.tolist())
        self.dvalid = lgb.Dataset(self.X_test.values,
                                  label=self.y_test.values,
                                  feature_name=self.X_test.columns.tolist())
        self.all_train_data = lgb.Dataset(self.X.values,
                                          label=self.y.values,
                                          feature_name=self.X.columns.tolist())

    def replace_multiclass(self, targets):

        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self,
            X=None,
            y=None,
            X_train=None,
            X_test=None,
            y_train=None,
            y_test=None,
            mute=False,
            use_best_model=True,
            verbose=0,
            num_boost_round=100,
            nosplit=False,
            **kwargs):

        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbose': verbose})

        self.model = lgb.train(self.params,
                               self.dtrain,
                               num_boost_round=num_boost_round,
                               verbose_eval=verbose,
                               **kwargs)

        preds_test = [
            np.argmax(line) for line in self.model.predict(
                self.X_test, num_iteration=self.model.best_iteration)
        ]
        score_test = accuracy_score(self.y_test, preds_test)

        preds_train = [
            np.argmax(line) for line in self.model.predict(
                self.X_train, num_iteration=self.model.best_iteration)
        ]
        score_train = accuracy_score(self.y_train, preds_train)

        if not mute:
            print("Accurancy para el conjunto de entrenamiento ---> {:.2f}%".
                  format(score_train * 100))
            print("Accurancy para el conjunto de validacion ------> {:.2f}%".
                  format(score_test * 100))

    def fit_cv(self,
               X=None,
               y=None,
               X_train=None,
               X_test=None,
               y_train=None,
               y_test=None,
               nfold=5,
               use_best_model=True,
               verbose=200,
               nosplit=False,
               early_stopping_rounds=150,
               num_boost_round=2000,
               **kwargs):

        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbose': verbose})
        self.lgb_cv = lgb.cv(params=self.params,
                             train_set=self.all_train_data,
                             num_boost_round=num_boost_round,
                             stratified=True,
                             nfold=nfold,
                             seed=self.random_state,
                             early_stopping_rounds=early_stopping_rounds,
                             **kwargs)
        loss = self.params["metric"]
        optimal_rounds = np.argmin(self.lgb_cv[str(loss) + '-mean'])
        best_cv_score = min(self.lgb_cv[str(loss) + '-mean'])

        if not verbose == 0:
            print("\nOptimal Round: {}\nOptimal Score: {:.3f} + stdv:{:.3f}".
                  format(optimal_rounds, best_cv_score,
                         self.lgb_cv[str(loss) + '-stdv'][optimal_rounds]))

        results = {
            "Rounds": optimal_rounds,
            "Score": best_cv_score,
            "STDV": self.lgb_cv[str(loss) + '-stdv'][optimal_rounds],
            "LB": None,
            "Parameters": self.params
        }
        score = np.mean(self.lgb_cv[str(loss) + '-mean'])

        return score, results

    def func_acc(self, prob_pred, y_target):

        _y_pred = np.zeros(len(prob_pred))

        for i in range(0, len(prob_pred)):
            _y_pred[i] = int(np.argmax(prob_pred[i]))

        accuracy = accuracy_score(_y_pred, y_target)
        return accuracy

    def pred_binary(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        preds = self.model.predict(_X_copy, *args, **kwargs)
        return np.where(preds > 0.5, 1, 0)

    def pred_multiclass(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return [
            np.argmax(line) for line in self.model.predict(
                _X_copy, num_iteration=self.model.best_iteration)
        ]

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self,
                   direct="./checkpoints",
                   name="LGM_model",
                   file_model=".txt"):

        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directorio creado: " + direct)
            except OSError as e:
                raise NameError("Error al crear el directorio")

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

        if file_model == ".txt":
            filename = direct + "/" + name + "_" + current_time + ".txt"
            self.model.save_model(filename)
        elif file_model == ".pkl":
            filename = direct + "/" + name + "_" + current_time + ".pkl"
            joblib.dump(self.model, filename)
        else:
            raise NameError("Type {} not permited".format(file_model))
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self,
                   direct="./checkpoints/LGM_model.txt",
                   file_model=".txt"):

        if not os.path.isfile(direct):
            print("The specified model file does not exist")

        if file_model == ".txt":
            # A text dump is loaded as a Booster, not through the sklearn wrapper.
            self.model = lgb.Booster(model_file=direct)
        elif file_model == ".pkl":
            self.model = joblib.load(direct)
        else:
            raise NameError("Type {} not permitted".format(file_model))
        print("Model loaded from: " + direct)

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy, *args, **kwargs)

    def index_features(self, features):

        _index = []
        for i in features:
            _index.append(self.X.columns.get_loc(i))
        if _index == []:
            raise NameError("No coincide ninguna de las features introducidas")
        return _index

    def get_important_features(self, display=True, max_num_features=20):

        if display:
            lgb.plot_importance(self.model,
                                max_num_features=max_num_features,
                                figsize=(6, 6),
                                title='Feature importance (LightGBM)')
            plt.show()
        # return _feature_importance_df

    def FineTune_SearchCV(self,
                          X=None,
                          y=None,
                          X_train=None,
                          X_test=None,
                          y_train=None,
                          y_test=None,
                          params=None,
                          params_finetune=None,
                          ROC=False,
                          randomized=True,
                          cv=10,
                          display_ROC=True,
                          verbose=0,
                          n_iter=10,
                          replace_model=True,
                          nosplit=False,
                          finetune_dir=""):

        self.get_params_json()

        self.finetune_dir = finetune_dir + "/" + "model_finetune_" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            "seed": self.random_state
        })
        if params is not None:
            self.params = params
        if params_finetune is not None:
            self.params_finetune = params_finetune

        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbosity': verbose})
        self.model = LGBMClassifier(**self.params)

        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X_train,
            self.y_train,
            self.params_finetune,
            cv=cv,
            randomized=randomized,
            n_iter=n_iter,
            verbose=1)
        self.params.update(**self._best_Parameters)
        self.fit(self.X_train, self.y_train)

        print("\n")
        score = accuracy_score(self.y_test, self.pred_multiclass(self.X_test))
        print("\n")
        print(
            "Test-set score with the optimal parameters: {:.2f}%"
            .format(score * 100))
        print("\n")
        print("Report clasificacion con el conjunto de test: ")
        self.evaluate(self.model, self.X_test, self.y_test)
        print("\n")
        print("Validacion cruzada con todos los datos del dataset: ")
        print("\n")
        self.KFold_CrossValidation(LGBMClassifier(**self._best_Parameters),
                                   self.X,
                                   self.y,
                                   n_splits=cv,
                                   ROC=ROC,
                                   shuffle=True,
                                   mute=False,
                                   logdir_report="",
                                   display=True,
                                   save_image=True,
                                   verbose=0)
        return self._best_Parameters, self.results_df

    def SeedDiversification_cv(self,
                               X=None,
                               y=None,
                               X_train=None,
                               X_test=None,
                               y_train=None,
                               y_test=None,
                               n_iter=10,
                               n_max=2022,
                               cv=10,
                               nosplit=False,
                               finetuneseed_dir="",
                               display=True,
                               save_image=True,
                               verbose=0):
        allmodelstart = time.time()
        self.get_params_json()

        self.finetune_dir = finetuneseed_dir + "/" + "model_finetune_seed" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            'verbosity': verbose
        })
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbosity': verbose})
        self.model = LGBMClassifier(**self.params)

        _rd = np.random.uniform(0, n_max, n_iter).astype(np.int32).tolist()
        params_finetuneseed = {"seed": _rd}
        del (_rd)

        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X,
            self.y,
            params_finetuneseed,
            randomized=False,
            cv=cv,
            n_iter=n_iter,
            verbose=1,
            mute=True)
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))

        print(
            "Seed diversification - mean AUC: {:.2f}% - std AUC: {:.5f}"
            .format(self.results_df['mean_test_AUC'].mean() * 100,
                    self.results_df['std_test_AUC'].mean()))

        print(
            "Seed diversification - mean Acc: {:.2f}% - std Acc: {:.5f}"
            .format(self.results_df['mean_test_Accuracy'].mean() * 100,
                    self.results_df['std_test_Accuracy'].mean()))
        return self._best_Parameters, self.results_df

    def SeedDiversification_fs(self,
                               X,
                               y,
                               params,
                               n_iter=10,
                               mute=False,
                               logdir_report="",
                               display=True,
                               save_image=True):

        allmodelstart = time.time()
        # Run Model with different Seeds
        all_feature_importance_df = pd.DataFrame()
        _y, _ = self.replace_multiclass(y)
        all_seeds = np.random.uniform(1, 1000,
                                      n_iter).astype(np.int32).tolist()

        for seeds_x in all_seeds:
            modelstart = time.time()
            print("Seed:", seeds_x)
            # Go Go Go
            params["seed"] = seeds_x
            model = lgb.train(params,
                              lgb.Dataset(X.values, label=_y.values),
                              verbose_eval=100)

            # Feature Importance
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = X.columns.tolist()
            fold_importance_df["importance"] = model.feature_importance()
            all_feature_importance_df = pd.concat(
                [all_feature_importance_df, fold_importance_df], axis=0)

            # Submit Model Individually
            #     seed_submit(model= lgb_final, seed= seeds_x, X_test)
            if not mute:
                print("Model Runtime: %0.2f seconds" %
                      ((time.time() - modelstart)))
                print("#" * 50)
            del model

        cols = all_feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False)[:50].index
        best_features = all_feature_importance_df.loc[
            all_feature_importance_df.feature.isin(cols)]
        plt.figure(figsize=(8, 10))
        sns.barplot(x="importance",
                    y="feature",
                    data=best_features.sort_values(by="importance",
                                                   ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        if display:
            plt.show()
        if save_image:
            filename = logdir_report + 'lgb_importances.png'
            plt.savefig(filename)
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)
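
A hedged usage sketch for modelLightBoost; the parameter dictionary and the DataFrame X / Series y are illustrative placeholders, and the Training base class and ParamsManager come from the surrounding project.

# Hypothetical end-to-end usage (all values are illustrative only).
params = {
    "objective": "multiclass",
    "num_class": 3,
    "metric": "multi_logloss",
    "learning_rate": 0.05,
    "num_leaves": 31,
}
clf = modelLightBoost(name="LGB", random_state=99, train_dir=".", params=params)
clf.fit(X, y, num_boost_round=200, verbose=0)   # internal train/test split
labels = clf.pred_multiclass(X)                 # argmax over class probabilities
clf.save_model(direct="./checkpoints", name="LGM_model", file_model=".txt")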