Beispiel #1
0
 def __init__(self, random_state=99):
     """
     Initialise the model container.

     Stores ``random_state`` on the instance and creates two
     ``ParamsManager`` objects over ``param_file``: one built with
     ``key_read="Models"`` and one built with ``key_read="FineTune"``.
     """
     self.random_state = random_state
     # Build the managers in the same order as before: "Models" first.
     for attribute, key in (("manager_models", "Models"),
                            ("manager_finetune", "FineTune")):
         setattr(self, attribute, ParamsManager(param_file, key_read=key))
Beispiel #2
0
    def __init__(self, name="CBT", random_state=99, *args, **kwargs):
        """
        Build the wrapped ``CatBoostClassifier``.

        The "CatBoost" entry returned by the "Models" ``ParamsManager`` is
        used as the parameter set; ``train_dir`` and ``random_state`` are
        overwritten in it before the classifier is created. Extra
        positional and keyword arguments are accepted and ignored.
        """
        self.name = name
        self.train_dir = "model_" + str(name) + "/"
        self.random_state = random_state

        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]

        # Instance-specific values take precedence over the stored ones.
        overrides = {
            'train_dir': self.train_dir,
            "random_state": self.random_state,
        }
        self.params.update(overrides)

        self.model = CatBoostClassifier(**self.params)
Beispiel #3
0
class modelCatBoost(object):
    """
    Convenience wrapper around ``CatBoostClassifier``.

    The wrapper loads its parameters through ``ParamsManager``, builds the
    train / validation ``catboost.Pool`` objects from a feature frame and a
    target series, and exposes helpers for fitting, cross-validation,
    persistence, feature importance and hyper-parameter search. Attributes
    that are not defined on the wrapper are looked up on ``self.model``.
    """

    def __init__(self, name="CBT", random_state=99, *args, **kwargs):
        """
        Build the wrapped classifier.

        Parameters
        ----------
        name : used to derive ``train_dir`` (``"model_<name>/"``).
        random_state : seed stored on the instance and written into the
            parameter set.
        *args, **kwargs : accepted and ignored.
        """
        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state

        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            "random_state": self.random_state
        })

        self.model = CatBoostClassifier(**self.params)

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwargs):
        """
        Store the data and build the CatBoost pools.

        ``y`` is integer-encoded with ``replace_multiclass``; the data is
        split with ``train_test_split`` and three pools are created:
        ``train_data``, ``eval_data`` and ``all_train_data`` (no split).

        Parameters
        ----------
        X : feature frame (``.values`` and column iteration are used).
        y : target series.
        categorical_columns_indices : forwarded as ``cat_features``.
        test_size : fraction held out for ``eval_data``.
        """
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):
        """
        Replace every distinct target value by an integer code.

        Codes are assigned in order of first appearance (``0, 1, ...``).

        Returns
        -------
        (encoded, classes) : the encoded series and the list of original
            values, where ``classes[i]`` is the value mapped to code ``i``.
        """
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self,
            X,
            y,
            use_best_model=True,
            plot=True,
            save_snapshot=False,
            verbose=0,
            *args,
            **kwargs):
        """
        Split ``X`` / ``y`` and fit the wrapped model on the training pool.

        Parameters
        ----------
        X, y : data passed to ``dataset``.
        use_best_model, plot, save_snapshot : forwarded to the model's
            ``fit``.
        verbose : verbosity forwarded to ``fit``. When falsy, the
            ``verbose`` value from the model parameters is used instead
            (``None`` when the model does not define one).
        **kwargs : forwarded to the model's ``fit``.

        Returns
        -------
        Whatever the wrapped model's ``fit`` returns.
        """
        self.dataset(X, y)
        _params = self.model.get_params()

        # An explicit verbosity wins; otherwise fall back to the model's own
        # setting. The previous code silenced training whenever a verbosity
        # was requested and raised KeyError when the model had none.
        _verbose = verbose if verbose else _params.get("verbose")

        # The accuracy report that used to follow this return could never
        # run and referenced attributes this class does not define
        # (``dvalid`` / ``dtrain``), so it has been removed.
        return self.model.fit(self.train_data,
                              verbose=_verbose,
                              eval_set=self.eval_data,
                              use_best_model=use_best_model,
                              plot=plot,
                              save_snapshot=save_snapshot,
                              **kwargs)

    def fit_cv(self,
               X,
               y,
               fold_count=4,
               shuffle=True,
               stratified=True,
               plot=True,
               verbose=100):
        """
        Run ``catboost.cv`` on the whole dataset.

        Parameters
        ----------
        X, y : data passed to ``dataset``.
        fold_count, shuffle, stratified, plot, verbose : forwarded to
            ``catboost.cv``.

        Returns
        -------
        The object returned by ``catboost.cv``.
        """
        self.dataset(X, y)

        _params = self.model.get_params()
        _params.update({'verbose': verbose})

        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              # Previously accepted but never forwarded.
                              stratified=stratified,
                              verbose=verbose,
                              plot=plot)
        if not verbose == 0:
            _best_step = np.argmax(_scores['test-Accuracy-mean'])
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.
                  format(np.max(_scores['test-Accuracy-mean']),
                         _scores['test-Accuracy-std'][_best_step],
                         _best_step))

        return _scores

    def copy(self, *args, **kwargs):
        """
        Return a new ``CatBoostClassifier`` carrying a copy of the model.

        The copy of ``self.model`` is stored on the returned object as
        ``catboost_classifier`` and the feature names as ``columns``.
        """
        # NOTE(review): the returned object is a bare CatBoostClassifier,
        # not a modelCatBoost wrapper - confirm this is what callers expect.
        returned_classifier = CatBoostClassifier()
        returned_classifier.catboost_classifier = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        """Set every keyword argument as an attribute on ``self.model``."""
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):
        """
        Save the model to ``<direct>/<name>_<timestamp>.dump``.

        The directory is created (including missing parents) when it does
        not exist.

        Raises
        ------
        NameError : when the directory cannot be created; the underlying
            ``OSError`` is chained as the cause.
        """
        if not os.path.isdir(direct):
            try:
                # makedirs also handles nested paths such as "a/b/c".
                os.makedirs(direct)
                print("Directorio creado: " + direct)
            except OSError as e:
                raise NameError("Error al crear el directorio") from e
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):
        """
        Load the model stored at ``<direct>/<name>.dump``.

        A message is printed when ``direct`` is not a directory; the load
        is attempted regardless.
        """
        if not os.path.isdir(direct):
            print("no existe el drectorio especificado")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Modelo cargado de la ruta: " + filename)

    def predict(self, X, *args, **kwargs):
        """Predict on the columns seen by ``dataset``, in the same order."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        """Like ``predict`` but calls the model's ``predict_proba``."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):
        """
        Rebuild the three pools using ``index_features`` as ``cat_features``.

        Requires ``dataset`` to have been called, since the stored splits
        are reused.
        """
        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)

        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):
        """
        Return the positional index in ``self.X`` of each name in ``features``.

        Raises
        ------
        NameError : when the resulting list is empty.
        """
        _index = [self.X.columns.get_loc(i) for i in features]
        if not _index:
            raise NameError("No coincide ninguna de las features introducidas")
        return _index

    def get_important_features(self, display=True):
        """
        Return the model's prettified feature importance.

        When ``display`` is true a bar plot of the importances is drawn.
        """
        # Computed once; the result of an earlier duplicate call was
        # discarded.
        _feature_importance_df = self.model.get_feature_importance(
            prettified=True)

        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost features importance:')

        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):
        """
        Start a ``MetricVisualizer`` over a set of training directories.

        Parameters
        ----------
        directs : optional iterable of extra directories.
        visu_model : when true, this model's ``train_dir`` is included
            first.

        Raises
        ------
        NameError : when no directory at all would be visualised.
        """
        # ``None`` and empty iterables both mean "no extra directories".
        # The previous ``len(directs) < 0`` test was never true and failed
        # on the default ``None``.
        _extra = list(directs) if directs else []
        if not _extra and not visu_model:
            raise NameError("No se ha seleccionado ningun directorio")

        directorios = []
        if visu_model:
            directorios.append(self.train_dir)
        directorios.extend(_extra)

        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):
        """
        Objective minimised by hyperopt: ``1 - best mean test AUC``.

        ``params`` must provide ``l2_leaf_reg``, ``learning_rate`` and
        ``bagging_temperature``; cross-validation runs on
        ``self.all_train_data``.
        """
        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            # Use the instance seed (default 99) instead of a literal.
            random_seed=self.random_state,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_accuracy = np.max(_cv_data['test-AUC-mean'])

        return 1 - best_accuracy

    def FineTune_hyperopt(self, X, y, mute=False):
        """
        Search ``l2_leaf_reg``, ``learning_rate`` and ``bagging_temperature``
        with hyperopt's TPE algorithm, then cross-validate the best setting.

        Returns
        -------
        The dictionary of best values returned by ``hyperopt.fmin``.
        """
        self.dataset(X, y)

        params_space = {
            'l2_leaf_reg':
            hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate':
            hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature':
            hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")

        # ``_parameters`` is the same dict as ``self.params``, so the best
        # values are also written into the instance's parameter set.
        _parameters = self.params
        _parameters.update(best)

        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())

        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """
        Randomised search with scikit-learn's ``RandomizedSearchCV``.

        Reference: https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning

        Parameters
        ----------
        X, y : data passed to ``dataset``; the search runs on the full
            ``self.X`` / ``self.y``.
        mute : silence the search and the printed summaries.
        n_splits : number of stratified folds.
        n_iter : number of sampled parameter settings.

        Returns
        -------
        (results, best_estimator) : a frame with the ranked search results
            and the refitted best estimator.
        """
        self.dataset(X, y)

        def build_search(modelo,
                         param_distributions,
                         cv=5,
                         n_iter=10,
                         verbose=1,
                         random_state=99):
            """
            Builder function for RandomizedSearch.
            """
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy':
                                          make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """
            Return pretty Pandas dataframe from the `cv_results_` attribute of finished parameter search,
            ranking by test performance and only keeping the columns of interest.
            """
            df = pd.DataFrame(cv_results)
            prefixes = ('param_', "mean_train", "std_train", "mean_test",
                        "std_test", 'mean_fit_time', 'rank')
            cols_of_interest = [
                key for key in df.keys() if key.startswith(prefixes)
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            """Fit the search and return its prettified results."""
            search.fit(X_train, y_train)
            # Honour ``mute``; the score used to be printed unconditionally.
            if not mute:
                print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }

        _verbose = 0 if mute else 1

        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)

        catboost_search = build_search(_model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       verbose=_verbose,
                                       cv=RepeatedStratifiedKFold(
                                           n_splits=n_splits,
                                           n_repeats=1,
                                           random_state=self.random_state),
                                       # Sample settings with the instance
                                       # seed instead of the helper default.
                                       random_state=self.random_state)
        catboost_cv_results = run_search(self.X,
                                         self.y,
                                         search=catboost_search,
                                         mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())

        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """
        Pass all other attribute lookups to ``self.model``.

        ``__getattr__`` only runs when normal lookup fails, so a lookup of
        ``model`` reaching this point means the wrapped model has not been
        set yet (for example on an instance created without ``__init__``).
        Raising here avoids unbounded recursion through ``self.model``.
        """
        if attr == "model":
            raise AttributeError(attr)
        return getattr(self.model, attr)
Beispiel #4
0
    def get_params_json(self):
        """
        (Re)load the XGBoost parameter sets through ``ParamsManager``.

        Sets ``manager_models`` / ``params`` from the manager built with
        ``key_read="Models"`` and ``manager_finetune`` / ``params_finetune``
        from the one built with ``key_read="FineTune"``; in both cases the
        "XGBoost" entry of ``get_params()`` is kept.
        """
        models_reader = ParamsManager(param_file, key_read="Models")
        self.manager_models = models_reader
        self.params = models_reader.get_params()["XGBoost"]

        finetune_reader = ParamsManager(param_file, key_read="FineTune")
        self.manager_finetune = finetune_reader
        self.params_finetune = finetune_reader.get_params()["XGBoost"]
Beispiel #5
0
class modelXGBoost(Training, BaseEstimator, ClassifierMixin):
    """
    XGBoost is an optimized distributed gradient boosting library designed to be highly efficient,
    flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework.
    XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solves many data science
    problems in a fast and accurate way. The same code runs on major distributed environments
    (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.

    This class wraps that library: it loads parameters through ``ParamsManager``,
    builds ``xgb.DMatrix`` objects from frames, trains with ``xgb.train`` and offers
    cross-validation, persistence and parameter-search helpers. Attributes not
    defined on the wrapper are looked up on ``self.model``.

    Parameters
    ----------
        "min_child_weight": Minimum sum of instance weight (hessian) needed in a child.
        "objective": Learning task.
        "eval_metric": Evaluation metrics for validation data.
        "max_depth": Maximum depth of a tree. Increasing this value will make the model more complex and more likely to overfit.
        "max_delta_step": Maximum delta step we allow each leaf output to be. If the value is set to 0, it means there is no constraint.
        "sampling_method": The method to use to sample the training instances.
        "subsample": Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees, and this will prevent overfitting.
        "eta": Step size shrinkage used in update to prevent overfitting.
        "gamma": Minimum loss reduction required to make a further partition on a leaf node of the tree.
        "lambda": L2 regularization term on weights. Increasing this value will make the model more conservative.
        "alpha": L1 regularization term on weights. Increasing this value will make the model more conservative.
        "tree_method": The tree construction algorithm used in XGBoost.
        "predictor": The type of predictor algorithm to use.
        "num_parallel_tree": Number of parallel trees constructed during each iteration.
        ...

    Documentation
    -------------
        https://xgboost.readthedocs.io/en/latest/
        https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    """
    def __init__(self,
                 name="XGB",
                 random_state=99,
                 train_dir="",
                 params=None,
                 *args,
                 **kwargs):
        """
        Build the wrapped ``XGBClassifier``.

        Parameters
        ----------
        name : used to derive the model directory name.
        random_state : seed stored on the instance.
        train_dir : prefix of the model directory.
        params : optional parameter dict. When ``None`` the parameters are
            loaded with ``get_params_json`` and ``model_dir`` / ``seed``
            are overwritten in them.
        *args, **kwargs : accepted and ignored.
        """
        self.name = name
        self.train_dir = train_dir + "/" + "model_" + str(self.name) + "/"
        self.random_state = random_state

        if params is None:
            self.get_params_json()
            self.params.update({
                'model_dir': self.train_dir,
                "seed": self.random_state
            })
        else:
            self.params = params

        self.model = XGBClassifier(**self.params)
        super().__init__(self.model, random_state=self.random_state)

    def get_params_json(self):
        """
        (Re)load ``params`` and ``params_finetune`` through ``ParamsManager``.

        The "XGBoost" entry is taken from the managers built with
        ``key_read="Models"`` and ``key_read="FineTune"`` respectively.
        """
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["XGBoost"]

        self.manager_finetune = ParamsManager(param_file, key_read="FineTune")
        self.params_finetune = self.manager_finetune.get_params()["XGBoost"]

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwarg):
        """
        Store the data, split it and build the ``xgb.DMatrix`` objects.

        ``y`` is integer-encoded with ``replace_multiclass`` before the
        split. Sets ``dtrain``, ``dvalid`` and ``all_train_data``.
        """
        self.categorical_columns_indices = categorical_columns_indices

        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        self.dvalid = xgb.DMatrix(self.X_test, label=self.y_test)
        self.all_train_data = xgb.DMatrix(self.X, label=self.y)

    def set_dataset_nosplit(self,
                            X_train,
                            X_test,
                            y_train,
                            y_test,
                            categorical_columns_indices=None,
                            *args,
                            **kwarg):
        """
        Same as ``dataset`` but for data that is already split.

        The targets of both splits are concatenated and encoded together
        with ``replace_multiclass`` so that train and test share one
        mapping, stored in ``cat_replace``. The previous code encoded each
        split separately and then discarded the result, leaving the raw
        labels in place.
        """
        self.categorical_columns_indices = categorical_columns_indices

        self.columns = list(X_train)

        self.X_train = X_train
        self.X_test = X_test
        self.X = pd.concat([X_train, X_test], axis=0)

        _y_all = pd.concat([y_train, y_test], axis=0)
        self.y, self.cat_replace = self.replace_multiclass(_y_all)
        # The concatenation keeps the train rows first, so a positional
        # cut recovers the two encoded splits.
        _n_train = len(y_train)
        self.y_train = self.y.iloc[:_n_train]
        self.y_test = self.y.iloc[_n_train:]

        self.dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        self.dvalid = xgb.DMatrix(self.X_test, label=self.y_test)
        self.all_train_data = xgb.DMatrix(self.X, label=self.y)

    def replace_multiclass(self, targets):
        """
        Replace every distinct target value by an integer code.

        Codes are assigned in order of first appearance (``0, 1, ...``).

        Returns
        -------
        (encoded, classes) : the encoded series and the list of original
            values, where ``classes[i]`` is the value mapped to code ``i``.
        """
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    @staticmethod
    def _labels_from_predictions(preds):
        """Turn raw booster output into labels: 0.5 threshold for 1-D
        scores, row-wise argmax for 2-D per-class scores."""
        preds = np.asarray(preds)
        if preds.ndim > 1:
            return np.argmax(preds, axis=1)
        return np.where(preds > 0.5, 1, 0)

    def _train_and_report(self, mute=False, num_boost_round=100, **kwargs):
        """Train a booster on ``self.dtrain`` with ``self.params`` and print
        the accuracy on the stored train / validation splits."""
        self.model = xgb.train(self.params,
                               self.dtrain,
                               num_boost_round=num_boost_round,
                               **kwargs)

        preds_test = self._labels_from_predictions(
            self.model.predict(self.dvalid))
        score_test = accuracy_score(self.y_test, preds_test)

        preds_train = self._labels_from_predictions(
            self.model.predict(self.dtrain))
        score_train = accuracy_score(self.y_train, preds_train)

        if not mute:
            print("Accurancy para el conjunto de entrenamiento ---> {:.2f}%".
                  format(score_train * 100))
            print("Accurancy para el conjunto de validacion ------> {:.2f}%".
                  format(score_test * 100))

    def fit(self,
            X=None,
            y=None,
            X_train=None,
            X_test=None,
            y_train=None,
            y_test=None,
            mute=False,
            use_best_model=True,
            verbose=0,
            num_boost_round=100,
            nosplit=False,
            **kwargs):
        """
        Prepare the data and train a booster with ``xgb.train``.

        After this call ``self.model`` is the trained booster.

        Parameters
        ----------
        X, y : full data, split internally (used when ``nosplit`` is false).
        X_train, X_test, y_train, y_test : pre-split data (used when
            ``nosplit`` is true).
        mute : suppress the accuracy report.
        use_best_model : accepted for signature compatibility; unused.
        verbose : written into the parameters as ``verbosity``.
        num_boost_round : number of boosting rounds.
        nosplit : choose between the two data inputs above.
        **kwargs : forwarded to ``xgb.train``.
        """
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbosity': verbose})
        self._train_and_report(mute=mute,
                               num_boost_round=num_boost_round,
                               **kwargs)

    def fit_cv(self,
               X=None,
               y=None,
               X_train=None,
               X_test=None,
               y_train=None,
               y_test=None,
               num_boost_round=75,
               nfold=5,
               use_best_model=True,
               verbose=2,
               nosplit=False,
               early_stopping_rounds=75,
               **kwargs):
        """
        Cross-validate the current parameters with ``xgb.cv``.

        https://xgboost.readthedocs.io/en/latest/parameter.html

        The metric read from the results is ``self.params["metrics"][0]``.
        The optimal round is the one with the best mean test value for that
        metric, and the reported score and deviation belong to that round.

        Returns
        -------
        (score, results) : the mean of the per-round mean test metric, and
            a dict with ``Rounds``, ``Score``, ``STDV``, ``LB`` and
            ``Parameters``.
        """
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        # ``verbose_eval`` is an argument of xgb.cv, not a booster
        # parameter, so it is passed to the call instead of being stored
        # in ``self.params``.
        self.xgb_cv = xgb.cv(self.params,
                             self.all_train_data,
                             num_boost_round,
                             nfold,
                             early_stopping_rounds=early_stopping_rounds,
                             stratified=True,
                             seed=self.random_state,
                             verbose_eval=verbose)

        metric = self.params["metrics"][0]
        loss = "test-" + metric
        mean_scores = self.xgb_cv[str(loss) + '-mean']
        std_scores = self.xgb_cv[str(loss) + '-std']

        # Ranking-style metrics are maximised; the rest (error, logloss,
        # rmse, ...) are minimised. The previous code always took the
        # round from argmin but the score from max, so the two disagreed.
        maximize = str(metric).startswith(("auc", "map", "ndcg", "pre"))
        if maximize:
            optimal_rounds = int(np.argmax(mean_scores.values))
        else:
            optimal_rounds = int(np.argmin(mean_scores.values))
        best_cv_score = mean_scores.iloc[optimal_rounds]
        best_cv_std = std_scores.iloc[optimal_rounds]

        if not verbose == 0:
            print("\nOptimal Round: {}\nOptimal Score: {:.3f} + std:{:.3f}".
                  format(optimal_rounds, best_cv_score, best_cv_std))

        results = {
            "Rounds": optimal_rounds,
            "Score": best_cv_score,
            "STDV": best_cv_std,
            "LB": None,
            "Parameters": self.params
        }

        score = mean_scores.mean()
        return score, results

    def func_acc(self, prob_pred, y_target):
        """
        Accuracy of the arg-max class of ``prob_pred`` against ``y_target``.

        ``prob_pred`` is indexed by position; each item is a sequence of
        per-class scores.
        """
        _y_pred = np.zeros(len(prob_pred))
        for i in range(0, len(prob_pred)):
            _y_pred[i] = int(np.argmax(prob_pred[i]))

        return accuracy_score(_y_pred, y_target)

    def predict(self, X, *args, **kwargs):
        """Raw booster prediction on the columns seen during training."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(xgb.DMatrix(_X_copy), *args, **kwargs)

    def pred_binary(self, X, *args, **kwargs):
        """Binary labels obtained by thresholding the prediction at 0.5."""
        _X_copy = X.loc[:, self.columns].copy()
        preds = self.model.predict(xgb.DMatrix(_X_copy), *args, **kwargs)
        return np.where(preds > 0.5, 1, 0)

    def pred_multiclass(self, X, *args, **kwargs):
        """List with the arg-max index of every prediction row."""
        _X_copy = X.loc[:, self.columns].copy()
        # Extra arguments are now forwarded, matching ``predict`` and
        # ``pred_binary``.
        return [
            np.argmax(line)
            for line in self.model.predict(xgb.DMatrix(_X_copy), *args,
                                           **kwargs)
        ]

    def update_model(self, **kwargs):
        """Set every keyword argument as an attribute on ``self.model``."""
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self,
                   direct="./checkpoints",
                   name="XGB_model",
                   file_model=".txt"):
        """
        Save the model to ``<direct>/<name>_<timestamp><file_model>``.

        ``".txt"`` uses the model's own ``save_model``; ``".pkl"`` uses
        ``joblib.dump``. The directory is created (including missing
        parents) when it does not exist.

        Raises
        ------
        NameError : when the directory cannot be created (the ``OSError``
            is chained as the cause) or ``file_model`` is not supported.
        """
        if not os.path.isdir(direct):
            try:
                # makedirs also handles nested paths such as "a/b/c".
                os.makedirs(direct)
                print("Directorio creado: " + direct)
            except OSError as e:
                raise NameError("Error al crear el directorio") from e

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

        if file_model == ".txt":
            filename = direct + "/" + name + "_" + current_time + ".txt"
            self.model.save_model(filename)
        elif file_model == ".pkl":
            filename = direct + "/" + name + "_" + current_time + ".pkl"
            joblib.dump(self.model, filename)
        else:
            raise NameError("Type {} not permited".format(file_model))
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self,
                   direct="./checkpoints/XGB_model.txt",
                   file_model=".txt"):
        """
        Load a model from the file path ``direct``.

        ``".txt"`` loads a booster saved with ``save_model``; ``".pkl"``
        loads a ``joblib`` dump. A message is printed when the path does
        not exist; the load is attempted regardless.

        Raises
        ------
        NameError : when ``file_model`` is not supported.
        """
        # ``direct`` is a file path, so an ``isdir`` test on it always
        # failed; check for existence instead.
        if not os.path.exists(direct):
            print("no existe el drectorio especificado")

        if file_model == ".txt":
            # ``fit`` stores an ``xgb.Booster`` and the predict helpers
            # feed it ``DMatrix`` objects, so a booster is rebuilt here.
            # ``XGBClassifier(model_file=...)`` does not load anything.
            self.model = xgb.Booster(model_file=direct)
        elif file_model == ".pkl":
            # NOTE(review): joblib.load unpickles the file; only load
            # files from a trusted source.
            self.model = joblib.load(direct)
        else:
            raise NameError("Type {} not permited".format(file_model))
        print("Modelo cargado de la ruta: " + direct)

    def index_features(self, features):
        """
        Return the positional index in ``self.X`` of each name in ``features``.

        Raises
        ------
        NameError : when the resulting list is empty.
        """
        _index = [self.X.columns.get_loc(i) for i in features]

        if not _index:
            raise NameError("No coincide ninguna de las features introducidas")

        return _index

    def get_important_features(self, display=True, max_num_features=20):
        """
        Return a frame of feature importances, sorted in descending order.

        The importances come from a fresh default ``XGBClassifier`` fitted
        on ``self.X`` / ``self.y``, not from ``self.model``.
        ``max_num_features`` is accepted but not used.
        """
        _model = XGBClassifier()
        _model.fit(self.X, self.y)
        _data = np.array([self.X.columns, _model.feature_importances_])
        _feature_importance_df = pd.DataFrame(
            _data.T, columns=["Feature Id", "Importances"])
        _feature_importance_df = _feature_importance_df.sort_values(
            by=['Importances'], ascending=False)

        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('XGBoost features importance:')

        return _feature_importance_df

    def FineTune_SearchCV(self,
                          X=None,
                          y=None,
                          X_train=None,
                          X_test=None,
                          y_train=None,
                          y_test=None,
                          params=None,
                          params_finetune=None,
                          ROC=False,
                          randomized=True,
                          cv=10,
                          n_iter=10,
                          replace_model=True,
                          verbose=0,
                          nosplit=False,
                          finetune_dir=""):
        """
        Search the fine-tune parameter space, retrain and report.

        Steps: reload the parameters, prepare the data, run
        ``self.FineTune`` on the training split, merge the best parameters
        into ``self.params``, train a booster on the training split, score
        it on the test split and finally run ``KFold_CrossValidation`` on
        the whole dataset.

        Parameters
        ----------
        X, y / X_train, X_test, y_train, y_test, nosplit : see ``fit``.
        params, params_finetune : optional replacements for the loaded
            parameter sets.
        ROC : forwarded to ``KFold_CrossValidation``.
        randomized, cv, n_iter : forwarded to ``self.FineTune``.
        replace_model : accepted for signature compatibility; unused.
        verbose : written into the parameters as ``verbosity`` for the
            search estimator.
        finetune_dir : prefix of the fine-tune directory.

        Returns
        -------
        (best_parameters, results_df) as returned by ``self.FineTune``.
        """
        self.get_params_json()
        self.finetune_dir = finetune_dir + "/" + "model_finetune_" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            "seed": self.random_state
        })
        if params is not None:
            self.params = params
        if params_finetune is not None:
            self.params_finetune = params_finetune

        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbosity': verbose})
        self.model = XGBClassifier(**self.params)

        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X_train,
            self.y_train,
            self.params_finetune,
            # Previously hard-coded to True, ignoring the argument.
            randomized=randomized,
            cv=cv,
            n_iter=n_iter,
            verbose=1)
        self.params.update(**self._best_Parameters)

        # Train on the existing training split. Going through ``fit`` here
        # would split the training data a second time and replace the
        # test split that is scored below.
        self.params.update({'verbosity': 1})
        self._train_and_report(mute=False, num_boost_round=100)

        score = accuracy_score(self.y_test, self.pred_multiclass(self.X_test))
        print(
            "Resultado del conjunto de test con los parametros optimos: {:.2f}%"
            .format(score * 100))
        print("\n")
        print("Report clasificacion con el conjunto de test: ")
        self.evaluate(self.model, xgb.DMatrix(self.X_test), self.y_test)
        print("\n")
        print("Cross validation con todos los datos del dataset: ")
        print("\n")
        # The message announces the whole dataset, so the full data is
        # used here rather than only the test split.
        self.KFold_CrossValidation(XGBClassifier(**self._best_Parameters),
                                   self.X,
                                   self.y,
                                   n_splits=cv,
                                   ROC=ROC,
                                   shuffle=True,
                                   mute=False,
                                   logdir_report="",
                                   display=True,
                                   save_image=True,
                                   verbose=0)

        return self._best_Parameters, self.results_df

    def SeedDiversification_cv(self,
                               X=None,
                               y=None,
                               X_train=None,
                               X_test=None,
                               y_train=None,
                               y_test=None,
                               n_iter=10,
                               n_max=2018 - 2022,
                               cv=10,
                               nosplit=False,
                               finetuneseed_dir="",
                               display=True,
                               save_image=True,
                               verbose=0):
        """
        Measure how the cross-validated scores vary with the random seed.

        ``n_iter`` seeds are drawn with ``np.random.uniform(0, n_max)`` and
        searched exhaustively (``randomized=False``) with ``self.FineTune``
        on the whole dataset; the mean and deviation of the AUC and
        accuracy columns are printed.

        Returns
        -------
        (best_parameters, results_df) as returned by ``self.FineTune``.
        """
        # NOTE(review): the default ``n_max`` evaluates to -4, so with the
        # default the sampled seeds lie between -4 and 0. Confirm the
        # intended upper bound.
        allmodelstart = time.time()
        self.get_params_json()

        self.finetune_dir = finetuneseed_dir + "/" + "model_finetune_seed" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            'verbosity': verbose
        })
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.model = XGBClassifier(**self.params)

        params_finetuneseed = {
            "seed":
            np.random.uniform(0, n_max, n_iter).astype(np.int32).tolist()
        }

        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X,
            self.y,
            params_finetuneseed,
            randomized=False,
            cv=cv,
            n_iter=n_iter,
            verbose=1,
            mute=True)

        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))

        print(
            "Diversificacion de la semilla - mean AUC: {:.2f}% - std AUC: {:.5f}"
            .format(self.results_df['mean_test_AUC'].mean() * 100,
                    self.results_df['std_test_AUC'].mean()))

        print(
            "Diversificacion de la semilla - mean Acc: {:.2f}% - std Acc: {:.5f}"
            .format(self.results_df['mean_test_Accuracy'].mean() * 100,
                    self.results_df['std_test_Accuracy'].mean()))

        return self._best_Parameters, self.results_df

    def __getattr__(self, attr):
        """
        Pass all other attribute lookups to ``self.model``.

        ``__getattr__`` only runs when normal lookup fails, so a lookup of
        ``model`` reaching this point means the wrapped model has not been
        set yet. Raising here avoids unbounded recursion through
        ``self.model``.
        """
        if attr == "model":
            raise AttributeError(attr)
        return getattr(self.model, attr)
Beispiel #6
0
 def __init__(self, random_state=99):
     """
     Initialise the dataframe pre-processing class.

     Stores ``random_state`` and creates the ``ParamsManager`` built with
     ``key_read="Models"`` over ``param_file``.
     """
     models_reader = None
     self.random_state = random_state
     models_reader = ParamsManager(param_file, key_read="Models")
     self.manager_models = models_reader
Beispiel #7
0
class PipelineClasificators(Training):
    def __init__(self, random_state=99):
        """
        Initialise the model pipeline.

        Stores ``random_state`` and creates two ``ParamsManager`` objects over
        ``param_file``: one with ``key_read="Models"`` and one with
        ``key_read="FineTune"``.
        """
        self.random_state = random_state
        models_manager = ParamsManager(param_file, key_read="Models")
        self.manager_models = models_manager
        finetune_manager = ParamsManager(param_file, key_read="FineTune")
        self.manager_finetune = finetune_manager

    def add_model(self, model):
        """
        Register ``model`` as the currently selected estimator.

        The object is stored unchanged on ``self.model``; nothing is returned.
        """
        setattr(self, "model", model)

    def KNearestNeighbors(self):
        """
        Build a ``KNeighborsClassifier`` with its default settings.

        The estimator is stored on ``self.KNN`` and returned.
        """
        knn = KNeighborsClassifier()
        self.KNN = knn
        return knn

    def NaiveBayes(self):
        """
        Build a ``GaussianNB`` classifier with its default settings.

        Gaussian Naive Bayes models each feature as normally distributed;
        ``Pipeline_SelectModel`` scales the inputs with ``MaxAbsScaler``
        before handing them to this model.

        The estimator is stored on ``self.NB`` and returned.
        """
        gaussian_nb = GaussianNB()
        self.NB = gaussian_nb
        return gaussian_nb

    def RandomForestClassifier(self):
        """
        Build a random forest from the "RandomForestClassifier" section of
        the model parameters.

        Hyper-parameters read from the configuration:
            n_estimators: number of trees in the forest.
            criterion: function used to measure the quality of a split.
            max_depth: maximum number of levels in each decision tree.
            min_samples_split: minimum number of data points placed in a node
                before the node is split.
            min_samples_leaf: minimum number of data points allowed in a leaf.
            min_weight_fraction_leaf, min_impurity_decrease, warm_start,
                verbose: forwarded to the estimator unchanged.
            max_features: maximum number of features considered when
                splitting a node.
            bootstrap: how data points are sampled (with or without
                replacement).
            oob_score: whether out-of-bag samples are used for scoring.
            n_jobs: parallelism of the computation.

        ``random_state`` always comes from ``self.random_state``.

        Returns:
            The unfitted estimator, which is also stored on ``self.modelRF``.

        Raises:
            KeyError: if the section or one of the keys above is missing.
        """
        # Read the configuration once instead of once per hyper-parameter.
        rf_config = self.manager_models.get_params()["RandomForestClassifier"]
        configured_keys = (
            "n_estimators",
            "criterion",
            "max_depth",
            "min_samples_split",
            "min_samples_leaf",
            "min_weight_fraction_leaf",
            "max_features",
            "min_impurity_decrease",
            "bootstrap",
            "oob_score",
            "n_jobs",
            "verbose",
            "warm_start",
        )
        rf_kwargs = {key: rf_config[key] for key in configured_keys}

        # Inside a method body this name resolves to the module-level
        # estimator class, not to this method of the same name.
        self.modelRF = RandomForestClassifier(random_state=self.random_state,
                                              **rf_kwargs)
        return self.modelRF

    def AdaBoostClassifier(self, **params):
        """
        Create an AdaBoost classifier seeded with ``self.random_state``.

        Extra keyword arguments are forwarded to the scikit-learn constructor
        unchanged.
        """
        # Imported under an alias so it is not confused with this method.
        from sklearn.ensemble import AdaBoostClassifier as _AdaBoost
        seed = self.random_state
        return _AdaBoost(random_state=seed, **params)

    def GradientBoostingClassifier(self, **params):
        """
        Create a gradient boosting classifier seeded with
        ``self.random_state``.

        Extra keyword arguments are forwarded to the scikit-learn constructor
        unchanged.
        """
        # Imported under an alias so it is not confused with this method.
        from sklearn.ensemble import GradientBoostingClassifier as _GradientBoosting
        seed = self.random_state
        return _GradientBoosting(random_state=seed, **params)

    def ExtraTreesClassifier(self, **params):
        """
        Create an extra-trees classifier seeded with ``self.random_state``.

        Extra keyword arguments are forwarded to the scikit-learn constructor
        unchanged.
        """
        # Imported under an alias so it is not confused with this method.
        from sklearn.ensemble import ExtraTreesClassifier as _ExtraTrees
        seed = self.random_state
        return _ExtraTrees(random_state=seed, **params)

    def SupportVectorMachine(self, **params):
        """
        Build an ``SVC`` from the given keyword arguments.

        Unlike the other factories of this class, no random seed is injected.
        The estimator is stored on ``self.SVM`` and returned.
        """
        svm = SVC(**params)
        self.SVM = svm
        return svm

    def XGBoost(self, name="CBT"):
        """
        Create the ``modelXGBoost`` wrapper, seeded with ``self.random_state``.

        The wrapper is stored on ``self.modelXGBoost`` and returned.

        Main XGBoost hyper-parameters, for reference:
        "min_child_weight": minimum sum of instance weight (hessian) needed in a child.
        "objective": learning task.
        "eval_metric": evaluation metrics for validation data.
        "max_depth": maximum depth of a tree. Increasing this value makes the model more complex and more likely to overfit.
        "max_delta_step": maximum delta step allowed for each leaf output. A value of 0 means there is no constraint.
        "sampling_method": the method used to sample the training instances.
        "subsample": subsample ratio of the training instances. Setting it to 0.5 means XGBoost randomly samples half of the training data before growing trees, which helps prevent overfitting.
        "eta": step size shrinkage used in updates to prevent overfitting.
        "gamma": minimum loss reduction required to make a further partition on a leaf node of the tree.
        "lambda": L2 regularization term on weights. Increasing this value makes the model more conservative.
        "alpha": L1 regularization term on weights. Increasing this value makes the model more conservative.
        "tree_method": the tree construction algorithm used in XGBoost.
        "predictor": the type of predictor algorithm to use.
        "num_parallel_tree": number of parallel trees constructed during each iteration.

        https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
        """
        # NOTE(review): the default name "CBT" matches the CatBoost factory
        # and looks copy-pasted; callers in this class pass name="XGBoost".
        booster = modelXGBoost(name=name, random_state=self.random_state)
        self.modelXGBoost = booster
        return booster

    def LightBoost(self, name="LBT"):
        """
        Create the ``modelLightBoost`` wrapper, seeded with
        ``self.random_state``.

        The wrapper is stored on ``self.LBoost`` and returned.
        """
        booster = modelLightBoost(name=name, random_state=self.random_state)
        self.LBoost = booster
        return booster

    def CatBoost(self, name="CBT"):
        """
        Create the ``modelCatBoost`` wrapper, seeded with
        ``self.random_state``.

        The wrapper is stored on ``self.CBoost`` and returned.
        """
        booster = modelCatBoost(name=name, random_state=self.random_state)
        self.CBoost = booster
        return booster

    def append_summary(self, model, X_train, X_test, y_train, y_test, name):
        """
        Evaluate ``model`` and record its timings in the run summary.

        The model is scored through ``self.eval_Kfold_CV`` (using
        ``self.n_splits`` folds, shuffled and muted) and then asked to predict
        ``X_test`` once. The elapsed time of each step is measured with
        ``time.perf_counter``.

        Side effects: appends ``name`` to ``self.names``, the evaluation time
        to ``self.utrain`` and the prediction time to ``self.utimes``.

        Returns:
            The first element returned by ``eval_Kfold_CV``.
        """
        clock = time.perf_counter

        fit_started = clock()
        score, _cv_table, _cv_score = self.eval_Kfold_CV(
            model,
            X_train,
            X_test,
            y_train,
            y_test,
            n_splits=self.n_splits,
            shuffle=True,
            mute=True)
        fit_finished = clock()

        predict_started = clock()
        model.predict(X_test)
        predict_finished = clock()

        self.names.append(name)
        self.utrain.append(fit_finished - fit_started)
        self.utimes.append(predict_finished - predict_started)

        return score

    def Pipeline_SelectModel(self, X, y, n_splits=5, select="XGBoost"):
        """
        Benchmark every model enabled under "select_models" in the model
        parameters and summarise score and timings.

        The data is split 80/20 with ``self.random_state``. The scikit-learn
        models (KNN, Naive Bayes, SVM, random forest) are scored through
        ``append_summary`` on that split; Naive Bayes receives inputs scaled
        with ``MaxAbsScaler``. The boosting wrappers (XGBoost, LightGBM,
        CatBoost) are scored with their own ``fit_cv`` on the full ``X, y``.

        Args:
            X: feature DataFrame (its ``columns`` are stored on
                ``self.features``).
            y: target values.
            n_splits: number of folds used by the evaluations.
            select: label of the model to register through ``add_model``;
                one of "KNearestNeighbors", "NaiveBayes",
                "SupportVectorMachine", "RandomForestClassifier", "XGBoost",
                "LightGBM", "CatBoost".

        Returns:
            DataFrame with the columns "Modelo", "Mean Accuracy",
            "Tiempo Entrenamiento" and "Tiempo Prediccion", one row per
            evaluated model.

        Side effects: resets ``self.scores``, ``self.names``, ``self.utrain``
        and ``self.utimes``, sets ``self.n_splits`` and ``self.features``, and
        prints one line per evaluated model.
        """
        # Per-run bookkeeping: one entry per evaluated model.
        self.scores = []
        self.names = []
        self.utrain = []
        self.utimes = []
        self.n_splits = n_splits
        self.features = X.columns.tolist()

        # Training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state)

        # Read the on/off switches once instead of once per model.
        enabled = self.manager_models.get_params()["select_models"]

        def _evaluate(label, model, train_X, test_X):
            # Score one scikit-learn style model, record it and report it.
            score = self.append_summary(model, train_X, test_X, y_train,
                                        y_test, label)
            self.scores.append(score)
            if select == label:
                self.add_model(model)
            print("Modelo: {} --> Mean Accuracy: {:.3f}%\n".format(
                label, score * 100))

        # KNearestNeighbors
        if enabled["KNN"]:
            _evaluate("KNearestNeighbors", self.KNearestNeighbors(), X_train,
                      X_test)

        # NaiveBayes
        if enabled["NaiveBayes"]:
            from sklearn.preprocessing import MaxAbsScaler
            _model = self.NaiveBayes()
            scaler_gnb = MaxAbsScaler()
            sdss_train = scaler_gnb.fit_transform(X_train)
            # The test set must be scaled with the statistics learned on the
            # training set; re-fitting the scaler here (as before) scales
            # train and test differently and leaks test information.
            sdss_test = scaler_gnb.transform(X_test)
            pd_sdss_train = pd.DataFrame(columns=X_train.columns.tolist(),
                                         data=sdss_train)
            pd_sdss_test = pd.DataFrame(columns=X_test.columns.tolist(),
                                        data=sdss_test)
            _evaluate("NaiveBayes", _model, pd_sdss_train, pd_sdss_test)

        # SupportVectorMachine
        if enabled["SVM"]:
            _evaluate("SupportVectorMachine", self.SupportVectorMachine(),
                      X_train, X_test)

        # RandomForestClassifier
        if enabled["RandomForestClassifier"]:
            _evaluate("RandomForestClassifier", self.RandomForestClassifier(),
                      X_train, X_test)

        # XGBoost
        if enabled["XGBoost"]:
            _model = self.XGBoost(name="XGBoost")
            _model.fit(X, y, verbose=0, mute=True)
            train_start = time.perf_counter()
            score, _ = _model.fit_cv(X, y, nfold=n_splits, verbose=0)
            self.scores.append(score)
            train_end = time.perf_counter()

            prediction_start = time.perf_counter()
            _model.predict(X_test)
            prediction_end = time.perf_counter()

            if select == "XGBoost":
                self.add_model(_model)

            print("Modelo: XGBoost --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))

            self.names.append("XGBoost")
            self.utrain.append(train_end - train_start)
            self.utimes.append(prediction_end - prediction_start)

        # LightGBM
        if enabled["LightGBM"]:
            _model = self.LightBoost(name="LBT")
            _model.fit(X, y, verbose=0, mute=True)
            train_start = time.perf_counter()
            score, _ = _model.fit_cv(X, y, nfold=n_splits, verbose=0)
            self.scores.append(score)
            train_end = time.perf_counter()

            prediction_start = time.perf_counter()
            _model.predict(X_test)
            prediction_end = time.perf_counter()

            # NOTE(review): here the inner estimator is registered, while the
            # XGBoost branch registers the wrapper itself - confirm intent.
            if select == "LightGBM":
                self.add_model(_model.model)

            print("Modelo: LightGBM --> Mean Accuracy: {:.3f}%\n".format(
                score * 100))

            self.names.append("LightGBM")
            self.utrain.append(train_end - train_start)
            self.utimes.append(prediction_end - prediction_start)

        # CatBoost
        if enabled["CatBoost"]:
            _model = self.CatBoost(name="CBT")
            train_start = time.perf_counter()
            score = _model.fit_cv(X,
                                  y,
                                  fold_count=self.n_splits,
                                  shuffle=True,
                                  stratified=True,
                                  plot=False,
                                  verbose=0)
            mean_accuracy = np.mean(score["test-Accuracy-mean"])
            self.scores.append(mean_accuracy)
            train_end = time.perf_counter()

            _model.fit(X, y, plot=False, verbose=0)
            prediction_start = time.perf_counter()
            _model.model.predict(_model.eval_data)
            prediction_end = time.perf_counter()

            if select == "CatBoost":
                self.add_model(_model.model)

            print("Modelo: CatBoost --> Mean Accuracy: {:.3f}%\n".format(
                mean_accuracy * 100))

            self.names.append("CatBoost")
            self.utrain.append(train_end - train_start)
            self.utimes.append(prediction_end - prediction_start)

        resultados = pd.DataFrame({
            "Modelo": self.names,
            "Mean Accuracy": self.scores,
            "Tiempo Entrenamiento": self.utrain,
            "Tiempo Prediccion": self.utimes
        })

        return resultados

    def Pipeline_SelectEmsembleModel(self,
                                     X,
                                     y,
                                     n_splits=10,
                                     mute=False,
                                     scoring="accuracy",
                                     display=True,
                                     save_image=False,
                                     path="/",
                                     AB=True):
        """
        Cross-validate a fixed set of ensemble classifiers and compare them.

        The candidates are AdaBoost (only when ``AB`` is true), gradient
        boosting, extra trees, random forest, XGBoost and LightGBM, all
        seeded with ``self.random_state`` and scored with stratified k-fold
        cross-validation on the full ``X, y``.

        Args:
            X, y: features and targets used for cross-validation.
            n_splits: number of stratified folds.
            mute: when false, print "<name>: <mean> (<std>)" per model.
            scoring: scoring name passed to ``cross_val_score``.
            display: draw a box plot and a mean +/- std plot of the scores.
            save_image: also write the figure to ``path``; only honoured
                when ``display`` is true.
            path: destination passed to ``savefig``.
            AB: include AdaBoost among the candidates.

        Returns:
            DataFrame with one column per model and one row per fold.
        """
        ensembles = []
        if AB:
            ensembles.append(('AB', self.AdaBoostClassifier()))
        ensembles.append(('GBM', self.GradientBoostingClassifier()))
        ensembles.append(('ET', self.ExtraTreesClassifier()))
        ensembles.append(
            ('RF', RandomForestClassifier(random_state=self.random_state)))
        ensembles.append(
            ('XGB', XGBClassifier(random_state=self.random_state)))
        ensembles.append(
            ('LGBM', LGBMClassifier(random_state=self.random_state)))

        # random_state only has a meaning together with shuffle=True, and
        # recent scikit-learn raises ValueError when it is passed without it.
        # The folds were never shuffled, so the seed is simply dropped.
        kfold = StratifiedKFold(n_splits=n_splits)

        results = []
        names = []
        for name, model in ensembles:
            cv_results = cross_val_score(model,
                                         X,
                                         y,
                                         cv=kfold,
                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            if not mute:
                print("%s: %f (%f)" %
                      (name, cv_results.mean(), cv_results.std()))

        scores = pd.DataFrame(np.asarray(results).T, columns=names)

        if display:
            figure, axs = plt.subplots(1, 2, figsize=(16, 5))
            ax = axs.flatten()

            # Compare Algorithms
            ax[0].set_title('Ensemble Algorithm Comparison')
            ax[0].boxplot(results)
            ax[0].set_xticklabels(names)

            # Both panels use the evaluated model names as x labels, so they
            # cannot drift apart from the candidate list above.
            scores_mean = np.mean(scores, axis=0)
            scores_std = np.std(scores, axis=0)
            ax[1].grid()
            ax[1].fill_between(names,
                               scores_mean - scores_std,
                               scores_mean + scores_std,
                               alpha=0.1,
                               color="r")
            ax[1].plot(names, scores_mean, 'o-', color="r", label="CV score")
            ax[1].legend(loc="best")
            ax[1].set_title('Cross-validation score')
            figure.tight_layout()

            # Save before showing: once show() returns the figure may
            # already be closed and savefig would write an empty canvas.
            if save_image:
                figure.savefig(path)
            plt.show()

        return scores

    def Pipeline_FeatureSelect(self,
                               X,
                               y,
                               n_splits=10,
                               mute=False,
                               scoring="accuracy",
                               n_features=20,
                               display=True,
                               save_image=False,
                               path="/"):
        """
        Rank features per model and cross-validate each model on its own
        top ``n_features``.

        For gradient boosting, extra trees, random forest, XGBoost and
        LightGBM the model is fitted on the full ``X, y``, the features are
        ordered by ``feature_importances_`` (ascending) and the last
        ``n_features`` of that ordering are used for stratified k-fold
        cross-validation.

        Args:
            X: feature DataFrame.
            y: targets.
            n_splits: number of stratified folds.
            mute: when false, print progress and results.
            scoring: scoring name passed to ``cross_val_score``.
            n_features: number of top-ranked features kept per model.
            display: draw a box plot and a mean +/- std plot of the scores.
            save_image: also write the figure to ``path``; only honoured
                when ``display`` is true.
            path: destination passed to ``savefig``.

        Returns:
            ``(scores, df)``: ``scores`` has one column per model and one row
            per fold; ``df`` has, per model, the columns
            "features_<name>" and "importance<name>" in ascending order of
            importance.
        """
        models = [
            ('GBM', self.GradientBoostingClassifier()),
            ('ET', self.ExtraTreesClassifier()),
            ('RF', RandomForestClassifier(random_state=self.random_state)),
            ('XGB', XGBClassifier(random_state=self.random_state)),
            ('LGBM', LGBMClassifier(random_state=self.random_state)),
        ]

        # random_state only has a meaning together with shuffle=True, and
        # recent scikit-learn raises ValueError when it is passed without it.
        # The folds were never shuffled, so the seed is simply dropped.
        kfold = StratifiedKFold(n_splits=n_splits)

        results = []
        names = []
        df = pd.DataFrame()

        for name, model in models:
            if not mute:
                print("modelo: {}".format(name))
                print(".... Fitting")
            model.fit(X, y)
            if not mute:
                print(".... Permutation importance")
            # NOTE(review): the permutation importances are computed but not
            # used; the ranking below relies on feature_importances_ only.
            # Either use this result or drop the (expensive) call.
            permutation_importance(model,
                                   X,
                                   y,
                                   n_repeats=10,
                                   random_state=99)
            importance_order = np.argsort(model.feature_importances_)

            name_features = "features_" + str(name)
            imp_features = "importance" + str(name)
            df[name_features] = X.columns[importance_order]
            df[imp_features] = model.feature_importances_[importance_order]

            # Ascending order: the most important features are at the end.
            features = df[name_features].values.tolist()[-n_features:]
            selected_X = X[features]
            if not mute:
                print(".... Select Features:")
                print(features)
                print(".... Cross Validation")
            cv_results = cross_val_score(model,
                                         selected_X,
                                         y,
                                         cv=kfold,
                                         scoring=scoring)
            results.append(cv_results)
            names.append(name)
            if not mute:
                print(".... Append Results:")
                print("%s: %f (%f)" %
                      (name, cv_results.mean(), cv_results.std()))
                print("\n")

        scores = pd.DataFrame(np.asarray(results).T, columns=names)

        if display:
            figure, axs = plt.subplots(1, 2, figsize=(16, 5))
            ax = axs.flatten()

            # Compare Algorithms
            ax[0].set_title('Algorithm Comparison')
            ax[0].boxplot(results)
            ax[0].set_xticklabels(names)

            # Both panels use the evaluated model names as x labels, so they
            # cannot drift apart from the model list above.
            scores_mean = np.mean(scores, axis=0)
            scores_std = np.std(scores, axis=0)
            ax[1].grid()
            ax[1].fill_between(names,
                               scores_mean - scores_std,
                               scores_mean + scores_std,
                               alpha=0.1,
                               color="r")
            ax[1].plot(names, scores_mean, 'o-', color="r", label="CV score")
            ax[1].legend(loc="best")
            ax[1].set_title('Cross-validation score')
            figure.tight_layout()

            # Save before showing: once show() returns the figure may
            # already be closed and savefig would write an empty canvas.
            if save_image:
                figure.savefig(path)
            plt.show()

        return scores, df

    def Pipeline_StackingClassifier(self, X, y, n_splits=5):
        """
        Collect the models enabled under "stacking_models" in the model
        parameters.

        ``self.models`` is reset and then filled with ``(label, model)``
        tuples, in a fixed order, for every entry whose switch is truthy.
        ``X``, ``y`` and ``n_splits`` are currently not used and nothing is
        returned.
        """
        # Model list
        self.models = []

        # (configuration key, label, factory, factory keyword arguments)
        candidates = (
            ("KNN", "KNearestNeighbors", self.KNearestNeighbors, {}),
            ("NaiveBayes", "NaiveBayes", self.NaiveBayes, {}),
            ("SVM", "SupportVectorMachine", self.SupportVectorMachine, {}),
            ("RandomForestClassifier", "RandomForestClassifier",
             self.RandomForestClassifier, {}),
            ("XGBoost", "XGBoost", self.XGBoost, {"name": "XGBoost"}),
            ("LightGBM", "LightGBM", self.LightBoost, {"name": "LBT"}),
            ("CatBoost", "CatBoost", self.CatBoost, {"name": "CBT"}),
        )

        for switch, label, factory, factory_kwargs in candidates:
            if self.manager_models.get_params()["stacking_models"][switch]:
                self.models.append((label, factory(**factory_kwargs)))

    def _cv_results(self, X_train, Y_train, model, kfold, name, verbose=1):
        """
        Cross-validate ``model`` on accuracy and optionally report the result.

        Args:
            X_train, Y_train: data passed to ``cross_val_score``.
            model: estimator to evaluate.
            kfold: value passed as ``cv``.
            name: label used in the printed summary.
            verbose: when greater than 0, print "<name>: <mean> (<std>)".

        Returns:
            The per-fold scores returned by ``cross_val_score``.
        """
        fold_scores = cross_val_score(model,
                                      X_train,
                                      Y_train,
                                      cv=kfold,
                                      scoring='accuracy')
        summary = "%s: %f (%f)" % (name, fold_scores.mean(),
                                   fold_scores.std())
        if verbose > 0:
            print(summary)
        return fold_scores

    def Ablacion_relativa(self,
                          pipeline,
                          X,
                          y,
                          n_splits=10,
                          mute=False,
                          std=True,
                          scoring="accuracy",
                          display=True,
                          save_image=False,
                          path="/"):
        """
        Relative ablation study of preprocessing steps.

        For each of a fixed set of classifiers (AdaBoost, gradient boosting,
        random forest, extra trees, LightGBM, XGBoost, SVM, KNN) the model is
        cross-validated on the raw ``X`` (step "Inicial") and then once per
        ``(name, transformer)`` entry of ``pipeline``, each transformer being
        applied to the original ``X`` through ``fit_transform(X, y)``.

        Args:
            pipeline: iterable of ``(name, transformer)`` pairs; it is
                iterated once per classifier, so it must be re-iterable.
            X, y: features and targets.
            n_splits: number of stratified folds.
            mute: when true, nothing is printed.
            std: shade +/- one standard deviation around each curve.
            scoring: scoring name passed to ``cross_val_score``.
            display: show the figure.
            save_image: write the figure to ``path``.
            path: destination passed to ``savefig``.

        Returns:
            ``(scores_mean, scores_std)``: two lists with one entry per
            classifier, each entry holding the per-step mean / standard
            deviation of the fold scores.
        """
        # random_state only has a meaning together with shuffle=True, and
        # recent scikit-learn raises ValueError when it is passed without it.
        # The folds were never shuffled, so the seed is simply dropped.
        kfold = StratifiedKFold(n_splits=n_splits)

        models = [
            ('AB', self.AdaBoostClassifier()),
            ('GBM', self.GradientBoostingClassifier()),
            ('RF', RandomForestClassifier(random_state=self.random_state)),
            ('ET', self.ExtraTreesClassifier()),
            ('LGM', LGBMClassifier(random_state=self.random_state)),
            ('XGB', XGBClassifier(random_state=self.random_state)),
            ('SVM', self.SupportVectorMachine()),
            ('KNN', self.KNearestNeighbors()),
        ]

        def _score(features, estimator, step):
            # Scored here (rather than through a fixed-accuracy helper) so
            # that both `scoring` and `mute` are actually honoured.
            fold_scores = cross_val_score(estimator,
                                          features,
                                          y,
                                          cv=kfold,
                                          scoring=scoring)
            if not mute:
                print("%s: %f (%f)" %
                      (step, fold_scores.mean(), fold_scores.std()))
            return fold_scores

        scores_mean = []
        scores_std = []
        names_models = []

        for name_model, model in models:
            if not mute:
                print("\n", name_model)

            # Baseline first, then one entry per preprocessing step.
            names = ["Inicial"]
            results = [_score(X, model, "Inicial")]

            for name, transf in pipeline:
                results.append(_score(transf.fit_transform(X, y), model, name))
                names.append(name)

            scores = pd.DataFrame(np.asarray(results).T, columns=names)
            scores_mean.append(np.mean(scores, axis=0))
            scores_std.append(np.std(scores, axis=0))
            names_models.append(name_model)

        # Build the figure whenever it is needed, so save_image also works
        # with display=False instead of saving an unrelated/blank figure.
        if display or save_image:
            fig, ax = plt.subplots(figsize=(14, 6))

            for label, mean, spread in zip(names_models, scores_mean,
                                           scores_std):
                # Improvement of every step relative to the baseline.
                valor = mean - mean.iloc[0]
                if std:
                    ax.fill_between(names,
                                    valor - spread,
                                    valor + spread,
                                    alpha=0.1)
                ax.plot(names, valor, 'o-', label=label, alpha=0.9)

            ax.plot(names,
                    np.zeros(len(names)),
                    'ro--',
                    label="cero",
                    alpha=0.9)

            ax.grid()
            ax.legend(loc="best")
            ax.set_title('Mejoras relativas al modelo Inicial')
            fig.tight_layout()

            # Save through the figure itself, before it is shown.
            if save_image:
                fig.savefig(path)
            if display:
                fig.show()

        return scores_mean, scores_std

    def features_importances(self,
                             clf,
                             X,
                             y,
                             display=True,
                             save_image=False,
                             path="/"):
        """
        Fit ``clf`` and report its feature importances.

        The data is split with a stratified ``train_test_split``
        (``random_state=99``), ``clf`` is fitted on the training part and its
        accuracy on the test part is printed.

        Args:
            clf: estimator exposing ``fit``, ``score`` and
                ``feature_importances_``.
            X: feature DataFrame.
            y: targets.
            display: show a bar plot of the importances.
            save_image: write the bar plot to ``path``.
            path: destination passed to ``savefig``.

        Returns:
            DataFrame with the columns "feature" and "importance", sorted by
            ascending importance.
        """
        import seaborn as sns
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=99,
                                                            stratify=y)

        clf.fit(X_train, y_train)
        print("Accuracy on test data: {:.2f}".format(clf.score(X_test,
                                                               y_test)))

        # NOTE(review): the permutation importances are computed but not
        # used; the table below relies on feature_importances_ only. Either
        # use this result or drop the (expensive) call.
        permutation_importance(clf,
                               X_train,
                               y_train,
                               n_repeats=10,
                               random_state=99)

        importance_order = np.argsort(clf.feature_importances_)

        df = pd.DataFrame()
        df["feature"] = X.columns[importance_order]
        df["importance"] = clf.feature_importances_[importance_order]

        # Build the figure whenever it is needed, so save_image also works
        # with display=False instead of saving an unrelated/blank figure.
        if display or save_image:
            fig, ax = plt.subplots(figsize=(10, 30))
            sns.barplot(x="importance",
                        y="feature",
                        data=df.sort_values(by="importance", ascending=False),
                        ax=ax)
            ax.set_title('Features (avg over folds)')

            # Save before showing: once show() returns the figure may
            # already be closed and savefig would write an empty canvas.
            if save_image:
                fig.savefig(path)
            if display:
                plt.show()

        return df

    def eval_Kfold_CV(self,
                      model,
                      X,
                      X_test,
                      y,
                      y_test,
                      n_splits=3,
                      shuffle=True,
                      mute=True):
        """
        Run k-fold cross-validation for ``model`` and score it on a hold-out
        set.

        The cross-validation is delegated to ``KFold_CrossValidation`` of the
        object returned by ``Training.add_model(model)``.
        NOTE(review): ``Training.add_model`` is defined outside this view;
        presumably it returns a trainer object - confirm.

        Returns:
            ``(score, resultados, score_general_test)`` where ``score`` is the
            accuracy of ``model.predict(X_test)`` against ``y_test`` and the
            other two values come from ``KFold_CrossValidation``.
        """
        trainer = Training.add_model(model)

        resultados, score_general_test = trainer.KFold_CrossValidation(
            model, X, y, n_splits=n_splits, shuffle=shuffle, mute=mute)

        holdout_predictions = model.predict(X_test)
        holdout_accuracy = accuracy_score(y_true=y_test,
                                          y_pred=holdout_predictions)

        return holdout_accuracy, resultados, score_general_test

    def func_acc(self, prob_pred, y_target):
        """
        Accuracy of class-probability predictions.

        Each entry ``prob_pred[i]`` is reduced to the position of its largest
        value with ``np.argmax`` and the resulting labels (stored as floats)
        are compared against ``y_target`` with ``accuracy_score``.
        """
        predicted_labels = np.array(
            [int(np.argmax(prob_pred[row])) for row in range(len(prob_pred))],
            dtype=float)

        return accuracy_score(predicted_labels, y_target)

    def pred_binary(self, prob_pred, y_target, th=0.5):
        """
        Accuracy of binary predictions obtained by thresholding.

        Values of ``prob_pred`` strictly greater than ``th`` become 1, all
        others 0, and the result is compared against ``y_target``.
        """
        predicted_labels = np.where(prob_pred > th, 1, 0)
        return accuracy_score(y_target, predicted_labels)

    def replace_multiclass(self, targets):
        """
        Encode the distinct values of ``targets`` as consecutive integers.

        The codes 0, 1, 2, ... are assigned in order of first appearance.

        Returns:
            ``(encoded, categories)``: the series with every value replaced
            by its code, and the list of original values so that
            ``categories[code]`` recovers the original label.
        """
        categories = targets.unique().tolist()
        codes = list(range(len(categories)))
        encoded = targets.replace(categories, codes)
        return encoded, categories

    def Pipeline_GridSearch(self):
        """
        Placeholder for a grid-search pipeline; not implemented yet.

        Does nothing and returns ``None``.
        """
        return None
Beispiel #8
0
class modelLightBoost(Training, BaseEstimator, ClassifierMixin):
    """
    Ejemplo multiclass:
    https://www.kaggle.com/nicapotato/multi-class-lgbm-cv-and-seed-diversification
    """
    def __init__(self,
                 name="LGB",
                 random_state=99,
                 train_dir="",
                 params=None,
                 *args,
                 **kwargs):
        """
        Build the wrapper and its underlying ``LGBMClassifier``.

        name: label used in the ``model_<name>/`` directory suffix.
        random_state: seed stored on the instance and forwarded to the
            parent initialiser.
        train_dir: prefix of the model directory string.
        params: estimator parameters.  When ``None`` they are loaded with
            ``get_params_json`` and completed with ``model_dir`` and
            ``seed``; otherwise the given mapping is used as is.

        Extra positional and keyword arguments are accepted and ignored.
        """
        self.name = name
        self.random_state = random_state
        self.train_dir = train_dir + "/model_" + str(self.name) + "/"

        if params is not None:
            self.params = params
        else:
            self.get_params_json()
            loaded_extras = {
                'model_dir': self.train_dir,
                "seed": self.random_state
            }
            self.params.update(loaded_extras)

        self.model = LGBMClassifier(**self.params)
        super().__init__(self.model, random_state=self.random_state)

    def get_params_json(self):
        """
        Load the "LightBoost" entries of the parameter file.

        Two ``ParamsManager`` instances are created over ``param_file``: the
        one reading "Models" provides ``self.params`` and the one reading
        "FineTune" provides ``self.params_finetune``.  Both managers are kept
        on the instance.
        """
        section = "LightBoost"

        models_manager = ParamsManager(param_file, key_read="Models")
        self.manager_models = models_manager
        self.params = models_manager.get_params()[section]

        finetune_manager = ParamsManager(param_file, key_read="FineTune")
        self.manager_finetune = finetune_manager
        self.params_finetune = finetune_manager.get_params()[section]

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwarg):
        """
        Store ``X``/``y``, split them and build the LightGBM datasets.

        ``y`` is integer-encoded with ``replace_multiclass`` (the original
        labels are kept in ``self.cat_replace``).  The data is split with
        ``train_test_split`` using ``test_size`` and ``self.random_state``,
        and three ``lgb.Dataset`` objects are created: ``self.dtrain``,
        ``self.dvalid`` and ``self.all_train_data`` (all rows).

        ``categorical_columns_indices`` is only stored on the instance; extra
        positional and keyword arguments are ignored.
        """
        def _to_lgb(frame, labels):
            # Same construction for every partition: raw values plus the
            # column names as feature names.
            return lgb.Dataset(frame.values,
                               label=labels.values,
                               feature_name=frame.columns.tolist())

        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)
        self.y, self.cat_replace = self.replace_multiclass(y)

        partitions = train_test_split(self.X,
                                      self.y,
                                      test_size=test_size,
                                      random_state=self.random_state)
        self.X_train, self.X_test, self.y_train, self.y_test = partitions

        self.dtrain = _to_lgb(self.X_train, self.y_train)
        self.dvalid = _to_lgb(self.X_test, self.y_test)
        self.all_train_data = _to_lgb(self.X, self.y)

    def set_dataset_nosplit(self,
                            X_train,
                            X_test,
                            y_train,
                            y_test,
                            categorical_columns_indices=None,
                            *args,
                            **kwarg):
        """
        Register an externally made train/test split and build the datasets.

        Labels that are not already the codes ``0..k-1`` are integer-encoded
        with ``replace_multiclass``.  The encoding is computed once on the
        concatenation of ``y_train`` and ``y_test`` so both partitions share
        the same label-to-code mapping, and the original labels are stored in
        ``self.cat_replace``.  Labels that already are ``0..k-1`` are used
        unchanged.

        X_train, X_test: feature frames of the two partitions.
        y_train, y_test: target series of the two partitions.
        categorical_columns_indices: only stored on the instance.

        Sets ``X``/``y`` (train rows followed by test rows), the four split
        attributes, ``columns`` and the ``dtrain``, ``dvalid`` and
        ``all_train_data`` LightGBM datasets.  Extra positional and keyword
        arguments are ignored.
        """
        self.categorical_columns_indices = categorical_columns_indices
        self.columns = list(X_train)

        y_all = pd.concat([y_train, y_test], axis=0)
        classes = y_all.unique().tolist()
        if set(classes) != set(range(len(classes))):
            # Each partition used to be encoded on its own and the result
            # thrown away, so raw labels reached LightGBM.  Encoding the
            # combined targets once keeps the mapping identical for both
            # partitions.
            y_all, self.cat_replace = self.replace_multiclass(y_all)
            n_train = len(y_train)
            y_train = y_all.iloc[:n_train]
            y_test = y_all.iloc[n_train:]

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        self.X = pd.concat([X_train, X_test], axis=0)
        self.y = y_all

        self.dtrain = lgb.Dataset(self.X_train.values,
                                  label=self.y_train.values,
                                  feature_name=self.X_train.columns.tolist())
        self.dvalid = lgb.Dataset(self.X_test.values,
                                  label=self.y_test.values,
                                  feature_name=self.X_test.columns.tolist())
        self.all_train_data = lgb.Dataset(self.X.values,
                                          label=self.y.values,
                                          feature_name=self.X.columns.tolist())

    def replace_multiclass(self, targets):
        """
        Encode the distinct values of ``targets`` as consecutive integers.

        Codes start at 0 and follow the order returned by
        ``targets.unique()``.

        Returns ``(encoded, labels)``: ``encoded`` is ``targets`` with every
        value replaced by its code, and ``labels`` is the list of distinct
        values, so ``labels[code]`` gives back the original value.
        """
        labels = targets.unique().tolist()
        codes = list(range(len(labels)))
        encoded = targets.replace(labels, codes)
        return encoded, labels

    def fit(self,
            X=None,
            y=None,
            X_train=None,
            X_test=None,
            y_train=None,
            y_test=None,
            mute=False,
            use_best_model=True,
            verbose=0,
            num_boost_round=100,
            nosplit=False,
            **kwargs):
        """
        Train a LightGBM booster and report train/validation accuracy.

        With ``nosplit`` false, ``X``/``y`` are split through ``dataset``;
        otherwise the given partitions are registered through
        ``set_dataset_nosplit``.  ``verbose`` is written into ``self.params``
        and the object returned by ``lgb.train`` (trained on ``self.dtrain``
        for ``num_boost_round`` rounds, extra keyword arguments forwarded)
        replaces ``self.model``.

        Accuracy is computed from the arg-max of each predicted row and is
        printed for both partitions unless ``mute`` is true.
        ``use_best_model`` is accepted but not used.  Returns ``None``.
        """
        if nosplit:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        else:
            self.dataset(X, y)

        self.params['verbose'] = verbose
        self.model = lgb.train(self.params,
                               self.dtrain,
                               num_boost_round=num_boost_round,
                               verbose_eval=verbose,
                               **kwargs)

        def _accuracy(features, labels):
            # Arg-max of every predicted row, scored against the labels.
            rows = self.model.predict(features,
                                      num_iteration=self.model.best_iteration)
            return accuracy_score(labels, [np.argmax(row) for row in rows])

        score_test = _accuracy(self.X_test, self.y_test)
        score_train = _accuracy(self.X_train, self.y_train)

        if mute:
            return

        print("Accurancy para el conjunto de entrenamiento ---> {:.2f}%".
              format(score_train * 100))
        print("Accurancy para el conjunto de validacion ------> {:.2f}%".
              format(score_test * 100))

    def fit_cv(self,
               X=None,
               y=None,
               X_train=None,
               X_test=None,
               y_train=None,
               y_test=None,
               nfold=5,
               use_best_model=True,
               verbose=200,
               nosplit=False,
               early_stopping_rounds=150,
               num_boost_round=2000,
               **kwargs):
        """
        Run ``lgb.cv`` on all the registered data and summarise the result.

        The data is prepared with ``dataset`` (``nosplit`` false) or
        ``set_dataset_nosplit`` (``nosplit`` true), ``verbose`` is written
        into ``self.params`` and a stratified ``nfold`` cross-validation is
        run on ``self.all_train_data``; its output is kept in
        ``self.lgb_cv``.  The ``<metric>-mean`` / ``<metric>-stdv`` entries
        are read using ``self.params["metric"]``.

        Returns ``(score, results)``: ``score`` is the mean of the
        ``<metric>-mean`` values and ``results`` is a dict with the round of
        minimum mean ("Rounds"), that minimum ("Score"), its standard
        deviation ("STDV"), "LB" (always ``None``) and "Parameters".
        A summary line is printed unless ``verbose`` equals 0.
        ``use_best_model`` is accepted but not used.
        """
        if nosplit:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)
        else:
            self.dataset(X, y)

        self.params['verbose'] = verbose
        self.lgb_cv = lgb.cv(params=self.params,
                             train_set=self.all_train_data,
                             num_boost_round=num_boost_round,
                             stratified=True,
                             nfold=nfold,
                             seed=self.random_state,
                             early_stopping_rounds=early_stopping_rounds,
                             **kwargs)

        metric = str(self.params["metric"])
        mean_key = metric + '-mean'
        stdv_key = metric + '-stdv'

        optimal_rounds = np.argmin(self.lgb_cv[mean_key])
        best_cv_score = min(self.lgb_cv[mean_key])
        stdv_at_best = self.lgb_cv[stdv_key][optimal_rounds]

        if verbose != 0:
            print("\nOptimal Round: {}\nOptimal Score: {:.3f} + stdv:{:.3f}".
                  format(optimal_rounds, best_cv_score, stdv_at_best))

        results = {
            "Rounds": optimal_rounds,
            "Score": best_cv_score,
            "STDV": stdv_at_best,
            "LB": None,
            "Parameters": self.params
        }
        return np.mean(self.lgb_cv[mean_key]), results

    def func_acc(self, prob_pred, y_target):
        """
        Accuracy of the arg-max class of each entry of ``prob_pred``.

        ``prob_pred[i]`` is reduced with ``np.argmax`` for every position
        ``i``; the resulting class indices (held in a float array) are
        compared with ``y_target`` through ``accuracy_score``.
        """
        winners = np.array(
            [int(np.argmax(prob_pred[row])) for row in range(len(prob_pred))],
            dtype=float)
        return accuracy_score(winners, y_target)

    def pred_binary(self, X, *args, **kwargs):
        """
        Predict hard 0/1 labels by thresholding the model output at 0.5.

        Only the columns listed in ``self.columns`` are used; extra arguments
        are forwarded to ``self.model.predict``.  Outputs strictly greater
        than 0.5 map to 1, the rest to 0.
        """
        selected = X.loc[:, self.columns]
        features = selected.copy()
        scores = self.model.predict(features, *args, **kwargs)
        is_positive = scores > 0.5
        return np.where(is_positive, 1, 0)

    def pred_multiclass(self, X, *args, **kwargs):
        """
        Predict the arg-max class index for every row of ``X``.

        Only the columns listed in ``self.columns`` are used.  Extra
        positional and keyword arguments are forwarded to
        ``self.model.predict``; they used to be accepted and silently
        dropped, unlike in ``pred_binary`` and ``predict``.
        ``num_iteration`` defaults to ``self.model.best_iteration`` unless the
        caller supplies it as a keyword.

        Returns a list with one class index per predicted row.
        """
        features = X.loc[:, self.columns].copy()
        # Keep the previous default but let an explicit keyword win.
        kwargs.setdefault("num_iteration", self.model.best_iteration)
        scores = self.model.predict(features, *args, **kwargs)
        return [np.argmax(row) for row in scores]

    def update_model(self, **kwargs):
        """Set every keyword argument as an attribute of ``self.model``."""
        for attr_name in kwargs:
            setattr(self.model, attr_name, kwargs[attr_name])

    def save_model(self,
                   direct="./checkpoints",
                   name="LGM_model",
                   file_model=".txt"):
        """
        Save ``self.model`` under ``direct`` with a timestamped file name.

        direct: target directory; created (with missing parents) if absent.
        name: file-name prefix; ``_<YYYYmmdd-HHMMSS>`` and the extension are
            appended.
        file_model: ``".txt"`` writes with the model's own ``save_model``;
            ``".pkl"`` dumps the model object with ``joblib``.

        Raises ``NameError`` for any other ``file_model`` (checked before
        anything is created on disk) or when the directory cannot be created.
        """
        if file_model not in (".txt", ".pkl"):
            raise NameError("Type {} not permited".format(file_model))

        if not os.path.isdir(direct):
            try:
                # makedirs also creates missing parent directories, which
                # os.mkdir cannot do.
                os.makedirs(direct, exist_ok=True)
            except OSError as e:
                # NameError is kept for existing callers; the real cause is
                # chained instead of being discarded.
                raise NameError("Error al crear el directorio") from e
            print("Directorio creado: " + direct)

        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + file_model

        if file_model == ".txt":
            self.model.save_model(filename)
        else:
            joblib.dump(self.model, filename)
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self,
                   direct="./checkpoints/LGM_model.txt",
                   file_model=".txt"):
        """
        Load a saved model from the file ``direct`` into ``self.model``.

        direct: path of the model file (a file, not a directory).
        file_model: ``".txt"`` for a LightGBM text model, loaded with
            ``lgb.Booster(model_file=...)``; ``".pkl"`` for a ``joblib``
            dump.

        Raises ``NameError`` for any other ``file_model`` and
        ``FileNotFoundError`` when ``direct`` is not an existing file.
        """
        if file_model not in (".txt", ".pkl"):
            raise NameError("Type {} not permited".format(file_model))

        # `direct` is a file path, so it has to be checked with isfile; the
        # former isdir test reported every valid model file as missing and
        # then carried on regardless.
        if not os.path.isfile(direct):
            raise FileNotFoundError("No existe el fichero especificado: " +
                                    direct)

        if file_model == ".txt":
            # LGBMClassifier(model_file=...) only records an extra keyword
            # parameter and loads nothing; the Booster constructor is what
            # reads a text model written by save_model.
            self.model = lgb.Booster(model_file=direct)
        else:
            # NOTE: joblib.load unpickles arbitrary objects; only load files
            # that come from a trusted source.
            self.model = joblib.load(direct)
        print("Modelo cargado de la ruta: " + direct)

    def predict(self, X, *args, **kwargs):
        """
        Run ``self.model.predict`` on the registered columns of ``X``.

        ``X`` is restricted to ``self.columns`` (as a copy) and any extra
        arguments are forwarded unchanged.
        """
        selected = X.loc[:, self.columns]
        features = selected.copy()
        return self.model.predict(features, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        """
        Run ``self.model.predict_proba`` on the registered columns of ``X``.

        ``X`` is restricted to ``self.columns`` (as a copy) and any extra
        arguments are forwarded unchanged.
        """
        selected = X.loc[:, self.columns]
        features = selected.copy()
        return self.model.predict_proba(features, *args, **kwargs)

    def index_features(self, features):
        """
        Return the positions of ``features`` within ``self.X.columns``.

        Each name is resolved with ``self.X.columns.get_loc``.  Raises
        ``NameError`` when the resulting list is empty.
        """
        positions = [self.X.columns.get_loc(name) for name in features]
        if not positions:
            raise NameError("No coincide ninguna de las features introducidas")
        return positions

    def get_important_features(self, display=True, max_num_features=20):
        """
        Plot the LightGBM feature importances of ``self.model``.

        Nothing happens when ``display`` is false.  At most
        ``max_num_features`` features are drawn.  Returns ``None``.
        """
        if not display:
            return

        lgb.plot_importance(self.model,
                            max_num_features=max_num_features,
                            figsize=(6, 6),
                            title='Feature importance (LightGBM)')
        plt.show()

    def FineTune_SearchCV(self,
                          X=None,
                          y=None,
                          X_train=None,
                          X_test=None,
                          y_train=None,
                          y_test=None,
                          params=None,
                          params_finetune=None,
                          ROC=False,
                          randomized=True,
                          cv=10,
                          display_ROC=True,
                          verbose=0,
                          n_iter=10,
                          replace_model=True,
                          nosplit=False,
                          finetune_dir=""):
        """
        Search hyper-parameters, refit with the best ones and report results.

        Steps: reload the parameters with ``get_params_json`` (optionally
        replaced by ``params`` / ``params_finetune``), prepare the data with
        ``dataset`` or ``set_dataset_nosplit`` depending on ``nosplit``, run
        ``self.FineTune`` on the training partition, merge the best
        parameters into ``self.params``, refit, print the test accuracy and
        the evaluation report, and finally run ``KFold_CrossValidation`` on
        ``self.X`` / ``self.y``.

        X, y: full data, used when ``nosplit`` is false.
        X_train, X_test, y_train, y_test: ready-made split, used when
            ``nosplit`` is true.
        params, params_finetune: replacements for the loaded model and search
            parameters.
        ROC: forwarded to ``KFold_CrossValidation``.
        randomized: forwarded to ``self.FineTune`` (it used to be ignored and
            always sent as ``True``).
        cv: forwarded to ``self.FineTune`` and used as ``n_splits`` of the
            final cross-validation.
        verbose: stored in the parameters as ``verbosity``.
        n_iter: forwarded to ``self.FineTune``.
        finetune_dir: prefix of the directory string stored as ``train_dir``.
        display_ROC, replace_model: accepted but not used.

        Returns ``(best_parameters, results_df)`` as given by
        ``self.FineTune``.
        """
        self.get_params_json()

        self.finetune_dir = finetune_dir + "/" + "model_finetune_" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            "seed": self.random_state
        })
        if params is not None:
            self.params = params
        if params_finetune is not None:
            self.params_finetune = params_finetune

        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.params.update({'verbosity': verbose})
        self.model = LGBMClassifier(**self.params)

        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X_train,
            self.y_train,
            self.params_finetune,
            cv=cv,
            randomized=randomized,
            n_iter=n_iter,
            verbose=1)
        self.params.update(**self._best_Parameters)

        # Refit on the partitions prepared above.  Calling
        # fit(self.X_train, self.y_train) would split the training subset
        # again and overwrite self.X, self.y and the test partition, so the
        # "test" score would use rows seen during the search and the final
        # cross-validation would no longer cover all the data.
        self.fit(X_train=self.X_train,
                 X_test=self.X_test,
                 y_train=self.y_train,
                 y_test=self.y_test,
                 nosplit=True)

        print("\n")
        score = accuracy_score(self.y_test, self.pred_multiclass(self.X_test))
        print("\n")
        print(
            "Resultado del conjunto de test con los parametros optimos: {:.2f}%"
            .format(score * 100))
        print("\n")
        print("Report clasificacion con el conjunto de test: ")
        self.evaluate(self.model, self.X_test, self.y_test)
        print("\n")
        print("Validacion cruzada con todos los datos del dataset: ")
        print("\n")
        self.KFold_CrossValidation(LGBMClassifier(**self._best_Parameters),
                                   self.X,
                                   self.y,
                                   n_splits=cv,
                                   ROC=ROC,
                                   shuffle=True,
                                   mute=False,
                                   logdir_report="",
                                   display=True,
                                   save_image=True,
                                   verbose=0)
        return self._best_Parameters, self.results_df

    def SeedDiversification_cv(self,
                               X=None,
                               y=None,
                               X_train=None,
                               X_test=None,
                               y_train=None,
                               y_test=None,
                               n_iter=10,
                               n_max=2018 - 2022,
                               cv=10,
                               nosplit=False,
                               finetuneseed_dir="",
                               display=True,
                               save_image=True,
                               verbose=0):
        """
        Evaluate the model over several random seeds with ``self.FineTune``.

        The parameters are reloaded with ``get_params_json``, the data is
        prepared with ``dataset`` or ``set_dataset_nosplit`` depending on
        ``nosplit``, ``n_iter`` integer seeds are drawn and passed to
        ``self.FineTune`` as the only searched parameter
        (``randomized=False``) over ``self.X`` / ``self.y``.  The elapsed
        time and the mean/std of the AUC and accuracy columns of the results
        are printed.

        n_iter: number of seeds drawn.
        n_max: upper bound of the seed range, which starts at 0.  The
            signature default evaluates to -4, which would sample from a
            reversed, 4-wide interval; any non-positive value therefore
            falls back to 1000, the bound ``SeedDiversification_fs`` uses.
        cv: forwarded to ``self.FineTune``.
        finetuneseed_dir: prefix of the directory string stored as
            ``train_dir``.
        verbose: stored in the parameters as ``verbosity``.
        display, save_image: accepted but not used.

        Returns ``(best_parameters, results_df)`` as given by
        ``self.FineTune``.
        """
        allmodelstart = time.time()
        self.get_params_json()

        self.finetune_dir = finetuneseed_dir + "/" + "model_finetune_seed" + str(
            self.name) + "/"
        self.params.update({
            'train_dir': self.finetune_dir,
            'verbosity': verbose
        })
        if not nosplit:
            self.dataset(X, y)
        else:
            self.set_dataset_nosplit(X_train, X_test, y_train, y_test)

        self.model = LGBMClassifier(**self.params)

        # Guard against a non-positive bound (see the n_max note above).
        seed_upper = n_max if n_max > 0 else 1000
        seeds = np.random.uniform(0, seed_upper,
                                  n_iter).astype(np.int32).tolist()
        params_finetuneseed = {"seed": seeds}

        self._best_Parameters, self.results_df = self.FineTune(
            self.model,
            self.X,
            self.y,
            params_finetuneseed,
            randomized=False,
            cv=cv,
            n_iter=n_iter,
            verbose=1,
            mute=True)
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))

        print(
            "Diversificacion de la semilla - mean AUC: {:.2f}% - std AUC: {:.5f}"
            .format(self.results_df['mean_test_AUC'].mean() * 100,
                    self.results_df['std_test_AUC'].mean()))

        print(
            "Diversificacion de la semilla - mean Acc: {:.2f}% - std Acc: {:.5f}"
            .format(self.results_df['mean_test_Accuracy'].mean() * 100,
                    self.results_df['std_test_Accuracy'].mean()))
        return self._best_Parameters, self.results_df

    def SeedDiversification_fs(self,
                               X,
                               y,
                               params,
                               n_iter=10,
                               mute=False,
                               logdir_report="",
                               display=True,
                               save_image=True):
        """
        Train with several random seeds and plot the feature importances.

        ``y`` is integer-encoded with ``replace_multiclass``; ``n_iter``
        seeds are drawn between 1 and 1000 and one booster is trained per
        seed on all of ``X``.  The importances of every run are gathered and
        the 50 features with the highest mean importance are drawn as a bar
        plot.

        X, y: features and targets.
        params: LightGBM training parameters.  The mapping is copied for
            every run, so the caller's object is no longer modified.
        n_iter: number of seeds / training runs.
        mute: when false, the runtime of every run is printed.
        logdir_report: prefix prepended to ``lgb_importances.png``.
        display: show the figure.
        save_image: write the figure to disk.  The file is written before
            the figure is shown, because a figure may already be released
            once ``plt.show()`` returns and would then be saved empty.

        Returns ``None``.
        """
        allmodelstart = time.time()
        _y, _ = self.replace_multiclass(y)
        all_seeds = np.random.uniform(1, 1000,
                                      n_iter).astype(np.int32).tolist()

        # Collected per run and concatenated once after the loop.
        importance_frames = []
        for seeds_x in all_seeds:
            modelstart = time.time()
            print(
                "Seed: ",
                seeds_x,
            )
            # Work on a copy so the caller's params are left untouched.
            run_params = dict(params, seed=seeds_x)
            model = lgb.train(run_params,
                              lgb.Dataset(X.values, label=_y.values),
                              verbose_eval=100)

            importance_frames.append(
                pd.DataFrame({
                    "feature": X.columns.tolist(),
                    "importance": model.feature_importance()
                }))

            if not mute:
                print("Model Runtime: %0.2f seconds" %
                      ((time.time() - modelstart)))
                print("#" * 50)

        all_feature_importance_df = pd.concat(importance_frames, axis=0)

        cols = all_feature_importance_df[[
            "feature", "importance"
        ]].groupby("feature").mean().sort_values(by="importance",
                                                 ascending=False)[:50].index
        best_features = all_feature_importance_df.loc[
            all_feature_importance_df.feature.isin(cols)]
        plt.figure(figsize=(8, 10))
        sns.barplot(x="importance",
                    y="feature",
                    data=best_features.sort_values(by="importance",
                                                   ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        if save_image:
            filename = logdir_report + 'lgb_importances.png'
            plt.savefig(filename)
        if display:
            plt.show()
        print("All Model Runtime: %0.2f Minutes" %
              ((time.time() - allmodelstart) / 60))

    def __getattr__(self, attr):
        """
        Delegate attributes missing on the wrapper to ``self.model``.

        Python calls this only after normal attribute lookup has failed.  If
        ``model`` itself is the missing attribute (for example on an instance
        created without running ``__init__``, as ``copy`` and ``pickle`` do),
        evaluating ``self.model`` would re-enter this method without end, so
        that case raises ``AttributeError`` instead of recursing.
        """
        if attr == "model":
            raise AttributeError("{!r} object has no attribute {!r}".format(
                type(self).__name__, attr))
        return getattr(self.model, attr)