예제 #1
0
class modelCatBoost(object):
    def __init__(self, name="CBT", random_state=99, *args, **kwargs):

        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state

        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            "random_state": self.random_state
        })

        self.model = CatBoostClassifier(**self.params)

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwargs):

        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):

        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self,
            X,
            y,
            use_best_model=True,
            plot=True,
            save_snapshot=False,
            verbose=0,
            *args,
            **kwargs):

        self.dataset(X, y)
        _params = self.model.get_params()

        if verbose:
            _verbose = 0
        else:
            _verbose = _params["verbose"]

        return self.model.fit(self.train_data,
                              verbose=_verbose,
                              eval_set=self.eval_data,
                              use_best_model=use_best_model,
                              plot=plot,
                              save_snapshot=save_snapshot,
                              **kwargs)

        _preds = self.model.predict(self.dvalid)
        preds_test = np.where(_preds > 0.5, 1, 0)
        score_test = accuracy_score(self.y_test, preds_test)

        _preds = self.model.predict(self.dtrain)
        preds_train = np.where(_preds > 0.5, 1, 0)
        score_train = accuracy_score(self.y_train, preds_train)

        if not verbose == 0:
            print("Accurancy para el conjunto de entrenamiento ---> {:.2f}%".
                  format(score_train * 100))
            print("Accurancy para el conjunto de validacion ------> {:.2f}%".
                  format(score_test * 100))

    def fit_cv(self,
               X,
               y,
               fold_count=4,
               shuffle=True,
               stratified=True,
               plot=True,
               verbose=100):

        self.dataset(X, y)

        _params = self.model.get_params()
        _params.update({'verbose': verbose})

        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              verbose=verbose,
                              plot=plot)
        if not verbose == 0:
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.
                  format(
                      np.max(_scores['test-Accuracy-mean']),
                      _scores['test-Accuracy-std'][np.argmax(
                          _scores['test-Accuracy-mean'])],
                      np.argmax(_scores['test-Accuracy-mean'])))

        return _scores

    def copy(self, *args, **kwargs):
        returned_classifier = CatBoostClassifier()
        returned_classifier.catboost_classifier = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):

        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Directorio creado: " + direct)
            except OSError as e:
                raise NameError("Error al crear el directorio")
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):

        if not os.path.isdir(direct):
            print("no existe el drectorio especificado")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Modelo cargado de la ruta: " + filename)

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):

        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)

        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):

        _index = []
        for i in features:
            _index.append(self.X.columns.get_loc(i))
        if _index == []:
            raise NameError("No coincide ninguna de las features introducidas")
        return _index

    def get_important_features(self, display=True):

        self.model.get_feature_importance(prettified=True)
        _feature_importance_df = self.model.get_feature_importance(
            prettified=True)

        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost features importance:')

        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):

        directorios = []
        if len(directs) < 0:
            if visu_model:
                directorios.append(self.train_dir)
            else:
                raise NameError("No se ha seleccionado ningun directorio")
        else:
            if visu_model:
                directorios.append(self.train_dir)
            for i in directs:
                directorios.append(i)
        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):

        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_accuracy = np.max(_cv_data['test-AUC-mean'])

        return 1 - best_accuracy

    def FineTune_hyperopt(self, X, y, mute=False):

        self.dataset(X, y)

        params_space = {
            'l2_leaf_reg':
            hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate':
            hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature':
            hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")

        _parameters = self.params
        _parameters.update(best)

        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())

        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """
        https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning
        """
        self.dataset(X, y)

        def build_search(modelo,
                         param_distributions,
                         cv=5,
                         n_iter=10,
                         verbose=1,
                         random_state=99):
            """
            Builder function for RandomizedSearch.
            """
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy':
                                          make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """
            Return pretty Pandas dataframe from the `cv_results_` attribute of finished parameter search,
            ranking by test performance and only keeping the columns of interest.
            """
            df = pd.DataFrame(cv_results)
            cols_of_interest = [
                key for key in df.keys() if key.startswith('param_')
                or key.startswith("mean_train") or key.startswith("std_train")
                or key.startswith("mean_test") or key.startswith("std_test")
                or key.startswith('mean_fit_time') or key.startswith('rank')
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            search.fit(X_train, y_train)
            print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }

        if mute:
            _verbose = 0
        else:
            _verbose = 1

        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)

        catboost_search = build_search(_model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       verbose=_verbose,
                                       cv=RepeatedStratifiedKFold(
                                           n_splits=n_splits,
                                           n_repeats=1,
                                           random_state=self.random_state))
        catboost_cv_results = run_search(self.X,
                                         self.y,
                                         search=catboost_search,
                                         mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())

        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)
models = []

np.random.seed(0)
for i in tqdm_notebook(range(50)):

    model = CatBoostClassifier(iterations=300,
                               depth=10,
                               thread_count=12,
                               border_count=128,
                               learning_rate=0.015,
                               random_seed=np.random.randint(10**10),
                               eval_metric='AUC',
                               verbose=False)

    model.fit(X_train, y_train)
    models.append(model.copy())

## LGBM

models2 = []

np.random.seed(0)
for i in tqdm_notebook(range(100)):

    model = lgbm.LGBMClassifier(random_state=np.random.randint(10**10),
                                n_estimators=4600,
                                max_depth=15 + np.random.randint(0, 10),
                                num_leaves=15 + np.random.randint(0, 10),
                                subsample=0.99868,
                                colsample_bytree=0.8022,
                                reg_alpha=26.4310,