Example #1
    def fit(self, X, y, indicator):
        '''
        indicator=1: do a single train/test split and a one-time fit to
        evaluate a fixed set of hyper-parameters;
        indicator=0: run hyperopt to search the neighborhood of the seed
        hyper-parameters and check whether model quality improves.
        '''

        XFull = X
        yFull = y
        self.Xe_train, self.Xe_test, self.ys_train, self.ys_test = \
            train_test_split(XFull, yFull.ravel(), test_size=self.test_size,
                             random_state=self.seed, shuffle=True)

        if indicator == 1:
            # fit LightGBM once to obtain the AUC w.r.t. a fixed set of hyper-parameters
            model = LGBMClassifier(random_state=self.seed,
                                   min_data=1,
                                   min_data_in_bin=1)
            model.set_params(**self.param_space)
            model.fit(self.Xe_train, self.ys_train)
            mypreds = model.predict_proba(self.Xe_test)[:, 1]
            auc = auc_metric(self.ys_test.reshape(-1, 1),
                             mypreds.reshape(-1, 1))
            return auc
        else:
            trials = Trials()
            best = fmin(fn=self.gbc_objective,
                        space=self.param_space,
                        algo=tpe.suggest,
                        trials=trials,
                        max_evals=self.max_evaluations)
            params = space_eval(self.param_space, best)
            self.best_params = params
            return params, 1 - np.min([x['loss'] for x in trials.results])
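
A minimal usage sketch for the two code paths above, mirroring the HyperparametersTuner calls that appear later on this page (the constructor keywords come from that later snippet; X, y, and the parameter dicts are placeholders):

# indicator=1: single train/test split and fit, returns the AUC
tuner = HyperparametersTuner(parameter_space=param_choice_fixed)
fixed_auc = tuner.fit(X, y, 1)

# indicator=0: hyperopt TPE search around the seed parameters,
# returns (best_params, best_auc)
tuner = HyperparametersTuner(max_evaluations=30,
                             parameter_space=param_space)
best_params, best_auc = tuner.fit(X, y, 0)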
Example #2
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    print(model.get_params(), " ", model.score(X_test, y_test))
    print(grid.best_params_, " ", grid.best_score_)

    return model, metrics
Example #3
    def gbc_objective(self, space):

        model = LGBMClassifier(random_state=self.seed,
                               min_data=1,
                               min_data_in_bin=1)
        model.set_params(**space)
        model.fit(self.Xe_train, self.ys_train)
        mypreds = model.predict_proba(self.Xe_test)[:, 1]
        auc = auc_metric(self.ys_test.reshape(-1, 1), mypreds.reshape(-1, 1))
        return {'loss': (1 - auc), 'status': STATUS_OK}
Example #4
    def fit_lgb(self, X_train, y_train, X_val, y_val, X_test, y_test, **param):
        """ using turned parameters to fit training dataset, and save the fitted model to a txt file. Also, it return f1
        score on test set.

        Args:
           X_train: Dataframe df: train set
           y_train: series: train set response
           X_val: Dataframe df: validation set
           y_val: series: validation set response
           X_test: Dataframe df: test set
           y_test: series: test set response
           **param: LightGBM parameters selected from function - turining_lgb()
        return:
             f1_score for test set

        """
        model0 = LGBMClassifier(is_unbalance=True, reg_lambda=1)
        model0.set_params(**param)
        model0.fit(X_train,
                   y_train,
                   eval_set=[(X_val, y_val)],
                   early_stopping_rounds=150)
        fold_pred = model0.predict(X_test,
                                   num_iteration=model0.best_iteration_)
        fold_pred_prob = model0.predict_proba(
            X_test, num_iteration=model0.best_iteration_)
        model_probs = fold_pred_prob[:, 1]

        joblib.dump(model0, 'lgb_model.pkl')
        f1 = sklearn.metrics.f1_score(y_test, fold_pred)
        print("lgb turning model - f1_score:{}".format(f1))
        lgb.plot_importance(model0.booster_).plot()
        plt.title(
            "Feature Importance for selected features in the final LGB model")
        plt.xlabel('Importance')
        plt.ylabel('Features')
        plt.show()

        # calculate the precision-recall auc
        precision, recall, _ = precision_recall_curve(y_test, model_probs)
        auc_score = auc(recall, precision)
        print('AUC: %.3f' % auc_score)
        # plot precision-recall curves
        self.plot_pr_curve(y_test, model_probs)

        # ROC
        sklearn.metrics.plot_roc_curve(model0, X_test, y_test)
        plt.title("ROC for NN + lightGBM")
        plt.show()

        return f1
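
A hedged sketch of calling fit_lgb above; pipeline is a hypothetical instance of the owning class, and the parameter dict is a placeholder standing in for the output of the tuning step the docstring mentions:

# Placeholder values; in practice **param comes from the tuning step.
tuned_params = {'num_leaves': 31, 'learning_rate': 0.05, 'n_estimators': 1000}
f1 = pipeline.fit_lgb(X_train, y_train, X_val, y_val, X_test, y_test,
                      **tuned_params)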
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Generated model params and results\n params:" +
        str(model.get_params()) + "\nscore " +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, "Search grid best params and results\n params:" +
        str(grid.best_params_) + "\nscore " + str(grid.best_score_))

    return model, metrics
    def get_best_hyperparameters(self, train_data, train_labels,
                                 validation_ratio, random_state):
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'hyperparameters_tuner.py', 'HyperparametersTuner',
            'get_best_hyperparameters', 'Start'))

        self._train_data, self._validation_data, self._train_labels, self._validation_labels = train_test_split(
            train_data,
            train_labels,
            test_size=validation_ratio,
            random_state=random_state,
            shuffle=True,
            stratify=train_labels)
        classifier = LGBMClassifier()
        classifier.set_params(**self._fixed_hyperparameters)
        classifier.fit(self._train_data, self._train_labels)

        predictions = classifier.predict_proba(self._validation_data)[:, 1]
        labels = self._validation_labels
        fixed_hyperparameters_score = roc_auc_score(labels, predictions)
        print('labels.shape: {}'.format(labels.shape))
        print('predictions.shape: {}'.format(predictions.shape))

        trials = Trials()
        best = fmin(fn=self.objective,
                    space=self._search_space,
                    algo=tpe.suggest,
                    trials=trials,
                    max_evals=self._max_evaluations)
        best_trial_hyperparameters = space_eval(self._search_space, best)
        best_trial_hyperparameters_score = 1 - np.min(
            [x['loss'] for x in trials.results])

        if fixed_hyperparameters_score > best_trial_hyperparameters_score:
            print('best auc score: {}'.format(fixed_hyperparameters_score))
            best_hyperparameters = self._fixed_hyperparameters
        else:
            print(
                'best auc score: {}'.format(best_trial_hyperparameters_score))
            best_hyperparameters = best_trial_hyperparameters

        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'hyperparameters_tuner.py', 'HyperparametersTuner',
            'get_best_hyperparameters', 'End'))
        return best_hyperparameters
    def objective(self, trial_hyperparameters):
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'hyperparameters_tuner.py', 'HyperparametersTuner', 'objective',
            'Start'))
        print('trial_hyperparameters: {}'.format(trial_hyperparameters))

        classifier = LGBMClassifier()
        # apply the hyperparameters suggested for this trial; otherwise
        # every hyperopt trial would score the same fixed model
        classifier.set_params(**trial_hyperparameters)
        classifier.fit(self._train_data, self._train_labels)

        predictions = classifier.predict_proba(self._validation_data)[:, 1]
        labels = self._validation_labels
        trial_score = roc_auc_score(labels, predictions)
        print('labels.shape: {}'.format(labels.shape))
        print('predictions.shape: {}'.format(predictions.shape))

        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'hyperparameters_tuner.py', 'HyperparametersTuner', 'objective',
            'End'))
        return {'loss': (1 - trial_score), 'status': STATUS_OK}
Example #8
    def fit_predict(self, X_train, y_train, X_valid, y_valid, X_test,
                    **kwargs):
        clf = LGBMClassifier()
        if self.params is not None:
            clf.set_params(**self.params)
            # print(clf.get_params())

        eval_set = [(X_train, y_train), (X_valid, y_valid)]
        self.clf = clf.fit(
            X_train,
            y_train,
            eval_set=eval_set,
            eval_metric=None,
            eval_names=('Train', 'Valid'),
            verbose=100,
            early_stopping_rounds=100,  # fit_params
            **kwargs  # TODO: set_params
        )

        valid_predict = clf.predict_proba(X_valid)
        test_predict = clf.predict_proba(X_test)
        return valid_predict, test_predict
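
The snippet above does not show its owning class; a minimal sketch of driving it (the class and data names are assumptions):

# Hypothetical wrapper exposing self.params and the fit_predict() above.
model = LGBMWrapper(params={'n_estimators': 1000, 'learning_rate': 0.05})
valid_proba, test_proba = model.fit_predict(X_train, y_train,
                                            X_valid, y_valid, X_test)
valid_scores = valid_proba[:, 1]  # predict_proba returns shape (n, 2)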
Example #9
    def modelLGBMClassifier(self, trial: optuna.trial.Trial):
        opt_params = dict(
            num_leaves=trial.suggest_int("num_leaves", 2, 2**8),
            learning_rate=trial.suggest_discrete_uniform(
                'learning_rate', 0.001, 1, 0.001),
            n_estimators=trial.suggest_int("n_estimators", 2, 2**10, log=True),
            min_child_samples=trial.suggest_int('min_child_samples', 2, 2**8),
            min_child_weight=trial.suggest_loguniform('min_child_weight', 1e-8,
                                                      1),
            min_split_gain=trial.suggest_loguniform('min_split_gain', 1e-8, 1),
            subsample=trial.suggest_uniform('subsample', 0.4, 1),
            subsample_freq=trial.suggest_int("subsample_freq", 0, 2**4),
            colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.4, 1),
            reg_alpha=trial.suggest_loguniform('reg_alpha', 1e-8, 10),
            reg_lambda=trial.suggest_loguniform('reg_lambda', 1e-8, 10),
        )
        clf = LGBMClassifier(boosting_type='gbdt',
                             num_leaves=31,
                             max_depth=-1,
                             learning_rate=0.1,
                             n_estimators=100,
                             subsample_for_bin=200000,
                             objective=None,
                             class_weight=None,
                             min_split_gain=0.,
                             min_child_weight=1e-3,
                             min_child_samples=20,
                             subsample=1.,
                             subsample_freq=0,
                             colsample_bytree=1.,
                             reg_alpha=0.,
                             reg_lambda=0.,
                             random_state=None,
                             n_jobs=-1,
                             silent=True,
                             importance_type='split')
        # self.params takes precedence over the trial suggestions on key collisions
        clf.set_params(**{**opt_params, **self.params})
        return clf
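
A sketch of plugging the builder above into an Optuna study; the cross-validated AUC objective is an assumed choice, and only modelLGBMClassifier comes from the snippet (written as if inside the same class, with placeholder data X, y):

import optuna
from sklearn.model_selection import cross_val_score

def objective(trial):
    clf = self.modelLGBMClassifier(trial)
    # assumed scoring: mean 5-fold ROC AUC
    return cross_val_score(clf, X, y, cv=5, scoring='roc_auc').mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(study.best_params)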
    # {'subsample': [i/10.0 for i in range(6,10)],
    #     'colsample_bytree':[i/10.0 for i in range(6,10)]},

    # {'reg_alpha': [1e-2, 0.1, 1, 2, 5, 10],
    #     'reg_lambda': [0.01,0.1, 1, 2, 5, 10]},

    # {'learning_rate':np.linspace(0.01, 1.0, 50)}
]

for params in lt_params:
    grid = GridSearchCV(estimator=gbm, param_grid=params)
    grid.fit(X, y)

    bestParams.update(grid.best_params_)
    gbm.set_params(**bestParams)

print('Best parameters:\n', bestParams)
print('score=', grid.best_score_)
mdl = grid.best_estimator_

y_pred = mdl.predict(X, num_iteration=mdl.best_iteration_)
displayClassifierMetrics(y, y_pred, grid.classes_)

y_prob = mdl.predict_proba(X, num_iteration=mdl.best_iteration_)
displayROCurve(y, y_prob, grid.classes_)

# Display feature importances

Example #11
# %%
class Counter(TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        print(X.shape[1])
        return X


# %%
sel_mod = LGBMClassifier(metric='auc',
                         n_estimators=200,
                         boosting_type=BOOSTING)
sel_mod.set_params(**pars)
model = make_pipeline(
    SelectFromModel(sel_mod), Counter(),
    LGBMClassifier(metric='auc', boosting_type=BOOSTING, n_estimators=2000))

# %%
prun = PrunedCV(N_FOLD, 0.02, minimize=False)


# %%
def objective(trial):

    joblib.dump(study, 'study_{}.pkl'.format(BOOSTING))

    params = {
        'selectfrommodel__threshold':
class EveryTime:
    NAME = 'EveryTime'

    def __init__(self, datainfo, timeinfo):
        self._info = extract(datainfo, timeinfo)
        print_data_info(self._info)
        print_time_info(self._info)

        self._validation_ratio = 0.25
        self._max_data = 400000

        self._iteration = 0
        self._random_state = 13
        self._max_evaluations = 25
        self._dataset_budget_threshold = 0.8
        self._should_correct = False
        self._correction_threshold = 0.8
        self._correction_n_splits = 8
        self._epsilon = 0.001
        self._ensemble_size = 4
        self._minority_threshold = 10000
        self._large_fraction = 8
        self._small_fraction = 4

        self._categorical_frequency_map = {}
        self._mvc_frequency_map = {}
        self._train_data = []
        self._train_labels = []

        self._best_hyperparameters = None
        self._classifier = None
        self._imbalanced_sampler = OldRandomMajorityUnderSampler(
            self._random_state, self._small_fraction)
        self._too_much_data_sampler = StratifiedRandomSampler(
            self._max_data, self._random_state)
        self._test_sampler = RandomSampler(self._random_state)
        self._profile = Profile.LGBM_ORIGINAL_NAME
        self._is_first = True

    def fit(self, F, y, datainfo, timeinfo):
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'architectures.py', 'OriginalEnsemble', 'fit', 'Start'))

        info = extract(datainfo, timeinfo)
        self._info.update(info)
        print_time_info(self._info)

        data = get_data(F, self._info)
        y = y.ravel()
        print('data.shape: {}'.format(data.shape))
        print('y.shape: {}'.format(y.shape))

        bincount = np.bincount(y.astype(int))
        print('Number of 0 label: {}'.format(bincount[0]))
        print('Number of 1 label: {}'.format(bincount[1]))

        if min(bincount) < self._minority_threshold:
            self._imbalanced_sampler = OldRandomMajorityUnderSampler(
                self._random_state, self._large_fraction)
            size = int(min(bincount) * self._large_fraction * 2.5)
            self._too_much_data_sampler = StratifiedRandomSampler(
                size, self._random_state)

        self._categorical_frequency_map = {}
        self._mvc_frequency_map = {}
        self._transform(data, DataType.TRAIN)

        self._train_data = np.concatenate(
            (self._train_data,
             data), axis=0) if len(self._train_data) > 0 else data
        self._train_labels = np.concatenate(
            (self._train_labels,
             y), axis=0) if len(self._train_labels) > 0 else y
        self._train_data, self._train_labels = self._imbalanced_sampler.sample(
            self._train_data, self._train_labels)
        self._train_data, self._train_labels = self._too_much_data_sampler.sample(
            self._train_data, self._train_labels)
        print('self._train_data.shape: {}'.format(self._train_data.shape))
        print('self._train_labels.shape: {}'.format(self._train_labels.shape))

        self._iteration += 1
        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'architectures.py', 'OriginalEnsemble', 'fit', 'End'))

    def predict(self, F, datainfo, timeinfo):
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'architectures.py', 'OriginalEnsemble', 'predict', 'Start'))

        info = extract(datainfo, timeinfo)
        self._info.update(info)
        print_time_info(self._info)

        test_data = get_data(F, self._info)
        print('test_data.shape: {}'.format(test_data.shape))

        transformed_test_data = self._transform(test_data, DataType.TEST)
        train_data = self._transform(self._train_data, DataType.TRAIN)

        train_labels = self._train_labels
        print('transformed_test_data.shape: {}'.format(
            transformed_test_data.shape))
        print('train_data.shape: {}'.format(train_data.shape))

        size = len(train_data) if len(transformed_test_data) > len(
            train_data) else len(transformed_test_data)
        train_weights = correct_covariate_shift(
            train_data, self._test_sampler.sample(transformed_test_data, size),
            self._random_state, self._correction_threshold,
            self._correction_n_splits) if self._should_correct else None

        fixed_hyperparameters, search_space = Profile.parse_profile(
            self._profile)
        if self._best_hyperparameters is None:
            tuner = HyperparametersTuner(fixed_hyperparameters, search_space,
                                         self._max_evaluations)
            self._best_hyperparameters = tuner.get_best_hyperparameters(
                train_data, train_labels, self._validation_ratio,
                self._random_state)
            print('self._best_hyperparameters: {}'.format(
                self._best_hyperparameters))

        if has_sufficient_time(self._dataset_budget_threshold, self._info):
            t_d, validation_data, t_l, validation_labels = train_test_split(
                train_data,
                train_labels,
                test_size=self._validation_ratio,
                random_state=self._random_state,
                shuffle=True,
                stratify=train_labels)

            self._classifier = LGBMClassifier()
            self._classifier.set_params(**self._best_hyperparameters)
            self._classifier.fit(train_data,
                                 train_labels,
                                 sample_weight=train_weights)

        else:
            print('Time budget exceeded.')

        predictions = self._classifier.predict_proba(transformed_test_data)[:,
                                                                            1]
        self._iteration += 1
        print('predictions.shape: {}'.format(predictions.shape))
        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'architectures.py', 'OriginalEnsemble', 'predict', 'End'))
        return predictions

    def _transform(self, data, datatype):
        transformed_data = np.array([])
        time_data, numerical_data, categorical_data, mvc_data = split_data_by_type(
            data, self._info)
        if len(time_data) > 0:
            transformed_data = subtract_min_time(time_data)
            transformed_data = np.concatenate(
                (transformed_data, difference_between_time_columns(time_data)),
                axis=1)
            transformed_data = np.concatenate(
                (transformed_data, extract_detailed_time(time_data)), axis=1)
        if len(numerical_data) > 0:
            transformed_data = numerical_data if len(transformed_data) == 0 else \
                                np.concatenate((transformed_data, numerical_data), axis=1)
        if len(categorical_data) > 0:
            if (datatype == DataType.TRAIN
                    and self._iteration % 2 == 0) or datatype == DataType.TEST:
                self._categorical_frequency_map = count_frequency(
                    self._categorical_frequency_map, categorical_data)
            encoded_categorical_data = encode_frequency(
                self._categorical_frequency_map, categorical_data)
            transformed_data = np.concatenate(
                (transformed_data, encoded_categorical_data), axis=1)
        if len(mvc_data) > 0:
            if (datatype == DataType.TRAIN
                    and self._iteration % 2 == 0) or datatype == DataType.TEST:
                self._mvc_frequency_map = count_frequency(
                    self._mvc_frequency_map, mvc_data)

            encoded_mvc_data = encode_frequency(self._mvc_frequency_map,
                                                mvc_data)
            transformed_data = np.concatenate(
                (transformed_data, encoded_mvc_data), axis=1)
        return np.nan_to_num(transformed_data)
    def predict(self, F, datainfo, timeinfo):
        print('\nFile: {} Class: {} Function: {} State: {}'.format(
            'architectures.py', 'OriginalEnsemble', 'predict', 'Start'))

        info = extract(datainfo, timeinfo)
        self._info.update(info)
        print_time_info(self._info)

        test_data = get_data(F, self._info)
        print('test_data.shape: {}'.format(test_data.shape))

        transformed_test_data = self._transform(test_data, DataType.TEST)
        train_data = self._transform(self._train_data, DataType.TRAIN)

        train_labels = self._train_labels
        print('transformed_test_data.shape: {}'.format(
            transformed_test_data.shape))
        print('train_data.shape: {}'.format(train_data.shape))

        size = len(train_data) if len(transformed_test_data) > len(
            train_data) else len(transformed_test_data)
        train_weights = correct_covariate_shift(
            train_data, self._test_sampler.sample(transformed_test_data, size),
            self._random_state, self._correction_threshold,
            self._correction_n_splits) if self._should_correct else None

        fixed_hyperparameters, search_space = Profile.parse_profile(
            self._profile)
        if self._best_hyperparameters is None:
            tuner = HyperparametersTuner(fixed_hyperparameters, search_space,
                                         self._max_evaluations)
            self._best_hyperparameters = tuner.get_best_hyperparameters(
                train_data, train_labels, self._validation_ratio,
                self._random_state)
            print('self._best_hyperparameters: {}'.format(
                self._best_hyperparameters))

        if has_sufficient_time(self._dataset_budget_threshold,
                               self._info) or len(self._classifiers) == 0:
            t_d, validation_data, t_l, validation_labels = train_test_split(
                train_data,
                train_labels,
                test_size=self._validation_ratio,
                random_state=self._random_state,
                shuffle=True,
                stratify=train_labels)
            new_classifier = LGBMClassifier()
            new_classifier.set_params(**self._best_hyperparameters)
            new_classifier.fit(train_data,
                               train_labels,
                               sample_weight=train_weights)

            new_predictions = new_classifier.predict_proba(validation_data)[:,
                                                                            1]
            new_weight = compute_weight(new_predictions, validation_labels,
                                        self._epsilon)

            self._ensemble_weights = np.array([])
            for i in range(len(self._classifiers)):
                current_classifier = self._classifiers[i]
                current_classifier_predictions = current_classifier.predict_proba(
                    validation_data)[:, 1]
                current_classifier_weight = compute_weight(
                    current_classifier_predictions, validation_labels,
                    self._epsilon)
                self._ensemble_weights = np.append(self._ensemble_weights,
                                                   current_classifier_weight)

            self._classifiers = np.append(self._classifiers, new_classifier)
            self._ensemble_weights = np.append(self._ensemble_weights,
                                               new_weight)
            print('self._ensemble_weights: {}'.format(self._ensemble_weights))

            if len(self._classifiers) > self._ensemble_size:
                i = remove_worst_classifier(self._classifiers, validation_data,
                                            validation_labels)
                print('Removed classifier: {}'.format(i))
                self._classifiers = np.delete(self._classifiers, i)
                self._ensemble_weights = np.delete(self._ensemble_weights, i)
        else:
            print('Time budget exceeded.')

        if len(self._classifiers) == 1:
            predictions = self._classifiers[0].predict_proba(
                transformed_test_data)[:, 1]
        else:
            predictions = np.zeros(len(transformed_test_data))
            for i in range(len(self._classifiers)):
                predictions = np.add(
                    predictions, self._ensemble_weights[i] *
                    self._classifiers[i].predict_proba(
                        transformed_test_data)[:, 1])
            predictions = np.divide(predictions,
                                    np.sum(self._ensemble_weights))
        self._iteration += 1
        print('predictions.shape: {}'.format(predictions.shape))
        print('File: {} Class: {} Function: {} State: {} \n'.format(
            'architectures.py', 'OriginalEnsemble', 'predict', 'End'))
        return predictions
class LGBMClassifierCV(object):
    """cross_val_predict"""
    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        self.clf = LGBMClassifier()
        if params:
            self.clf.set_params(**params)
        if n_repeats:
            # repeat the stratified CV n_repeats times
            self._kf = RepeatedStratifiedKFold(n_splits=cv,
                                               n_repeats=n_repeats,
                                               random_state=random_state)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(cv, shuffle=True,
                                       random_state=random_state)
            self._num_preds = cv

    def fit(self,
            X,
            y,
            X_test=None,
            feval=roc_auc_score,
            sample_weight=None,
            init_score=None,
            eval_metric='auc',
            early_stopping_rounds=100,
            verbose=100,
            feature_name='auto',
            categorical_feature='auto',
            callbacks=None):
        """输入数组"""
        if X_test is None:
            X_test = X[:1]  # use the first row as a placeholder test set

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros(
            (len(X_test), self._num_preds))  # num_preds: number of fold predictions

        for n_fold, (train_index,
                     valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]  # evaluate both sets during training

            ########################################################################
            self.clf.fit(X_train,
                         y_train,
                         sample_weight,
                         init_score,
                         eval_set,
                         eval_names=('Train', 'Valid'),
                         eval_sample_weight=None,
                         eval_class_weight=None,
                         eval_init_score=None,
                         eval_metric=eval_metric,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose=verbose,
                         feature_name=feature_name,
                         categorical_feature=categorical_feature,
                         callbacks=callbacks)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]
            ########################################################################

        # test-set out-of-fold outputs
        self.oof_test_rank = (pd.DataFrame(self.oof_test).rank().mean(axis=1) /
                              len(self.oof_test)).values
        self.oof_test = self.oof_test.mean(axis=1)  # average the test-set oof scores

        assert len(X) == len(self.oof_train)
        assert len(X_test) == len(self.oof_test)

        # compute the train-set out-of-fold (oof) score
        if feval:
            self.oof_train_score = feval(y, self.oof_train)
            print(
                f"\n\033[94mtrain CV Score: {self.oof_train_score} ended at {time.ctime()}\033[0m"
            )
            return self.oof_train_score

    def oof_submit(self, ids, pred_ranking=False, file=None, preds=None):
        """preds分用于submit"""
        if file is None:
            file = f'submit_{self.oof_train_score}.csv'
        print(f'Save {file} ...')

        if preds is None:
            preds = self.oof_test_rank if pred_ranking else self.oof_test

        if not isinstance(ids, pd.DataFrame):
            ids = pd.DataFrame(ids)
        ids.assign(preds=preds).to_csv(file, index=False, header=False)

    @property
    def oof_train_and_test(self):
        return np.r_[self.oof_train, self.oof_test]

    def oof_save(self, file='./oof_train_and_test.csv'):
        pd.DataFrame(self.oof_train_and_test,
                     columns=['oof_train_and_test']).to_csv(file, index=False)

    def plot_feature_importances(self,
                                 feature_names=None,
                                 topk=20,
                                 figsize=(10, 6),
                                 pic_name=None):
        columns = ['Importances', 'Features']
        importances = self.clf.feature_importances_.tolist()
        if feature_names is None:
            feature_names = list(
                map(lambda x: f'F_{x}', range(len(importances))))
        _ = list(zip(importances, feature_names))
        df = pd.DataFrame(_, columns=columns).sort_values('Importances',
                                                          ascending=False)

        plt.figure(figsize=figsize)
        sns.barplot(*columns, data=df[:topk])
        plt.title('Features Importances\n')
        plt.tight_layout()
        if pic_name is None:
            plt.savefig(f'importances_{self.oof_train_score}.png')
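
A minimal usage sketch for the LGBMClassifierCV wrapper above, with synthetic placeholder data (per its docstring the inputs are plain arrays, and it assumes the same older LightGBM fit API the snippet itself uses):

import numpy as np

X = np.random.rand(1000, 20)
y = np.random.randint(0, 2, size=1000)
oof = LGBMClassifierCV(params={'n_estimators': 300}, cv=5, random_state=42)
cv_score = oof.fit(X, y)  # returns the out-of-fold AUC when feval is set
oof.plot_feature_importances(topk=10)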
# Parameters of the model
# https://lightgbm.readthedocs.io/en/latest/Parameters.html
params = {
    'objective': 'binary',
    'learning_rate': 0.005,
    'num_leaves': 3,
    'min_data_in_leaf': 10,
    'colsample_bytree': 1,
    'max_bin': 10,
    'random_seed': RS
}

# We will evaluate the model for several numbers of boosting rounds and search for the optimum
model_lightgbm_L1 = LGBMClassifier()
model_lightgbm_L1.set_params(**params)
n_rounds_list = [500, 1000, 1500]

print('\nLightGBM Level 1 CV...')
print('########################################################')
scores = []
for nrounds in n_rounds_list:
    model_lightgbm_L1.set_params(n_estimators=nrounds)
    print('\nn rounds: ', nrounds)
    s = Model_cv(model_lightgbm_L1,
                 n_folds,
                 X1_train,
                 X1_test,
                 Y_train,
                 RS,
                 makepred=False)
class Model:
    def __init__(self, data_info, time_info):

        # Print data information
        info_dict = extract(data_info, time_info)
        print_data_info(info_dict)

        # # Install hyperopt and lightgbm
        # pip_install('hyperopt')
        # pip_install('lightgbm')

        print('Using algo: {}'.format(params['algo']))

        # Settings
        if params['algo'] == Algo.ORIGINAL:
            self._dataset_budget_threshold = 0.8
            self._max_train_data = 200000
            self.batch_size = 50000
            self.delta_n_estimators = 100
            self.delta_num_leaves = 20
            self.delta_learning_rate = 0.005
            self.delta_max_depth = 1
            self.delta_feature_fraction = 0.1
            self.delta_bagging_fraction = 0.1
            self.delta_bagging_freq = 1
            self.max_evaluation = 30
            self.param_choice_fixed = {
                'n_estimators': 400,
                'learning_rate': 0.01,
                'num_leaves': 50,
                'feature_fraction': 0.6,
                'bagging_fraction': 0.6,
                'bagging_freq': 2,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc'
            }
        elif params['algo'] == Algo.FACEBOOK_LR:
            self._dataset_budget_threshold = 0.8
            self._max_train_data = 100000
            self.batch_size = 25000
            self.delta_n_estimators = 50
            self.delta_num_leaves = 10
            self.delta_learning_rate = 0.005
            self.delta_max_depth = 1
            self.delta_feature_fraction = 0.1
            self.delta_bagging_fraction = 0.1
            self.delta_bagging_freq = 1
            self.max_evaluation = 30
            self.param_choice_fixed = {
                'n_estimators': 75,
                'learning_rate': 0.01,
                'num_leaves': 15,
                'feature_fraction': 0.6,
                'bagging_fraction': 0.6,
                'bagging_freq': 2,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc'
            }
        elif params['algo'] == Algo.BASIC:
            self._dataset_budget_threshold = 0.8
            self._max_train_data = 100000
            self.batch_size = 25000
            self.delta_n_estimators = 50
            self.delta_num_leaves = 10
            self.delta_learning_rate = 0.005
            self.delta_max_depth = 1
            self.delta_feature_fraction = 0.1
            self.delta_bagging_fraction = 0.1
            self.delta_bagging_freq = 1
            self.max_evaluation = 30
            self.param_choice_fixed = {
                'n_estimators': 75,
                'learning_rate': 0.01,
                'num_leaves': 15,
                'feature_fraction': 0.6,
                'bagging_fraction': 0.6,
                'bagging_freq': 2,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc'
            }

        self._train_data = np.array([])
        self._train_labels = np.array([])
        self._transformed_train_data = np.array([])
        self.best_hyperparams = {}
        self._classifier = None
        self._classifier2 = None
        self._data_processor = DataProcessor(info_dict)
        self._sampler = Sampler()

        self.mdl = StreamSaveRetrainPredictor()

    def fit(self, F, y, data_info, time_info):
        '''
        This function trains the model parameters.
        Args:
            F: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''

        info_dict = extract(data_info, time_info)
        print_time_info(info_dict)

        if params['algo'] == Algo.OLD_CODE:
            return self.mdl.partial_fit(F, y, data_info, time_info)
        elif params['algo'] == Algo.ORIGINAL:
            return self._original_fit(F, y, info_dict)
        elif params['algo'] == Algo.FACEBOOK_LR:
            return self._facebook_lr_fit(F, y, info_dict)
        elif params['algo'] == Algo.BASIC:
            return self._basic_fit(F, y, info_dict)

    def predict(self, F, data_info, time_info):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the metric)
        rather than predictions of the class labels themselves.
        The function predict eventually returns probabilities or continuous values.
        '''

        info_dict = extract(data_info, time_info)
        print_time_info(info_dict)

        if params['algo'] == Algo.OLD_CODE:
            return self.mdl.predict(F, data_info, time_info)
        elif params['algo'] == Algo.ORIGINAL:
            return self._original_predict(F, info_dict)
        elif params['algo'] == Algo.FACEBOOK_LR:
            return self._facebook_lr_predict(F, info_dict)
        elif params['algo'] == Algo.BASIC:
            return self._basic_predict(F, info_dict)

    def save(self, path="./"):
        # pickle requires binary mode
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
        return self

    def _original_fit(self, F, y, info_dict):
        data = self._convert_nan_to_num(F, info_dict)
        if self._data_processor.is_uninitialized:
            self._data_processor.preprocess(data)

        sampled_data, sampled_labels = self._sampler.majority_undersampling(
            data, y)

        if len(self._train_data) == 0 and len(self._train_labels) == 0:
            self._train_data = sampled_data
            self._train_labels = sampled_labels
        else:
            self._train_data = np.concatenate((self._train_data, sampled_data),
                                              axis=0)
            self._train_labels = np.concatenate(
                (self._train_labels, sampled_labels), axis=0)

    def _original_predict(self, F, info_dict):
        data = self._convert_nan_to_num(F, info_dict)
        if self._has_sufficient_time(info_dict) or self._classifier is None:
            self._data_processor.preprocess(data)
            self._data_processor.prepare_frequency_map()

            current_train_data = self._train_data
            current_train_labels = self._train_labels

            print('self._train_data.shape: {}'.format(self._train_data.shape))
            print('self._train_labels.shape: {}'.format(
                self._train_labels.shape))
            print('self._train_data.size: {}'.format(self._train_data.size))
            print('len(self._train_data): {}'.format(len(self._train_data)))
            print('self._max_train_data: {}'.format(self._max_train_data))

            if self._too_much_training_data():
                remove_percentage = 1.0 - (float(self._max_train_data) /
                                           len(self._train_data))
                print('remove_percentage: {}'.format(remove_percentage))
                current_train_data, current_train_labels = self._sampler.random_sample_in_order(self._train_data, \
                                                                                                self._train_labels.reshape(-1,1), \
                                                                                                remove_percentage)
                print('current_train_data.shape: {}'.format(
                    current_train_data.shape))
                print('current_train_labels: {}'.format(
                    current_train_labels.shape))
                self._train_data, self._train_labels = current_train_data, current_train_labels.reshape(
                    (-1, ))

                print('new self._train_data.shape: {}'.format(
                    self._train_data.shape))
                print('new self._train_labels.shape: {}'.format(
                    self._train_labels.shape))

            self._transformed_train_data = self._data_processor.transform_data(
                current_train_data)
            self._transformed_train_labels = current_train_labels
            if not self.best_hyperparams:
                self._find_best_hyperparameters()

            self._classifier = LGBMClassifier(random_state=20,
                                              min_data=1,
                                              min_data_in_bin=1)
            self._classifier.set_params(**self.best_hyperparams)
            self._classifier.fit(self._transformed_train_data,
                                 self._transformed_train_labels.ravel())

        if data.shape[0] <= self.batch_size:  # relatively small array
            probs = self._classifier.predict_proba(
                self._data_processor.transform_data(data))[:, 1]
            return probs
        else:
            print('BATCH')
            print('data.shape: {}'.format(data.shape))
            results = np.array([])  # chunk results to respect the memory limit
            for i in range(0, data.shape[0], self.batch_size):
                Xsplit = data[i:(i + self.batch_size), :]
                results = np.append(
                    results,
                    self._classifier.predict_proba(
                        self._data_processor.transform_data(Xsplit))[:, 1])
                del Xsplit

            print('results.shape: {}'.format(results.shape))
            return results

    def _facebook_lr_fit(self, F, y, info_dict):
        data = self._convert_nan_to_num(F, info_dict)
        if self._data_processor.is_uninitialized:
            self._data_processor.preprocess(data)

        sampled_data, sampled_labels = self._sampler.majority_undersampling(
            data, y)

        if len(self._train_data) == 0 and len(self._train_labels) == 0:
            self._train_data = sampled_data
            self._train_labels = sampled_labels
        else:
            self._train_data = np.concatenate((self._train_data, sampled_data),
                                              axis=0)
            self._train_labels = np.concatenate(
                (self._train_labels, sampled_labels), axis=0)

    def _facebook_lr_predict(self, F, info_dict):
        data = self._convert_nan_to_num(F, info_dict)
        if self._has_sufficient_time(
                info_dict
        ) or self._classifier is None or self._classifier2 is None:
            self._data_processor.preprocess(data)
            self._data_processor.prepare_frequency_map()

            current_train_data = self._train_data
            current_train_labels = self._train_labels

            print('self._train_data.shape: {}'.format(self._train_data.shape))
            print('self._train_labels.shape: {}'.format(
                self._train_labels.shape))
            print('self._train_data.size: {}'.format(self._train_data.size))
            print('len(self._train_data): {}'.format(len(self._train_data)))
            print('self._max_train_data: {}'.format(self._max_train_data))

            if self._too_much_training_data():
                remove_percentage = 1.0 - (float(self._max_train_data) /
                                           len(self._train_data))
                print('remove_percentage: {}'.format(remove_percentage))

                current_train_data, current_train_labels = self._sampler.random_sample_in_order(self._train_data, \
                                                                                                self._train_labels.reshape(-1,1), \
                                                                                                remove_percentage)
                print('current_train_data.shape: {}'.format(
                    current_train_data.shape))
                print('current_train_labels: {}'.format(
                    current_train_labels.shape))
                self._train_data, self._train_labels = current_train_data, current_train_labels.reshape(
                    (-1, ))

                print('new self._train_data.shape: {}'.format(
                    self._train_data.shape))
                print('new self._train_labels.shape: {}'.format(
                    self._train_labels.shape))

            self._transformed_train_data = self._data_processor.transform_data(
                current_train_data)
            self._transformed_train_labels = current_train_labels
            if not self.best_hyperparams:
                self._find_best_hyperparameters()

            self._classifier = LGBMClassifier(random_state=20,
                                              min_data=1,
                                              min_data_in_bin=1)
            self._classifier.set_params(**self.best_hyperparams)
            self._classifier.fit(self._transformed_train_data,
                                 self._transformed_train_labels.ravel())

            probs = self._classifier.predict(self._transformed_train_data,
                                             pred_leaf=True)
            new_probs = onehot_sparse(probs)
            self._classifier2 = LogisticRegression()
            self._classifier2.fit(new_probs,
                                  self._transformed_train_labels.ravel())
            del probs
            del new_probs

        if data.shape[0] <= self.batch_size:  # relatively small array
            probs = self._classifier.predict(
                self._data_processor.transform_data(data), pred_leaf=True)
            new_probs = onehot_sparse(probs)
            actual_probs = self._classifier2.predict_proba(new_probs)[:, 1]
            return actual_probs
        else:
            print('BATCH')
            print('data.shape: {}'.format(data.shape))
            results = np.array([])  # chunk results to respect the memory limit
            for i in range(0, data.shape[0], self.batch_size):
                Xsplit = data[i:(i + self.batch_size), :]
                probs = self._classifier.predict(
                    self._data_processor.transform_data(Xsplit),
                    pred_leaf=True)
                new_probs = onehot_sparse(probs)
                actual_probs = self._classifier2.predict_proba(new_probs)[:, 1]
                results = np.append(results, actual_probs)
                del Xsplit
                del probs
                del new_probs
                del actual_probs

            print('results.shape: {}'.format(results.shape))
            return results

    def _basic_fit(self, F, y, info_dict):
        data = self._convert_nan_to_num(F, info_dict)
        y = y.reshape((-1, ))
        # if self._data_processor.is_uninitialized:
        #     self._data_processor.preprocess(data)

        print('data.shape: {}'.format(data.shape))
        print('y.shape: {}'.format(y.shape))

        if self._has_sufficient_time(info_dict) or self._classifier is None:
            self._classifier = LGBMClassifier(random_state=20,
                                              min_data=1,
                                              min_data_in_bin=1)
            self._classifier.set_params(**self.param_choice_fixed)

            transformed_data = self._data_processor.simple_transform_data(data)
            self._classifier.fit(transformed_data, y)

    def _basic_predict(self, F, info_dict):
        data = self._convert_nan_to_num(F, info_dict)
        transformed_data = self._data_processor.simple_transform_data(data)
        probs = self._classifier.predict_proba(transformed_data)[:, 1]
        return probs

    def _convert_nan_to_num(self, F, info_dict):
        # Convert time and numerical nan
        data = F['numerical']
        data = np.nan_to_num(data)

        # Convert categorical nan
        if info_dict['no_of_categorical_features'] > 0:
            categorical_data = F['CAT'].fillna('nan').values
            data = np.concatenate((data, categorical_data), axis=1)
            del categorical_data

        # Convert mvc nan
        if info_dict['no_of_mvc_features'] > 0:
            mvc_data = F['MV'].fillna('nan').values
            data = np.concatenate((data, mvc_data), axis=1)
            del mvc_data
        return data

    def _has_sufficient_time(self, info_dict):
        return info_dict['dataset_time_spent'] < info_dict[
            'time_budget'] * self._dataset_budget_threshold

    def _too_much_training_data(self):
        return self._train_data.shape[0] > self._max_train_data

    def _find_best_hyperparameters(self):

        param_choice_fixed = self.param_choice_fixed

        autohyper = HyperparametersTuner(parameter_space=param_choice_fixed)
        best_score_choice1 = autohyper.fit(
            self._transformed_train_data,
            self._transformed_train_labels.ravel(), 1)

        # Get the AUC for the fixed-hyperparameter + Hyperopt combination on the internal validation set
        # Step 1: define the Hyperopt search space as a small delta region around the initial fixed hyperparameters
        n_estimators_low = param_choice_fixed[
            'n_estimators'] - self.delta_n_estimators
        n_estimators_high = param_choice_fixed[
            'n_estimators'] + self.delta_n_estimators

        learning_rate_low = np.log(0.001) if (
            param_choice_fixed['learning_rate'] - self.delta_learning_rate
        ) < 0.001 else np.log(param_choice_fixed['learning_rate'] -
                              self.delta_learning_rate)
        learning_rate_high = np.log(param_choice_fixed['learning_rate'] +
                                    self.delta_learning_rate)

        num_leaves_low = param_choice_fixed[
            'num_leaves'] - self.delta_num_leaves
        num_leaves_high = param_choice_fixed[
            'num_leaves'] + self.delta_num_leaves

        feature_fraction_low = np.log(0.05) if (
            param_choice_fixed['feature_fraction'] -
            self.delta_feature_fraction) < 0.05 else np.log(
                param_choice_fixed['feature_fraction'] -
                self.delta_feature_fraction)
        feature_fraction_high = np.log(1.0) if (
            param_choice_fixed['feature_fraction'] +
            self.delta_feature_fraction) > 1.0 else np.log(
                param_choice_fixed['feature_fraction'] +
                self.delta_feature_fraction)

        bagging_fraction_low = np.log(0.05) if (
            param_choice_fixed['bagging_fraction'] -
            self.delta_bagging_fraction) < 0.05 else np.log(
                param_choice_fixed['bagging_fraction'] -
                self.delta_bagging_fraction)
        bagging_fraction_high = np.log(1.0) if (
            param_choice_fixed['bagging_fraction'] +
            self.delta_bagging_fraction) > 1.0 else np.log(
                param_choice_fixed['bagging_fraction'] +
                self.delta_bagging_fraction)

        bagging_freq_low = 1 if (
            param_choice_fixed['bagging_freq'] - self.delta_bagging_freq
        ) < 1 else param_choice_fixed['bagging_freq'] - self.delta_bagging_freq
        bagging_freq_high = param_choice_fixed[
            'bagging_freq'] + self.delta_bagging_freq

        boosting_type = param_choice_fixed['boosting_type']
        objective = param_choice_fixed['objective']
        metric = param_choice_fixed['metric']

        # set the search space to be explored by Hyperopt
        param_space_forFixed = {
            'objective':
            "binary",
            'n_estimators':
            hp.choice(
                'n_estimators',
                np.arange(n_estimators_low, n_estimators_high, 50, dtype=int)),
            'num_leaves':
            hp.choice('num_leaves',
                      np.arange(num_leaves_low, num_leaves_high, 5,
                                dtype=int)),
            'feature_fraction':
            hp.loguniform('feature_fraction', feature_fraction_low,
                          feature_fraction_high),
            'bagging_fraction':
            hp.loguniform('bagging_fraction', bagging_fraction_low,
                          bagging_fraction_high),
            'bagging_freq':
            hp.choice(
                'bagging_freq',
                np.arange(bagging_freq_low,
                          bagging_freq_high + 1,
                          1,
                          dtype=int)),
            'learning_rate':
            hp.loguniform('learning_rate', learning_rate_low,
                          learning_rate_high),
            'boosting_type':
            boosting_type,
            'metric':
            metric,
            'verbose':
            -1
        }

        # run Hyperopt to search the nearby region in the hope of obtaining a better combination of hyper-parameters
        autohyper = HyperparametersTuner(max_evaluations=self.max_evaluation,
                                         parameter_space=param_space_forFixed)
        best_hyperparams_choice2, best_score_choice2 = autohyper.fit(
            self._transformed_train_data,
            self._transformed_train_labels.ravel(), 0)

        # Compare choice 1 & choice 2 and take the better one
        if best_score_choice1 >= best_score_choice2:
            self.best_hyperparams = param_choice_fixed
        else:
            self.best_hyperparams = best_hyperparams_choice2

        print('\nBest Hyperparams: {}\n'.format(self.best_hyperparams))
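
A hedged sketch of the AutoML-challenge-style driver loop this Model class implements (F, data_info, and time_info would come from the challenge harness and are assumed here):

model = Model(data_info, time_info)
model.fit(F_train, y_train, data_info, time_info)
probs = model.predict(F_test, data_info, time_info)  # P(y=1) per test row
model.save('./run1')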
class LGBMClassifierCV(object):
    """cross_val_predict"""
    def __init__(self, params=None, cv=5, cv_seed=None, n_repeats=None):
        self.clf = LGBMClassifier()
        self.cv = cv
        if params:
            self.clf.set_params(**params)
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(cv,
                                               shuffle=True,
                                               random_state=cv_seed)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(cv, shuffle=True, random_state=cv_seed)
            self._num_preds = cv

    def fit(self,
            X,
            y,
            X_test=None,
            feval=roc_auc_score,
            fix_valid_index=None,
            sample_weight=None,
            init_score=None,
            eval_metric='auc',
            early_stopping_rounds=300,
            verbose=100,
            feature_name='auto',
            categorical_feature='auto',
            callbacks=None):
        """
        :param X: 数组
        :param y:
        :param X_test:
        :param feval:
        :param fix_valid_index: 默认折外为验证集,可添加验证集范围(指定其在X里的index)
        :return:
        """
        self.best_info = {}
        self.feature_importances = 0
        if X_test is None:
            X_test = X[:1]

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))
        for n_fold, (train_index,
                     valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" %
                      (n_fold + 1, time.ctime()))

            # restrict the early-stopping validation set to the given X indices
            if fix_valid_index is not None:
                valid_index = list(set(fix_valid_index)
                                   & set(valid_index))  # offline + online validation sets

            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            ########################################################################
            self.clf.fit(X_train,
                         y_train,
                         sample_weight,
                         init_score,
                         eval_set,
                         eval_names=('Train', 'Valid'),
                         eval_sample_weight=None,
                         eval_class_weight=None,
                         eval_init_score=None,
                         eval_metric=eval_metric,
                         early_stopping_rounds=early_stopping_rounds,
                         verbose=verbose,
                         feature_name=feature_name,
                         categorical_feature=categorical_feature,
                         callbacks=callbacks)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]

            # best info
            self.best_info.setdefault('best_iteration',
                                      []).append(self.clf.best_iteration_)
            # TODO: support multi-class
            self.best_info.setdefault('best_score_train', []).append(
                self.clf.best_score_['Train']['auc'])
            self.best_info.setdefault('best_score_valid', []).append(
                self.clf.best_score_['Valid']['auc'])

            # feature importances
            self.feature_importances += self.clf.feature_importances_ / self.cv

            ########################################################################

        # test-set out-of-fold outputs
        self.oof_test_rank = (pd.DataFrame(self.oof_test).rank().mean(1) /
                              len(self.oof_test)).values
        self.oof_test = self.oof_test.mean(1)

        assert len(X) == len(self.oof_train)
        assert len(X_test) == len(self.oof_test)

        # Train-set OOF score: always compute it when feval is given,
        # so callers like opt_cv (which passes verbose=0) can still read self.oof_score
        if feval is not None:
            self.oof_score = feval(y, self.oof_train)
            if verbose > 0:
                print("\n\033[94mScore Info:\033[0m")
                print(f"\033[94m     {self.cv:>2} CV: {self.oof_score:.6f}\033[0m")

                _ = np.array(self.best_info['best_iteration'])
                print(
                    f"\033[94m      Iter: {_.mean():.0f} +/- {_.std():.0f}\033[0m")

                _ = np.array(self.best_info['best_score_valid'])
                print(
                    f"\033[94m     Valid: {_.mean():.6f} +/- {_.std():.6f} \033[0m\n"
                )

        return self.oof_score
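
    # A minimal usage sketch, kept as comments. The wrapper class name `OOFlgb`
    # and the data names are assumptions, since the class header and the data
    # sit outside this excerpt:
    #
    #   oof = OOFlgb(params={'n_estimators': 1000}, cv=5, cv_seed=777)
    #   score = oof.fit(X, y, X_test)   # OOF feval score on the training data
    #   oof.oof_train                   # out-of-fold predictions for X
    #   oof.oof_test                    # fold-averaged predictions for X_test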

    def oof_submit(self, ids, pred_ranking=False, file=None, preds=None):
        """preds藏分用"""
        if file is None:
            file = f'submit_cv{self.cv}_{self.oof_score}.csv'
        print(f'Save {file} ...')

        if preds is None:
            preds = self.oof_test_rank if pred_ranking else self.oof_test

        if not isinstance(ids, pd.DataFrame):
            ids = pd.DataFrame(ids)
        ids.assign(preds=preds).to_csv(file, index=False, header=False)
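
    # Usage sketch (the id column name is a hypothetical example):
    #   oof.oof_submit(ids=df_test['id'], pred_ranking=True, file='submit.csv')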

    @property
    def oof_train_and_test(self):
        return np.r_[self.oof_train, self.oof_test]

    def oof_save(self, file='./oof_train_and_test.csv'):
        pd.DataFrame(self.oof_train_and_test,
                     columns=['oof_train_and_test']).to_csv(file, index=False)

    def plot_feature_importances(self,
                                 feature_names=None,
                                 topk=20,
                                 figsize=None,
                                 pic_name=None):
        columns = ['Importances', 'Features']
        importances = self.feature_importances.tolist()
        if feature_names is None:
            feature_names = list(
                map(lambda x: f'F_{x}', range(len(importances))))

        _ = sorted(zip(importances, feature_names), reverse=True)
        self.df_feature_importances = pd.DataFrame(_, columns=columns)

        plt.figure(figsize=(14, topk // 5) if figsize is None else figsize)
        # pass x/y by keyword; extra positional arguments are deprecated in newer seaborn
        sns.barplot(x=columns[0],
                    y=columns[1],
                    data=self.df_feature_importances[:topk])
        plt.title('Features Importances\n')
        plt.tight_layout()
        if pic_name is None:
            pic_name = f'importances_{self.oof_score}.png'
        plt.savefig(pic_name)

    @classmethod
    def opt_cv(cls,
               X,
               y,
               X_test=None,
               cv_list=range(3, 16),
               params=None,
               cv_seed=777,
               topk=5):

        oofs = []
        for cv in tqdm(cv_list, desc='opt cv'):
            oof = cls(params, cv, cv_seed=cv_seed)
            oof.fit(X, y, X_test, verbose=0)
            oofs.append((oof.oof_score, cv, oof))

        return sorted(oofs)[::-1][:topk]
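
A quick sketch of driving `opt_cv` end to end; the class name `OOFlgb` and the
synthetic dataset are illustrative assumptions, not part of the original snippet:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
# returns the topk (oof_score, cv, fitted_wrapper) triples, best score first
top = OOFlgb.opt_cv(X, y, cv_list=range(3, 8), cv_seed=777, topk=3)
for score, cv, oof in top:
    print(f'cv={cv}: oof score = {score:.6f}')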
Exemple #18
0
def lgbm(df, predictions_path, test_path, model_path, acct_id, summary,
         dock_path, rp_dir, root_dir):
    X_train, y_train = train_model(df)
    ch = st.radio("Choose From", ('Basic Parameters', 'Enter Manually'))
    if ch == 'Basic Parameters':
        n_estimators = st.number_input(
            label='Enter Number of Estimators (Integer)',
            value=100,
            min_value=2)
        random_state = st.number_input(label='Enter Random State (Integer)',
                                       value=0,
                                       min_value=0)
        max_depth = st.number_input(label='Enter Depth of Tree (Integer)',
                                    value=-1)
        learning_rate = st.text_input(label='Enter learning rate',
                                      value='0.01',
                                      max_chars=10,
                                      type='default')
        subsample = st.number_input(label='Enter value for subsample',
                                    value=1.0,
                                    min_value=0.0)
        num_leaves = st.number_input(label='Enter number of leaves (Integer)',
                                     value=31,
                                     min_value=2)
        reg_alpha = st.text_input(label='Enter value for reg alpha',
                                  value='0',
                                  max_chars=10,
                                  type='default')
        reg_lambda = st.text_input(label='Enter value for reg lambda',
                                   value='0',
                                   max_chars=10,
                                   type='default')
        class_weight = st.text_input(
            label='Enter class weights in dictionary format',
            value='balanced',
            max_chars=20,
            type='default')
        boosting_type = st.selectbox(label='Select boosting type',
                                     options=['gbdt', 'dart', 'goss', 'rf'])

        if class_weight != 'balanced':
            # literal_eval parses Python literals only, unlike eval (requires `import ast`)
            class_weight = ast.literal_eval(class_weight)

        if st.checkbox(label='Train Model'):
            lgb = LGBMClassifier(max_depth=max_depth,
                                 subsample=subsample,
                                 random_state=random_state,
                                 num_leaves=num_leaves,
                                 n_estimators=n_estimators,
                                 # st.text_input returns strings, so cast the numeric fields
                                 learning_rate=float(learning_rate),
                                 reg_alpha=float(reg_alpha),
                                 reg_lambda=float(reg_lambda),
                                 class_weight=class_weight,
                                 boosting_type=boosting_type)
            clf = lgb.fit(X_train, y_train)
            scores = cross_val_score(clf, X_train, y_train, cv=5)
            st.write('Cross-validation scores: ' + str(scores))
            st.write('Mean cross-validation accuracy: ' +
                     str(scores.mean() * 100))
            summary['cross-val scores'] = str(scores)
            mod_spec = str(clf).split('(')[0] + ": " + str(clf.get_params())
            summary['model specs'] = str(mod_spec)
            save_summary(summary, dock_path)
            st.success('Model Training Completed!')

        if st.checkbox(label='See predictions'):
            make_predictions(predictions_path, test_path, model_path, clf,
                             summary, dock_path)

            if st.button("Generate Test Files"):
                with st.spinner("Execution in Progress"):
                    os.system('python ' + rp_dir + '/PMML_creation.py ' +
                              str(acct_id) + " " + str(root_dir))
                    os.system('python ' + rp_dir + '/generate_Test_Files.py ' +
                              str(acct_id) + " " + str(root_dir))
                    st.success("Test Files Generated")
                    st.success("Account ready for Deployment")
    else:
        params = st.text_input(label='Enter Best Parameters')
        mod_o = LGBMClassifier()
        if params == "":
            rs_params = hyper_tune(mod_o, df)

        elif params != "":
            if st.checkbox(label='Train Model'):
                st.text("Training Model with user defined Parameters")
                params = ast.literal_eval(params)  # safer than eval on user input
                st.text(params)
                lgb = LGBMClassifier()
                lgb = lgb.set_params(**params)
                clf = lgb.fit(X_train, y_train)

                scores = cross_val_score(clf, X_train, y_train, cv=5)
                st.write('Cross-validation scores: ' + str(scores))
                st.write('Mean cross-validation accuracy: ' +
                         str(scores.mean() * 100))
                summary['cross-val scores'] = str(scores)
                mod_spec = str(clf).split('(')[0] + ": " + str(
                    clf.get_params())
                summary['model specs'] = str(mod_spec)
                save_summary(summary, dock_path)
                st.success('Model Training Completed!')

            if st.checkbox(label='See predictions'):
                make_predictions(predictions_path, test_path, model_path, clf,
                                 summary, dock_path)

                if st.button("Generate Test Files"):
                    with st.spinner("Execution in Progress"):
                        os.system('python ' + rp_dir + '/PMML_creation.py ' +
                                  str(acct_id) + " " + str(root_dir))
                        os.system('python ' + rp_dir +
                                  '/generate_Test_Files.py ' + str(acct_id) +
                                  " " + str(root_dir))
                        st.success("Test Files Generated")
                        st.success("Account ready for Deployment")

        if st.checkbox(label='Show Help Text?'):
            with st.expander("FAQ"):  # st.beta_expander was renamed to st.expander
                st.write("Please enter parameters in the following format:")
                st.text("{'boosting_type': 'gbdt', 'class_weight': {0: 1, 1: 5}, "
                        "'colsample_bytree': 1.0, 'learning_rate': 0.08}")
                st.write(
                    "If you wish to enter a range of params for hyper tuning:")
                st.text(
                    "{'num_leaves': randint(6, 50), 'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100]}"
                )
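
For the manual-parameters path above, here is a standalone sketch of the
parse-and-fit step; the dataset and the parameter string are illustrative
assumptions, and `ast.literal_eval` stands in for `eval` on user input:

import ast

from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X_train, y_train = make_classification(n_samples=500, n_features=10, random_state=0)

param_text = "{'boosting_type': 'gbdt', 'learning_rate': 0.08, 'num_leaves': 31}"
params = ast.literal_eval(param_text)  # parses literals only; rejects arbitrary code

clf = LGBMClassifier(random_state=0).set_params(**params)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train, cv=5)
print('cross-validation scores:', scores)
print('mean accuracy:', scores.mean() * 100)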