Example #1
from mlxtend.classifier import EnsembleVoteClassifier


class VotingModel:
    def __init__(self, X, y, x_test, model_lists):
        # refit=False: the models in model_lists must already be fitted.
        # Note: the hard-coded weights assume exactly three base models.
        self.model = EnsembleVoteClassifier(clfs=model_lists,
                                            weights=[1, 1, 1],
                                            refit=False,
                                            voting='soft')
        self.X = X
        self.y = y
        self.X_test = x_test

    def train(self):
        self.model.fit(self.X, self.y)

    def predict(self):
        return self.model.predict(self.X_test)

    def predict_proba(self):
        return self.model.predict_proba(self.X_test)
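
A minimal usage sketch (every concrete name below is an illustrative assumption, not part of the original snippet). Because refit=False, the base models must be fitted before they are handed to VotingModel:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Hypothetical data and base classifiers, pre-fitted as refit=False requires
X, y = make_classification(n_samples=200, random_state=0)
X_tr, X_te, y_tr = X[:150], X[150:], y[:150]
base_models = [m.fit(X_tr, y_tr)
               for m in (LogisticRegression(max_iter=1000),
                         GaussianNB(),
                         DecisionTreeClassifier(random_state=0))]

vm = VotingModel(X_tr, y_tr, X_te, base_models)
vm.train()                  # with refit=False this only wires up the ensemble
proba = vm.predict_proba()  # soft-vote class probabilities for X_te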
Example #2
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from mlxtend.classifier import EnsembleVoteClassifier


class ModelTrustRegression:
    def __init__(self, model, n_neighbors=20, weights='uniform', n_folds=5):
        self.template_model = model
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.n_folds = n_folds
        self.fold_regressions = []
        self.fold_models = []
        self.bagger = None

    def fit(self, X, values):
        # For every fold: train a fresh model, then fit a KNN regressor on
        # whether that model's hard predictions on the held-out fold were correct
        for train_index, validation_index in KFold(n_splits=self.n_folds).split(X):
            train_set = X[train_index]
            train_values = values[train_index]

            validation_set = X[validation_index]
            validation_values = values[validation_index]

            fold_model = clone(self.template_model)
            fold_model.fit(train_set, train_values) #retrains a brand new model for the fold

            fold_regressor = KNeighborsRegressor(weights=self.weights, n_neighbors=self.n_neighbors)
            # Regression target is a boolean correctness signal, treated as 0/1
            fold_regressor.fit(validation_set, fold_model.predict(validation_set) == validation_values)
            self.fold_regressions.append(fold_regressor)
            self.fold_models.append(fold_model)

        self.bagger = EnsembleVoteClassifier(clfs=self.fold_models, voting="soft", refit=False)
        self.bagger.fit(X, values)  # trivial fit: the fold models are already trained (refit=False)

    def predict(self, X):
        # Mean trust estimate across the per-fold KNN regressors: the estimated
        # probability that the underlying model's prediction is correct
        return np.mean([fm.predict(X) for fm in self.fold_regressions], axis=0)

    def predict_proba(self, X):
        return self.bagger.predict_proba(X)

    def get_bagger(self):
        return self.bagger
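
A minimal usage sketch on hypothetical synthetic data, with a logistic regression standing in for the template model. predict() returns the KNN-smoothed per-sample trust estimate (how likely the model's hard prediction is to be correct), while predict_proba() returns the soft-voted class probabilities of the fold models:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Illustrative data; any classifier with fit/predict/predict_proba would do
X, y = make_classification(n_samples=300, random_state=1)
mtr = ModelTrustRegression(LogisticRegression(max_iter=1000), n_neighbors=10)
mtr.fit(X, y)
trust = mtr.predict(X)        # estimated probability each prediction is correct
proba = mtr.predict_proba(X)  # class probabilities from the bagged fold models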
Example #3
# (truncated: clf_0, clf_1 and the start of clf_2_params are not shown;
# train, Y and test are defined earlier in the original script)
clf_2_params = {
    'nthread': 4,
    'silent': 1,
    'subsample': 0.6,
    'reg_lambda': 0.89,
    'gamma': 0.1,
    'min_child_weight': 49.8,
    'colsample_bytree': 0.8,
    'n_estimators': 2790,
}
clf_2 = xgb.XGBClassifier(**clf_2_params)

clf_3_params = {
    'learning_rate': 0.0065,
    'max_depth': 5,
    'nthread': 4,
    'silent': 1,
    'subsample': 0.621,
    'reg_lambda': 0.726,
    'gamma': 0.053,
    'min_child_weight': 30.8,
    'colsample_bytree': 0.905,
    'n_estimators': 958,
}
clf_3 = xgb.XGBClassifier(**clf_3_params)

pipeline = EnsembleVoteClassifier(clfs=[clf_0, clf_1, clf_2, clf_3], weights=[1, 1, 1, 1], voting='soft')
pipeline.fit(train, Y)

y_pred = pipeline.predict_proba(test[test.columns])
pd.Series(y_pred[:, 1]).to_csv('answer.csv', index=False)
Example #4
# (truncated: clf_0 through clf_3 and the start of clf_4_params are not shown)
clf_4_params = {
    'reg_lambda': 0.88,
    'gamma': 0.15,
    'min_child_weight': 67,
    'colsample_bytree': 0.77,
    'n_estimators': 904,
}
clf_4 = xgb.XGBClassifier(**clf_4_params)

# (0, 1, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21)
# (0, 1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22) - RF
# (3, 4, 5, 6, 7, 8, 9, 10, 11, 19, 20, 22, 23)

# best_columns = [
#     'gender', 'height', 'ap_hi', 'ap_lo', 'smoke', 'alco', 'active', 'age_y',
#     'ch_1', 'ch_2', 'ch_3', 'gl_1', 'gl_2', 'gl_3',
#     'bmi', 'sist_formula', 'map', 'F_score', 'ap_log'
# ]

best_columns = [
    'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco',
    'active', 'age_y', 'bmi', 'sist_formula', 'map', 'F_score'
]

pipeline = EnsembleVoteClassifier(clfs=[clf_0, clf_1, clf_2, clf_3, clf_4],
                                  weights=[1, 1, 1, 1, 1],
                                  voting='soft')
pipeline.fit(train[best_columns], Y)

y_pred = pipeline.predict_proba(test[best_columns])
pd.Series(y_pred[:, 1]).to_csv('answer.csv', index=False)
Example #5
import datetime
import os
import re
import shutil
from collections import OrderedDict

import jenkspy
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Utastar, psopt and ComplexRadar are project-local helpers and are assumed
# to be importable from the surrounding package.


class MulticriteriaEnsemble(object):
    def __init__(self,
                 models=None,
                 dataset=None,
                 pickle_path=None,
                 crit_metrics=None,
                 global_metric=None,
                 delta=None,
                 epsilon=None,
                 a=None,
                 bootstrap_models=None,
                 n_splits=5,
                 voting='soft',
                 jenks=True,
                 jenks_limit=2,
                 refit=False):
        # None defaults instead of mutable OrderedDict() defaults, which
        # would be shared across instances
        self.models = models if models is not None else OrderedDict()
        self.bootstrap_models = (bootstrap_models if bootstrap_models is not None
                                 else OrderedDict())
        self.dataset = dataset
        self.crit_metrics = crit_metrics
        self.global_metric = global_metric
        self.delta = delta
        self.best_delta = None
        self.epsilon = epsilon
        self.a = a
        self.voting = voting
        self.n_splits = n_splits
        self.refit = refit
        # Honor an explicit pickle_path; otherwise derive it from the dataset,
        # which must exist at this point
        if self.dataset is None:
            raise Exception('Dataset is not provided.')
        self.pickle_path = (pickle_path if pickle_path is not None
                            else self.dataset.path + 'base_learners/')
        self.multicriteria_table = None
        self.meta_table = None
        self.utastar_model = None
        self.wmv_model = None
        self.natural_breaks = None
        self.weights = []
        self.global_utilities = []
        self.kfold_indices = []
        self.test_kfold_indices = []
        self.global_metrics = []
        self.is_fit = {
            'wmv': False,
            'clfs': not self.refit,
            'utastar': False,
        }
        self.jenks = jenks
        self.jenks_limit = jenks_limit
        if not self.models and refit:
            raise Exception('Base learners are not provided.')
        elif self.models and not refit:
            raise Exception(
                'The models parameter must not be set when refit=False.')
        if self.crit_metrics is None:
            raise Exception('Performance estimators are not provided.')
        if self.global_metric is None:
            raise Exception('Global performance estimator is not provided.')
        if self.delta is None or self.a is None or self.epsilon is None:
            raise Exception(
                'One or more utastar model parameters are not provided.')

    def _pso_cost(self, x):
        self.delta = x[0]
        self.epsilon = x[1]
        if self.is_fit['wmv']:
            self.fit(mtable=False)
        else:
            self.fit()
        return 1 - self.score()

    def pso(self, bounds, num_particles, w, c1, c2, maxiter, threshold):
        psopt(self._pso_cost, bounds, num_particles, w, c1, c2, maxiter,
              threshold)

    def _save_model(self, model, file_name):
        print('Saving Model!')
        if os.path.isfile(self.pickle_path + file_name):
            # Archive the existing pickle before overwriting it
            if not os.path.exists(self.pickle_path + 'Archive/'):
                os.makedirs(self.pickle_path + 'Archive/')
            archived_file_name = self.pickle_path + 'Archive/' + file_name.replace(
                '.pkl', '_') + datetime.datetime.today().strftime(
                    "%m-%d-%Y-%H%M%S") + '.pkl'
            shutil.move(self.pickle_path + file_name, archived_file_name)
        joblib.dump(model, self.pickle_path + file_name)
        print('Model Saved!!!')

    #Reinitialize crucial variables
    def _reset(self):
        self.global_utilities = []
        self.weights = []
        self.kfold_indices = []
        if self.refit:
            self.bootstrap_models = OrderedDict()
            print('Multicriteria Table Deleted!!!')
            self.multicriteria_table = None
            self.meta_table = None

    #Split dataset to k stratified folds and save the indices
    def _skfold(self, n_splits):
        skf = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=12345)
        for train_index, test_index in skf.split(self.dataset.X_train,
                                                 self.dataset.y_train):
            self.kfold_indices.append(train_index.tolist())
            self.test_kfold_indices.append(test_index.tolist())

    #Fit the base learners
    def _fit_clfs(self):
        #If the path that the models will be saved does not exist create it
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        #For every fold
        for k_idx, k in enumerate(self.kfold_indices):
            #Make a copy of the base learners
            temp_models = OrderedDict(
                zip(self.models.keys(), clone(list(self.models.values()))))
            #For every base learner, create a separate copy, train it on the current fold and save it
            for model in temp_models.keys():
                model_name = '%s_%s_FOLD%i' % (model.replace(
                    '_' + self.dataset.name, ''), self.dataset.name, k_idx)
                temp_models[model].fit(self.dataset.X_train.iloc[k],
                                       self.dataset.y_train.iloc[k])
                file_name = model_name + '.pkl'
                self._save_model(temp_models[model], file_name)
                self.bootstrap_models[model_name] = temp_models[model]

        #Rename the base learners to include the dataset name, then fit and save them
        if self.models and self.dataset.name not in list(self.models.keys())[0]:
            self.models = self._rename_models(self.models)
        for model in self.models.keys():
            self.models[model].fit(self.dataset.X_train, self.dataset.y_train)
            self._save_model(self.models[model], model + '.pkl')

    #Fit the utastar model
    def _fit_utastar(self):
        #Define the Utastar model
        self.utastar_model = Utastar(self.multicriteria_table, self.meta_table,
                                     self.delta, self.epsilon)
        #Fit the Utastar model
        self.utastar_model.solve()

    def _get_global_utilities(self):
        metrics = self._get_metrics(self.bootstrap_models, on='test')
        self._utastar_predict(metrics)

    #Fit the Weighted Majority Voting model
    def _fit_wmv(self):
        #Use the per-fold bootstrap models as the voting pool
        models = list(self.bootstrap_models.values())
        #Define the Weighted Majority Voting model
        self.wmv_model = EnsembleVoteClassifier(clfs=models,
                                                weights=self.weights,
                                                voting=self.voting,
                                                refit=False)
        #Fit the WMV model
        self.wmv_model.fit(self.dataset.X_train, self.dataset.y_train)

    #Fit the Multicriteria Ensemble Model
    def fit(self, mtable=True):
        #Reinitilize crucial variables
        self._reset()
        #Get Stratified K-Fold indices
        self._skfold(self.n_splits)
        #if refit is needed,fit the models
        if self.refit:
            self._fit_clfs()
            self.is_fit['clfs'] = True
        else:
            #Otherwise load previously saved models from disk
            try:
                for base_learner in next(os.walk(self.pickle_path))[2]:
                    if 'FOLD' in base_learner:
                        self.bootstrap_models[base_learner.replace(
                            '.pkl', '')] = joblib.load(self.pickle_path +
                                                       '%s' % base_learner)
                    else:
                        self.models[base_learner.replace(
                            '.pkl', '')] = joblib.load(self.pickle_path +
                                                       '%s' % base_learner)
                # Probe that at least two fold models were actually loaded
                dummy_var = list(self.bootstrap_models.keys())[1]
            except (StopIteration, IndexError):
                raise AttributeError(
                    'Refit is set to False but no models are given.')
        if not mtable and self.multicriteria_table is None:
            raise Exception(
                'Multicriteria table not found. Please run fit(mtable=True) at least once.'
            )
        elif mtable:
            print('Multicriteria table formed!!!')
            self._get_meta_table()
            self._get_multicriteria_table()
        self._fit_utastar()
        self._get_global_utilities()
        self._get_clfs_weights()
        self._fit_wmv()
        self.is_fit['wmv'] = True

    def predict(self, X):
        return self.wmv_model.predict(X)

    def predict_proba(self, X):
        return self.wmv_model.predict_proba(X)

    def _get_clfs_weights(self):
        gu = self.global_utilities
        if self.jenks:
            self.natural_breaks = jenkspy.jenks_breaks(gu, nb_class=5)
            gu = [
                i if i >= self.natural_breaks[-self.jenks_limit] else 0
                for i in gu
            ]
        gu_sum = sum(gu)
        for value in gu:
            self.weights.append(value / gu_sum)

    def add_clfs(self, clfs, refit=False):
        clfs = self._rename_models(clfs)
        if set(self.models.keys()).isdisjoint(clfs.keys()):
            if not refit:
                metrics = self._get_metrics(clfs)
                self.models.update(clfs)
            else:
                temp_models = {}
                for clf in clfs.keys():
                    temp_models[clf] = clone(clfs[clf])
                    temp_models[clf].fit(self.dataset.X_train,
                                         self.dataset.y_train)
                metrics = self._get_metrics(temp_models)
                self.models.update(temp_models)
            self._utastar_predict(metrics)
            self.weights = []
            self._get_clfs_weights()
            self._fit_wmv()
        else:
            raise Exception('One or more models are already in the ensemble.')

    def score(self):
        return self._get_global_metrics({'wmv': self.wmv_model}, on='test')[0]

    def _utastar_predict(self, metrics):
        for clf_metrics in metrics:
            pred_partial_util = []
            for crit in self.utastar_model.criteria:
                X = self.utastar_model.intervals[crit]
                y = self.utastar_model.marginal_post[crit]
                pred_partial_util.append(
                    np.interp(
                        clf_metrics[
                            self.utastar_model.criteria.tolist().index(crit) +
                            1], X, y))
            pred_global_util = np.array(pred_partial_util).dot(
                np.array(clf_metrics[1:]))
            self.global_utilities.append(pred_global_util)

    def _rename_models(self, models):
        #Iterate over a snapshot of the keys; popping while iterating a live
        #dict view raises RuntimeError in Python 3
        for model in list(models.keys()):
            model_name = '%s_%s' % (model, self.dataset.name)
            models[model_name] = models.pop(model)
        return models

    def plot_partial_utilities(self):
        n = len(self.utastar_model.criteria)
        rows = n // 2 if n % 2 == 0 else n // 2 + 1
        fig1, axs = plt.subplots(rows, 2, figsize=(18, 18))
        for i in range(n):
            crit = self.utastar_model.criteria[i]
            y = self.utastar_model.marginal_post[crit]
            x = self.utastar_model.intervals[crit]
            ax = axs[i // 2, i % 2]
            #Discrete criteria (type 1) are drawn dashed, continuous ones solid
            style = '--ok' if self.utastar_model.get_type(crit) == 1 else '-ok'
            ax.plot(x, y, style)
            ax.set_title(crit)
            ax.set_xticks(x)
            ax.set_xlim(x[0], x[-1])
            ax.set_ylabel(r'$u_{%d}(g_{%d})$' % ((i + 1), (i + 1)))
            ax.yaxis.grid(False)
            #Decreasing criteria get a reversed x-axis
            if self.utastar_model.get_monotonicity(crit) == 1:
                ax.set_xlim(x[-1], x[0])
        if n % 2 != 0:
            #Show the tick labels of the last populated axis and drop the unused one
            for l in axs[n // 2 - 1, 1].get_xaxis().get_majorticklabels():
                l.set_visible(True)
            fig1.delaxes(axs[n // 2, 1])
        #plt.subplots_adjust(wspace = 0.3,hspace = 0.3)
        plt.tight_layout()
        plt.show()

    def plot_global_utilities(self):
        fig4 = plt.figure(4)
        ax = fig4.gca()
        ax.barh(range(len(self.utastar_model.global_utilities_post))[::-1],
                list(self.utastar_model.global_utilities_post.values()),
                align='center',
                color='grey',
                alpha=0.8)
        plt.yticks(
            range(len(self.utastar_model.global_utilities_post))[::-1],
            list(self.utastar_model.global_utilities_post.keys()))
        ax.plot(list(self.utastar_model.global_utilities_post.values()),
                range(len(self.utastar_model.global_utilities_post))[::-1],
                linestyle='--',
                color='black',
                alpha=0.8)
        plt.xlim(0, 1)
        plt.title('Ranking')
        plt.tight_layout()
        plt.show()

    def plot_global_utilities_pred(self):
        fig4 = plt.figure(4)
        ax = fig4.gca()
        ax.barh(range(len(self.global_utilities))[::-1],
                self.global_utilities,
                align='center',
                color='grey',
                alpha=0.8)
        plt.yticks(
            range(len(self.global_utilities))[::-1],
            list(self.bootstrap_models.keys()))
        ax.plot(self.global_utilities,
                range(len(self.global_utilities))[::-1],
                linestyle='--',
                color='black',
                alpha=0.8)
        plt.xlim(0, 1)
        plt.title('Ranking')
        plt.tight_layout()
        plt.show()

    def plot_criteria_weights(self):
        variables = list(self.utastar_model.model_weights_post.keys())
        data = list(self.utastar_model.model_weights_post.values())
        ranges = [
            (0.00001,
             0.00001 + max(self.utastar_model.model_weights_post.values()))
        ] * len(self.utastar_model.criteria)
        fig1 = plt.figure(figsize=(10, 10))
        radar = ComplexRadar(fig1, variables, ranges, 7)
        radar.plot(data)
        #dradar.fill(data, alpha=0.2, color='grey')
        plt.show()

    def plot_model_weights(self, title):
        sns.set(style="whitegrid")
        f, ax = plt.subplots(figsize=(10, 4))
        variables = dict(
            sorted(zip(self.bootstrap_models.keys(), self.weights)))
        sns.set_color_codes("pastel")
        sns.barplot(x=list(variables.keys()), y=list(variables.values()),
                    color="b").set_title(title)
        ax.set_xticklabels(ax.get_xticklabels(),
                           rotation=45,
                           fontdict={
                               'verticalalignment': 'baseline',
                               'horizontalalignment': 'right'
                           })
        ax.set(xlim=(-1, 30), ylabel="Weight", xlabel="Models")
        sns.despine(left=True, bottom=True)

    def _get_meta_table(self):
        columns = [
            'Cri/atributes', 'Monotonicity', 'Type', 'Worst', 'Best', 'a'
        ]
        meta_table = []
        for metric in self.crit_metrics.keys():
            monotonicity = 1
            if self.crit_metrics[metric][0]._sign == -1:
                monotonicity = 0
                self.crit_metrics[metric][0]._sign = 1
            mt_metric = [
                metric, monotonicity, 0, self.crit_metrics[metric][1],
                self.crit_metrics[metric][2], self.a
            ]
            meta_table.append(mt_metric)
        self.meta_table = pd.DataFrame(meta_table, columns=columns)

    def _get_multicriteria_table(self):
        criteria = self.crit_metrics.keys()
        columns = ['Alt/Cri ']
        columns.extend(criteria)
        #metrics_orig = self._get_metrics(self.models)
        metrics_bootstrap = self._get_metrics(self.bootstrap_models,
                                              on='validation')
        metrics = metrics_bootstrap
        multicriteria_table = pd.DataFrame(metrics, columns=columns)
        ranking = self._get_init_ranking()
        ranking = pd.DataFrame(ranking, columns=['Ranking'])
        self.multicriteria_table = multicriteria_table.join(ranking).copy(
            deep=True)

    def _get_dataset(self, model, on='test'):
        if on == 'test':
            X, y = self.dataset.X_test.copy(), self.dataset.y_test.copy()
        elif on == 'validation':
            X, y = self.dataset.X_train.copy(), self.dataset.y_train.copy()
            if 'FOLD' in model:
                fold_idx = int(re.search(r'(?<=FOLD)[0-9]+', model).group(0))
                indices = self.test_kfold_indices[fold_idx]
                X, y = X.iloc[indices], y.iloc[indices]
        elif on == 'train':
            X, y = self.dataset.X_train, self.dataset.y_train
            if 'FOLD' in model:
                fold_idx = int(re.search(r'(?<=FOLD)[0-9]+', model).group(0))
                indices = self.kfold_indices[fold_idx]
                X, y = X.iloc[indices], y.iloc[indices]
        else:
            raise Exception('Unexpected input for argument on.')
        return X, y

    def _get_global_metrics(self, models, on='test'):
        global_metrics = []
        for model in models.keys():
            X, y = self._get_dataset(model, on=on)
            global_metrics.append(self.global_metric(models[model], X, y))
        return global_metrics

    def _get_init_ranking(self):
        #gm_orig = self._get_global_metrics(self.models,on='validation')
        gm_bootstrap = self._get_global_metrics(self.bootstrap_models,
                                                on='validation')
        #gm = gm_orig + gm_bootstrap
        self.global_metrics = gm_bootstrap
        if self.global_metric._sign == 1:
            ranking = len(self.global_metrics) - scipy.stats.rankdata(
                self.global_metrics, method='max')
        else:
            ranking = scipy.stats.rankdata(self.global_metrics, method='max')
        return ranking

    def _get_metrics(self, models, on='test'):
        metrics = []
        for model in models.keys():
            model_metrics = [model]
            X, y = self._get_dataset(model, on=on)
            for metric in self.crit_metrics.keys():
                mes = self.crit_metrics[metric][0](models[model], X, y)
                #Takes care of negative values on the multicriteria table and replaces them with 0
                if mes > 0:
                    model_metrics.append(mes)
                else:
                    model_metrics.append(0)
            metrics.append(model_metrics)
        return metrics
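
A construction sketch under stated assumptions: dataset must be a project-specific wrapper exposing X_train/y_train/X_test/y_test plus path and name attributes, crit_metrics must map metric names to (scorer, worst, best) tuples whose scorers carry a _sign attribute, and global_metric must be one such scorer, mirroring how the class reads them above. Every concrete value below is hypothetical:

from collections import OrderedDict
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

ens = MulticriteriaEnsemble(
    models=OrderedDict([('rf', RandomForestClassifier(random_state=0)),
                        ('lr', LogisticRegression(max_iter=1000))]),
    dataset=dataset,              # hypothetical dataset wrapper (see above)
    crit_metrics=crit_metrics,    # mapping of name -> (scorer, worst, best)
    global_metric=global_metric,  # scorer used for the initial ranking
    delta=0.05, epsilon=0.001, a=4,
    refit=True)                   # refit=True trains the per-fold base learners
ens.fit()          # builds the multicriteria table, UTASTAR weights and WMV model
proba = ens.predict_proba(ens.dataset.X_test)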
Example #6
eclf_soft = EnsembleVoteClassifier(clfs=[
    # ...preceding base classifiers truncated in the original snippet...
    extra_calibrator
],
                                   weights=weights,
                                   refit=False,
                                   voting='soft',
                                   verbose=1)

print('fitting')
#eclf_hard.fit(train_xm, train_y)
eclf_soft.fit(train_xm, train_y)
print('predicting')
#eclf_hard_pred = eclf_hard.predict(val_xm)
#eclf_hard_pred_pr = eclf_hard.predict_proba(val_xm)

eclf_soft_pred = eclf_soft.predict(val_xm)
eclf_soft_pred_pr = eclf_soft.predict_proba(val_xm)

#evaluating majority voting
voter_pred = eclf_soft_pred
voter_pred_pr = eclf_soft_pred_pr

acc_voter = accuracy_score(val_y, voter_pred)
roc_voter = roc_auc_score(val_y, voter_pred_pr[:, 1])
f1_voter = f1_score(val_y, voter_pred)
precision_voter = precision_score(val_y, voter_pred)
recall_voter = recall_score(val_y, voter_pred)
log_loss_voter = log_loss(val_y, voter_pred_pr)
print('accuracy voter: ', acc_voter)
print('roc voter: ', roc_voter)
print('f1 voter: ', f1_voter)
print('precision voter: ', precision_voter)
print('recall voter: ', recall_voter)
print('log loss voter: ', log_loss_voter)
Example #7
# The original snippet relies on globals defined elsewhere in the script:
# `features`, `columns`, `le` (a fitted LabelEncoder), `ss` (a fitted scaler)
# and `plot_roc_curve`.
import catboost as cbt
import lightgbm as lgb
import pandas as pd
import xgboost as xgb
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


def model_ensemble_voting(x_train, y_train, x_test, y_test):
    xg = xgb.XGBClassifier(booster='gbtree',
                           objective='binary:logistic',
                           seed=2020)

    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=55,
                             reg_alpha=0.0,
                             reg_lambda=1,
                             max_depth=15,
                             n_estimators=6000,
                             objective='binary',
                             subsample=0.8,
                             colsample_bytree=0.8,
                             subsample_freq=1,
                             learning_rate=0.2,
                             min_child_weight=1,
                             random_state=20,
                             n_jobs=4)
    glf = cbt.CatBoostClassifier(depth=2, loss_function='Logloss')
    rf = RandomForestClassifier(n_jobs=4)
    lr = LogisticRegression(max_iter=1000)
    clfs = [xg, clf, glf, rf, lr]
    print("*******start train*******")
    # print("x_train shape :{} y_train shape:{}".format(x_train.shape, y_train.shape))
    # x_train = pd.DataFrame(data=x_train,columns=features.columns)
    # for clf in clfs:
    #     clf.fit(x_train, y_train)
    # print("********Test on test data*************")
    # x_test = pd.DataFrame(data=x_test, columns=features.columns)
    # y_vote_value = []
    # for clf in clfs:
    #     y_pre = clf.predict_proba(x_test)
    #     y_pre = y_pre[:, 1].reshape(-1, 1)
    #     plot_roc_curve(y_test, y_pre)
    #     y_vote_value.append(y_pre)
    x_train = pd.DataFrame(data=x_train, columns=features.columns)
    vote_class = EnsembleVoteClassifier(clfs=clfs,
                                        weights=[2, 1, 1, 1, 5],
                                        voting='soft')
    vote_class.fit(x_train, y_train)
    x_test = pd.DataFrame(data=x_test, columns=features.columns)
    y_pre = vote_class.predict_proba(x_test)
    y_pre = y_pre[:, 1].reshape(-1, 1)
    plot_roc_curve(y_test, y_pre)
    print("********Predict on really data*************")
    data = pd.read_csv('test.csv')
    data['YearsAtCompany_Less_One_or_More_20'] = [
        1 if (value <= 1 or value >= 20) else 0
        for value in data['YearsAtCompany']
    ]
    data['YearsInCurrentRole_Less_2'] = [
        1 if value <= 2 else 0 for value in data['YearsInCurrentRole']
    ]
    data['YearsSinceLastPromotion_Less_1_or_More_6'] = [
        1 if (value < 1 or value >= 6) else 0
        for value in data['YearsSinceLastPromotion']
    ]
    data['YearsWithCurrManager_Less_1_or_More_11'] = [
        1 if (value < 1 or value >= 11) else 0
        for value in data['YearsWithCurrManager']
    ]
    data['Age_Less_22'] = [1 if value < 22 else 0 for value in data['Age']]
    data['BusinessTravel_Frequently'] = [
        1 if str(value) == 'Travel_Frequently' else 0
        for value in data['BusinessTravel']
    ]
    data['DistanceFromHome_Less_12_More_27'] = [
        1 if (value < 12 or value > 27) else 0
        for value in data['DistanceFromHome']
    ]
    data['WorkLifeBalance_1'] = [
        1 if value == 1 else 0 for value in data['WorkLifeBalance']
    ]
    data['TotalWorkingYears_Less_2'] = [
        1 if value <= 2 else 0 for value in data['TotalWorkingYears']
    ]
    data['EducationField_HR'] = [
        1 if str(value) == 'Human Resources' else 0
        for value in data['EducationField']
    ]
    data['MaritalStatus_Single'] = [
        1 if str(value) == 'Single' else 0 for value in data['MaritalStatus']
    ]
    data['EnvironmentSatisfaction_Less_1'] = [
        1 if value < 1.5 else 0 for value in data['EnvironmentSatisfaction']
    ]
    data['JobInvolvement_Less_1'] = [
        1 if value < 1.5 else 0 for value in data['JobInvolvement']
    ]
    data['NumCompaniesWorked_More_4'] = [
        1 if value > 4.5 else 0 for value in data['NumCompaniesWorked']
    ]
    data['JobSatisfaction_Less_1'] = [
        1 if value < 1.5 else 0 for value in data['JobSatisfaction']
    ]
    data['OverTime_Yes'] = [
        1 if str(value) == 'Yes' else 0 for value in data['OverTime']
    ]
    data['StockOptionLevel_Less_0'] = [
        1 if value < 1 else 0 for value in data['StockOptionLevel']
    ]
    data['TrainingTimesLastYear_Less_0'] = [
        1 if value < 1 else 0 for value in data['TrainingTimesLastYear']
    ]
    data['TotalWorkingYears_Less_1_More_40'] = [
        1 if value < 2 or value > 39 else 0
        for value in data['TotalWorkingYears']
    ]
    print("test_data shape:{}".format(data.shape))
    pre_data = data.drop(['user_id'], axis=1)
    for column in columns:
        pre_data[column] = le.fit_transform(pre_data[column])
    pre_data = ss.transform(pre_data)
    pre_data = pd.DataFrame(data=pre_data, columns=features.columns)
    y_pre = vote_class.predict_proba(pre_data)
    y_res = list(y_pre[:, 1])
    pre = pd.DataFrame({
        'user_id': data['user_id'],
        'Attrition': y_res
    })
    pre.to_csv('pre_vote_more_features.csv', index=False)