def fit(self, X_train, y_train):

        matrix = []
        ACC = []
        PRE = []
        REC = []
        F1 = []
        ROC_AUC = []
        logloss = []

        test_set_X = []
        test_set_Y = []

        k = 1

        if self.verbose:
            print(
                "Checking Cross Validation Score with Balanced Bagging: %d splits"
                % self.cv)

        for train_index, test_index in self.flod.split(X_train, y_train):
            x_ta = X_train.values[train_index]
            x_te = X_train.values[test_index]
            y_ta = y_train.values[train_index]
            y_te = y_train.values[test_index]

            sts = StandardScaler()
            clf = xgb.XGBClassifier(n_jobs=self.n_jobs)
            # 'ratio' was renamed to 'sampling_strategy' in imbalanced-learn 0.4
            # and removed in 0.6; use the current name
            usbc = BalancedBaggingClassifier(base_estimator=clf,
                                             n_jobs=self.n_jobs,
                                             n_estimators=self.n_estimators,
                                             sampling_strategy='not minority')
            pipe = make_pipeline(sts, usbc)

            pipe.fit(x_ta, y_ta)
            y_pred = pipe.predict(x_te)
            y_prob = pipe.predict_proba(x_te)

            matrix.append(confusion_matrix(y_te, y_pred))
            ACC.append(accuracy_score(y_te, y_pred))
            PRE.append(precision_score(y_te, y_pred))
            REC.append(recall_score(y_te, y_pred))
            F1.append(f1_score(y_te, y_pred))
            # ROC AUC should be computed from probabilities, not hard labels
            ROC_AUC.append(roc_auc_score(y_te, y_prob[:, 1]))
            logloss.append(log_loss(y_te, y_prob))

            # keep the held-out fold (the original appended the training
            # split here, contradicting the variable names)
            test_set_X.append(x_te)
            test_set_Y.append(y_te)

            if self.verbose:
                print("Done: %d, Totaling: %d" % (k, self.cv))

            k += 1

        self.Matrix = matrix
        self.acc_ = np.array(ACC)
        self.pre_ = np.array(PRE)
        self.rec_ = np.array(REC)
        self.f1_ = np.array(F1)
        self.roc_auc_ = np.array(ROC_AUC)
        self.logloss_ = np.array(logloss)

        self.X_set = test_set_X
        self.Y_set = test_set_Y
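
A minimal usage sketch for the method above. Hedged: `BalancedBaggingCV` and its constructor arguments are hypothetical stand-ins for the surrounding class, which must set the `cv`, `flod` (the CV splitter), `n_jobs`, `n_estimators`, and `verbose` attributes used inside `fit`.

model = BalancedBaggingCV(cv=5, n_estimators=10, n_jobs=-1, verbose=True)  # hypothetical wrapper
model.fit(X_train, y_train)  # X_train / y_train as pandas objects (.values is used inside)
print("ACC: %.3f +/- %.3f" % (model.acc_.mean(), model.acc_.std()))
print("AUC: %.3f +/- %.3f" % (model.roc_auc_.mean(), model.roc_auc_.std()))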
Example #2
def main():
    mlflow.start_run(run_name=NAME)

    if "X_train.pkl" not in os.listdir():
        print("procesando los datos")
        X, y, encoder = preprocess_data("TOTAL_TRAIN.csv", process_cat=False)
        print(X.shape)

        with open(f"label_encoder_{NAME}.pkl", "wb") as f:
            pickle.dump(encoder, f)
        print(
            f"##################### The shape of X is {X.shape} #######################"
        )
        y = y.astype("int")
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.15,
                                                            random_state=15,
                                                            stratify=y)
        with open("X_train.pkl", "wb") as f:
            pickle.dump(X_train, f)
        with open("X_test.pkl", "wb") as f:
            pickle.dump(X_test, f)
        with open("y_train.pkl", "wb") as f:
            pickle.dump(y_train, f)
        with open("y_test.pkl", "wb") as f:
            pickle.dump(y_test, f)

        print(X_train.shape)

    else:
        with open("X_train.pkl", "rb") as f:
            X_train = pickle.load(f)
        with open("X_test.pkl", "rb") as f:
            X_test = pickle.load(f)
        with open("y_train.pkl", "rb") as f:
            y_train = pickle.load(f)
        with open("y_test.pkl", "rb") as f:
            y_test = pickle.load(f)
        with open(f"label_encoder_XGB1704.pkl", "rb") as f:
            encoder = pickle.load(f)
        print("######### ajustando cat encoder ############")

    cols_cat = ["ruido", "CODIGO_POSTAL", "ZONA_METROPOLITANA", "CALIDAD_AIRE"]
    cols_float = [col for col in X_train.columns if col not in cols_cat]
    X_train[cols_float] = X_train[cols_float].astype("float")
    X_test[cols_float] = X_test[cols_float].astype("float")

    labs_names = [c for c in encoder.classes_]

    model = LGBMClassifier(
        class_weight="balanced",
        objective="multiclass",  # "multiclass:softmax" is XGBoost syntax; LightGBM uses "multiclass"
        n_jobs=-1,
        random_state=100,
        silent=True,
    )

    if MODE != "INDIVIDUAL":
        params = {
            "reg_alpha": (1e-3, 5.0, "log-uniform"),
            "reg_lambda": (1e-2, 50.0, "log-uniform"),
            "n_estimators": (600, 4500),
            "learning_rate": (5e-3, 1.0, "log-uniform"),
            "num_leaves": (20, 80),
            "boosting_type": ["gbdt", "goss"],
            "colsample_bytree": (0.1, 1.0, "uniform"),
            "subsample": (0.1, 1.0, "uniform"),
            "min_child_samples": (1, 25),
            "min_child_weight": (1e-6, 0.1, "log-uniform"),
        }

        print(params)

        cb = CatBoostEncoder(cols=cols_cat)
        X_train = cb.fit_transform(X_train, y_train)
        X_test = cb.transform(X_test)
        fit_params = {
            ### fit params ###
            "eval_set": [(X_test, y_test)],
            "eval_metric": lgb_f1_score,
            "early_stopping_rounds": 300,
        }

        pipeline = Pipeline(steps=[("clas_encoder",
                                    CatBoostEncoder(
                                        cols=cols_cat)), ("model", model)])

        best_model = BayesSearchCV(
            model,
            params,
            n_iter=N_ITER,
            n_points=1,
            cv=cv,
            scoring=f2_scorer,
            random_state=100,
            optimizer_kwargs={"n_initial_points": 10},
            fit_params=fit_params,
        )

    def on_step(optim_result):
        score = best_model.best_score_
        results = best_model.cv_results_
        try:
            results_df = pd.DataFrame(results)
            results_df.to_csv(f"results_{NAME}.csv", header=True, index=False)
            print(
                f"############ {results_df.shape[0]} trials so far #################"
            )
            print(f"current CV results: {results_df}")
        except Exception:
            print("Unable to convert cv results to pandas dataframe")
        mlflow.log_metric("best_score", score)
        with open(f"./best_{NAME}_params.pkl", "wb") as f:
            pickle.dump(best_model.best_params_, f)

        print("best score: %s" % score)
        if score >= 0.98:
            print("Interrupting!")
            return True

    print("ajustando modelo")
    if MODE != "INDIVIDUAL":
        print(X_train.dtypes)
        best_model.fit(X_train, y_train, callback=[on_step])
        with open(f"./best_{NAME}_model.pkl", "wb") as f:
            pickle.dump(best_model, f)
        preds = best_model.predict(X_test)
    else:
        if NAME not in os.listdir():
            os.mkdir(NAME)

        cat_encoder = CatBoostEncoder(cols=cols_cat)
        X_train = cat_encoder.fit_transform(X_train, y_train)
        X_test = cat_encoder.transform(X_test)
        best_model = BalancedBaggingClassifier(
            base_estimator=HistGradientBoostingClassifier(
                max_iter=3000,
                random_state=42,
                learning_rate=0.1,
                max_leaf_nodes=54,
                min_samples_leaf=2,
                scoring=f2_scorer,
                validation_fraction=0.1,
                n_iter_no_change=50,
            ),
            n_estimators=5,
            random_state=42,
            n_jobs=-1,
            max_features=0.7,
            sampling_strategy={5: int(dict(Counter(y_train))[5] * 0.11)},
        )
        best_model.fit(X_train, y_train)
        preds = best_model.predict(X_test)
        print(
            f'F1 SCORE IS {f1_score(y_test, preds, average="macro")}, precision is {precision_score(y_test, preds, average="macro")}, recall is {recall_score(y_test, preds, average="macro")}, accuracy is {accuracy_score(y_test, preds)}'
        )
        print(
            f"F2 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=2)}"
        )
        print(
            f"F05 SCORE IS {fbeta_score(y_test, preds, average='macro', beta=0.5)}"
        )
        cm = confusion_matrix(y_test, preds)
        grafico_conf_matrix = print_confusion_matrix(cm,
                                                     class_names=labs_names)
        grafico_conf_matrix.savefig(f"{NAME}/norm_NO_PIPELINE")

        with open(f"best_model_{NAME}.pkl", "wb") as f:
            pickle.dump(best_model, f)

    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })
    if MODE != "INDIVIDUAL":
        best_params = best_model.best_params_
        for param in best_params.keys():
            mlflow.log_param(param, best_params[param])
    cm = confusion_matrix(y_test, preds)
    grafico_conf_matrix = print_confusion_matrix(cm, class_names=labs_names)
    grafico_conf_matrix.savefig(NAME)
    grafico_norm = print_confusion_matrix(cm,
                                          class_names=labs_names,
                                          normalize=False)
    grafico_norm.savefig(f"{NAME}_no_norm")
    mlflow.end_run()
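
A standard entry-point guard (an assumption; the original listing may define it elsewhere) so the script runs `main()` when executed directly:

if __name__ == "__main__":
    main()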
def test_error():
    # Test that it gives proper exception on deficient input.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          })
    base = DecisionTreeClassifier()

    # Test n_estimators
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, n_estimators=1.5).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, n_estimators=-1).fit, X, y)

    # Test max_samples
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=-1).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=0.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=2.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples=1000).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_samples="foobar").fit, X,
                  y)

    # Test max_features
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=-1).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=0.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=2.0).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features=5).fit, X, y)
    assert_raises(ValueError,
                  BalancedBaggingClassifier(base, max_features="foobar").fit,
                  X, y)

    # Test support of decision_function
    assert not (hasattr(
        BalancedBaggingClassifier(base).fit(X, y), 'decision_function'))
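
`assert_raises` comes from the old nose-style sklearn test helpers, which have since been removed; on a current stack the same check is usually written with `pytest.raises`. A sketch of the equivalent for one case:

import pytest

def test_error_pytest_style():
    # X, y and base constructed as in test_error above
    X, y = make_imbalance(iris.data, iris.target,
                          sampling_strategy={0: 20, 1: 25, 2: 50})
    base = DecisionTreeClassifier()
    with pytest.raises(ValueError):
        BalancedBaggingClassifier(base, n_estimators=-1).fit(X, y)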
Example #4
DT = DecisionTreeClassifier(min_samples_split=2,
                            max_depth=None,
                            max_features=None,
                            random_state=1)
SVM = SVC(kernel='rbf', probability=True, random_state=seed1)
ADA = AdaBoostClassifier(n_estimators=50, random_state=seed1)
XGB = XGBClassifier(booster='gbtree',
                    n_estimators=200,
                    max_depth=2,  # 'depth_range' is not an XGBoost parameter
                    min_child_weight=2,
                    random_state=seed1,
                    early_stopping_rounds=10)

UBDT = BalancedBaggingClassifier(base_estimator=DT,
                                 n_estimators=100,
                                 sampling_strategy=0.9,
                                 max_samples=1.0,
                                 random_state=seed1,
                                 n_jobs=-1)
UBSVM = BalancedBaggingClassifier(base_estimator=SVM,
                                  n_estimators=20,
                                  sampling_strategy=0.6,
                                  max_samples=1.0,
                                  random_state=seed1,
                                  n_jobs=-1)

EASYADA = EasyEnsembleClassifier(base_estimator=ADA,
                                 sampling_strategy=0.7,
                                 n_estimators=50)
EASYXGB = EasyEnsembleClassifier(base_estimator=XGB,
                                 sampling_strategy=0.5,
                                 n_estimators=20)
# %% [markdown]
# The performance with the
# :class:`~imblearn.ensemble.BalancedRandomForestClassifier` is better than
# applying a single random under-sampling. We will use a gradient-boosting
# classifier within a :class:`~imblearn.ensemble.BalancedBaggingClassifier`.

from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

bag_clf = make_pipeline(
    preprocessor_tree,
    BalancedBaggingClassifier(
        base_estimator=HistGradientBoostingClassifier(random_state=42),
        n_estimators=10,
        random_state=42,
        n_jobs=2,
    ),
)

index += ["Balanced bag of histogram gradient boosting"]
cv_result = cross_validate(bag_clf, df_res, y_res, scoring=scoring)
scores["Accuracy"].append(cv_result["test_accuracy"].mean())
scores["Balanced accuracy"].append(cv_result["test_balanced_accuracy"].mean())

df_scores = pd.DataFrame(scores, index=index)
df_scores

# %% [markdown]
# This last approach is the most effective. The different under-sampling
# brings some diversity for the different GBDT models to learn from, rather
# than focusing on a single portion of the majority class.
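
# %% [markdown]
# A quick way to see that diversity (a sketch added here, not part of the
# original example): fit the bagging pipeline once and count the base
# learners, each trained on a differently under-sampled bootstrap.

# %%
bag_clf.fit(df_res, y_res)
print(len(bag_clf[-1].estimators_))  # 10, one GBDT per balanced bootstrap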
Example #6
def kFoldTest(x, y, sampler, classifier, k=10, show_info=False):
    """
    k折交叉验证(该函数会打乱数据顺序,无需手动再打乱)

    :param x:样本
    :param y:标签
    :param sampler:采样器
    :param classifier:分类器
    :param k:交叉验证折数
    """
    if show_info:
        print("-" * 60)
        print("%s-%s" % (sampler, classifier))

    # record the evaluation results
    val_history = {}
    val_history["val_acc"] = []
    val_history["val_precision"] = []
    val_history["val_recall"] = []
    val_history["val_f1"] = []
    val_history["auc_value"] = []
    val_history["val_gmean"] = []
    val_history["bAcc"] = []

    # k-fold cross-validation
    kf = KFold(n_splits=k, shuffle=True)  # shuffle the data
    cur_k = 0
    for train_index, val_index in kf.split(x, y):
        # split the data
        cur_k += 1  # index of the current fold
        x_train, y_train = x[train_index], y[train_index]
        x_val, y_val = x[val_index], y[val_index]
        if sampler != "" and show_info:
            print("before sampling: %d/%d=%.2f" % (len(y_train[y_train == 1]), len(y_train[y_train == 0]),
                                                   (len(y_train[y_train == 1]) / len(y_train[y_train == 0]))))

        # sampler
        if sampler in ("DBU", "DUS"):
            x_train, y_train = DBUSampler(show_info=True).fit_resample(x_train, y_train)  # resample
        elif sampler == "RUS":
            # replacement=False means sampling without replacement
            x_train, y_train = RandomUnderSampler(replacement=False).fit_resample(x_train, y_train)  # resample
        elif sampler == "SMOTE":
            x_train, y_train = SMOTE().fit_resample(x_train, y_train)
        if sampler != "" and show_info:
            print("采样后:%d/%d=%.2f" % (len(y_train[y_train == 1]), len(y_train[y_train == 0]),
                                      (len(y_train[y_train == 1]) / len(y_train[y_train == 0]))))
        if show_info:
            print("[k = %d]" % cur_k)
            print("训练 正样本:%d 负样本:%d IR = %.2f" % (len(y_train[y_train == 1]), len(y_train[y_train == 0]),
                                                  len(y_train[y_train == 1]) / len(y_train[y_train == 0])))

        # classifier
        if classifier.lower() == "knn":
            clf = KNeighborsClassifier()
        elif classifier.lower() == "dt":
            clf = DecisionTreeClassifier()
        elif classifier.lower() == "svc":
            # probability=True enables probability estimates
            clf = SVC(probability=True)
        elif classifier in ("RandomForestClassifier", "RFC", "RandomForest"):
            clf = RandomForestClassifier(n_estimators=3)
        elif classifier == "BaggingClassifier":
            clf = BaggingClassifier(base_estimator=KNeighborsClassifier(), bootstrap=True)
        elif classifier in ("AdaBoostClassifier", "AdaBoost"):
            clf = AdaBoostClassifier(n_estimators=3)
        elif classifier in ("EasyEnsembleClassifier", "EasyEnsemble"):
            clf = EasyEnsembleClassifier(n_estimators=3)
        elif classifier in ("BalancedBaggingClassifier", "BalancedBagging"):
            clf = BalancedBaggingClassifier(n_estimators=5)
        elif classifier == "AdaSamplingBaggingClassifier":
            clf = AdaSamplingBaggingClassifier(15)

        # train
        clf.fit(x_train, y_train)

        # evaluate
        if show_info:
            print("test positives:%d negatives:%d IR = %.2f" % (
                len(y_val[y_val == 1]), len(y_val[y_val == 0]), len(y_val[y_val == 1]) / len(y_val[y_val == 0])))
        y_proba = clf.predict_proba(x_val)
        y_pred = np.argmax(y_proba, axis=1)

        # evaluate on the validation fold
        val_acc = metrics.accuracy_score(y_val, y_pred)
        val_precision = metrics.precision_score(y_val, y_pred)
        val_recall = metrics.recall_score(y_val, y_pred)
        val_f1 = metrics.f1_score(y_val, y_pred)
        auc_value = metrics.roc_auc_score(y_val, y_proba[:, 1])
        val_gmean = mymetrics.gmean(y_val, y_pred)
        val_bAcc = metrics.balanced_accuracy_score(y_val, y_pred)

        # store the evaluation results
        val_history["val_acc"].append(val_acc)
        val_history["val_precision"].append(val_precision)
        val_history["val_recall"].append(val_recall)
        val_history["val_f1"].append(val_f1)
        val_history["auc_value"].append(auc_value)
        val_history["val_gmean"].append(val_gmean)
        val_history['bAcc'].append(val_bAcc)


        # print the per-fold evaluation results
        if show_info:
            print("val_acc:%.2f val_precision:%.2f val_recall:%.2f val_f1:%.2f auc_value:%.2f val_gmean:%.2f" %
                  (val_acc, val_precision, val_recall, val_f1, auc_value, val_gmean))

    # aggregate: mean and standard deviation
    header = ""
    value = ""
    for key in val_history.keys():  # 'key' avoids shadowing the parameter k
        header += "%-20s" % key
        value += "%-20s" % ("%.4f ±%.4f" % (np.mean(val_history[key]), np.std(val_history[key])))
    if show_info:
        print("%s-%s 平均数据" % (sampler, classifier))
        print(header)
        print(value)

    # print the key outputs in a form that is easy to paste into Markdown
    if sampler != "":
        model_name = "%s-%s" % (sampler, classifier)
    else:
        model_name = classifier

    all_data = "|%-20s" % model_name
    key_data = "|%-20s" % model_name
    for key in val_history.keys():
        t = "|%-20s" % ("%.4f ±%.4f" % (np.mean(val_history[key]), np.std(val_history[key])))
        all_data += t
        if key in ("val_f1", "auc_value", "val_gmean", "bAcc"):
            key_data += t

    if show_info:
        print(all_data)
        print(key_data)
        print("-" * 60)

    return all_data, key_data
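
A usage sketch (assuming `x` and `y` are NumPy arrays with binary 0/1 labels, since the function indexes them positionally and computes the positive/negative ratio):

all_data, key_data = kFoldTest(x, y, sampler="SMOTE",
                               classifier="BalancedBagging", k=10, show_info=True)
print(key_data)  # Markdown-ready row with f1 / AUC / G-mean / balanced accuracy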
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree,
                      classes=np.unique(satimage.target),
                      ax=ax,
                      title="Decision tree")

###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision
# trees can actually alleviate the issue induced by the class imbalance. First,
# we will use a bagging classifier and its counterpart, which internally uses a
# random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.

print("Bagging classifier performance:")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_bc):.2f} - "
      f"Geometric mean {geometric_mean_score(y_test, y_pred_bc):.2f}")
cm_bagging = confusion_matrix(y_test, y_pred_bc)
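
The listing truncates before the counterpart report; for symmetry, a sketch of the same metrics for the balanced bagging classifier (same variables as above):

print("Balanced Bagging classifier performance:")
print(f"Balanced accuracy: {balanced_accuracy_score(y_test, y_pred_bbc):.2f} - "
      f"Geometric mean {geometric_mean_score(y_test, y_pred_bbc):.2f}")
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)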
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
gbm_params2 = {'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
                'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators':[50,100,500,1000,1500], 'min_samples_leaf':[5,10,15]}
rf = GradientBoostingClassifier()
grid = GridSearchCV(rf, gbm_params2, refit=True, verbose=2)  # 'param_grid' was undefined
grid.fit(X_res,y_res)
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))

print(classification_report(y_test,grid_predictions))
from sklearn.metrics import accuracy_score
print( accuracy_score(y_test, grid_predictions) )
print( grid.best_params_)

# # BalancedBaggingClassifier

# In[ ]:


from imblearn.ensemble import BalancedBaggingClassifier 
bbc = BalancedBaggingClassifier(random_state=42)
bbc.fit(X_train, y_train)
predictions = bbc.predict(X_test)
print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))
from sklearn.metrics import accuracy_score
print( accuracy_score(y_test, predictions) )
print("After OverSampling, the shape of Y_train: {}".format(
    Y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(
    sum(Y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(
    sum(Y_train_res == 0)))
#-----------------------------------------------------------

#---------------------------------TRAINING THE MODELS -------------------------------
seed = 7
scoring = 'accuracy'
models = []
models.append(('LR', LogisticRegression()))
models.append(('NB', GaussianNB()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('BB', BalancedBaggingClassifier()))
results = []
names = []
msg = ''
for name, model in models:

    cv_results = model_selection.cross_val_score(model,
                                                 X_train_res,
                                                 Y_train_res,
                                                 cv=10,
                                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg += "%s--> %f \n " % (name, cv_results.mean() * 100)

popupmsg(msg)
def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    oob_score=True)
    assert_raises(ValueError, clf.fit, X, y)
models = [
    DecisionTreeClassifier(random_state=r),
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    LogisticRegression(random_state=r),
    SVC(random_state=r, kernel='sigmoid'),
    MLPClassifier(random_state=r),
    BaggingClassifier(random_state=r),
    RandomForestClassifier(random_state=r),
    GradientBoostingClassifier(random_state=r),
    LGBMClassifier(),
    XGBClassifier(random_state=r),
    CatBoostClassifier(random_state=r, verbose=False),
    BalancedBaggingClassifier(random_state=r),
    BalancedRandomForestClassifier(random_state=r),
    RUSBoostClassifier(random_state=r)
]
names = [
    "DecisionTree", "KNeighbors", "GaussianNB", "MultinomialNB",
    "LogisticRegression", "SVC", "MLPClassifier", "Ensemble-Bagging",
    "Ensemble-RandomForest", "Ensemble-GradientBoosting",
    "LightGradientBoosting", "XGBoost", "CatBoost", "BalancedBagging",
    "BalancedRandomForest", "RUSBoost"
]

outputs = {}

for name, model in zip(names, models):
    model.fit(x_train, y_train)
Example #12
spaceBalance = spaceEasy

with Timer('BalancedEnsemble, Search') as t:
    # Set algorithm parameters
    bestBalance = fmin(fn=objectiveBalance,
                       space=spaceBalance,
                       algo=tpe.suggest,
                       max_evals=5)

    # Print best parameters
    bestBalance_params = space_eval(spaceBalance, bestBalance)

clf = BalancedBaggingClassifier(**bestBalance_params,
                                random_state=0,
                                n_estimators=300,
                                n_jobs=-1,
                                verbose=0)

clf.fit(X_train, y_train)

# training roc
balance_y_train_pred = clf.predict_proba(X_train)[:, 1]
plotROC(y_train, balance_y_train_pred, 'BalancedEnsemble-Train')
# test roc
balance_y_test_pred = clf.predict_proba(X_test)[:, 1]
plotROC(y_test, balance_y_test_pred, 'BalancedEnsemble-Test')

with Timer('BalancedEnsemble, Train') as t:
    # train with all data
    clf.fit(X, y.values.ravel())
Example #13
    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle imbalanced data, then search for the best params
        @param {type}
        imbalance_method: three options: under_sampling (ClusterCentroids),
                          over_sampling (SMOTE), ensemble (BalancedBaggingClassifier)
        search_method: two options: grid or bayesian optimization
        @return: None
        '''
        logger.info("get all freature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE to deal with unbalanced data")

            # 1. handle class imbalance with over-sampling; only the training
            #    set is resampled (the original also resampled the test set,
            #    which leaks information and distorts evaluation)
            print(self.y_train)
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)
            model_name = 'lgb_over_sampling'

        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids to deal with unbalanced data")

            # 1. handle class imbalance with under-sampling; again, only the
            #    training set is resampled
            print(self.X_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)
            model_name = 'lgb_under_sampling'

        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)

        # 1. predict labels for the test set
        # 2. predict labels for the training set
        # 3. compute precision, accuracy, recall, f1_score

        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # training-set accuracy
        logger.info('Train accuracy %s' % per)
        # test-set accuracy
        logger.info('test accuracy %s' % acc)
        # recall
        logger.info('test recall %s' % recall)
        # F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)
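
A usage sketch (hedged: `Trainer` is a hypothetical name for the class that owns this method; it must also implement `feature_engineer`, `param_search`, and `save`):

trainer = Trainer()  # hypothetical owner class
trainer.unbalance_helper(imbalance_method='ensemble', search_method='grid')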
def model(boosting_name, data_name, classifier_name, cv_name, mode):
    """
    Template method.
    :param boosting_name: ensemble method to use
    :param data_name: dataset name
    :param classifier_name: base classifier to use
    :param cv_name: cross-validation mode
    :param mode: sampling mode
    :return:
    """
    # load the data
    if data_name in fetch_datasets().keys():
        dataset = fetch_datasets()[data_name]
        X = dataset.data
        y = dataset.target
        print(Counter(y))
    else:
        # load custom data
        df = pd.read_csv('../imbalanced_data/%s.csv' % data_name, header=None)
        array = df.values.astype(float)
        X = array[:, 0:array.shape[1] - 1]
        y = array[:, -1]
        print(Counter(y))
    base = None
    if classifier_name == 'CART':
        base = tree.DecisionTreeClassifier(max_depth=8,
                                           random_state=42,
                                           min_samples_split=10)
    elif classifier_name == 'svm':
        base = svm.SVC()
    else:
        pass
    # start time
    start_time = time.time()
    cv = None
    if cv_name == 'StratifiedKFold':
        cv = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    elif cv_name == 'RepeatedStratifiedKFold':
        cv = RepeatedStratifiedKFold(n_repeats=10,
                                     n_splits=10,
                                     random_state=42)
    else:
        pass
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)  # interpolation grid (so every fold shares the same fpr points)
    aucs = []
    for train, test in cv.split(X, y):
        # preprocessing
        scaler = preprocessing.MinMaxScaler().fit(X[train])
        X_train_minmax = scaler.transform(X[train])
        X_test_minmax = scaler.transform(X[test])
        classifier = None
        if boosting_name == 'CART':
            classifier = base
        elif boosting_name == 'Bagging':
            classifier = BaggingClassifier(base_estimator=base,
                                           n_estimators=40)
        elif boosting_name == 'BalancedBagging':
            # 'ratio' is the pre-0.6 imbalanced-learn name; current releases
            # use 'sampling_strategy'
            classifier = BalancedBaggingClassifier(base_estimator=base,
                                                   sampling_strategy='auto',
                                                   replacement=True,
                                                   random_state=42)
        elif boosting_name == 'Adaboost':
            classifier = AdaBoostClassifier(base_estimator=base,
                                            n_estimators=40)
        elif boosting_name == 'Random Forest':
            classifier = RandomForestClassifier(max_depth=8,
                                                min_samples_split=10,
                                                n_estimators=40,
                                                random_state=42)
        elif boosting_name == 'EasyEnsemble':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax,
                        y[test])
            continue
        elif boosting_name == 'BalanceCascade':
            model_under(boosting_name, X_train_minmax, y[train], X_test_minmax,
                        y[test])
            continue
        elif boosting_name == 'SMOTEBoost':
            classifier = SMOTEBoost(rate=100,
                                    n_estimators=40,
                                    weak_estimator=base,
                                    random_state=42,
                                    class_dist=False)
        elif boosting_name == 'RUSBoost':
            classifier = RUSBoost(ratio=50,
                                  n_estimators=40,
                                  weak_estimator=base,
                                  random_state=42,
                                  class_dist=False)
        else:
            pass
        classifier.fit(X_train_minmax, y[train])  # fit (resampling happens inside the ensemble where applicable)
        predict = classifier.predict(X_test_minmax)
        probability = classifier.predict_proba(X_test_minmax)[:, 1]
        # compute metrics
        precision = metrics.precision_score(y[test], predict)
        recall = metrics.recall_score(y[test], predict)
        if precision == 0:
            f1 = 0
        else:
            f1 = 2 * (precision * recall) / (precision + recall)
        auc = metrics.roc_auc_score(y[test], probability)
        gmean = geometric_mean_score(y[test], predict)
        accuracy = metrics.accuracy_score(y[test], predict)
        # ------------- step 6: compute this fold's points on the ROC and PR curves -------------
        fpr, tpr, thresholds = metrics.roc_curve(y[test], probability)
        # interpolate mean_tpr at the mean_fpr grid via scipy's interp()
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0  # force the averaged curve to start at (0, 0)
        roc_auc = metrics.auc(fpr, tpr)
        aucs.append(roc_auc)
        # write2dic
        fill_dic('precision', boosting_name, precision)
        fill_dic('recall', boosting_name, recall)
        fill_dic('f1', boosting_name, f1)
        fill_dic('auc', boosting_name, auc)
        fill_dic('gmean', boosting_name, gmean)

    if boosting_name != 'EasyEnsemble' and boosting_name != 'BalanceCascade':
        # write fpr and tpr to a file
        # each of the 100 mean_fpr points was interpolated once per fold; average them
        mean_tpr /= cv.get_n_splits()
        # the last point of the curve is (1, 1)
        mean_tpr[-1] = 1.0
        # compute the mean AUC
        mean_auc = metrics.auc(mean_fpr, mean_tpr)

        # concatenate the mean fpr and tpr and store them in a file
        filename = './ROC/{data_name}/{mode}/{base_classifier}/{sampler}.csv'. \
            format(data_name=data_name, mode=mode, base_classifier=classifier_name, sampler=boosting_name)
        # split out the directory part of the path
        file_dir = os.path.split(filename)[0]
        # create the (possibly nested) directories if the path does not exist
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        # # then check whether the file itself exists and create it if not
        # if not os.path.exists(filename):
        #     os.system(r'touch %s' % filename)
        # stack the results together
        all = np.c_[mean_fpr, mean_tpr]
        np.savetxt(filename, all, delimiter=',', fmt='%f')

    print('%s run took %fs!' %
          (boosting_name, time.time() - start_time))
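
A usage sketch of the template (argument values are illustrative only; 'ecoli' is one of the datasets shipped with imblearn's fetch_datasets):

model(boosting_name='BalancedBagging',
      data_name='ecoli',
      classifier_name='CART',
      cv_name='StratifiedKFold',
      mode='under_sampling')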
def get_estimator(scorer_type, save_folder=None):
    #clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)

    if scorer_type == 'voting_mlps_hard' or scorer_type == 'featMLP':
        import sys
        seed = np.random.randint(1, sys.maxsize)
        mlp1 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp2 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp3 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp4 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp5 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)

        clf = VotingClassifier(
            estimators=[  # ('gb', gb),
                # ('mlp', mlp),
                ('mlp1', mlp1),
                ('mlp2', mlp2),
                ('mlp3', mlp3),
                ('mlp4', mlp4),
                ('mlp5', mlp5),
            ],
            n_jobs=1,
            voting='hard')

    if scorer_type == 'MLP_base':
        clf = MultiThreadingFeedForwardMLP(n_classes=4,
                                           batch_size=200,
                                           hm_epochs=30,
                                           keep_prob_const=1.0,
                                           optimizer='adam',
                                           learning_rate=0.001,
                                           step_decay_LR=True,
                                           weight_init='sqrt_n',
                                           bias_init=0.01,
                                           hidden_layers=(600, 600, 600),
                                           activation_function='relu',
                                           save_folder=save_folder,
                                           seed=12345)

    if scorer_type == 'MLP_base_1':
        clf = MultiThreadingFeedForwardMLP(n_classes=4,
                                           batch_size=188,
                                           hm_epochs=70,
                                           keep_prob_const=1.0,
                                           optimizer='adam',
                                           learning_rate=0.001,
                                           step_decay_LR=True,
                                           weight_init='sqrt_n',
                                           bias_init=0.001,
                                           hidden_layers=(362, 942, 1071, 870,
                                                          318, 912, 247),
                                           activation_function='relu',
                                           save_folder=save_folder,
                                           seed=12345)

    if scorer_type == 'MLP_base_2':
        clf = MultiThreadingFeedForwardMLP(n_classes=3,
                                           batch_size=188,
                                           hm_epochs=70,
                                           keep_prob_const=1.0,
                                           optimizer='adam',
                                           learning_rate=0.001,
                                           step_decay_LR=True,
                                           weight_init='sqrt_n',
                                           bias_init=0.001,
                                           hidden_layers=(362, 942, 1071, 870,
                                                          318, 912, 247),
                                           activation_function='relu',
                                           save_folder=save_folder,
                                           seed=12345)

    if scorer_type == 'riedel':
        clf = riedel_mlp(save_folder=save_folder)

    #taken from original implementation
    if scorer_type == 'svm2':
        clf = svm.SVC(kernel='linear',
                      C=1.0,
                      probability=True,
                      class_weight='balanced')

    if scorer_type == 'grad_boost':
        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)

    if scorer_type == 'svm1':
        clf = svm.SVC(gamma=0.001, C=100., verbose=True)

    if scorer_type == 'logistic_regression':
        clf = logistic.LogisticRegression()

    if scorer_type == 'svm3':
        clf = svm.SVC(kernel='poly',
                      C=1.0,
                      probability=True,
                      class_weight='balanced')  # 'unbalanced' is not a valid sklearn value

    if scorer_type == "bayes":
        clf = naive_bayes.GaussianNB()

    if scorer_type == 'voting_hard_mlps_svm_gradboost':
        import sys
        seed = np.random.randint(1, sys.maxsize)
        mlp1 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp2 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp3 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp4 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)
        seed = np.random.randint(1, sys.maxsize)
        mlp5 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)

        svm1 = svm.SVC(gamma=0.001, C=100., verbose=True)
        gradboost = GradientBoostingClassifier(n_estimators=200,
                                               random_state=14128,
                                               verbose=True)

        clf = VotingClassifier(
            estimators=[  # ('gb', gb),
                # ('mlp', mlp),
                ('mlp1', mlp1),  # estimator names must be unique
                ('mlp2', mlp2),
                ('mlp3', mlp3),
                ('mlp4', mlp4),
                ('mlp5', mlp5),
                ('svm', svm1),
                ('grad_boost', gradboost)
            ],
            n_jobs=1,
            voting='hard')

    if scorer_type == 'voting_hard_svm_gradboost_logistic':
        svm2 = svm.SVC(kernel='linear',
                       C=1.0,
                       probability=True,
                       class_weight='balanced',
                       verbose=True)
        log_reg = logistic.LogisticRegression()
        gradboost = GradientBoostingClassifier(n_estimators=200,
                                               random_state=14128,
                                               verbose=True)

        clf = VotingClassifier(
            estimators=[  # ('gb', gb),
                ('svm', svm2), ('grad_boost', gradboost),
                ('logisitc_regression', log_reg)
            ],
            n_jobs=1,
            voting='hard')

    if scorer_type == 'voting_soft_svm_gradboost_logistic':
        svm2 = svm.SVC(kernel='linear',
                       C=1.0,
                       probability=True,
                       class_weight='balanced',
                       verbose=True)
        log_reg = logistic.LogisticRegression()
        gradboost = GradientBoostingClassifier(n_estimators=200,
                                               random_state=14128,
                                               verbose=True)

        clf = VotingClassifier(
            estimators=[  # ('gb', gb),
                ('svm', svm2), ('grad_boost', gradboost),
                ('logisitc_regression', log_reg)
            ],
            n_jobs=1,
            voting='soft')

    if scorer_type == 'voting_hard_mlp_riedel':
        import sys
        seed = np.random.randint(1, sys.maxsize)
        mlp1 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)

        seed = np.random.randint(1, sys.maxsize)
        mlp2 = MultiThreadingFeedForwardMLP(n_classes=4,
                                            batch_size=188,
                                            hm_epochs=70,
                                            keep_prob_const=1.0,
                                            optimizer='adam',
                                            learning_rate=0.001,
                                            step_decay_LR=True,
                                            weight_init='sqrt_n',
                                            bias_init=0.001,
                                            hidden_layers=(362, 942, 1071, 870,
                                                           318, 912, 247),
                                            activation_function='relu',
                                            save_folder=save_folder,
                                            seed=seed)

        riedel1 = riedel_mlp(save_folder=save_folder + "1/")
        riedel2 = riedel_mlp(save_folder=save_folder + "2/")

        clf = VotingClassifier(estimators=[('mlp1', mlp1), ('riedel1', riedel1),
                                           ('mlp2', mlp2), ('riedel2', riedel2)],
                               n_jobs=1,
                               voting='hard')

    if scorer_type == 'voting_hard_riedel':
        riedel1 = riedel_mlp(save_folder=save_folder + "1/")
        riedel2 = riedel_mlp(save_folder=save_folder + "2/")
        riedel3 = riedel_mlp(save_folder=save_folder + "3/")
        riedel4 = riedel_mlp(save_folder=save_folder + "4/")
        riedel5 = riedel_mlp(save_folder=save_folder + "5/")

        clf = VotingClassifier(estimators=[
            ('riedel1', riedel1),
            ('riedel2', riedel2),
            ('riedel3', riedel3),
            ('riedel4', riedel4),
            ('riedel5', riedel5),
        ],
                               n_jobs=1,
                               voting='hard')

    # Taken from Benjamin's LSTM
    # I pass a random seed through the get_estimator() function => set fixed/random/anything
    # both models need around 5.2 GB of GPU memory, so adjust gpu_memory_fraction accordingly
    if scorer_type == 'single_f_ext_LSTM_att_no_cw':
        import sys
        seed = np.random.randint(1, sys.maxsize)
        #if features != None and isinstance(features, list):
        #    clf = single_f_ext_LSTM_att(epochs=100, batch_size=128, param_dict=features[0], lr=0.001, optimizer="adam", seed=seed, min_epoch=150, use_class_weights=False, gpu_memory_fraction=0.3)
        clf = single_f_ext_LSTM_att(epochs=100,
                                    batch_size=128,
                                    param_dict="single_flat_LSTM_50d_100",
                                    lr=0.001,
                                    optimizer="adam",
                                    seed=seed,
                                    min_epoch=150,
                                    use_class_weights=False,
                                    gpu_memory_fraction=0.3,
                                    save_folder=save_folder)

    if scorer_type == 'single_f_ext_LSTM_no_cw' or scorer_type == 'stackLSTM':
        import sys
        seed = np.random.randint(1, sys.maxsize)
        #if features != None and isinstance(features, list):
        #    clf = single_f_ext_LSTM(epochs=100, batch_size=128, param_dict=features[0], lr=0.001, optimizer="adam", seed=seed, min_epoch=150, use_class_weights=False, gpu_memory_fraction=0.3)
        clf = single_f_ext_LSTM(epochs=100,
                                batch_size=128,
                                param_dict="single_flat_LSTM_50d_100",
                                lr=0.001,
                                optimizer="adam",
                                seed=seed,
                                min_epoch=150,
                                use_class_weights=False,
                                gpu_memory_fraction=0.3,
                                save_folder=save_folder)

    if scorer_type == 'sBalancedBagging':
        from sklearn.tree import DecisionTreeClassifier
        from imblearn.ensemble import BalancedBaggingClassifier
        clf = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            sampling_strategy='auto',  # 'ratio' in pre-0.6 imbalanced-learn
            replacement=False,
            random_state=0)

    return clf
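
A usage sketch (hedged: `X_train`, `y_train`, and `X_test` stand for whatever feature matrices the caller has prepared; the string key selects one of the branches above):

clf = get_estimator('sBalancedBagging')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)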
testing_labels = [element[2] for element in test_set_reduced]
testing_labels = np.array(testing_labels)

#%%
# build models
# initialize basic SVM
SVC = svm.LinearSVC()
# initialize basic Decision tree
DT = DecisionTreeClassifier()
# initialize basic Random forest
RF = RandomForestClassifier()
# initialize Extreme Gradient Boosting Classifier
XGB = XGBClassifier()
# initialize Balanced Bagging Classifier
BB = BalancedBaggingClassifier(
    base_estimator=RandomForestClassifier(criterion='entropy'),
    n_estimators=5,
    bootstrap=True)
# train
SVC.fit(training_features, training_labels)
DT.fit(training_features, training_labels)
RF.fit(training_features, training_labels)
XGB.fit(training_features, training_labels)
BB.fit(training_features, training_labels)
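A brief evaluation sketch for the five fitted models; testing_features is assumed to be built the same way as testing_labels above:

# Sketch: score each fitted model on the held-out set (metrics suited to imbalance).
from sklearn.metrics import balanced_accuracy_score, f1_score

for name, model in [('LinearSVC', SVC), ('DecisionTree', DT),
                    ('RandomForest', RF), ('XGBoost', XGB),
                    ('BalancedBagging', BB)]:
    preds = model.predict(testing_features)
    print('%s  balanced acc: %.3f  f1: %.3f' %
          (name, balanced_accuracy_score(testing_labels, preds),
           f1_score(testing_labels, preds)))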

# In[22]:


#%%
#model evaluation
def prepare_kfold_cv_data(k, X, y):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)  # random_state only takes effect with shuffle=True
Example #17
from imblearn.ensemble import BalancedBaggingClassifier
import joblib  # sklearn.externals.joblib was removed in modern scikit-learn
import pandas as pd

if __name__ == "__main__":
    print("Loading data")
    data = pd.concat([
        pd.read_csv("/data/SO_data/downvoter/wv_train_processed_data.csv"),
        pd.read_csv("/data/SO_data/downvoter/wv_val_processed_data.csv")
    ])
    body_data = joblib.load("./final/vectorized_data/body_data.pkl")
    title_data = joblib.load("./final/vectorized_data/title_data.pkl")

    body_model = BalancedBaggingClassifier(n_estimators=100,
                                           n_jobs=-1,
                                           sampling_strategy="not minority")

    title_model = BalancedBaggingClassifier(n_estimators=100,
                                            n_jobs=-1,
                                            sampling_strategy="not minority")

    labels = data.score < 0
    print("Fitting body model")
    body_model.fit(body_data, labels)
    print("Fitting title model")
    title_model.fit(title_data, labels)

    joblib.dump(body_model, "./final/body_model.pkl")
    joblib.dump(title_model, "./final/title_model.pkl")
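A small inference sketch for the persisted models; new_body_data is an assumed matrix vectorized exactly as at training time:

# Sketch: reload the dumped models and score new vectorized posts.
import joblib

body_model = joblib.load("./final/body_model.pkl")
title_model = joblib.load("./final/title_model.pkl")
downvote_prob = body_model.predict_proba(new_body_data)[:, 1]  # P(score < 0)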
def nosampling_pipeline(data=None, verbose=False, clean=False, plot=False):

    results_table = []
    results = []
    rand_state = 42

    if clean:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X_vals = X.values
        y_vals = y.values
        X_inliers, y_inliers = reject_sampler.fit_resample(X_vals, y_vals)  # reject_sampler: outlier-removal resampler defined elsewhere
        X = X_inliers
        y = y_inliers
    else:
        X = data.drop('Class', axis=1)
        y = data['Class']
        X = X.values
        y = y.values

    sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=rand_state)  # random_state requires shuffle=True
    print("StratKFold:", sss)

    #List of models to be used
    models = [
        DecisionTreeClassifier(random_state=rand_state),
        RUSBoostClassifier(random_state=rand_state),
        LogisticRegression(random_state=rand_state),
        BalancedBaggingClassifier(random_state=rand_state),
        RandomForestClassifier(random_state=rand_state),
        EasyEnsembleClassifier(
            base_estimator=RandomForestClassifier(random_state=rand_state),
            random_state=rand_state),
        BalancedRandomForestClassifier(random_state=rand_state)
    ]

    results_table = pd.DataFrame(columns=['classifiers', 'fpr', 'tpr', 'auc_score'])
    # Create training and testing data sets depending on whether or not they have been generated previously.
    # Instantiate lists to store each of the models' results
    classifier = []
    strategy = []
    samp_technique = []
    accuracy = []
    f1 = []
    auc = []
    recall = []
    precision = []
    g_mean = []
    start = time.time()
    # Run through each of the models to get their performance metrics

    sampling_strat = 'no_sampling'

    for train_index, test_index in sss.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    # NOTE: only the final fold's split survives this loop; the models below
    # are trained and evaluated on that last train/test partition.

    for model in models:
        print(
            "Using length of X for training: {}; Using length of y for training: {}"
            .format(len(X_train), len(y_train)))
        print(
            "Using length of X for testing: {}; Using length of y for testing: {}"
            .format(len(X_test), len(y_test)))

        print("Currently training model - {} using sampling strategy - {}".
              format(model.__class__.__name__, sampling_strat))
        print("--" * 20)

        clf = model

        pipe = make_pipeline(clf)
        pipe.fit(X_train, y_train)

        test_preds = pipe.predict(X_test)
        #yproba = pipe.predict_proba(X_test)[::,1]

        classifier.append(model.__class__.__name__)
        samp_technique.append(sampling_strat)
        strategy.append(" %s+%s " %
                        (str(model.__class__.__name__), sampling_strat))

        f1.append(f1_score(y_test, test_preds))
        accuracy.append(accuracy_score(y_test, test_preds))
        auc.append(roc_auc_score(y_test, test_preds))
        recall.append(recall_score(y_test, test_preds))
        precision.append(precision_score(y_test, test_preds))
        g_mean.append(
            geometric_mean_score(y_test, test_preds, average='binary'))

        fpr, tpr, _ = roc_curve(y_test, test_preds)
        auc_score = roc_auc_score(y_test, test_preds)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
        results_table = pd.concat([
            results_table,
            pd.DataFrame([{
                'classifiers': model.__class__.__name__,
                'fpr': fpr,
                'tpr': tpr,
                'auc_score': auc_score
            }])
        ], ignore_index=True)

        #Print the model and its report
        if verbose:
            print('Classification Model: ', model.__class__.__name__, '\n')
            print('Sampling Strategy Model: ', sampling_strat, '\n')
            print(confusion_matrix(y_test, test_preds), '\n')
            print(classification_report_imbalanced(y_test, test_preds), '\n')

    #round the results for convenience
    f1 = [float(round(n, 4)) for n in f1]
    auc = [float(round(n, 4)) for n in auc]
    g_mean = [float(round(n, 4)) for n in g_mean]
    accuracy = [float(round(n, 4)) for n in accuracy]
    precision = [float(round(n, 4)) for n in precision]
    recall = [float(round(n, 4)) for n in recall]

    #store results in dataframe

    results = pd.DataFrame(
        [
            classifier, strategy, samp_technique, f1, auc, g_mean, accuracy,
            precision, recall
        ],
        index=[
            'classifier', 'strategy', 'samp_technique', 'f1', 'roc_auc',
            'g_mean', 'accuracy', 'precision', 'recall'
        ],
        columns=[
            'DecisionTreeClassifier', 'RUSBoostClaassifier',
            'LogisiticRegression', 'BalancedBaggingClassifier',
            'RandomForestClassifier', 'EasyEnsembleClassifier',
            'BalancedRandomForestClassifier'
        ])

    if plot:

        results_table.set_index('classifiers', inplace=True)
        fig = plt.figure(figsize=(8, 6))
        results_table = results_table.sort_values(by=['auc_score'], ascending=False)

        for i in results_table.index:

            plt.plot(results_table.loc[i]['fpr'],
                     results_table.loc[i]['tpr'],
                     label="{}, AUC={:.4f}".format(
                         i, results_table.loc[i]['auc_score']))

        plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

        plt.xticks(np.arange(0.0, 1.1, step=0.1))
        plt.xlabel("False Positive Rate", fontsize=15)

        plt.yticks(np.arange(0.0, 1.1, step=0.1))
        plt.ylabel("True Positive Rate", fontsize=15)

        plt.title(
            'ROC Curve for classifiers using full data split with sampling technique: {}'
            .format(sampling_strat),
            fontweight='bold',
            fontsize=15)
        plt.legend(prop={'size': 13}, loc='lower right')

        plt.show()

    end = time.time()
    print("Time elapsed:", end - start)

    # transpose so that rows correspond to classifiers
    return results.transpose()
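A usage sketch for nosampling_pipeline; 'creditcard.csv' is a placeholder path for any dataset with a binary 'Class' target, not a file named in the original:

# Sketch: run the no-sampling benchmark and inspect the key metrics.
import pandas as pd

df = pd.read_csv('creditcard.csv')
summary = nosampling_pipeline(data=df, verbose=True, clean=False, plot=True)
print(summary[['classifier', 'f1', 'roc_auc', 'g_mean']])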
Example #19
scaler = RobustScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
'''param_grid = {
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":np.arange(210,250)
    }'''
param_grid = {
    "n_estimators": np.array([60, 70, 80, 90, 100, 110, 120, 130, 140])
}
model = BalancedBaggingClassifier()
kfold = KFold(n_splits=10, shuffle=True, random_state=42)  # random_state requires shuffle=True
grid = GridSearchCV(estimator=model,
                    param_grid=param_grid,
                    scoring=scorer,
                    cv=kfold)
grid_result = grid.fit(rescaledX, Y_train)
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
## F2-Score: 0.757
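The scorer passed to GridSearchCV above is not defined in this excerpt; given the F2-score comment, a plausible definition is:

# Assumed definition of `scorer`: an F2 scorer, weighting recall over precision.
from sklearn.metrics import fbeta_score, make_scorer

scorer = make_scorer(fbeta_score, beta=2)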


XGBDT = XGBClassifier(booster='gbtree', n_estimators=200, max_depth=2, min_child_weight=2, random_state=seed_custom, early_stopping_rounds=10)  # 'depth_range' is not an XGBoost parameter; max_depth matches the intent
XGBDT_base_performance = algo_CVmetrics(classifier_object=XGBDT, X_train=X_train, Y_train=Y_train)

## F2-Score: 0.815

## TEST => UNDERBAGGING BETTER W/ DT (ORIGINAL PROPOSAL) OR WITH SVM?
## TEST => EASYENSEMBLE BETTER W/ ADA (ORIGINAL PROPOSAL) OR WITH XGB?

############### HYBRID ALGORITHM 1B: UNDERBAGGING W/ DT ###############

DT = DecisionTreeClassifier(criterion='gini', splitter="best", min_samples_split=2, max_depth=None, max_features=None, random_state=seed_custom)

UBDT = BalancedBaggingClassifier(base_estimator=DT, max_samples=1.0, random_state=seed_custom, n_jobs=-1)

classdist_grid = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
bag_grid = [10, 20, 30, 40, 50, 100]

f2_metric = make_scorer(metrics.fbeta_score, beta=2)
ub_grid = dict(sampling_strategy=classdist_grid, n_estimators=bag_grid)

UBDTopt_params_final, UBDTopt_metrics_final = multi_RSCV(method=UBDT, grid=ub_grid, X=X_train, Y=Y_train, metric=f2_metric, n_candidates=5, it=50)

## needs too many bags; not as good as the sampling algorithms
## significant improvement

UBDT_opt = BalancedBaggingClassifier(base_estimator=DT, n_estimators=50, sampling_strategy=1.0, max_samples=1.0, random_state=seed_custom, n_jobs=-1)
UBDT_performance = algo_CVmetrics(classifier_object=UBDT_opt, X_train=X_train, Y_train=Y_train)
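multi_RSCV is a project-specific helper not shown here; a plain scikit-learn stand-in using RandomizedSearchCV (cv=5 is an assumption; n_iter mirrors it=50):

# Sketch: randomized search over the underbagging grid defined above.
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(estimator=UBDT, param_distributions=ub_grid,
                            n_iter=50, scoring=f2_metric, cv=5,
                            random_state=seed_custom, n_jobs=-1)
search.fit(X_train, Y_train)
print(search.best_params_, search.best_score_)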
Example #21
 def unbalance_helper(self,
                      imbalance_method='under_sampling',
                      search_method='grid'):
     '''
     @description: handle unbalanced data, then search for the best parameters
     @param {type}
     imbalance_method: three options: under_sampling for ClusterCentroids, over_sampling for SMOTE, ensemble for BalancedBaggingClassifier
     search_method: two options, grid or bayesian optimization
     @return: None
     '''
     logger.info("get all freature")
     # 生成所有feature
     self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
     )
     model_name = None
     # choose how to handle the class imbalance: over-sampling, under-sampling, or ensemble
     ###########################################
     #          TODO: module 4 task 1.1        #
     ###########################################
     if imbalance_method == 'over_sampling':
         logger.info("Use SMOTE deal with unbalance data ")
         self.X_train, self.y_train = SMOTE().fit_resample(
             self.X_train, self.y_train)
         self.X_test, self.y_test = SMOTE().fit_resample(
             self.X_test, self.y_test)  # bug fix: previously resampled the training data into the test split
         model_name = 'lgb_over_sampling'
     elif imbalance_method == 'under_sampling':
         logger.info("Use ClusterCentroids deal with unbalance data ")
         self.X_train, self.y_train = ClusterCentroids(
             random_state=0).fit_resample(self.X_train, self.y_train)
         self.X_test, self.y_test = ClusterCentroids(
             random_state=0).fit_resample(self.X_test, self.y_test)
         model_name = 'lgb_under_sampling'
     elif imbalance_method == 'ensemble':
         self.model = BalancedBaggingClassifier(
             base_estimator=DecisionTreeClassifier(),
             sampling_strategy='auto',
             replacement=False,
             random_state=0)
         model_name = 'ensemble'
     logger.info('search best param')
     # use set_params to apply the best parameters found by the search
     if imbalance_method != 'ensemble':
         ###########################################
         #          TODO: module 4 task 1.2        #
         ###########################################
         # param = self.param_search(search_method=search_method)
         # param['params']['num_leaves'] = int(param['params']['num_leaves'])
         # param['params']['max_depth'] = int(param['params']['max_depth'])
         param = {}
         param['params'] = {}
         param['params']['num_leaves'] = 3
         param['params']['max_depth'] = 5
         self.model = self.model.set_params(**param['params'])
     logger.info('fit model ')
     # train the model and log its results
     self.model.fit(self.X_train, self.y_train)
     ###########################################
     #          TODO: module 4 task 1.3        #
     ###########################################
     Test_predict_label = self.model.predict(self.X_test)
     Train_predict_label = self.model.predict(self.X_train)
     per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                      Train_predict_label,
                                      Test_predict_label)
     # log training-set precision
     logger.info('Train accuracy %s' % per)
     # log test-set accuracy
     logger.info('test accuracy %s' % acc)
     # log test-set recall
     logger.info('test recall %s' % recall)
     # log test-set F1-score
     logger.info('test F1_score %s' % f1)
     self.save(model_name)
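For orientation, the three supported call patterns; Trainer is a hypothetical name for the class that owns unbalance_helper, which this excerpt does not show:

# Sketch: exercising each imbalance_method branch.
trainer = Trainer()  # hypothetical owning class
trainer.unbalance_helper(imbalance_method='over_sampling')   # SMOTE
trainer.unbalance_helper(imbalance_method='under_sampling')  # ClusterCentroids
trainer.unbalance_helper(imbalance_method='ensemble')        # BalancedBaggingClassifier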
pred_test = classifier.predict(test[predictors])
cm_test = confusion_matrix(test[target],pred_test)
print(cm_test)
acc_test = accuracy_score(test[target],pred_test)
print(acc_test) # 65%

# The classes in the target variable are imbalanced.
# Good : Risky = 4 : 1 ratio

# Ensembling method
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Create an object of the classifier
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy', random_state=0),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

Y_train = train['Taxable.Income']
X_train = train.drop(['Taxable.Income'], axis=1)
X_test = test.drop(['Taxable.Income'], axis=1)
Y_test = test['Taxable.Income']

# Train the classifier
bbc.fit(X_train, Y_train)
preds = bbc.predict(X_test)

pd.Series(preds).value_counts()
# Confusion matrix
pd.crosstab(Y_test,preds)
# Accuracy
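The crosstab above stops short of the accuracy figure; a plausible completion (an assumption, not the original code):

from sklearn.metrics import accuracy_score
print(accuracy_score(Y_test, preds))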
Example #23
# In[ ]:


# Balanced bagging


# In[221]:


from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier

#Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

y_train = train['m13']
X_train = train.drop(['m13'], axis = 1)

#Train the classifier.
bbc.fit(X_train, y_train)
pred_y_1 = bbc.predict(X_train)
# print( accuracy_score(y_test, pred_y_1) )
# print(recall_score(y_test, pred_y_1))
# confusion_matrix(y_test, pred_y_1)


def prepare_kfold_cv_data(k, X, y):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    X_train = []
    y_train = []
    X_test = []
    y_test = []

    for train_index, test_index in kf.split(X):
        X_train.append(X[train_index])
        y_train.append(y[train_index])
        X_test.append(X[test_index])
        y_test.append(y[test_index])
    return X_train, y_train, X_test, y_test


# Balanced Bagging Classifier
bb_classifier = BalancedBaggingClassifier(
    base_estimator=RandomForestClassifier(criterion='entropy'),
    n_estimators=5,
    bootstrap=True)

# creating a dictionary of models
models_dictionary = OrderedDict()

models_dictionary['Balanced Bagging'] = bb_classifier


# perform data modeling
def perform_data_modeling(_models_, _imputers_, verbose=False, k_folds=5):

    # 7 Models
    # 4 Imputers
    # 5 datasets (for 5 years)
    # 7 metrics, averaged over all the K-Folds
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))

    print(
        '************** BalancedRandomForestClassifier(ensemble) ***********')
    pipe = make_pipeline_imb(vect,
                             BalancedRandomForestClassifier(max_depth=40))
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))

    print('************** BalancedBaggingClassifier(ensemble) ***********')
    pipe = make_pipeline_imb(vect, BalancedBaggingClassifier(random_state=42))
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    preds_train = pipe.predict(X_train)
    print(classification_report_imbalanced(y_val, preds))
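Both vect and make_pipeline_imb are defined outside this excerpt; a plausible reading (an assumption, not the original code) is a TF-IDF vectorizer combined through imbalanced-learn's pipeline factory:

# Assumed definitions for the names used above.
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(ngram_range=(1, 2), min_df=2)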

#################### test data result ######################
import numpy as np


def save_pred_result(name='', preds=[]):
    result_dir = 'results/'
    filename = 'sampleSubmission_v2_%s.csv' % name
    y = [[i + 1, preds[i]] for i in range(len(preds))]
    if not os.path.exists(result_dir):
        os.makedirs(result_dir)
Example #26
def model_baseline3(x_train, y_train, x_test, y_test):
    bagging = BaggingClassifier(random_state=0)
    balanced_bagging = BalancedBaggingClassifier(random_state=0)
    bagging.fit(x_train, y_train)
    balanced_bagging.fit(x_train, y_train)
    prob = bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    y_pred = [1 if x > 0.5 else 0 for x in predict_score]
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.interp(mean_fpr, fpr, tpr)  # scipy's interp is deprecated; np.interp is equivalent here
    x_auc = auc(fpr, tpr)
    fig = plt.figure('Bagging')
    ax = fig.add_subplot(1, 1, 1)
    name = 'base_Bagging'
    plt.plot(mean_fpr,
             mean_tpr,
             linestyle='--',
             label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
             (x_auc, loss_val),
             lw=2)
    y_pred_bagging = bagging.predict(x_test)
    cm_bagging = confusion_matrix(y_test, y_pred_bagging)
    cm1 = plt.figure()
    plot_confusion_matrix(cm_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BaggingClassifier')
    # balanced_bagging
    prob = balanced_bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    plt.figure('Bagging')  # switch back to the shared ROC figure
    name = 'base_Balanced_Bagging'
    plt.plot(mean_fpr,
             mean_tpr,
             linestyle='--',
             label='{} (area = %0.2f, logloss = %0.2f)'.format(name) %
             (x_auc, loss_val),
             lw=2)
    y_pred_balanced_bagging = balanced_bagging.predict(x_test)
    cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
    cm2 = plt.figure()
    plot_confusion_matrix(cm_balanced_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BalancedBagging')
    plt.figure('Bagging')  # switch back to the shared ROC figure
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Luck')
    # make nice plotting
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    return cm1, cm2, fig
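A usage sketch for model_baseline3; the split parameters are assumptions, and X, y stand for the full feature matrix and labels:

# Sketch: drive the bagging-vs-balanced-bagging baseline comparison.
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=0)
cm_bag, cm_bal, roc_fig = model_baseline3(x_tr, y_tr, x_te, y_te)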
    def _init_classifier(self, opt):
        if "classifier_opt" in opt:
            opt = opt['classifier_opt']
        if "base_estimator" in opt:
            b_est = self._init_classifier(opt["base_estimator"])
        else:
            b_est = None

        if "n_estimators" in opt:
            n_estimators = opt["n_estimators"]
        else:
            n_estimators = 200

        if "max_iter" in opt:
            max_iter = opt["max_iter"]
        else:
            max_iter = 100000

        if "num_parallel_tree" in opt:
            num_parallel_tree = opt["num_parallel_tree"]
        else:
            num_parallel_tree = 5

        if "layer_structure" in opt:
            layer_structure = opt["layer_structure"]
        else:
            layer_structure = (100,)

        if opt["type"] in ["random_forrest", "rf"]:
            return RandomForestClassifier(n_estimators=n_estimators, class_weight="balanced", n_jobs=-1)
        elif opt["type"] == "ada_boost":
            return AdaBoostClassifier(base_estimator=b_est, n_estimators=n_estimators)
        elif opt["type"] in ["logistic_regression", "lr"]:
            return LogisticRegression(class_weight='balanced', max_iter=max_iter)
        elif opt["type"] == "sgd":
            return SGDClassifier(class_weight='balanced', max_iter=max_iter)
        elif opt["type"] in ["gaussian_bayes", "bayes", "gaussian_nb"]:
            return GaussianNB()
        elif opt["type"] in ["support_vector_machine", "svm"]:
            return SVC(kernel='rbf', class_weight='balanced', gamma="scale")
        elif opt["type"] in ["multilayer_perceptron", "mlp"]:
            return MLPClassifier(hidden_layer_sizes=layer_structure, max_iter=max_iter)
        elif opt["type"] in ["decision_tree", "dt", "tree"]:
            return DecisionTreeClassifier()
        elif opt["type"] in ["b_decision_tree", "b_dt", "b_tree"]:
            return DecisionTreeClassifier(class_weight="balanced")
        elif opt["type"] in ["neighbours", "knn"]:
            return KNeighborsClassifier(n_neighbors=opt["n_neighbours"])
        elif opt["type"] == "extra_tree":
            return ExtraTreesClassifier(n_estimators=n_estimators, class_weight="balanced", n_jobs=-1)
        elif opt["type"] == "xgboost":
            return XGBClassifier(objective='binary:logistic',
                                 n_estimators=n_estimators,
                                 num_parallel_tree=num_parallel_tree,
                                 tree_method="hist",
                                 booster="gbtree",
                                 n_jobs=-1)
        elif opt["type"] in ["b_random_forrest", "b_rf"]:
            return BalancedRandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
        elif opt["type"] == "b_bagging":
            return BalancedBaggingClassifier(base_estimator=b_est, n_estimators=n_estimators)
        elif opt["type"] == "b_boosting":
            return RUSBoostClassifier(base_estimator=b_est, n_estimators=n_estimators)
        else:
            raise ValueError("type: {} not recognised".format(opt["type"]))
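A sketch of an opt dictionary that exercises the "b_bagging" branch above, with a class-weight-balanced decision tree as the base estimator; the owning object is left hypothetical:

# Sketch: configuration resolved by _init_classifier into a BalancedBaggingClassifier.
opt = {"classifier_opt": {"type": "b_bagging",
                          "n_estimators": 100,
                          "base_estimator": {"type": "b_tree"}}}
clf = model._init_classifier(opt)  # `model` is the hypothetical owning instance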
 def __init__(self):
     self.reg = BalancedBaggingClassifier(n_estimators=50, random_state=42)
Example #29
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import utils
import numpy as geek
from datetime import datetime

# Create a dictionary of models with the corresponding pipeline object
# "Balanced RF": Pipeline([('clf', BalancedRandomForestClassifier())]),
Models = {
    "Random Forest": Pipeline([('clf', RandomForestClassifier())]),
    "Bagging": Pipeline([('clf', BaggingClassifier())]),
    "Balanced RF": Pipeline([('clf', RandomForestClassifier())]),
    "Bootstrap Class Weighting RF":
    Pipeline([('clf', RandomForestClassifier())]),
    "Balanced Bagging": Pipeline([('clf', BalancedBaggingClassifier())])
}

# Create dictionary with parameters of models
# ModelParam={"Random Forest":{"n_estimators":50 ,"criterion": "gini","oob_score":True,"n_jobs": -1},
ModelParam = {
    "Random Forest": {
        "n_estimators": 100,
        "max_features": "log2",
        "oob_score": True,
        "n_jobs": -1
    },
    "Bagging": {
        "base_estimator": [
            DecisionTreeClassifier(max_features="auto",
                                   splitter="random",
Example #30
for i in range(8, len(Residues) - 8):
    r = (Residues[i - 8:i + 9]).upper()  # convert sequences to patterns of size 17
    t = []
    for j in r:  # Binary Encoding of Patterns
        t = t + Encoding[j]
    Predictors.append(t)

Average_Predictions = [0 for i in range(len(Predictors))]  # average of 5 random runs

for i in range(5):
    print("> Run:", i + 1)
    SVM = svm.SVC(kernel="rbf", gamma=0.1, C=2)
    BBC = BalancedBaggingClassifier(base_estimator=SVM)
    BBC.fit(Patterns, Labels)
    P = BBC.predict(Predictors)
    for j in range(len(P)):  # use a new loop variable to avoid shadowing the run index
        Average_Predictions[j] += P[j]

for i in range(len(Average_Predictions)):
    if Average_Predictions[i] < 0:
        Average_Predictions[i] = -1
    else:
        Average_Predictions[i] = 1

Result = pd.DataFrame()  # Exporting Predictions
Result["ID"] = Test["ID"]
Result["Lable"] = Average_Predictions
Result.to_csv("2018022_AVG_SVM_BBC.txt", index=False)