Example #1
def test_balanced_bagging_classifier_samplers(sampler, n_samples_bootstrap):
    # check that we can pass any kind of sampler to a bagging classifier
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={
            0: 20,
            1: 25,
            2: 50
        },
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=sampler,
        random_state=0,
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)

    # check that we have balanced class with the right counts of class
    # sample depending on the sampling strategy
    assert_array_equal(list(clf.estimators_[0][-1].class_counts_.values()),
                       n_samples_bootstrap)
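A minimal standalone sketch of the same idea, assuming imbalanced-learn >= 0.8 (where `BalancedBaggingClassifier` accepts a `sampler` argument); the `NearMiss` choice below is illustrative:

from imblearn.datasets import make_imbalance
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import NearMiss
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, y = make_imbalance(
    iris.data,
    iris.target,
    sampling_strategy={0: 20, 1: 25, 2: 50},
    random_state=0,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Any imbalanced-learn sampler instance can be passed via `sampler`.
clf = BalancedBaggingClassifier(n_estimators=2, sampler=NearMiss(version=3), random_state=0)
clf.fit(X_train, y_train)
print(clf.predict(X_test))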
Example #2
  def ranking_borda_BalancedBagging(self):
    rankings = np.zeros(len(self.X.columns),)
    std = np.zeros(len(self.X.columns),)

    for _ in range(self.loops):
      seed = randint(0, 10000)

      # Split the train/validation set with a seed generated randomly each loop.
      X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
      # Initialize a balanced bagging ensemble of decision trees.
      rf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
      # Fit the ensemble and compute the baseline Matthews correlation coefficient.
      rf.fit(X_train, y_train)
      mattheworiginal = matthews_corrcoef(y_fr, rf.predict(X_fr))
      # Initialize two lists to collect values in the inner loop.
      matthewscores = []
      columnsrf = []

      for col in self.X.columns:
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
        # Drop a different column on each iteration.
        X_train = X_train.drop([col], axis=1)
        X_fr = X_fr.drop([col], axis=1)
        # Refit the ensemble, this time with the training set lacking one feature.
        rf.fit(X_train, y_train)
        matthew = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Record the dropped column...
        columnsrf.append(col)
        # ...and the drop (or gain) in MCC observed when the feature was missing.
        matthewscores.append(mattheworiginal - matthew)

      outcome = np.array(list(zip(columnsrf, matthewscores)))
      outcomepd = pd.DataFrame(data=outcome, columns=['Variables', 'mcc-punish'])
      # zip through a numpy array yields strings; convert back before ranking
      outcomepd['mcc-punish'] = pd.to_numeric(outcomepd['mcc-punish'])
      outcomepd['ranking'] = outcomepd['mcc-punish'].rank(ascending=False)

      rankings = np.add(outcomepd['ranking'].to_numpy(), rankings)
      # Stack each run's rankings vertically to get a 2-D numpy array.
      std = np.vstack((outcomepd['ranking'].to_numpy(), std))

    std = np.delete(std, -1, axis=0)  # drop the initial row of zeros
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    featuresranks = np.dstack((columnsrf, rankings))
    std = pd.DataFrame(data=np.squeeze(std, axis=0), columns=['Categories', 'STD'])
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0), columns=['Categories', 'Borda-Score'])
    borda = borda.merge(std, on='Categories')
    borda['Borda-Score'] = pd.to_numeric(borda['Borda-Score'])
    borda['Borda-Average'] = borda['Borda-Score'] / self.loops
    borda['ranking'] = borda['Borda-Score'].rank(ascending=True)
    borda.sort_values(by='Borda-Score', inplace=True)

    return borda
Example #3
  def ranking_by_matthew_punishment_rf(self):

    std = np.zeros(len(self.X.columns),)
    rankings = np.zeros(len(self.X.columns),)

    for _ in range(self.loops):
      seed = randint(0, 10000)
      # Split the train/validation set with a seed generated randomly each loop.
      X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
      # Initialize a balanced bagging ensemble of decision trees.
      rf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
      # Fit the ensemble and compute the baseline Matthews correlation coefficient.
      rf.fit(X_train, y_train)
      mcc_original = matthews_corrcoef(y_fr, rf.predict(X_fr))
      # Initialize two lists to collect values in the inner loop.
      mcc_drops = []
      columnsrf = []

      for col in self.X.columns:
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
        # Drop a different column on each iteration.
        X_train = X_train.drop([col], axis=1)
        X_fr = X_fr.drop([col], axis=1)
        # Refit the ensemble, this time with the training set lacking one feature.
        rf.fit(X_train, y_train)
        mcc = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Record the dropped column...
        columnsrf.append(col)
        # ...and the drop (or gain) in MCC observed when the feature was missing.
        mcc_drops.append(mcc_original - mcc)

      outcome = np.array(mcc_drops)
      rankings = np.add(outcome, rankings)
      std = np.vstack((outcome, std))

    rankings = np.true_divide(rankings, self.loops)
    std = np.delete(std, -1, axis=0)  # drop the initial row of zeros
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    std = pd.DataFrame(data=np.squeeze(std, axis=0), columns=['Categories', 'SD_of_matt_punishment'])
    featuresranks = np.dstack((columnsrf, rankings))
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0), columns=['Categories', 'average-mtt-punishment'])
    # dstack with string column names yields strings; convert back before ranking
    borda['average-mtt-punishment'] = pd.to_numeric(borda['average-mtt-punishment'])
    borda['ranking'] = borda['average-mtt-punishment'].rank(ascending=False)
    borda = borda.merge(std, on='Categories')
    borda.sort_values(by='average-mtt-punishment', inplace=True, ascending=False)

    return borda
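Both methods above implement drop-column importance with a BalancedBaggingClassifier scored by MCC. A compact, self-contained sketch of the same pattern (the dataset and all names are illustrative):

import pandas as pd
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=6, weights=[0.9, 0.1], random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(6)])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

clf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
baseline = matthews_corrcoef(y_val, clf.fit(X_train, y_train).predict(X_val))
drops = {
    col: baseline - matthews_corrcoef(
        y_val,
        clf.fit(X_train.drop(columns=col), y_train).predict(X_val.drop(columns=col)),
    )
    for col in X.columns
}
print(pd.Series(drops).sort_values(ascending=False))  # larger drop = more important feature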
Example #4
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={
            0: 20,
            1: 25,
            2: 50
        },
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0,
    ).fit(X_train, y_train)

    clf2 = make_pipeline(
        RandomUnderSampler(
            random_state=clf1.estimators_[0].steps[0][1].random_state),
        KNeighborsClassifier(),
    ).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
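Each member of a fitted `estimators_` list is an imbalanced-learn Pipeline whose first step is the random under-sampler, which is what the test above relies on when it reads `clf1.estimators_[0].steps[0][1].random_state`. A quick inspection, continuing from `clf1`:

print(clf1.estimators_[0])           # Pipeline: RandomUnderSampler -> KNeighborsClassifier
print(clf1.estimators_[0].steps[0])  # the (name, RandomUnderSampler) step used above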
def balanced_bagging_comparison(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    # Plain bagging on the imbalanced training data.
    bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    bagging.fit(X_train, y_train.values.ravel())
    y_pred_bc = bagging.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_pred_bc)
    # The ratio below is the recall on class 1, expressed as a percentage.
    without = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Imbalanced (bagging): {}%".format(without))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    # Bagging on the SMOTE-oversampled training data.
    bagging_oversampling = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    bagging_oversampling.fit(X_train_res, y_train_res.ravel())
    y_pred_bc = bagging_oversampling.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_pred_bc)
    with_oversampling = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("With oversampling (bagging): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    # Balanced bagging: each bootstrap sample is randomly undersampled.
    balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    balanced_bagging.fit(X_train, y_train.values.ravel())
    y_pred_bbc = balanced_bagging.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_pred_bbc)
    within = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Balanced (bagging): {}%".format(within))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    objects = ('Bagging', 'Bagging with SMOTE oversampling', 'Bagging with random undersampling')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Recall on class 1 (%)')
    plt.title('Bagging comparison')
    plt.show()
    return without, within
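A sketch of preparing the inputs this function expects: pandas frames for `X_train`/`y_train` (it calls `.values`) and a SMOTE-resampled copy of the training set (all names and the toy dataset are illustrative; numpy and matplotlib are assumed imported as in the function above):

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=0)
X, y = pd.DataFrame(X), pd.DataFrame(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
X_train_res, y_train_res = SMOTE(random_state=0).fit_resample(X_train, y_train.values.ravel())
balanced_bagging_comparison(X_train, y_train, X_test, y_test, X_train_res, y_train_res)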
def Model_3(train, test):
    ''' Trains the model and saves the predictions in CSV files
        train : training set
        test : test set
    '''
    # Preprocessing
    X_train = [DPC(i) for i in train['Sequence']]
    X_test = [DPC(i) for i in test['Sequence']]
    Y_train = train['label']

    # Training
    clf = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(
        bootstrap=False, n_estimators=450, random_state=6),
                                    n_estimators=25,
                                    n_jobs=-1,
                                    random_state=6,
                                    verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_3.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_3.csv", index=False)
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators"
                         " does not", clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example #8
def imblearn_(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    printStats(y_test, y_pred)
    return clf, y_pred
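Older imbalanced-learn releases used `ratio`, which was later renamed `sampling_strategy`. A version-neutral sketch of the same wrapper pattern, using library defaults and a classification report in place of the external `printStats` helper (data and names are illustrative):

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)
clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(), random_state=0)
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))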
Example #9
def impute_by_model(df, df_test, impList, classifier):

    # convert ' ?' to NaN so that those values will be converted to -1 when transformed to numerical
    df, df_test = unknown_to_NAN(df, df_test)

    # create a new df by dropping all rows having NAN values
    # only to build model for imputation
    dropna_df = df.dropna(how='any').reset_index(drop=True)

    # before converting both df and df_test to numerical, replace the value below
    # with its column mode so that native_country maps to the same numerical
    # value for each country
    df['native_country'] = df['native_country'].replace(' Holand-Netherlands',
                                                        ' United-States')

    # convert to numerical
    num_dropna_df = df2num(dropna_df, headers)
    num_df_test = df2num(df_test, headers)
    num_df = df2num(df, headers)

    # to learn model on dataset which dropped rows contains missing values
    Xtr_train = num_dropna_df[impList[0]].values
    ytr_train = num_dropna_df[impList[1]].values

    # column with missing values from the training data, used to impute the training set
    Xtr_test = num_df[impList[0]].values

    # column with missing values from the test data, used to impute the test set
    Xt_test = num_df_test[impList[0]].values

    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    random_state=0)
    clf.fit(Xtr_train, ytr_train)

    # impute training data
    ytr_pred = clf.predict(Xtr_test)
    lst = df.loc[num_df[impList[1]] == -1, impList[1]].index.tolist()
    num_df.loc[lst, impList[1]] = ytr_pred[lst]

    # impute test data
    yt_pred = clf.predict(Xt_test)
    lstt = df_test.loc[num_df_test[impList[1]] == -1,
                       impList[1]].index.tolist()
    num_df_test.loc[lstt, impList[1]] = yt_pred[lstt]
    # return the imputed numerical frames
    return num_df, num_df_test
class Classifier(BaseEstimator):
    def __init__(self):
        self.reg = BalancedBaggingClassifier(n_estimators=50, random_state=42)

    def fit(self, X, y):
        self.reg.fit(X, y)
        return self

    def predict(self, X):
        return self.reg.predict(X)
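Because the wrapper subclasses `BaseEstimator`, it picks up `get_params`/`set_params` and can be cloned by scikit-learn utilities. A sketch of dropping it into cross-validation (the toy data is illustrative):

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
print(cross_val_score(Classifier(), X, y, cv=3, scoring='balanced_accuracy'))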
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                       random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False,
                                    random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
Example #14
def buildModel(X, y):
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))

    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))

    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))
def clf_wrapper(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto', 
                                    replacement=False, 
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cfm = confusion_matrix(y_test, y_pred)
    
    # Row-wise ratios of the confusion matrix: recall of class 0 (PPV here),
    # recall of class 1 (NPV here), and overall accuracy (ACR)
    PPV = cfm[0,0]/(cfm[0,0]+cfm[0,1])
    NPV = cfm[1,1]/(cfm[1,0]+cfm[1,1])
    ACR = (cfm[0,0]+cfm[1,1])/(cfm[0,0]+cfm[1,1]+cfm[1,0]+cfm[0,1])
    return (PPV+NPV+ACR)/3
Example #16
def classifier_imblearn_SVM_training(_X, _Y, _weight):
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        _X, _Y, _weight, test_size=0.2, random_state=0xdeadbeef)
    bbc = BalancedBaggingClassifier(base_estimator=SVC(kernel="rbf",
                                                       gamma="auto"),
                                    n_estimators=10,
                                    sampling_strategy="auto",
                                    max_samples=80,
                                    replacement=False,
                                    random_state=0xdeadbeef)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print("Result from bagging labeled SVM:")
    print("tn, fp, fn, tp =", confusion_matrix(Y_test, y_pred).ravel())
Example #17
def Model_Building():

    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Train.csv',
                    engine='python')
    Y_train = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_train = X.values

    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Test.csv',
                    engine='python')
    Y_test = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_test = X.values

    imp1.fit(X_train)
    X_train = imp1.transform(X_train).astype(float)
    # print(X_train)
    imp2.fit(X_test)
    X_test = imp2.transform(X_test).astype(float)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # use the statistics learned on the training set for the test set
    X_test = scaler.transform(X_test)

    print(X_train.shape)

    bbc = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(n_estimators=100),
                                    ratio='auto',
                                    replacement=False,
                                    random_state=0,
                                    bootstrap_features=False)

    clf = SelectKBest(mutual_info_classif, k=49)
    X_train = clf.fit_transform(X_train, Y_train)
    X_test = clf.transform(X_test)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)

    print(confusion_matrix(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))

    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred, pos_label=1)
    auc_score = metrics.auc(fpr, tpr)
    print('auc score =', auc_score)
    print('gini score =', 2 * auc_score - 1)
def test_single_estimator():
    # Check singleton ensembles.
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=0)

    clf1 = BalancedBaggingClassifier(
        base_estimator=KNeighborsClassifier(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=0).fit(X_train, y_train)

    clf2 = make_pipeline(RandomUnderSampler(
        random_state=clf1.estimators_[0].steps[0][1].random_state),
                         KNeighborsClassifier()).fit(X_train, y_train)

    assert_array_equal(clf1.predict(X_test), clf2.predict(X_test))
# Fitting Balanced Bagging to the training data: Model 5
# (the start of this call is truncated in the source; a plausible reconstruction)
from imblearn.ensemble import BalancedBaggingClassifier
classifier_5 = BalancedBaggingClassifier(n_estimators=10,
                                         random_state=0,
                                         bootstrap=True)
classifier_5.fit(X_train, Y_train)

# Fitting Decision Tree to the training data: Model 6
from sklearn.tree import DecisionTreeClassifier
classifier_6 = DecisionTreeClassifier()
classifier_6.fit(X_train, Y_train)


# Predicting the results
y_pred_1 = classifier_1.predict(X_test)
y_pred_2 = classifier_2.predict(X_test)
y_pred_3 = classifier_3.predict(X_test)
y_pred_4 = classifier_4.predict(X_test)
y_pred_5 = classifier_5.predict(X_test)
y_pred_6 = classifier_6.predict(X_test)

# Creating the confusion matrix
from sklearn.metrics import confusion_matrix
cm_1 = confusion_matrix(Y_test, y_pred_1)
accuracy_1 = (cm_1[0, 0] + cm_1[1, 1]) / len(Y_test)

cm_2 = confusion_matrix(Y_test, y_pred_2)
accuracy_2 = (cm_2[0, 0] + cm_2[1, 1]) / len(Y_test)

cm_3 = confusion_matrix(Y_test, y_pred_3)
accuracy_3 = (cm_3[0, 0] + cm_3[1, 1]) / len(Y_test)

cm_4 = confusion_matrix(Y_test, y_pred_4)
accuracy_4 = (cm_4[0, 0] + cm_4[1, 1]) / len(Y_test)
Example #20
###############################################################################
# Instead of using a single tree, we will check whether an ensemble of
# decision trees can actually alleviate the issue induced by the class
# imbalance. First, we will use a bagging classifier and its counterpart,
# which internally uses random under-sampling to balance each bootstrap
# sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50,
                                             random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_bc),
    geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging,
                      classes=np.unique(satimage.target),
                      ax=ax[0],
                      title='Bagging')
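The listing cuts off after the first panel; the second panel presumably mirrors it for the balanced classifier, along these lines:

print('Balanced Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_bbc),
    geometric_mean_score(y_test, y_pred_bbc)))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_bbc)
plot_confusion_matrix(cm_balanced_bagging,
                      classes=np.unique(satimage.target),
                      ax=ax[1],
                      title='Balanced bagging')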
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
gbm_params2 = {'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
               'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
               'n_estimators': [50, 100, 500, 1000, 1500],
               'min_samples_leaf': [5, 10, 15]}
gbm = GradientBoostingClassifier()
grid = GridSearchCV(gbm, gbm_params2, refit=True, verbose=2)
grid.fit(X_res, y_res)
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))

print(classification_report(y_test,grid_predictions))
from sklearn.metrics import accuracy_score
print( accuracy_score(y_test, grid_predictions) )
print( grid.best_params_)

# # BalancedBaggingClassifier



from imblearn.ensemble import BalancedBaggingClassifier 
bbc = BalancedBaggingClassifier(random_state=42)
bbc.fit(X_train, y_train)
predictions = bbc.predict(X_test)
print(confusion_matrix(y_test,predictions))

print(classification_report(y_test,predictions))
from sklearn.metrics import accuracy_score
print( accuracy_score(y_test, predictions) )
print(grid.best_params_)
print(clf_rf.score(x_val, y_val))
print(recall_score(y_val, clf_rf.predict(x_val)))
print(precision_score(y_val, clf_rf.predict(x_val)))

print('\nTest Results')
print(clf_rf.score(data_features_test, data_labels_test))
print(recall_score(data_labels_test, clf_rf.predict(data_features_test)))
print(precision_score(data_labels_test, clf_rf.predict(data_features_test)))

print("END")
bbc = BalancedBaggingClassifier(random_state=12)
bbc.fit(x_train, np.array(y_train.iloc[:, 0]))

print('Validation Results')
print(bbc.score(x_val, y_val))
print(recall_score(y_val, bbc.predict(x_val)))
print(precision_score(y_val, bbc.predict(x_val)))
print('\nTest Results')
print(bbc.score(data_features_test, data_labels_test))
print(recall_score(data_labels_test, bbc.predict(data_features_test)))
print(precision_score(data_labels_test, bbc.predict(data_features_test)))

clf_xg = GradientBoostingClassifier(learning_rate=0.15,
                                    n_estimators=70,
                                    min_samples_split=0.5,
                                    min_samples_leaf=45,
                                    max_depth=8,
                                    max_features='sqrt',
                                    subsample=0.8)
clf_xg.fit(x_train_res, y_train_res)
Example #23
for i in range(8, len(Residues) - 8):
    # Convert sequences to patterns of size 17
    r = (Residues[i - 8:i + 9]).upper()
    t = []
    for j in r:  # binary encoding of patterns
        t = t + Encoding[j]
    Predictors.append(t)

# Average of 5 random runs
Average_Predictions = [0 for i in range(len(Predictors))]

for i in range(5):
    print("> Run:", i + 1)
    SVM = svm.SVC(kernel="rbf", gamma=0.1, C=2)
    BBC = BalancedBaggingClassifier(base_estimator=SVM)
    BBC.fit(Patterns, Labels)
    P = BBC.predict(Predictors)
    for k in range(len(P)):  # k, not i: i is the run counter
        Average_Predictions[k] += P[k]

# Majority vote over the runs: negative sum -> -1, otherwise 1
for i in range(len(Average_Predictions)):
    if Average_Predictions[i] < 0:
        Average_Predictions[i] = -1
    else:
        Average_Predictions[i] = 1

Result = pd.DataFrame()  # Exporting Predictions
Result["ID"] = Test["ID"]
Result["Lable"] = Average_Predictions
Result.to_csv("2018022_AVG_SVM_BBC.txt", index=False)
print(Result)
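Summing the ±1 predictions and thresholding at zero is a majority vote over the five runs; the same thing in vectorized numpy (the data is a toy stand-in):

import numpy as np

runs = np.array([[ 1, -1,  1],
                 [ 1,  1, -1],
                 [-1,  1,  1],
                 [ 1, -1, -1],
                 [ 1,  1,  1]])
vote = np.where(runs.sum(axis=0) < 0, -1, 1)
print(vote)  # -> [1 1 1]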
Example #24
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier

# Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

y_train = train['m13']
X_train = train.drop(['m13'], axis = 1)

#Train the classifier.
bbc.fit(X_train, y_train)
pred_y_1 = bbc.predict(X_train)
# print( accuracy_score(y_test, pred_y_1) )
# print(recall_score(y_test, pred_y_1))
# confusion_matrix(y_test, pred_y_1)




from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
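                                random_state=0)  # assumed completion; the call is truncated in the source
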
bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BalancedBaggingClassifier')

###############################################################################
# Turning the balanced bagging classifier into a balanced random forest
###############################################################################
# It is possible to turn the ``BalancedBaggingClassifier`` into a balanced
# random forest by using a ``DecisionTreeClassifier`` with
# ``max_features='auto'``. We illustrate such changes below.

balanced_random_forest = BalancedBaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_features='auto'),
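    random_state=0)  # assumed completion; the source listing is truncated here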
Example #26
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Oversampling')

# Strategy: combine resampling with SMOTE-Tomek
# We now try a widely used technique that applies an undersampling
# algorithm and an oversampling algorithm to the dataset at the same
# time. Here SMOTE handles the oversampling: it finds nearby neighbouring
# points and adds new points "in a straight line" between them.
# Tomek links handle the undersampling, removing nearest neighbours of
# different classes and thereby exposing the decision boundary
# (the border zone between our classes) more clearly.
os_us = SMOTETomek(sampling_strategy=0.5)
X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)
print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')
model = run_model(X_train_res, X_test, y_train_res, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Smote-Tomek')

# Strategy: model ensemble with balancing
# For this strategy we use an ensemble classifier based on bagging,
# with a DecisionTree as the model. Let's see how it behaves:
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
# Train the classifier
bbc.fit(X_train, y_train)
y_pred = bbc.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Ensemble BBC')
Example #27
    # The left-hand side of this call is truncated in the source; a plausible
    # reconstruction. `shuffle` expects a bool, so stratifying on Y is
    # presumably what was intended.
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        stratify=Y)

    cl = BalancedBaggingClassifier(
        base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
        n_estimators=50,
        max_samples=0.6,
        max_features=0.7,
        n_jobs=-1,
        bootstrap_features=True,
        oob_score=False)

    cl.fit(X_train, Y_train)

    predictions = cl.predict(X_train)
    # print(X_train.shape,Y_train.shape,predictions.shape)
    # print(list(zip(Y_train,predictions)))
    print('\n\nModel Train: f1 = {0} '.format(
        f1_score(Y_train, predictions, average='micro')))

    predictions = cl.predict(X_test)
    print('\nModel Test: f1 = {0} '.format(
        f1_score(Y_test, predictions, average='micro')))

    # exit()

    cl = BalancedBaggingClassifier(
        base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
        n_estimators=10,
        max_samples=0.8)  # remaining arguments truncated in the source
#-------------------------------------------------------------

#-------------------------------------algorithm comparison chart--------------------------------
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
#-----------------------------------------------------

#---------------------------------TESTING OUR MODEL------------------------------------
balbag = BalancedBaggingClassifier()
balbag.fit(X_train_res, Y_train_res)

predictions = balbag.predict(X_test)
accur = "Accuracy of test data:" + str(
    accuracy_score(Y_test, predictions) * 100)
popupmsg(accur)
print(confusion_matrix(Y_test, predictions))

balbag = BalancedBaggingClassifier(RandomForestClassifier())
balbag.fit(X_train_res, Y_train_res)  # fit before predicting on the unseen data
predictions2 = balbag.predict(X2)
popupmsg("Accuracy of unseen data:" +
         str(accuracy_score(Y2, predictions2) * 100))
predictions2 = predictions2.astype(int)

output = ''
c = 1
for i in predictions2:
Example #29
class Models(object):
    def __init__(self, feature_engineer=False):
        '''
        @description: initialize Class, EX: model
        @param {type} :
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models
        res_model: res network model
        resnext_model: resnext network model
        wide_model: wide res network model
        bert: bert model
        ml_data: new mldata class
        @return: No return
        '''
        # 1. Initialize the resnet152 model with torchvision
        # 2. Initialize the resnext101_32x8d model with torchvision
        # 3. Initialize the wide_resnet101_2 model with torchvision
        # 4. Load the bert model
        print("load")
        self.res_model = torchvision.models.resnet152(pretrained=False)
        self.res_model.load_state_dict(
            torch.load(config.root_path +
                       '/model/resnet150/resnet152-b121ed2d.pth'))
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)

        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)
        self.ml_data = MLData(debug_mode=True)
        if feature_engineer:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            device='gpu',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)
        else:
            self.models = [
                RandomForestClassifier(n_estimators=500,
                                       max_depth=5,
                                       random_state=0),
                LogisticRegression(solver='liblinear', random_state=0),
                MultinomialNB(),
                SVC(),
                lgb.LGBMClassifier(objective='multiclass',
                                   n_jobs=10,
                                   num_class=33,
                                   num_leaves=30,
                                   reg_alpha=10,
                                   reg_lambda=200,
                                   max_depth=3,
                                   learning_rate=0.05,
                                   n_estimators=2000,
                                   bagging_freq=1,
                                   bagging_fraction=0.8,
                                   feature_fraction=0.8),
            ]

    def feature_engineer(self):
        '''
        @description: This function builds all kinds of features
        @param {type} None
        @return:
        X_train, feature of train set
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        '''
        logger.info("generate embedding feature ")
        train_tfidf, test_tfidf, train, test = get_embedding_feature(
            self.ml_data)

        logger.info("generate basic feature ")

        # 1. Get the basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)
        print(test.loc[0])

        logger.info("generate modal feature ")
        cover = os.listdir(config.root_path + '/data/book_cover/')
        train['cover'] = train.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # 1. Get the modal embeddings from the three CV models
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        print(len(test.loc[0, 'res_embedding']))

        #train['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))
        #test['resnext_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.resnext_model))

        #train['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))
        #test['wide_embedding'] = test['cover'].progress_apply(lambda x: get_img_embedding(x,self.wide_model))

        logger.info("generate bert feature ")

        # 1. Get the bert embedding
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print(test.loc[0])

        logger.info("generate lda feature ")

        # 1. Get the lda feature

        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        print(test['queryCutRMStopWord'])
        print(test['bow'])
        # Derive the lda embedding from the bag of words
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))
        print(test['lda'])
        print(test.loc[0])

        logger.info("formate data")
        print(test)
        print(test_tfidf)
        train, test = formate_data(train, test, train_tfidf, test_tfidf)
        print(test)
        print(test.loc[0])

        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        print(cols)
        X_train = train[cols]
        X_test = test[cols]
        print(X_test)
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        print(y_test)
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use parameter search techniques to find the best params
        @param {type}
        search_method: two options. grid or bayesian optimization
        @return: None
        '''
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle imbalanced data, then search for the best params
        @param {type}
        imbalance_method: three options, under_sampling for ClusterCentroids, over_sampling for SMOTE, ensemble for BalancedBaggingClassifier
        search_method: two options. grid or bayesian optimization
        @return: None
        '''
        logger.info("get all freature")
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE to deal with the imbalanced data ")

            # 1. Use over-sampling to handle the class imbalance
            print(self.y_train)
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            print(self.y_train)
            # Resample the test set as well, mirroring the under-sampling
            # branch below (resampling a test set is statistically dubious).
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'

        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")

            # 1. 使用 under_sampling 处理样本不平衡问题
            print(self.X_train)
            #print(self.y_train)
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            print(self.X_train)
            #print(self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'

        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])

        logger.info('fit model ')
        self.model.fit(self.X_train, self.y_train)

        # 1. Predict the labels of the test set
        # 2. Predict the labels of the training set
        # 3. Compute precision, accuracy, recall and f1_score

        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # Report the training-set accuracy
        logger.info('Train accuracy %s' % per)
        # Report the test-set accuracy
        logger.info('test accuracy %s' % acc)
        # Report the recall
        logger.info('test recall %s' % recall)
        # Report the F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        '''
        @description: using different embedding features to train common ML models
        @param {type}
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        feature_method, three options: tfidf, word2vec and fasttext
        @return: None
        '''
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Report the training-set accuracy
            logger.info(model_name + '_' + 'Train accuracy %s' % per)

            # Report the test-set accuracy
            logger.info(model_name + '_' + ' test accuracy %s' % acc)

            # Report the recall
            logger.info(model_name + '_' + 'test recall %s' % recall)

            # Report the F1-score
            logger.info(model_name + '_' + 'test F1_score %s' % f1)

    def predict(self, title, desc):

        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):

        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):

        self.model = joblib.load(path)
Example #30
def model_baseline3(x_train, y_train, x_test, y_test):
    bagging = BaggingClassifier(random_state=0)
    balanced_bagging = BalancedBaggingClassifier(random_state=0)
    bagging.fit(x_train, y_train)
    balanced_bagging.fit(x_train, y_train)
    prob = bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    y_pred = [1 if x > 0.5 else 0 for x in predict_score]
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    fig = plt.figure('Bagging')
    ax = fig.add_subplot(1, 1, 1)
    name = 'base_Bagging'
    plt.plot(mean_fpr,
             mean_tpr,
             linestyle='--',
             label='{} (area = {:.2f}, logloss = {:.2f})'.format(
                 name, x_auc, loss_val),
             lw=2)
    y_pred_bagging = bagging.predict(x_test)
    cm_bagging = confusion_matrix(y_test, y_pred_bagging)
    cm1 = plt.figure()
    plot_confusion_matrix(cm_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BaggingClassifier')
    # balanced_bagging
    prob = balanced_bagging.predict_proba(x_test)[:, 1]
    predict_score = [float('%.2f' % x) for x in prob]
    loss_val = log_loss(y_test, predict_score)
    fpr, tpr, thresholds = roc_curve(y_test, predict_score)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.interp(mean_fpr, fpr, tpr)
    x_auc = auc(fpr, tpr)
    plt.figure('Bagging')  # select the 'Bagging' figure
    name = 'base_Balanced_Bagging'
    plt.plot(mean_fpr,
             mean_tpr,
             linestyle='--',
             label='{} (area = {:.2f}, logloss = {:.2f})'.format(
                 name, x_auc, loss_val),
             lw=2)
    y_pred_balanced_bagging = balanced_bagging.predict(x_test)
    cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
    cm2 = plt.figure()
    plot_confusion_matrix(cm_balanced_bagging,
                          classes=[0, 1],
                          title='Confusion matrix of BalancedBagging')
    plt.figure('Bagging')  # select the 'Bagging' figure
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='k', label='Luck')
    # make nice plotting
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    return cm1, cm2, fig
from sklearn.tree import DecisionTreeClassifier

# Create an object of the classifier
bbc = BalancedBaggingClassifier(base_estimator =DecisionTreeClassifier(criterion='entropy', random_state=0),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)

Y_train = train['Taxable.Income']
X_train = train.drop(['Taxable.Income'], axis=1)
X_test = test.drop(['Taxable.Income'], axis=1)
Y_test = test['Taxable.Income']

# Train the classifier
bbc.fit(X_train, Y_train)
preds = bbc.predict(X_test)

pd.Series(preds).value_counts()
# Confusion matrix
pd.crosstab(Y_test,preds)
# Accuracy
np.mean(preds==Y_test) # 58% [# gini 57%]

# Cross validation K-fold
X = final_data.iloc[:,1:].values
Y = final_data.iloc[:,0].values

from sklearn.model_selection import KFold
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)
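A sketch of actually scoring the balanced bagging classifier with the splitter just created, reusing `bbc`, `X` and `Y` from this example:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(bbc, X, Y, cv=kf)
print(scores, scores.mean())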
# ### Fit the model
# Fit the best model based on tuned parameters
GBM_clf = ensemble.GradientBoostingClassifier(learning_rate=0.05,
                                              max_depth=3,
                                              n_estimators=100)
best_clf = BalancedBaggingClassifier(base_estimator=GBM_clf,
                                     ratio='auto',
                                     replacement=False,
                                     random_state=0)

# Fit the model and check ConfusionMatrix
best_clf.fit(X_train, y_train)

# Check R-Style confusionMatrix

# Convert the object array to a list; the confusion matrix cannot be built otherwise.
y_pred = best_clf.predict(X_test).tolist()
confusionMatrix(y_pred, y_test).show()  # show the confusion matrix
# Classification Report
print('Classification Report:\n',
      classification_report(y_test, y_pred, target_names=["AS", "PsA", "RA"]))

### prepare input for ROC
# number of indications; for a binary problem the effective number of curves is 1
n_classes = len(y_train.unique())
y_score = best_clf.fit(X_train, y_train).decision_function(X_test)
y_test2 = pd.get_dummies(y_test)

ROC(n_classes, y_score, y_test2)
PRC(n_classes, y_test2, y_score)
AUC_model3(best_clf, X_train, y_train, X_test, y_test, n_classes)
Example #33
class Models(object):
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        '''
        @description: initialize Class, EX: model
        @param {type} :
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models
        res_model: res network model
        resnext_model: resnext network model
        wide_model: wide res network model
        bert: bert model
        ml_data: new mldata class
        @return: No return
        '''
        # Load the image models (resnet, resnext, wide resnet); move them to cuda if available
        ###########################################
        #          TODO: module 2 task 2.1        #
        ###########################################
        self.res_model = torchvision.models.resnet152(
            pretrained=True)  # res model for modal feature [1* 1000]
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        # Load the bert model; move it to cuda if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)

        # Initialize the MLdataset class; debug_mode=True uses only part of the data, train_mode indicates whether to train
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        # If not training, load the trained model and predict
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)

        else:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}

    def feature_engineer(self):
        '''
        @description: This function builds all kinds of features
        @param {type} None
        @return:
        X_train, feature of train set
        X_test, feature of test set
        y_train, label of train set
        y_test, label of test set
        '''

        logger.info("generate embedding feature ")
        # Get the tfidf and word2vec features; word2vec is not aggregated in any way
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # Get the autoencoder embedding, taken from the encoder rather than the decoder
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)

        logger.info("generate basic feature ")
        # Get the basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # Load the book cover files
        cover = os.listdir(config.root_path + '/data/book_cover/')
        # Match book covers by title
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the embedding of each cover
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # Build bag-of-words representations
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        # Derive the LDA embedding from the bag-of-words
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))

        logger.info("formate data")
        #  将所有的特征拼接到一起
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)
        #  Build the training and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use parameter search to find the best hyper-parameters
        @param {type}
        search_method: two options, 'grid' or 'bayesian'
        @return: None for grid search; the best parameters for bayesian optimization
        '''
        # Use grid search or Bayesian optimization to find the best parameters
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle unbalanced data, then search for the best parameters
        @param {type}
        imbalance_method: three options, 'under_sampling' (ClusterCentroids), 'over_sampling' (SMOTE), or 'ensemble' (BalancedBaggingClassifier)
        search_method: two options, 'grid' or 'bayesian'
        @return: None
        '''
        logger.info("get all freature")
        # 生成所有feature
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        # Choose how to handle class imbalance: over-sampling, under-sampling, or ensemble
        ###########################################
        #          TODO: module 4 task 1.1        #
        ###########################################
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        # Use set_params to apply the best parameters found by the search
        if imbalance_method != 'ensemble':
            ###########################################
            #          TODO: module 4 task 1.2        #
            ###########################################
            # param = self.param_search(search_method=search_method)
            # param['params']['num_leaves'] = int(param['params']['num_leaves'])
            # param['params']['max_depth'] = int(param['params']['max_depth'])
            # placeholder values standing in for the commented-out search above
            param = {}
            param['params'] = {}
            param['params']['num_leaves'] = 3
            param['params']['max_depth'] = 5
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        # Train and report the model's results
        self.model.fit(self.X_train, self.y_train)
        ###########################################
        #          TODO: module 4 task 1.3        #
        ###########################################
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # Training-set precision
        logger.info('Train accuracy %s' % per)
        # Test-set accuracy
        logger.info('test accuracy %s' % acc)
        # Test-set recall
        logger.info('test recall %s' % recall)
        # Test-set F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        # Process the input and build the features the model needs for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))

        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input title
        desc: input description
        @return: label and probability
        '''
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type}
        model_name, file name for saving
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        self.model = joblib.load(path)
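
# A minimal usage sketch of the class above (hypothetical paths; assumes the
# data files and pretrained models expected by MLData are in place, and that
# the constructor takes model_path/train_mode as in the class below):
if __name__ == '__main__':
    m = Models(train_mode=True)
    # train with balanced bagging; the fitted model is saved as 'ensemble'
    m.unbalance_helper(imbalance_method='ensemble', search_method='grid')
    # reload for inference: ix2label is only built when train_mode is False
    m = Models(model_path=root_path + '/model/ml_model/ensemble',
               train_mode=False)
    print(m.predict('a book title', 'a short description of the book'))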
Exemple #34
0
class Models(object):
    """
    Machine-learning based text classification models.
    """
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        # Load the image models (ResNet, ResNeXt, Wide ResNet); move them to CUDA if available
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)

        # Load the BERT tokenizer and model; move the model to CUDA if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path +
                                              '/model/bert').to(config.device)

        # Initialize the MLData class; debug_mode=True uses only part of the data, train_mode controls whether we train
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)

        # If not training, load the trained model for prediction
        if not train_mode:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            # With feature_engineer, train LightGBM; otherwise compare classic machine-learning models
            if feature_engineer:
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]

    def feature_engineer(self):

        print(" generate embedding feature ")

        # 获取 tfidf 特征, word2vec 特征, word2vec 不进行任何聚合
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)

        # train is a pandas DataFrame; after get_embedding_feature it has:
        # w2v: each word of a sentence encoded by the w2v model; each row is [seq, 300]
        # w2v_label_mean: features relating the sentence embedding ([seq, 300]) to the labels; each row is [300]
        # w2v_label_max: as above with max pooling; each row is [300]
        # w2v_mean: [seq, 300] -> [300]
        # w2v_max: [seq, 300] -> [300]
        # w2v_win_2_mean: sliding-window features; each row is [300]
        # w2v_win_3_mean
        # w2v_win_4_mean
        # w2v_win_2_max
        # w2v_win_3_max
        # w2v_win_4_max
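        # For intuition, a minimal sketch of the window pooling above
        # (hypothetical standalone code, not part of this pipeline):
        #     import numpy as np
        #     sent = np.random.rand(7, 300)           # [seq, 300]
        #     win2 = [sent[i:i + 2].mean(axis=0)      # all size-2 windows
        #             for i in range(sent.shape[0] - 1)]
        #     w2v_win_2_mean = np.mean(win2, axis=0)  # -> [300]
        #     w2v_win_2_max = np.max(win2, axis=0)    # -> [300]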

        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)

        print("generate basic feature ")
        # Get basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        print("generate lda feature ")

        # Build bag-of-words representations
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # one row of test['bow']: [(10, 1), (78, 1), (162, 3), (177, 1), (192, 1)...]

        # Derive the LDA embedding from the bag-of-words
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))
        # one row of test['lda']: [0.002929521957412362, 0.0024772200267761946, ....]; there are 30 topics, so each row is a probability distribution over the 30 topics
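        # A plausible sketch of get_lda_features (hypothetical; the real helper
        # is defined elsewhere in this project), assuming a gensim LdaModel:
        #     def get_lda_features(lda, bow):
        #         topics = lda.get_document_topics(bow, minimum_probability=0.0)
        #         return [prob for _, prob in topics]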

        print("generate modal feature ")
        # List the book-cover image files
        cover = os.listdir(config.book_cover_path)
        # Match each title to its cover image
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get an embedding for each cover image
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
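
        # A plausible sketch of get_img_embedding (hypothetical; the real
        # helper is defined elsewhere in this project):
        #     import torch
        #     from PIL import Image
        #     from torchvision import transforms
        #     def get_img_embedding(path, model):
        #         if not path:              # no cover matched for this title
        #             return [0.] * 1000    # assumed output size of the model
        #         tfm = transforms.Compose([transforms.Resize((224, 224)),
        #                                   transforms.ToTensor()])
        #         img = tfm(Image.open(path).convert('RGB')).unsqueeze(0)
        #         with torch.no_grad():
        #             return model(img.to(config.device)).squeeze().tolist()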

        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        # print("generate autoencoder feature ")
        # Get the autoencoder embedding from the encoder rather than the decoder
        # TODO
        # train_ae = get_autoencoder_feature(
        #     train,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)
        # test_ae = get_autoencoder_feature(
        #     test,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")

        #  将所有的特征拼接到一起
        train = formate_data(
            train,
            train_tfidf)  # train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(
            test, test_tfidf)  # test = formate_data(test, test_tfidf, test_ae)

        #  Build the training and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]

        X_train = train[cols]
        X_test = test[cols]

        print(X_test)

        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)

        y_train = train["labelIndex"]
        y_test = test["labelIndex"]

        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        # Use grid search or Bayesian optimization to find the best parameters
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):

        print("get all feature")

        # Build all features

        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None

        # Choose how to handle class imbalance: over-sampling, under-sampling, or ensemble

        if imbalance_method == 'over_sampling':
            print("Use SMOTE deal with unbalance data ")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        print('search best param')

        # Use set_params to apply the best parameters found by the search

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        print('fit model ')

        # Train and report the model's results

        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # Training-set precision
        print('Train accuracy %s' % per)
        # Test-set accuracy
        print('test accuracy %s' % acc)
        # Test-set recall
        print('test recall %s' % recall)
        # Test-set F1-score
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        # Compare tf-idf, word2vec, fastText and other text representations across common machine-learning models
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Training-set accuracy
            print(model_name + '_' + 'Train accuracy %s' % per)

            # Test-set accuracy
            print(model_name + '_' + ' test accuracy %s' % acc)

            # Test-set recall
            print(model_name + '_' + 'test recall %s' % recall)

            # Test-set F1-score
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):

        # Process the input and build the features the model needs for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in get_stop_word_list()])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''

        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))

        print("generate autoencoder feature ")
        # df_ae = get_autoencoder_feature(df,
        #                                 self.ml_data.ae.max_features,
        #                                 self.ml_data.ae.max_len,
        #                                 self.ml_data.ae.encoder,
        #                                 tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf)  #, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input title
        desc: input description
        @return: label and probability
        '''
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save model
        @param {type}
        model_name, file name for saving
        @return: None
        '''
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return: None
        '''
        self.model = joblib.load(path)
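
# A minimal, self-contained sketch contrasting the three imbalance strategies
# used in unbalance_helper above (synthetic data; parameters are illustrative,
# not the ones used by this project):
from collections import Counter

from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.ensemble import BalancedBaggingClassifier

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
print('original:', Counter(y))

# over-sampling: synthesise minority samples until the classes are balanced
X_os, y_os = SMOTE(random_state=0).fit_resample(X, y)
print('SMOTE:', Counter(y_os))

# under-sampling: replace majority samples by cluster centroids
X_us, y_us = ClusterCentroids(random_state=0).fit_resample(X, y)
print('ClusterCentroids:', Counter(y_us))

# ensemble: keep the data as-is and balance each bootstrap inside the bagging
clf = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                random_state=0).fit(X, y)
print('BalancedBagging train score:', clf.score(X, y))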
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check whether an ensemble of decision
# trees can alleviate the issue induced by class imbalance. First, we will use
# a bagging classifier and its counterpart, which internally uses random
# under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced accuracy
# and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging, classes=np.unique(satimage.target), ax=ax[0],
                      title='Bagging')

print('Balanced Bagging classifier performance:')