def test_warm_start_smaller_n_estimators():
    # Test that a warm-started second fit with a smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    assert_raises(ValueError, clf.fit, X, y)
def test_bagging_with_pipeline():
    X, y = make_imbalance(iris.data, iris.target, ratio={0: 20, 1: 25, 2: 50},
                          random_state=0)
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1),
                      DecisionTreeClassifier()),
        max_features=2)
    estimator.fit(X, y).predict(X)
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BalancedBaggingClassifier(KNeighborsClassifier(),
                                        max_samples=0.5,
                                        max_features=0.5, oob_score=True,
                                        random_state=1)
    assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=2000, random_state=1)

    clf = BalancedBaggingClassifier(n_estimators=50, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=100)
    clf.fit(X, y)

    assert_raises(AttributeError, getattr, clf, "oob_score_")
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2*max_samples, random_state=1)
    bagging = BalancedBaggingClassifier(KNeighborsClassifier(),
                                        max_samples=max_samples,
                                        max_features=0.5, random_state=1)
    bagging.fit(X, y)
    assert bagging._max_samples == max_samples
Example #6
def buildModel(X, y):
    # X = np.reshape(X,(X.shape[0],X.shape[1] * X.shape[2]))
    print(X.shape, y.shape)
    scaler = StandardScaler()
    print(scaler.fit(X))
    scaled_train_x = scaler.transform(X)
    X_train, X_test, y_train, y_test = train_test_split(scaled_train_x,
                                                        y,
                                                        random_state=19,
                                                        test_size=0.3)

    bag = BalancedBaggingClassifier(n_estimators=200, random_state=19)
    svm = SVC(class_weight='balanced',
              random_state=19,
              decision_function_shape='ovo')
    neural = MLPClassifier(max_iter=500,
                           random_state=19,
                           solver='lbfgs',
                           alpha=1e-5,
                           hidden_layer_sizes=(49, 8, 4))
    ada = AdaBoostClassifier(n_estimators=100, random_state=19)
    logistic = LogisticRegression(solver='lbfgs', max_iter=500)

    bag.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    neural.fit(X_train, y_train)
    ada.fit(X_train, y_train)
    logistic.fit(X_train, y_train)
    # joblib.dump(bag,'bag.pkl')
    # joblib.dump(scaler,'scaler.pkl')

    y_pred = bag.predict(X_test)
    y_pred2 = svm.predict(X_test)
    y_pred3 = neural.predict(X_test)
    y_pred4 = ada.predict(X_test)
    y_pred5 = logistic.predict(X_test)

    print(matthews_corrcoef(y_test, y_pred))
    print(matthews_corrcoef(y_test, y_pred2))
    print(matthews_corrcoef(y_test, y_pred3))
    print(matthews_corrcoef(y_test, y_pred4))
    print(matthews_corrcoef(y_test, y_pred5))

    print(confusion_matrix(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred2))
    print(confusion_matrix(y_test, y_pred3))
    print(confusion_matrix(y_test, y_pred4))
    print(confusion_matrix(y_test, y_pred5))

    print(classification_report_imbalanced(y_test, y_pred))
    print(classification_report_imbalanced(y_test, y_pred2))
    print(classification_report_imbalanced(y_test, y_pred3))
    print(classification_report_imbalanced(y_test, y_pred4))
    print(classification_report_imbalanced(y_test, y_pred5))
Example #7
def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BalancedBaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.5,
        max_features=0.5,
        oob_score=True,
        random_state=1,
    )
    assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_
Example #8
def test_bagging_with_pipeline():
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    estimator = BalancedBaggingClassifier(
        make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()),
        max_features=2,
    )
    estimator.fit(X, y).predict(X)
Example #9
def cross_validation(x):
    with open('../data/conv_pred/train_data_' + x + '.pickle', 'rb') as f:
        data = pickle.load(f)
    print(data)
    v = DictVectorizer()
    X = v.fit_transform(data['X'])
    y = np.array(data['y'])

    zero = 0
    one = 0
    for i in y:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    cv = 5
    kf = KFold(n_splits=cv)
    fscore = 0
    ftscore = 0
    all_f_value = 0
    all_prec = 0
    for train_index, test_index in tqdm(kf.split(X)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        #model = RandomForestRe(n_estimators=100, n_jobs=8)
        model = BalancedBaggingClassifier(n_estimators=100, n_jobs=8)
        #model = xgb.XGBClassifier(n_estimators=500,max_delta_step=1,scale_pos_weight=zero/one)
        model.fit(X_train, y_train)
        predict = model.predict_proba(X_test)
        precision, recall, f_value, all_pre = eval(y_test, predict)
        all_prec += all_pre
        fscore += precision
        ftscore += recall
        all_f_value += f_value
    pprint(
        sorted(zip(
            np.mean([
                est.steps[1][1].feature_importances_
                for est in model.estimators_
            ],
                    axis=0), v.feature_names_),
               key=lambda x: x[0],
               reverse=True))
    print('\n')
    print('final precision : ', str(fscore / cv))
    print('final recall : ', str(ftscore / cv))
    print('final f-value : ', str(all_f_value / cv))
    print('final all_precision : ', str(all_prec / cv))
class Classifier(BaseEstimator):
    def __init__(self):
        # mimicking balanced random forest with the BalancedBaggingClassifier
        # and DecisionTreeClassifier combination
        self.bbc = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(max_features='auto'),
            ratio=determine_ratio, random_state=0, n_estimators=50, n_jobs=1)

    def fit(self, X, y):
        self.bbc.fit(X, y)

    def predict_proba(self, X):
        return self.bbc.predict_proba(X)
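
# Hedged sketch, not from the original example: newer imbalanced-learn releases
# also ship BalancedRandomForestClassifier, which packages the same
# "decision tree + per-estimator resampling" idea that the wrapper above mimics.
# The toy dataset and parameter values below are illustrative assumptions only.
from collections import Counter

from sklearn.datasets import make_classification
from imblearn.ensemble import BalancedRandomForestClassifier

# a small, deliberately imbalanced binary problem (roughly 90% / 10%)
X_demo, y_demo = make_classification(n_samples=500, weights=[0.9, 0.1],
                                     random_state=0)
print(Counter(y_demo))

# 50 trees, each trained on a balanced bootstrap sample
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0)
brf.fit(X_demo, y_demo)
print(brf.predict_proba(X_demo)[:5])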
Example #11
def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when valid integer max_samples supplied by user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BalancedBaggingClassifier(
        KNeighborsClassifier(),
        max_samples=max_samples,
        max_features=0.5,
        random_state=1,
    )
    bagging.fit(X, y)
    assert bagging._max_samples == max_samples
Example #12
def classifier_imblearn_SVM_training(_X, _Y, _weight):
    X_train, X_test, Y_train, Y_test, w_train, w_test = train_test_split(
        _X, _Y, _weight, test_size=0.2, random_state=0xdeadbeef)
    bbc = BalancedBaggingClassifier(base_estimator=SVC(kernel="rbf",
                                                       gamma="auto"),
                                    n_estimators=10,
                                    sampling_strategy="auto",
                                    max_samples=80,
                                    replacement=False,
                                    random_state=0xdeadbeef)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)
    print("Result from bagging labeled SVM:")
    print("tn, fp, fn, tp =", confusion_matrix(Y_test, y_pred).ravel())
def clf_wrapper(classifier, X_train, y_train, X_test, y_test):
    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto', 
                                    replacement=False, 
                                    random_state=0)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    cfm = confusion_matrix(y_test, y_pred)
    
    # Predictive Value
    PPV = cfm[0,0]/(cfm[0,0]+cfm[0,1])
    NPV = cfm[1,1]/(cfm[1,0]+cfm[1,1])
    ACR = (cfm[0,0]+cfm[1,1])/(cfm[0,0]+cfm[1,1]+cfm[1,0]+cfm[0,1])
    return (PPV+NPV+ACR)/3
Example #14
def test_balanced_bagging_classifier_with_function_sampler(replace):
    # check that we can provide a FunctionSampler in BalancedBaggingClassifier
    X, y = make_classification(
        n_samples=1_000,
        n_features=10,
        n_classes=2,
        weights=[0.3, 0.7],
        random_state=0,
    )

    def roughly_balanced_bagging(X, y, replace=False):
        """Implementation of Roughly Balanced Bagging for binary problem."""
        # find the minority and majority classes
        class_counts = Counter(y)
        majority_class = max(class_counts, key=class_counts.get)
        minority_class = min(class_counts, key=class_counts.get)

        # compute the number of sample to draw from the majority class using
        # a negative binomial distribution
        n_minority_class = class_counts[minority_class]
        n_majority_resampled = np.random.negative_binomial(n=n_minority_class,
                                                           p=0.5)

        # draw randomly with or without replacement
        majority_indices = np.random.choice(
            np.flatnonzero(y == majority_class),
            size=n_majority_resampled,
            replace=replace,
        )
        minority_indices = np.random.choice(
            np.flatnonzero(y == minority_class),
            size=n_minority_class,
            replace=replace,
        )
        indices = np.hstack([majority_indices, minority_indices])

        return X[indices], y[indices]

    # Roughly Balanced Bagging
    rbb = BalancedBaggingClassifier(
        base_estimator=CountDecisionTreeClassifier(),
        n_estimators=2,
        sampler=FunctionSampler(func=roughly_balanced_bagging,
                                kw_args={"replace": replace}),
    )
    rbb.fit(X, y)

    for estimator in rbb.estimators_:
        class_counts = estimator[-1].class_counts_
        assert (class_counts[0] / class_counts[1]) > 0.8
Example #15
  def ranking_by_matthew_punishment_rf(self):

    std = np.zeros(len(self.X.columns),)
    rankings = np.zeros(len(self.X.columns),)

    for _ in range(self.loops):
      # Split the train/validation set with a seed generated randomly on each loop.
      seed = randint(0, 10000)
      X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
      # Initialize a balanced bagging classifier (used here like a balanced random forest).
      rf = BalancedBaggingClassifier(n_estimators=50, random_state=0)
      # Fit it and compute the baseline Matthews correlation coefficient.
      rf.fit(X_train, y_train)
      r2original = matthews_corrcoef(y_fr, rf.predict(X_fr))
      # Initialize two lists to collect values from the inner loop.
      r2fr = []
      columnsrf = []

      for col in self.X.columns:
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
        # Drop a different column on each iteration.
        X_train = X_train.drop([col], axis=1)
        X_fr = X_fr.drop([col], axis=1)
        # Fit the classifier again, this time with the training data lacking one feature.
        rf.fit(X_train, y_train)
        r2 = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Record the column that was dropped...
        columnsrf.append(col)
        # ...and the drop (or gain) in the score observed when that feature was missing.
        r2fr.append(r2original - r2)

      outcome = np.array(r2fr)
      rankings = np.add(outcome, rankings)
      std = np.vstack((outcome, std))

    rankings = np.true_divide(rankings, self.loops)
    std = np.delete(std, -1, axis=0)
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    std = pd.DataFrame(data=np.squeeze(std, axis=0), columns=['Categories', 'SD_of_matt_punishment'])
    featuresranks = np.dstack((columnsrf, rankings))
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0), columns=['Categories', 'average-mtt-punishment'])
    borda['ranking'] = borda['average-mtt-punishment'].rank(ascending=False)
    borda = borda.merge(std, on='Categories')
    borda.sort_values(by='average-mtt-punishment', inplace=True, ascending=False)

    return borda
Example #16
def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={
            0: 20,
            1: 25,
            2: 50
        },
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(),
            random_state=0).fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)),
        )

        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)),
        )

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(solver="lbfgs",
                                              multi_class="auto"),
            random_state=0,
            max_samples=5,
        )
        ensemble.fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)),
        )

        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)),
        )
Example #17
def test_balanced_bagging_classifier_error(params):
    # Test that it gives proper exception on deficient input.
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy={
                              0: 20,
                              1: 25,
                              2: 50
                          })
    base = DecisionTreeClassifier()
    clf = BalancedBaggingClassifier(base_estimator=base, **params)
    with pytest.raises(ValueError):
        clf.fit(X, y)

    # Test support of decision_function
    assert not (hasattr(
        BalancedBaggingClassifier(base).fit(X, y), "decision_function"))
def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                    random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    # modify X to nonsense values, this should not change anything
    X_train += 1.

    assert_warns_message(UserWarning,
                         "Warm-start fitting without increasing n_estimators"
                         " does not", clf.fit, X_train, y_train)
    assert_array_equal(y_pred, clf.predict(X_test))
Example #20
def impute_by_model(df, df_test, impList, classifier):

    # convert ' ?' to NaN so that those values will be converted to -1 when transformed to numerical
    df, df_test = unknown_to_NAN(df, df_test)

    # create a new df by dropping all rows having NAN values
    # only to build model for imputation
    dropna_df = df.dropna(how='any').reset_index(drop=True)

    # before converting df and df_test to numerical, replace the value below with its
    # column mode so that native_country gets the same numerical value for each country
    df['native_country'] = df['native_country'].replace(' Holand-Netherlands',
                                                        ' United-States')

    # convert to numerical
    num_dropna_df = df2num(dropna_df, headers)
    num_df_test = df2num(df_test, headers)
    num_df = df2num(df, headers)

    # learn the model on the dataset with rows containing missing values dropped
    Xtr_train = num_dropna_df[impList[0]].values
    ytr_train = num_dropna_df[impList[1]].values

    # column with missing values from the training data, used to impute the training set
    Xtr_test = num_df[impList[0]].values

    # column with missing values from the test data, used to impute the test set
    Xt_test = num_df_test[impList[0]].values

    clf = BalancedBaggingClassifier(base_estimator=classifier,
                                    ratio='auto',
                                    random_state=0)
    clf.fit(Xtr_train, ytr_train)

    # impute training data
    ytr_pred = clf.predict(Xtr_test)
    lst = df.loc[num_df[impList[1]] == -1, impList[1]].index.tolist()
    num_df.loc[lst, impList[1]] = ytr_pred[lst]

    # impute test data
    yt_pred = clf.predict(Xt_test)
    lstt = df_test.loc[num_df_test[impList[1]] == -1,
                       impList[1]].index.tolist()
    num_df_test.loc[lstt, impList[1]] = yt_pred[lstt]
    # return df, df_test
    return df, df_test
Example #21
    def fit(self, X, Y, sample_weight=None):
        import sklearn.tree
        if self.estimator is None:
            self.max_depth = int(self.max_depth)
            self.estimator = sklearn.tree.DecisionTreeClassifier(max_depth=self.max_depth)
        from imblearn.ensemble import BalancedBaggingClassifier
        estimator = BalancedBaggingClassifier(base_estimator=self.estimator,
                                              n_estimators=self.n_estimators,
                                              max_features=self.max_features,
                                              bootstrap=self.bootstrap,
                                              bootstrap_features=self.bootstrap_features,
                                              sampling_strategy=self.sampling_strategy,
                                              replacement=self.replacement,
                                              n_jobs=self.n_jobs,
                                              random_state=self.random_state)
        estimator.fit(X, Y)

        self.estimator = estimator
        return self
Example #22
def Model_Building():

    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Train.csv',
                    engine='python')
    Y_train = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_train = X.values

    X = pd.read_csv(r'C:\Users\Dell\Desktop\Tookitaki\Test.csv',
                    engine='python')
    Y_test = X['Bad_label'].values
    X.drop(['customer_no', 'Bad_label'], axis=1, inplace=True)
    X_test = X.values

    imp1.fit(X_train)
    X_train = imp1.transform(X_train).astype(float)
    # print(X_train)
    imp2.fit(X_test)
    X_test = imp2.transform(X_test).astype(float)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # transform (not fit) the test set with the statistics learned on the training set
    X_test = scaler.transform(X_test)

    print(X_train.shape)

    bbc = BalancedBaggingClassifier(
        base_estimator=RandomForestClassifier(n_estimators=100),
        ratio='auto', replacement=False, random_state=0,
        bootstrap_features=False)

    clf = SelectKBest(mutual_info_classif, k=49)
    X_train = clf.fit_transform(X_train, Y_train)
    X_test = clf.transform(X_test)
    bbc.fit(X_train, Y_train)
    y_pred = bbc.predict(X_test)

    print(confusion_matrix(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))

    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred, pos_label=1)
    auc_score = metrics.auc(fpr, tpr)
    print('auc score =', auc_score)
    print('gini score =', 2 * auc_score - 1)
Example #23
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)

    # remap the y outside of the BalancedBaggingclassifier
    # _, y = np.unique(y, return_inverse=True)
    bagging = BalancedBaggingClassifier(
        LogisticRegression(solver="lbfgs", multi_class="auto"),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.steps[-1][1].coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.steps[-1][1].coef_

    assert_allclose(orig_coefs, new_coefs)
def test_probability():
    # Predict probabilities.
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20,
                           1: 25,
                           2: 50},
        random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BalancedBaggingClassifier(
            base_estimator=DecisionTreeClassifier(), random_state=0).fit(
                X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)))

        # Degenerate case, where some classes are missing
        ensemble = BalancedBaggingClassifier(
            base_estimator=LogisticRegression(solver='lbfgs',
                                              multi_class='auto'),
            random_state=0, max_samples=5)
        ensemble.fit(X_train, y_train)

        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1),
            np.ones(len(X_test)))

        assert_array_almost_equal(
            ensemble.predict_proba(X_test),
            np.exp(ensemble.predict_log_proba(X_test)))
def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)

    # remap the y outside of the BalancedBaggingclassifier
    # _, y = np.unique(y, return_inverse=True)
    bagging = BalancedBaggingClassifier(LogisticRegression(solver='lbfgs',
                                                           multi_class='auto'),
                                        max_samples=0.5,
                                        max_features=0.5, random_state=1,
                                        bootstrap=False)
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == 'i'

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.steps[-1][1].coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.steps[-1][1].coef_

    assert_allclose(orig_coefs, new_coefs)
Example #26
def cross_validation_another(x):
    with open('../data/conv_pred/super_train_data_day_' + 'A' + '.pickle',
              'rb') as f:
        data = pickle.load(f)
    with open('../data/conv_pred/super_test_data_day_' + 'A' + '.pickle',
              'rb') as f:
        test = pickle.load(f)
    v = DictVectorizer()
    X_train = v.fit_transform(data['X'])
    y_train = np.array(data['y'])
    X_test = v.transform(test['X'])
    y_test = np.array(test['y'])
    zero = 0
    one = 0
    for i in y_train:
        if i == 0:
            zero += 1
        else:
            one += 1
    print(zero)
    print(one)

    model = BalancedBaggingClassifier(n_estimators=100,
                                      n_jobs=8,
                                      max_samples=0.6)
    #model = xgb.XGBClassifier(n_estimators=500, max_delta_step=1, scale_pos_weight=zero / one)
    model.fit(X_train, y_train)
    predict = model.predict_proba(X_test)
    precision, recall, f_value, all_pre = eval(y_test, predict)
    all_prec = all_pre
    fscore = precision
    ftscore = recall
    all_f_value = f_value
    print('\n')
    print('final precision : ', str(fscore))
    print('final recall : ', str(ftscore))
    print('final f-value : ', str(all_f_value))
    print('final all_precision : ', str(all_prec))
Example #27
def train_model(data):

    dataset = pd.get_dummies(
        data,
        columns=['Employment.Type', 'Driving_flag', 'Bureau_bin'],
        drop_first=True)
    #dataset = pd.get_dummies(data,columns=['Employment.Type','Driving_flag'],drop_first=True)
    X = dataset.drop('loan_default', axis=1)
    y = dataset['loan_default']

    #X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,train_size=.8, stratify=y)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0,
                                                        stratify=y)

    rfc = RandomForestClassifier(class_weight='balanced', n_estimators=100)
    rfc.fit(X_train, y_train)
    lr = LogisticRegression(class_weight='balanced')
    lr.fit(X_train, y_train)
    xgb = XGBClassifier(scale_pos_weight=3.4)
    xgb.fit(X_train, y_train)

    brfc = BalancedRandomForestClassifier(max_depth=4, random_state=0)
    brfc.fit(X_train, y_train)
    bbc = BalancedBaggingClassifier(n_estimators=100, random_state=42)
    bbc.fit(X_train, y_train)
    models = [rfc, lr, xgb, brfc, bbc]
    model_names = [
        'RandomForestClassifier', 'LogisticRegression', 'XGBClassifier',
        'BalancedRandomForestClassifier', 'BalancedBaggingClassifier'
    ]
    for m, n in zip(models, model_names):
        print('Classifier: ' + n)
        predict_evaluate_classifier(X_test, y_test, m)

    return rfc, lr, xgb, brfc, bbc
def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BalancedBaggingClassifier(n_estimators=n_estimators,
                                               random_state=random_state,
                                               warm_start=True)
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BalancedBaggingClassifier(n_estimators=10,
                                          random_state=random_state,
                                          warm_start=False)
    clf_no_ws.fit(X, y)

    assert (set([pipe.steps[-1][1].random_state for pipe in clf_ws]) ==
            set([pipe.steps[-1][1].random_state for pipe in clf_no_ws]))
def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BalancedBaggingClassifier(n_estimators=5, warm_start=True,
                                       random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BalancedBaggingClassifier(n_estimators=10, warm_start=False,
                                    random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)
Example #32
class Models(object):
    """
    Machine-learning-based text classification models.
    """
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        # Load the image models (resnet, resnext, wide resnet); move them to cuda if it is available
        self.res_model = torchvision.models.resnet152(pretrained=True).to(
            config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True).to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(
            pretrained=True).to(config.device)

        # Load the bert model; move it to cuda if it is available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path +
                                              '/model/bert').to(config.device)

        # Initialize the MLData class; debug_mode=True uses only part of the data, train_mode indicates whether to train
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)

        # If not training, load the trained model and use it for prediction
        if not train_mode:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}
        else:
            # If feature_engineer, train with lightgbm; otherwise compare classic machine-learning models
            if feature_engineer:
                self.model = lgb.LGBMClassifier(objective='multiclass',
                                                n_jobs=10,
                                                num_class=33,
                                                num_leaves=30,
                                                reg_alpha=10,
                                                reg_lambda=200,
                                                max_depth=3,
                                                learning_rate=0.05,
                                                n_estimators=2000,
                                                bagging_freq=1,
                                                bagging_fraction=0.9,
                                                feature_fraction=0.8,
                                                seed=1440)
            else:
                self.models = [
                    RandomForestClassifier(n_estimators=500,
                                           max_depth=5,
                                           random_state=0),
                    LogisticRegression(solver='liblinear', random_state=0),
                    MultinomialNB(),
                    SVC(),
                    lgb.LGBMClassifier(objective='multiclass',
                                       n_jobs=10,
                                       num_class=33,
                                       num_leaves=30,
                                       reg_alpha=10,
                                       reg_lambda=200,
                                       max_depth=3,
                                       learning_rate=0.05,
                                       n_estimators=2000,
                                       bagging_freq=1,
                                       bagging_fraction=0.8,
                                       feature_fraction=0.8),
                ]

    def feature_engineer(self):

        print(" generate embedding feature ")

        # Get tf-idf and word2vec features; word2vec is not aggregated in any way
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.tfidf,
                                                   self.ml_data.w2v)

        # train is an object created with pandas; after get_embedding_feature it has the columns:
        # w2v: each word of a sentence replaced by the w2v model's vector. Each row of this column: [seq, 300]
        # w2v_label_mean: features relating the sentence embedding ([seq, 300]) to the labels. Each row: [300]
        # w2v_label_max: features relating the sentence embedding ([seq, 300]) to the labels. Each row: [300]
        # w2v_mean: [seq, 300] -> [300]
        # w2v_max: [seq, 300] -> [300]
        # w2v_win_2_mean: features extracted with a sliding-window approach. Each row: [300]
        # w2v_win_3_mean
        # w2v_win_4_mean
        # w2v_win_2_max
        # w2v_win_3_max
        # w2v_win_4_max

        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.tfidf,
                                                 self.ml_data.w2v)

        print("generate basic feature ")
        # Get the basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        print("generate lda feature ")

        # Generate bag-of-words data
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        # One row of test['bow']: [(10, 1), (78, 1), (162, 3), (177, 1), (192, 1)...]

        # Get the LDA embedding on top of the bag-of-words representation
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc),
                test['bow']))
        # One row of test['lda']: [0.002929521957412362, 0.0024772200267761946, .... ]; there are 30 topics, and each row is the probability distribution over those 30 topics

        print("generate modal feature ")
        # List the book cover image files
        cover = os.listdir(config.book_cover_path)
        # Match each book cover by title
        train['cover'] = train['title'].progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test.title.progress_apply(
            lambda x: config.book_cover_path + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the cover embeddings from the cover images
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        # print("generate autoencoder feature ")
        # Get the autoencoder embedding from the encoder, not the decoder
        # TODO
        # train_ae = get_autoencoder_feature(
        #     train,
        #     self.ml_data.ae.max_features,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)
        # test_ae = get_autoencoder_feature(
        #     test,
        #     self.ml_data.ae.max_fe atures,
        #     self.ml_data.ae.max_len,
        #     self.ml_data.ae.encoder,
        #     tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")

        # Concatenate all of the features together
        train = formate_data(
            train,
            train_tfidf)  # train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(
            test, test_tfidf)  # test = formate_data(test, test_tfidf, test_ae)

        # Generate the training and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]

        X_train = train[cols]
        X_test = test[cols]

        print(X_test)

        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)

        y_train = train["labelIndex"]
        y_test = test["labelIndex"]

        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        # Use grid search or Bayesian optimization to find the best parameters
        if search_method == 'grid':
            print("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            print("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            print("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):

        print("get all feature")

        # Generate all features

        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer()
        model_name = None

        # How to handle the imbalanced data: over-sampling, under-sampling, or an ensemble

        if imbalance_method == 'over_sampling':
            print("Use SMOTE deal with unbalance data ")
            # https://www.zhihu.com/question/269698662
            # https://www.cnblogs.com/kamekin/p/9824294.html
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            print("Use ClusterCentroids deal with unbalance data")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        print('search best param')

        # Use set_params to set the best parameters found by the search on the model

        if imbalance_method != 'ensemble':
            param = self.param_search(search_method=search_method)
            param['params']['num_leaves'] = int(param['params']['num_leaves'])
            param['params']['max_depth'] = int(param['params']['max_depth'])
            self.model = self.model.set_params(**param['params'])
        print('fit model ')

        # Train the model and report its results

        self.model.fit(self.X_train, self.y_train)
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)

        # Print training precision
        print('Train accuracy %s' % per)
        # Print test accuracy
        print('test accuracy %s' % acc)
        # Print recall
        print('test recall %s' % recall)
        # Print F1-score
        print('test F1_score %s' % f1)
        self.save(model_name)

    def model_select(self,
                     X_train,
                     X_test,
                     y_train,
                     y_test,
                     feature_method='tf-idf'):
        # Compare tf-idf, word2vec, fasttext embeddings and the performance of common machine-learning models
        for model in self.models:
            model_name = model.__class__.__name__
            print(model_name)
            clf = model.fit(X_train, y_train)
            Test_predict_label = clf.predict(X_test)
            Train_predict_label = clf.predict(X_train)
            per, acc, recall, f1 = get_score(y_train, y_test,
                                             Train_predict_label,
                                             Test_predict_label)
            # Print training accuracy
            print(model_name + '_' + 'Train accuracy %s' % per)

            # Print test accuracy
            print(model_name + '_' + ' test accuracy %s' % acc)

            # Print recall
            print(model_name + '_' + 'test recall %s' % recall)

            # Print F1-score
            print(model_name + '_' + 'test F1_score %s' % f1)

    def process(self, title, desc):

        # Process the data and generate the features required for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x: [word for word in x if word not in get_stop_word_list()])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.tfidf,
                                             self.ml_data.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''

        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.lda, doc), df.bow))

        print("generate autoencoder feature ")
        # df_ae = get_autoencoder_feature(df,
        #                                 self.ml_data.ae.max_features,
        #                                 self.ml_data.ae.max_len,
        #                                 self.ml_data.ae.encoder,
        #                                 tokenizer=self.ml_data.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf)  #, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title, input
        desc: input
        @return: label
        '''
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description:save model
        @param {type}
        model_name, file name for saving
        @return: None
        '''
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return:None
        '''
        self.model = joblib.load(path)
Example #33
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Oversampling')

# Strategy: combine resampling with SMOTE-Tomek
# We now try a widely used technique that applies an undersampling algorithm
# and an oversampling algorithm to the dataset at the same time. Here we use
# SMOTE for oversampling: it looks for close neighbouring points and adds new
# points "in a straight line" between them. And we use Tomek links for
# undersampling, which removes points of different classes that are nearest
# neighbours and makes the decision boundary (the border zone between our
# classes) easier to see.
os_us = SMOTETomek(sampling_strategy=0.5)
X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)
print(f'Distribution before resampling {Counter(y_train)}')
print(f'Distribution after resampling {Counter(y_train_res)}')
model = run_model(X_train_res, X_test, y_train_res, y_test)
y_pred = model.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Smote-Tomek')

# Strategy: ensemble of models with balancing
# For this strategy we use an ensemble classifier based on bagging,
# with a DecisionTree as the model. Let's see how it behaves:
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                replacement=False,
                                random_state=0)
# Train the classifier
bbc.fit(X_train, y_train)
y_pred = bbc.predict(X_test)
mostrar_resultados(y_test, y_pred, 'Ensamble BBC')
Example #34
bc = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                       random_state=0)
bc.fit(X_train, y_train) 
y_pred = bc.predict(X_test)
print(confusion_matrix(y_test, y_pred))

'''
BalancedBaggingClassifier allows each subset to be resampled before training each base estimator.
In short, the method combines the EasyEnsemble sampler with a classifier such as BaggingClassifier.
'''
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                ratio='auto',
                                replacement=False,
                                random_state=0)
bbc.fit(X, y) 

y_pred = bbc.predict(X_test)
print(confusion_matrix(y_test, y_pred))

'''
The imblearn.datasets package nicely complements sklearn.datasets.
It mainly provides two features: (i) a collection of imbalanced datasets for testing; (ii) a tool to turn an originally balanced dataset into an imbalanced one.
'''
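
# Hedged sketch of the two imblearn.datasets utilities described above; the
# dataset key ('ozone_level') and the class counts passed to make_imbalance are
# illustrative assumptions only.
from collections import Counter

from sklearn.datasets import load_iris
from imblearn.datasets import fetch_datasets, make_imbalance

# (i) fetch one of the bundled real-world imbalanced datasets
ozone = fetch_datasets()['ozone_level']
print(ozone.data.shape, Counter(ozone.target))

# (ii) turn an originally balanced dataset (iris) into an imbalanced one
iris_bunch = load_iris()
X_imb, y_imb = make_imbalance(iris_bunch.data, iris_bunch.target,
                              sampling_strategy={0: 20, 1: 30, 2: 50},
                              random_state=0)
print(Counter(y_imb))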






                                                    random_state=1)
y_train = y_train.squeeze()
y_test = y_test.squeeze()

# ### Fit the model
# Fit the best model based on tuned parameters
GBM_clf = ensemble.GradientBoostingClassifier(learning_rate=0.05,
                                              max_depth=3,
                                              n_estimators=100)
best_clf = BalancedBaggingClassifier(base_estimator=GBM_clf,
                                     ratio='auto',
                                     replacement=False,
                                     random_state=0)

# Fit the model and check ConfusionMatrix
best_clf.fit(X_train, y_train)

# Check R-Style confusionMatrix

## change type: object to list; the Confusion Matrix cannot be created otherwise
y_pred = best_clf.predict(X_test).tolist()
confusionMatrix(y_pred, y_test).show()  ## Show the Confusion Matrix
# Classification Report
print('Classification Report:\n',
      classification_report(y_test, y_pred, target_names=["AS", "PsA", "RA"]))

### prepare input for ROC
n_classes = len(
    y_train.unique()
)  # number of indications, if 2 then n_class=1, if >2 then the number of indications
y_score = best_clf.fit(X_train, y_train).decision_function(X_test)
Example #36
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        stratify=Y)

    cl = BalancedBaggingClassifier(
        base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
        n_estimators=50,
        max_samples=0.6,
        max_features=0.7,
        n_jobs=-1,
        bootstrap_features=True,
        oob_score=False)

    cl.fit(X_train, Y_train)

    predictions = cl.predict(X_train)
    # print(X_train.shape,Y_train.shape,predictions.shape)
    # print(list(zip(Y_train,predictions)))
    print('\n\nModel Train: f1 = {0} '.format(
        f1_score(Y_train, predictions, average='micro')))

    predictions = cl.predict(X_test)
    print('\nModel Test: f1 = {0} '.format(
        f1_score(Y_test, predictions, average='micro')))

    # exit()

    cl = BalancedBaggingClassifier(
        base_estimator=QuadraticDiscriminantAnalysis(reg_param=0.11),
classifier_3 = RandomForestClassifier(n_estimators=5, criterion='entropy')
classifier_3.fit(X_train, Y_train)

# Fitting classifier to the training data: Model 4
from sklearn.linear_model import LogisticRegression
classifier_4 = LogisticRegression(penalty='l1', random_state=0)
classifier_4.fit(X_train, Y_train)

# Fitting Balanced Bagging Classifier to the training data: Model 5
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier
classifier_5 = BalancedBaggingClassifier(
    base_estimator=RandomForestClassifier(criterion='entropy'),
    n_estimators=5,
    bootstrap=True)
classifier_5.fit(X_train, Y_train)

# Fitting Decision Tree to the training data: Model 6
from sklearn.tree import DecisionTreeClassifier
classifier_6 = DecisionTreeClassifier()
classifier_6.fit(X_train, Y_train)

# In[ ]:

# Predicting the results
y_pred_1 = classifier_1.predict(X_test)
y_pred_2 = classifier_2.predict(X_test)
y_pred_3 = classifier_3.predict(X_test)
y_pred_4 = classifier_4.predict(X_test)
y_pred_5 = classifier_5.predict(X_test)
y_pred_6 = classifier_6.predict(X_test)
Example #38
    # use original features
    X_train_o = X_train[:, 0:original_len]
    X_test_o = X_test[:, 0:original_len]

    X_train_n = X_train[:, original_len:]
    X_test_n = X_test[:, original_len:]

    for clf, clf_name in zip(clf_list, clf_name_list):
        print('processing', clf_name, 'round', i + 1)
        if clf_name != 'xgb':
            clf = BalancedBaggingClassifier(base_estimator=clf,
                                            ratio='auto',
                                            replacement=False)

        # fully supervised
        clf.fit(X_train_o, y_train.ravel())
        y_pred = clf.predict_proba(X_test_o)

        roc_score = roc_auc_score(y_test, y_pred[:, 1])
        prec_n = get_precn(y_test, y_pred[:, 1])

        result_dict[clf_name + 'ROC' + 'o'].append(roc_score)
        result_dict[clf_name + 'PRC@n' + 'o'].append(prec_n)

        # unsupervised
        clf.fit(X_train_n, y_train.ravel())
        y_pred = clf.predict_proba(X_test_n)

        roc_score = roc_auc_score(y_test, y_pred[:, 1])
        prec_n = get_precn(y_test, y_pred[:, 1])
Example #39
###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision trees
# can actually alleviate the issue induced by the class imbalance. First, we
# will use a bagging classifier and its counterpart, which internally uses
# random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50,
                                             random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced
# accuracy and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_bc),
    geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging,
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


ozone = fetch_datasets()['ozone_level']
X, y = ozone.data, ozone.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

bagging = BaggingClassifier(random_state=0)
balanced_bagging = BalancedBaggingClassifier(random_state=0)

print('Class distribution of the training set: {}'.format(Counter(y_train)))

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

print('Class distribution of the test set: {}'.format(Counter(y_test)))

print('Classification results using a bagging classifier on imbalanced data')
y_pred_bagging = bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_bagging))
cm_bagging = confusion_matrix(y_test, y_pred_bagging)
plt.figure()
plot_confusion_matrix(cm_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BaggingClassifier')

print('Classification results using a bagging classifier on balanced data')
y_pred_balanced_bagging = balanced_bagging.predict(X_test)
print(classification_report_imbalanced(y_test, y_pred_balanced_bagging))
cm_balanced_bagging = confusion_matrix(y_test, y_pred_balanced_bagging)
plt.figure()
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(ozone.target),
                      title='Confusion matrix using BalancedBaggingClassifier')

###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision trees
# can actually alleviate the issue induced by the class imbalance. First, we
# will use a bagging classifier and its counterpart which internally uses
# random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)

y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

###############################################################################
# Balancing each bootstrap sample significantly increases the balanced accuracy
# and the geometric mean.

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
cm_bagging = confusion_matrix(y_test, y_pred_bc)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_bagging, classes=np.unique(satimage.target), ax=ax[0],
                      title='Bagging')
Exemple #42
class Models(object):
    def __init__(self,
                 model_path=None,
                 feature_engineer=False,
                 train_mode=True):
        '''
        @description: initialize the Models class
        @param {type} :
        feature_engineer: whether to use feature engineering; if `False`, compare common ML models
        res_model: ResNet model
        resnext_model: ResNeXt model
        wide_model: Wide ResNet model
        bert: BERT model
        ml_data: MLData instance
        @return: None
        '''
        # Load the image models (ResNet, ResNeXt, Wide ResNet); move them to CUDA if available
        ###########################################
        #          TODO: module 2 task 2.1        #
        ###########################################
        self.res_model = torchvision.models.resnet152(
            pretrained=True)  # res model for modal feature [1* 1000]
        self.res_model = self.res_model.to(config.device)
        self.resnext_model = torchvision.models.resnext101_32x8d(
            pretrained=True)
        self.resnext_model = self.resnext_model.to(config.device)
        self.wide_model = torchvision.models.wide_resnet101_2(pretrained=True)
        self.wide_model = self.wide_model.to(config.device)
        # Load the BERT model; move it to CUDA if available
        self.bert_tonkenizer = BertTokenizer.from_pretrained(config.root_path +
                                                             '/model/bert')
        self.bert = BertModel.from_pretrained(config.root_path + '/model/bert')
        self.bert = self.bert.to(config.device)

        # Initialize the MLData class; debug_mode=True uses only part of the data, train_mode indicates whether we are training
        self.ml_data = MLData(debug_mode=True, train_mode=train_mode)
        # If not training, load the trained model and use it for prediction
        if train_mode:
            self.model = lgb.LGBMClassifier(objective='multiclass',
                                            n_jobs=10,
                                            num_class=33,
                                            num_leaves=30,
                                            reg_alpha=10,
                                            reg_lambda=200,
                                            max_depth=3,
                                            learning_rate=0.05,
                                            n_estimators=2000,
                                            bagging_freq=1,
                                            bagging_fraction=0.9,
                                            feature_fraction=0.8,
                                            seed=1440)

        else:
            self.load(model_path)
            labelNameToIndex = json.load(
                open(config.root_path + '/data/label2id.json',
                     encoding='utf-8'))
            self.ix2label = {v: k for k, v in labelNameToIndex.items()}

    def feature_engineer(self):
        '''
        @description: build all kinds of features
        @param {type} None
        @return:
        X_train, features of the train set
        X_test, features of the test set
        y_train, labels of the train set
        y_test, labels of the test set
        '''

        logger.info("generate embedding feature ")
        # Get tfidf and word2vec features; the word2vec embeddings are not aggregated
        ###########################################
        #          TODO: module 3 task 1.1        #
        ###########################################
        train_tfidf, train = get_embedding_feature(self.ml_data.train,
                                                   self.ml_data.em.tfidf,
                                                   self.ml_data.em.w2v)
        test_tfidf, test = get_embedding_feature(self.ml_data.dev,
                                                 self.ml_data.em.tfidf,
                                                 self.ml_data.em.w2v)

        logger.info("generate autoencoder feature ")
        # Get the autoencoder embeddings, taken from the encoder rather than the decoder
        train_ae = get_autoencoder_feature(
            train,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)
        test_ae = get_autoencoder_feature(
            test,
            self.ml_data.em.ae.max_features,
            self.ml_data.em.ae.max_len,
            self.ml_data.em.ae.encoder,
            tokenizer=self.ml_data.em.ae.tokenizer)

        logger.info("generate basic feature ")
        # Get basic NLP features
        train = get_basic_feature(train)
        test = get_basic_feature(test)

        logger.info("generate modal feature ")
        # Load the book cover image files
        cover = os.listdir(config.root_path + '/data/book_cover/')
        # Match each title to its book cover
        train['cover'] = train['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')
        test['cover'] = test['title'].progress_apply(
            lambda x: config.root_path + '/data/book_cover/' + x + '.jpg'
            if x + '.jpg' in cover else '')

        # Get the embeddings of the book covers
        ###########################################
        #          TODO: module 3 task 1.2        #
        ###########################################
        train['res_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))
        test['res_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        train['resnext_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))
        test['resnext_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        train['wide_embedding'] = train['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))
        test['wide_embedding'] = test['cover'].progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        logger.info("generate bert feature ")
        ###########################################
        #          TODO: module 3 task 1.3        #
        ###########################################
        train['bert_embedding'] = train['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))
        test['bert_embedding'] = test['text'].progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        logger.info("generate lda feature ")
        ###########################################
        #          TODO: module 3 task 1.4        #
        ###########################################
        # Build bag-of-words representations
        train['bow'] = train['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        test['bow'] = test['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        # Derive the LDA embeddings from the bag-of-words representations
        train['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                train['bow']))
        test['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                test['bow']))

        logger.info("formate data")
        #  将所有的特征拼接到一起
        train = formate_data(train, train_tfidf, train_ae)
        test = formate_data(test, test_tfidf, test_ae)
        # Build the train and test data
        cols = [x for x in train.columns if str(x) not in ['labelIndex']]
        X_train = train[cols]
        X_test = test[cols]
        train["labelIndex"] = train["labelIndex"].astype(int)
        test["labelIndex"] = test["labelIndex"].astype(int)
        y_train = train["labelIndex"]
        y_test = test["labelIndex"]
        return X_train, X_test, y_train, y_test

    def param_search(self, search_method='grid'):
        '''
        @description: use parameter search to find the best parameters
        @param {type}
        search_method: two options, 'grid' or 'bayesian' (Bayesian optimization)
        @return: best parameters (bayesian) or None
        '''
        # Use grid search or Bayesian optimization to find the best parameters
        if search_method == 'grid':
            logger.info("use grid search")
            self.model = Grid_Train_model(self.model, self.X_train,
                                          self.X_test, self.y_train,
                                          self.y_test)
        elif search_method == 'bayesian':
            logger.info("use bayesian optimization")
            trn_data = lgb.Dataset(data=self.X_train,
                                   label=self.y_train,
                                   free_raw_data=False)
            param = bayes_parameter_opt_lgb(trn_data)
            logger.info("best param", param)
            return param

    def unbalance_helper(self,
                         imbalance_method='under_sampling',
                         search_method='grid'):
        '''
        @description: handle imbalanced data, then search for the best parameters
        @param {type}
        imbalance_method: three options, 'under_sampling' for ClusterCentroids, 'over_sampling' for SMOTE, 'ensemble' for BalancedBaggingClassifier
        search_method: two options, 'grid' or 'bayesian' (Bayesian optimization)
        @return: None
        '''
        logger.info("get all freature")
        # 生成所有feature
        self.X_train, self.X_test, self.y_train, self.y_test = self.feature_engineer(
        )
        model_name = None
        # How to handle imbalanced data: over-sampling, under-sampling, or ensemble
        ###########################################
        #          TODO: module 4 task 1.1        #
        ###########################################
        if imbalance_method == 'over_sampling':
            logger.info("Use SMOTE deal with unbalance data ")
            self.X_train, self.y_train = SMOTE().fit_resample(
                self.X_train, self.y_train)
            self.X_test, self.y_test = SMOTE().fit_resample(
                self.X_test, self.y_test)
            model_name = 'lgb_over_sampling'
        elif imbalance_method == 'under_sampling':
            logger.info("Use ClusterCentroids deal with unbalance data ")
            self.X_train, self.y_train = ClusterCentroids(
                random_state=0).fit_resample(self.X_train, self.y_train)
            self.X_test, self.y_test = ClusterCentroids(
                random_state=0).fit_resample(self.X_test, self.y_test)
            model_name = 'lgb_under_sampling'
        elif imbalance_method == 'ensemble':
            self.model = BalancedBaggingClassifier(
                base_estimator=DecisionTreeClassifier(),
                sampling_strategy='auto',
                replacement=False,
                random_state=0)
            model_name = 'ensemble'
        logger.info('search best param')
        # Use set_params to apply the best parameters found by the search to the model
        if imbalance_method != 'ensemble':
            ###########################################
            #          TODO: module 4 task 1.2        #
            ###########################################
            # param = self.param_search(search_method=search_method)
            # param['params']['num_leaves'] = int(param['params']['num_leaves'])
            # param['params']['max_depth'] = int(param['params']['max_depth'])
            param = {}
            param['params'] = {}
            param['params']['num_leaves'] = 3
            param['params']['max_depth'] = 5
            self.model = self.model.set_params(**param['params'])
        logger.info('fit model ')
        # Train the model and report its results
        self.model.fit(self.X_train, self.y_train)
        ###########################################
        #          TODO: module 4 task 1.3        #
        ###########################################
        Test_predict_label = self.model.predict(self.X_test)
        Train_predict_label = self.model.predict(self.X_train)
        per, acc, recall, f1 = get_score(self.y_train, self.y_test,
                                         Train_predict_label,
                                         Test_predict_label)
        # Report the training-set accuracy
        logger.info('Train accuracy %s' % per)
        # Report the test-set accuracy
        logger.info('test accuracy %s' % acc)
        # Report the test-set recall
        logger.info('test recall %s' % recall)
        # Report the test-set F1-score
        logger.info('test F1_score %s' % f1)
        self.save(model_name)

    def process(self, title, desc):
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        # Process the input data and build the features needed for prediction
        df = pd.DataFrame([[title, desc]], columns=['title', 'desc'])
        df['text'] = df['title'] + df['desc']
        df["queryCut"] = df["text"].apply(query_cut)
        df["queryCutRMStopWord"] = df["queryCut"].apply(
            lambda x:
            [word for word in x if word not in self.ml_data.em.stopWords])

        df_tfidf, df = get_embedding_feature(df, self.ml_data.em.tfidf,
                                             self.ml_data.em.w2v)

        print("generate basic feature ")
        df = get_basic_feature(df)

        print("generate modal feature ")
        df['cover'] = ''
        df['res_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.res_model))

        df['resnext_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.resnext_model))

        df['wide_embedding'] = df.cover.progress_apply(
            lambda x: get_img_embedding(x, self.wide_model))

        print("generate bert feature ")
        df['bert_embedding'] = df.text.progress_apply(
            lambda x: get_pretrain_embedding(x, self.bert_tonkenizer, self.bert
                                             ))

        print("generate lda feature ")
        df['bow'] = df['queryCutRMStopWord'].apply(
            lambda x: self.ml_data.em.lda.id2word.doc2bow(x))
        df['lda'] = list(
            map(lambda doc: get_lda_features(self.ml_data.em.lda, doc),
                df.bow))

        print("generate autoencoder feature ")
        df_ae = get_autoencoder_feature(df,
                                        self.ml_data.em.ae.max_features,
                                        self.ml_data.em.ae.max_len,
                                        self.ml_data.em.ae.encoder,
                                        tokenizer=self.ml_data.em.ae.tokenizer)

        print("formate data")
        df['labelIndex'] = 1
        df = formate_data(df, df_tfidf, df_ae)
        cols = [x for x in df.columns if str(x) not in ['labelIndex']]
        X_train = df[cols]
        return X_train

    def predict(self, title, desc):
        '''
        @description: predict the book category from the input title and desc
        @param {type}
        title: input title
        desc: input description
        @return: label and probability
        '''
        ###########################################
        #          TODO: module 5 task 1.1        #
        ###########################################
        inputs = self.process(title, desc)
        label = self.ix2label[self.model.predict(inputs)[0]]
        proba = np.max(self.model.predict_proba(inputs))
        return label, proba

    def save(self, model_name):
        '''
        @description: save the model
        @param {type}
        model_name, file name for saving
        @return: None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        joblib.dump(self.model, root_path + '/model/ml_model/' + model_name)

    def load(self, path):
        '''
        @description: load model
        @param {type}
        path: model path
        @return:None
        '''
        ###########################################
        #          TODO: module 4 task 1.4        #
        ###########################################
        self.model = joblib.load(path)
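# A minimal usage sketch, not part of the original file; it assumes the config
# module, label2id.json, the pretrained weights and the book-cover data referenced
# above are all available locally.
if __name__ == '__main__':
    m = Models(train_mode=True)
    # Train with the ensemble (BalancedBaggingClassifier) strategy and save the model.
    m.unbalance_helper(imbalance_method='ensemble', search_method='grid')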