def test_base_estimator():
    """Check ``base_estimator_`` for the default and for an explicit AdaBoost.

    In both cases the last step of the fitted ``base_estimator_`` (read via
    ``steps[-1][1]``) must be an ``AdaBoostClassifier``.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # ``None`` in the second positional slot exercises the default value;
    # the explicit instance exercises a user-supplied one.
    for base in (None, AdaBoostClassifier()):
        ensemble = EasyEnsembleClassifier(2, base, n_jobs=-1, random_state=0)
        ensemble.fit(X_train, y_train)
        final_step = ensemble.base_estimator_.steps[-1][1]
        assert isinstance(final_step, AdaBoostClassifier)
Beispiel #2
0
def objectiveEasy(params):
    """Cross-validated objective for tuning ``EasyEnsembleClassifier``.

    Only ``params['sampling_strategy']`` is forwarded to the classifier. The
    model is evaluated with 5-fold stratified cross-validation on the
    module-level ``X_train`` / ``y_train`` (both indexed with ``.iloc``) and
    scored with ROC AUC computed from predicted probabilities. Progress is
    printed to stdout.

    Parameters:
        params (dict): must contain the key ``'sampling_strategy'``.

    Returns:
        float: the negated mean ROC AUC over the folds (negated presumably so
        that a minimising optimiser maximises the AUC).
    """
    start = time.time()
    params = {
        'sampling_strategy': params['sampling_strategy'],
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 5
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    # The scorer does not depend on the fold, so build it once.
    scorer = make_scorer(roc_auc_score, needs_proba=True)
    score_mean = 0
    folds = skf.split(X_train, y_train.values.ravel())
    for count, (tr_idx, val_idx) in enumerate(folds, start=1):
        clf = EasyEnsembleClassifier(**params,
                                     random_state=0,
                                     n_estimators=300,
                                     n_jobs=-1,
                                     verbose=0)

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]

        clf.fit(X_tr, y_tr.values.ravel())
        score = scorer(clf, X_vl, y_vl)
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
    elapsed = time.time() - start
    print(f"Total Time Run: {round(elapsed / 60,2)}")
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    # Release the per-fold objects *before* collecting, otherwise the
    # collector runs while they are still referenced.
    del X_tr, X_vl, y_tr, y_vl, clf, score
    gc.collect()
    return -(score_mean / FOLDS)
    def fit(self, X, Y, sample_weight=None):
        """Fit an ``EasyEnsembleClassifier`` on ``X`` / ``Y`` and store it.

        When ``self.estimator`` is ``None`` an ``AdaBoostClassifier`` over a
        depth-limited decision tree is built from the ``ab_*`` attributes and
        used as base estimator; otherwise the current ``self.estimator`` is
        used as base estimator. After fitting, ``self.estimator`` is replaced
        by the fitted ensemble.

        Parameters:
            X: training features, forwarded to the ensemble's ``fit``.
            Y: training targets, forwarded to the ensemble's ``fit``.
            sample_weight: accepted for interface compatibility; not used.

        Returns:
            self
        """
        # ``import sklearn.tree`` alone does not guarantee that the
        # ``sklearn.ensemble`` submodule is loaded, so import it explicitly.
        import sklearn.ensemble
        import sklearn.tree
        from imblearn.ensemble import EasyEnsembleClassifier

        if self.estimator is None:
            self.ab_max_depth = int(self.ab_max_depth)
            base_estimator = sklearn.tree.DecisionTreeClassifier(
                max_depth=self.ab_max_depth)
            self.estimator = sklearn.ensemble.AdaBoostClassifier(
                base_estimator=base_estimator,
                n_estimators=self.ab_n_estimators,
                learning_rate=self.ab_learning_rate,
                algorithm=self.ab_algorithm,
                random_state=self.random_state)

        # NOTE(review): on a second call ``self.estimator`` is the previously
        # fitted ensemble, so it would be nested as the base estimator.
        estimator = EasyEnsembleClassifier(
            base_estimator=self.estimator,
            n_estimators=self.n_estimators,
            sampling_strategy=self.sampling_strategy,
            replacement=self.replacement,
            n_jobs=self.n_jobs,
            random_state=self.random_state)

        estimator.fit(X, Y)

        self.estimator = estimator
        return self
Beispiel #4
0
def test_warm_start(random_state=42):
    """Warm-start fitting grows the ensemble and matches a one-shot fit.

    The ensemble is grown from 5 to 10 members; the set of ``random_state``
    values of the final pipeline steps must equal that of a 10-member
    ensemble fitted without warm start.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    def final_step_seeds(ensemble):
        # Each member is a pipeline; its last step carries the seed.
        return {pipe.steps[-1][1].random_state for pipe in ensemble}

    clf_ws = None
    for size in (5, 10):
        if clf_ws is not None:
            clf_ws.set_params(n_estimators=size)
        else:
            clf_ws = EasyEnsembleClassifier(
                n_estimators=size,
                random_state=random_state,
                warm_start=True,
            )
        clf_ws.fit(X, y)
        assert len(clf_ws) == size

    clf_no_ws = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False
    )
    clf_no_ws.fit(X, y)

    assert final_step_seeds(clf_ws) == final_step_seeds(clf_no_ws)
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Building and fitting with ``n_estimators`` must raise ``ValueError``.

    The error message has to match the ``msg_error`` pattern.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    # Construction stays inside the context so an error from either the
    # constructor or ``fit`` is accepted.
    with pytest.raises(ValueError, match=msg_error):
        EasyEnsembleClassifier(n_estimators=n_estimators).fit(X, y)
def test_easy_ensemble_classifier_error(n_estimators, msg_error):
    """Expect a ``ValueError`` matching ``msg_error`` for ``n_estimators``."""
    imbalanced = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X, y = imbalanced
    # Both construction and fitting happen inside the ``raises`` context.
    with pytest.raises(ValueError, match=msg_error):
        ensemble = EasyEnsembleClassifier(n_estimators=n_estimators)
        ensemble.fit(X, y)
def balancedClassifier(df):
    """Cross-validate an EasyEnsemble of AdaBoost learners and print a report.

    Predictors are the columns at positions 1, 5, 6, 9, 10, 12, 18 and 21 of
    ``df`` converted to numeric; the target is the ``'Class'`` column. The
    first row is dropped from both.

    Parameters:
        df (pandas.DataFrame): input table containing a ``'Class'`` column.

    Returns:
        tuple: ``(predictions, Y)`` - the out-of-fold predictions from the
        10-fold cross-validation and the target series they refer to.
    """
    seed = 7
    num_trees = 30
    # ``random_state`` only has a meaning when shuffling; recent scikit-learn
    # raises ValueError if it is set while ``shuffle`` is left at False.
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    base_estimator = AdaBoostClassifier(n_estimators=num_trees,
                                        random_state=seed)
    ee_classifier = EasyEnsembleClassifier(n_estimators=10,
                                           base_estimator=base_estimator)

    X = df.take([1, 5, 6, 9, 10, 12, 18, 21], axis=1)  # predictors
    X = X.apply(pd.to_numeric)
    X = X.iloc[1:]

    Y = df['Class'].iloc[1:]  # predicted_class
    y_true = Y.values.ravel()

    # Derive the class names from the rows that are actually evaluated so
    # that ``target_names`` always lines up with the labels in the report.
    classes = np.unique(y_true)
    print("We have {} unique classes: {}".format(len(classes), classes))

    # cross_val_predict fits a fresh clone on every fold, so fitting the
    # estimator on the full data beforehand would be wasted work.
    predictions = model_selection.cross_val_predict(ee_classifier,
                                                    X,
                                                    y_true,
                                                    cv=kfold)
    report = metrics.classification_report(y_true,
                                           predictions,
                                           target_names=classes)
    print("classification_report ", report)
    balanced_accuracy = metrics.balanced_accuracy_score(y_true, predictions)
    print(" Balanced accuracy = ", balanced_accuracy)
    return predictions, Y
def test_bagging_with_pipeline():
    """An ensemble whose base estimator is a pipeline can fit and predict."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    # Feature selection followed by AdaBoost serves as the base estimator.
    base = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    estimator = EasyEnsembleClassifier(n_estimators=2, base_estimator=base)
    estimator.fit(X, y).predict(X)
def test_bagging_with_pipeline():
    """Smoke test: fit and predict with a pipeline as base estimator."""
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    select_then_boost = make_pipeline(SelectKBest(k=1), AdaBoostClassifier())
    ensemble = EasyEnsembleClassifier(
        n_estimators=2, base_estimator=select_then_boost
    )
    # Only the absence of an exception is checked.
    ensemble.fit(X, y).predict(X)
Beispiel #10
0
def ensemble_model_initialise(base_estimator=None, param=None):
    """Create an ``EasyEnsembleClassifier`` from defaults plus overrides.

    The configuration starts from ``EasyEnsembleClassifier().get_params()``,
    is then set to 50 estimators, ``n_jobs=-1``, ``random_state=42`` and
    ``verbose=0``, and is finally updated with ``param``.

    Parameters:
        base_estimator: estimator placed under the ``'base_estimator'`` key.
            ``None`` (the default) creates a fresh ``AdaBoostClassifier()``
            for each call instead of sharing one instance built at
            definition time.
        param (dict | None): extra or overriding constructor arguments;
            ``None`` means no overrides.

    Returns:
        EasyEnsembleClassifier: an unfitted classifier.
    """
    if base_estimator is None:
        base_estimator = AdaBoostClassifier()

    config = EasyEnsembleClassifier().get_params()

    config['base_estimator'] = base_estimator
    config['n_estimators'] = 50
    config['n_jobs'] = -1
    config['random_state'] = 42
    config['verbose'] = 0

    # Caller-supplied values win over the defaults above.
    if param:
        config.update(param)

    return EasyEnsembleClassifier(**config)
Beispiel #11
0
def model():
    """Train one EasyEnsemble-of-XGBoost classifier per label and pickle it.

    For every name in the module-level ``class_names`` the function
    cross-validates the classifier (3 folds, ROC AUC) on ``train_features``,
    fits it on the full training data, scores the positive-class
    probabilities on ``test`` with ROC AUC, and appends the fitted model to
    ``Accident_Severity_Prediction_Model_Pkl.pkl``.

    Returns:
        tuple: ``(scores, acc_score)`` - the mean cross-validation ROC AUC
        and the test ROC AUC, one entry per class name.
    """
    scores = []
    acc_score = []
    # One constant weight per training row for each label.
    class_weights = {
        "Fatal": [0.3] * train["Fatal"].shape[0],
        "Severe": [0.5] * train["Severe"].shape[0],
        "Slight": [1] * train["Slight"].shape[0],
    }
    submission = pd.DataFrame.from_dict(
        {'Accident_Index': test['Accident_Index']})
    for class_name in class_names:
        train_target = train[class_name]
        classifier = EasyEnsembleClassifier(n_estimators=12,
                                            base_estimator=XGBClassifier(
                                                max_depth=4,
                                                learning_rate=0.2,
                                                n_estimators=600,
                                                silent=True,
                                                subsample=0.8,
                                                gamma=0.5,
                                                min_child_weight=10,
                                                objective='binary:logistic',
                                                colsample_bytree=0.6,
                                                max_delta_step=1,
                                                nthreads=1,
                                                n_jobs=1))

        cv_score = np.mean(
            cross_val_score(classifier,
                            train_features,
                            train_target,
                            cv=3,
                            scoring='roc_auc'))
        scores.append(cv_score)

        # NOTE(review): confirm that the installed imbalanced-learn version
        # accepts ``sample_weight`` in ``EasyEnsembleClassifier.fit``.
        classifier.fit(train_features,
                       train_target,
                       sample_weight=class_weights[class_name])
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]
        acc = roc_auc_score(test[class_name], submission[class_name])
        acc_score.append(acc)

        # Append mode: the file accumulates one pickle per class (and per
        # run). The context manager closes it even if dumping fails.
        with open('Accident_Severity_Prediction_Model_Pkl.pkl',
                  'ab') as model_pkl:
            pickle.dump(classifier, model_pkl)

    return (scores, acc_score)
def test_easy_ensemble_classifier_single_estimator():
    """A one-member ensemble predicts like under-sampling plus AdaBoost."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    ensemble = EasyEnsembleClassifier(n_estimators=1, random_state=0)
    ensemble = ensemble.fit(X_train, y_train)

    # Reference: the same two stages chained by hand with the same seeds.
    reference = make_pipeline(
        RandomUnderSampler(random_state=0),
        AdaBoostClassifier(random_state=0),
    )
    reference = reference.fit(X_train, y_train)

    assert_array_equal(ensemble.predict(X_test), reference.predict(X_test))
def test_easy_ensemble_classifier_single_estimator():
    """Compare a single-estimator ensemble with an explicit pipeline.

    Both models use seed 0 and must give identical test predictions.
    """
    X, y = make_imbalance(
        iris.data,
        iris.target,
        sampling_strategy={0: 20, 1: 25, 2: 50},
        random_state=0,
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    single = EasyEnsembleClassifier(n_estimators=1, random_state=0)
    fitted_single = single.fit(X_train, y_train)

    undersample_then_boost = make_pipeline(
        RandomUnderSampler(random_state=0), AdaBoostClassifier(random_state=0)
    )
    fitted_pipeline = undersample_then_boost.fit(X_train, y_train)

    predicted_single = fitted_single.predict(X_test)
    predicted_pipeline = fitted_pipeline.predict(X_test)
    assert_array_equal(predicted_single, predicted_pipeline)
Beispiel #14
0
def test_warm_start_smaller_n_estimators():
    """Refitting a warm-started ensemble with fewer estimators must raise."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    ensemble = EasyEnsembleClassifier(warm_start=True, n_estimators=5)
    ensemble.fit(X, y)

    # Shrink the requested size below what is already fitted.
    ensemble.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        ensemble.fit(X, y)
 def easy_ensemble_classifier(self, model):
     """Return an unfitted ``EasyEnsembleClassifier`` built around ``model``.

     The ensemble uses 45 estimators, the ``'majority'`` sampling strategy,
     seed 42 and ``n_jobs=-1``; ``model`` is passed as ``base_estimator``.
     """
     settings = {
         'n_estimators': 45,
         'base_estimator': model,
         'random_state': 42,
         'n_jobs': -1,
         'sampling_strategy': 'majority',
     }
     return EasyEnsembleClassifier(**settings)
def ada_boost(x_train,
              y_train,
              cv,
              n_estimators=100,
              sampling_strategy="not_majority",
              name="AdaBoost",
              only_model=False,
              **kwargs):
    """Build an ``EasyEnsembleClassifier`` and optionally cross-validate it.

    Parameters:
        x_train: Input data for training
        y_train: Target data for training
        cv (list of tuples): cross validation indices
        n_estimators (int): number of estimators in the ensemble
        sampling_strategy (str): e.g. "not majority", "all" or "auto"; see the
            classifier documentation for details. Underscores in a string
            value are treated as spaces, so the historical spelling
            "not_majority" keeps working.
        name (str): Name/Description for the model
        only_model (bool): if True returns only the (unfitted) model
        **kwargs: accepted but not used.
    Returns:
        The unfitted classifier when ``only_model`` is True, otherwise the
        result of ``calculate_metrics_cv`` for the model.
    """
    # imbalanced-learn spells its strategy names with spaces
    # ("not majority"); an underscore variant is rejected when fitting.
    if isinstance(sampling_strategy, str):
        sampling_strategy = sampling_strategy.replace("_", " ")

    eec = EasyEnsembleClassifier(n_estimators=n_estimators,
                                 sampling_strategy=sampling_strategy,
                                 random_state=42)
    if only_model:
        return eec

    return calculate_metrics_cv(model=eec,
                                X=x_train,
                                y_true=y_train,
                                cv=cv,
                                name=name)
def test_warm_start_smaller_n_estimators():
    """A warm-started second fit with a smaller ``n_estimators`` errors out."""
    features, labels = make_hastie_10_2(n_samples=20, random_state=1)

    warm = EasyEnsembleClassifier(n_estimators=5, warm_start=True)
    warm.fit(features, labels)
    warm.set_params(n_estimators=4)

    # Fewer estimators than already fitted is expected to be rejected.
    with pytest.raises(ValueError):
        warm.fit(features, labels)
    def easy_ensemble_classifier(df, drop, target):
        """Train an EasyEnsemble on ``df`` and return balanced accuracy in %.

        Parameters:
            df: table holding both features and outcome.
            drop: column names to exclude from the features.
            target: name of the outcome column.

        Returns:
            Balanced accuracy on the held-out split, multiplied by 100.
        """
        # Every column not listed in ``drop`` is a feature.
        feature_names = [col for col in df.columns if col not in drop]
        features = df[feature_names]
        outcome = df[target]

        splits = train_test_split(features, outcome, random_state=1)
        X_train, X_test, y_train, y_test = splits

        model = EasyEnsembleClassifier(n_estimators=100, random_state=0)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)

        return balanced_accuracy_score(y_test, predicted) * 100
def EasyEnsembleClassfier(data, test_data):
    """Fit a soft-voting ensemble per label and write probabilities to CSV.

    Text features come from ``get_features``. For every name in the
    module-level ``class_names`` the training features are scaled by the log
    ratio ``r`` computed with ``pr``, a four-member soft-voting classifier is
    fitted, and the positive-class probabilities for the test set are stored
    in the submission frame, which is written to
    ``EnsembleClassfierSubmission_2.csv`` after each label.

    Numbered progress markers (1-6) are printed for every label.
    """
    train_text = data['comment_text'].values.astype(str)
    test_text = test_data['comment_text'].values.astype(str)
    all_text = np.concatenate([train_text, test_text])
    train_features, test_features = get_features(train_text, test_text,
                                                 all_text)

    submission = pd.DataFrame.from_dict({'Id': test_data['id']})
    # For using a stacking classifier, do refer to mlextend.ensemble's
    # StackingClassifier.
    for class_name in class_names:
        y = data[class_name].values
        r = np.log(pr(1, y, train_features) / pr(0, y, train_features))
        x_nb = train_features.multiply(r)
        print(1)
        balanced_logreg = EasyEnsembleClassifier(
            base_estimator=LogisticRegression(C=2, solver='sag',
                                              max_iter=500))
        print(2)
        balanced_sgd = EasyEnsembleClassifier(
            base_estimator=SGDClassifier(alpha=.0002, max_iter=180,
                                         penalty="l2",
                                         loss='modified_huber'))
        print(3)
        dual_logreg = LogisticRegression(C=2, dual=True, max_iter=500)
        print(4)
        forest = RandomForestClassifier(
            criterion='gini',
            max_depth=100,
            max_features=1000,
            max_leaf_nodes=None,
            min_samples_split=10,
            min_weight_fraction_leaf=0.0,
            n_estimators=80)
        print(5)
        members = [
            ('lr', balanced_logreg),
            ('sgd', balanced_sgd),
            ('lr1', dual_logreg),
            ('rdf', forest),
        ]
        voter = VotingClassifier(estimators=members,
                                 voting='soft',
                                 weights=[0.9, 1.35, 0.65, 0.8])
        print(6)
        voter.fit(x_nb, y)

        # The test features get the same scaling as the training features.
        scaled_test = test_features.multiply(r)
        submission[class_name] = voter.predict_proba(scaled_test)[:, 1]

        # Written after every label, so the file always holds the columns
        # completed so far.
        submission.to_csv('EnsembleClassfierSubmission_2.csv', index=False)
def test_easy_ensemble_classifier_grid_search():
    """Grid search can tune both the ensemble and its nested base estimator."""
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(
        iris.data, iris.target, sampling_strategy=class_counts, random_state=0
    )

    # The ``base_estimator__`` prefix addresses the nested AdaBoost.
    grid = {
        'n_estimators': [1, 2],
        'base_estimator__n_estimators': [3, 4],
    }
    ensemble = EasyEnsembleClassifier(base_estimator=AdaBoostClassifier())
    GridSearchCV(ensemble, grid).fit(X, y)
def adaboost(X_train, y_train, X_test, y_test):
    """Fit an EasyEnsemble of AdaBoost learners and plot its class-1 hit rate.

    The returned percentage is ``100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])``
    taken from the confusion matrix of the test predictions. It is also
    printed and shown as a bar chart.

    Returns:
        The percentage described above.
    """
    booster = AdaBoostClassifier(n_estimators=10)
    eec = EasyEnsembleClassifier(n_estimators=10,
                                 base_estimator=booster,
                                 n_jobs=-1)
    eec.fit(X_train, y_train.values.ravel())

    test_predictions = eec.predict(X_test)
    cm = confusion_matrix(y_test, test_predictions)
    hits = cm[1, 1]
    misses = cm[1, 0]
    without = 100 * hits / (misses + hits)
    print("Adaboost (boosting): {}%".format(without))
    print(cm[0, 0], hits)

    # Two bars: the measured value and an empty placeholder.
    labels = ('Boosting', '-')
    positions = np.arange(len(labels))
    heights = [without, 0]
    plt.bar(positions, heights, align='center', alpha=0.5)
    plt.xticks(positions, labels)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Adaboost z losowym undersamplingiem')
    plt.show()

    return without
Beispiel #22
0
    def find_clf_parameters(self, train_x, train_y, clf_type):
        """Grid-search two balanced ensemble classifiers and return the best.

        ``BalancedRandomForestClassifier`` and ``EasyEnsembleClassifier`` are
        searched through ``EstimatorSelectionHelper`` with 5-fold CV and
        balanced accuracy. The search only runs when ``clf_type`` is
        ``'binary'`` or ``'multi-class'``.

        Returns:
            tuple: the ``'estimator'`` and ``'params'`` values from the first
            row of the helper's score summary.
        """
        sampling_strategy = ['auto', 'majority', 'not majority']
        n_estimators = [100, 300]

        forest_grid = {
            'criterion': ['gini', 'entropy'],
            'n_estimators': n_estimators,
            'max_depth': [2, 4, 6],
            'min_samples_leaf': np.arange(1, 4),
            'min_samples_split': np.arange(2, 5),
            'sampling_strategy': sampling_strategy,
        }
        easy_grid = {
            'n_estimators': n_estimators,
            'sampling_strategy': sampling_strategy,
        }

        models1 = {
            'BalancedRandomForestClassifier':
            BalancedRandomForestClassifier(random_state=42),
            'EasyEnsembleClassifier':
            EasyEnsembleClassifier(random_state=42),
        }
        params1 = {
            'BalancedRandomForestClassifier': [forest_grid],
            'EasyEnsembleClassifier': [easy_grid],
        }

        helper1 = EstimatorSelectionHelper(models1, params1)
        # Both supported problem types use exactly the same search settings.
        if clf_type in ('binary', 'multi-class'):
            helper1.fit(train_x,
                        train_y,
                        cv=5,
                        scoring='balanced_accuracy',
                        n_jobs=-1)

        summary = helper1.score_summary()
        top_row_estimator = summary['estimator'].iloc[0]
        top_row_params = summary['params'].iloc[0]

        return top_row_estimator, top_row_params
def get_classifiers():
    """Return a list of default-constructed classifiers to compare."""
    return [
        # Baseline and linear models
        DummyClassifier(),
        LogisticRegression(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        SGDClassifier(),
        # Neighbours, neural network and support vector machines
        KNeighborsClassifier(),
        MLPClassifier(),
        LinearSVC(),
        NuSVC(),
        SVC(),
        # Trees and ensembles
        DecisionTreeClassifier(),
        ExtraTreeClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
        GaussianProcessClassifier(),
        # Ensembles that rebalance the classes
        EasyEnsembleClassifier(),
        BalancedBaggingClassifier(),
        BalancedRandomForestClassifier(),
        XGBClassifier(),
    ]
    
    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        """Fit the named balanced classifier and score it on the test set.

        Parameters:
            train_x, train_y: training data.
            test_x, test_y: evaluation data. The confusion matrix is unpacked
                into four cells, so a binary problem is expected.
            estimator (str): ``'BalancedRandomForestClassifier'``,
                ``'BalancedBaggingClassifier'`` or ``'EasyEnsembleClassifier'``.
            params (dict): hyper-parameters; the keys read depend on the
                estimator (``'n_estimators'`` and ``'sampling_strategy'``
                always, plus ``'bootstrap'`` and ``'max_samples'`` for the
                bagging classifier).
            clf_type, question: forwarded to ``self.calc_cross_val_scores``;
                ``question`` is also stored in the result.

        Returns:
            tuple: ``(cross_val_scores, estimator_scores)`` where the second
            item is a dict of test-set metrics.

        Raises:
            ValueError: if ``estimator`` is not one of the supported names.
        """
        estimator_scores = {}

        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # ``clf`` a few lines further down.
            raise ValueError(
                "Unsupported estimator: {!r}".format(estimator))

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        # Probability of the second class, used for the ROC AUC.
        predicted_prob = clf.predict_proba(test_x)
        predicted_prob_true = [p[1] for p in predicted_prob]

        # Percent-scaled metrics are multiplied by 100; F1 and ROC AUC are
        # kept on their native 0-1 scale.
        estimator_scores['Question'] = question
        estimator_scores['Accuracy'] = round(
            accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Balanced Accuracy'] = round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Precision'] = round(
            precision_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Recall'] = round(
            recall_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Specificity'] = specificity
        estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
        estimator_scores['ROC AUC'] = round(
            roc_auc_score(test_y, predicted_prob_true), 2)

        return cross_val_scores, estimator_scores
    def get_naive_binary_estimators(self):
        """Return three unfitted balanced ensemble classifiers.

        All of them use the ``'not majority'`` sampling strategy and seed 42.
        The order is: balanced random forest, balanced bagging, easy ensemble.
        """
        shared = {'sampling_strategy': 'not majority', 'random_state': 42}
        return [
            BalancedRandomForestClassifier(**shared),
            BalancedBaggingClassifier(**shared),
            EasyEnsembleClassifier(**shared),
        ]
Beispiel #26
0
def test_warm_start_equal_n_estimators():
    """Refitting with an unchanged ``n_estimators`` warns and changes nothing."""
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    ensemble = EasyEnsembleClassifier(
        warm_start=True, n_estimators=5, random_state=83
    )
    ensemble.fit(X_train, y_train)
    expected = ensemble.predict(X_test)

    # Corrupt the training data in place; the second fit should ignore it.
    X_train += 1.0

    with pytest.warns(
        UserWarning, match="Warm-start fitting without increasing n_estimators"
    ):
        ensemble.fit(X_train, y_train)
    assert_array_equal(expected, ensemble.predict(X_test))
def sixth_test(X_train, y_train, X_test, y_test):
    """Under-sample class 0 at random, then cross-validate an EasyEnsemble.

    66% of the training rows whose target is 0 are dropped (unseeded
    ``np.random.choice``). An ``EasyEnsembleClassifier`` of AdaBoost over
    logistic regression is evaluated with 10-fold cross-validation (ROC AUC
    and average precision). The fold estimator with the highest ROC AUC then
    predicts ``X_test`` and its confusion matrix is printed.

    The BalancedRandomForest and BalancedBagging experiments that used to be
    parked here inside a string literal were dead code and have been removed.

    Returns:
        None. All results are printed.
    """
    print("Diminuer le nombre de feature numéroté inactive, ensuite SMOTE/ADASYN etc")
    data = pd.DataFrame(X_train)
    data['Target'] = y_train
    inactive_index = data[data['Target'] == 0].index
    n_to_drop = round(0.66 * len(inactive_index))
    drop_indices = np.random.choice(inactive_index, n_to_drop, replace=False)
    data = data.drop(drop_indices)
    y_train = data.Target.values
    X_train = data.drop("Target", axis=1).values

    print("EasyEnsembleClassifier")
    # The previously created (and never used) decision tree was dropped.
    boosted_logreg = AdaBoostClassifier(base_estimator=LogisticRegression())
    resample_easy = EasyEnsembleClassifier(
        base_estimator=boosted_logreg,
        n_estimators=10,
        random_state=0,
        n_jobs=2,
        sampling_strategy='auto')
    scores = cross_validate(resample_easy,
                            X_train,
                            y_train,
                            cv=10,
                            scoring=('roc_auc', 'average_precision'),
                            return_estimator=True)
    print(scores['test_roc_auc'].mean(),
          scores['test_average_precision'].mean())

    # Evaluate the fold estimator that reached the highest ROC AUC.
    best_model = scores['estimator'][np.argmax(scores['test_roc_auc'])]
    y_best_pred = best_model.predict(X_test)
    conf_mat = confusion_matrix(y_true=y_test, y_pred=y_best_pred)
    print("confusion_matrix:\n", conf_mat)
Beispiel #28
0
 def __init__(self):
     """Open the tuning log, load the training data and prepare two models.

     The last column of the frame returned by
     ``dataTransform().trainingData()`` is used as label and all other
     columns as features. A stratified 80/20 split is stored on the instance
     together with two unfitted balanced classifiers.
     """
     # The log file handle is kept open on the instance.
     self.file_object = open("../logs/modeltune/log.txt", 'a+')
     self.saved_best_model_path = '../saved_model/best_model.sav'
     self.logger = App_Logger()

     self.transformed_data = dataTransform()
     self.df = self.transformed_data.trainingData()
     self.data = self.df.iloc[:, :-1]
     self.label = self.df.iloc[:, -1]

     splits = train_test_split(self.data,
                               self.label,
                               test_size=0.2,
                               random_state=0,
                               stratify=self.label)
     self.X_train, self.X_test, self.y_train, self.y_test = splits

     self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
     self.EEC = EasyEnsembleClassifier(n_jobs=-1)
Beispiel #29
0
 def stacking(self, X_train, y_train):
     """Return an unfitted stacking classifier with an XGBoost meta-learner.

     ``y_train`` is only used to count the distinct classes for XGBoost;
     ``X_train`` is not used.
     """
     print("STACKING")
     n_classes = len(y_train.unique())

     def make_xgb():
         # The same XGBoost configuration serves as a base learner and as
         # the final estimator.
         return XGBClassifier(eta=0.1,
                              objective='multi:softmax',
                              num_class=n_classes)

     forest = RandomForestClassifier(max_depth=2,
                                     random_state=42,
                                     class_weight='balanced_subsample')
     base_learners = [
         ('rf', forest),
         ('bag', BalancedBaggingClassifier(random_state=42)),
         ('balanced_rf', BalancedRandomForestClassifier()),
         ('easy', EasyEnsembleClassifier()),
         ('xgb', make_xgb()),
     ]
     return StackingClassifier(estimators=base_learners,
                               final_estimator=make_xgb())
def test_warm_start_equal_n_estimators():
    """A warm-start refit without more estimators only emits a warning.

    Predictions before and after the second ``fit`` must be identical even
    though the training data was shifted in between.
    """
    features, labels = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=43
    )

    warm = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=83
    )
    warm.fit(X_train, y_train)
    before = warm.predict(X_test)

    # Shift the training data in place to nonsense values.
    X_train += 1.

    expected_warning = "Warm-start fitting without increasing n_estimators"
    with pytest.warns(UserWarning, match=expected_warning):
        warm.fit(X_train, y_train)

    after = warm.predict(X_test)
    assert_array_equal(before, after)
    def get_models(self):
        """Return ``(name, estimator)`` pairs of one-vs-rest classifiers.

        Each base model (logistic regression, easy ensemble, RUSBoost,
        balanced bagging, balanced random forest) is wrapped in a
        ``OneVsRestClassifier``.
        """
        base_models = [
            ('lr', LogisticRegression(class_weight='balanced')),
            ('eec', EasyEnsembleClassifier(n_estimators=10)),
            ('rus', RUSBoostClassifier(n_estimators=50)),
            ('bbc', BalancedBaggingClassifier(n_estimators=10)),
            ('brf', BalancedRandomForestClassifier(n_estimators=100)),
        ]
        return [(label, OneVsRestClassifier(base))
                for label, base in base_models]
def test_warm_start(random_state=42):
    """Incremental warm-start fitting yields the right size and same seeds.

    The ensemble is fitted with 5 and then 10 estimators; the seeds of the
    final pipeline steps are compared, as sets, with those of a 10-estimator
    ensemble fitted in one go.
    """
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    first_size, second_size = 5, 10

    clf_ws = EasyEnsembleClassifier(
        n_estimators=first_size,
        random_state=random_state,
        warm_start=True)
    clf_ws.fit(X, y)
    assert len(clf_ws) == first_size

    # Grow the already fitted ensemble instead of starting over.
    clf_ws.set_params(n_estimators=second_size)
    clf_ws.fit(X, y)
    assert len(clf_ws) == second_size

    clf_no_ws = EasyEnsembleClassifier(
        n_estimators=10, random_state=random_state, warm_start=False)
    clf_no_ws.fit(X, y)

    seeds_ws = {pipe.steps[-1][1].random_state for pipe in clf_ws}
    seeds_no_ws = {pipe.steps[-1][1].random_state for pipe in clf_no_ws}
    assert seeds_ws == seeds_no_ws
def run(X_train, X_test, y_train, y_test):
    """Fit an ``EasyEnsembleClassifier`` and predict the test set.

    A banner and the class distribution of ``y_train`` are printed first.
    The classifier uses the ``'not majority'`` sampling strategy with
    replacement, seed 42 and ``n_jobs=-1``.

    Returns:
        tuple: ``(y_test, y_pred, y_proba)`` - the labels passed in, the
        predicted labels and the predicted class probabilities.
    """
    banner = "######################"
    for line in (banner, "Easy Ensemble", banner, "\n"):
        print(line)

    print('Original dataset shape %s' % Counter(y_train))

    # resample all classes but the majority class
    settings = {
        'sampling_strategy': 'not majority',
        'replacement': True,
        'random_state': 42,
        'n_jobs': -1,
    }
    eec = EasyEnsembleClassifier(**settings)
    eec.fit(X_train, y_train)

    y_pred = eec.predict(X_test)
    y_proba = eec.predict_proba(X_test)
    return y_test, y_pred, y_proba
    def __init__(self):
        """Build name-to-instance registries of imbalanced-learn samplers.

        Three dictionaries are stored on the instance: ``oversamplers``,
        ``undersamplers`` and ``ensemblesamplers``. Every value is a
        default-constructed object, keyed by its class name.
        """
        # NOTE(review): ``EasyEnsemble`` and ``BalanceCascade`` are presumably
        # only available in older imbalanced-learn releases - confirm against
        # the pinned version.
        from imblearn.over_sampling import SMOTE, ADASYN, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
        from imblearn.under_sampling import ClusterCentroids, RandomUnderSampler, InstanceHardnessThreshold, NearMiss, \
            TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, OneSidedSelection, \
            CondensedNearestNeighbour, NeighbourhoodCleaningRule
        from imblearn.ensemble import EasyEnsemble, EasyEnsembleClassifier, BalancedBaggingClassifier, \
            BalancedRandomForestClassifier, BalanceCascade, RUSBoostClassifier

        self.oversamplers = {
            'ADASYN': ADASYN(),
            'RandomOverSampler': RandomOverSampler(),
            'SMOTE': SMOTE(),
            'BorderlineSMOTE': BorderlineSMOTE(),
            'SVMSMOTE': SVMSMOTE()
        }
        self.undersamplers = {
            'ClusterCentroids': ClusterCentroids(),
            'RandomUnderSampler': RandomUnderSampler(),
            'InstanceHardnessThreshold': InstanceHardnessThreshold(),
            'NearMiss': NearMiss(),
            'TomekLinks': TomekLinks(),
            'EditedNearestNeighbours': EditedNearestNeighbours(),
            'RepeatedEditedNearestNeighbours':
            RepeatedEditedNearestNeighbours(),
            'AllKNN': AllKNN(),
            'OneSidedSelection': OneSidedSelection(),
            'CondensedNearestNeighbour': CondensedNearestNeighbour(),
            'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule()
        }
        self.ensemblesamplers = {
            'EasyEnsemble': EasyEnsemble(),
            'EasyEnsembleClassifier': EasyEnsembleClassifier(),
            'BalancedBaggingClassifier': BalancedBaggingClassifier(),
            'BalanceCascade': BalanceCascade(),
            # Instantiate like every other entry; the bare class was stored
            # here before, so lookups returned a type instead of an object.
            'BalancedRandomForestClassifier': BalancedRandomForestClassifier(),
            'RUSBoostClassifier': RUSBoostClassifier()
        }
def get_models():
    """Return the candidate classifiers and their display names.

    Returns
    -------
    models : list
        Freshly constructed, unfitted estimators.
    names : list of str
        Display name for each estimator, in the same order as ``models``.
    """
    # Each entry pairs a display name with its estimator; the two output
    # lists are derived from this single table so they cannot drift apart.
    catalogue = [
        ('Logistic Regression',
         LogisticRegression(solver='liblinear',
                            class_weight='balanced',
                            penalty='l2')),
        ('Ada Boost', AdaBoostClassifier()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('RUSBoost Classifier', RUSBoostClassifier()),
        ('RandomForestClassifier',
         RandomForestClassifier(class_weight='balanced')),
        ('EasyEnsembleClassifier', EasyEnsembleClassifier()),
    ]
    names = [label for label, _ in catalogue]
    models = [estimator for _, estimator in catalogue]
    return models, names
Beispiel #36
0
 def __init__(self, trainFile, testFile):
     """Store the input file arguments and set up the voting ensemble.

     Parameters
     ----------
     trainFile, testFile
         Stored unchanged on the instance as ``self.trainFile`` and
         ``self.testFile``.

     Four base estimators (easy ensemble over logistic regression, SGD,
     logistic regression, random forest) are created and combined in a
     weighted soft-voting classifier. All data / feature / label holders
     start out as ``None``.
     """
     self.trainFile, self.testFile = trainFile, testFile

     # Both logistic-regression instances use the same hyper-parameters.
     logreg_settings = {'C': 6, 'solver': 'sag', 'max_iter': 500}

     self.__een = EasyEnsembleClassifier(
         base_estimator=LogisticRegression(**logreg_settings))
     self.__sgd = SGDClassifier(
         alpha=.0002, max_iter=180, penalty="l2", loss='modified_huber')
     self.__rforest = RandomForestClassifier(
         criterion='gini',
         max_depth=100,
         max_features=900,
         n_estimators=20,
         n_jobs=-1,
         min_samples_leaf=3,
         min_samples_split=10,
     )
     self.__lr = LogisticRegression(**logreg_settings)

     # Order of the members must match the order of the weights below.
     voters = [
         ('een', self.__een),
         ('sgd', self.__sgd),
         ('lr', self.__lr),
         ('rf', self.__rforest),
     ]
     self.__vot = VotingClassifier(
         estimators=voters, voting='soft', weights=[0.9, 1.3, 0.55, 0.65])

     # Placeholders that are filled in later; all start empty.
     for holder in ('train_data', 'test_data', 'all_data',
                    'train_features', 'test_features',
                    'test_labels', 'train_labels'):
         setattr(self, holder, None)

     self.class_names = [
         'toxic', 'severe_toxic', 'obscene',
         'threat', 'insult', 'identity_hate',
     ]
     self.submissionFile = None
     self.score = {}
def test_easy_ensemble_classifier(n_estimators, base_estimator):
    """Fit an EasyEnsembleClassifier and check its size and predict API.

    The classifier is trained on an imbalanced subset of iris; the test
    asserts the number of fitted estimators and the size of each inner
    classifier, then calls every prediction method once.
    """
    class_counts = {0: 20, 1: 25, 2: 50}
    X, y = make_imbalance(iris.data,
                          iris.target,
                          sampling_strategy=class_counts,
                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    eec = EasyEnsembleClassifier(
        n_estimators=n_estimators,
        base_estimator=base_estimator,
        n_jobs=-1,
        random_state=RND_SEED,
    )
    fitted = eec.fit(X_train, y_train)
    fitted.score(X_test, y_test)

    assert len(eec.estimators_) == n_estimators
    for pipeline in eec.estimators_:
        inner = pipeline.named_steps['classifier']
        assert len(inner) == base_estimator.n_estimators

    # Exercise each prediction entry point once; only absence of errors
    # is checked here.
    for method_name in ('predict', 'predict_proba',
                        'predict_log_proba', 'decision_function'):
        getattr(eec, method_name)(X_test)
      .format(balanced_accuracy_score(y_test, y_pred_brf),
              geometric_mean_score(y_test, y_pred_brf)))
# Confusion matrix of the balanced random forest predictions, drawn on
# ax[1].
# NOTE(review): `ax`, `y_pred_brf` and `satimage` are defined outside this
# section -- presumably earlier in the script; confirm.
cm_brf = confusion_matrix(y_test, y_pred_brf)
plot_confusion_matrix(cm_brf, classes=np.unique(satimage.target), ax=ax[1],
                      title='Balanced random forest')

###############################################################################
# Boosting classifier
###############################################################################
# In the same manner, the easy ensemble classifier is a bag of balanced
# AdaBoost classifiers. However, it will be slower to train than a random
# forest and will achieve worse performance.

# The same AdaBoost instance is passed as `base_estimator` to both the easy
# ensemble and the RUSBoost classifier below.
base_estimator = AdaBoostClassifier(n_estimators=10)
eec = EasyEnsembleClassifier(n_estimators=10,
                             base_estimator=base_estimator,
                             n_jobs=-1)
eec.fit(X_train, y_train)
y_pred_eec = eec.predict(X_test)
# Report balanced accuracy and geometric mean on the test split.
print('Easy ensemble classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_eec),
              geometric_mean_score(y_test, y_pred_eec)))
cm_eec = confusion_matrix(y_test, y_pred_eec)
# New two-column figure; this rebinds `fig` and `ax`, and the easy ensemble
# confusion matrix goes on ax[0].
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_eec, classes=np.unique(satimage.target), ax=ax[0],
                      title='Easy ensemble classifier')

# Fit the RUSBoost classifier on the same training split.
rusboost = RUSBoostClassifier(n_estimators=10,
                              base_estimator=base_estimator)
rusboost.fit(X_train, y_train)
def test_warm_start_equivalence():
    """Warm-starting 5 + 5 estimators must match a cold fit of 10.

    Both classifiers share the same random seed; their predictions on the
    held-out split are compared with ``assert_allclose``.
    """
    seed = 3141
    features, labels = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, _ = train_test_split(
        features, labels, random_state=43)

    # Grow the ensemble in two steps: 5 estimators, then extend to 10.
    incremental = EasyEnsembleClassifier(
        n_estimators=5, warm_start=True, random_state=seed)
    incremental.fit(X_train, y_train)
    incremental.set_params(n_estimators=10)
    incremental.fit(X_train, y_train)
    warm_predictions = incremental.predict(X_test)

    # Reference: all 10 estimators trained in one go.
    reference = EasyEnsembleClassifier(
        n_estimators=10, warm_start=False, random_state=seed)
    reference.fit(X_train, y_train)
    cold_predictions = reference.predict(X_test)

    assert_allclose(warm_predictions, cold_predictions)