Example #1
def run_ensemble(X_train, X_val, y_train, y_val, df_test):

    ### ENSEMBLE LEARNING with (naive) classification models

    from sklearn.ensemble import StackingClassifier, RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score

    import xgboost as xgb

    final_layer = StackingClassifier(
        estimators=[('knn', KNeighborsClassifier(n_neighbors=6))],
        final_estimator=xgb.XGBClassifier(objective="binary:logistic",
                                          random_state=42))
    model = StackingClassifier(estimators=[
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(C=1, gamma=1e-6, kernel='rbf')),
    ],
                               final_estimator=final_layer)

    # fit returns the estimator itself, not a training history
    model.fit(X_train, y_train)

    print(accuracy_score(y_val, model.predict(X_val)))

    rank_results = test_results(df_test, alg="ensemble", model=model)
    return rank_results
Example #2
def rank_stacking_classifier(X, Y):
    from sklearn.ensemble import (AdaBoostClassifier,
                                  GradientBoostingClassifier,
                                  RandomForestClassifier, StackingClassifier)
    from sklearn.linear_model import LogisticRegression

    estimators = [('rf', RandomForestClassifier(n_jobs=20)),
                  ('gbdt', GradientBoostingClassifier()),
                  ('adaboost', AdaBoostClassifier())]

    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression())

    clf.fit(X, Y)

    return clf
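A hypothetical call on synthetic data (make_classification is an assumption for illustration):

from sklearn.datasets import make_classification

X_demo, Y_demo = make_classification(n_samples=200, random_state=0)
clf = rank_stacking_classifier(X_demo, Y_demo)
print(clf.score(X_demo, Y_demo))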
    def fit_model(self):
        """ 09. Fit our stacked classifier gradient-boosting model. """

        # Define the stacked classifier; LogisticRegressionCV(Cs=10) tunes
        # the meta-learner's regularisation strength internally
        mdl = StackingClassifier(estimators=self.estimators,
                                 final_estimator=LogisticRegressionCV(Cs=10))

        # StackingClassifier.fit accepts only X, y and sample_weight;
        # validation sets and early stopping are not part of its signature
        mdl.fit(self.X_train, self.y_train, sample_weight=self.w_train)
        self.mdl = cloudpickle.dumps(mdl)
        self.next(self.compute_roc)
def test_stacking_classifier_sparse_passthrough(fmt):
    # Check passthrough behavior on a sparse X matrix
    X_train, X_test, y_train, _ = train_test_split(
        sparse.coo_matrix(scale(X_iris)).asformat(fmt),
        y_iris, random_state=42
    )
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(
        estimators=estimators, final_estimator=rf, cv=5, passthrough=True
    )
    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
def stacking_predictor(row):
    """
    Train a stacking model on our data.

    Define the models that make up the base layer, build a stacking
    classifier on top of them, and set the final estimator to
    logistic regression.
    """
    our_trained_data = pd.read_csv("data/data.csv")
    our_trained_data = clean_data(our_trained_data)

    x = our_trained_data[[
        'radius_mean', 'texture_mean', 'area_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'smoothness_mean'
    ]]
    y = our_trained_data[['diagnosis']]
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    X = x_train.values.tolist()
    # flatten the (n, 1) label frame into a 1-D array
    y = y_train.values.ravel()

    estimators = [('random_forest',
                   RandomForestClassifier(n_estimators=5, random_state=42)),
                  ('logistic_regr',
                   LogisticRegression(solver="lbfgs", max_iter=1460)),
                  ('knn', KNeighborsClassifier(n_neighbors=5)),
                  ('svm_rbf', SVC(kernel='rbf', gamma=4, C=10000))]

    Stacking_classifier = StackingClassifier(
        estimators=estimators, final_estimator=LogisticRegression(), cv=5)

    # Fit the stacking model with our own data and with selected 7 features.
    Stacking_classifier.fit(X, y)

    # Predict a single patient; predict returns an array, so take element 0
    single_predicted_result = Stacking_classifier.predict([row])[0]

    return 'patient %d' % single_predicted_result
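A hypothetical call, assuming the seven values are given in the same order as the feature columns selected above (the numbers are made up for illustration):

example_row = [14.2, 19.1, 630.0, 0.09, 0.05, 0.18, 0.10]
print(stacking_predictor(example_row))  # e.g. "patient 0" or "patient 1"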
Example #6
    def ensembler(self, method='voting'):
        """
        Applies ensemble methods to all classifiers according to `method`.
        Returns the object corresponding to the method.
        """
        if method == 'voting':
            vot_clf = VotingClassifier(
                estimators=[(name, self.clfs[name])
                            for name in self.clfs.keys()])
            vot_clf.fit(self.X_train, self.y_train)
            return vot_clf
        if method == 'stacking':
            stack_clf = StackingClassifier(
                estimators=[(name, self.clfs[name])
                            for name in self.clfs.keys()])
            stack_clf.fit(self.X_train, self.y_train)
            return stack_clf
    def predict(self):
        X_train, y_train = self.train_data.iloc[:, :-1], self.train_data.iloc[:, -1]

        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)

        level_0 = [('RF', RandomForestClassifier(n_estimators=700)),
                   ('LR', LogisticRegression(max_iter=6000))]

        level_1 = SVC(C=1.2)
        model = StackingClassifier(estimators=level_0, final_estimator=level_1, cv=4)

        model.fit(X_train, y_train)
        test = scaler.transform(self.test_data)
        submission = pd.DataFrame(model.predict(test))
        submission.to_csv('submission.csv', header=['quality'], index=False)
def main():
    args = parse_arguments()
    # params
    DATA_DIR = args.data_path
    num_folds = args.fold
    seed = 1234

    # setup data
    with open(DATA_DIR + '/features.txt') as f:
        features_txt = f.readlines()
    features_name = [x.strip() for x in features_txt]
    features_name = [
        "".join(c if c.isalnum() else "_" for c in str(x))
        for x in features_name
    ]
    X_train = pd.read_csv(DATA_DIR + '/X_train.csv', names=features_name)
    X_test = pd.read_csv(DATA_DIR + '/X_test.csv', names=features_name)
    y_train = pd.read_csv(DATA_DIR + '/y_train.csv', names=['activity_label'])
    subject_train = pd.read_csv(DATA_DIR + '/subject_train.csv',
                                names=['subject_id'])

    # shift labels to be zero-based
    y_train['activity_label'] = y_train['activity_label'] - 1

    # set up models
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          random_state=seed)),
                  ('svr', SVC(probability=True, random_state=seed)),
                  ('knn', KNeighborsClassifier())]
    final_estimator = LogisticRegression(random_state=seed)
    kf = GroupKFold(n_splits=num_folds)
    cv_idx = kf.split(X=subject_train, groups=subject_train)
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=final_estimator,
                             cv=cv_idx)

    # train (pass a 1-D label series to avoid a DataConversionWarning)
    clf.fit(X_train, y_train['activity_label'])

    # make submission
    test_preds = clf.predict(X_test)
    submit = test_preds + 1
    np.savetxt('baseline.txt', submit)
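One caveat in the example above: kf.split(...) returns a one-shot generator, which StackingClassifier consumes during fit. If the same splits are needed again, materialise them as a list first; a small sketch:

cv_idx = list(kf.split(X=subject_train, groups=subject_train['subject_id']))
clf_a = StackingClassifier(estimators=estimators,
                           final_estimator=final_estimator, cv=cv_idx)
clf_b = StackingClassifier(estimators=estimators,
                           final_estimator=final_estimator, cv=cv_idx)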
Example #9
class stacked_model(BaseEstimator, ClassifierMixin, TransformerMixin):
    def __init__(self, base_models=None, meta_model=None, n_folds=None):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds

    def fit(self, X, y):
        # Delegate to a StackingClassifier: the (name, model) pairs form the
        # base layer and meta_model becomes the final estimator
        self.get_stacking_ = StackingClassifier(estimators=self.base_models,
                                                final_estimator=self.meta_model,
                                                cv=self.n_folds)
        self.get_stacking_.fit(X, y)
        return self

    def predict(self, X):
        return self.get_stacking_.predict(X)
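A minimal usage sketch for this wrapper on scikit-learn's iris data (the model choices are illustrative):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
model = stacked_model(base_models=[('rf', RandomForestClassifier()),
                                   ('knn', KNeighborsClassifier())],
                      meta_model=LogisticRegression(max_iter=1000),
                      n_folds=5)
model.fit(X, y)
print(model.predict(X[:5]))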
Example #10
def test_stacking_classification():
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_iris
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import make_pipeline
    from sklearn.ensemble import StackingClassifier

    import xgboost as xgb  # needed below for the XGBClassifier base learner

    X, y = load_iris(return_X_y=True)
    estimators = [('gbm', xgb.sklearn.XGBClassifier()),
                  ('svr',
                   make_pipeline(StandardScaler(),
                                 LinearSVC(random_state=42)))]
    clf = StackingClassifier(estimators=estimators,
                             final_estimator=LogisticRegression())

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    clf.fit(X_train, y_train).score(X_test, y_test)
Example #11
def stack_ensemble():
    '''
    Create StackingClassifier model
    Parameters:
        N/A
    Returns:
        N/A
    Outputs:
        confusion_matrix,
        classification_report,
        scoring
    '''

    WOE_encoder = WOEEncoder()
    X_train_enc = WOE_encoder.fit_transform(X_train, y_train)
    X_test_enc = WOE_encoder.transform(X_test)

    scaler = MinMaxScaler()
    X_train_enc_scaled = pd.DataFrame(
        scaler.fit_transform(X_train_enc, y_train))
    X_test_enc_scaled = pd.DataFrame(scaler.transform(X_test_enc))

    clfs = list()
    clfs.append(('linSVC', LinearSVC()))
    clfs.append(('bayes', GaussianNB()))
    clfs.append(('knn', KNeighborsClassifier()))
    clfs.append(('rfc', RandomForestClassifier()))
    # define meta learner model
    meta_clf = LogisticRegression()
    # define the stacking ensemble
    stk_model = StackingClassifier(estimators=clfs,
                                   final_estimator=meta_clf,
                                   cv=3)

    # fit the model on training data
    stk_model.fit(X_train_enc_scaled, y_train)
    stk_pred = stk_model.predict(X_test_enc_scaled)
    print('Stack Accuracy :', accuracy_score(y_test, stk_pred))
    print('stack F1 :', f1_score(y_test, stk_pred))
    print(confusion_matrix(y_test, stk_pred))
    print(classification_report(y_test, stk_pred))
Example #12
def ensemble_(feat, tar, split):
    scaler = MinMaxScaler()
    x_tr, x_te, y_tr, y_te = train_test_split(feat, tar, test_size=split, shuffle=True)
    scaler.fit(x_tr)
    x_tr = scaler.transform(x_tr)
    x_te = scaler.transform(x_te)
    
    knn = KNeighborsClassifier()
    params_knn = {'n_neighbors': np.arange(1, 25)}
    knn_gs = GridSearchCV(knn, params_knn, cv=5)
    knn_gs.fit(x_tr, y_tr)
    knn_best = knn_gs.best_estimator_
    print(knn_gs.best_params_)
    
    rf = RandomForestClassifier()
    params_rf = {'n_estimators': [50, 100, 200,300,400]}
    rf_gs = GridSearchCV(rf, params_rf, cv=5)
    rf_gs.fit(x_tr, y_tr)
    rf_best = rf_gs.best_estimator_
    print(rf_gs.best_params_)
    
    
    log_reg = LogisticRegression()
    log_reg.fit(x_tr, y_tr)
    
    print('knn: {}'.format(knn_best.score(x_te, y_te)))
    print('rf: {}'.format(rf_best.score(x_te, y_te)))
    print('log_reg: {}'.format(log_reg.score(x_te, y_te)))
    
    estimators=[('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]
    ensemble = VotingClassifier(estimators, voting='hard')
    ensemble.fit(x_tr, y_tr)
    print("ensemble voting score: ",str(ensemble.score(x_te, y_te)))
    
    ensemble_bagging = BaggingClassifier(base_estimator=RandomForestClassifier(), n_estimators=10) 
    ensemble_bagging.fit(x_tr, y_tr)
    print("ensemble bagging score: ",str(ensemble_bagging.score(x_te, y_te)))
    
    ensemble_stacking = StackingClassifier(estimators, final_estimator=LogisticRegression())
    ensemble_stacking.fit(x_tr, y_tr)
    print("ensemble stacking score: ", str(ensemble_stacking.score(x_te, y_te)))
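A hypothetical invocation, using the breast-cancer toy dataset as stand-in input (and assuming the imports used by ensemble_ are in scope):

from sklearn.datasets import load_breast_cancer

features, target = load_breast_cancer(return_X_y=True)
ensemble_(features, target, 0.2)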
def Model_1(train, test):
    ''' Trains the model and Saves the predictions in a CSV file
        train : Training set
        test : Test set
    '''
    # Preprocessing: fit one encoder on the training sequences and reuse it
    # on the test sequences so both share the same one-hot columns
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_train = encoder.fit_transform([list(s) for s in train['Sequence']])
    X_test = encoder.transform([list(s) for s in test['Sequence']])
    Y_train = train['label']

    X_train, Y_train = RandomUnderSampler(random_state=100).fit_resample(
        X_train, Y_train)
    X_train, Y_train = shuffle(X_train, Y_train, random_state=100)

    # Training
    estimators = [('rf',
                   RandomForestClassifier(n_estimators=300,
                                          max_depth=45,
                                          min_samples_leaf=7,
                                          random_state=100)),
                  ('mlp', MLPClassifier(max_iter=200, random_state=100)),
                  ('knn', KNeighborsClassifier(n_neighbors=4))]

    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(random_state=100),
        n_jobs=-1,
        verbose=1)
    clf.fit(X_train, Y_train)

    # Predicting
    Y_pred = clf.predict(X_test)
    Y_prob = [x[1] for x in clf.predict_proba(X_test)]
    result = pd.DataFrame()
    result["ID"] = test["ID"]
    result["Label"] = Y_prob
    result.to_csv("Submission_1.csv", index=False)
    result["Label"] = Y_pred
    result.to_csv("Predictions_1.csv", index=False)
Example #14
def test_stacking_classifier_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_iris),
                                                   y_iris,
                                                   stratify=y_iris,
                                                   random_state=42)
    estimators = [('lr', 'drop'), ('svc', LinearSVC(random_state=0))]
    rf = RandomForestClassifier(n_estimators=10, random_state=42)
    clf = StackingClassifier(estimators=[('svc', LinearSVC(random_state=0))],
                             final_estimator=rf,
                             cv=5)
    clf_drop = StackingClassifier(estimators=estimators,
                                  final_estimator=rf,
                                  cv=5)

    clf.fit(X_train, y_train)
    clf_drop.fit(X_train, y_train)
    assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
    assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
    assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
Example #15
def main():
    np.random.seed(0)
    train_X, train_y, test_X, test_y = load_data()

    # Stacking models:
    # Create your stacked model using StackingClassifier
    base_models = [('rfc', RandomForestClassifier()), ('svm', SVC()),
                   ('gnb', GaussianNB()), ('knc', KNeighborsClassifier()),
                   ('dtc', DecisionTreeClassifier())]

    # The default final_estimator is LogisticRegression
    sc = StackingClassifier(estimators=base_models)

    # fit the model on the training data
    sc.fit(train_X, train_y)

    # predict
    y_pred = sc.predict(test_X)

    # Get and print the weighted f1-score on test data (y_true comes first)
    print(f"f1 score = {f1_score(test_y, y_pred, average='weighted')}")
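As the comment above notes, when final_estimator is omitted scikit-learn fits a LogisticRegression meta-learner; a quick check on iris data (the base models here are illustrative):

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
sc = StackingClassifier(estimators=[('rfc', RandomForestClassifier()),
                                    ('gnb', GaussianNB())])
sc.fit(X, y)
assert isinstance(sc.final_estimator_, LogisticRegression)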
Example #16
def test_stacking():
    irep = IREP(random_state=42)
    rip = RIPPER(random_state=42)

    df = DF.copy()
    numeric_cols = df.select_dtypes("number").columns
    categorical_cols = [
        col for col in df.columns
        if (col not in numeric_cols and not col == CLASS_FEAT)
    ]
    dum_df = pd.get_dummies(df[categorical_cols])
    for col in numeric_cols:
        dum_df[col] = df[col]
    dum_df[CLASS_FEAT] = df[CLASS_FEAT]
    sktrain, sktest = df_shuffled_split(dum_df, random_state=42)
    sktrain_x, sktrain_y = sktrain.drop(CLASS_FEAT, axis=1), sktrain[CLASS_FEAT]
    sktest_x, sktest_y = sktest.drop(CLASS_FEAT, axis=1), sktest[CLASS_FEAT]

    lone_tree = DecisionTreeClassifier(random_state=42)
    lone_tree.fit(sktrain_x, sktrain_y)
    lone_tree_score = lone_tree.score(sktest_x, sktest_y)
    # print('lone_tree_score',lone_tree_score)

    irep_svc = SVC(random_state=42)
    irep_stack_estimators = [("irep", irep), ("svc", irep_svc)]
    irep_stack = StackingClassifier(estimators=irep_stack_estimators,
                                    final_estimator=LogisticRegression())
    irep_stack.fit(sktrain_x, sktrain_y)
    irep_stack_score = irep_stack.score(sktest_x, sktest_y)
    # print('irep_stack_score', irep_stack_score)
    assert irep_stack_score != lone_tree_score

    rip_tree = DecisionTreeClassifier(random_state=42)
    rip_stack_estimators = [("rip", rip), ("tree", rip_tree)]
    rip_stack = StackingClassifier(estimators=rip_stack_estimators,
                                   final_estimator=LogisticRegression())
    rip_stack.fit(sktrain_x, sktrain_y)
    rip_stack_score = rip_stack.score(sktest_x, sktest_y)
    # print('rip_stack_score',rip_stack_score)
    assert rip_stack_score != lone_tree_score
def train(X, y):

    sss = StratifiedShuffleSplit(n_splits=2, test_size=0.8, random_state=42)

    # Alternative single models kept for reference:
    # model = HistGradientBoostingClassifier(**hist_params)
    # model = GradientBoostingClassifier(**grad_params)
    # model = XGBClassifier(**xgb_params)

    estimators = [
        ("RandomForest", RandomForestClassifier(**params)),
        # ("HistGradientBoosting", HistGradientBoostingClassifier(**hist_params)),
        ("Quadrant", QuadraticDiscriminantAnalysis()),
        ("XGB", XGBClassifier(**xgb_params))
    ]
    model = StackingClassifier(estimators=estimators, n_jobs=-1)

    print("Train & Cross validation".center(40, '-'))
    # cross_val_score(estimator, X, y, cv=...) returns an array of fold scores
    print(np.mean(cross_val_score(model, X, y, cv=sss)) * 100)
    model.fit(X, y)
    return model
Example #18
def test_stacking_classifier():
    '''
    Tests issue https://github.com/koaning/scikit-lego/issues/501

    No asserts are added, as we only test that the call is exception-free.
    When Thresholder clones the model, an unfitted model is generated
    on which no predict_proba exists.
    '''
    estimators = [("dummy", DummyClassifier(strategy="constant", constant=0))]

    X = np.random.normal(0, 1, (100, 3))
    y = np.random.normal(0, 1, (100, )) < 0

    clf = StackingClassifier(estimators=estimators,
                             final_estimator=DummyClassifier(
                                 strategy="constant", constant=0))

    clf.fit(X, y)

    a = Thresholder(clf, threshold=0.2)
    a.fit(X, y)
    a.predict(X)
Example #19
def test_stacking_classifier_iris(cv, final_estimator, passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, y_test = train_test_split(scale(X_iris),
                                                        y_iris,
                                                        stratify=y_iris,
                                                        random_state=42)
    estimators = [("lr", LogisticRegression()), ("svc", LinearSVC())]
    clf = StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        cv=cv,
        passthrough=passthrough,
    )
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    assert clf.score(X_test, y_test) > 0.8

    X_trans = clf.transform(X_test)
    expected_column_count = 10 if passthrough else 6
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])

    clf.set_params(lr="drop")
    clf.fit(X_train, y_train)
    clf.predict(X_test)
    clf.predict_proba(X_test)
    if final_estimator is None:
        # LogisticRegression has decision_function method
        clf.decision_function(X_test)

    X_trans = clf.transform(X_test)
    expected_column_count_drop = 7 if passthrough else 3
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -4:])
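The expected column counts above follow from how StackingClassifier builds its meta-features on iris (3 classes, 4 input features): each base estimator contributes 3 columns (predict_proba or one-vs-rest decision_function), so two estimators give 6, and passthrough appends the 4 original features for 10; after dropping 'lr', one estimator remains, giving 3 (or 7 with passthrough). The same arithmetic as a sketch:

n_classes, n_features = 3, 4
for n_estimators, passthrough in [(2, False), (2, True), (1, False), (1, True)]:
    width = n_estimators * n_classes + (n_features if passthrough else 0)
    print(n_estimators, passthrough, width)  # prints widths 6, 10, 3, 7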
Example #20
def ensemble_predictions(members, X_te, params):
    assert params["type"] in ("weighted", "stacked")
    # make predictions
    if params["type"] == "weighted":
        y_preds = np.array([model.predict_proba(X_te) for model in members])

        # mean across ensemble members
        y_ensemble_pred = np.average(y_preds, weights=params["weights"], axis=0)
    else:
        estimators = [(f'expert_{i}', members[i]) for i in range(len(members))]

        # only the final estimator should be fitted here; cv="prefit"
        # (scikit-learn >= 1.1) reuses the already-fitted members as-is
        clf = StackingClassifier(
            estimators=estimators, final_estimator=LogisticRegression(),
            cv="prefit")
        X_tr = params["X_tr"]
        print(X_tr.columns.tolist())
        y_tr = params["y_tr"]

        clf.fit(X_tr, y_tr.values.ravel())

        y_ensemble_pred = clf.predict_proba(X_te)

    return y_ensemble_pred
Example #21
def test_stacking_classifier_drop_column_binary_classification():
    # check that a column is dropped in binary classification
    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, _ = train_test_split(
        scale(X), y, stratify=y, random_state=42
    )

    # both classifiers implement 'predict_proba' and will both drop one column
    estimators = [('lr', LogisticRegression()),
                  ('rf', RandomForestClassifier(random_state=42))]
    clf = StackingClassifier(estimators=estimators, cv=3)

    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 2

    # LinearSVC does not implement 'predict_proba'; its decision_function
    # already yields a single column per estimator in the binary case,
    # so there is no redundant column to drop
    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
    clf.set_params(estimators=estimators)

    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert X_trans.shape[1] == 2
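Either way, the meta-feature matrix keeps one column per retained base estimator in binary classification; a small sanity sketch reusing the fitted clf from the test above:

n_kept = sum(est != 'drop' for _, est in clf.estimators)
assert clf.transform(X_test).shape[1] == n_kept  # 2 columns from 2 estimators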
    def Stacking(self):
        estimators3 = [
            ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
            ('knn', KNeighborsClassifier(n_neighbors=5)),
            ('svm', SVC())]
        estimators2 = [
            ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
            ('svm', SVC())]
        estimators1 = [
            ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
            ('knn', KNeighborsClassifier(n_neighbors=5))]
        # estimators for the SVM + KNN combination
        estimators4 = [
            ('knn', KNeighborsClassifier(n_neighbors=5)),
            ('svm', SVC())]
        try:
            if (self.svmStackingcheckBox.isChecked() and self.rfcStackingcheckBox.isChecked() and self.knnStackingcheckBox.isChecked()):
                estimators = estimators3
            elif (self.svmStackingcheckBox.isChecked() and self.rfcStackingcheckBox.isChecked()):
                estimators = estimators2
            elif (self.rfcStackingcheckBox.isChecked() and self.knnStackingcheckBox.isChecked()):
                estimators = estimators1
            elif (self.svmStackingcheckBox.isChecked() and self.knnStackingcheckBox.isChecked()):
                estimators = estimators4
            else:
                return
            clf = StackingClassifier(estimators=estimators,
                                     final_estimator=LogisticRegression())
            stackingAccuracy = clf.fit(self.X_train, self.y_train).score(
                self.X_test, self.y_test)
            self.accuracyEnsembleLBL.setText(str(stackingAccuracy))
        except Exception as a:
            print(a)
def stackingClassifier(Feature_train, y_train, Feature_test):
    layer_one_estimators = [('dt_1',
                             DecisionTreeClassifier(max_depth=6,
                                                    max_features=15)),
                            ('knn_1', KNeighborsClassifier(n_neighbors=35))]

    layer_two_estimators = [('dt_2',
                             DecisionTreeClassifier(max_depth=6,
                                                    max_features=15)),
                            ('svc_2', svm.SVC())]

    layer_two = StackingClassifier(estimators=layer_two_estimators,
                                   final_estimator=LogisticRegression())

    clf = StackingClassifier(estimators=layer_one_estimators,
                             final_estimator=layer_two)
    clf = clf.fit(Feature_train, y_train)
    y_pred = clf.predict(Feature_test)
    return y_pred
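A hypothetical call on synthetic data (make_classification is assumed, and the classifier imports used inside the function are assumed in scope; max_features=15 requires at least 15 input features):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
print(stackingClassifier(X_tr, y_tr, X_te)[:10])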
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr',
               make_pipeline(StandardScaler(), LinearSVC(random_state=42)))]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)
clf.fit(X_train, y_train).score(X_test, y_test)

# %%
# Permutation-based feature importance
# ------------------------------------
#
# The :func:`inspection.permutation_importance` can be used to get an
# estimate of the importance of each feature, for any fitted estimator:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = make_classification(random_state=0, n_features=5, n_informative=3)
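The snippet is truncated after creating the data; a minimal continuation in the spirit of the prose above, assuming a random forest as the fitted estimator:

# Fit any estimator, then measure how shuffling each feature degrades the score
rf = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(rf, X, y, n_repeats=10, random_state=0)
print(result.importances_mean)  # one mean importance value per feature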
Example #25
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())

vectorizer = TfidfVectorizer()

print([[" ".join(i) for i in p] for p in pos_filtered_data][0])

data = vectorizer.fit_transform(
    [" ".join([" ".join(i) for i in p]) for p in pos_filtered_data])

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=0.33,
                                                    random_state=42)

clf.fit(X_train, y_train)

print(classification_report(y_test, clf.predict(X_test)))

# #### doc2vec with KNN

# print(pos_filtered_data[0])

# glued_data = []
# for item in pos_filtered_data:
#     new_item = []
#     for sent in item:
#         new_item.append(" ".join(sent))
#     glued_data.append(". ".join(new_item))

# print(glued_data[0])
# ############################################################ HistGradientBoostingClassifier
clf_hgbc = HistGradientBoostingClassifier()
clf_hgbc.fit(x_train, y_train)
hgbc_pred = clf_hgbc.predict(x_test)
hgb_matrices = evaluate_preds(clf_hgbc, x_test, y_test, hgbc_pred)
# ############################################################
# ############################################################ LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
clf_pred = clf_lr.predict(x_test)
lr_matrices = evaluate_preds(clf_lr, x_test, y_test, clf_pred)
# ############################################################
# ############################################################ StackingClassifier
clf_sc = StackingClassifier(estimators=estimators,
                            final_estimator=LogisticRegression())
clf_sc.fit(x_train, y_train)
clf_pred = clf_sc.predict(x_test)
sc_matrices = evaluate_preds(clf_sc, x_test, y_test, clf_pred)
# ############################################################
# ############################################################   VotingClassifier
clf_vc = VotingClassifier(estimators=[("knn", clf_knn), ('adab', clf_adab),
                                      ('rfc', clf_rfc), ('gbc', clf_gbc),
                                      ("bc", clf_bc), ("etc", clf_etc),
                                      ("hgbc", clf_hgbc), ('xgb', clf_xgb),
                                      ("lr", clf_lr)],
                          voting='soft')

clf_vc.fit(x_train, y_train)
clf_pred = clf_vc.predict(x_test)
vc_matrices = evaluate_preds(clf_vc, x_test, y_test, clf_pred)
Example #27
estimators = [
    ('gbm', grid_search_gbm.best_estimator_),
    #('xgb', grid_search_xgb.best_estimator_),
    ('lgbm', lgbm_grid.best_estimator_)
]
#('rf', grid_search_rf.best_estimator_)]

# In[257]:

clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        random_state=20202020),  # logreg is better than gbm
    stack_method='predict_proba')

clf.fit(X_tr, y_tr)

# In[258]:

results = clf.predict_proba(X_val)[:, 1]
act = y_val.array

roc_auc_score(act, results)

# #### 71.246 best on validation
# with lgbm and gbm as base learners

# ## CatBoost (left out of Stack model - takes forever to train)

# In[111]:
            ),
            (
                "model",
                LGBMClassifier(n_jobs=-1, boosting_type="gbdt").set_params(
                    **{
                        k.replace("final_estimator__model__", ""): v
                        for k, v in params.items()
                    }),
            ),
        ]),
        verbose=1,
        n_jobs=-1,
        cv=3,
    )

    best_model = model.fit(X_train, y_train)
    preds = best_model.predict(X_test)
    print("loggeando movidas")
    mlflow.log_metrics(
        metrics={
            "f1": f1_score(y_test, preds, average="macro"),
            "precision": precision_score(y_test, preds, average="macro"),
            "recall": recall_score(y_test, preds, average="macro"),
            "accuracy": accuracy_score(y_test, preds),
            "f05": fbeta_score(y_test, preds, beta=0.5, average="macro"),
            "f2": fbeta_score(y_test, preds, beta=2, average="macro"),
        })

    best_params = params
    for param in best_params.keys():
        mlflow.log_param(param, best_params[param])
Example #29
def test_stacking_classifier_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        clf = StackingClassifier(**params, cv=3)
        clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]))
much better than if we simply transform those outputs to [0,1] according to a threshold
'''

X = pd.DataFrame({'Yamnet': y_predicted_yamnet, 'SVM': pd.Series(y_pred_svm)})

estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr',
               make_pipeline(StandardScaler(), LinearSVC(random_state=42)))]
clf = StackingClassifier(estimators=estimators,
                         final_estimator=LogisticRegression())

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    random_state=42)
clf.fit(X_train, y_train).score(X_test,
                                y_test)  #  y_test == y_real.iloc[X_test.index]
y_pred_combined = clf.predict_proba(
    X_test)[:, 1]  # The probability of getting the output as 1 (cough)
Confusion_Matrix(y_test, y_pred_combined, pred_prob=True)

y_pred_combined = clf.predict_proba(X)[:, 1]
y_real, y_predicted_combined = Confusion_Matrix(y,
                                                y_pred_combined,
                                                pred_prob=True)

X_new = pd.DataFrame({'Yamnet': [0], 'SVM': [0.95]})
clf.predict_proba(X_new)[:, 1]

# Import the joblib module (a standalone package, no longer vendored by scikit-learn)

import joblib
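Presumably joblib is imported to persist the fitted stack; a minimal sketch (the filename is an assumption):

# Hypothetical filename: save the fitted StackingClassifier to disk...
joblib.dump(clf, 'stacking_classifier.joblib')

# ...and load it back later for inference
clf_loaded = joblib.load('stacking_classifier.joblib')
print(clf_loaded.predict_proba(X_new)[:, 1])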