Code example #1
def stacking():
    # build the base classifiers and the meta-classifier, then fit the stack
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    stack = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)
    stack.fit(X_train, y_train)

    # make class predictions for the test set
    y_pred_class = stack.predict(X_test)

    print('########### Stacking ###############')

    # evalClassModel and methodDict come from the surrounding project
    accuracy_score = evalClassModel(stack, y_test, y_pred_class, True)

    # data for the final comparison graph
    methodDict['Stacking'] = accuracy_score * 100

stacking()
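
The snippet above leans on an external evalClassModel helper and a methodDict accumulator from its source project. A minimal stand-in consistent with how they are used here (the helper returns an accuracy in [0, 1]) might look like the following; the real implementation is not shown in the source.

from sklearn.metrics import accuracy_score as sk_accuracy_score

methodDict = {}

def evalClassModel(model, y_true, y_pred, verbose=False):
    # Hypothetical stand-in: the original helper likely prints a fuller
    # evaluation report; here it only returns the accuracy it is used for.
    acc = sk_accuracy_score(y_true, y_pred)
    if verbose:
        print('Accuracy: {:.4f}'.format(acc))
    return acc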
Code example #2
def runExperiment(bases, experimentName):
    metric = 'precision'
    meta_classifier = GaussianNB()
    inputs = x  # feature matrix defined elsewhere; renamed from `input` to avoid shadowing the built-in
    for i in [10, 15, 20]:
        experiment = '\n*** Stacking - {} - {} base classifiers ***'.format(experimentName, i)
        # build a pool of i base classifiers, each seeing a random column subset
        base_classifiers = []
        while len(base_classifiers) < i:
            for b in bases:
                pipe = make_pipeline(ColumnSelector(cols=getRandomCols()), b)
                base_classifiers.append(pipe)
        base_classifiers = base_classifiers[:i]  # the inner loop can overshoot i
        test_scores = {'score': [], 'diversity': []}
        for j in range(10):
            ensemble = StackingClassifier(classifiers=base_classifiers, meta_classifier=meta_classifier)
            cv_scores = cross_validate(ensemble, inputs, y, scoring={'score': metric, 'diversity': diversity}, cv=KFold(n_splits=10))
            test_scores['score'].append(cv_scores['test_score'].mean())
            test_scores['diversity'].append(cv_scores['test_diversity'].mean())
        report(experiment, test_scores)
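
This example (and the next) passes a custom diversity entry in cross_validate's scoring dict, but the callable itself is not shown in the source. One hypothetical way to define such a scorer, here as the mean pairwise disagreement rate between the fitted base classifiers (which mlxtend exposes as clfs_ after fitting), is:

import numpy as np

def diversity(estimator, X, y):
    # Hypothetical scorer: average pairwise disagreement between the
    # predictions of the stack's fitted base classifiers.
    preds = [clf.predict(X) for clf in estimator.clfs_]
    n = len(preds)
    pairs = [(a, b) for a in range(n) for b in range(a + 1, n)]
    return float(np.mean([np.mean(preds[a] != preds[b]) for a, b in pairs]))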
Code example #3
def runExperiment(base_classifiers, experimentName):
    if len(base_classifiers) != 5:
        raise ValueError('You must provide exactly 5 base classifiers')
    metric = 'precision'
    meta_classifier = GaussianNB()
    inputs = x  # feature matrix defined elsewhere; renamed from `input` to avoid shadowing the built-in
    for i in [10, 15, 20]:
        # replicate the 5 base classifiers until roughly i of them are in the pool
        classifiers = []
        for j in range(1, i // len(base_classifiers) + 1):
            classifiers.extend(base_classifiers)
        experiment = '\n*** Stacking - {} - {} base classifiers ***'.format(experimentName, len(classifiers))
        test_scores = {'score': [], 'diversity': []}
        for j in range(10):
            # the source passed base_classifiers (the 5 originals) here, which
            # ignored the replicated pool; classifiers is what the experiment name reports
            ensemble = StackingClassifier(classifiers=classifiers, meta_classifier=meta_classifier)
            cv_scores = cross_validate(ensemble, inputs, y, scoring={'score': metric, 'diversity': diversity}, cv=KFold(n_splits=10))
            test_scores['score'].append(cv_scores['test_score'].mean())
            test_scores['diversity'].append(cv_scores['test_diversity'].mean())
        report(experiment, test_scores)
Code example #4
    def __init__(self, models_instructions):
        '''
        models_instructions should be a list of tuples [(Model, list_of_features)]
        '''
        from mlxtend.classifier import StackingClassifier
        from mlxtend.feature_selection import ColumnSelector
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import StratifiedKFold

        # shuffle=True is required for random_state to take effect in newer scikit-learn
        self.cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
        self.models_instructions = models_instructions
        # wrap each model in a pipeline that first selects its feature subset
        models = [
            Pipeline([('ColumnSelect', ColumnSelector(features)),
                      ('Model', model.clf)])
            for model, features in models_instructions
        ]
        self.models = models
        self.clf_stack = Model(clf=StackingClassifier(
            classifiers=models, meta_classifier=LogisticRegression()),
                               name='Stacked ensemble')
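
Model here is a project-specific wrapper, not a library class; the code only relies on it exposing the estimator as .clf and carrying a display name. A minimal hypothetical stand-in:

class Model:
    """Hypothetical wrapper pairing an estimator with a display name."""
    def __init__(self, clf, name=''):
        self.clf = clf
        self.name = name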
Code example #5
    def stacking(self,
                 x_train,
                 y_train,
                 meta_clf_label=None,
                 save=True,
                 name='stack_model.pkl'):
        meta_clf = self.model_util.get_meta_model(meta_clf_label)
        meta_param = self.model_util.get_meta_param_set(meta_clf_label)

        stack_clf = StackingClassifier(classifiers=self.get_basic_models(
            self.selected_models),
                                       use_probas=False,
                                       average_probas=False,
                                       meta_classifier=meta_clf)

        self.best_model = self.model_param_tune(x_train, y_train, stack_clf,
                                                meta_param)
        if save:
            self.model_util.save_model(
                self.best_model, os.path.join(self.model_save_path, name))
        return self.best_model
Code example #6
def get_model(model_type):
    if model_type == "knn":
        model = KNeighborsClassifier()
    elif model_type == "naive_bayes":
        model = MultinomialNB()
    elif model_type == "logistic_regression":
        model = LogisticRegression()
    elif model_type == "svm":
        model = SVC(kernel='linear')
    elif model_type == "decision_tree":
        model = DecisionTreeClassifier()
    elif model_type == "adaboost":
        model = AdaBoostClassifier()
    elif model_type == "random_forest":
        model = RandomForestClassifier(n_estimators=300)
    elif model_type == "gbdt":
        model = GradientBoostingClassifier()
    elif model_type == "xgboost":
        model = XGBClassifier()
    elif model_type == "mlp":
        model = MLPClassifier()
    elif model_type == 'bagging':
        # note: despite the "bagging" label, this branch builds a soft-voting ensemble
        clf1 = LogisticRegression(random_state=0)
        clf2 = XGBClassifier(random_state=0)
        clf3 = SVC(random_state=0, kernel='linear', probability=True)
        clf4 = MLPClassifier(random_state=0)
        model = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],
                                       weights=[1, 2, 2, 1],
                                       voting='soft',
                                       verbose=2)
    elif model_type == 'stacking':
        clf1 = XGBClassifier(random_state=0)
        clf2 = SVC(random_state=0, kernel='linear', probability=True)
        clf3 = MLPClassifier(random_state=0)
        lr = LogisticRegression()
        model = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   use_probas=True,
                                   average_probas=False,
                                   meta_classifier=lr)
    else:
        raise ValueError('Unknown model_type: {}'.format(model_type))
    return model
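
A usage sketch for this factory; X_train, y_train and X_test are placeholders for whatever feature matrices the surrounding project prepares:

model = get_model('stacking')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)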
Code example #7
def test_mlxtend():
    data = load_iris()
    X = data.data  # shape (150, 4)
    y = data.target  # shape (150,)

    clfs = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100,
                               n_jobs=-1,
                               criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05,
                                   subsample=0.5,
                                   max_depth=6,
                                   n_estimators=100)
    ]
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=clfs, meta_classifier=lr)
    scores = model_selection.cross_val_score(sclf,
                                             X,
                                             y,
                                             cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), 'stacking'))

    print('3-fold cross validation:\n')

    # labels matching the actual base classifiers in clfs
    for clf, label in zip(clfs, [
            'Random Forest (gini)', 'Random Forest (entropy)',
            'Extra Trees (gini)', 'Extra Trees (entropy)',
            'Gradient Boosting'
    ]):
        scores = model_selection.cross_val_score(clf,
                                                 X,
                                                 y,
                                                 cv=3,
                                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))
Code example #8
def NN_Stacking(X_train, X_test, y_train, y_test):
    model_name = 'NN Stacking'
    input_dim = len(X_train.columns)

    # meta-classifier network; note input_dim is hardcoded to 38 * 4 here,
    # unlike the base learners below, which use the actual feature count
    #create_model_NN_1HiddenLayer(input_dim=38*4, nodes_l1=200, dropout_l1=0.2)
    clf1 = KerasClassifier(build_fn=create_model_NN_1HiddenLayer,
                           input_dim=38 * 4,
                           epochs=30,
                           batch_size=1000,
                           verbose=2)
    clf2 = KerasClassifier(build_fn=create_model_NN_2HiddenLayers,
                           input_dim=input_dim,
                           epochs=15,
                           batch_size=1000,
                           verbose=2)
    clf3 = KerasClassifier(build_fn=create_model_NN_3HiddenLayers,
                           input_dim=input_dim,
                           epochs=15,
                           batch_size=1000,
                           verbose=2)
    clf4 = KerasClassifier(build_fn=create_model_NN_4HiddenLayers,
                           input_dim=input_dim,
                           epochs=15,
                           batch_size=1000,
                           verbose=2)

    # the source listed clf2 twice ([clf2, clf2, clf3, clf4]), which looks
    # like a typo; each base network is included once here
    sclf = StackingClassifier(classifiers=[clf2, clf3, clf4],
                              use_probas=True,
                              average_probas=False,
                              meta_classifier=clf1)

    start = datetime.now()
    print('Fitting %s...' % model_name)
    model = sclf.fit(X_train, y_train)
    end = datetime.now()
    print('Finished fitting {} in {} seconds'.format(model_name,
                                                     str(end - start)))

    return sclf, model
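
The create_model_NN_* builders are defined elsewhere in the source project. A hypothetical sketch of the one-hidden-layer builder, assuming a Keras Sequential model and binary classification (the actual layer sizes and loss are unknown):

from keras.models import Sequential
from keras.layers import Dense, Dropout

def create_model_NN_1HiddenLayer(input_dim=38 * 4, nodes_l1=200, dropout_l1=0.2):
    # Hypothetical stand-in for the project's builder function.
    model = Sequential()
    model.add(Dense(nodes_l1, input_dim=input_dim, activation='relu'))
    model.add(Dropout(dropout_l1))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model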
Code example #9
File: voting_pre.py (project: SunJackson/card_risk)
    def stacking_model2(self, X_train, X_test, y_train, bst_xgb, bst_forest,
                        bst_gradient, bst_lgb):
        '''
        Combine the four algorithms.
        :param X_train: training set
        :param X_test: test set
        :param y_train: training labels
        :param bst_xgb: xgb with its best parameters
        :param bst_forest: forest with its best parameters
        :param bst_gradient: gradient boosting with its best parameters
        :param bst_lgb: lgb with its best parameters
        :return: predicted probabilities
        '''
        lr = linear_model.LogisticRegression(random_state=7)
        sclf = StackingClassifier(
            classifiers=[bst_xgb, bst_forest, bst_gradient, bst_lgb],
            use_probas=True,
            average_probas=False,
            meta_classifier=lr)
        sclf.fit(X_train, y_train)
        # probability of the positive class
        predictions = sclf.predict_proba(X_test)[:, 1]
        return predictions
Code example #10
def test_get_params():
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)

    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
    expect = ['average_probas',
              'classifiers',
              'gaussiannb',
              'kneighborsclassifier',
              'meta-logisticregression',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
              'use_clones',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
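
These flattened parameter names are what make a stacked ensemble tunable end to end. A short grid-search sketch using the same naming scheme (the 'meta-logisticregression' prefix matches the mlxtend version this test targets; the dataset is omitted):

from sklearn.model_selection import GridSearchCV

params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 50],
          'meta-logisticregression__C': [0.1, 10.0]}
grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5)
# grid.fit(X, y)  # X, y: any classification dataset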
Code example #11
def get_model(model_type):
    if model_type == "logistic_regression":
        model = LogisticRegression()  # fast, moderate accuracy; val mean acc: 0.91
    elif model_type == "random_forest":
        model = RandomForestClassifier(
            n_estimators=300)  # decent speed, moderate accuracy; val mean acc: 0.93125
    elif model_type == "decision_tree":
        model = DecisionTreeClassifier()  # fast, low accuracy; val mean acc: 0.62
    elif model_type == "knn":
        model = KNeighborsClassifier()  # moderate speed, low accuracy; val mean acc: 0.675
    elif model_type == "bayes":
        model = MultinomialNB()  # fast, low accuracy; val mean acc: 0.62
    elif model_type == "xgboost":
        model = XGBClassifier()  # slow, high accuracy; val mean acc: 0.95
    elif model_type == "svm":
        model = SVC(kernel='linear',
                    probability=True)  # slow, high accuracy; val mean acc: 0.945
    elif model_type == 'mlp':
        model = MLPClassifier()  # moderate speed, moderate accuracy; val mean acc: 0.89125
    elif model_type == 'ensemble':
        clf1 = LogisticRegression(random_state=0)
        clf2 = XGBClassifier(random_state=0)
        clf3 = SVC(random_state=0, kernel='linear', probability=True)
        clf4 = MLPClassifier(random_state=0)
        model = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3, clf4],
                                       weights=[1, 2, 2, 1],
                                       voting='soft',
                                       verbose=2)
    elif model_type == 'stack':
        clf1 = XGBClassifier(random_state=0)
        clf2 = SVC(random_state=0, kernel='linear', probability=True)
        clf3 = MLPClassifier(random_state=0)
        lr = LogisticRegression()
        model = StackingClassifier(classifiers=[clf1, clf2, clf3],
                                   use_probas=True,
                                   average_probas=False,
                                   meta_classifier=lr)
    else:
        raise ValueError('Unknown model_type: {}'.format(model_type))

    return model
Code example #12
def main():

    data_train, data_test = loadData()
    data_train = brain(data_train)
    data_test = brain(data_test)
    data_train = data_train.drop(['AgePclass'], axis=1)
    data_test = data_test.drop(['AgePclass'], axis=1)
    models = {
        'LogisticReg': LogisticRegression(max_iter=500, tol=0.0001, penalty='l2', solver='lbfgs'),
        'svc': SVC(max_iter=200, kernel='rbf', gamma=0.5, C=5),
        'KNN': KNeighborsClassifier(n_neighbors=9),
        'LinearSvc': LinearSVC(max_iter=250, penalty='l2', C=0.5),
        'decisionTree': DecisionTreeClassifier(max_depth=4),

        'randomTree': RandomForestClassifier(n_estimators=100, max_depth=3, max_features=4,
                                             min_samples_leaf=20, random_state=0),
        'gbdt': GradientBoostingClassifier(n_estimators=500, max_depth=3, learning_rate=0.1, random_state=0),
        'adaboost': AdaBoostClassifier(n_estimators=300, learning_rate=0.75, random_state=0),
        'extract': ExtraTreesClassifier(n_estimators=250, n_jobs=-1, max_depth=5, random_state=0),
        'gnb': GaussianNB(),
    }

    # stacked ensemble (built here but not used in the run below)
    stackModel = StackingClassifier(classifiers=[models['decisionTree'],
                                                 models['gbdt'], models['adaboost'],
                                                 models['extract']],
                                    meta_classifier=models['randomTree'])

    # for key in models:
    #     scores, clf = train(data_train, models[key])
    #
    #     print("model: {0}   scores: {1}".format(key, scores))
    # clf = SVC(max_iter=200, kernel='rbf', gamma=0.5, C=5)

    # parameters = {"n_estimators": [200, 500, 800, 1000], 'min_samples_leaf': [2, 4, 6],
    #               "max_depth": [3, 5, 8], "random_state": [0]}
    # clf = GridSearchCV(models['randomTree'], parameters)
    clf = train(data_train, models['randomTree'])
    test(data_test, clf)
Code example #13
def test_not_fitted():
    np.random.seed(123)
    meta = LogisticRegression()
    clf1 = RandomForestClassifier()
    clf2 = GaussianNB()
    sclf = StackingClassifier(classifiers=[clf1, clf2],
                              use_probas=True,
                              meta_classifier=meta)

    assert_raises(
        NotFittedError, "This StackingClassifier instance is not fitted yet."
        " Call 'fit' with appropriate arguments"
        " before using this method.", sclf.predict, iris.data)

    assert_raises(
        NotFittedError, "This StackingClassifier instance is not fitted yet."
        " Call 'fit' with appropriate arguments"
        " before using this method.", sclf.predict_proba, iris.data)

    assert_raises(
        NotFittedError, "This StackingClassifier instance is not fitted yet."
        " Call 'fit' with appropriate arguments"
        " before using this method.", sclf.predict_meta_features, iris.data)
Code example #14
def stacking():
    from sklearn import datasets
    from sklearn import model_selection
    from sklearn.linear_model import LogisticRegression
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from mlxtend.classifier import StackingClassifier
    #import xgboost as xgb
    import numpy as np

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    clf1 = KNeighborsClassifier(n_neighbors=5)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    clf4 = GradientBoostingClassifier(n_estimators=200, max_depth=6)
    lr = LogisticRegression()
    #sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],meta_classifier=lr)
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              use_probas=True,
                              meta_classifier=lr)

    print('3-fold cross validation:\n')
    for clf, label in zip([clf1, clf2, clf3, clf4, sclf], [
            'KNN', 'Random Forest', 'Naive Bayes',
            "GradientBoostingClassifier", 'StackingClassifier'
    ]):
        scores = model_selection.cross_val_score(clf,
                                                 X,
                                                 y,
                                                 cv=3,
                                                 scoring='accuracy')
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))
Code example #15
    def model_select_param(self, model='LR'):

        print('Model Select Parameters')
        print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        lr = self.model_init(model)
        clf1 = self.model_init('KNN')
        clf2 = self.model_init('RFC')
        clf3 = self.model_init('GNB')
        sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

        params = {'kneighborsclassifier__n_neighbors': [1, 5],
                  'randomforestclassifier__n_estimators': [10, 50],
                  'meta-logisticregression__C': [0.1, 10.0]}

        # the source passed an undefined `param_grid` here; `params` is what was intended
        gs = GridSearchCV(sclf, param_grid=params, cv=5, scoring='roc_auc', n_jobs=5)

        data = self.train.values.copy()
        label = self.train_label['label'].values.copy()

        label = label.reshape(label.shape[0])
        gs.fit(data, label)

        print('Model: {0} '.format(model))

        print(gs.grid_scores_)  # cv_results_ in scikit-learn >= 0.20
        print(gs.best_score_)
        print(gs.best_params_)

        print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        auc_train = self.model_cross_validation(model, gs.best_params_)
        auc_test = self.model_test(model, gs.best_params_)

        return auc_train, auc_test
Code example #16
def run_glass_experiments_ensemble(data):
    glass_X = data.drop(['Id', 'Type'], axis=1)
    glass_y = data.loc[:, 'Type']

    bagging = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                n_estimators=100)
    boosting = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(),
                                  n_estimators=100)
    stacking = StackingClassifier(
        classifiers=[DecisionTreeClassifier() for _ in range(100)],
        meta_classifier=DecisionTreeClassifier())
    random_forest = RandomForestClassifier(n_estimators=100)
    xgboost = xgb.XGBClassifier()
    decision_tree = DecisionTreeClassifier()

    methods = {
        ' Bagging': bagging,
        ' Boosting': boosting,
        ' Stacking': stacking,
        ' Random Forest': random_forest,
        ' XGBoost': xgboost,
        ' Decision Tree': decision_tree
    }

    results = list()
    for method in methods:
        results_model = cross_validate(methods[method],
                                       glass_X,
                                       y=glass_y,
                                       cv=10,
                                       scoring=['accuracy'],
                                       return_train_score=True)
        results_model['method'] = method
        results_model['fold'] = np.arange(1, 11)
        results.append(pd.DataFrame(results_model))
    return pd.concat(results)
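
Because cross_validate is called with scoring=['accuracy'] and return_train_score=True, the concatenated frame carries train_accuracy and test_accuracy columns, so per-method averages can be read off directly (assuming data is the glass dataset with Id and Type columns):

results = run_glass_experiments_ensemble(data)
print(results.groupby('method')[['train_accuracy', 'test_accuracy']].mean())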
Code example #17
def stack_models(features, labels):
    """ stack gb and nb
    """
    # the source used the pre-0.18 cross_validation signature,
    # StratifiedShuffleSplit(labels, 100, ...); model_selection takes n_splits
    kfold = StratifiedShuffleSplit(n_splits=100, random_state=42)
    clf1 = GradientBoostingClassifier(learning_rate=0.1,
                                      min_samples_leaf=1,
                                      n_estimators=400,
                                      min_weight_fraction_leaf=0,
                                      min_samples_split=4,
                                      max_depth=2,
                                      random_state=42)
    #clf2  = AdaBoostClassifier()
    clf3 = GaussianNB()
    lr = LogisticRegression()
    sclf = StackingClassifier(classifiers=[clf1, clf3], meta_classifier=lr)
    for clf, label in zip([clf1, clf3, sclf],
                          ['GB', 'Naive Bayes', 'StackingClassifier']):
        scores = model_selection.cross_val_score(clf,
                                                 features,
                                                 labels,
                                                 cv=kfold,
                                                 scoring='f1')
        # scoring='f1', so label the output accordingly
        print("F1: %0.2f (+/- %0.2f) [%s]" %
              (scores.mean(), scores.std(), label))
Code example #18
def test_get_params():
    np.random.seed(123)
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(n_estimators=10)
    clf3 = GaussianNB()
    lr = LogisticRegression(solver='liblinear',
                            multi_class='ovr')
    sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                              meta_classifier=lr)

    got = sorted(list({s.split('__')[0] for s in sclf.get_params().keys()}))
    expect = ['average_probas',
              'classifiers',
              'drop_last_proba',
              'gaussiannb',
              'kneighborsclassifier',
              'meta_classifier',
              'randomforestclassifier',
              'store_train_meta_features',
              'use_clones',
              'use_features_in_secondary',
              'use_probas',
              'verbose']
    assert got == expect, got
Code example #19
# KNN with bagging
knn = BaggingClassifier(KNeighborsClassifier(),
                        n_estimators=40,
                        max_samples=0.5,
                        max_features=0.5)

knn.fit(X_train, y_train)

knn_predict = knn.predict(data_test)

# Stacked ensemble of random forest, gradient boosting trees and KNN
clf1 = rfc
clf2 = gbc
clf3 = knn
lr = LogisticRegression()
slf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

slf.fit(X_train, y_train)

# Cross-validation (sklearn.cross_validation was renamed model_selection in 0.18)
scores = model_selection.cross_val_score(gbm, X, y, cv=5)
# Test-set scores
dtc_score = dtc.score(X_test, y_test)
rfc_score = rfc.score(X_test, y_test)
#gbc_score = cross_val_score(gbc,X,y,cv=10)
gbc_score = gbc.score(X_test, y_test)
gbm_score = gbm.score(X_test, y_test)

knn_score = knn.score(X_test, y_test)
slf_score = slf.score(X_test, y_test)
# Print the results
Code example #20
File: simple-bagging.py (project: GiXxXx/kaggle)
# lm_clf = BaggingClassifier(
#     LogisticRegression(), max_features=1.0, max_samples=0.2)

knn_clf = BaggingClassifier(
    KNeighborsClassifier(n_jobs=-1), n_estimators=3, max_features=1.0, max_samples=0.3, n_jobs=-1)

# nb_clf = BaggingClassifier(
#     GaussianNB(), n_estimators=3, max_features=1.0, max_samples=0.3, n_jobs=-1)

# svc_clf  = BaggingClassifier(
#     SVC(), n_estimators=3, max_features=1.0, max_samples=0.3, n_jobs=-1)

rf_clf = RandomForestClassifier(n_estimators=110, max_depth=5,
                                min_samples_split=2, min_samples_leaf=1,
                                random_state=34, n_jobs=-1)

st_clf = StackingClassifier(classifiers=[knn_clf, rf_clf], meta_classifier=LogisticRegression(), use_probas=True, average_probas=False)


print('3-fold cross validation:\n')

for clf, label in zip([knn_clf, rf_clf, st_clf],
                      [
                       'KNN',
                       'Random Forest',
                       'StackingClassifier']):

    scores = model_selection.cross_val_score(
        clf, train_data_pca, train_data.values[:, 0], cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# ret = bagging_clf.predict(test_data.values)
Code example #21
predictionsxgb = xgb.predict(X_test)
import operator
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='adam', activation='tanh', random_state=0)
modelmlp = mlp.fit(X_train, y_train)
predictionmlp = mlp.predict(X_test)

#4. Stacked Classifier
X = features1
y = target
clf1 = adb
clf2 = dtc
clf3 = svm1
meta = LogisticRegression()

# the source passed [meta, clf1, clf3] as the base classifiers and clf2 as the
# meta-classifier, which contradicts the labels below; meta is the meta-classifier
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=meta)

for clf, label in zip([clf1, clf2, clf3, sclf], ['clf1', 'clf2', 'clf3', 'SC']):
    scores = model_selection.cross_val_score(clf, X, y, cv=6, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

#5. Data Visualization 
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10,8))
for clf, lab, grd in zip([clf1, clf2, clf3, sclf], 
    ['AdaBoost', 'Decision Tree Classifier', 'Support Vector Machine', 'Stacking Classifier'],itertools.product([0, 1], repeat=2)):

    clf.fit(X, y)
Code example #22
#---------5. Stacking------
from sklearn.datasets import load_iris
iris = load_iris()
X, Y = iris.data[:, 1:3], iris.target
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np
knn_clf = KNeighborsClassifier(n_neighbors=2)
RF_clf = RandomForestClassifier(random_state=1)
baye_clf = GaussianNB()
lr = LogisticRegression()
stack_clf = StackingClassifier(classifiers=[knn_clf, RF_clf, baye_clf], meta_classifier=lr)
print('3-fold cross validation:\n')
# the source looped over lr here, leaving stack_clf unused; the
# "StackingClassifier" label clearly refers to stack_clf
for clf, label in zip([knn_clf, RF_clf, baye_clf, stack_clf],
                      ["KNN", "Random Forest", "Naive Bayes", "StackingClassifier"]):
    scores = cross_val_score(clf, X, Y, cv=3, scoring='accuracy')
    print('Accuracy:%0.2f(+/-%0.2f)[%s]' % (scores.mean(), scores.std(), label))
#---------5. Stacking (cont.)------
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
# make_blobs now lives in sklearn.datasets; parameter details:
# https://blog.csdn.net/kevinelstri/article/details/52622960
from sklearn.datasets import make_blobs

'''Create the training dataset'''
data, target = make_blobs(n_samples=50000, centers=2, random_state=0, cluster_std=0.60)
Code example #23
File: stacking.py (project: LiaoFJ/2020SCcompetition)
# baseline1 (xgboost) and baseline2 (lightgbm) are defined earlier in the source file
baseline3 = RandomForestClassifier(n_estimators=500,
                                   oob_score=True,
                                   n_jobs=1,
                                   random_state=1)
baseline4 = CatBoostClassifier(iterations=500,
                               depth=6,
                               learning_rate=0.033,
                               loss_function='Logloss',
                               logging_level='Verbose')
baseline5 = AdaBoostClassifier()
baseline6 = GaussianNB()
baseline7 = SVC(kernel='rbf', class_weight='balanced')
lr = XGBClassifier()

stackmodel = StackingClassifier(classifiers=[
    baseline1, baseline2, baseline3, baseline4, baseline5, baseline6, baseline7
],
                                meta_classifier=lr)

#%%
for basemodel, label in zip([
        baseline1, baseline2, baseline3, baseline4, baseline5, baseline6,
        baseline7, stackmodel
], [
        'xgboost', 'lightgbm', 'Random Forest', 'Catboost', 'AdaBoost',
        'GaussianNB', 'SVC', 'stack'
]):

    scores = model_selection.cross_val_score(basemodel,
                                             train,
                                             target,
                                             cv=5)
    # (the call is closed minimally here; the original snippet is truncated)
Code example #24
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_soft_cv['test_score'].std()*100*3))
print('-'*10)z

"""## Stacking

#### ML Extend
"""

from mlxtend.classifier import StackingClassifier
lgbm_cl = LGBMClassifier(random_state=seed)
rf_cl = RandomForestClassifier(10, random_state=seed)
gdb_cl = GradientBoostingClassifier(random_state=seed)
logreg = LogisticRegression()
sclf = StackingClassifier(classifiers=[lgbm_cl, rf_cl,gdb_cl],
                          meta_classifier=logreg)


scores = model_selection.cross_val_score(sclf, data1_x_bin, data[Target],
                                         cv=3, scoring='f1')
# scoring='f1', so report F1 rather than accuracy
print("F1: %0.2f (+/- %0.2f)"
      % (scores.mean(), scores.std()))

"""#### VecStack"""

#1st level model
X_train, X_test, y_train, y_test = train_test_split(data1_x_bin, data[Target], test_size=0.2)

models = [lgbm_cl,rf_cl,gdb_cl]
S_train, S_test = stacking(models, X_train, y_train, X_test,
    regression=False, metric=metrics.f1_score, n_folds=4)
# (the call is closed minimally here; the original snippet is truncated)
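
The source snippet stops mid-call; the usual next step with vecstack, sketched here on the assumption that a logistic regression serves as the second-level model, is to fit on the out-of-fold meta-features S_train and evaluate on S_test:

from sklearn.linear_model import LogisticRegression

meta_model = LogisticRegression()
meta_model.fit(S_train, y_train)
y_pred = meta_model.predict(S_test)
print('Stacked F1: {:.4f}'.format(metrics.f1_score(y_test, y_pred)))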
Code example #25

#========================Adaboost END=========================



#========================Stacking=========================
model_log = LogisticRegression(penalty='l2',
                         C=10,
                         multi_class='multinomial',
                         class_weight='balanced',
                         solver='newton-cg',
                         )

model_sta = StackingClassifier(
        classifiers=[model_RF, model_SVM, model_Adb],
        meta_classifier=model_log,
        )

model_sta.fit(X_train, y_train)

y_sta_pred = model_sta.predict(X_test)

print('The sta accuracy is:', accuracy_score(y_test, y_sta_pred))
print('The sta precision is:', metrics.precision_score(y_test, y_sta_pred, average='macro'))
print('The sta recall is:', metrics.recall_score(y_test, y_sta_pred, average='macro'))
print('The sta f1 score is:', metrics.f1_score(y_test, y_sta_pred, average='macro'))

Code example #26
File: skystacking.py (project: maki-d-wni/skynet)
def main():
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import f1_score
    from skynet import DATA_PATH
    from skynet.data_handling import read_learning_data
    from skynet.data_handling.preprocessing import PreProcessor
    from skynet.data_handling import get_init_response
    from skynet.data_handling import split_time_series
    from mlxtend.classifier import StackingClassifier

    icao = "RJFK"

    train = read_learning_data(DATA_PATH +
                               "/pickle/learning/skynet/train_%s.pkl" % icao)
    test = read_learning_data(DATA_PATH +
                              "/pickle/learning/skynet/test_%s.pkl" % icao)

    data = pd.concat([train, test]).reset_index(drop=True)

    preprocess = PreProcessor(norm=False, binary=False)
    preprocess.fit(data.iloc[:, :-1], data.iloc[:, -1])

    data = pd.concat([preprocess.X_train, preprocess.y_train], axis=1)
    date = data["date"].values
    spdata = split_time_series(data, date, level="month", period=2)

    for key in spdata:
        ext = spdata[key]

        target = get_init_response()
        feats = [f for f in ext.keys() if not (f in target + ["date"])]

        X = ext[feats]
        ss = StandardScaler()
        X = pd.DataFrame(ss.fit_transform(X), columns=X.keys())
        y = ext[target]
        X, y = balanced(X, y)

        spX, spy = preprocess.split(X, y, n_folds=5)
        for k in spy:
            print(np.unique(spy[k]))

        X = pd.concat([spX[n] for n in spX if n != 0]).reset_index(drop=True)
        y = pd.concat([spy[n] for n in spy if n != 0]).reset_index(drop=True)

        X_test = spX[0].reset_index(drop=True)
        y_test = spy[0].reset_index(drop=True)

        from sklearn.ensemble import RandomForestClassifier
        clf1 = RandomForestClassifier(max_features=2)
        clf2 = SkySVM()
        meta = LogisticRegression()

        # training
        # (note: the data has not been balanced here)
        sta = SkyStacking((clf1, clf2), meta)
        sta.fit(X, y)
        p = sta.predict(X_test)

        clf1.fit(X.values, y.values[:, 0])
        print(np.array(X.keys())[np.argsort(clf1.feature_importances_)[::-1]])
        p_rf = clf1.predict(X_test.values)

        # stacking with mlxtend
        sc = StackingClassifier(classifiers=[clf1, clf2], meta_classifier=meta)
        sc.fit(X.values, y.values[:, 0])
        p_sc = sc.predict(X_test.values)

        y_test = np.where(y_test.values[:, 0] > 1, 0, 1)
        p = np.where(p > 1, 0, 1)
        p_rf = np.where(p_rf > 1, 0, 1)
        p_sc = np.where(p_sc > 1, 0, 1)

        f1 = f1_score(y_true=y_test, y_pred=p)
        print("stacking", f1)

        f1_rf = f1_score(y_true=y_test, y_pred=p_rf)
        print("random forest", f1_rf)

        f1_sc = f1_score(y_true=y_test, y_pred=p_sc)
        print("stacked classifier", f1_sc)

        if True:
            break
Code example #27
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

print('3-fold cross validation:\n')

for clf, label in zip(
    [clf1, clf2, clf3, sclf],
    ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):

    scores = model_selection.cross_val_score(clf,
                                             X,
                                             y,
                                             cv=3,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" %
          (scores.mean(), scores.std(), label))
Code example #28
def get_pipeline(classifier_name='BDT'):
    """ Function to get classifier pipeline.
    """
    steps = []
    if classifier_name == 'RF':
        classifier = RandomForestClassifier(
            n_estimators=100,
            max_depth=6,
            n_jobs=20,
            # n_estimators=100, max_depth=7, min_samples_leaf=150, n_jobs=20,
            random_state=2)
        # the source never appended this classifier to steps, so Pipeline(steps)
        # would be empty for this branch; register it like the other branches
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost':
        classifier = XGBClassifier(n_estimators=125,
                                   nthread=10,
                                   silent=True,
                                   seed=2)
        steps.append(('classifier', classifier))  # register the step (missing in the source)
    elif classifier_name == 'Ada':
        classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5),
                                        n_estimators=100,
                                        learning_rate=0.1,
                                        random_state=2)
        steps.append(('classifier', classifier))  # register the step (missing in the source)
        # classifier = AdaBoostClassifier(n_estimators=50, learning_rate=0.1, random_state=2)
    # elif classifier_name in ['GBDT', 'BDT']:
    #     classifier = GradientBoostingClassifier(
    #         loss='exponential', max_depth=3, n_estimators=100, random_state=2)
    #     # classifier = GradientBoostingClassifier(loss='deviance', max_depth=3,
    #     #     n_estimators=500, random_state=2)

    elif classifier_name == 'BDT_comp_IC79.2010':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC79.2010_2-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))

    elif classifier_name == 'BDT_comp_IC86.2012_2-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=4,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_3-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=3,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'BDT_comp_IC86.2012_4-groups':
        classifier = GradientBoostingClassifier(loss='deviance',
                                                max_depth=2,
                                                n_estimators=100,
                                                random_state=2)
        steps.append(('classifier', classifier))
    elif 'CustomClassifier' in classifier_name:
        hyperparams_str = classifier_name.split('_')[1:]
        assert len(hyperparams_str) == 3, \
            'Expected 3 CustomClassifier hyperparams, got {}.'.format(
                len(hyperparams_str))
        p = float(hyperparams_str[0])
        neighbor_weight = float(hyperparams_str[1])
        num_groups = int(hyperparams_str[2])
        classifier = CustomClassifier(p=p,
                                      neighbor_weight=neighbor_weight,
                                      num_groups=num_groups,
                                      random_state=2)
        steps.append(('classifier', classifier))

    elif classifier_name == 'RF_comp_IC86.2012_4-groups':
        classifier = RandomForestClassifier(max_depth=10,
                                            n_estimators=500,
                                            random_state=2,
                                            n_jobs=10)
        steps.append(('classifier', classifier))

    elif classifier_name == 'SVC_comp_IC86.2012_2-groups':
        classifier = SVC(C=0.5, random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'SVC_comp_IC86.2012_4-groups':
        classifier = SVC(C=0.5, random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))

    elif classifier_name == 'LinearSVC_comp_IC86.2012_2-groups':
        classifier = LinearSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'LinearSVC_comp_IC86.2012_4-groups':
        classifier = LinearSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))

    elif classifier_name == 'NuSVC_comp_IC86.2012_4-groups':
        classifier = NuSVC(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))

    elif classifier_name == 'xgboost_comp_IC86.2012_2-groups':
        classifier = XGBClassifier(
            learning_rate=0.05,
            max_depth=7,
            n_estimators=150,
            # subsample=0.75,
            random_state=2)
        steps.append(('classifier', classifier))

    elif classifier_name == 'xgboost_comp_IC86.2012_4-groups':
        classifier = XGBClassifier(
            max_depth=2,
            n_estimators=100,
            # subsample=0.75,
            random_state=2)
        steps.append(('classifier', classifier))

    elif classifier_name == 'LogisticRegression_comp_IC86.2012_4-groups':
        classifier = LogisticRegression(random_state=2)
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))

    elif classifier_name == 'linecut_comp_IC86.2012_4-groups':
        classifier = LineCutClassifier()
        steps.append(('classifier', classifier))

    elif classifier_name == 'stacking_comp_IC86.2012_4-groups':
        classifiers = [
            SVC(random_state=2),
            LinearSVC(random_state=2),
            GradientBoostingClassifier(loss='deviance',
                                       max_depth=2,
                                       n_estimators=100,
                                       random_state=2),
        ]
        classifier = StackingClassifier(classifiers,
                                        meta_classifier=LogisticRegression())
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))
    elif classifier_name == 'voting_comp_IC86.2012_4-groups':
        # classifiers = [SVC(random_state=2),
        #                LinearSVC(random_state=2),
        #                GradientBoostingClassifier(loss='deviance',
        #                                           max_depth=2,
        #                                           n_estimators=100,
        #                                           random_state=2),
        #                                           ]

        estimators = [
            ('SVC', SVC(random_state=2)),
            # ('LinearSVC', LinearSVC(random_state=2)),
            ('LogisticRegression', LogisticRegression(random_state=2)),
            # ('BDT', GradientBoostingClassifier(loss='deviance',
            #                                    max_depth=2,
            #                                    n_estimators=100,
            #                                    random_state=2)),
            ('xgboost',
             XGBClassifier(max_depth=3,
                           booster='gblinear',
                           n_estimators=100,
                           random_state=2))
        ]
        classifier = VotingClassifier(estimators, voting='hard')
        steps.append(('scaler', StandardScaler()))
        steps.append(('classifier', classifier))

    elif classifier_name == 'RF_energy_IC79.2010':
        classifier = RandomForestRegressor(n_estimators=100,
                                           max_depth=8,
                                           n_jobs=10,
                                           random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'RF_energy_IC86.2012':
        classifier = RandomForestRegressor(n_estimators=100,
                                           max_depth=7,
                                           n_jobs=10,
                                           random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'xgboost_energy_IC86.2012':
        classifier = XGBRegressor(
            n_estimators=75,
            booster='gblinear',
            # subsample=0.75,
            random_state=2)
        steps.append(('classifier', classifier))
    elif classifier_name == 'linearregression_energy_IC86.2012':
        reg = make_pipeline(
            PolynomialFeatures(2),
            # StandardScaler(),
            LinearRegression(),
        )
        return reg
    elif classifier_name == 'SGD_comp_IC86.2012_2-groups':
        # clf = make_pipeline(StandardScaler(),
        #                     SGDClassifier(random_state=2, n_jobs=1),
        #                     )
        clf = make_pipeline(
            StandardScaler(),
            SGDClassifier(loss='hinge',
                          alpha=1e-3,
                          max_iter=50,
                          tol=1e-3,
                          shuffle=True,
                          random_state=2),
        )
        return clf
    else:
        raise ValueError(
            '{} is not a valid classifier name'.format(classifier_name))

    # pipeline = Pipeline([
    #     # ('scaler', StandardScaler()),
    #     # ('pca', PCA(n_components=4, random_state=2)),
    #     # ('lda', LinearDiscriminantAnalysis(n_discriminants=6)),
    #     ('classifier', classifier)])
    pipeline = Pipeline(steps)

    return pipeline
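
A usage sketch for this factory; the returned pipeline behaves like any scikit-learn estimator (X_train, y_train and X_test are placeholders for the project's data):

pipeline = get_pipeline('stacking_comp_IC86.2012_4-groups')
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)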
Code example #29
    zero_one_losses_list = []

    print('-' * 105)
    print('Bagging Classifier:')
    clf = BaggingClassifier(base_estimator=LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True),
                            n_estimators=3)
    clf.fit(train_features, train_labels)
    pred = clf.predict(test_features)
    print("F1 Score:")
    print(precision_recall_fscore_support(test_labels, pred, average='micro')[2])
    fpr, tpr, thresholds = roc_curve(test_labels, pred)
    print("Area Under Receiver Operating Characteristic Curve (ROC):")
    print(auc(fpr, tpr))

    print('\nStacking Classifier:')
    clf = StackingClassifier(
        classifiers=[LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True),
                     LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True),
                     LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True)],
        meta_classifier=LogisticRegression())
    clf.fit(train_features, train_labels)
    pred = clf.predict(test_features)
    print("F1 Score:")
    print(precision_recall_fscore_support(test_labels, pred, average='micro')[2])
    fpr, tpr, thresholds = roc_curve(test_labels, pred, pos_label=1)
    print("Area Under Receiver Operating Characteristic Curve (ROC):")
    print(auc(fpr, tpr))

    print('-' * 105)

    print("Classifiers Used : AdaBoost, RandomForest, Gradient Booster, Logistic Regression")
    print("\nTrain Start Time : {}".format(time.time()))
    clf_AdaBoost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight={0: 1, 1: 4}), n_estimators=5)
    clf_RandomForest = RandomForestClassifier(class_weight={0: 1, 1: 11})
    clf_LogisticReg = LogisticRegression(class_weight={0: 1, 1: 9}, warm_start=True)
    # the ensemble built from the three classifiers above is elided in the source;
    # `clf` still refers to the stacking classifier fitted earlier, and the
    # original printed an undefined `label`, replaced with a literal here
    scores = cross_val_score(clf,
                             X_train,
                             Y_train,
                             cv=5,
                             scoring='accuracy',
                             pre_dispatch=4)
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" %
          (scores.mean(), scores.std(), 'stacking'))

# Let's try another strategy: Stacking


lgr = LogisticRegression()
sclf = StackingClassifier(
    classifiers=[RFC_best, logitR, LDA, GBC_best, votingC],
    meta_classifier=lgr)

print('5-fold cross validation:\n')

for clf, label in zip([RFC_best, logitR, LDA, GBC_best, votingC, sclf],
                      ['rfc', 'logitR', 'lda', 'gbc', 'voting', 'stacking']):

    scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" %
          (scores.mean(), scores.std(), label))

# If we ran an ANOVA we couldn't say one model predicts better than another, so let's take the voting classifier as the final model...

# ### 6.3 Prediction
# #### 6.3.1 Predict and Submit results