Example #1
def ada_boost_classifier(X_train_res, X_test, y_train_res, y_test):
    # dt_clf is a module-level DecisionTreeClassifier used as the base estimator
    clf = AdaBoostClassifier(
        base_estimator=dt_clf)  # instance of AdaBoost classifier
    clf.set_params(base_estimator__criterion='gini',
                   base_estimator__splitter='best',
                   n_estimators=100)  # tuned AdaBoost
    ada_clf = clf.fit(X_train_res,
                      y_train_res)  # fit model on the resampled training data
    ada_predict = ada_clf.predict(X_test)  # predict on test data
    ada_acc = accuracy_score(y_test, ada_predict)  # test accuracy
    ada_kappa = cohen_kappa_score(
        y_test, ada_predict)  # Cohen's kappa on the test predictions
    accuracy = cross_val_score(clf,
                               X_train_res,
                               y_train_res,
                               cv=10,
                               scoring='accuracy')  # 10-fold accuracy score
    f_score = cross_val_score(clf,
                              X_train_res,
                              y_train_res,
                              cv=10,
                              scoring='f1_micro')  # 10-fold micro-averaged F1
    ada_accuracy, ada_f_score = accuracy.mean(), f_score.mean(
    )  # mean CV accuracy and F1
    return ada_accuracy, ada_f_score, ada_clf, ada_kappa
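The base_estimator__<param> names used above come straight from get_params(); a minimal sketch (assuming a scikit-learn version that still accepts base_estimator, as these examples do; newer releases rename it to estimator):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
# get_params() lists every valid name, including nested ones such as
# base_estimator__criterion and base_estimator__max_depth
print(sorted(clf.get_params().keys()))
clf.set_params(base_estimator__max_depth=2, n_estimators=50)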
Example #2
def load_architecture():

    ada_params_filename = logger.config_dict['BEST_ADA_L']
    logger.log(
        "Loading params for ADA from {} ...".format(ada_params_filename))
    with open(logger.get_model_file(ada_params_filename, "large")) as fp:
        ada_best_params = json.load(fp)

    ada_model = AdaBoostClassifier(DecisionTreeClassifier())
    ada_model.set_params(**ada_best_params)

    xgb_params_filename = logger.config_dict['BEST_XGB_L']
    logger.log(
        "Loading params for XGB from {} ...".format(xgb_params_filename))
    with open(logger.get_model_file(xgb_params_filename, "large")) as fp:
        xgb_best_params = json.load(fp)

    xgb_model = XGBClassifier()
    xgb_model.set_params(**xgb_best_params)

    ensemble_weights = [0.5, 0.5]

    comb_model = VotingClassifier(estimators=[('ADA', ada_model),
                                              ('XGB', xgb_model)],
                                  voting='soft',
                                  weights=ensemble_weights,
                                  n_jobs=-1)

    logger.log("Finish loading best architecture {}".format(comb_model))

    return comb_model
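A params file like the ones loaded above could plausibly be produced by dumping a grid search's best parameters to JSON; a self-contained sketch (the file name and search grid are assumptions, not from the source):

import json

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(random_state=0)
search = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier()),
                      {'n_estimators': [50, 100],
                       'base_estimator__max_depth': [1, 3]},
                      cv=3)
search.fit(X, y)
with open('ada_best_params.json', 'w') as fp:
    json.dump(search.best_params_, fp)  # later restored via set_params(**...)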
Example #3
def heart(dataType):
    title = '{0} Ada Boost'.format(dataType)
    package = data.createData(dataType)

    xTrain = package.xTrain
    xTest = package.xTest
    yTrain = package.yTrain
    yTest = package.yTest

    param_range = list(range(1, 160, 10))
    param = 'n_estimators'

    params = {'algorithm': 'SAMME.R'}
    clf = AdaBoostClassifier()
    clf.set_params(**params)

    plotter.plotValidationCurve(clf,
                                xTrain,
                                yTrain,
                                param,
                                param_range,
                                graphTitle=title)
    plotter.plotLearningCurve(clf, title=title, xTrain=xTrain, yTrain=yTrain)
    title = 'Heart'
    clf.fit(xTrain, yTrain)
    plotter.plotConfusion(clf, title,
                          ['Diameter narrowing ', 'Diameter not narrowing'],
                          xTest, yTest)
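plotter is a project-local helper not shown here; a minimal sketch of what plotValidationCurve presumably wraps, using sklearn directly (reusing clf, xTrain, yTrain, param and param_range from above):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve

# score the classifier across the n_estimators range with 5-fold CV
train_scores, test_scores = validation_curve(
    clf, xTrain, yTrain, param_name=param, param_range=param_range, cv=5)
plt.plot(param_range, np.mean(train_scores, axis=1), label='train')
plt.plot(param_range, np.mean(test_scores, axis=1), label='cross-validation')
plt.xlabel(param)
plt.ylabel('score')
plt.legend()
plt.show()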
Example #4
class CustomEstimator(BaseEstimator):

    def __init__(self, C=1, penalty='l2'):
        self.C = C
        self.penalty = penalty
        # AdaBoost itself accepts neither C nor penalty; assuming they are
        # meant for a linear base estimator, they are routed to it via the
        # base_estimator__ prefix (the original set_params call would raise)
        self._model = AdaBoost(base_estimator=LogisticRegression(),
                               n_estimators=50)
        self._model.set_params(base_estimator__C=C,
                               base_estimator__penalty=penalty)

    def transform(self, X, y=None):
        return self.score(X)

    def predict(self, X):
        return self._model.predict(X)

    def score(self, X, y=None):
        global global_X, global_y, oversampled_global_X, oversampled_global_y, global_i, global_indices

        temp_X = global_X.copy()
        temp_X.drop('target', axis=1, inplace=True)
        # .ix was removed from pandas; KFold indices are positional, so use .iloc
        score = self._model.score(temp_X.iloc[global_indices[global_i][1]],
                                  global_y.iloc[global_indices[global_i][1]])

        if global_i == 4:
            global_i = 0
        else:
            global_i += 1  # advance to the next CV fold

        return score

    def fit(self, X, y=None):
        self._model.fit(oversampled_global_X, oversampled_global_y)
        return self
Example #5
def predict_classifier(name_dataset, name_train, classifier, name_test,
                       metric):
    """Run classifier"""
    if classifier == "ada_boost":
        estimator = AdaBoostClassifier(random_state=42,
                                       base_estimator=ComplementNB(alpha=0.01))
        #estimator = AdaBoostClassifier(random_state=42, base_estimator= LogisticRegression(C= 50, max_iter= 100))

    elif classifier == "extra_tree":
        estimator = ExtraTreesClassifier(random_state=SEED)

    elif classifier == "knn":
        estimator = KNeighborsClassifier()

    elif classifier == "logistic_regression":
        estimator = LogisticRegression(random_state=SEED)

    elif classifier == "naive_bayes":
        estimator = MultinomialNB()

    elif classifier == "naive_bayes_complement":
        estimator = ComplementNB()

    elif classifier == "passive_aggressive":
        estimator = PassiveAggressiveClassifier(random_state=SEED,
                                                max_iter=1000)

    elif classifier == "random_forest":
        estimator = RandomForestClassifier(random_state=SEED)

    elif classifier == "sgd":
        estimator = SGDClassifier(random_state=SEED, max_iter=1000)

    elif classifier == "svm":
        estimator = svm.LinearSVC(random_state=SEED, max_iter=1000)

    else:
        # avoid an unbound `estimator` further down
        raise ValueError("unknown classifier: " + classifier)

    x_train, y_train, x_test, y_test = load_svmlight_files(
        [open(name_train, 'rb'), open(name_test, 'rb')])

    load_estimator = False
    if load_estimator:
        estimator = joblib.load("escores/grid_" + name_dataset + "_" +
                                classifier)  # load the saved estimator
    else:
        if not (len(classifier.split(",")) > 1):
            escores = cv.load_escores(name_dataset, classifier,
                                      1)  # test score 0
            best_param_folds = cv.best_param_folds_no_frequency(
                escores, 0, metric)  # best score per fold
            estimator.set_params(**best_param_folds)
        estimator.fit(x_train, y_train)

    y_pred = estimator.predict(x_test)
    cv.save_dict_list([y_test], [y_pred],
                      'y_pred/' + name_dataset + "_" + classifier + "_" +
                      metric + "_" + cv.name_file(name_test))
Example #6
class MyAdaboost(MyModel):

    def __init__(self):
        super().__init__()
        self.name = "Adaboost"
        self.is_ensemble = True
        self.chinese_name = "自适应提升算法"
        self.english_name = "Adaptive Boosting"
        self.model = AdaBoostClassifier()
        # fix the random seed of each model
        self.model.set_params(**{'random_state': self.random_state})
Example #7
def post_pruning_boosting_tree_performance():
    pruning_tree = DecisionTreeClassifier(ccp_alpha=0.015)
    num_trees_list = [i + 1 for i in range(20)]
    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    acc_train_cancer_list = []
    acc_test_cancer_list = []
    boost_classifier = AdaBoostClassifier(pruning_tree, n_estimators=1)
    for num_trees in num_trees_list:
        boost_classifier.set_params(n_estimators=num_trees)
        boost_classifier.fit(train_features_cancer, train_labels_cancer)
        acc_train_cancer = boost_classifier.score(train_features_cancer,
                                                  train_labels_cancer)
        acc_train_cancer_list.append(acc_train_cancer)
        acc_test_cancer = boost_classifier.score(test_features_cancer,
                                                 test_labels_cancer)
        acc_test_cancer_list.append(acc_test_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    acc_train_spam_list = []
    acc_test_spam_list = []
    for num_trees in num_trees_list:
        boost_classifier.set_params(base_estimator__ccp_alpha=0.005,
                                    n_estimators=num_trees)
        boost_classifier.fit(train_features_spam, train_labels_spam)
        acc_train_spam = boost_classifier.score(train_features_spam,
                                                train_labels_spam)
        acc_train_spam_list.append(acc_train_spam)
        acc_test_spam = boost_classifier.score(test_features_spam,
                                               test_labels_spam)
        acc_test_spam_list.append(acc_test_spam)
    plt.figure(figsize=(10, 6))
    plt.subplot(121)
    plt.plot(num_trees_list, acc_train_cancer_list, label='train')
    plt.plot(num_trees_list, acc_test_cancer_list, label='test')
    plt.xlabel('num of trees')
    plt.ylabel('accuracy')
    plt.title(
        'post-pruning boosting cancer classifier \nperformance vs number of boosting trees'
    )
    plt.legend(loc='upper right')
    plt.subplot(122)
    plt.plot(num_trees_list, acc_train_spam_list, label='train')
    plt.plot(num_trees_list, acc_test_spam_list, label='test')
    plt.xlabel('num of trees')
    plt.ylabel('accuracy')
    plt.title(
        'post-pruning boosting spam classifier \nperformance vs number of boosting trees'
    )
    plt.legend(loc='upper right')
    plt.show()
Example #8
 def test_evaluation_function(self):
     X = []
     # pre-train chosen for test data
     if self.pre_train_chosen == 'bag-of-words':
         X = self.read_data_bag_of_words_function(self.datafile_name_test)
     elif self.pre_train_chosen == 'word-embedding':
         X = self.read_data_embedding_function(self.datafile_name_test)
     # the SVM test model is very slow with word-embedding features;
     # the user should pick another model or pre-training method
     if self.model_chosen == 'SVM':
         if self.pre_train_chosen == 'word-embedding':
             print(
                 "SVM with word-embedding features is too slow; please choose another model"
             )
         else:
             optimal_svm = SVC()
             optimal_svm.set_params(**self.hyper_para)
             optimal_svm.fit(self.X, self.y)
             y_pred = optimal_svm.predict(X)
             print(y_pred)
     # test model is adaboost
     elif self.model_chosen == 'adaboost':
         DTC = tree.DecisionTreeClassifier(random_state=11,
                                           max_features="auto",
                                           max_depth=None)
         optimal_abc = AdaBoostClassifier(base_estimator=DTC)
         optimal_abc.set_params(**self.hyper_para)
         optimal_abc.fit(self.X, self.y)
         y_pred = optimal_abc.predict(X)
         print(y_pred)
     # test model is Logistic Regression
     elif self.model_chosen == 'Logistic Regression':
         optimal_logreg = LogisticRegression()
         optimal_logreg.set_params(**self.hyper_para)
         optimal_logreg.fit(self.X, self.y)
         y_pred = optimal_logreg.predict(X)
         print(y_pred)
         numpy.savetxt('predicted-labels', y_pred, fmt='%d', delimiter=',')
     # test model is Naive Bayes; word-embedding features are continuous,
     # which multinomial Naive Bayes does not support, so change the
     # model or the pre-training method
     elif self.model_chosen == 'Naive Bayes':
         if self.pre_train_chosen == 'word-embedding':
             print(
                 "word-embedding features are continuous, so they do not suit a multinomial Naive Bayes classifier"
             )
         else:
             optimal_nb = MultinomialNB()
             optimal_nb.set_params(**self.hyper_para)
             optimal_nb.fit(self.X, self.y)
             y_pred = optimal_nb.predict(X)
             print(y_pred)
Example #9
def train_and_save_final_model(X, y, X_train, y_train, params,
                               save_model_file_path, test_data):
    adbc = AdaBoostClassifier(random_state=0)
    adbc.set_params(**params)

    if test_data is None:
        adbc.fit(X_train, y_train)
    else:
        adbc.fit(X, y)

    #save model
    model_file_path = save_model_file_path + 'adbc.sav'
    pickle.dump(adbc, open(model_file_path, 'wb'))
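The pickled model can be restored later with pickle.load; a minimal sketch (the path is assumed to match the one written above):

import pickle

with open(save_model_file_path + 'adbc.sav', 'rb') as f:
    adbc_loaded = pickle.load(f)  # ready for adbc_loaded.predict(...)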
Example #10
def getAdaBoostBDTClassifier(options={}):
    """the standard BDT classifer based on AdaBoost"""

    dt = DecisionTreeClassifier(criterion="gini",
                                max_depth=5,
                                min_samples_leaf=0.05,
                                random_state=0)
    bdt = AdaBoostClassifier(dt,
                             n_estimators=200,
                             learning_rate=0.13,
                             algorithm='SAMME',
                             random_state=0)
    bdt.set_params(**options)
    return bdt
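With set_params(**options) the factory's defaults can be overridden per call; a hypothetical usage sketch:

# Hypothetical usage of the factory above; the keys must be valid
# AdaBoostClassifier parameters.
bdt_default = getAdaBoostBDTClassifier()
bdt_small = getAdaBoostBDTClassifier({'n_estimators': 50, 'learning_rate': 0.5})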
Example #11
    def classifier(self, scoring, cv, eval_using):
        
        adaclf = AdaBoostClassifier(algorithm='SAMME')
        xtr = StandardScaler().fit_transform(self.xtr)
        xte = StandardScaler().fit_transform(self.xte)
        
        # iterate over each grid score for param tuner
        for score in scoring:
            
            print('Tuning parameters of initial classifiers...')
            passive_params = param_tuner(PassiveAggressiveClassifier(), 
                                         score=score, cv=cv, xtr=xtr, 
                                         ytr=self.ytr)
            passclf = PassiveAggressiveClassifier().set_params(**passive_params)  
            sgd_params = param_tuner(SGDClassifier(), score=score, cv=cv,
                                     xtr=xtr, ytr=self.ytr)
            sgdclf = SGDClassifier().set_params(**sgd_params)
            
            # can't use resampling/bagging with the passive aggressive
            # classifier: it raises "ValueError: The number of class labels
            # must be > 1" since resampling may yield single-class training sets.
            
            print('\n'+'Tuning meta-classifiers with tuned classifier/s...') 
            bagsgd_params = param_tuner(BaggingClassifier(sgdclf), 
                                         score=score, cv=cv, xtr=xtr, 
                                         ytr=self.ytr)
            bg_sgdclf = BaggingClassifier(sgdclf).set_params(**bagsgd_params)
            
            adasgd_params = param_tuner(adaclf.set_params(base_estimator=sgdclf),
                                        score=score, cv=cv, xtr=xtr,
                                        ytr=self.ytr)
            ada_sgdclf = adaclf.set_params(**adasgd_params)
            
            print('Voting on meta-classifiers/classifiers then predicting...')
            vote = VotingClassifier(estimators=[('BagSGD', bg_sgdclf),
                                                ('adaboostSGD', ada_sgdclf),
                                                ('Passive', passclf)],
                                    voting='hard').fit(xtr, self.ytr)

            start = time.time()
            y_true, y_pred = self.yte, vote.predict(xte)
            print('\n' + '-'*5, 'FINAL PREDICTION RESULTS','-'*5 +'\n', 
                  '{0:.4f}'.format(time.time()-start)+'--prediction time(secs)')
                  
            clf_evaluation = report(*eval_using, y_true=y_true, y_pred=y_pred)
            for reports in clf_evaluation:
                print('---',reports)
                print(clf_evaluation[reports])
Example #12
def accuracy_vs_num_tree():
    max_depth_tree = DecisionTreeClassifier(max_depth=3)
    num_trees_list = [i + 1 for i in range(100)]
    train_features_cancer, train_labels_cancer, test_features_cancer, test_labels_cancer = split_train_test_breast_cancer(
    )
    acc_train_cancer_list = []
    acc_test_cancer_list = []
    boost_classifier = AdaBoostClassifier(max_depth_tree, n_estimators=1)
    for num_trees in num_trees_list:
        boost_classifier.set_params(n_estimators=num_trees)
        boost_classifier.fit(train_features_cancer, train_labels_cancer)
        acc_train_cancer = boost_classifier.score(train_features_cancer,
                                                  train_labels_cancer)
        acc_train_cancer_list.append(acc_train_cancer)
        acc_test_cancer = boost_classifier.score(test_features_cancer,
                                                 test_labels_cancer)
        acc_test_cancer_list.append(acc_test_cancer)

    train_features_spam, train_labels_spam, test_features_spam, test_labels_spam = split_train_test_spam(
    )
    acc_train_spam_list = []
    acc_test_spam_list = []
    for num_trees in num_trees_list:
        boost_classifier.set_params(n_estimators=num_trees)
        boost_classifier.fit(train_features_spam, train_labels_spam)
        acc_train_spam = boost_classifier.score(train_features_spam,
                                                train_labels_spam)
        acc_train_spam_list.append(acc_train_spam)
        acc_test_spam = boost_classifier.score(test_features_spam,
                                               test_labels_spam)
        acc_test_spam_list.append(acc_test_spam)
    plt.figure(figsize=(10, 6))
    plt.subplot(121)
    plt.plot(num_trees_list, acc_train_cancer_list, label='train')
    plt.plot(num_trees_list, acc_test_cancer_list, label='test')
    plt.xlabel('num of trees')
    plt.ylabel('accuracy')
    plt.title('cancer accuracy vs number of boosting trees')
    plt.legend(loc='upper right')
    plt.subplot(122)
    plt.plot(num_trees_list, acc_train_spam_list, label='train')
    plt.plot(num_trees_list, acc_test_spam_list, label='test')
    plt.xlabel('num of trees')
    plt.ylabel('accuracy')
    plt.title('spam accuracy vs number of boosting trees')
    plt.legend(loc='upper right')
    plt.show()
Example #13
	def adbTuning(self, pX, change = 3):
		n = pX.shape[0]
		adb = AdaBoostClassifier()
		best_auc = 0
		best_param = None
		for i in range(change):
			params = {
				'n_estimators': 3+int(10*np.random.random()),
				'random_state':2016
			}
			adb.set_params(**params)
			auc = cross_val_score(adb, pX, self.y, scoring="roc_auc").mean()
			if auc > best_auc:
				best_auc = auc
				best_param = params
		print('adaboost ' + str(best_auc))
		return best_auc, AdaBoostClassifier(**best_param)
Example #14
    def _get_model(self, problem_transform=ClassifierChain):

        self._load_models_hyperparams()

        adaboost_model = AdaBoostClassifier(DecisionTreeClassifier())
        adaboost_model.set_params(**self.adab_hyperparams)
        randf_model = RandomForestClassifier()
        randf_model.set_params(**self.randf_hyperparams)

        ensemble_model = problem_transform(
            VotingClassifier(estimators=[('ADA', adaboost_model),
                                         ('RANDF', randf_model)],
                             voting='soft',
                             weights=[0.45, 0.55],
                             n_jobs=-1))

        return ensemble_model
Example #15
def runAdaBoostClassifier(x_train, y_train, x_test, y_test, p):

    # Here we instantiate the adaboost classifier
    clf = AdaBoostClassifier()
    clf.set_params(**p)

    clf.fit(x_train, y_train)

    # now, make the predictions using our classifier
    ada_predictions = clf.predict(x_test)

    # now we have to compute the classification accuracy
    # by comparing the predictions against the test labels
    dt_score = accuracy_score(y_test, ada_predictions)
    print("adaboost classification accuracy on test data is " + str(dt_score),
          file=sys.stderr)

    etc_predictions = clf.predict(x_test)
    dt_score = accuracy_score(y_test, etc_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)
    return (train_score, dt_score)
Example #16
print('Best params:', grid.best_params_)
bestclfParams = grid.best_params_

############################
######## Next, tune the base learner's hyperparameters
############################
# unfortunately this can currently only be done by manual looping
from sklearn.model_selection import cross_val_score

mdl = DecisionTreeClassifier()
clf = AdaBoostClassifier(base_estimator=mdl,
                         algorithm='SAMME',
                         n_estimators=200,
                         learning_rate=0.8,
                         random_state=10)
clf.set_params(**bestclfParams)

# define the tuning steps
dtcParams = {
    'max_depth': range(3, 14, 2),  # first pass
    'min_samples_split': range(50, 201, 20),
    'min_samples_leaf': range(10, 60, 10)
}
# dtcParams_2 = {'max_features':np.linspace(0.5, 1.0,6)}    # second pass

for var_name, var_vals in dtcParams.items():
    best_scores = []
    best_Params = []
    for val in var_vals:
        dv = {var_name: val}
        best_Params.append(dv)
Example #17
import sys
from os import path
sys.path.append(path.dirname(path.dirname(path.abspath(__file__))))
from kaggle_io.extract_inputs import extract_training_data

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
from CvModel import CvModel

Id, X, y = extract_training_data('data/kaggle_train_tf_idf.csv')

n_folds = 5
scaler = StandardScaler().fit(X)
ada = AdaBoostClassifier()

print('Training AdaBoost with n_estimators=10')
ada.set_params(n_estimators=10)
cv_ada = CvModel(n_folds, scaler, ada)
cv_ada.fit(X, y)
joblib.dump(cv_ada, 'ada1/1.pkl')

print('Training AdaBoost with n_estimators=50')
ada.set_params(n_estimators=50)
cv_ada = CvModel(n_folds, scaler, ada)
cv_ada.fit(X, y)
joblib.dump(cv_ada, 'ada1/2.pkl')
Example #18
plt.plot(np.arange(1, len(Eval)+1), Eval, label="Validation")
plt.title('AdaBoost estimators behaviour')
plt.xlabel('Number of estimators')
plt.ylabel('Score')
plt.legend()
plt.show()

# Randomized search over n_estimators (disabled; the best value it found
# is hard-coded below):
# estimators = np.arange(MIN_ESTIMATORS, MAX_ESTIMATORS)
# clf = RandomizedSearchCV(estimator=ada, scoring='roc_auc', param_distributions=dict(n_estimators=estimators), n_jobs=-1, random_state=SEED, cv=kfold)
# model = clf.fit(X_train, y_train)
# best_number_estimators = model.best_estimator_.get_params()['n_estimators']
best_number_estimators = 130

print("Best number of estimators for AdaBoost: ", best_number_estimators)

ada.set_params(n_estimators=best_number_estimators)
ada.fit(X_train, y_train)

scoreModel(ada, X_train, y_train, 'Train')

##################################################################################################################
# Random Forest (RF)
##################################################################################################################
print("\nFinding optimal number of trees for Random Forest...")

rf = RandomForestClassifier(n_jobs=-1,
                            random_state=SEED,
                            criterion='entropy',
                            bootstrap=True,
                            max_features='auto',
                            class_weight='balanced',
Example #19
def CreatesFeatsPipeline(pipe_name, init_params=None):
    """ load pre-existing pipelines
    """
    pipeline = []
    if pipe_name == 'cla_ERP_TS_LR':
        # pipeline using Xdawn covariances in the tangent space with LR
        pipeline = sklearn.pipeline.Pipeline([
            ('xdawn', pyriemann.estimation.XdawnCovariances()),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('lr', sklearn.linear_model.LogisticRegression())
        ])
    elif pipe_name == 'cla_ERP_LR':
        pipeline = sklearn.pipeline.Pipeline([
            ('preproc', Epochs2signals()),
            ('xdawn', pyriemann.estimation.XdawnCovariances()),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('lr', sklearn.linear_model.LogisticRegression())
        ])
    elif pipe_name == 'cla_CSP_LR':
        pipeline = sklearn.pipeline.Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('CSP', pyriemann.spatialfilters.CSP(nfilter=12, log=False)),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('lr', sklearn.linear_model.LogisticRegression(solver='lbfgs'))
        ])
    elif pipe_name == 'cla_CSP_MDM':
        pipeline = sklearn.pipeline.Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('CSP', pyriemann.spatialfilters.CSP(nfilter=8, log=False)),
            ('MDM', pyriemann.classification.MDM())
        ])
    elif pipe_name == 'cla_MDM':
        pipeline = sklearn.pipeline.Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('MDM', pyriemann.classification.MDM())
        ])
    elif pipe_name == 'reg_CSP':
        # pipeline using CSP in the tangent space (regression)
        pipeline = sklearn.pipeline.Pipeline([
            ("cov", pyriemann.estimation.Covariances(estimator='lwf')),
            ('CSP', pyriemann.spatialfilters.CSP(nfilter=12, log=False)),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])
    elif pipe_name == 'reg_ERP':
        # pipeline using Xdawn in the tangent space (regression)
        pipeline = sklearn.pipeline.Pipeline([
            ('xdawn',
             pyriemann.estimation.XdawnCovariances(estimator='lwf',
                                                   xdawn_estimator='lwf')),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])
    elif pipe_name == 'reg_ERP_svr':
        # pipeline using Xdawn in the tangent space
        pipeline = sklearn.pipeline.Pipeline([
            ('preproc', Epochs2signals()),
            ('xdawn', XdawnCovariancesRegression()),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO',
             sklearn.model_selection.GridSearchCV(sklearn.svm.SVR(kernel='rbf',
                                                                  C=100,
                                                                  gamma=0.1,
                                                                  epsilon=.1),
                                                  cv=5,
                                                  param_grid={
                                                      "C":
                                                      [1e0, 1e1, 1e2, 1e3],
                                                      "gamma":
                                                      np.logspace(-2, 2, 5)
                                                  }))
        ])

    elif pipe_name == 'reg_FilterBank':
        f_list = range(len(init_params['preproc__filters']))
        pipFreqs = []
        for freq in f_list:
            pipFreqs.append((
                "freq" + str(freq),
                sklearn.pipeline.Pipeline([
                    ('CospSelector', CospSelector(f_list=[freq])),
                    ('Cov', pyriemann.estimation.Covariances(estimator='lwf'))
                    #                 ,('xdawn',XdawnCovariancesRegression(nfilter=8,estimator='lwf',xdawn_estimator='lwf',bins=[0,32,72,100]))
                    ,
                    ('SPOC',
                     pyriemann.spatialfilters.SPoC(nfilter=20, log=False))
                    #                 ,('TS',pyriemann.tangentspace.TangentSpace())
                    ,
                    ('cosp2Feats', Cosp2feats())
                ])))
        union = sklearn.pipeline.FeatureUnion(pipFreqs)

        pipeline = sklearn.pipeline.Pipeline([
            ('preproc', Epochs2signals()), ('union', union),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])

    elif pipe_name == 'reg_SPOC':
        pipeline = sklearn.pipeline.Pipeline([
            ('Cov', pyriemann.estimation.Covariances()),
            ('SPOC', pyriemann.spatialfilters.SPoC(log=False)),
            ('TS', pyriemann.tangentspace.TangentSpace()),
            ('LASSO', sklearn.linear_model.LassoCV())
        ])

    elif pipe_name == "vot_ADA":
        pipeline = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                                      algorithm="SAMME",
                                      n_estimators=200)

    else:
        raise ValueError('no pipeline recognized: ' + pipe_name)

    # initialize parameters of the pipeline
    if init_params is not None:
        pipeline.set_params(**init_params)  # initialize the parameters
    else:
        print('CreatesFeatsPipeline: ' + pipe_name + ' not initialized!')

    return pipeline
Example #20
class AdaBoost(object):
    def __init__(self, dataset_x, dataset_y):
        self.dataset_x = dataset_x
        self.dataset_y = dataset_y

        self.clf = AdaBoostClassifier()
        self.best_parameter = {}

    def startAdaBoost(self):
        print("------------------ AdaBoost Classifier -------------------")
        # self.findBestParameters()
        self.gridSearch()
        # self.randomSearch()

    def findBestParameters(self):
        """
        Try different parameters for finding the best score
        :return:
        """
        self.clf = AdaBoostClassifier()
        scores = cross_val_score(self.clf,
                                 self.dataset_x,
                                 self.dataset_y,
                                 cv=10,
                                 scoring="accuracy")
        print(scores)
        print("Accuracy: %0.2f (+/- %0.2f)" %
              (scores.mean(), scores.std() * 2))

    def test(self):
        """
        Test the model with best parameters found in randomSearch() or gridSearch()
        :return:
        """
        # self.clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, learning_rate=1.5, algorithm='SAMME.R')
        self.clf = AdaBoostClassifier()
        self.clf.set_params(**self.best_parameter)
        print("*** Test Result for AdaBoost ***")
        ModelEvaluation.evaluateModelWithCV(self.clf,
                                            self.dataset_x,
                                            self.dataset_y,
                                            cv=10)

    def randomSearch(self):
        tuned_parameters = {
            'base_estimator':
            [DecisionTreeClassifier(),
             LogisticRegression(),
             MultinomialNB()],
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.5, 1.0, 1.5],
            'algorithm': ['SAMME']
        }
        self.best_parameter = SearchParameters.randomSearch(
            classifier=self.clf,
            parameters=tuned_parameters,
            cv=10,
            n_iter=30,
            train_x=self.dataset_x,
            train_y=self.dataset_y)

    def gridSearch(self):
        tuned_parameters = {
            'base_estimator': [DecisionTreeClassifier()],
            'n_estimators': [50, 100, 150],
            'learning_rate': [0.5, 1.0, 1.5],
            'algorithm': ['SAMME']
        }
        self.best_parameter = SearchParameters.gridSearch(
            classifier=self.clf,
            parameters=tuned_parameters,
            cv=10,
            train_x=self.dataset_x,
            train_y=self.dataset_y)
Example #21
# search = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42), 
#                       grid, make_scorer(f1_score), cv=StratifiedKFold(labels), n_jobs=-1)

# search.fit(features, labels)

# print search.best_score_
# print search.best_params_

# clf = search.best_estimator_


### To speed up training, the grid search above is skipped and the best
### parameters it found are used directly, as recommended by the reviewer.
best_params = {
	'n_estimators': 4, 
	'base_estimator__criterion': 'gini', 
	'base_estimator__max_depth': 3, 
	'base_estimator__min_samples_leaf': 11}

clf = AdaBoostClassifier(DecisionTreeClassifier(random_state=42), random_state=42)
clf.set_params(**best_params)


## Task 6: Dump your classifier, dataset, and features_list so anyone can
## check your results. You do not need to change anything below, but make sure
## that the version of poi_id.py that you submit can be run on its own and
## generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
Example #22
def without_penalty(X, y):
    # `params` is a module-level dict of AdaBoost hyperparameters (not shown)
    clf = AdaBoostClassifier()
    clf.set_params(**params)
    clf.fit(X, y)
    return clf
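without_penalty reads a module-level params dict that is not shown here; a hedged sketch of a caller (the parameter values and data are illustrative assumptions):

from sklearn.datasets import make_classification

params = {'n_estimators': 100, 'learning_rate': 0.5}  # assumed global read above
X, y = make_classification(random_state=0)
clf = without_penalty(X, y)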
Example #23
def boosting(X, y, split_amount, plot=True, X_test=None, y_test=None):
	training_amount = 1 - split_amount
	X_train = None
	y_train = None
	if X_test is None and y_test is None:
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_amount, train_size=training_amount, shuffle=True)
	else:
		X_train = X
		y_train = y
	boost_classifier = AdaBoostClassifier(algorithm='SAMME.R', n_estimators=60)
	estimators_range = range(50, 150, 5)

	train_scores, test_scores = validation_curve(boost_classifier, X_train, y_train, param_name='n_estimators'
	, param_range=estimators_range, cv=5, n_jobs=1)

	train_scores_mean = np.mean(train_scores, axis=1)
	train_scores_std = np.std(train_scores, axis=1)
	test_scores_mean = np.mean(test_scores, axis=1)
	test_scores_std = np.std(test_scores, axis=1)

	best_num_estimators = estimators_range[list(test_scores_mean).index(max(test_scores_mean))]
	boost_classifier.set_params(n_estimators=best_num_estimators)
	training_sizes = np.linspace(.1, 1.0, 5)
	train_sizes, train_scores_learn, test_scores_learn = learning_curve(boost_classifier, X_train, y_train,
		train_sizes=training_sizes, cv=5)

	train_scores_learn_mean = np.mean(train_scores_learn, axis=1)
	train_scores_learn_std = np.std(train_scores_learn, axis=1)
	test_scores_learn_mean = np.mean(test_scores_learn, axis=1)
	test_scores_learn_std = np.std(test_scores_learn, axis=1)

	boost_classifier.fit(X_train, y_train)
	measure_performance(X_test, y_test, boost_classifier)

	if plot:
		lw=2
		plt.figure()
		plt.grid()
		plt.title("Boosting Validation Curve")
		plt.plot(estimators_range, train_scores_mean, label='training_score', color='darkorange')
		plt.fill_between(estimators_range, train_scores_mean - train_scores_std,
		             train_scores_mean + train_scores_std, alpha=0.2,
		             color="darkorange", lw=lw)
		plt.plot(estimators_range, test_scores_mean, label='cross_validation_score', color='navy')
		plt.fill_between(estimators_range, test_scores_mean - test_scores_std,
		             test_scores_mean + test_scores_std, alpha=0.2,
		             color="navy", lw=lw)
		plt.legend()
		plt.xlabel('Number of Estimators')
		plt.ylabel('Score')



		title = "Boosting Learning Curve (n_estimators = " + str(best_num_estimators) + " )"
		plt.figure(2)
		plt.grid()
		plt.title(title)
		plt.fill_between(train_sizes, train_scores_learn_mean - train_scores_learn_std,
		                 train_scores_learn_mean + train_scores_learn_std, alpha=0.1,
		                 color="r")
		plt.fill_between(train_sizes, test_scores_learn_mean  - test_scores_learn_std,
		                 test_scores_learn_mean + test_scores_learn_std, alpha=0.1, color="g")
		plt.plot(train_sizes, train_scores_learn_mean, 'o-', color="r",
		         label="Training score")
		plt.plot(train_sizes, test_scores_learn_mean, 'o-', color="g",
		         label="Test score")
		plt.xlabel('Training Sizes')
		plt.ylabel('Score')
		plt.legend()
		plt.show()
Example #24
def train_BTree(filename, X_train, X_test, y_train, y_test, full_param=False, debug=False, numFolds=10, njobs=-1,
                scalar=1, make_graphs=False, pBTree={}):
    np.random.seed(1)
    start = time.time()
    algo = 'Boosted Tree'

    if len(pBTree) == 0:
        if full_param:
            param_grid = [{'base_estimator__criterion'     : ['gini', 'entropy'],
                           'base_estimator__max_depth'     : [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100],
                           # 'base_estimator__min_samples_split': [2, 3, 5, 6, 8, 10],
                           # 'base_estimator__min_samples_leaf' : [1, 2, 3, 5, 6, 8, 10],
                           # 'base_estimator__max_features'  : [0.9, 1.0],  # 0.1, 0.3, 0.5,
                           'base_estimator__max_leaf_nodes': [10, 100],  # 2, 4, 5, 7,
                           'base_estimator__ccp_alpha'     : [0.0, 0.005, 0.01],
                           # 0.015, 0.02, 0.025, 0.030, 0.035, 0.04],
                           "base_estimator__splitter"      : ["best"],  # "random"],
                           "n_estimators"                  : [1, 50, 100, 150, 200, 250, 300],
                           "learning_rate"                 : [0.1, 0.5, 1],
                           'random_state'                  : [1]
                           }]
        else:
            param_grid = [{'base_estimator__criterion': ['gini', 'entropy'],
                           'base_estimator__max_depth': [3, 5, 7, 10],
                           'base_estimator__ccp_alpha': [0.0, 0.005, 0.01, 0.035],
                           # 'base_estimator__min_samples_split': [3, 5, 7, 10],
                           # 'base_estimator__ccp_alpha'        : [0.0, 0.005, 0.015, 0.025, 0.35, 0.04],
                           "n_estimators"             : [1, 50, 100, 150],
                           # "learning_rate"                    : [0.1, 0.5, 1],
                           'random_state'             : [1]
                           }]

        DTC = DecisionTreeClassifier(random_state=11)
        adaTree = AdaBoostClassifier(base_estimator=DTC)

        # run grid search
        grid_search = GridSearchCV(adaTree, param_grid=param_grid, cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True, n_jobs=njobs, verbose=debug)

        grid_search.fit(X_train, y_train)

        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_

        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar)

        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**best_params)
    else:
        DTC = DecisionTreeClassifier()
        btree_classifier = AdaBoostClassifier(base_estimator=DTC)
        btree_classifier.set_params(**pBTree)

    start = time.time()
    btree_classifier.fit(X_train, y_train)
    print('BTree Fit Time: ', time.time() - start)
    start = time.time()

    y_prob = btree_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr", average="weighted")
    print('BTree Train Score Time: ', time.time() - start)
    start = time.time()

    y_prob = btree_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr", average="weighted")
    print('BTree Test Score Time: ', time.time() - start)
    DTC = DecisionTreeClassifier()
    test_class = AdaBoostClassifier(base_estimator=DTC)
    test_class.set_params(**pBTree)

    if make_graphs:
        util.boost_lr_vs_nest(X_train, y_train, pBTree, njobs, filename[:-4], train_score)
        util.compute_vc(algo, 'n_estimators',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs, debug=debug, extraText='log')

        util.plot_learning_curve(btree_classifier, algo, filename[:-4], X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)

        util.compute_vc(algo, 'base_estimator__max_depth',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 30, 40, 50, 60, 70, 80,
                         90, 100], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class,
                        pBTree, log=True, njobs=njobs, debug=debug)

        util.compute_vc(algo, 'base_estimator__max_leaf_nodes',
                        [2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 75, 100, 200, 500, 1000, 10000], X_train, y_train, X_test,
                        y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True, njobs=njobs)
        # compute Model Complexity/Validation curves
        util.compute_vc(algo, 'base_estimator__criterion', ['gini', 'entropy'], X_train, y_train, X_test, y_test,
                        btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)

        util.compute_vc(algo, 'n_estimators',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=False, njobs=njobs, debug=debug)
        util.compute_vc(algo, 'n_estimators',
                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 1000],
                        X_train, y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree,
                        log=True, njobs=njobs, debug=debug, extraText='log')
        util.compute_vc(algo, 'learning_rate',
                        [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01,
                         0.05, 0.1, 0.5, 1], X_train, y_train, X_test, y_test, btree_classifier, filename[:-4],
                        test_class, pBTree, log=True, njobs=njobs, debug=debug)

        util.compute_vc(algo, 'base_estimator__ccp_alpha',
                        [0.000001, 0.00001, 0.00002, 0.00003, 0.00004, 0.00005, 0.00006, 0.00007, 0.00008, 0.00009,
                         0.0001, 0.00011, 0.00012, 0.00013, 0.00014, 0.00015, 0.00016, 0.00017, 0.00018, 0.00019,
                         0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001, 0.01, 0.1, 1],
                        X_train,
                        y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True,
                        njobs=njobs)
        util.compute_vc(algo, 'base_estimator__min_samples_split', [2, 3, 5, 6, 8, 10], X_train, y_train, X_test,
                        y_test, btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)
        util.compute_vc(algo, 'base_estimator__min_samples_leaf',
                        [1, 2, 3, 5, 6, 8, 10, 25, 50, 75, 100, 250, 500, 750, 1000], X_train,
                        y_train, X_test, y_test, btree_classifier, filename[:-4], test_class, pBTree, log=True,
                        njobs=njobs)
        util.compute_vc(algo, 'base_estimator__max_features',
                        [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9, 0.99999, 1.0], X_train, y_train, X_test, y_test,
                        btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)

        util.compute_vc(algo, 'base_estimator__splitter', ["best", "random"], X_train, y_train, X_test, y_test,
                        btree_classifier, filename[:-4], test_class, pBTree, log=False, njobs=njobs)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
Example #25
class AdaBoost(Classifier):
    r"""Implementation of AdaBoost classifier.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Reference:
        Y. Freund and R. Schapire, “A Decision-Theoretic Generalization of On-Line Learning and an Application to Boosting”, 1995.
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'AdaBoost'

    def __init__(self, **kwargs):
        r"""Initialize AdaBoost instance.
        """
        warnings.filterwarnings(action='ignore',
                                category=ChangedBehaviorWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        warnings.filterwarnings(action='ignore',
                                category=DataConversionWarning)
        warnings.filterwarnings(action='ignore',
                                category=DataDimensionalityWarning)
        warnings.filterwarnings(action='ignore', category=EfficiencyWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=NonBLASDotWarning)
        warnings.filterwarnings(action='ignore',
                                category=UndefinedMetricWarning)

        self._params = dict(n_estimators=ParameterDefinition(
            MinMax(min=10, max=111), np.uint),
                            algorithm=ParameterDefinition(['SAMME',
                                                           'SAMME.R']))
        self.__ada_boost = AdaBoostClassifier()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.
        """
        self.__ada_boost.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Fit AdaBoost.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.
        """
        self.__ada_boost.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Predict class for each sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        return self.__ada_boost.predict(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return Classifier.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__ada_boost.get_params()))
Example #26
#!/usr/bin/python3
from numpy import average, load
import sys
from sklearn.ensemble import AdaBoostClassifier
from ML import kfold as kf, GridSearch as gs

# load data
samples = load(sys.argv[1])
labels = load("labels.npy")

folds = int(sys.argv[2]) if len(sys.argv) > 2 else 10

clf = AdaBoostClassifier()
estimators_range = range(5, 110, 20)
param_grid = dict(n_estimators=estimators_range)
#   gridsearch = gs(clf, param_grid, samples, labels)
#   best_param = gridsearch.search()
clf.set_params(n_estimators=100)  # best_param['n_estimators']
# print(best_param['n_estimators'])
kfold = kf(clf, samples, labels, folds)
res = kfold.fit()
for score in kfold.results['scores']:
    print(score.get_accuracy())
print(average([score.get_accuracy() for score in kfold.results['scores']]))
Example #27
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

best_params = {
    'n_estimators': 4,
    'base_estimator__criterion': 'gini',
    'base_estimator__max_depth': 3,
    'base_estimator__min_samples_leaf': 11
}

clf = AdaBoostClassifier(DecisionTreeClassifier(random_state=42),
                         random_state=42)
clf.set_params(**best_params)

# Example starting point. Try investigating other evaluation techniques!
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print(accuracy_score(pred, labels_test))

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
Example #28
                                                      train_labels,
                                                      problem_name, plot_dir)

param_dist = {
    'n_estimators': [25, 50, 75, 100, 150, 200, 300],
    'learning_rate': stats.uniform(0.75, 0.25)
}

scoring_metric = 'f1'

opt_param_set_from_random_search = perf_random_search_for_best_hyper_params(
    clf,
    train_features,
    train_labels,
    scoring_metric,
    param_dist,
    n_iter_search=20,
    n_jobs=4,
    cv=5)

clf.set_params(**opt_param_set_from_random_search)

plot_learning_curves_helper(clf, train_features, train_labels, scoring_metric,
                            plot_dir, problem_name)

clf.fit(train_features, train_labels.values.ravel())

plot_opt_model_perf(clf, test_features, test_labels, [0, 1], problem_name,
                    plot_dir)

store_model(clf, model_path)
Example #29
grid_search = GridSearchCV(bdt_real, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)
print('Best parameters of Adaboost SAMME.R:', grid_search.best_params_)
print('Best score of Adaboost SAMME.R:', grid_search.best_score_)


grid_search = GridSearchCV(bdt_discrete, param_grid=param_grid, cv=10)
grid_search.fit(X_train, y_train)
print('Best parameters of Adaboost SAMME:', grid_search.best_params_)
print('Best score of Adaboost SAMME:', grid_search.best_score_)



num_estimators = X_train.shape[0]

bdt_real.set_params(n_estimators=num_estimators)
bdt_discrete.set_params(n_estimators=num_estimators)

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []


ypred_r = bdt_real.predict(X_test)
ypred_e = bdt_discrete.predict(X_test)
# score against the true labels, not the model's own predictions
print('Accuracy of SAMME.R: {} '.format(bdt_real.score(X_test, y_test)))
print('Accuracy of SAMME: {}'.format(bdt_discrete.score(X_test, y_test)))
print("--- %s seconds ---" % (time.time() - start_time))
Example #30
# We can now compute the performance of the model on new, held out data from the **test set**:

# In[16]:

#test_score = svc.score(X_test, y_test)
test_score = abc.score(X_test_scaled, y_test)
print('test_score')
print(test_score)
print('abc')
print(abc)
params = {
    'base_estimator': DC(max_depth=5)
}
print('changing base estimator')
abc.set_params(**params)
#abc.base_estimator = DC(max_depth=5, min_samples_leaf=0.1*len(X_train))
abc.fit(X_train_scaled, y_train)
print('new train score')
print(abc.score(X_train_scaled, y_train))
# This score is clearly not as good as expected! The model cannot generalize so well to new, unseen data.
# 
# - Whenever the **test** data score is **not as good as** the **train** score the model is **overfitting**
# 
# - Whenever the **train score is not close to 100%** accuracy the model is **underfitting**
# 
# Ideally **we want to neither overfit nor underfit**: `test_score ~= train_score ~= 1.0`. 

# The previous example failed to generalize well to test data because we naively used the default parameters of the `SVC` class:

# In[17]:
        "base_estimator__max_depth": [None, 3, 5, 8, 10],
        "base_estimator__min_samples_leaf": [1, 2, 3, 5, 8],
        "learning_rate": [0.01, 0.1, 0.5, 0.8, 1.0],
    }

    searcher = GridSearchCV(adabst,
                            param_grid,
                            f2_score,
                            n_jobs=2,
                            verbose=1,
                            cv=StratifiedKFold(labels, 10))

    searcher.fit(features, labels)

    # Apply tuned parameters to the model
    adabst.set_params(**searcher.best_params_)

else:

    # Result I got when I ran the above searching
    adabst.set_params(base_estimator__max_features="sqrt",
                      base_estimator__min_samples_leaf=1,
                      base_estimator__max_depth=3,
                      learning_rate=0.01)

sys.stdout.write("Done\n")

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
Example #32
def Adaboost_Classifier_Mul(X_raw_train, y_raw_train, X_test, y_test,
                            Cali_method):
    mean_cv_scores_df = pd.DataFrame(columns=[
        'accuracy', 'average_precision', 'f1', 'roc_auc', 'RMSE', 'MXE',
        'APaccuracy', 'BEP_score'
    ])
    mean_final_scores_df = pd.DataFrame(columns=[
        'accuracy', 'average_precision', 'f1', 'roc_auc', 'RMSE', 'MXE',
        'APaccuracy', 'BEP_score'
    ])
    index_name = []
    # use 5-times 5-fold cross-validation
    kf = KFold(n_splits=5)

    dt = DecisionTreeClassifier()
    gnb = GaussianNB()
    base_estimators = [{
        'name': 'Naive_Bayes',
        'estimator': gnb
    }, {
        'name': 'DecisionTree',
        'estimator': dt
    }]
    n_estimator = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    learning_rates = [0.2, 0.4, 0.6, 0.8, 1]
    for b in base_estimators:
        for n in n_estimator:
            for r in learning_rates:
                ada = AdaBoostClassifier(algorithm='SAMME')
                ada.set_params(base_estimator=b['estimator'],
                               n_estimators=n,
                               learning_rate=r)
                name = 'base_estimator=' + b['name'] + ' n_estimators=' + str(
                    n) + ' learning_rates=' + str(r)
                index_name.append(name)
                cv_score, final_score = KFold_Mul_Experiment(
                    ada, kf, X_raw_train, y_raw_train, X_test,
                    y_test, Cali_method)  # evaluate the configured AdaBoost
                #cv
                cv_scores = pd.DataFrame(cv_score,
                                         columns=[
                                             'accuracy', 'average_precision',
                                             'f1', 'roc_auc', 'RMSE', 'MXE',
                                             'APaccuracy', 'BEP_score'
                                         ])
                cv_scores.loc['mean'] = cv_scores.apply(lambda x: x.mean())
                mean_cv_scores = cv_scores.loc['mean']
                mean_cv_scores_df = mean_cv_scores_df.append(mean_cv_scores,
                                                             ignore_index=True)
                #final
                final_scores = pd.DataFrame(final_score,
                                            columns=[
                                                'accuracy',
                                                'average_precision', 'f1',
                                                'roc_auc', 'RMSE', 'MXE',
                                                'APaccuracy', 'BEP_score'
                                            ])
                final_scores.loc['mean'] = final_scores.apply(
                    lambda x: x.mean())
                mean_final_scores = final_scores.loc['mean']
                mean_final_scores_df = mean_final_scores_df.append(
                    mean_final_scores, ignore_index=True)

    #cv
    mean_cv_scores_df.index = index_name
    mean_cv_scores_dfmax = mean_cv_scores_df[[
        'accuracy', 'average_precision', 'f1', 'roc_auc', 'APaccuracy',
        'BEP_score'
    ]]
    mean_cv_scores_dfmax.loc['best'] = mean_cv_scores_dfmax.apply(
        lambda x: x.argmax())
    mean_cv_scores_dfmin = mean_cv_scores_df[['RMSE', 'MXE']]
    mean_cv_scores_dfmin.loc['best'] = mean_cv_scores_dfmin.apply(
        lambda x: x.argmin())
    mean_cv_scores_df = pd.merge(mean_cv_scores_dfmax,
                                 mean_cv_scores_dfmin,
                                 left_index=True,
                                 right_index=True,
                                 how='outer')
    #final
    mean_final_scores_df.index = index_name
    mean_final_scores_dfmax = mean_final_scores_df[[
        'accuracy', 'average_precision', 'f1', 'roc_auc', 'APaccuracy',
        'BEP_score'
    ]]
    mean_final_scores_dfmax.loc['OPT-SEL'] = mean_final_scores_dfmax.apply(
        lambda x: x.max())
    mean_final_scores_dfmin = mean_final_scores_df[['RMSE', 'MXE']]
    mean_final_scores_dfmin.loc['OPT-SEL'] = mean_final_scores_dfmin.apply(
        lambda x: x.min())
    mean_final_scores_df = pd.merge(mean_final_scores_dfmax,
                                    mean_final_scores_dfmin,
                                    left_index=True,
                                    right_index=True,
                                    how='outer')
    return mean_cv_scores_df, mean_final_scores_df
Example #33
# Disabled grid-search diagnostics:
# print('Best score of Adaboost SAMME.R:', grid_search.best_score_)
# pdb.set_trace()
# grid_search = GridSearchCV(bdt_discrete, param_grid=param_grid, cv=10)
# grid_search.fit(X_train, y_train)
# print('Best parameters of Adaboost SAMME:', grid_search.best_params_)
# print('Best score of Adaboost SAMME:', grid_search.best_score_)
# pdb.set_trace()

# Train on the training data set
num_estimators = 600

bdt_real.set_params(n_estimators=num_estimators)
bdt_discrete.set_params(n_estimators=num_estimators)

bdt_real.fit(X_train, y_train)
bdt_discrete.fit(X_train, y_train)

real_test_errors = []
discrete_test_errors = []

# Test on the testing data set and display the accuracies
ypred_r = bdt_real.predict(X_test)
ypred_e = bdt_discrete.predict(X_test)
print('Accuracy of SAMME.R = ', accuracy_score(ypred_r, y_test))
print('Accuracy of SAMME = ', accuracy_score(ypred_e, y_test))

# Plot the relationship between error rates and number of trees
Example #34
clf.fit(X, y)
#applying grid search to find the best model
from sklearn.model_selection import GridSearchCV
parameters = [{
    'n_estimators': [100, 200, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2, 0.5]
}]

grid_search = GridSearchCV(estimator=clf,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=2)
grid_search = grid_search.fit(X, y)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
clf.set_params(**best_parameters)
# refit
clf.fit(X, y)

origin_size = True
keep_top_k = 750

video_capture = cv2.VideoCapture(0)
process_this_frame = True
while True:
    # Grab a single frame of video
    ret, frame = video_capture.read()

    # Only process every other frame of video to save time
    if process_this_frame:
        # get the coordinate of the points
Example #35
pst_classifier.score(data_test, label_test)


# ### Ada Boost Classifier

# In[145]:


from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error

ada_classifier = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4),learning_rate=0.70,n_estimators=120,random_state=49,algorithm="SAMME.R")
ada_classifier.fit(data_train, label_train)
print(ada_classifier.score(data_test, label_test))

ada_classifier.set_params(n_estimators=120)
errors = [mean_squared_error(label_test, y_pred) for y_pred in ada_classifier.staged_predict(data_test)]
bst_n_estimators = np.argmin(errors) + 1  # staged_predict stages are 1-indexed
print('bst_n_estimators', bst_n_estimators)

ada_best = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=4),learning_rate=0.70,n_estimators=bst_n_estimators,random_state=49,algorithm="SAMME.R")
ada_best.fit(data_train,label_train)
ada_best.score(data_test,label_test)
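The same stage-by-stage idea works with accuracy instead of MSE; a hedged sketch reusing ada_classifier and the split above:

# Hedged sketch: pick n_estimators by staged test accuracy.
import numpy as np

staged_acc = [np.mean(y_pred == label_test)
              for y_pred in ada_classifier.staged_predict(data_test)]
best_n = int(np.argmax(staged_acc)) + 1  # stages are 1-indexed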


# ### Gradient Boost Classifier

# In[146]:


from sklearn.ensemble import GradientBoostingClassifier