def classifiers_evaluation(df_res, y):
    vect = vectorizer(start, end)
    vect.fit(df_res[1])

    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
        MLPClassifier(),
        SGDClassifier(loss='log', max_iter=100),
        LogisticRegressionCV()
        ]

    log_cols = ["Classifier", "Accuracy"]
    log = pd.DataFrame(columns=log_cols)

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

    acc_dict = {}

    for train_index, test_index in sss.split(df_res, y):
        X_train, X_test = df_res.iloc[train_index], df_res.iloc[test_index]

        y_train, y_test = y[train_index], y[test_index]

        X_train_ready = vect.transform(X_train[1])

        X_test_ready = vect.transform(X_test[1])

        del X_train
        del X_test

        for clf in classifiers:
            name = clf.__class__.__name__

            clf.fit(X_train_ready, y_train)
            test_predictions = clf.predict(X_test_ready)
            acc = accuracy_score(y_test, test_predictions)
#            acc = roc_auc_score(y_test, test_predictions)

            if name in acc_dict:
                acc_dict[name] += acc
            else:
                acc_dict[name] = acc

        del X_train_ready
        del X_test_ready
        del y_train
        del y_test

    for clf in acc_dict:
        acc_dict[clf] = acc_dict[clf] / 10.0
        log_entry = pd.DataFrame([[clf, acc_dict[clf]]], columns=log_cols)
        log = pd.concat([log, log_entry], ignore_index=True)

    print(acc_dict)
    print(log)
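
# The function above relies on a vectorizer(start, end) helper and on df_res[1]
# holding raw text; neither is shown here. A minimal sketch of such a helper,
# assuming it simply wraps scikit-learn's CountVectorizer with an n-gram range:
from sklearn.feature_extraction.text import CountVectorizer

def vectorizer(start, end):
    # hypothetical helper: bag-of-n-grams features over (start, end)-grams
    return CountVectorizer(ngram_range=(start, end))
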
X[:, 13] = enX.fit_transform(X[:, 13])

X[:, 15] = enX.fit_transform(X[:, 15])

X[:, 18] = enX.fit_transform(X[:, 18])
#0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 13, 15, 16
oneencX = OneHotEncoder(categorical_features=[6, 13, 15, 18])
X = oneencX.fit_transform(X).toarray()
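
# Note: the categorical_features argument was removed from OneHotEncoder in
# scikit-learn 0.22. A sketch of the modern equivalent with ColumnTransformer,
# reusing the same column indices (an assumed modernization, not the original code):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

onehot = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), [6, 13, 15, 18])],
    remainder='passthrough',  # keep the remaining columns unchanged
    sparse_threshold=0)       # dense output, like .toarray() above
# X = onehot.fit_transform(X)  # would replace the two OneHotEncoder lines above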

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=40)
print(y_train)

clf = ensemble.GradientBoostingClassifier(learning_rate=0.25, n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)

import pylab as pl
cm = metrics.confusion_matrix(y_test, y_pred)
pl.matshow(cm)
pl.title('Confusion Matrix')
pl.colorbar()
pl.show()

score = metrics.accuracy_score(y_test, y_pred)
print(score)
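
# pylab is a legacy interface; the same confusion-matrix plot can be drawn with
# matplotlib.pyplot and scikit-learn's ConfusionMatrixDisplay (a sketch reusing cm):
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title('Confusion Matrix')
plt.show()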
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn import svm, model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# In[19]:

#list of machine learning algorithms
MLA = [
    #ensemble method
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #gaussian processes
    gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #naive_bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
Example #4
y_train = titanic_train['Survived']

#applying feature selection algorithm to get impactful features
rf = ensemble.RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

features = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf.feature_importances_
})
features.sort_values(by=['importance'], ascending=True, inplace=True)
features.set_index('feature', inplace=True)
features.plot(kind='barh', figsize=(20, 20))

fs_model = feature_selection.SelectFromModel(rf, prefit=True)
X_train1 = fs_model.transform(X_train)
X_train1.shape
selected_features = X_train.columns[fs_model.get_support()]

#build model using selected features
gb_estimator = ensemble.GradientBoostingClassifier(random_state=100)
gb_grid = {
    'n_estimators': list(range(50, 301, 50)),
    'learning_rate': [0.01, 0.05, 0.1]
}
grid_gb_estimator = model_selection.GridSearchCV(gb_estimator, gb_grid, cv=10)
grid_gb_estimator.fit(X_train1, y_train)

print(grid_gb_estimator.best_score_)
print(grid_gb_estimator.best_params_)
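
# To score unseen data, the same SelectFromModel transform has to be applied
# before calling the tuned estimator; a sketch, assuming a test frame X_test
# with the same columns as X_train (the name is hypothetical here):
X_test1 = fs_model.transform(X_test)
test_pred = grid_gb_estimator.best_estimator_.predict(X_test1)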
Example #5
sample_validation_data = pd.concat([sample_validation_data_safe, sample_validation_data_risky])
sample_validation_data

# Import functions for decision tree
from functions_decision_tree import *

model_5 = decision_tree_create(train_data, list(train_data.columns[1:]), 'safe_loans', 0, max_depth = 6, min_node_size = 0, min_error_reduction=0)

for i in range(len(sample_validation_data)):
    print("Case: " + str(i) + " Prediction: " + str(classify(model_5, sample_validation_data.iloc[i], annotate=False)))

sample_validation_data[target]

# With sklearn
gbes = ensemble.GradientBoostingClassifier(n_estimators=5,max_depth=6)
model5 = gbes.fit(train_data.drop([target],axis=1), train_data[target])

predictions = model5.predict(sample_validation_data.drop([target],axis=1)) 

predictions == sample_validation_data[target]

predictions_p = model5.predict_proba(sample_validation_data.drop([target],axis=1))[:,1]

np.column_stack((predictions,predictions_p))

from sklearn.metrics import accuracy_score
accuracy_score(validation_data[target],model5.predict(validation_data.drop([target],axis=1)))

# false positives: prediction 1, actual -1 -> 0
predictions = pd.concat([pd.Series(validation_data[target],name='true').reset_index(),pd.Series(model5.predict(validation_data.drop([target],axis=1)),name='predict')],axis=1)
Example #6
from sklearn import ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import pandas
import numpy
import matplotlib.pyplot as plt

data = pandas.read_csv('gbm-data.csv')
y = data['Activity'].values
X = data.iloc[:, 1:].values
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.8, random_state=241)

x = [0.2]  #[1, 0.5, 0.3, 0.2, 0.1]
for el in x:
    f = []
    GBS = ensemble.GradientBoostingClassifier(n_estimators=250,
                                              verbose=True,
                                              random_state=241,
                                              learning_rate=el)
    GBS.fit(X_train, y_train)
    o = GBS.staged_decision_function(X_test)
    for i, y_pred in enumerate(o):
        y_pred = 1 / (1 + numpy.exp(-y_pred))
        ll = log_loss(y_test, y_pred)
        f.append(ll)

    plt.figure()
    plt.plot(f, 'r', linewidth=2)
    plt.show()
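
# The usual next step (not in the original snippet) is to read off the boosting
# iteration with the lowest test log loss from the f list filled above:
best_iter = int(numpy.argmin(f)) + 1
print('minimal test log loss %.4f at iteration %d' % (min(f), best_iter))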

clf = RandomForestClassifier(random_state=241, n_estimators=36)
clf.fit(X_train, y_train)
ll2 = log_loss(y_test, clf.predict_proba(X_test))
Example #7
trainset_minmax,testset_minmax=scs.scaling('minmax')

pca=PCA(n_components=100)
pca.fit(trainset_minmax)
trainset_minmax_new=pca.transform(trainset_minmax)
testset_minmax_new=pca.transform(testset_minmax)

dataset=trainset_minmax_new
label=scs.trainlabel
count0=label.tolist().count(0)
count1=label.tolist().count(1)
m=np.shape(dataset)[0]
trainset1 = [dataset[i] for i in range(m) if label[i] == 1]
trainset0 = [dataset[i] for i in range(m) if label[i] == 0]
        
trainset=np.concatenate((trainset0[:count1],trainset1))
trainlabel=np.concatenate((np.zeros((count1,1)),np.ones((count1,1))))

samples=trainset
target=trainlabel

classifier_GB = ensemble.GradientBoostingClassifier(
    n_estimators=1000, max_leaf_nodes=4, max_depth=None,
    random_state=2, min_samples_split=5)

classifier_GB.fit(samples, target.ravel())

labels_test_GB = classifier_GB.predict_proba(inX)




Example #8
# # Bar chart showing the importance of the top 20 features
# d_first = 20
# plt.figure(figsize=(8, 8))
# plt.title("Feature importances")
# plt.bar(range(d_first), importances[indices[:d_first]], align='center')
# plt.xticks(range(d_first), np.array(feature_names)[indices[:d_first]], rotation=90)
# plt.xlim([-1, d_first])
# plt.show()
#
# best_features = indices[:8]
# best_features_names = feature_names[best_features]
# print(best_features_names)

########################
# GBT method:
gbt = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=11)
gbt.fit(X_train, y_train)

err_train = np.mean(y_train != gbt.predict(X_train))
err_test = np.mean(y_test != gbt.predict(X_test))
print(err_train, err_test)

# Use only the significant features
# gbt = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=11)
# gbt.fit(X_train[best_features_names], y_train)
#
# err_train = np.mean(y_train != gbt.predict(X_train[best_features_names]))
# err_test = np.mean(y_test != gbt.predict(X_test[best_features_names]))
# print(err_train, err_test)

from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, ParameterGrid
import numpy as np

X_train = np.load(open('X_train.npy', 'rb'))
Y_train = np.load(open('Y_train.npy', 'rb'))
X_test = np.load(open('X_test.npy', 'rb'))
Y_test = np.load(open('Y_test.npy', 'rb'))

scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
X_test_scaler = scaler.transform(X_test)  # reuse the training-set statistics, not a new fit

# We create an instance of the model.
Estimator = ensemble.GradientBoostingClassifier()

# Now, we are going to use a grid search cross-validation to explore combinations of parameters.
param_grid = {'n_estimators': [10,20,30],'max_features':['auto', 'log2'],\
              'min_samples_split':[5,10,15], 'max_depth': range(2,15)}

Grid_GBoost = GridSearchCV(Estimator,
                           param_grid,
                           cv=10,
                           scoring='f1',
                           verbose=2)
Grid_GBoost.fit(X_train_scaler, Y_train)

# Once it has been fitted, we get several parameters.

print("ParameterGrid: ", '\n', list(ParameterGrid(param_grid)), '\n')
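
# The fitted search object also exposes the tuned model directly; a short
# follow-up sketch (not in the original) using the scaled test split from above:
print("Best parameters: ", Grid_GBoost.best_params_)
print("Best cross-validated f1: ", Grid_GBoost.best_score_)
print("Test f1: ", f1_score(Y_test, Grid_GBoost.predict(X_test_scaler)))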
Example #10
import random

import sklearn.neighbors as skneib
import sklearn.ensemble as skes
import sklearn.tree as sktr
import sklearn.linear_model as sklin

import pickle

from Prediction.ModelEstimation.getEstimationForMultipleFeatureSets import getEstimationForMultipleFeatureSets

randomSeed = 15

random.seed(randomSeed)

classifiers = [
    skes.RandomForestClassifier(),
    sktr.DecisionTreeClassifier(),
    skes.GradientBoostingClassifier(),
    sklin.RidgeClassifier(),
    skneib.KNeighborsClassifier()
]  #skes.RandomForestClassifier(n_estimators=int(random()*10+1))

tries = 0
while True:

    classifier = random.choice(classifiers)
    classifierParameter = 0
    if isinstance(classifier, skes.RandomForestClassifier) or isinstance(
            classifier, sktr.DecisionTreeClassifier) or isinstance(
                classifier, skes.GradientBoostingClassifier):
        classifier.min_samples_leaf = random.randint(10, 100)
        classifierParameter = classifier.min_samples_leaf
    if isinstance(classifier, sklin.RidgeClassifier):
    def fit(self, x, y):
        scoring = {
            "roc": make_scorer(roc_auc_score),
        }

        feature_names = x.columns  # keep the column names before converting to arrays
        x = x.values
        y = y.values.reshape(len(y), )

        if self.method == 'logistic':
            clf = GridSearchCV(linear_model.LogisticRegression(),
                               param_grid=self.parameters,
                               cv=5,
                               scoring=scoring,
                               refit='roc')
            clf.fit(x, y)

            coef = pd.Series(clf.best_estimator_.coef_.ravel(),
                             index=feature_names).sort_values()
            self.select_col = list(coef.index[-self.TopN:])

        elif self.method == 'rf':
            clf = GridSearchCV(esb.RandomForestClassifier(),
                               param_grid=self.parameters,
                               cv=5,
                               scoring=scoring,
                               refit='roc')
            clf.fit(x, y)

            importances = clf.best_estimator_.feature_importances_
            Rank = pd.DataFrame(importances,
                                index=feature_names,
                                columns=['importances']).sort_values('importances')
            self.select_col = list(Rank.index[-self.Top_N:])

        elif self.method == 'adaBoost':
            clf = GridSearchCV(esb.AdaBoostClassifier(),
                               param_grid=self.parameters,
                               cv=5,
                               scoring=scoring,
                               refit='roc')
            clf.fit(x, y)

            importances = clf.best_estimator_.feature_importances_
            Rank = pd.DataFrame(importances,
                                index=feature_names,
                                columns=['importances']).sort_values('importances')
            self.select_col = list(Rank.index[-self.Top_N:])

        elif self.method == 'gbm':
            clf = GridSearchCV(esb.GradientBoostingClassifier(),
                               param_grid=self.parameters,
                               cv=5,
                               scoring=scoring,
                               refit='roc')
            clf.fit(x, y)

            importances = clf.best_estimator_.feature_importances_
            Rank = pd.DataFrame(importances,
                                index=feature_names,
                                columns=['importances']).sort_values('importances')
            self.select_col = list(Rank.index[-self.Top_N:])

        elif self.method == 'xgb':
            clf = GridSearchCV(XGBRegressor(),
                               param_grid=self.parameters,
                               cv=5,
                               scoring=scoring,
                               refit='roc')
            clf.fit(x, y)

            importances = clf.best_estimator_.feature_importances_
            Rank = pd.DataFrame(importances,
                                index=feature_names,
                                columns=['importances']).sort_values('importances')
            self.select_col = list(Rank.index[-self.Top_N:])
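
    # The fit method above only records select_col; a hedged sketch of a matching
    # transform step (the method name and behaviour are assumptions, not original code):
    def transform(self, x):
        # keep only the columns chosen during fit
        return x[self.select_col]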
Example #12
def train():
  test_model = ensemble.GradientBoostingClassifier()
  person_table, condition_occurrence_table, outcome_cohort_table, measurement_table = util.load_data_set(TRAIN_DIR)
  measurement_table = util.preprocess_measurement(measurement_table)
  test_model = util.train_model(test_model,person_table, condition_occurrence_table, measurement_table, outcome_cohort_table)
  pickle.dump(test_model, open(os.path.join(VOL_DIR, 'model.dat'), 'wb'))  # save the trained model
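
# A hedged counterpart to train(): reload the pickled model for inference.
# VOL_DIR, os and pickle are assumed to be the same objects used above.
def load_model():
  return pickle.load(open(os.path.join(VOL_DIR, 'model.dat'), 'rb'))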
Example #13
# global_auc = auc(Y, predictions)


# print "  auc over folds: %0.4f (+/- %0.4f)" % (np.mean(fold_aucs), np.std(fold_aucs))
# print "  global auc: %0.4f" % global_auc

# tock()

# random forest training takes too long lol
# classifier training
print("Classifier training and evaluation through cross-validation")
n_folds = settings['n_folds']

# clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=4, verbose=1)
clf = ensemble.GradientBoostingClassifier(n_estimators=50, min_samples_split=2, max_features=int(np.sqrt(1000)))

# Run classifier with crossvalidation and plot ROC curves
cv = cross_validation.StratifiedKFold(Y, n_folds=n_folds)

def auc(t, y):
    fpr, tpr, thresholds = metrics.roc_curve(t, y)
    return metrics.auc(fpr, tpr)

predictions = np.zeros(Y.shape)
fold_aucs = []

for i, (train, test) in enumerate(cv):
    # scores = clf.fit(features[train], Y[train]).decision_function(features[test])
    scores = clf.fit(features[train], Y[train]).predict_proba(features[test])[:, 1]
Example #14
def adaboost_on_fold(feature_sets,
                     train,
                     test,
                     y,
                     y_all,
                     X,
                     dim,
                     dimsum,
                     learn_options,
                     classification=False):
    '''
    AdaBoostRegressor/Classifier from scikit-learn.
    '''

    if learn_options['adaboost_version'] == 'python':
        if not learn_options['adaboost_CV']:
            if not classification:
                clf = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    alpha=learn_options['adaboost_alpha'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    random_state=learn_options['seed'])
            else:
                clf = en.GradientBoostingClassifier(
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    random_state=learn_options['seed'])

            clf.fit(X[train], y[train].flatten())
            y_pred = clf.predict(X[test])[:, None]
        else:  # optimize the parameters of the adaboosted algorithm

            if learn_options["algorithm_hyperparam_search"] == "bo":
                print()

                from hyperopt import hp, fmin, tpe, rand

                def adaboost_scoring_bo(params):
                    # label_encoder = sklearn.preprocessing.LabelEncoder()
                    # label_encoder.fit(y_all['Target gene'].values[train])
                    # gene_classes = label_encoder.transform(y_all['Target gene'].values[train])
                    # n_folds = len(np.unique(gene_classes))
                    cv = sklearn.cross_validation.KFold(
                        y_all['Target gene'].values[train].shape[0],
                        n_folds=20,
                        shuffle=True)
                    est = en.GradientBoostingRegressor(
                        n_estimators=1000,
                        learning_rate=params['learning_rate'],
                        max_depth=params['max_depth'],
                        min_samples_leaf=params['min_samples_leaf'],
                        max_features=params['max_features'],
                        random_state=learn_options['seed'])
                    scorer = cross_val_score(est,
                                             X[train],
                                             y[train].flatten(),
                                             cv=cv,
                                             n_jobs=20)
                    return np.median(scorer)

                space = {
                    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
                    'max_depth': hp.quniform('max_depth', 1, 8, 1),
                    'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 20,
                                                    1),
                    'max_features': hp.uniform('max_features', 0.05, 1.0)
                }

                best = fmin(adaboost_scoring_bo,
                            space,
                            algo=tpe.suggest,
                            max_evals=50,
                            verbose=1)
                print(best)
                clf = en.GradientBoostingRegressor(
                    n_estimators=learn_options['adaboost_n_estimators'],
                    learning_rate=best['learning_rate'],
                    max_depth=best['max_depth'],
                    min_samples_leaf=best['min_samples_leaf'],
                    max_features=best['max_features'],
                    random_state=learn_options['seed'])

                clf.fit(X[train], y[train].flatten())
            elif learn_options["algorithm_hyperparam_search"] == "grid":
                assert not classification, "need to tweak code below to do classification, as above"
                n_jobs = 20

                print("Adaboost with GridSearch")
                from sklearn.grid_search import GridSearchCV
                param_grid = {
                    'learning_rate': [0.1, 0.05, 0.01],
                    'max_depth': [4, 5, 6, 7],
                    'min_samples_leaf': [5, 7, 10, 12, 15],
                    'n_estimators': [100, 500, 1000, 2000]
                }
                #              'max_features': [1.0, 0.5, 0.3, 0.1]}
                # param_grid = {'n_estimators': [100, ]
                #               'learning_rate': [0.1, 0.05, 0.001],
                #               'max_depth': [4, 7],
                #               'min_samples_leaf': [5, 15],
                #               'max_features': [1.0, 0.1]}

                # label_encoder = sklearn.preprocessing.LabelEncoder()
                # label_encoder.fit(y_all['Target gene'].values[train])
                # gene_classes = label_encoder.transform(y_all['Target gene'].values[train])
                n_folds = 10  # len(np.unique(gene_classes))
                # cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True)
                cv = sklearn.cross_validation.KFold(X[train].shape[0],
                                                    n_folds=n_folds,
                                                    shuffle=True)

                est = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    random_state=learn_options['seed']
                )  #, n_estimators=learn_options['adaboost_n_estimators'])
                clf = GridSearchCV(est,
                                   param_grid,
                                   n_jobs=n_jobs,
                                   verbose=1,
                                   cv=cv,
                                   scoring=spearman_scoring,
                                   iid=False)
                clf.fit(X[train], y[train].flatten())
                print(clf.best_params_)
            else:
                raise Exception(
                    "if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)"
                )

            y_pred = clf.predict(X[test])[:, None]
    else:
        raise NotImplementedError

    return y_pred, clf
Example #15
def DeepBoosting4(X_train,
                  y_train,
                  TrainMethod="GrowDeep",
                  n_estimators=5000000,
                  GrowDeep_max_iterPerDepthNUM=500,
                  GrowDeep_max_depthNUM=50,
                  GrowDeep_max_no_improvement=3,
                  GrowDeep_tol_no_improvement=0.00001,
                  GrowDeep_init_depth=1,
                  AllowGrowDeepRetrain=1,
                  validation_fraction=0.2,
                  n_iter_no_change=5,
                  tol=0.01,
                  tolAdjust=1,
                  LossEarlyStop="logloss",
                  random_state=0,
                  FixedDepth_max_depth=50,
                  learning_rate=0.01,
                  verbose=0,
                  CrossVali_random_state=1,
                  CrossVali_n_splits=2,
                  CrossVali_max_depth_list=[1],
                  CrossVali_n_estimators_list=[100, 250, 500, 750, 1000],
                  CrossVali_verbose=2,
                  n_jobs=1):

    if TrainMethod == "CrossValidateDepth":
        gbes_shallow = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
            random_state=random_state,
            learning_rate=learning_rate)

        param_grid = {
            'max_depth': CrossVali_max_depth_list
        }  #[1,2,3,4,5,6,7,8,9,10]
        scorers = {'accuracy_score': make_scorer(accuracy_score)}
        refit_score = 'accuracy_score'
        skf = StratifiedKFold(n_splits=CrossVali_n_splits,
                              random_state=CrossVali_random_state)
        grid_searchshallow = GridSearchCV(gbes_shallow,
                                          param_grid,
                                          scoring=scorers,
                                          refit=refit_score,
                                          cv=skf,
                                          return_train_score=True,
                                          n_jobs=n_jobs,
                                          verbose=CrossVali_verbose)
        grid_searchshallow.fit(X_train, y_train)
        if verbose == 1:
            print("print(grid_searchshallow.best_params_)=",
                  grid_searchshallow.best_params_)
            print("grid_searchv.score(X_train, y_train)=",
                  grid_searchshallow.best_estimator_.score(X_train, y_train))
            print("n_estimators =",
                  grid_searchshallow.best_estimator_.n_estimators_)
        return grid_searchshallow

    if TrainMethod == "FixedDepth":
        gbes_deeptree = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            max_depth=FixedDepth_max_depth,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
            random_state=random_state,
            learning_rate=learning_rate)
        gbes_deeptree.fit(X_train, y_train)
        if verbose == 1:
            print("gbes_deeptree.score(X_train, y_train)=",
                  gbes_deeptree.score(X_train, y_train))
            print("n_estimators =", gbes_deeptree.n_estimators_)
        return gbes_deeptree

    if TrainMethod == "CrossValidateDepthAndNumIterations":
        gbes_shallowNumIters = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            random_state=random_state,
            learning_rate=learning_rate)

        param_grid = {
            'max_depth': CrossVali_max_depth_list,
            'n_estimators': CrossVali_n_estimators_list
        }  #[1,2,3,4,5,6,7,8,9,10]
        scorers = {'accuracy_score': make_scorer(accuracy_score)}
        refit_score = 'accuracy_score'
        skf = StratifiedKFold(n_splits=CrossVali_n_splits,
                              random_state=CrossVali_random_state)
        gbes_searchshallowNumIters = GridSearchCV(gbes_shallowNumIters,
                                                  param_grid,
                                                  scoring=scorers,
                                                  refit=refit_score,
                                                  cv=skf,
                                                  return_train_score=True,
                                                  n_jobs=n_jobs,
                                                  verbose=CrossVali_verbose)
        gbes_searchshallowNumIters.fit(X_train, y_train)
        if verbose == 1:
            print("print(gbes_searchshallowNumIters.best_params_)=",
                  gbes_searchshallowNumIters.best_params_)
            print(
                "grid_searchvNumIters.score(X_train, y_train)=",
                gbes_searchshallowNumIters.best_estimator_.score(
                    X_train, y_train))
            print("n_estimators =",
                  gbes_searchshallowNumIters.best_estimator_.n_estimators_)
        return gbes_searchshallowNumIters

    if TrainMethod == "GrowDeep":
        TotalNumWeakLearners = 0
        gbes_grow = ensemble.GradientBoostingClassifier(
            n_estimators=1,
            max_depth=GrowDeep_init_depth,
            random_state=random_state,
            warm_start=True,
            learning_rate=learning_rate)
        gbes_grow.fit(X_train, y_train)
        if LossEarlyStop == "accuracy":
            NEWSCORE = gbes_grow.score(X_train, y_train)
        if LossEarlyStop == "logloss":
            y_pred = gbes_grow.predict_proba(X_train)
            NEWSCORE = 1 - log_loss(y_train, y_pred)
        TotalNumWeakLearners += 1
        if TotalNumWeakLearners >= n_estimators or NEWSCORE == 1.0:
            print("NEWSCORE=100%")
            return gbes_grow
        ##fit with early stop##
        no_improvement_counter_EarlyStop = 0
        DIFFSCORERUNSUM = 0
        for iterNUM in range(GrowDeep_max_iterPerDepthNUM):
            #_ = gbes_grow.set_params(n_estimators=1,  warm_start=True)  # set warm_start and new params of trees
            gbes_grow.n_estimators += 1
            _ = gbes_grow.fit(X_train, y_train)  # fit additional  trees to est
            TotalNumWeakLearners += 1
            if TotalNumWeakLearners >= n_estimators or NEWSCORE == 1.0:
                print("NEWSCORE=100%")
                return gbes_grow
            OLDSCORE = NEWSCORE
            if LossEarlyStop == "accuracy":
                NEWSCORE = gbes_grow.score(X_train, y_train)
            if LossEarlyStop == "logloss":
                y_pred = gbes_grow.predict_proba(X_train)
                NEWSCORE = 1 - log_loss(y_train, y_pred)
            DIFFSCORE = NEWSCORE - OLDSCORE
            DIFFSCORERUNSUM += DIFFSCORE
            if verbose >= 2:
                print("NEWSCORE at each early stop=", NEWSCORE)
                print("DIFFSCORE at each early stop=", DIFFSCORE)
                print("DIFFSCORERUNSUM at each early stop=", DIFFSCORERUNSUM)
            if (DIFFSCORERUNSUM) > tol:
                no_improvement_counter_EarlyStop = 0  # reset this counter if there is improvement.
                DIFFSCORERUNSUM = 0
            if (DIFFSCORERUNSUM) < tol:
                no_improvement_counter_EarlyStop += 1
            if no_improvement_counter_EarlyStop == n_iter_no_change:
                break
        if verbose >= 1:
            print("n_estimators for depth" + str(1) + "=",
                  gbes_grow.n_estimators_)
            print("TotalNumWeakLearners", TotalNumWeakLearners)
            print("NEWSCORE", NEWSCORE)
        ##fit with early stop##
        #if LossDepth=="accuracy":
        #    NEWSCOREdepth=gbes_grow.score(X_train, y_train)
        #if LossDepth=="logloss":
        #    y_pred=gbes_grow.predict_proba(X_train)
        #    NEWSCOREdepth=1-log_loss(y_train, y_pred)
        NEWSCOREdepth = gbes_grow.score(X_train, y_train)  #NEWSCORE
        if tolAdjust == 1 and (1 - NEWSCOREdepth) < tol:
            tol = tol / 2
        if verbose >= 1:
            print("NEWSCOREdepth", NEWSCOREdepth)
        if NEWSCOREdepth == 1.0:
            return gbes_grow
        no_improvement_counter = 0
        RetrainFLAG = 0
        for depthNUM in range(GrowDeep_max_depthNUM):
            _ = gbes_grow.set_params(
                max_depth=depthNUM + 2,
                warm_start=True)  # set warm_start and new params of trees
            ##fit with early stop##
            no_improvement_counter_EarlyStop = 0
            DIFFSCORERUNSUM = 0
            for iterNUM in range(GrowDeep_max_iterPerDepthNUM):
                #_ = gbes_grow.set_params(n_estimators=1,  warm_start=True)  # set warm_start and new params of trees
                gbes_grow.n_estimators += 1
                _ = gbes_grow.fit(X_train,
                                  y_train)  # fit additional  trees to est
                TotalNumWeakLearners += 1
                if TotalNumWeakLearners >= n_estimators or NEWSCORE == 1.0:
                    print("NEWSCORE=100%")
                    return gbes_grow
                OLDSCORE = NEWSCORE
                if LossEarlyStop == "accuracy":
                    NEWSCORE = gbes_grow.score(X_train, y_train)
                if LossEarlyStop == "logloss":
                    y_pred = gbes_grow.predict_proba(X_train)
                    NEWSCORE = 1 - log_loss(y_train, y_pred)
                DIFFSCORE = NEWSCORE - OLDSCORE
                DIFFSCORERUNSUM += DIFFSCORE
                if (DIFFSCORERUNSUM) > tol:
                    no_improvement_counter_EarlyStop = 0  # reset this counter if there is improvement.
                    DIFFSCORERUNSUM = 0
                if (DIFFSCORERUNSUM) < tol:
                    no_improvement_counter_EarlyStop += 1
                if no_improvement_counter_EarlyStop == n_iter_no_change:
                    break
            if verbose >= 1:
                print("n_estimators for depth" + str(depthNUM + 2) + "=",
                      gbes_grow.n_estimators_)
                print("TotalNumWeakLearners", TotalNumWeakLearners)
                print("NEWSCORE", NEWSCORE)
            ##fit with early stop##
            OLDSCOREdepth = NEWSCOREdepth
            #if LossDepth=="accuracy":
            #    NEWSCOREdepth=gbes_grow.score(X_train, y_train)
            #if LossDepth=="logloss":
            #    y_pred=gbes_grow.predict_proba(X_train)
            #    NEWSCOREdepth=1-log_loss(y_train, y_pred)
            NEWSCOREdepth = gbes_grow.score(X_train, y_train)  #NEWSCORE
            if tolAdjust == 1 and (1 - NEWSCOREdepth) < tol:
                tol = tol / 2
            if verbose >= 1:
                print("NEWSCOREdepth", NEWSCOREdepth)
            if NEWSCOREdepth == 1.0:
                break
            DIFFSCOREdepth = NEWSCOREdepth - OLDSCOREdepth
            if DIFFSCOREdepth >= 0 and (
                    DIFFSCOREdepth) > GrowDeep_tol_no_improvement:
                no_improvement_counter = 0  # reset this counter if there is improvement.
            if DIFFSCOREdepth >= 0 and (
                    DIFFSCOREdepth) < GrowDeep_tol_no_improvement:
                no_improvement_counter += 1
                if no_improvement_counter == GrowDeep_max_no_improvement:
                    break
            if DIFFSCOREdepth < 0:
                Retrain_iter = depthNUM + 1
                RetrainFLAG = 1
                break

        return gbes_grow
    print('Accuracy:', test_ada_score)

    # Cross-validation
    scores = model_selection.cross_val_score(ada, X_train, y_train, cv=10)
    # Mean accuracy: 0.7965240641711229
    print('Mean accuracy:', scores.mean())

    # Random forest
    rfc = ensemble.RandomForestClassifier(n_estimators=100, random_state=66)
    rfc = rfc.fit(X_train, y_train)
    rfc_score = rfc.score(X_test, y_test)
    # Random forest accuracy: 0.8026905829596412
    print('Random forest accuracy:', rfc_score)

    # Gradient boosting
    gbc = ensemble.GradientBoostingClassifier(random_state=30).fit(
        X_train, y_train)
    score = gbc.score(X_test, y_test)
    # Gradient boosting accuracy: 0.8475336322869955
    print('Gradient boosting accuracy:', score)

    # AdaBoost with a random forest as the base estimator
    rfc = ensemble.RandomForestClassifier(n_estimators=100,
                                          random_state=88,
                                          n_jobs=-1)
    ada_rfc = ensemble.AdaBoostClassifier(rfc, n_estimators=100).fit(
        X_train, y_train)
    score = ada_rfc.score(X_test, y_test)
    # AdaBoost + random forest accuracy: 0.7757847533632287
    print('AdaBoost + random forest accuracy:', score)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# In[877]:

# run a simple model
params = {
    'n_estimators': 20,
    'max_leaf_nodes': 6,
    'learning_rate': 0.1,
    'random_state': 1,
    'max_features': 21
}
classifier = ensemble.GradientBoostingClassifier(**params)
classifier.fit(X_train, y_train)

# In[881]:

# calculate AUC
from sklearn.metrics import roc_auc_score
from sklearn import ensemble
import joblib
roc_auc_score(Y, classifier.predict_proba(X)[:, 1])  # use predicted probabilities for AUC

# In[ ]:

# StandardScaler(): scaling gives every variable comparable weight, so the
# optimization is not dominated by large-valued features and converges to a consistent optimum
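
# A quick sanity check of that scaling (a sketch; assumes the standardized
# X_train from the StandardScaler step above is still in scope):
import numpy as np
print(np.round(X_train.mean(axis=0), 3))  # ~0 for every standardized column
print(np.round(X_train.std(axis=0), 3))   # ~1 for every standardized column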
yTest1 = test_region1[['Maintenance_flag']]
test_region1 = test_region1.drop(test_region1[['Maintenance_flag']], axis=1)
test_region1 = test_region1[[
    'Vibration', 'Engine_RPM', 'Speed_OBD', 'Ambient_air_temp', 'Speed_GPS',
    'Vehicle_speed_sensor', 'Throttle_Pos_Manifold', 'Mass_Air_Flow_Rate'
]]

train1 = yTrain1.loc[yTrain1['Maintenance_flag'] == 1]
claimTrain1 = len(train1.Maintenance_flag) / len(yTrain1.Maintenance_flag)

Y_target_train1 = yTrain1.Maintenance_flag
Y_target_test1 = yTest1.Maintenance_flag

gbm1 = ensemble.GradientBoostingClassifier(loss='deviance',
                                           criterion='mse',
                                           n_estimators=1500,
                                           max_leaf_nodes=5,
                                           verbose=1)
fit_gbm1 = gbm1.fit(train_region1, Y_target_train1)
Y_predict1 = fit_gbm1.predict(test_region1)
Y_predProb1 = fit_gbm1.predict_proba(test_region1)

Y_probab1 = pd.DataFrame({
    0: Y_predProb1[:, 0],
    1: Y_predProb1[:, 1]
},
                         index=Y_target_test1.index.copy())

# roc values
fpr2, tpr2, threshold2 = metrics.roc_curve(Y_target_test1, Y_probab1[1])
Example #19
features /= (fmaxs - fmins).reshape(1, -1)

tock()

print("Do some memory cleanup")
del X
del X_downsampled
del X_specgram
del patches

# classifier training
print("Classifier training")

# clf = svm.LinearSVC(C=10e-3)
# clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=4, verbose=1)
clf = ensemble.GradientBoostingClassifier(n_estimators=100, verbose=1)
clf.fit(features, Y)

tock()

print("Further cleanup: no longer need training data")
del features
del Y

# load test data
print("Load test data")
X_test = np.load(TEST_DATA_PATH).astype('float32')

tock()

# downsample
Example #20
train_data_norm, test_data_norm = normalize(train_data, test_data)

#take out the data size

#add one extra column 1s at the beginning of the data
train_data = train_data_norm
test_data = test_data_norm

iteration_list = [10, 30, 100, 300]
train_err_list = []
test_err_list = []

for i in iteration_list:
    #training the logistic boosting
    logboost = ensemble.GradientBoostingClassifier(loss='deviance',
                                                   learning_rate=0.1,
                                                   n_estimators=i)
    logboost.fit(train_data, train_labels.values.ravel())

    #predicting
    train_pred = logboost.predict(train_data)
    test_pred = logboost.predict(test_data)

    #evaluate the error
    train_error = 1 - accuracy_score(train_labels, train_pred)
    test_error = 1 - accuracy_score(test_labels, test_pred)
    if (i == 300):
        plt.plot(np.arange(i) + 1, logboost.train_score_)
        plt.xlabel('Iterations')
        plt.ylabel('Train loss')
        plt.show()
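
# A complementary curve (not in the original): held-out error per boosting stage
# via staged_predict, using the last model fitted in the loop above:
staged_test_err = [1 - accuracy_score(test_labels, pred)
                   for pred in logboost.staged_predict(test_data)]
plt.plot(np.arange(len(staged_test_err)) + 1, staged_test_err)
plt.xlabel('Iterations')
plt.ylabel('Test error')
plt.show()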
Example #21
from sklearn.datasets import fetch_openml
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X / 255.

from sklearn import ensemble
from sklearn.kernel_approximation import Nystroem
Estimators = [50, 70]
Learning_rates = [0.1, 0.15, 0.20]
Max_depths = [1, 3]
#Leafs = [1]

#feature_map_nystroem = Nystroem()
#data_transformed = feature_map_nystroem.fit_transform(X)

X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

for estimate in Estimators:
    for lrates in Learning_rates:
        for max_depth in Max_depths:
            clf = ensemble.GradientBoostingClassifier(n_estimators=estimate,
                                                      learning_rate=lrates,
                                                      max_depth=max_depth)
            clf.fit(X_train, y_train)
            print("ERROR RATE FOR No_Estimators: " + str(estimate) +
                  ", Learning rate: " + str(lrates) + ", Maximum depth: " +
                  str(max_depth) + " is")
            y_pred = clf.predict(X_test)
            from sklearn import metrics
            print((1 - metrics.accuracy_score(y_test, y_pred)) * 100)
            if score == float(1):
                sigtest.append(entry)
                sigtestscore.append(1.)

            elif score == float(0):
                bkgtest.append(entry)
                bkgtestscore.append(0.)

print(time.asctime(time.localtime()), "Datasets produced!")

print(time.asctime(time.localtime()), "Training BDT")

#Train the BDT (Gradient Boosting Classifier)  and save

clf = ensemble.GradientBoostingClassifier(max_depth=8,
                                          n_estimators=100,
                                          learning_rate=0.008)
clf.fit(full, fullscore)

joblib.dump(clf, '/nfs/astrop/d6/rstein/BDTpickle/DCpixelclassifier.pkl')

print(time.asctime(time.localtime()), "BDT Trained")

print("Score on whole training sample is", clf.score(full, fullscore))
print("Score on whole test sample is", clf.score(fulltest, fulltestscore))
print("Score on training signal is ", clf.score(sig, sigscore))
print("Score on test signal is ", clf.score(sigtest, sigtestscore))
print("Score on training background is ", clf.score(bkg, bkgscore))
print("Score on test background is ", clf.score(bkgtest, bkgtestscore))

importances = clf.feature_importances_
Example #23
                                                data_Y,
                                                test_size=0.33,
                                                random_state=42)
########################################################################
########################################################################
########################################################################
params = {
    'n_estimators': 10,
    'max_depth': 3,
    'subsample': 0.5,
    'learning_rate': 0.89,
    'min_samples_leaf': 1,
    'random_state': 5
}

clf1 = ensemble.GradientBoostingClassifier(**params)
clf2 = BernoulliNB()
clf3 = DecisionTreeClassifier(random_state=0)
clf4 = svm.SVC(kernel='rbf', probability=True)
clf5 = SGDClassifier(loss="modified_huber", penalty='l1')
clf6 = RandomForestClassifier(n_estimators=9)
clf7 = ensemble.AdaBoostClassifier()
clf8 = svm.SVC(kernel='linear', probability=True)
clf9 = MLPClassifier(solver='lbfgs',
                     alpha=1e-5,
                     hidden_layer_sizes=(150, 50, 15, 5, 3),
                     random_state=1)
clf10 = neighbors.KNeighborsClassifier(n_neighbors=5)
clf11 = GaussianNB()
clf12 = LinearDiscriminantAnalysis()
clf13 = QuadraticDiscriminantAnalysis()
Example #24
def gbdt(x, y):
    clf = ensemble.GradientBoostingClassifier(n_estimators=100,
                                              learning_rate=0.1,
                                              max_depth=5,
                                              random_state=0).fit(x, y)
    return clf
Example #25
    feature_to_pick = 250
    feature_top_n = get_top_n_features(titanic_train_data_X, titanic_train_data_Y, feature_to_pick)
    print('Total Feature:' + str(combined_train_test.shape))
    print('Picked Feature' + str(feature_top_n.shape))

    titanic_train_data_X = titanic_train_data_X[feature_top_n]
    del titanic_train_data_X['Ticket_Number']
    titanic_test_data_X = titanic_test_data_X[feature_top_n]
    del titanic_test_data_X['Ticket_Number']

    # 14. Build the models
    rf_est = ensemble.RandomForestClassifier(n_estimators=750, criterion='gini', max_features='sqrt',
                                             max_depth=3, min_samples_split=4, min_samples_leaf=2,
                                             n_jobs=50, random_state=42, verbose=1)
    gbm_est = ensemble.GradientBoostingClassifier(n_estimators=900, learning_rate=0.0008, loss='exponential',
                                                  min_samples_split=3, min_samples_leaf=2, max_features='sqrt',
                                                  max_depth=3, random_state=42, verbose=1)
    et_est = ensemble.ExtraTreesClassifier(n_estimators=750, max_features='sqrt', max_depth=35, n_jobs=50,
                                           criterion='entropy', random_state=42, verbose=1)

    voting_est = ensemble.VotingClassifier(estimators=[('rf', rf_est), ('gbm', gbm_est), ('et', et_est)],
                                           voting='soft', weights=[3, 5, 2],
                                           n_jobs=50)
    voting_est.fit(titanic_train_data_X, titanic_train_data_Y)
    print('VotingClassifier Score:' + str(voting_est.score(titanic_train_data_X, titanic_train_data_Y)))
    print('VotingClassifier Estimators:' + str(voting_est.estimators_))

    # Predict
    titanic_test_data_X['Survived'] = voting_est.predict(titanic_test_data_X)

    submission = pd.DataFrame({'PassengerId': test_data_org.loc[:, 'PassengerId'],
featureSet = pd.DataFrame(columns=('url','no of dots','presence of hyphen','len of url','presence of at',\
'presence of double slash','no of subdir','no of subdomain','len of domain','no of queries','is IP','presence of Suspicious_TLD',\
'presence of suspicious domain','label'))

for i in range(len(df)):
    features = getFeatures(df["URL"].loc[i], df["Lable"].loc[i])
    featureSet.loc[i] = features

featureSet.groupby(featureSet['label']).size()
X = featureSet.drop(['url', 'label'], axis=1).values
y = featureSet['label'].values
model = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
    "Adaboost": ek.AdaBoostClassifier(n_estimators=50),
    "GradientBoosting": ek.GradientBoostingClassifier(n_estimators=50),
    "GNB": GaussianNB(),
    "LogisticRegression": LogisticRegression()
}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

results = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results[algo] = score

winner = max(results, key=results.get)

clf = model[winner]
        pred_instance_name_arr.append([row[0].strip()])

pred_data_instance_names = vstack(pred_instance_name_arr)
pred_data_features = vstack(pred_instance_feature_arr)
# print(pred_data_instance_names)
# print(pred_data_features)
# ---------------

train_data = vstack(data_arr)
instance_name = vstack(instance_name_arr)
instance_class = vstack(instance_class_arr).ravel()
# print(instance_name)
# print(instance_class)

# Create a gradient boost classifier object
gboostT = ensemble.GradientBoostingClassifier()

# Evaluate model with cross-validation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
n_scores = cross_val_score(gboostT, train_data, instance_class, scoring='accuracy', cv=cv, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Create a gradient boost classifier object
gboostT = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=12)
# Train and measure model performance
score = gboostT.fit(train_data, instance_class).score(train_data, instance_class)
print("Score: ", score)
"""

score = gboostT.fit(train_data, instance_class)
# predict the response for data2
Example #28
def ecologyGBM(ip, port):

    #Log.info("Importing ecology_model.csv data...\n")
    ecology_train = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))
    #Log.info("Summary of the ecology data from h2o: \n")
    #ecology.summary()

    # Log.info("==============================")
    # Log.info("H2O GBM Params: ")
    # Log.info("x = ecology_train[2:14]")
    # Log.info("y = ecology_train["Angaus"]")
    # Log.info("ntrees = 100")
    # Log.info("max_depth = 5")
    # Log.info("min_rows = 10")
    # Log.info("learn_rate = 0.1")
    # Log.info("==============================")
    # Log.info("==============================")
    # Log.info("scikit GBM Params: ")
    # Log.info("learning_rate=0.1")
    # Log.info("n_estimators=100")
    # Log.info("max_depth=5")
    # Log.info("min_samples_leaf = 10")
    # Log.info("n.minobsinnode = 10")
    # Log.info("max_features=None")
    # Log.info("==============================")

    ntrees = 100
    max_depth = 5
    min_rows = 10
    learn_rate = 0.1

    # Prepare data for scikit use
    trainData = np.genfromtxt(
        h2o.locate("smalldata/gbm_test/ecology_model.csv"),
        delimiter=',',
        dtype=None,
        names=("Site", "Angaus", "SegSumT", "SegTSeas", "SegLowFlow", "DSDist",
               "DSMaxSlope", "USAvgT", "USRainDays", "USSlope", "USNative",
               "DSDam", "Method", "LocSed"),
        skip_header=1,
        missing_values=('NA'),
        filling_values=(np.nan))
    trainDataResponse = trainData["Angaus"]
    trainDataFeatures = trainData[[
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"
    ]]

    ecology_train["Angaus"] = ecology_train["Angaus"].asfactor()
    # Train H2O GBM Model:
    gbm_h2o = h2o.gbm(x=ecology_train[2:],
                      y=ecology_train["Angaus"],
                      ntrees=ntrees,
                      learn_rate=learn_rate,
                      max_depth=max_depth,
                      min_rows=min_rows,
                      distribution="bernoulli")

    # Train scikit GBM Model:
    gbm_sci = ensemble.GradientBoostingClassifier(learning_rate=learn_rate,
                                                  n_estimators=ntrees,
                                                  max_depth=max_depth,
                                                  min_samples_leaf=min_rows,
                                                  max_features=None)
    gbm_sci.fit(trainDataFeatures[:, np.newaxis], trainDataResponse)

    # Evaluate the trained models on test data
    # Load the test data (h2o)
    ecology_test = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/ecology_eval.csv"))

    # Load the test data (scikit)
    testData = np.genfromtxt(h2o.locate("smalldata/gbm_test/ecology_eval.csv"),
                             delimiter=',',
                             dtype=None,
                             names=("Angaus", "SegSumT", "SegTSeas",
                                    "SegLowFlow", "DSDist", "DSMaxSlope",
                                    "USAvgT", "USRainDays", "USSlope",
                                    "USNative", "DSDam", "Method", "LocSed"),
                             skip_header=1,
                             missing_values=('NA'),
                             filling_values=(np.nan))
    testDataResponse = testData["Angaus"]
    testDataFeatures = testData[[
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed"
    ]]

    # Score on the test data and compare results

    # scikit
    auc_sci = roc_auc_score(
        testDataResponse,
        gbm_sci.predict_proba(testDataFeatures[:, np.newaxis])[:, 1])

    # h2o
    gbm_perf = gbm_h2o.model_performance(ecology_test)
    auc_h2o = gbm_perf.auc()

    #Log.info(paste("scikit AUC:", auc_sci, "\tH2O AUC:", auc_h2o))
    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
Example #29
all_models = {
    "1Dummy:Majority":
    dummy.DummyClassifier(strategy='most_frequent'),
    "1Dummy:Stratified":
    dummy.DummyClassifier(random_state=0),
    "DT:default":
    tree.DecisionTreeClassifier(
        random_state=0
    ),  # TODO Let's start like that. We will configure later.

    ##"DT-": tree.DecisionTreeClassifier(), # TODO Let's start like that. We will configure later.
    "RF:default":
    ensemble.RandomForestClassifier(n_estimators=10, random_state=0),
    "GBT:default":
    ensemble.GradientBoostingClassifier(random_state=0),
    "LR:default":
    linear_model.LogisticRegression(solver='liblinear',
                                    multi_class='ovr',
                                    random_state=0),
}

# ==================== GRID SEARCH ====================================
# ======== Logistic regression
regularization_pars = (1e-6, 3e-6, 6e-6, 1e-5, 3e-5, 6e-5, 1e-4, 3e-4, 6e-4,
                       1e-3, 3e-3, 6e-3, 1e-2, 3e-2, 6e-2, 1e-1, 3e-1, 6e-1, 1,
                       3, 6, 10, 30, 60, 100, 300, 600)
for penalty in ('l2', 'l1'):
    for reg in regularization_pars:
        all_models["LR:"+penalty+"-"+str(reg)] = \
            linear_model.LogisticRegression(solver='liblinear', multi_class='ovr', random_state=0, C=reg,
    pred = [most_common(x) for x in zip(*res)]
    f = open('predictions.csv', 'w')
    f.write("ID,Category\n")

    for i, res in enumerate(pred):
        f.write("%d,%d\n" % (i + 1, res))

    f.close()


train = np.load('train.npy')
# Remove the labels
test = np.load('test_distribute.npy')[:, 1:]

data = train[:, 1:]
target = train[:, 0]

clfs = []

# Through cv testing, I found the optimal number of estimators to be 15
clfs.append(ensemble.ExtraTreesClassifier(n_estimators=100))
clfs.append(ensemble.GradientBoostingClassifier(n_estimators=125))
clfs.append(ensemble.AdaBoostClassifier(n_estimators=100))

predictificate(data, target, test, clfs)

# I use the following code to find good hyperparameter values
#scores = cross_validation.cross_val_score(
#clf, data, target, cv=5)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
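
# An executable sketch of that hyperparameter check, written against the modern
# model_selection module rather than the old cross_validation import:
from sklearn import model_selection
clf = ensemble.GradientBoostingClassifier(n_estimators=125)
scores = model_selection.cross_val_score(clf, data, target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))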