Exemple #1
0
0 / 0
# Keep only the top-ranked features (union of several tree-ensemble rankings).
feature_top_n = get_top_n_features(titanic_train_data_X, titanic_train_data_Y,
                                   feature_to_pick)

# Restrict train and test matrices to the selected feature columns.
titanic_train_data_X = titanic_train_data_X[feature_top_n]
titanic_test_data_X = titanic_test_data_X[feature_top_n]

#oversampling
#titanic_train_data_X, titanic_train_data_Y = RandomOverSampler().fit_sample(titanic_train_data_X, titanic_train_data_Y)
# Balance the classes with SMOTE before training.
# NOTE(review): imbalanced-learn renamed `fit_sample` to `fit_resample` in
# version 0.4 — confirm the pinned imblearn version still exposes fit_sample.
titanic_train_data_X, titanic_train_data_Y = SMOTE().fit_sample(
    titanic_train_data_X, titanic_train_data_Y)
#voting
#xgb_est = xgb.XGBClassifier(learning_rate=0.03, random_state=3, n_estimators=900, subsample=0.8, n_jobs = 50,colsample_bytree = 0.8, max_depth = 10, verbose=1)
#svm_est = svm.SVC(kernel='rbf', gamma = 1e-3, C =100)
# Base estimators for the (voting) ensemble built further below.
ada_est = ensemble.AdaBoostClassifier(n_estimators=1000,
                                      random_state=3,
                                      learning_rate=0.1)
rf_est = ensemble.RandomForestClassifier(n_estimators=1000,
                                         criterion='gini',
                                         max_features='sqrt',
                                         max_depth=10,
                                         min_samples_split=4,
                                         min_samples_leaf=20,
                                         n_jobs=50,
                                         random_state=42,
                                         verbose=1)
gbm_est = ensemble.GradientBoostingClassifier(n_estimators=1000,
                                              learning_rate=0.003,
                                              loss='exponential',
                                              min_samples_split=3,
                                              min_samples_leaf=20,
Exemple #2
0
# Vectorize the held-out text with the already-fitted CountVectorizer.
testing_data = count_vect.transform(X_test)
print(testing_data)
#training_data = training_data.astype(float)

#y = y.as_matrix().astype(np.float)
# Quick shape sanity checks (values only displayed in interactive sessions).
X_train_counts.shape
X_train.shape
y_train.shape
# Replace NaNs in labels and in the sparse matrix data so fitting cannot fail.
y_train = np.nan_to_num(y_train)
np.isnan(y_train).any()
training_data.data = np.nan_to_num(training_data.data)
##################################

#ada boost
# AdaBoost over decision trees; the `base_estimator__*` grid keys below tune
# the inner tree through the AdaBoost wrapper.
ada_estimator = ensemble.AdaBoostClassifier(
    base_estimator=tree.DecisionTreeClassifier(), random_state=100)
ada_grid = {
    'n_estimators': list(range(50, 101, 50)),
    'learning_rate': [0.1, 0.2, 1.0],
    'base_estimator__max_depth': [1, 3, 5],
    'base_estimator__criterion': ['entropy', 'gini']
}
# 10-fold accuracy-scored grid search; train scores kept for overfit checks.
ada_grid_estimator = model_selection.GridSearchCV(ada_estimator,
                                                  ada_grid,
                                                  scoring='accuracy',
                                                  cv=10,
                                                  return_train_score=True)
ada_grid_estimator.fit(training_data, y_train)

print(ada_grid_estimator.best_score_)
print(ada_grid_estimator.best_params_)
Exemple #3
0
from sklearn import ensemble

import utils

# Minimal baseline: fit a default AdaBoost model and emit a submission file.

training_points, training_labels = utils.get_training_data('../train_2008.csv')
test_points = utils.get_test_points('../test_2008.csv')

clf = ensemble.AdaBoostClassifier()  # uses DT by default
clf.fit(training_points, training_labels)

# utils formats clf.predict(test_points) into the submission format.
utils.prepare_submission_sklearn(clf.predict, test_points)
 np.random.seed(10)
 #fetch dataframes
 df_train = pd.read_csv(loc_train)
 df_test = pd.read_csv(loc_test)
 #shuffle train df to prevent malordered samples
 df_train = df_train.reindex(np.random.permutation(df_train.index))
 #get the feature columns
 feature_cols = [col for col in df_train.columns if col not in ['class']]
 #create a train and test set
 X_train = df_train[feature_cols]
 X_test = df_test[feature_cols]
 #fetch the labels into 'y'
 y = df_train['class']
 #classifier config and fitting
 # AdaBoost over a large entropy random forest (Python 2 script: print
 # statements and str writes to a 'wb' file handle below).
 clf_base = ensemble.RandomForestClassifier(n_estimators=1050,
                                            criterion="entropy",
                                            max_features=None,
                                            random_state=777,
                                            n_jobs=-1)
 clf = ensemble.AdaBoostClassifier(clf_base,
                                   n_estimators=4,
                                   random_state=93)
 print "\nFitting:\n", clf, "\non train set shaped:\n", X_train.shape
 clf.fit(X_train, y)
 #predicting and storing results
 # Write Id,Class rows using the probability of the positive class (val[1]).
 with open(loc_submission, "wb") as outfile:
     print "\nPredicting on test set shaped:\n", X_test.shape, "\nWriting to:", outfile
     outfile.write("Id,Class\n")
     for e, val in enumerate(clf.predict_proba(X_test)):
         outfile.write("%s,%f\n" % (float(e + 1), float(val[1])))
 print "\nScript running time:", datetime.now() - start
def EntireDataset():
    """Train each enabled classifier on the full dataset and report accuracy.

    Interactively prompts (drag-and-drop paths) for the feature-selected
    dataset CSV and the labels CSV, fits every estimator in ``estimators``
    on the entire dataset, scores it on the same data (training accuracy,
    not a generalization estimate), writes the per-sample results to CSV
    and pickles the last fitted classifier.
    """
    # Drag-and-drop paths may arrive wrapped in quotes/spaces; strip them.
    a = input('Click and drag FEATURE SELECTED ENTIRE DATASET file here: ')
    a = a.strip('\' ')
    data = pd.read_csv(a, encoding='utf-8').set_index('PATIENT')

    b = input('Click and drag LABELS file here: ')
    b = b.strip('\' ')
    labels_df = pd.read_csv(b, encoding='utf-8').set_index('PATIENT')
    # First (and presumably only) label column — TODO confirm layout.
    labels = np.array(labels_df[labels_df.columns[0]])

    nfeatsmax = len(data.columns)
    # MLP hidden-layer width: roughly two thirds of the feature count.
    nfeatsneural = round((nfeatsmax * 2 / 3))

    # Candidate estimators; only the ones left uncommented in `estimators`
    # below are actually trained.
    rf = ensemble.RandomForestClassifier(max_features=nfeatsmax,
                                         max_depth=5,
                                         bootstrap=False)
    et = ensemble.ExtraTreesClassifier(max_features=nfeatsmax,
                                       max_depth=5,
                                       bootstrap=False)
    kn = neighbors.KNeighborsClassifier(n_neighbors=nfeatsmax, p=1)
    nb = naive_bayes.GaussianNB()
    dt = tree.DecisionTreeClassifier(max_features=nfeatsmax,
                                     max_depth=5,
                                     criterion='entropy')
    ls = svm.LinearSVC(penalty='l1', dual=False)
    gb = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=2)
    nn = neural_network.MLPClassifier(hidden_layer_sizes=(
        nfeatsneural,
        nfeatsneural,
        nfeatsneural,
    ),
                                      learning_rate_init=0.0001,
                                      max_iter=500)
    ab = ensemble.AdaBoostClassifier()
    bc = ensemble.BaggingClassifier(base_estimator=rf)
    vc = ensemble.VotingClassifier(estimators=[('gb', gb), ('ab', ab),
                                               ('bc', bc)],
                                   voting='soft')

    estimators = {  #'randomforest': rf,
        #'extratrees': et,
        #'kneighbors': kn,
        'naivebayes': nb,
        #'decisiontree': dt,
        #'linearsvc': ls,
        #'gboost': gb,
        #'neuralnet': nn,
        #'adaboost': ab,
        #'bagging': bc,
        #'voting': vc,
    }

    # Long-format results: one row per (estimator, subject).
    results = {
        'estimator': [],
        'subjects': [],
        'labels': [],
        'predictions': [],
        'scores': [],
        'attempts': []
    }

    for name, est in estimators.items():
        est.fit(data, labels)
        predict_train = est.predict(data)
        # 1 for a correct training prediction, 0 otherwise.
        train_scores = [
            1 if x == y else 0 for x, y in zip(labels, predict_train)
        ]
        results['estimator'].extend([name] * len(data))
        results['subjects'].extend(data.index)
        results['labels'].extend(labels)
        results['predictions'].extend(predict_train)
        results['scores'].extend(train_scores)
        results['attempts'].extend([1] * len(data))

    results_df = pd.DataFrame.from_dict(results).set_index('subjects')
    results_df.to_csv(
        path_or_buf=
        '/media/james/ext4data/current/projects/pfizer/combined-study/entire_dataset_results.csv'
    )

    # NOTE: only the estimator from the *last* loop iteration is persisted
    # (matches the original behavior; fine while a single entry is enabled).
    with open(
            '/media/james/ext4data/current/projects/pfizer/combined-study/trainedclassifier.pickle',
            'wb') as f:
        pickle.dump(est, f, pickle.HIGHEST_PROTOCOL)

    print('ENTIRE DATASET ACCURACY')
    # Per-estimator accuracy = correct predictions / attempts, as a percent.
    trd = results_df.groupby('estimator').sum()
    trsum = (trd['scores'] / trd['attempts']) * 100
    print(trsum)
Exemple #6
0
    res = []
    for clf in clfs:
        clf.fit(data, target)
        res.append(clf.predict(test))

    pred = [most_common(x) for x in zip(*res)]
    f = open('final-predictions.csv', 'w')
    f.write("ID,Category\n")

    for i, res in enumerate(pred):
        f.write("%d,%d\n" % (i + 1, res))

    f.close()


# Committee of classifiers for majority-vote prediction; the estimator
# counts came out of earlier cross-validation experiments (see the
# commented-out search code at the bottom).
clfs = [
    ensemble.RandomForestClassifier(n_estimators=150),
    ensemble.GradientBoostingClassifier(n_estimators=200),
    ensemble.AdaBoostClassifier(n_estimators=135),
    # neighbors.KNeighborsClassifier(n_neighbors=10),
    # svm.SVC(),
]

predictificate(data, target, test, clfs)

# Hyper-parameter values were explored with code along these lines:
#scores = cross_validation.cross_val_score(
#clf, data, target, cv=5)
#print("Accuracy: %0.2f (+/- %0.2f) %f" % (scores.mean(), scores.std() * 2, x))
Exemple #7
0
    axis=1,
    inplace=False)
titanic2.shape

# Rows [0, len(train)) of the combined frame are the training portion.
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

#create estimators for voting classifier
#M1
dt_estimator = tree.DecisionTreeClassifier(random_state=100)
#M2
rf_estimator = ensemble.RandomForestClassifier(random_state=100)
#M3
ada_estimator = ensemble.AdaBoostClassifier(random_state=100)

#voting classifier
voting_estimator = ensemble.VotingClassifier(
    estimators=[('dt', dt_estimator), ('rf',
                                       rf_estimator), ('ada', ada_estimator)])
# Grid keys use the '<name>__<param>' convention to reach each sub-estimator.
voting_grid = {
    'dt__max_depth': [3, 5, 7],
    'rf__n_estimators': [20],
    'rf__max_features': [5, 7, 9],
    'rf__max_depth': [2, 4, 6],
    'ada__n_estimators': [20]
}
grid_voting_estimator = model_selection.GridSearchCV(voting_estimator,
                                                     voting_grid,
                                                     verbose=1,
##fourth classifier strong for precision and recall (based on final features list)- use findings from 2nd classifier
from sklearn import ensemble, tree
#from sklearn.grid_search import GridSearchCV
#parameter = {'algorithm':['SAMME', 'SAMME.R'],
#                'n_estimators':[2,5,10,25,50]}
#clf = GridSearchCV(ensemble.AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(criterion='entropy'),
# n_estimators=50),parameter)
#clf = clf.fit(features,labels)
#print clf.best_estimator_

##fifth classifier strong for precision and recall (based on final features list)- use findings from classifiers 2&4
from sklearn import ensemble, tree
tree = tree.DecisionTreeClassifier(criterion='entropy')
clf = ensemble.AdaBoostClassifier(base_estimator=tree,
                                  algorithm='SAMME',
                                  n_estimators=50)
clf = clf.fit(features, labels)
print clf.feature_importances_
print features_list[1:]

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script.
### Because of the small size of the dataset, the script uses stratified
### shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

##creating new test classifier using kfold cross validation
# Metric-report template (tester.py style); `display_precision` is filled in
# at .format() time along with the five positional metric values.
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
Exemple #9
0
# Drop identifiers / unused raw columns; keep the engineered features.
titanic2 = titanic1.drop([
    'PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'Survived', 'SibSp',
    'Parch', 'Fare'
],
                         axis=1,
                         inplace=False)
titanic2.info()
# Rows [0, len(train)) of the combined frame are the training portion.
X_train = titanic2[0:titanic_train.shape[0]]
X_train.shape
X_train.info()
y_train = titanic_train['Survived']

#Base Model..Building
dt_estimator = tree.DecisionTreeClassifier(random_state=2017)
#Model Building
ada_estimator = ensemble.AdaBoostClassifier(random_state=2017,
                                            base_estimator=dt_estimator)
# `base_estimator__*` keys tune the inner tree through the AdaBoost wrapper.
ada_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.123, 0.5344, 0.789],
    'base_estimator__max_depth': [3, 4]
}
grid_ada_estimator = model_selection.GridSearchCV(ada_estimator,
                                                  ada_grid,
                                                  cv=10,
                                                  n_jobs=1)
grid_ada_estimator.fit(X_train, y_train)
# `grid_scores_` was deprecated in scikit-learn 0.18 and removed in 0.20;
# `cv_results_` is the supported replacement (used elsewhere in this file).
print(grid_ada_estimator.cv_results_)
print(grid_ada_estimator.best_score_)  #83
print(grid_ada_estimator.best_params_)
grid_ada_estimator.best_estimator_  #83
print(grid_ada_estimator.score(X_train, y_train))  #83
# Three parallel 75/25 splits: raw-calculated, binned, and dummy-encoded
# feature sets, all with the same random_state so rows line up.
train1_x, test1_x, train1_y, test1_y = train_test_split(train[train_x_calc], train[target], random_state = 0)
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = train_test_split(train[train_x_bin], train[target] , random_state = 0)
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = train_test_split(train_dummy[train_x_dummy], train[target], random_state = 0)


#Discrete Variable Correlation by Survival using group by aka pivot table
# For each non-float feature, print the mean target value per category.
for x in train_x:
    if train[x].dtype != 'float64' :
        print('Survival Correlation by:', x)
        print(train[[x, target[0]]].groupby(x, as_index=False).mean())
        print('-'*10, '\n')

# Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
# Shallow decision-tree baseline, cross-validated on the held-out split.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)
scores = model_selection.cross_val_score(clf, X_test, y_test, cv=10)
print(scores)

#%%
#Testing Gausian Naiive bayes

gnb = naive_bayes.GaussianNB()
gnb.fit(X_train, y_train)
scores = model_selection.cross_val_score(gnb, X_test, y_test, cv=10)
print(scores)

#%%
# AdaBoost over a shallow tree; the grid below tunes the inner tree via the
# `base_estimator__*` parameter routing.
d_tree = DecisionTreeClassifier(max_depth=3)
aba = ensemble.AdaBoostClassifier(base_estimator=d_tree, n_estimators=50)

params = {
    "base_estimator__criterion": ["gini", "entropy"],
    "base_estimator__splitter": ["best", "random"]
}

# Shuffled 20-fold CV; refit=True keeps the best model on cv_search itself.
cv_n = model_selection.KFold(n_splits=20, shuffle=True)
cv_search = model_selection.GridSearchCV(aba,
                                         param_grid=params,
                                         scoring="average_precision",
                                         cv=cv_n,
                                         refit=True,
                                         n_jobs=2)
cv_search.fit(X_train, y_train)
print(cv_search.score(X_test, y_test))
Exemple #12
0
        # convert into  a large array
        training_X = convert_image_list( training_images )  
        training_Y = np.ravel( np.concatenate( tuple(j for j in training_output ) ) )

        if options.debug: print("Fitting...")

        if options.method=="SVM":
            clf = svm.SVC()
        elif options.method=="nuSVM":
            clf = svm.NuSVC()
        elif options.method=='NN':
            clf = neighbors.KNeighborsClassifier(options.n)
        elif options.method=='RanForest':
            clf = ensemble.RandomForestClassifier(n_estimators=options.n,random_state=options.random)
        elif options.method=='AdaBoost':
            clf = ensemble.AdaBoostClassifier(n_estimators=options.n,random_state=options.random)
        elif options.method=='tree':
            clf = tree.DecisionTreeClassifier(random_state=options.random)
        else:
            clf = svm.LinearSVC()
        
        #scores = cross_validation.cross_val_score(clf, training_X, training_Y)
        #print scores
        
        clf.fit( training_X, training_Y )
        
        #print(clf.score(training_X,training_Y))
        
        if options.debug: print( clf )
        
        with open(options.save,'wb') as f:
Exemple #13
0
titanic_train.info()

# One-hot encode the categorical columns.
titanic_train1 = pd.get_dummies(titanic_train,
                                columns=['Pclass', 'Sex', 'Embarked'])
titanic_train1.shape
titanic_train1.info()
titanic_train1.head(6)

# Pass `axis` by keyword: positional axis for DataFrame.drop is deprecated
# in pandas and removed in 2.0.
X_train = titanic_train1.drop(
    ['PassengerId', 'Age', 'Cabin', 'Ticket', 'Name', 'Survived'], axis=1)
y_train = titanic_train['Survived']

#Note that we take entire data into consideration in boosting. That's why we have to cut the tree depth to control the overfitting.
#In this case we are giving max_depth=3
dt_estimator = tree.DecisionTreeClassifier(max_depth=3)
# Pass n_estimators by keyword: scikit-learn deprecated most positional
# estimator arguments, so `AdaBoostClassifier(dt_estimator, 5)` breaks on
# modern releases while this form is equivalent everywhere.
ada_tree_estimator1 = ensemble.AdaBoostClassifier(dt_estimator, n_estimators=5)

#Parameter tuning
#n_estimators(no. of trees to grow), learning_rate(Learning rate shrinks the contribution of each classifier by learning_rate.)
#There is a trade-off between learning_rate and n_estimators.
#Pass the learning_rate less than default which is 1.
ada_grid = {'n_estimators': [5, 8, 10, 12], 'learning_rate': [0.1, 0.5, 0.9]}
ada_grid_estimator = model_selection.GridSearchCV(ada_tree_estimator1,
                                                  ada_grid,
                                                  cv=10,
                                                  n_jobs=1)
ada_grid_estimator.fit(X_train, y_train)
ada_grid_estimator.cv_results_
ada_grid_estimator.best_score_
ada_grid_estimator.best_params_
Exemple #14
0
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y,
                       top_n_features):
    """Select the union of the top-N features ranked by three tree ensembles.

    Grid-searches a random forest, an AdaBoost model and an extra-trees
    model, ranks features by each best estimator's ``feature_importances_``,
    prints the top 25 per model, and returns the de-duplicated union of the
    three top-N lists as a pandas Series of feature names.
    """

    def _rank_top_features(estimator, param_grid, banner):
        """Grid-search `estimator`, rank features by importance, print the
        top 25 under `banner`, and return the top-N feature names."""
        grid = model_selection.GridSearchCV(estimator,
                                            param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1)
        grid.fit(titanic_train_data_X, titanic_train_data_Y)
        # Sort features by the best estimator's importances, descending.
        imp_sorted = pd.DataFrame({
            'feature':
            list(titanic_train_data_X),
            'importance':
            grid.best_estimator_.feature_importances_
        }).sort_values('importance', ascending=False)
        top_features = imp_sorted.head(top_n_features)['feature']
        print(banner)
        print(str(top_features[:25]))
        return top_features

    # Random forest
    features_top_n_rf = _rank_top_features(
        RandomForestClassifier(random_state=3), {
            'n_estimators': [500],
            'max_features': [5, 6, 10],
            'min_samples_split': [2, 3],
            'max_depth': [20]
        }, 'Sample 25 Features from RF Classifier')

    # AdaBoost
    features_top_n_ada = _rank_top_features(
        ensemble.AdaBoostClassifier(random_state=42), {
            'n_estimators': [500],
            'learning_rate': [0.5, 0.6]
        }, 'Sample 25 Features from ADA Classifier:')

    # ExtraTrees
    features_top_n_et = _rank_top_features(
        ensemble.ExtraTreesClassifier(random_state=42), {
            'n_estimators': [500],
            'min_samples_split': [3, 4],
            'max_depth': [15]
        }, 'Sample 25 Features from ET Classifier:')

    # Union of the three per-model top-N selections, duplicates removed.
    features_top_n = pd.concat(
        [features_top_n_rf, features_top_n_ada, features_top_n_et],
        ignore_index=True).drop_duplicates()

    return features_top_n
# Materialize the stratified folds up front so each classifier sees the
# same splits. NOTE(review): StratifiedKFold(y, n) is the pre-0.18 sklearn
# API — confirm the pinned version.
skf = list(CV.StratifiedKFold(trainingY, 5))
rf_param = [2000, 12]  #n_est, depth
gb_param = [400, 4, 'auto']  #n_est, depth
ada_param = [1000]  #n_est

# Level-0 models for the blend; fixed random_state for reproducibility.
clfs = [
    ensemble.RandomForestClassifier(n_estimators=rf_param[0],
                                    n_jobs=16,
                                    max_depth=rf_param[1],
                                    max_features=0.5,
                                    random_state=1126),
    ensemble.GradientBoostingClassifier(n_estimators=gb_param[0],
                                        max_depth=gb_param[1],
                                        max_features=gb_param[2],
                                        random_state=1126),
    ensemble.AdaBoostClassifier(n_estimators=ada_param[0], random_state=1126)
]

# One out-of-fold prediction column per level-0 model.
dataset_blend_train = np.zeros((trainingX.shape[0], len(clfs)))
dataset_blend_test = np.zeros((testingX.shape[0], len(clfs)))

# Cross-validation
for j, clf in enumerate(clfs):
    print j, clf
    print >> log_f, clf
    dataset_blend_test_j = np.zeros((testingX.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        print "Fold", i
        X_fold_train = trainingX[train]
        y_fold_train = trainingY[train]
        X_fold_test = trainingX[test]
Exemple #16
0
def main():
    """Load the training set and run a hyper-parameter search over a
    default AdaBoost classifier."""
    features, targets = helpers.load_data()
    classifier = ensemble.AdaBoostClassifier()
    analysis.param_search(classifier, features, targets)
def plotting(X, Y, Xt, Yt, labelx, labely, outputfile):
    """Fit several classifiers on (X, Y) and plot decision boundaries.

    Each classifier scoring above 0.85 on the test set (Xt, Yt) gets a
    mesh-grid decision-boundary plot saved to ``outputfile_<name>.png``
    (suffixed ``_SUPERGOOD`` above 0.95). Returns the score and name of
    the last classifier iterated (dict order; Python 2 code — note the
    print statements and dict.iteritems below).
    """
    h = .02  # step size in the mesh
    classifiers = dict(knn=neighbors.KNeighborsClassifier(4),
                       logistic=linear_model.LogisticRegression(C=1e5),
                       svm=svm.SVC(C=1e5),
                       adaboost=ensemble.AdaBoostClassifier(),
                       naivebay=naive_bayes.GaussianNB())

    # Background (light) and point (bold) colormaps for up to three classes.
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    fignum = 1
    # we create an instance of Neighbours Classifier and fit the data.
    for name, clf in classifiers.iteritems():
        clf.fit(X, Y)
        score = clf.score(Xt, Yt)

        # Only plot models that generalize reasonably well.
        if score > 0.85:
            print '....... plotting for ' + name
            pl.cla()
            pl.clf()
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, m_max]x[y_min, y_max].
            x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
            y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))
            Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            pl.figure(fignum, figsize=(4, 3))
            pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

            # Plot also the training points
            pl.scatter(X[:, 0],
                       X[:, 1],
                       s=30,
                       c=Y,
                       edgecolors='k',
                       cmap=cmap_bold)
            pl.xlim(xx.min(), xx.max())
            pl.ylim(yy.min(), yy.max())
            pl.xticks(())
            pl.yticks(())
            fignum += 1
            pl.ylabel(labely)
            pl.xlabel(labelx)
            # Annotate the plot with the classifier name and its accuracy.
            pl.text(xx.min(),
                    yy.min(),
                    name + " - Accuracy " + str(round(score, 2)),
                    ha='left',
                    fontsize=14,
                    style='italic')
            if score > 0.95:
                pl.savefig(outputfile + '_' + name + '_SUPERGOOD.png',
                           orientation='landscape')
            else:
                pl.savefig(outputfile + '_' + name + '.png',
                           orientation='landscape')

    return score, name
def multi_classifier_voting_predication(data1, data1_x_bin, cv_split, Target):
    """Cross-validate and fit hard- and soft-voting ensembles.

    Builds a VotingClassifier over a broad set of base models, evaluates
    each voting mode with ``cross_validate`` on the binned feature columns,
    fits it on the full data, prints the train/test score summaries, and
    returns the two fitted classifiers as ``(vote_hard, vote_soft)``.
    """
    # why choose one model, when you can pick them all with voting classifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    # removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
    vote_est = [
        # Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostClassifier()),
        ('bc', ensemble.BaggingClassifier()),
        ('etc', ensemble.ExtraTreesClassifier()),
        ('gbc', ensemble.GradientBoostingClassifier()),
        ('rfc', ensemble.RandomForestClassifier()),
        # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
        ('gpc', gaussian_process.GaussianProcessClassifier()),

        # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        ('lr', linear_model.LogisticRegressionCV()),

        # Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
        ('bnb', naive_bayes.BernoulliNB()),
        ('gnb', naive_bayes.GaussianNB()),

        # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
        ('knn', neighbors.KNeighborsClassifier()),

        # SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVC(probability=True)),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBClassifier())
    ]

    def _fit_and_report(label, voting):
        """Cross-validate, fit and report one VotingClassifier.

        `label` is 'Hard' or 'Soft' (used in the printed summaries);
        `voting` is the corresponding VotingClassifier mode.
        """
        vote_clf = ensemble.VotingClassifier(estimators=vote_est,
                                             voting=voting)
        vote_cv = model_selection.cross_validate(vote_clf,
                                                 data1[data1_x_bin],
                                                 data1[Target],
                                                 cv=cv_split,
                                                 return_train_score=True)
        vote_clf.fit(data1[data1_x_bin], data1[Target])

        print("{} Voting Training w/bin score mean: {:.2f}".format(
            label, vote_cv['train_score'].mean() * 100))
        print("{} Voting Test w/bin score mean: {:.2f}".format(
            label, vote_cv['test_score'].mean() * 100))
        print("{} Voting Test w/bin score 3*std: +/- {:.2f}".format(
            label, vote_cv['test_score'].std() * 100 * 3))
        print('-' * 10)
        return vote_clf

    # Hard Vote (majority rules) and Soft Vote (weighted probabilities).
    vote_hard = _fit_and_report('Hard', 'hard')
    vote_soft = _fit_and_report('Soft', 'soft')
    return vote_hard, vote_soft
             'Edited Nearest Neighbours', 'Repeated Edited Nearest Neighbours',
             'All KNN', 'Instance Hardness Threshold',
             'Neighbour hood Cleaning Rule' , 'OneSidedSelection', 'Random Under Sampler',
             'TomekLinks(random_state=42)'
            ]

# GradientBoosting hyper-parameters shared via **params below.
params = {'n_estimators': 10, 'max_depth': 3, 'subsample': 0.5,
                  'learning_rate': 0.89, 'min_samples_leaf': 1, 'random_state': 5}
# Benchmark suite: one instance per classifier family; order must match the
# clfs_name list defined alongside.
clfs = [
        ensemble.GradientBoostingClassifier(**params),
        BernoulliNB(),
        DecisionTreeClassifier(random_state=0),
        svm.SVC(kernel='rbf', probability=True),
        SGDClassifier(loss="modified_huber",penalty='l1'),
        RandomForestClassifier(n_estimators=9),
        ensemble.AdaBoostClassifier(),
        svm.SVC(kernel='linear', probability=True),
        MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(150,50,15,5,3), random_state=1),
        neighbors.KNeighborsClassifier(n_neighbors=5),
        NearestCentroid(metric='euclidean', shrink_threshold=None),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis()
        
        
       ]
clfs_name = ['GradientBoostingClassifier', 'Bernoulli Naive Bayes',
             'DecisionTreeClassifier', 'SVM (rbf)', 'Stochastic Gradient Descent',
             'Radom Forest Classifier', 'Ada Boost Classifier',
             'SVM (linear)' , 'Multi Layer Perceptron', 'K Nearest Neighbors',
             'Nearest Centroid Classifier', 'Guassian Naive Bayes',
def adaBoostClassifierAlgorithm():
    """Fit, predict and display results for a default AdaBoost classifier."""
    from sklearn import ensemble

    classifier = ensemble.AdaBoostClassifier()
    algorithmFitPredAndShow(classifier, "AdaBoostClassifier")
    batch_size=548,
    epochs=1000000,
    shuffle=True,
    validation_data=(np.array(x_mlpval), krsutil.to_categorical(y_mlpval)),
    callbacks=callbacks_list,
    verbose=2)

#AdaBoost algorithm with a decision tree as the base classifier for the first task, including several rounds of grid search.
# The commented lines below record the manual depth sweep; max_depth=7 gave
# the best validation F1 (0.787) and is the one kept active.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(random_state=42), random_state=42) #F1 0.728 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=2, random_state=42), random_state=42) #Validation F1 0.484 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=3, random_state=42), random_state=42) #Validation F1 0.745 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=4, random_state=42), random_state=42) #Validation F1 0.775 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=5, random_state=42), random_state=42) #Validation F1 0.779 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=6, random_state=42), random_state=42) #Validation F1 0.780 at first.
ensembletree_model = sklensemble.AdaBoostClassifier(
    skltree.DecisionTreeClassifier(max_depth=7, random_state=42),
    random_state=42)  #Validation F1 0.787 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=8, random_state=42), random_state=42) #Validation F1 0.764 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=9, random_state=42), random_state=42) #Validation F1 0.774 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=10, random_state=42), random_state=42) #Validation F1 0.775 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=11, random_state=42), random_state=42) #Validation F1 0.770 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=12, random_state=42), random_state=42) #Validation F1 0.759 at first.
#ensembletree_model=sklensemble.AdaBoostClassifier(skltree.DecisionTreeClassifier(max_depth=13, random_state=42), random_state=42) #Validation F1 0.751 at first.
# Successive narrowing of the n_estimators search range around 100.
#ensembletree_hyper={'n_estimators':[2,3,4,5,6,7,8,9,10,50,100,200,400,800,1000]}
#ensembletree_hyper={'n_estimators':[50,100,150]}
#ensembletree_hyper={'n_estimators':[90,100,110]}
#ensembletree_hyper={'n_estimators':[95,100,105]}
ensembletree_hyper = {
    'n_estimators': [95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105]
}
# Macro-averaged F1 used as the grid-search scoring function.
ensembletree_scorer = sklmet.make_scorer(sklmet.f1_score, average='macro')
Exemple #22
0
# Evaluate the previously configured model on the held-out split.  All the
# metrics below target the "failed" class only, since detecting failing
# students is the class we actually care about.
predict_y = model.fit(train_X, train_failed_y).predict(test_X)
# Flip 0/1 so that "failed" becomes the positive label for every metric.
predict_failed_y = [1 - label for label in predict_y]
print("------ExtraTree---------")
f1 = f1_score(test_failed_y, predict_failed_y)
macro_auc = roc_auc_score(test_failed_y, predict_failed_y, average="macro")
accuracy = accuracy_score(test_failed_y, predict_failed_y)
precision = precision_score(test_failed_y, predict_failed_y)
recall = recall_score(test_failed_y, predict_failed_y)
print("f1:%f" % f1)
print("macro auc:%f" % macro_auc)
print("accuracy:%f" % accuracy)
print("precision:%f" % precision)
print("recall:%f" % recall)

# Same evaluation protocol for a 20-round AdaBoost ensemble; metrics again
# target the "failed" class only (the class we want to detect).
model = ensemble.AdaBoostClassifier(n_estimators=20, random_state=random_state)
predict_y = model.fit(train_X, train_failed_y).predict(test_X)
# Flip 0/1 so that "failed" becomes the positive label for every metric.
predict_failed_y = [1 - label for label in predict_y]
print("------Adaboost---------")
f1 = f1_score(test_failed_y, predict_failed_y)
macro_auc = roc_auc_score(test_failed_y, predict_failed_y, average="macro")
accuracy = accuracy_score(test_failed_y, predict_failed_y)
precision = precision_score(test_failed_y, predict_failed_y)
recall = recall_score(test_failed_y, predict_failed_y)
print("f1:%f" % f1)
print("macro auc:%f" % macro_auc)
print("accuracy:%f" % accuracy)
print("precision:%f" % precision)
print("recall:%f" % recall)
Exemple #23
0
def InnerHoldout():
    """Run one inner-holdout CV fold end to end.

    Interactively collects three file paths (feature data, labels, and the
    pickled CV-split dict) plus a fold number, fits every enabled estimator
    on the fold's training split, scores train and test splits per subject,
    writes per-subject result CSVs, pickles the last fitted classifier, and
    prints per-estimator accuracy summaries.  Returns None.
    """
    a = input('Click and drag FEATURE SELECTED SINGLE FOLD DATA file here: ')
    a = a.strip('\' ')  # drop quotes/spaces the terminal adds on drag-drop
    data = pd.read_csv(a, encoding='utf-8').set_index('PATIENT')

    b = input('Click and drag LABELS file here: ')
    b = b.strip('\' ')
    labels = pd.read_csv(b, encoding='utf-8').set_index('PATIENT')

    c = input('Click and drag OUTER CV file here: ')
    c = c.strip('\' ')
    with open(c, 'rb') as f:
        # assumed to be a dict with 'train'/'test' lists of patient-ID
        # indexes, one entry per fold -- TODO confirm against the producer
        inner_cv = pickle.load(f)

    thisfold = int(input('Which fold is this? '))

    nfeatsmax = len(data.columns)
    # MLP hidden-layer width: two thirds of the feature count.
    nfeatsneural = round((nfeatsmax * 2 / 3))

    # Candidate estimators.  Most are currently disabled in the dict below;
    # they are still constructed here so they can be re-enabled quickly.
    rf = ensemble.RandomForestClassifier(max_features=nfeatsmax,
                                         max_depth=5,
                                         bootstrap=False)
    et = ensemble.ExtraTreesClassifier(max_features=nfeatsmax,
                                       max_depth=5,
                                       bootstrap=False)
    kn = neighbors.KNeighborsClassifier(n_neighbors=nfeatsmax, p=1)
    nb = naive_bayes.GaussianNB()
    dt = tree.DecisionTreeClassifier(max_features=nfeatsmax,
                                     max_depth=5,
                                     criterion='entropy')
    ls = svm.LinearSVC(penalty='l1', dual=False)
    gb = ensemble.GradientBoostingClassifier(loss='exponential', max_depth=2)
    nn = neural_network.MLPClassifier(hidden_layer_sizes=(nfeatsneural,
                                                          nfeatsneural,
                                                          nfeatsneural),
                                      learning_rate_init=0.0001,
                                      max_iter=500)
    ab = ensemble.AdaBoostClassifier()
    bc = ensemble.BaggingClassifier(base_estimator=rf)
    vc = ensemble.VotingClassifier(estimators=[('ab', ab), ('gb', gb),
                                               ('bc', bc)],
                                   voting='soft')

    # Only the linear SVC is active; uncomment entries to compare others.
    estimators = {  #'randomforest': rf,
        #'extratrees': et,
        #'kneighbors': kn,
        #'naivebayes': nb,
        #'decisiontree': dt,
        'linearsvc': ls,
        #'gboost': gb,
        #'neuralnet': nn,
        #'adaboost': ab,
        #'bagging': bc,
        #'voting': vc
    }

    # Per-subject result accumulators (one row per subject per estimator).
    train_results = {
        'fold': [],
        'estimator': [],
        'subjects': [],
        'labels': [],
        'predictions': [],
        'scores': [],
        'attempts': []
    }

    test_results = {
        'fold': [],
        'estimator': [],
        'subjects': [],
        'labels': [],
        'predictions': [],
        'scores': [],
        'attempts': []
    }

    # Select this fold's rows by joining the CV index lists against the data.
    train_ids = pd.DataFrame(index=inner_cv['train'][thisfold - 1])
    X_train = train_ids.join(data)
    y_train_df = train_ids.join(labels)
    y_train = np.array(y_train_df[y_train_df.columns[0]])

    test_ids = pd.DataFrame(index=inner_cv['test'][thisfold - 1])
    X_test = test_ids.join(data)
    y_test_df = test_ids.join(labels)
    y_test = np.array(y_test_df[y_test_df.columns[0]])

    for j, k in zip(estimators.keys(), estimators.values()):
        k.fit(X_train, y_train)

        # 'scores' are per-subject hit indicators (1 = correct prediction).
        predict_train = k.predict(X_train)
        train_scores = [
            1 if x == y else 0 for x, y in zip(y_train, predict_train)
        ]
        train_results['fold'].extend([thisfold] * len(X_train))
        train_results['estimator'].extend([j] * len(X_train))
        train_results['subjects'].extend(train_ids.index)
        train_results['labels'].extend(y_train)
        train_results['predictions'].extend(predict_train)
        train_results['scores'].extend(train_scores)
        train_results['attempts'].extend([1] * len(X_train))

        predict_test = k.predict(X_test)
        test_scores = [
            1 if x == y else 0 for x, y in zip(y_test, predict_test)
        ]
        test_results['fold'].extend([thisfold] * len(X_test))
        test_results['estimator'].extend([j] * len(X_test))
        test_results['subjects'].extend(test_ids.index)
        test_results['labels'].extend(y_test)
        test_results['predictions'].extend(predict_test)
        test_results['scores'].extend(test_scores)
        test_results['attempts'].extend([1] * len(X_test))

    train_df = pd.DataFrame.from_dict(train_results).set_index('subjects')
    test_df = pd.DataFrame.from_dict(test_results).set_index('subjects')

    train_df.to_csv(
        path_or_buf=
        '/media/james/ext4data/current/projects/pfizer/combined-study/inner_holdout_train_results_fold_'
        + str(thisfold) + '.csv')
    test_df.to_csv(
        path_or_buf=
        '/media/james/ext4data/current/projects/pfizer/combined-study/inner_holdout_test_results_fold_'
        + str(thisfold) + '.csv')

    # NOTE(review): 'k' is the loop variable, so only the LAST fitted
    # estimator is pickled.  That is fine while a single estimator is
    # enabled, but verify if more entries are switched on above.
    with open(
            '/media/james/ext4data/current/projects/pfizer/combined-study/trainedclassifier_innerfold_'
            + str(thisfold) + '.pickle', 'wb') as f:
        pickle.dump(k, f, pickle.HIGHEST_PROTOCOL)

    # Per-estimator accuracy = correct / attempted, as a percentage.
    print('D_-j RESULT')
    trd = train_df.groupby('estimator').sum()
    trsum = (trd['scores'] / trd['attempts']) * 100
    print(trsum)
    # NOTE(review): trsum is a Series, so the axis=1 argument to idxmax
    # looks spurious -- confirm it does not raise on the pandas in use.
    trmax = trsum.idxmax(axis=1)
    print('\nBest train: {}\n'.format(trmax))

    print('D_j (holdout for estimating model quality) RESULT')
    ted = test_df.groupby('estimator').sum()
    tesum = (ted['scores'] / ted['attempts']) * 100
    print(tesum)
    temax = tesum.idxmax(axis=1)
    print('\nBest test: {}\n'.format(temax))

    return
Exemple #24
0
 def get_skl_estimator(self, **default_parameters):
     """Return a scikit-learn AdaBoost classifier built with the given
     keyword parameters (passed straight through to the constructor)."""
     return ensemble.AdaBoostClassifier(**default_parameters)
Exemple #25
0
def train_l1_models():
    """Train the level-1 (stacking) base models chunk by chunk.

    Reads the training CSV in `chunk_size`-row chunks; the chunk right after
    the last training chunk serves as a shared validation split.  For each
    training chunk a neural net, gradient boosting, AdaBoost, random forest
    and extra-trees model are fitted, scored on the validation chunk and
    saved under `path + 'l1/'`.

    Returns:
        int: total number of rows consumed (training chunks plus the
        validation chunk), i.e. the row offset where level-2 data starts.
    """
    chunk_size = 30000000
    num_of_chunks = 3

    # Read two rows only to capture the header; chunked reads below use
    # skiprows and therefore lose it, so it is re-applied to every chunk.
    train_raw = pd.read_csv(path + "train.csv", nrows=2, dtype=init_dtype)
    starting_columns = train_raw.columns

    # The chunk following the training chunks is held out for validation.
    val = pd.read_csv(path + "train.csv", skiprows=chunk_size*num_of_chunks, nrows=chunk_size, dtype=init_dtype)
    val.columns = starting_columns
    val = preproccess_df(val)
    y_val = val['is_attributed']
    val.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)

    for i in range(num_of_chunks):
        train_raw = pd.read_csv(path + "train.csv", nrows=chunk_size, skiprows=i*chunk_size, dtype=init_dtype)
        train_raw.columns = starting_columns
        print('[{0}] Finished to load data'.format(time.time() - start_time))
        train = preproccess_df(train_raw)
        y_train = train['is_attributed']
        train.drop(['is_attributed', 'attributed_time'], axis=1, inplace=True)
        # print('[{}] Start LGBM Training'.format(time.time() - start_time))
        # dtrain = lgb.Dataset(train, label=y_train)
        # dval = lgb.Dataset(val, label=y_val, reference=dtrain)
        # lgbm_model1 = lgb.train(params, dtrain, num_boost_round=MAX_ROUNDS, valid_sets=[dtrain, dval],
        #                   early_stopping_rounds=50, verbose_eval=10)
        #
        # print('[{0}] Finish LGBM Training, {1}'.format(time.time() - start_time, 1))
        # with open(path + 'l1/light_gbm1_{0}.plk'.format(i), 'wb') as infile:
        #     pickle.dump(lgbm_model1, infile)
        # del lgbm_model1

        # DataFrame/Series.as_matrix() was removed in pandas 1.0;
        # to_numpy() is the documented replacement and returns the same data.
        x3 = train.to_numpy()
        y3 = np.expand_dims(y_train.to_numpy(), 1)
        x4 = val.to_numpy()
        y4 = np.expand_dims(y_val.to_numpy(), 1)

        # One-hot encode the binary target for the neural net.
        y3 = keras.utils.to_categorical(y3, 2)
        y4 = keras.utils.to_categorical(y4, 2)
        print(x3.shape, y3.shape)

        nn_model = get_nn(x3)
        nn_model.fit(x3, y3, epochs=5, class_weight=class_weight, verbose=0, batch_size=20000)
        print('nn trained:', nn_model.evaluate(x4,y4, verbose=0))
        nn_model.save(path + 'l1/model_nn_{0}.h5'.format(i))
        del nn_model  # free memory before fitting the next model

        gb = ensemble.GradientBoostingClassifier()
        gb.fit(train,y_train)
        print('gb', gb.score(val, y_val))
        with open(path + 'l1/gb_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(gb, infile)
        del gb

        ada = ensemble.AdaBoostClassifier()
        ada.fit(train, y_train)
        print('ada', ada.score(val, y_val))
        with open(path + 'l1/ada_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(ada, infile)

        del ada

        rf = ensemble.RandomForestClassifier(class_weight=class_weight,n_jobs=-1)
        rf.fit(train, y_train)
        print('rf', rf.score(val, y_val))
        with open(path + 'l1/rf_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(rf, infile)

        del rf

        et = ensemble.ExtraTreesClassifier(class_weight=class_weight,n_jobs=-1)
        et.fit(train, y_train)
        print('et', et.score(val, y_val, ))
        with open(path + 'l1/et_{0}.plk'.format(i), 'wb') as infile:
            pickle.dump(et, infile)

        del et

        # KNeighbors / RadiusNeighbors variants removed from the rotation:
        # too slow at this data size.  Re-add here if needed.

    # An SVC over the full data was also tried and dropped for cost.
    return chunk_size*(num_of_chunks + 1)
Exemple #26
0
def errorCorrectionTrain(input_images,
                         output,
                         parameters=None,
                         debug=False,
                         partition=None,
                         part=None,
                         multilabel=1):
    """Train the error-correction classifier(s) and save them to *output*.

    Each entry of *input_images* is a list of file names: feature images
    first, then the automatic segmentation, mask, and ground truth at
    indices -3, -2, -1 (the mask entry may be None).  When *multilabel* > 1
    a second "direct" classifier is additionally trained on the voxels
    where the automatic segmentation disagrees with ground truth.

    parameters: dict of options (method, method2, method_n, patch_size,
    use_coord, use_joint, ...).  partition/part restrict training to one
    spatial partition of the volume.  Classifiers are pickled to *output*
    (or saved via xgboost's own format when both methods are 'xgb').
    Re-raises any exception after printing a traceback.
    """
    try:
        use_coord = parameters.get('use_coord', True)
        use_joint = parameters.get('use_joint', True)
        patch_size = parameters.get('patch_size', 1)

        # Voxel border kept around the extracted partition so patch-based
        # features near the cut still have data to read.
        border = patch_size * 2

        if patch_size == 0:
            border = 2

        normalize_input = parameters.get('normalize_input', True)

        # method trains the discrepancy classifier; method2 (defaults to
        # method) trains the "direct" label classifier in multilabel mode.
        method = parameters.get('method', 'lSVC')
        method2 = parameters.get('method2', method)
        method_n = parameters.get('method_n', 15)
        method2_n = parameters.get('method2_n', method_n)
        method_random = parameters.get('method_random', None)
        method_max_features = parameters.get('method_max_features', 'auto')
        method_n_jobs = parameters.get('method_n_jobs', 1)
        primary_features = parameters.get('primary_features', 1)

        # Per-sample feature matrices and target vectors, accumulated below.
        training_images = []
        training_diff = []
        training_images_direct = []
        training_direct = []

        if debug:
            print("errorCorrectionTrain use_coord={} use_joint={} patch_size={} normalize_input={} method={} output={} partition={} part={}".\
                    format(repr(use_coord),repr(use_joint),repr(patch_size),repr(normalize_input),method,output,partition,part))

        coords = None
        total_mask_size = 0
        total_diff_mask_size = 0

        for (i, inp) in enumerate(input_images):
            mask = None
            diff = None
            mask_diff = None

            # inp layout: [feature1, ..., featureN, auto_seg, mask, ground]
            if inp[-2] is not None:
                mask = extract_part(
                    minc.Label(inp[-2]).data, partition, part, border)

            ground_data = minc.Label(inp[-1]).data
            auto_data = minc.Label(inp[-3]).data

            ground_shape = ground_data.shape
            ground = extract_part(ground_data, partition, part, border)
            auto = extract_part(auto_data, partition, part, border)

            shape = ground_shape
            # Normalized voxel coordinates in [-1, 1], computed once and
            # reused for every sample (all volumes share a shape).
            if coords is None and use_coord:
                c = np.mgrid[0:shape[0], 0:shape[1], 0:shape[2]]
                coords = [
                    extract_part((c[j] - shape[j] / 2.0) / (shape[j] / 2.0),
                                 partition, part, border) for j in range(3)
                ]

            features = [
                extract_part(
                    minc.Image(k, dtype=np.float32).data, partition, part,
                    border) for k in inp[0:-3]
            ]

            mask_size = shape[0] * shape[1] * shape[2]

            if debug:
                print("Training data size:{}".format(len(features)))
                if mask is not None:
                    mask_size = np.sum(mask)
                    print("Mask size:{}".format(mask_size))
                else:
                    print("Mask absent")
            total_mask_size += mask_size

            if multilabel > 1:
                # diff marks voxels where the automatic segmentation is wrong.
                diff = (ground != auto)
                # NOTE(review): sums the mask rather than diff, which looks
                # inconsistent with the variable name (and would fail when
                # mask is None) -- confirm intent.
                total_diff_mask_size += np.sum(mask)

                if mask is not None:
                    mask_diff = diff & (mask > 0)
                    print("Sample {} mask_diff={} diff={}".format(
                        i, np.sum(mask_diff), np.sum(diff)))
                    #print(mask_diff)
                    training_diff.append(diff[mask > 0])
                    training_direct.append(ground[mask_diff])
                else:
                    mask_diff = diff
                    training_diff.append(diff)
                    training_direct.append(ground[diff])

                training_images.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

                # Direct classifier only sees the disagreement voxels.
                training_images_direct.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask_diff,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

            else:
                # Binary mode: train directly on ground-truth labels.
                mask_diff = mask
                if mask is not None:
                    training_diff.append(ground[mask > 0])
                else:
                    training_diff.append(ground)

                training_images.append(
                    prepare_features(features,
                                     coords,
                                     mask=mask,
                                     use_coord=use_coord,
                                     use_joint=use_joint,
                                     patch_size=patch_size,
                                     primary_features=primary_features))

            if debug:
                print("feature size:{}".format(len(training_images[-1])))

            # Optional debug dump of the first sample's feature images.
            if i == 0 and parameters.get('dump', False):
                print("Dumping feature images...")
                for (j, k) in enumerate(training_images[-1]):
                    # NOTE(review): 'images' is not defined in this function
                    # (probable NameError; perhaps features[0] was meant) --
                    # confirm before relying on the dump option.
                    test = np.zeros_like(images[0])
                    test[mask > 0] = k
                    out = minc.Image(data=test)
                    out.save(name="dump_{}.mnc".format(j), imitate=inp[0])

        # calculate normalization coeffecients

        if debug: print("Done")

        clf = None
        clf2 = None

        if total_mask_size > 0:
            training_X = convert_image_list(training_images)
            training_Y = np.ravel(
                np.concatenate(tuple(j for j in training_diff)))

            if debug: print("Fitting 1st...")

            # Instantiate the first-stage classifier by name; 'xgb' is
            # handled separately below via xgboost's own train API.
            if method == "xgb":
                clf = None
            elif method == "SVM":
                clf = svm.SVC()
            elif method == "nuSVM":
                clf = svm.NuSVC()
            elif method == 'NC':
                clf = neighbors.NearestCentroid()
            elif method == 'NN':
                clf = neighbors.KNeighborsClassifier(method_n)
            elif method == 'RanForest':
                clf = ensemble.RandomForestClassifier(
                    n_estimators=method_n,
                    n_jobs=method_n_jobs,
                    max_features=method_max_features,
                    random_state=method_random)
            elif method == 'AdaBoost':
                clf = ensemble.AdaBoostClassifier(n_estimators=method_n,
                                                  random_state=method_random)
            elif method == 'AdaBoostPP':
                clf = Pipeline(steps=[('normalizer', Normalizer()),
                                      ('AdaBoost',
                                       ensemble.AdaBoostClassifier(
                                           n_estimators=method_n,
                                           random_state=method_random))])
            elif method == 'tree':
                clf = tree.DecisionTreeClassifier(random_state=method_random)
            elif method == 'ExtraTrees':
                clf = ensemble.ExtraTreesClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'Bagging':
                clf = ensemble.BaggingClassifier(
                    n_estimators=method_n,
                    max_features=method_max_features,
                    n_jobs=method_n_jobs,
                    random_state=method_random)
            elif method == 'dumb':
                clf = dummy.DummyClassifier(strategy="constant", constant=0)
            else:
                clf = svm.LinearSVC()

            #scores = cross_validation.cross_val_score(clf, training_X, training_Y)
            #print scores
            if method == "xgb":
                xg_train = xgb.DMatrix(training_X, label=training_Y)
                param = {}
                num_round = 100
                # use softmax multi-class classification
                param['objective'] = 'multi:softmax'
                # scale weight of positive examples
                param['eta'] = 0.1
                param['max_depth'] = 8
                param['silent'] = 1
                param['nthread'] = 4
                param['num_class'] = 2
                clf = xgb.train(param, xg_train, num_round)
            elif method != 'dumb':
                clf.fit(training_X, training_Y)

            if multilabel > 1 and method != 'dumb':
                if debug: print("Fitting direct...")

                training_X = convert_image_list(training_images_direct)
                training_Y = np.ravel(
                    np.concatenate(tuple(j for j in training_direct)))

                if method2 == "xgb":
                    clf2 = None
                # NOTE(review): this should probably be 'elif'; as written,
                # method2 == "xgb" falls through to the final else and sets
                # clf2 = LinearSVC before xgb.train overwrites it below.
                if method2 == "SVM":
                    clf2 = svm.SVC()
                elif method2 == "nuSVM":
                    clf2 = svm.NuSVC()
                elif method2 == 'NC':
                    clf2 = neighbors.NearestCentroid()
                elif method2 == 'NN':
                    clf2 = neighbors.KNeighborsClassifier(method_n)
                elif method2 == 'RanForest':
                    clf2 = ensemble.RandomForestClassifier(
                        n_estimators=method_n,
                        n_jobs=method_n_jobs,
                        max_features=method_max_features,
                        random_state=method_random)
                elif method2 == 'AdaBoost':
                    clf2 = ensemble.AdaBoostClassifier(
                        n_estimators=method_n, random_state=method_random)
                elif method2 == 'AdaBoostPP':
                    clf2 = Pipeline(steps=[('normalizer', Normalizer()),
                                           ('AdaBoost',
                                            ensemble.AdaBoostClassifier(
                                                n_estimators=method_n,
                                                random_state=method_random))])
                elif method2 == 'tree':
                    clf2 = tree.DecisionTreeClassifier(
                        random_state=method_random)
                elif method2 == 'ExtraTrees':
                    clf2 = ensemble.ExtraTreesClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'Bagging':
                    clf2 = ensemble.BaggingClassifier(
                        n_estimators=method_n,
                        max_features=method_max_features,
                        n_jobs=method_n_jobs,
                        random_state=method_random)
                elif method2 == 'dumb':
                    clf2 = dummy.DummyClassifier(strategy="constant",
                                                 constant=0)
                else:
                    clf2 = svm.LinearSVC()

                if method2 == "xgb":
                    xg_train = xgb.DMatrix(training_X, label=training_Y)

                    param = {}
                    num_round = 100
                    # use softmax multi-class classification
                    param['objective'] = 'multi:softmax'
                    # scale weight of positive examples
                    param['eta'] = 0.1
                    param['max_depth'] = 8
                    param['silent'] = 1
                    param['nthread'] = 4
                    param['num_class'] = multilabel

                    clf2 = xgb.train(param, xg_train, num_round)

                # NOTE(review): guard tests 'method', not 'method2'; looks
                # like a copy-paste slip -- confirm which was intended.
                elif method != 'dumb':
                    clf2.fit(training_X, training_Y)

            #print(clf.score(training_X,training_Y))

            if debug:
                print(clf)
                print(clf2)
        else:
            # No voxels to train on: fall back to a constant-0 classifier.
            print("Warning : zero total mask size!, using null classifier")
            clf = dummy.DummyClassifier(strategy="constant", constant=0)

        if method == 'xgb' and method2 == 'xgb':
            #save
            clf.save_model(output)
            clf2.save_model(output + '_2')
        else:
            with open(output, 'wb') as f:
                cPickle.dump([clf, clf2], f, -1)

    except mincError as e:
        print("Exception in linear_registration:{}".format(str(e)))
        traceback.print_exc(file=sys.stdout)
        raise
    except:
        print("Exception in linear_registration:{}".format(sys.exc_info()[0]))
        traceback.print_exc(file=sys.stdout)
        raise
Exemple #27
0
# Report the nb_features most informative features, importance-ranked.
for rank, idx in enumerate(indices[:nb_features]):
    print("%d. feature %s (%f)" %
          (rank + 1, data.columns[2 + idx],
           extratrees.feature_importances_[idx]))

# Record the same top features, but listed in column order.
top_idx = np.argsort(extratrees.feature_importances_)[::-1][:nb_features]
for f in sorted(top_idx):
    features.append(data.columns[2 + f])

# Candidate classifiers, keyed by display name.
model = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

# Fit every candidate on the training split and rank by test accuracy;
# the highest-scoring one is declared the winner.
results = {}
print("\nNow testing model")

for algo, clf in model.items():
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
Exemple #28
0
# There's a high spike in survival from females with small family size, while generally people with large family size had a more difficult time surviving.

# In[17]:

# Split the engineered training frame into features and target.
X = train.drop(columns='Survived')
y = train['Survived']

# I chose to run a voting classifier model, with using multiple models and voting on if each passenger survived or died in the predictions.
#
# Source: https://www.kaggle.com/ldfreeman3/a-data-science-framework-to-achieve-99-accuracy

# In[18]:

# Named base estimators for the voting ensemble (order preserved).
voting_estimates = [
    ('ada', ensemble.AdaBoostClassifier(n_estimators=200)),
    ('bc', ensemble.BaggingClassifier(n_estimators=200)),
    ('etc', ensemble.ExtraTreesClassifier(n_estimators=200)),
    ('gbc1', ensemble.GradientBoostingClassifier(n_estimators=200)),
    ('gbc2', ensemble.GradientBoostingClassifier(n_estimators=500)),
    ('rfc', ensemble.RandomForestClassifier(n_estimators=200)),
    ('gpc', gaussian_process.GaussianProcessClassifier()),
    ('lr', linear_model.LogisticRegressionCV()),
    ('bnb', naive_bayes.BernoulliNB()),
    ('gnb', naive_bayes.GaussianNB()),
    ('knn5', neighbors.KNeighborsClassifier()),
    ('svc', svm.SVC(probability=True)),
]

# In[19]:

vote_soft = ensemble.VotingClassifier(estimators=voting_estimates,
Exemple #29
0
            pred = decision_stump.predict(data)
            Pred += pred * Alpha[i]

        return Pred


if __name__ == '__main__':
    # Build a binary problem from the first two iris classes, relabelling
    # class 0 as -1 so the targets are in {-1, +1} as the custom booster
    # expects.
    sample = load_iris()
    data = sample.data
    target = sample.target
    data1 = data[target == 0, :]
    target1 = target[target == 0]
    target1 = target1 - 1
    data2 = data[target == 1, :]
    target2 = target[target == 1]

    data = numpy.concatenate((data1, data2), axis=0)
    target = numpy.concatenate((target1, target2), axis=0)
    data_train, data_test, target_train, target_test = train_test_split(
        data, target)

    # Custom boosting implementation: threshold the real-valued score at 0.
    clf = AdaBoostClassifier(no_of_stages=20)
    clf.fit(data_train, target_train)
    pred = clf.predict(data_test)
    pred = pred > 0

    # scikit-learn reference run (SAMME, same number of stages).
    clf1 = ensemble.AdaBoostClassifier(n_estimators=20, algorithm='SAMME')
    clf1.fit(data_train, target_train)
    pred1 = clf1.predict(data_test)
    # Bug fix: threshold pred1, not pred -- previously this line
    # re-thresholded the custom model's predictions, so the two models'
    # outputs could never actually be compared.
    pred1 = pred1 > 0
Exemple #30
0
    def adaboost(self, X, y, valid, test):
        """Fit a 500-round AdaBoost classifier on (X, y) and report its
        metrics on the held-out *test* frame.

        *valid* is accepted for interface parity with the other model
        runners but is not scored here (the train/valid scoring paths were
        disabled).  Returns the dict from compute_metrics with wall-clock
        training time added under 'time'.
        """
        # Deliberately no class weights for the boosting model.
        booster = ensemble.AdaBoostClassifier(n_estimators=500)

        started = time.time()
        clf = booster.fit(X, y)
        finished = time.time()

        # Score only the test split; 'Class' is the target column and
        # 'Time' is dropped as a non-feature.
        test_features = test.drop("Class", axis=1).drop("Time", axis=1)
        y_score = clf.predict_proba(test_features)[:, 1]
        results = clf.predict(test_features)

        mets = self.compute_metrics(test["Class"], results, y_score)
        mets['time'] = finished - started

        print('AUROC:', mets['auroc'])
        print('Accuracy:', mets['accuracy'])
        print('Precision:', mets['precision'])
        print('Recall:', mets['recall'])
        print('F Score:', mets['f'])
        print('Average Precision', mets['ap'])
        print(mets['confusion'], '\n')

        # Plot helpers intentionally disabled:
        #self.plot_precision_recall(test["Class"], y_score, 'Boosting')
        #self.plotROC(mets['fpr'], mets['tpr'], mets['auroc'], 'Boosting')

        return mets