Beispiel #1
0
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file.
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' are placeholder strings; they
# must be replaced with a real path and delimiter before this script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
# Every column except 'target' is used as a feature.
features = tpot_data.drop('target', axis=1)
# random_state=None leaves the split unseeded, so the train/test partition
# differs from run to run.
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9353463587921848
# The stages run in the order listed; the final DecisionTreeClassifier is the
# step that produces the predictions.
exported_pipeline = make_pipeline(
    # Expand the inputs with all degree-2 polynomial terms (no bias column).
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    # NOTE(review): StackingEstimator comes from tpot.builtins; presumably it
    # passes the wrapped estimator's predictions on as additional features -
    # confirm against the installed TPOT version.
    StackingEstimator(estimator=ExtraTreesClassifier(bootstrap=False,
                                                     criterion="gini",
                                                     max_features=0.55,
                                                     min_samples_leaf=2,
                                                     min_samples_split=5,
                                                     n_estimators=100)),
    # Rescale every feature before the naive Bayes stage.
    MinMaxScaler(),
    StackingEstimator(estimator=BernoulliNB(alpha=0.001, fit_prior=True)),
    DecisionTreeClassifier(criterion="entropy",
                           max_depth=5,
                           min_samples_leaf=12,
                           min_samples_split=9))

# Fit on the training split, then predict labels for the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Beispiel #2
0
# Define the parameter grid.
# A list of two dicts means two independent sub-grids are searched: the first
# varies max_depth at a fixed n_estimators, the second varies n_estimators at
# a fixed max_depth.
parameter_grid = [{
    'n_estimators': [100],
    'max_depth': [2, 4, 7, 12, 16]
}, {
    'max_depth': [4],
    'n_estimators': [25, 50, 100, 250]
}]

# Scoring names handed to GridSearchCV; one full search is run per metric.
metrics = ['precision_weighted', 'recall_weighted']

for metric in metrics:
    print("\n##### Searching optimal parameters for", metric)

    # 5-fold cross-validated search; random_state=0 keeps the forest itself
    # reproducible across candidates.
    classifier = GridSearchCV(ExtraTreesClassifier(random_state=0),
                              parameter_grid,
                              cv=5,
                              scoring=metric)
    classifier.fit(X_train, y_train)

    means = classifier.cv_results_['mean_test_score']  # mean CV score per candidate
    print("\nGrid scores for the parameter grid:")
    for results, mean in zip(classifier.cv_results_['params'],
                             means):  # params and means share the same ordering
        print(results, " -> ", "%.3f" % mean)

    print("\nBest parameters:", classifier.best_params_)

    # y_pred is not used in the visible part of this snippet; the report that
    # the heading below announces is presumably printed in code cut off here.
    y_pred = classifier.predict(X_test)
    print("\nPerformance report:\n")
Beispiel #3
0
            estimators = RF.estimators_
            a = get_ensemble_score(estimators[:k], X_test, y_test)
            scores1.append(a)
        meta_scores.append(scores1)
    meta_scores = np.array(meta_scores)
    s = np.mean(meta_scores, axis=0).tolist()
    RF_cross_val_scores.append(s)
# Persist the scores gathered by the preceding loop. The context manager
# closes the file deterministically instead of leaking the handle that a bare
# open() inside the dump call would leave behind.
with open('bag_sabit2_cross_val_scores', 'wb') as score_file:
    pickle.dump(RF_cross_val_scores, score_file)

# Start a fresh result list for the Extra-Trees run: one entry per dataset,
# each entry being the fold-averaged score for every k tried below.
RF_cross_val_scores = []

for i, data in enumerate(datasets[:]):
    X, y = arff_to_numpy('Datasets/' + str(data) + '.arff')
    meta_scores = []
    # Unseeded 3-fold split, so fold membership changes between runs.
    kf = KFold(n_splits=3, shuffle=True)
    for train_index, test_index in kf.split(X):
        scores1 = []
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]:
            RF = ExtraTreesClassifier(max_depth=k, n_estimators=50, n_jobs=-1)
            RF.fit(X_train, y_train)
            estimators = RF.estimators_
            # NOTE(review): k is used both as max_depth and as the number of
            # trees scored (estimators[:k]; None keeps all 50). Confirm that
            # coupling is intended and not a leftover from the loop above.
            a = get_ensemble_score(estimators[:k], X_test, y_test)
            scores1.append(a)
        meta_scores.append(scores1)
    # Average over the folds, keeping one score per k.
    meta_scores = np.array(meta_scores)
    s = np.mean(meta_scores, axis=0).tolist()
    RF_cross_val_scores.append(s)

with open('et_sabit2_cross_val_scores', 'wb') as score_file:
    pickle.dump(RF_cross_val_scores, score_file)
    else:
        movie_reviews_data_folder = "/home/gregor/ipyServer/data/movie_review/train_sub"
        movie_reviews_test_data_folder = "/home/gregor/ipyServer/data/movie_review/test_sub"

    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    test_data = load_files(movie_reviews_test_data_folder, shuffle=False)
    print(len(test_data))
    print("n_samples: %d\n" % len(dataset.data))

    # Build vectorizer
    vectorizer = TfidfVectorizer(sublinear_tf=False,
                                 max_df=0.1,
                                 ngram_range=(1, 2))

    text_clf = ExtraTreesClassifier(max_depth=1024,
                                    min_samples_leaf=8,
                                    min_samples_split=16)

    reduceParams = 80 * 1000

    metrics_out = '/home/gregor/ipyServer/movie_review/output/metrics_ExTrees.out'
    kaggle_test_out = '/home/gregor/ipyServer/movie_review/output/kaggle_test_04_ExTrees.csv'

    #################################################################################

    outfile = open(metrics_out, 'a+')
    outfile.write(('*' * 70) + '\n')
    outfile.write(('*' * 5) + (' ' * 25) + 'Begin  Run' + (' ' * 25) +
                  ('*' * 5) + '\n')
    outfile.write(('*' * 70) + '\n\n')
Beispiel #5
0
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel


# Load the iris data as a (features, labels) pair and show the feature shape.
X, Y = load_iris(return_X_y=True)
print(X.shape)

# Fit an Extra-Trees forest solely to obtain per-feature importances.
model = ExtraTreesClassifier(n_estimators=50)
model.fit(X, Y)
print(model.feature_importances_)

# prefit=True tells SelectFromModel to reuse the already-fitted forest rather
# than refitting it; no threshold is passed, so the library default applies.
sfModel = SelectFromModel(model, prefit=True)
# Keep only the columns the selector retains, then show the reduced data.
X1 = sfModel.transform(X)
print(X1.shape)
print(X1)
Beispiel #6
0
def get_feature_importance(X, y):
    """Fit a default ExtraTreesClassifier and report its feature importances.

    Args:
        X: Feature data passed straight to the classifier's ``fit``.
        y: Target values passed straight to the classifier's ``fit``.

    Returns:
        A 2-tuple ``(X, importances)``: the input ``X`` handed back unchanged
        and the fitted forest's ``feature_importances_``.
    """
    forest = ExtraTreesClassifier()
    forest.fit(X, y)
    importances = forest.feature_importances_
    return X, importances
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets.samples_generator import make_blobs
# Create the training data set.
# (The bare string literals in this script are no-op statements used as
# section markers; they are kept untouched.)
'''创建训练的数据集'''
data, target = make_blobs(n_samples=50000,
                          centers=2,
                          random_state=0,
                          cluster_std=0.60)
# The individual base models used in the model blend.
'''模型融合中使用到的各个单模型'''
clfs = [
    RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=5, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(learning_rate=0.05,
                               subsample=0.5,
                               max_depth=6,
                               n_estimators=5)
]
# Split off part of the data as the test set (33%, fixed seed).
'''切分一部分数据作为测试集'''
X, X_predict, y, y_predict = train_test_split(data,
                                              target,
                                              test_size=0.33,
                                              random_state=2017)

# One column per base model: zero-initialised holders sized to the training
# and test rows respectively. They are filled by code outside this snippet.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))
# 5-fold stacking.
'''5折stacking'''
# # accuracy_score(label_test,result)
# # print (classification_report(label_test,result,digits=4))

# # scores = cross_val_score(clf, feature_matrix, labels)
# # scores.mean()  
# # clf = ExtraTreesClassifier(n_estimators=150)
# # scores = cross_val_score(clf, feature_matrix, labels, cv=10)
# # scores.mean()
# # clf = clf.fit(feature_train,label_train)
# clf = svm.SVC(C=1.0,kernel='rbf',cache_size=1000,decision_function_shape='ovr',shrinking=True,probability=True)
# scores = cross_val_score(clf,feature_matrix,labels,cv=StratifiedKFold(n_splits=4,shuffle=True))
# print (scores, scores.mean())
# clf.fit(feature_train, label_train)

# Extra-Trees section (the bare string below is a no-op section marker).
'''Extra-Trees'''
clf = ExtraTreesClassifier(n_estimators=200,n_jobs=-1,max_features=30,criterion='gini')
# 4-fold stratified CV on the full feature matrix; the shuffle is unseeded, so
# scores vary between runs.
scores = cross_val_score(clf,feature_matrix,labels,cv=StratifiedKFold(n_splits=4,shuffle=True))
print (scores, scores.mean())
# Refit on the training split and predict the held-out split.
clf = clf.fit(feature_train,label_train)
result = clf.predict(feature_test)
# NOTE(review): the results of accuracy_score, get_params and score below are
# discarded; they only display something when run as the last expression of a
# notebook cell. Confirm whether they should be printed.
accuracy_score(label_test,result)
print (classification_report(label_test,result,digits=4))
print (clf.max_depth)
clf.get_params()# print(classification_report_imbalanced(label_test, result))
clf.score(feature_test,label_test)
# Reports the CV mean with a +/- band of two standard deviations.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
#print ('hlo',clf.oob_score_)

# Confusion matrix of the held-out predictions, printed and drawn as an image
# via the `pl` plotting module (imported outside this snippet).
cm=sklearn.metrics.confusion_matrix(label_test,result )
print(cm)
pl.matshow(cm)
Beispiel #9
0
# Encode the species names as integer class labels and remember the class
# order for the submission columns.
le = LabelEncoder().fit(train.species)
labels = le.transform(train.species)
classes = list(le.classes_)
test_ids = test.id

# Drop the identifier (and, for the training frame, the target) columns.
train = train.drop(['id', 'species'], axis=1)
test = test.drop(['id'], axis=1)

# NOTE(review): this is the call form of the legacy sklearn.cross_validation
# StratifiedShuffleSplit (labels passed to the constructor, object iterated
# directly). It is left untouched because the import is outside this snippet.
sss = StratifiedShuffleSplit(labels, 10, test_size=0.2, random_state=23)

# The loop only rebinds the split variables, so just the split from the final
# iteration is used for training and evaluation below.
for train_index, test_index in sss:
    X_train, X_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

# min_samples_split must be at least 2: scikit-learn >= 0.18 rejects 1, and
# older releases silently raised it to 2, so results are unchanged there.
trees = ExtraTreesClassifier(n_estimators=100,
                             max_features=None,
                             min_samples_split=2)
trees.fit(X_train, y_train)

# Evaluate on the held-out part of the training data.
# print() with a single argument behaves the same on Python 2 and Python 3,
# unlike the former print statement, which is a SyntaxError on Python 3.
train_predictions = trees.predict(X_test)
accuracy = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.2%}".format(accuracy))

train_prob = trees.predict_proba(X_test)
loss = log_loss(y_test, train_prob)
print("Log loss: {:10.4f}".format(loss))

# Class probabilities for the real test set, one column per species, with the
# test ids inserted as the first column.
trees_predict = trees.predict_proba(test)

submission = pd.DataFrame(trees_predict, columns=classes)
submission.insert(0, 'id', test_ids)
Beispiel #10
0
# Random Forest
clf_rf = RandomForestClassifier(n_estimators=1000,
                                max_depth=None,
                                min_samples_split=10)

# The forest is fitted on all data, then scored with 5-fold cross-validation.
clf_rf = clf_rf.fit(X, y)
score_rf = cross_val_score(clf_rf, X, y, cv=5).mean()
print(score_rf)

# In[ ]:

# Extremely Randomised Trees
# max_features='sqrt' is the explicit spelling of what 'auto' meant for this
# classifier; 'auto' is no longer accepted by current scikit-learn releases.
clt_ext = ExtraTreesClassifier(max_features='sqrt',
                               bootstrap=True,
                               oob_score=True,
                               n_estimators=1000,
                               max_depth=None,
                               min_samples_split=10)
clt_ext.fit(X, y)
score_ext = cross_val_score(clt_ext, X, y, cv=5).mean()
print(score_ext)

# In[ ]:

# Gradient Boost
import warnings
# NOTE(review): this line only references the function and never calls it, so
# no warning filter is installed. Presumably warnings.filterwarnings('ignore')
# was intended - confirm before changing, since that would hide all warnings.
warnings.filterwarnings

clf_gb = GradientBoostingClassifier(n_estimators=1000,
                                    learning_rate=0.1,
                                    max_depth=3,
# Take the raw values of `ft` as the feature matrix.
X = ft.values
# np.random.permutation shuffles along the first axis only: the first call
# shuffles the rows, the second (via the transposes) shuffles the columns.
# NOTE(review): y0 below is built from ft.index in its original order, so
# after this shuffle the rows of X no longer line up with their labels.
# Presumably this is a deliberate randomised control - confirm.
X = np.random.permutation(X)
X = np.random.permutation(X.T).T
# Map every index entry of `ft` through lb.structure_name to get the labels.
y0 = ft.index.map(lb.structure_name).values

#%%
for label, y in (('All', y0), ):
    #*((key, np.where(y0==key, key, 'Other')) for key in np.unique(y0))):
    #%%
    transcripts = []
    scores = []
    trials = 1000
    for seed in tqdm(range(trials)):
        clf = ExtraTreesClassifier(n_estimators=50,
                                   random_state=seed,
                                   max_depth=5,
                                   criterion='entropy',
                                   min_impurity_decrease=0.05)
        clf.fit(X, y)
        dimred = SelectFromModel(clf, prefit=True, max_features=50)

        transcripts.extend(ft.T.index[dimred.get_support()])
        scores.extend(clf.feature_importances_[dimred.get_support()])

    df0 = pd.DataFrame()
    df0['transcripts'] = transcripts
    df0['scores'] = scores

    #%%
    def f(gp):
        return pd.DataFrame([[len(gp), max(gp['scores'])]],
# Integer-encode each column listed in label_l in place.
# presumably label_l holds the indices of the categorical columns - TODO confirm
for i in label_l:
    X[:, i] = labelencoder_X.fit_transform(X[:, i])

# NOTE(review): the categorical_features argument is not available in newer
# scikit-learn releases (ColumnTransformer replaces it) - confirm the version
# this script targets.
onehotencoder = OneHotEncoder(categorical_features=label_l)
X = onehotencoder.fit_transform(X).toarray()
# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)

# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]
print("Feature ranking:")

# Prints rank (1-based), column index and importance for every feature.
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

print("Features sorted by their score:")
print(
    sorted(zip(map(lambda x: round(x, 4), forest.feature_importances_),
def main():
    """Fit every configured model, pickle it, and emit a report and submission.

    The first 50000 rows of the archive are treated as the submission set and
    the remainder as labelled training data, which is split again into
    train / test parts. For each model the function:

    * fits it on the train part,
    * pickles it to ``cache/<model_name>.pkl``,
    * scores it on the test part,
    * appends a stringified report entry to ``report/report_ensemble.json``,
    * writes predictions for the submission rows to
      ``submission/submission_file_<model_name>.csv``.

    The ``report/``, ``cache/`` and ``submission/`` directories are not
    created here and must already exist.
    """
    # load data and split into submission and training data
    # The archive is opened once and closed as soon as both arrays are read.
    # presumably 'X' is a scipy sparse matrix stored as a 0-d object array (it
    # is unwrapped with [()] and densified with .toarray() below); NumPy
    # refuses to load object arrays unless allow_pickle=True.
    # NOTE(security): allow_pickle unpickles arbitrary objects - only load
    # archives from a trusted source.
    with np.load("data/bagOfWods_3000.npz", allow_pickle=True) as archive:
        X = archive['X'][()]
        Y = archive['y'][()]
    test_X = X[0:50000, :]
    train_X = X[50000:, :]
    train_Y = Y[50000:]

    # split into train/test
    X_train, X_temp, y_train, y_temp = train_test_split(train_X,
                                                        train_Y,
                                                        test_size=0.3,
                                                        random_state=0)
    # The validation part of the second split is not used in this function.
    X_test, _X_valid, y_test, _y_valid = train_test_split(X_temp,
                                                          y_temp,
                                                          test_size=0.3,
                                                          random_state=1)

    print("running model....")

    name_report = "report/report_%s.json" % "ensemble"

    # Reset the report file to an empty list; entries are accumulated in
    # `models` and the file is rewritten after every model.
    with open(name_report, mode='w') as f:
        json.dump([], f)
    models = []

    start = time.time()

    # dictionary of different models with their parameters
    model_dic = {
        "randomForest":
        RandomForestClassifier(n_jobs=-1, n_estimators=3000, max_depth=10),
        "logistic":
        LogisticRegression(n_jobs=-1),
        "svmrdf":
        SVC(probability=True),
        "linearSVM":
        LinearSVC(),
        "extra":
        ExtraTreesClassifier(n_estimators=3000, max_depth=10, n_jobs=-1)
    }

    # parameter grid
    # Only printed for reference: no grid search is run, each model is fitted
    # with the constructor arguments given in model_dic.
    PARAM_GRID = {
        "randomForest": {
            'n_estimators': [2000],
            'max_depth': [8, 11]
        },
        "logistic": {},
        "svmrdf": {
            'C': [0.1, 1]
        },
        "linearSVM": {},
        "extra": {
            'n_estimators': [2000],
            'max_depth': [8, 11]
        }
    }

    # Models whose predict() is fed a dense array instead of the sparse matrix.
    dense_input_models = ('gbm', 'svmrdf')

    # loop through dictionary of models and fit the model on data
    for model_name, model in model_dic.items():

        print("now %s is running" % model_name)
        print(PARAM_GRID[model_name])

        model.fit(X_train, y_train)

        # output model
        model_file_name = "cache/%s.pkl" % (model_name)
        with open(model_file_name, 'wb') as output:
            pickle.dump(model, output)

        needs_dense = model_name in dense_input_models
        if needs_dense:
            result = model.predict(X_test.toarray())
        else:
            result = model.predict(X_test)

        accuracy = accuracy_score(y_test, result)

        # Elapsed time is measured from the start of the whole run, so it is
        # cumulative across the models fitted so far.
        end = time.time()
        time_delay = end - start

        submission_name = "submission/submission_file_%s.csv" % (model_name)
        report = {
            "model_name": model_name,
            "accuracy": accuracy,
            "time_delay": time_delay,
            "submission_name": submission_name
        }
        # Entries are stored as the str() of the dict, not as JSON objects.
        report_str = str(report)

        with open(name_report, mode='w') as modeljson:

            models.append(report_str)
            json.dump(models, modeljson)

        if needs_dense:
            test_result = model.predict(test_X.toarray())
        else:
            test_result = model.predict(test_X)

        # make a submission file.
        # The ids 1..50000 match the 50000 submission rows sliced off above.
        submission = pd.DataFrame({
            'id': np.arange(1, 50001),
            'y': test_result
        })
        submission.to_csv(submission_name, index=False)
#from sklearn.neural_network import MLPClassifier
#
#sizes = (200,100,100)
#clfNN = MLPClassifier(solver='lbfgs', alpha=.015,
#                    hidden_layer_sizes=sizes, random_state=15)
#clfOne = OneVsRestClassifier(MLPClassifier(solver='lbfgs', alpha=.015,
#                    hidden_layer_sizes=sizes, random_state=15), n_jobs = -1)
#
#clfNN.fit(X_train,y_train)
#clfOne.fit(X_train,y_train)
#
#predicted_NN     = clfNN.predict(X_test)
#predicted_One    = clfOne.predict(X_test)
#%% Use TPOT to find best parameters/models
# Single-step pipeline: an Extra-Trees forest that considers 53% of the
# features at each split.
clfExtra = make_pipeline(
    ExtraTreesClassifier(criterion="gini", max_features=0.53,
                         n_estimators=500))
clfExtra.fit(X_train, y_train)

# Predicted labels for the held-out set, reused by the confusion matrix below.
predicted = clfExtra.predict(X_test)

#%%
from sklearn.metrics import confusion_matrix
from classification_utilities import display_cm, display_adj_cm

# Confusion matrix of true vs. predicted labels, rendered by the project's
# display_cm helper using the facies_labels defined outside this snippet.
conf = confusion_matrix(y_test, predicted)
display_cm(conf, facies_labels, hide_zeros=True)


def accuracy(conf):
    total_correct = 0.
    nb_classes = conf.shape[0]
Beispiel #15
0
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file.
# 'PATH/TO/DATA/FILE' and 'COLUMN_SEPARATOR' are placeholder strings; they
# must be replaced with a real path and delimiter before this script can run.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
# Features and target are converted to plain NumPy arrays via .values.
features = tpot_data.drop('target', axis=1).values
# Seeded split (random_state=42), so the partition is reproducible.
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.8385981283133181
exported_pipeline = make_pipeline(
    # Both union branches are identity copies of the input, so the classifier
    # receives every feature column twice.
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="entropy",
                         max_features=0.6500000000000001,
                         min_samples_leaf=2,
                         min_samples_split=11,
                         n_estimators=800))

# Fit on the training split, then predict labels for the held-out split.
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Beispiel #16
0
from flask import Flask, render_template
import flask
#from flask.ext.sqlalchemy import SQLAlchemy
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import ExtraTreesClassifier
import modify_data

# Model for forest cover-type identification.
# Trained once at import time so every request can reuse the fitted forest.
train = pd.read_csv('full_cols.csv')
# The target is handled as a string label rather than a number.
train['Cover_Type'] = train['Cover_Type'].apply(str)
# axis is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# 'Unnamed: 0' is dropped alongside the target so it is not used as a feature.
X_train = train.drop(['Cover_Type', 'Unnamed: 0'], axis=1)
y_train = train['Cover_Type']
FOREST = ExtraTreesClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
# End of Forest Model

app = Flask(__name__)

@app.route('/')
def viz_page():
    """
    Serve the app's visualization page.

    Returns the full text of ``visualization.html``, read from the current
    working directory on every request.
    """
    with open('visualization.html', 'r') as page:
        html = page.read()
    return html
@app.route('/test.html')
def test_page():
    """
    Serve the test page.

    Returns the full text of ``test.html``, read from the current working
    directory on every request.
    """
    with open('test.html', 'r') as page:
        html = page.read()
    return html
def train_classifiers(X_data, y):
    """
    Run a parameter search for a fixed line-up of classifiers on one split.

    The data is split once (30% held out, seed 42) and the same split is
    handed to ``train_single_classifier_type`` for each of: SVM, KNN, XGBoost,
    Random Forest, Extra Trees, LightGBM, RGF and FastRGF, in that order.

    :param X_data: feature data to split into train and test parts
    :param y: target values matching ``X_data``
    :return: a flat 14-tuple of the (model, grid) pairs returned by
        ``train_single_classifier_type``, ordered SVM, XGBoost, Random Forest,
        Extra Trees, LGBM, RGF, FRGF. The KNN search is run but its results
        are not part of the returned tuple.
    """
    # Split the dataset into Train and Test; every search shares this split.
    X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    split = (X_train, X_test, y_train, y_test)

    # --- Support vector machine ---
    svm_search_space = {
        'C': [1, 10, 100, 1000],
        'gamma': [1, 0.1, 0.001, 0.0001],
        'kernel': ['linear', 'rbf']
    }
    svm_model, svm_grid = train_single_classifier_type(
        SVC(), "SVM", svm_search_space, *split)

    # --- k-nearest neighbours (trained, but not returned) ---
    knn_search_space = {
        'n_neighbors': [5, 6, 7, 8, 9, 10],
        'leaf_size': [1, 2, 3, 5],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_jobs': [-1]
    }
    knn_model, knn_grid = train_single_classifier_type(
        KNeighborsClassifier(), "KNN", knn_search_space, *split)

    # --- XGBoost ---
    # Tuning notes kept from the author: max_depth is usually 6-8; the
    # learning rate sits around 0.05 but small changes can matter a lot;
    # min_child_weight / subsample / colsample_bytree are the knobs against
    # overfitting; n_estimators is the number of boosting rounds; ensembling
    # several seeds may reduce variance.
    xgb_search_space = {
        'nthread': [4],  # with hyperthreading, xgboost may become slower
        'objective': ['binary:logistic'],
        'learning_rate': [0.05, 0.1],  # the so-called `eta` value
        'max_depth': [6, 7, 8],
        'min_child_weight': [1, 11],
        'silent': [1],
        'subsample': [0.8],
        'colsample_bytree': [0.7, 0.8],
        # number of trees, change it to 1000 for better results
        'n_estimators': [5, 100, 1000],
        'missing': [-999],
        'seed': [1337]
    }
    xgb_trained, xgb_grid = train_single_classifier_type(
        xgb.XGBClassifier(), "XGBoost", xgb_search_space, *split)

    # --- Random forest ---
    rfc_search_space = {
        'max_depth': [4, 5, 6],
        'n_estimators': [100, 200],
        'criterion': ['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_leaf': [2, 4],
        'min_samples_split': [2, 5, 10],
    }
    rfc_model, rfc_grid = train_single_classifier_type(
        RandomForestClassifier(), "Random Forest", rfc_search_space, *split)

    # --- Extra trees ---
    ext_search_space = {
        'n_estimators': [50, 100],
        'max_features': [5, 10, 25],
        'min_samples_leaf': [2, 5, 10],
        'min_samples_split': [2, 5, 10],
    }
    ext_model, ext_grid = train_single_classifier_type(
        ExtraTreesClassifier(), "Extra Trees", ext_search_space, *split)

    # --- LightGBM ---
    lgbm_estimator = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=-1,  # replaces the older 'nthread' name
        silent=True)
    lgbm_search_space = {
        'max_depth': [5, 6, 7, 8, 9, 10, 15, 20],
        'learning_rate': [0.005],
        'n_estimators': [100, 150, 500],
        'num_leaves': [6, 8, 12, 16],
        'boosting_type': ['gbdt'],
        'objective': ['binary'],
        'random_state': [501],  # replaces the older 'seed' name
        'colsample_bytree': [0.65],
        'subsample': [0.7],
        'reg_alpha': [1, 10],
        'reg_lambda': [10, 100],
    }
    lgbm_model, lgbm_grid = train_single_classifier_type(
        lgbm_estimator, "LGBM", lgbm_search_space, *split)

    # --- Regularized greedy forest ---
    rgf_search_space = {
        'max_leaf': [900],
        'l2': [0.1, 0.05, 1.0],
        'min_samples_leaf': [5, 4, 3],
        'algorithm': ["RGF", "RGF_Opt", "RGF_Sib"],
        'loss': ["Log"],
    }
    rgf_model, rgf_grid = train_single_classifier_type(
        RGFClassifier(), "RGF", rgf_search_space, *split)

    # --- Fast regularized greedy forest ---
    frgf_search_space = {
        'max_leaf': [100, 200, 900],
        'n_estimators': [100, 1000],
        'max_bin': [10, 100],
        'l2': [0.1, 100, 1000],
        'min_samples_leaf': [5, 6],
        'opt_algorithm': ['rgf'],
        'loss': ["LS"],
    }
    frgf_model, frgf_grid = train_single_classifier_type(
        FastRGFClassifier(), "FRGF", frgf_search_space, *split)

    return (svm_model, svm_grid,
            xgb_trained, xgb_grid,
            rfc_model, rfc_grid,
            ext_model, ext_grid,
            lgbm_model, lgbm_grid,
            rgf_model, rgf_grid,
            frgf_model, frgf_grid)
def eval_trees_model(df, n_folds=10):
    """Cross-validate logistic regression, RF, Extra Trees and an RF+ET blend.

    For every fold the tweet text is TF-IDF vectorised, a logistic regression
    is fitted on all terms, and the 200 terms with the largest mean absolute
    coefficient are used as dense inputs for a random forest and an
    Extra-Trees classifier. The blend averages the two tree models' class
    probabilities. Mean and standard deviation of the accuracy of all four
    are printed; nothing is returned.

    Args:
        df: Table providing ``df.shape`` and the columns ``'tweets_text'``
            (documents) and ``'tweet_group'`` (labels).
        n_folds: Number of cross-validation folds (default 10, the value that
            was previously hard-coded).
    """
    # perform k-fold validation
    # NOTE(review): KFold(n=..., n_folds=...) and iterating the object directly
    # is the legacy sklearn.cross_validation API; left as-is because the
    # import is outside this function.
    kf = KFold(n=df.shape[0], n_folds=n_folds, random_state=SEED, shuffle=True)
    acc_scores_log = np.zeros(n_folds)
    acc_scores_rf = np.zeros(n_folds)
    acc_scores_et = np.zeros(n_folds)
    acc_scores_comb = np.zeros(n_folds)

    # logistic regression model with defaults
    log_cl = LogisticRegression()
    # rf model
    rf_cl = RandomForestClassifier(n_estimators=200, min_samples_split=16, random_state=SEED)
    # extra trees model
    et_cl = ExtraTreesClassifier(n_estimators=200, min_samples_split=16, random_state=SEED)

    for fold_n, (train_indices, fold_eval_indices) in enumerate(kf):
        print("Evaluating fold {} of {}".format(fold_n + 1, n_folds))
        # take a tfidf vectorisation of the text
        tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                              analyzer='word', token_pattern=r'\w{1,}',
                              decode_error='ignore',
                              ngram_range=(1, 1), use_idf=1, smooth_idf=1,
                              sublinear_tf=1)

        # NOTE(review): the fold indices are positions; indexing the columns
        # with [] presumably relies on df having a default 0..n-1 index -
        # confirm against the caller.
        X_train = tfv.fit_transform(df['tweets_text'][train_indices])
        X_eval = tfv.transform(df['tweets_text'][fold_eval_indices])

        y_train = np.array(list(df['tweet_group'][train_indices]))
        y_eval = np.array(list(df['tweet_group'][fold_eval_indices]))

        log_cl.fit(X_train, y_train)
        log_preds = log_cl.predict(X_eval)
        acc_scores_log[fold_n] = accuracy_score(y_eval, log_preds)

        # use the most important words to train the tree classifiers:
        # rank terms by the mean absolute coefficient over all sub-classifiers
        coef = np.abs(log_cl.coef_).mean(0)
        important_words_ind = np.argsort(coef)[-200:]

        # toarray() yields a plain ndarray; todense() would give np.matrix,
        # which newer scikit-learn versions refuse as estimator input.
        X_train_dense = X_train[:, important_words_ind].toarray()
        X_eval_dense = X_eval[:, important_words_ind].toarray()

        rf_cl.fit(X_train_dense, y_train)
        rf_preds = rf_cl.predict(X_eval_dense)
        rf_proba = rf_cl.predict_proba(X_eval_dense)
        acc_scores_rf[fold_n] = accuracy_score(y_eval, rf_preds)

        et_cl.fit(X_train_dense, y_train)
        et_preds = et_cl.predict(X_eval_dense)
        et_proba = et_cl.predict_proba(X_eval_dense)
        acc_scores_et[fold_n] = accuracy_score(y_eval, et_preds)

        # combine predictions by averaging the class probabilities of the two
        # tree models; their class orderings must agree for the columns to
        # line up, so that is what gets checked and used for the label lookup
        if not np.array_equal(rf_cl.classes_, et_cl.classes_):
            print("Error: different classes for classifiers. Combined predictions incorrect")
        comb_proba = 0.5*rf_proba + 0.5*et_proba
        comb_preds = [rf_cl.classes_[i] for i in comb_proba.argmax(1)]
        acc_scores_comb[fold_n] = accuracy_score(y_eval, comb_preds)

    print("Mean Log Accuracy:{}, Std:{}".format(np.mean(acc_scores_log), np.std(acc_scores_log)))
    print("Mean RF Accuracy:{}, Std:{}".format(np.mean(acc_scores_rf), np.std(acc_scores_rf)))
    print("Mean Extra Trees Accuracy:{}, Std:{}".format(np.mean(acc_scores_et), np.std(acc_scores_et)))
    print("Mean Combined Accuracy:{}, Std:{}".format(np.mean(acc_scores_comb), np.std(acc_scores_comb)))
Beispiel #19
0
# Load features and labels from HDF5; the label file name depends on `cas9`,
# which is defined outside this snippet.
features = pd.DataFrame(
    pd.read_hdf('../Experiment Data/deephf_x.h5', key='deephf'))
labels = pd.DataFrame(
    pd.read_hdf('../Experiment Data/deephf_y_' + cas9 + '.h5', key='deephf'))

# Join side by side; ignore_index=True renumbers the columns 0..n-1, so the
# label data ends up in the last column(s).
data = pd.concat([features, labels], axis=1, ignore_index=True)

# Drop rows with any missing value and renumber the remaining rows.
data = data.dropna().reset_index(drop=True)

# 85/15 split, stratified on the last column (treated as the label below).
train_data, test_data = train_test_split(data,
                                         test_size=0.15,
                                         random_state=1,
                                         stratify=data.iloc[:, -1])

# Forest used only as the importance source for SelectFromModel.
extraTree = ExtraTreesClassifier(n_estimators=500,
                                 n_jobs=-1,
                                 random_state=1,
                                 verbose=2)

# Pipeline steps: feature selection -> standardisation -> RBF SVM.
# NOTE(review): max_mem_size is not a scikit-learn SVC argument; presumably
# SVC is imported from another library here - verify the import.
steps = [('SFM', SelectFromModel(estimator=extraTree)),
         ('scaler', StandardScaler()),
         ('SVM',
          SVC(C=10,
              gamma=0.001,
              kernel='rbf',
              cache_size=20000,
              verbose=True,
              max_mem_size=6000,
              probability=True))]

# All but the last column are features; the last column is the target.
train_x = train_data.iloc[:, :-1]
train_y = train_data.iloc[:, -1]
Beispiel #20
0
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier as kn
# Feature columns shared by the training and test files; defined once so the
# two selections cannot drift apart.
FEATURE_COLUMNS = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
]

data = pd.read_csv('red.csv')
x = data[FEATURE_COLUMNS]
y = data['quality']

# Base estimators of the ensemble.
# min_samples_split must be at least 2: scikit-learn >= 0.18 rejects 1, and
# older releases silently raised it to 2, so results are unchanged there.
clf1 = GaussianNB()
clf2 = ExtraTreesClassifier(n_estimators=82,
                            max_depth=None,
                            min_samples_split=2,
                            random_state=0)
clf3 = RandomForestClassifier(random_state=0,
                              n_estimators=250,
                              min_samples_split=2)
clf4 = kn(n_neighbors=13)
# Soft voting averages the predicted class probabilities using the weights
# below, which favour the Extra-Trees model.
clf = VotingClassifier(estimators=[('gnb', clf1), ('et', clf2), ('rf', clf3),
                                   ('kn', clf4)],
                       voting='soft',
                       weights=[1, 8, 2, 1]).fit(x, y)
# From here on `x` refers to the test features, not the training features.
test = pd.read_csv('red_test.csv')
x = test[FEATURE_COLUMNS]
Beispiel #21
0
Datei: v4.py Projekt: mars1198/v4
# training and test datasets
# 80/20 split, seeded and stratified on the labels.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels)

# Each section below builds one classifier and passes it, together with the
# same split, an accuracy format string and a confusion-matrix title, to
# run_classifier (defined outside this snippet).

# Support Vector Machine
print('Support Vector Machine starting ...')
cl = LinearSVC()
run_classifier(cl, X_train, y_train, X_test, y_test,
               "CNN-SVM Accuracy: {0:0.1f}%", "SVM Confusion matrix")

#Extra Trees
print('Extra Trees Classifier starting ...')
# max_features=50 caps the number of features examined per split at 50.
cl = ExtraTreesClassifier(n_jobs=1,
                          n_estimators=10,
                          criterion='gini',
                          min_samples_split=2,
                          max_features=50,
                          max_depth=None,
                          min_samples_leaf=1)
run_classifier(cl, X_train, y_train, X_test, y_test,
               "CNN-ET Accuracy: {0:0.1f}%", "Extra Trees Confusion matrix")

# Random Forest
print('Random Forest Classifier starting ...')
cl = RandomForestClassifier(n_jobs=1,
                            criterion='entropy',
                            n_estimators=10,
                            min_samples_split=2)
run_classifier(cl, X_train, y_train, X_test, y_test,
               "CNN-RF Accuracy: {0:0.1f}%", "Random Forest Confusion matrix")
Beispiel #22
0
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Generate a synthetic 3-class dataset: 150 samples, 25 features of which
# 6 are informative and none are redundant
X, y = samples_generator.make_classification(n_samples=150,
                                             n_features=25,
                                             n_classes=3,
                                             n_informative=6,
                                             n_redundant=0,
                                             random_state=7)

# Select top K features.
# The target is a class label, so rank features with the classification
# F-test (f_classif); f_regression treats the labels as a continuous quantity.
from sklearn.feature_selection import f_classif
k_best_selector = SelectKBest(f_classif, k=9)

# Initialize Extremely Random Forests classifier
classifier = ExtraTreesClassifier(n_estimators=60, max_depth=4)

# Construct the pipeline: feature selection followed by the classifier
processor_pipeline = Pipeline([('selector', k_best_selector),
                               ('erf', classifier)])

# Set the parameters; these override the k=9 and n_estimators=60 given above
processor_pipeline.set_params(selector__k=7, erf__n_estimators=30)

# Training the pipeline
processor_pipeline.fit(X, y)

# Predict outputs for the input data (the same data the pipeline was fit on)
output = processor_pipeline.predict(X)
print("\nPredicted output:\n", output)
#  * loss for the 2nd, 3rd, 4th, 5th best move, etc (perfect move is
#    less likely if there are several very close alternatives)

modelnum = 0
# Fit one binary classifier per (elo group, error threshold) pair and report
# in-sample / out-of-sample average precision from a 2-fold stratified CV.
for elo_name, elo_df in train_df.groupby(train_df['elo_groups']):
    subset_df = elo_df
    for cb in chunk_bounds:
        msg('working on elo group %s, of size %i. fitting model for error >= %f' % (elo_name, subset_df.shape[0], cb))
        X = subset_df[features]
        # Binary target: does the clipped mover gain reach the threshold?
        y = (subset_df['clipped_movergain'] >= cb)

        # Hard-coded switches selecting which model family is used
        rfc = True
        if rfc:
            extra = True
            if extra:
                clf = ExtraTreesClassifier(min_samples_split=200, min_samples_leaf=50, n_jobs=-1, n_estimators=NUM_ESTIMATORS, verbose=1)
            else:
                clf = RandomForestClassifier(min_samples_split=200, min_samples_leaf=50, n_jobs=-1, n_estimators=NUM_ESTIMATORS, verbose=1, oob_score=True)
        else:
            clf = GradientBoostingClassifier(min_samples_split=500, min_samples_leaf=300, n_estimators=NUM_ESTIMATORS, verbose=1, subsample=0.5, learning_rate=0.2)

        msg('CROSS VALIDATING')
        # NOTE(review): this is the legacy StratifiedKFold(y, n_folds=...)
        # call form; confirm which scikit-learn module it is imported from.
        skf = StratifiedKFold(y, n_folds=2, shuffle=True)
        ins = []
        outs = []
        for train_index, test_index in skf:
            clf.fit(X.iloc[train_index], y.iloc[train_index])
            # average_precision_score expects (y_true, y_score): the true
            # labels go first and the model output second.
            ins.append(average_precision_score(y.iloc[train_index], clf.predict(X.iloc[train_index])))
            outs.append(average_precision_score(y.iloc[test_index], clf.predict(X.iloc[test_index])))
        msg("insample  average precision score: %s = %f" % (ins, np.mean(ins)))
        msg("outsample average precision score: %s = %f" % (outs, np.mean(outs)))
Beispiel #24
0
# Import statements required for Plotly
import plotly.offline as py

import plotly.graph_objs as go
from plotly import tools

# Read the example CSV; the file carries no header row
data = pd.read_csv('2clstrain1200.csv', header=None)

# Each display name paired with the tree-based classifier it labels; the
# pairs are then split back into two parallel lists.
_labelled_models = [
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest",
     RandomForestClassifier(max_features=1, n_estimators=20, max_depth=5)),
    ("ExtraTrees", ExtraTreesClassifier()),
]
names = [pair[0] for pair in _labelled_models]
treeclassifiers = [pair[1] for pair in _labelled_models]

# Disabled alternative: take the features/target from `data` instead
# X = data.iloc[:, 0:1]
# y = data.iloc[:, 1]

# Synthetic two-feature classification problem (both features informative)
_synthetic_options = dict(
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    random_state=1,
)
X, y = make_classification(**_synthetic_options)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    make_blobs()
        'features': ['numeric', 'categorical_encoded'],
        'model': Sklearn(GradientBoostingRegressor(loss='lad', n_estimators=300, max_depth=7, max_features=0.2)),
        'param_grid': {'n_estimators': (200, 400), 'max_depth': (6, 8), 'max_features': (0.1, 0.4)},
    },

    # AdaBoost regressor on numeric + encoded categorical features, with the
    # target passed through y_log_ofs(200).
    # NOTE(review): whether the 'param_grid' tuples are (low, high) ranges or
    # explicit candidate values is not visible here — confirm against the
    # code that consumes these presets.
    'ab-ce': {
        'features': ['numeric', 'categorical_encoded'],
        'y_transform': y_log_ofs(200),
        'model': Sklearn(AdaBoostRegressor(loss='linear', n_estimators=300)),
        'param_grid': {'n_estimators': (50, 400), 'learning_rate': (0.1, 1.0)},
    },

    # Extra-trees preset with only 2 estimators on numeric features; the
    # target transform is disabled. Presumably a quick smoke test — confirm.
    'et-tst': {
        'features': ['numeric'],
        # 'y_transform': y_log,
        'model': Sklearn(ExtraTreesClassifier(2, max_features=0.2, n_jobs=-1)),

    },

    # Extra-trees (200 estimators) on numeric + encoded categorical features
    # with the y_log target transform.
    # NOTE(review): a classifier combined with a log target transform looks
    # like a regression setup — confirm ExtraTreesRegressor was not intended.
    'et-ce': {
        'features': ['numeric', 'categorical_encoded'],
        'y_transform': y_log,
        'model': Sklearn(ExtraTreesClassifier(200, max_features=0.2, n_jobs=-1)),
    },

    # Same model as 'et-ce' but with the y_log_ofs(200) target transform
    # (the same review note about classifier vs. regressor applies).
    'et-ce-2': {
        'features': ['numeric', 'categorical_encoded'],
        'y_transform': y_log_ofs(200),
        'model': Sklearn(ExtraTreesClassifier(200, max_features=0.2, n_jobs=-1)),
    },
Beispiel #26
0
 params['max_depth'] = 3
 print(cross_val_score(ExtraTreesClassifier(**params), X, Y,
     scoring=scoring))
 params['max_depth'] = 4
 print(cross_val_score(ExtraTreesClassifier(**params), X, Y,
     scoring=scoring))
 params['max_depth'] = 5
 print(cross_val_score(ExtraTreesClassifier(**params), X, Y,
     scoring=scoring))
 exit(0)'''
 # Fit the final extra-trees model on the full (X, Y) data with
 # class-balanced weights.
 params = dict(n_estimators=1000,
               max_depth=3,
               class_weight='balanced',
               max_features=1,
               n_jobs=8)
 m = ExtraTreesClassifier(**params)
 m.fit(X, Y)
 # Find a decision threshold: walk the precision/recall curve computed on
 # the same data the model was fit on and stop at the first point whose
 # recall is below 0.8.
 from sklearn.metrics import precision_recall_curve
 best_t = None
 precision, recall, thresholds = precision_recall_curve(
     Y,
     m.predict_proba(X)[:, 1])
 for p, r, t in zip(precision, recall, thresholds):
     print(p, r, t)
     if r < 0.8:
         best_t = t
         break
 # NOTE(review): best_t stays None when no point has recall < 0.8, and the
 # comparison against None below would then fail on Python 3 — confirm that
 # this cannot happen for the expected data.
 print('best threshold:', best_t)
 # Accuracy of the thresholded probabilities, again measured on (X, Y)
 print('final', accuracy_score(Y, m.predict_proba(X)[:, 1] > best_t))
 # NOTE(review): the dump is named 'regressor' although `m` is a classifier.
 joblib.dump(m, 'regressor.joblib')
Beispiel #27
0
def _main():
    """Train the Approach 1 candidate models and write the submission file.

    Reads ``data/cs-training.csv`` and ``data/cs-test.csv``, preprocesses
    them and adds the engineered features, fits every base estimator on a
    stratified train/validation split, keeps the estimator with the best
    validation AUC, optionally re-fits it on the union of the estimators'
    top features, and hands the positive-class probabilities for the test
    set to ``_prepare_submission_file``.

    Raises:
        RuntimeError: if no estimator reaches a validation AUC above 0.
    """
    np.random.seed(rs)
    logger.info("Running script for Approach 1")
    tr_df = pd.read_csv(os.path.join("data", "cs-training.csv"), index_col=0)
    te_df = pd.read_csv(os.path.join("data", "cs-test.csv"), index_col=0)
    tr_df, te_df = _preprocess_data(tr_df, te_df)

    # Add features
    tr_df, te_df = feats.add_features_based_on_NumOCLL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_NumRELL(tr_df, te_df)
    tr_df, te_df = feats.add_features_based_on_RUoUL(tr_df, te_df)

    # Preparing dataset for training
    excluded_cols = [
        "age", "MonthlyIncome", "MonthlyIncome_Imputed", "SeriousDlqin2yrs"
    ]
    train_df = tr_df[tr_df.columns.difference(excluded_cols)]
    cols = train_df.columns.values.tolist()
    X, _ = utils.normalize_df(train_df)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent
    # and is what this function already uses for the target.
    X = X.values
    y = tr_df["SeriousDlqin2yrs"].values

    # Split. Every iteration overwrites the previous one, so only the last
    # of the generated splits is actually used.
    sss = StratifiedShuffleSplit(n_splits=3, random_state=rs, test_size=0.3)
    for train_index, test_index in sss.split(X, y):
        X_train, X_valid = X[train_index], X[test_index]
        y_train, y_valid = y[train_index], y[test_index]

    logger.info("X {}, train {}, valid {}" \
                .format(X.shape, X_train.shape, X_valid.shape))

    # Train
    logger.info("Features used for training : {}".format(cols))
    base_estimators = [
        ExtraTreesClassifier(n_estimators=400, n_jobs=-1, random_state=rs),
        LogisticRegressionCV(random_state=rs),
        RandomForestClassifier(bootstrap=True,
                               criterion="gini",
                               max_depth=None,
                               max_features=5,
                               n_estimators=150,
                               n_jobs=-1,
                               random_state=rs),
        # SVC(C=0.01, gamma=0.01, kernel="rbf", probability=True,
        #     random_state=rs)
    ]

    # Each classifier is trained on 5 stratified splits
    # and the one (amongst the 5) with best AUC score is selected
    best_auc = 0.0
    best_est = None
    common_top_n_features = []
    for est in base_estimators:
        fitted_est = utils.train_estimator(est, X_train, y_train, 5)
        top_n_features = []
        top_n_features_df = utils.log_important_features(est, cols)
        if top_n_features_df.shape[0] > 0:
            top_n_features = top_n_features_df.head(15).feature.values.tolist()
        # Accumulate the de-duplicated union of every estimator's top features
        common_top_n_features.extend(top_n_features)
        common_top_n_features = list(set(common_top_n_features))
        logger.info("{} common_top_n_features : {}" \
                    .format(len(common_top_n_features), common_top_n_features))
        # NOTE(review): the AUC is computed from hard class predictions;
        # confirm that scoring on predict_proba output was not intended.
        preds = fitted_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            best_est = fitted_est

    if best_est is None:
        # Fail with a clear message instead of an unbound-variable error
        raise RuntimeError("No estimator achieved a validation AUC above 0")

    logger.info("Best estimator : {}".format(best_est))

    # Re-fitting the best estimator using the common top N features
    refit = False  # TODO read from config
    if refit:
        logger.info("Re-fitting best estimator {} using top N features ..." \
                    .format(best_est.__class__.__name__))
        X, _ = utils.normalize_df(train_df[common_top_n_features])
        X = X.values
        y = tr_df["SeriousDlqin2yrs"].values
        sss = StratifiedShuffleSplit(n_splits=3,
                                     random_state=rs,
                                     test_size=0.3)
        for train_index, test_index in sss.split(X, y):
            X_train, X_valid = X[train_index], X[test_index]
            y_train, y_valid = y[train_index], y[test_index]
        fitted_best_est = utils.train_estimator(best_est, X_train, y_train, 5)
        preds = fitted_best_est.predict(X_valid)
        score = roc_auc_score(y_valid, preds)
        logger.info("AUC : {:.5f}".format(score))
        if score > best_auc:
            best_auc = score
            # Keep the re-fitted model, not the last estimator left over
            # from the training loop above.
            best_est = fitted_best_est

    # Getting the predictions
    logger.info("Get the predictions using {} ...".format(best_est))
    te_df_, _ = utils.normalize_df(te_df[cols])
    identifiers = te_df_.index.tolist()
    # Column 1 of predict_proba is taken as the positive-class probability
    if refit:
        p = [
            x[1] for x in best_est.predict_proba(te_df_[common_top_n_features])
        ]
    else:
        p = [x[1] for x in best_est.predict_proba(te_df_)]
    _prepare_submission_file(identifiers, p)
Beispiel #28
0
        # 4-fold cross_validation: one fold per column of fold_ids.
        # NOTE(review): `i` used in the log line below comes from an
        # enclosing loop that is not visible here (presumably the data
        # version index) — confirm.
        for j in xrange(fold_ids.shape[1]):

            fold = j + 1
            # IDs listed in column j (NaNs dropped) form the validation set;
            # every other row of `train` is used for fitting.
            val_ids = fold_ids.ix[:, j].dropna()
            idx = train["ID"].isin(list(val_ids))

            trainingSet = train[~idx]
            validationSet = train[idx]

            # Fixed-seed extra-trees model, rebuilt from scratch for each fold
            et = ExtraTreesClassifier(n_estimators=2000,
                                      criterion="entropy",
                                      max_depth=50,
                                      max_features=0.9,
                                      min_samples_split=3,
                                      min_samples_leaf=5,
                                      bootstrap=False,
                                      oob_score=False,
                                      random_state=112,
                                      verbose=0,
                                      n_jobs=-1)

            et.fit(trainingSet[feature_names], np.array(trainingSet["target"]))
            # Column 1 of predict_proba is scored against the validation
            # targets with log loss.
            preds = et.predict_proba(validationSet[feature_names])[:, 1]
            ll = log_loss(np.array(validationSet["target"]), preds)
            print "# Data_version : {0} | Fold : {1} | log_loss : {2}".format(
                i + 1, j + 1, ll)
            df = pd.DataFrame({
                "Fold":
                np.repeat((j + 1), validationSet.shape[0]),
                "ID":
Beispiel #29
0
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones)
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std
    return X_train, X_test, y_train, y_test


# Tree ensembles under test, keyed by display name
ESTIMATORS = dict(
    RandomForest=RandomForestClassifier(n_estimators=100),
    ExtraTreesClassifier=ExtraTreesClassifier(n_estimators=100),
)

# Load the data once at import time; BACKENDS below refers to X_train
X_train, X_test, y_train, y_test = load_data()

# (backend name, Parallel implementation, backend keyword arguments)
BACKENDS = [
    ('threading', Parallel, dict()),
    ('dask.distributed', Parallel2,
     dict(scheduler_host=SCHEDULER_ADDRESS, scatter=[X_train])),
]

if __name__ == "__main__":
    print("Dataset statistics:")
    print("===================")
    print("%s %d" % ("number of features:".ljust(25), X_train.shape[1]))
    print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size))
    print("%s %s" % ("data type:".ljust(25), X_train.dtype))
    print("%s %d (pos=%d, neg=%d, size=%dMB)"
# Binarize the multi-label targets
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y_train)

# Hold out 20% of the samples as the test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=random_state, test_size=0.2)

n_train, n_test = len(y_train), len(y_test)

# Materialise the 5 folds over the training rows
kfs = list(KFold(n_train, n_folds=5))

# Base learners, listed in blending order; each one is wrapped in a
# one-vs-rest classifier running on `cores` jobs.
_base_learners = [
    RandomForestClassifier(n_estimators=512, criterion='entropy',
                           max_depth=8),
    RandomForestClassifier(n_estimators=1024, criterion='gini', max_depth=8),
    svm.SVC(kernel='linear', C=4.0, probability=True),
    svm.SVC(kernel='rbf', C=2.0, gamma=np.power(2.0, -8.075),
            probability=True),
    ExtraTreesClassifier(n_estimators=512, criterion='entropy',
                         max_depth=16),
    GradientBoostingClassifier(n_estimators=512, learning_rate=0.01,
                               max_depth=8),
]
clfs = [OneVsRestClassifier(_base, n_jobs=cores) for _base in _base_learners]

# Meta features: placeholders for the blended matrices
blend_train_X = None
blend_train_y = None
blend_test = None

for j, clf in enumerate(clfs):
	print j, clf
	blend_test_j = None
	blend_train_X_j = None

        for i, (train, test) in enumerate(kfs):