Example 1
from hpsklearn import HyperoptEstimator, random_forest, pca
from hyperopt import tpe
from sklearn.metrics import accuracy_score, f1_score


def random_forest_model_tpe():
    estim = HyperoptEstimator(classifier=random_forest('my_clf'),
                              preprocessing=[pca('my_pca')],
                              algo=tpe.suggest,
                              max_evals=150,
                              trial_timeout=60,
                              verbose=0)
    estim.fit(x_train, y_train)
    print("f1score", f1_score(estim.predict(x_test), y_test))
    print("accuracy score", accuracy_score(estim.predict(x_test), y_test))
    print(estim.best_model())
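The function assumes x_train, y_train, x_test and y_test already exist at module scope. A minimal setup sketch, using scikit-learn's bundled iris data as a hypothetical stand-in for the original (unshown) dataset:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Stand-in data; the original example does not show where its arrays come from.
iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.2, random_state=0)

random_forest_model_tpe()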
Example 2
def bench_classifiers(name):
    classifiers = [
        ada_boost(name + '.ada_boost'),  # boo
        gaussian_nb(name + '.gaussian_nb'),  # eey
        knn(name + '.knn', sparse_data=True),  # eey
        linear_discriminant_analysis(name + '.linear_discriminant_analysis', n_components=1),  # eey
        random_forest(name + '.random_forest'),  # boo
        sgd(name + '.sgd')  # eey
    ]
    if xgboost:
        classifiers.append(xgboost_classification(name + '.xgboost'))  # boo
    return hp.choice(name, classifiers)
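The hp.choice space returned by bench_classifiers can be passed directly as the classifier argument of HyperoptEstimator. A hedged usage sketch; the name 'bench', the evaluation budget, and the X_train/y_train arrays are illustrative assumptions:

from hpsklearn import HyperoptEstimator
from hyperopt import tpe

# Search over all benchmarked classifiers at once.
estim = HyperoptEstimator(classifier=bench_classifiers('bench'),
                          algo=tpe.suggest,
                          max_evals=50,
                          trial_timeout=60)
estim.fit(X_train, y_train)
print(estim.best_model())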
Example 3
def tpe_classifier(name='clf'):
    linear_svc_space = hp.choice('liblinear_combination',
                                 [{'penalty': "l1", 'loss': "squared_hinge", 'dual': False},
                                  {'penalty': "l2", 'loss': "hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': False}])
    return hp.choice(name,
                     [gaussian_nb('hpsklearn_gaussian_nb'),
                      liblinear_svc('hpsklearn_liblinear_svc',
                                    C=hp.choice('hpsklearn_liblinear_svc_c',
                                                [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]),
                                    loss=linear_svc_space['loss'],
                                    penalty=linear_svc_space['penalty'],
                                    dual=linear_svc_space['dual'],
                                    tol=hp.choice('hpsklearn_liblinear_svc_tol', [1e-5, 1e-4, 1e-3, 1e-2, 1e-1])
                                    ),
                      decision_tree('decision_tree',
                                    criterion=hp.choice('decision_tree_criterion', ["gini", "entropy"]),
                                    max_depth=hp.randint('decision_tree_max_depth', 10) + 1,
                                    min_samples_split=hp.randint('decision_tree_min_samples_split', 19) + 2,
                                    min_samples_leaf=hp.randint('decision_tree_min_samples_leaf', 20) + 1),
                      knn('knn',
                          n_neighbors=hp.randint('knn_n', 100) + 1,
                          weights=hp.choice('knn_weights', ['uniform', 'distance']),
                          p=hp.choice('knn_p', [1, 2])),
                      extra_trees('et',
                                  n_estimators=100,
                                  criterion=hp.choice('et_criterion', ["gini", "entropy"]),
                                  max_features=hp.randint('et_max_features', 20) * 0.05 + 0.05,
                                  min_samples_split=hp.randint('et_min_samples_split', 19) + 2,
                                  min_samples_leaf=hp.randint('et_min_samples_leaf', 20) + 1,
                                  bootstrap=hp.choice('et_bootstrap', [True, False])),
                      random_forest('rf',
                                    n_estimators=100,
                                    criterion=hp.choice('rf_criterion', ["gini", "entropy"]),
                                    max_features=hp.randint('rf_max_features', 20) * 0.05 + 0.05,
                                    min_samples_split=hp.randint('rf_min_samples_split', 19) + 2,
                                    min_samples_leaf=hp.randint('rf_min_samples_leaf', 20) + 1,
                                    bootstrap=hp.choice('rf_bootstrap', [True, False])),
                      gradient_boosting('gb',
                                        n_estimators=100,
                                        learning_rate=hp.choice('gb_lr', [1e-3, 1e-2, 1e-1, 0.5, 1.]),
                                        max_depth=hp.randint('gb_max_depth', 10) + 1,
                                        min_samples_split=hp.randint('gb_min_samples_split', 19) + 2,
                                        min_samples_leaf=hp.randint('gb_min_samples_leaf', 20) + 1,
                                        subsample=hp.randint('gb_subsample', 20) * 0.05 + 0.05,
                                        max_features=hp.randint('gb_max_features', 20) * 0.05 + 0.05,
                                        )
                      ])
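Since tpe_classifier only builds a search space, it can be sanity-checked without running a full search by drawing random configurations from it. A small sketch using hyperopt's stochastic sampler:

from hyperopt.pyll.stochastic import sample

# Draw one random candidate (an sklearn estimator with sampled hyperparameters).
print(sample(tpe_classifier()))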
Example 4

def main():

    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    estim = HyperoptEstimator(classifier=random_forest('myDT'),
                              algo=tpe.suggest,
                              max_evals=150,
                              trial_timeout=120,
                              verbose=True)

    estim.fit(X_train, y_train)

    print("\n\n{}\n\n".format(estim.score(X_test, y_test)))
    print("\n\n{}\n\n".format(estim.best_model()))
        print("\ndata is loaded  - next step > model testing\n")

        n_job = 6
        select_classes = [0, 1, 2, 3, 4, 5]
        val_dist = X_val_mini.shape[0] / X_train_mini.shape[0]
        name = 'my_est_oVa'

        tic_mod_all = time.time()
        select_alg = [
            ada_boost(name + '.ada_boost'),
            gaussian_nb(name + '.gaussian_nb'),
            knn(name + '.knn', sparse_data=True),
            linear_discriminant_analysis(name +
                                         '.linear_discriminant_analysis',
                                         n_components=1),
            random_forest(name + '.random_forest'),
            sgd(name + '.sgd'),
            xgboost_classification(name + '.xgboost')
        ]

        # fitting models
        estim_one_vs_rest = dict()
        # scoring models
        algo_scoring = dict()
        save_score_path = r'C:/Users/anden/PycharmProjects/NovelEEG/results'
        for alg in [select_alg[args.index]]:
            tic_mod = time.time()
            print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n",
                  "running on %s" % (alg.name + '.one_V_all'),
                  "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            clf_method = one_vs_rest(str(alg.name + '.one_V_all'),
Example 6
def main():
    
    # Download the data and split into training and test sets

    iris = load_iris()
    
    X = iris.data
    y = iris.target
    
    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]
    
    # for other datasets, there would be more complex data cleaning

    # list all machine learning algorithms for hyperparameter tuning
    MLA = {
        'rfc':  [
                RandomForestClassifier(),
                #RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
                {
                'n_estimators': [50,100,200], #default: 100
                'criterion': ['entropy'], #default: 'gini'
                'max_depth': [4,5,6], #default: None
                #'min_samples_split': [5,10,.03,.05,.10],
                'max_features': [.5],
                'random_state': [1],
                },
                random_forest('my_rfc'),
                ],
        
        'etc':  [
                ExtraTreesClassifier(), 
                #ExtraTreesClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
                {
                'n_estimators': [50,100,200], #default: 100
                'criterion': ['entropy'], #default: 'gini'
                'max_depth': [4,5,6], #default: None
                'max_features': [.5],
                'random_state': [1],
                },
                extra_trees('my_etc'),
                ],
        
        'gbc':  [
                GradientBoostingClassifier(),
                #GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
                {
                #'loss': ['deviance', 'exponential'],
                'learning_rate': [.1,.25,.5],
                'n_estimators': [50,100,200],
                #'criterion': ['friedman_mse', 'mse', 'mae'],
                'max_depth': [4,5,6],
                'max_features': [.5],
                #'min_samples_split': [5,10,.03,.05,.10],
                #'min_samples_leaf': [5,10,.03,.05,.10],      
                'random_state': [1],
                },
                gradient_boosting('my_rgc'),
                ], 
        
        'lr':  [
                LogisticRegression(),
                #LogisticRegression(random_state=1)
                {
                #'fit_intercept': grid_bool,
                #'penalty': ['l1','l2'],
                #'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'random_state': [1],
                },
                ], 
        
        'svc':  [
                svm.SVC(),
                {
                #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
                #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
                #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [1,2,3,4,5], #default=1.0
                'gamma': [.1, .25, .5, .75, 1.0], #default: 'auto'
                'decision_function_shape': ['ovo', 'ovr'], #default:ovr
                'probability': [True],
                'random_state': [0]
                },
                ],
    
        'xgb':  [
                XGBClassifier(),
                {
                #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
                'learning_rate': [.01, .03, .05, .1, .25], #default: .3
                'max_depth': [1,2,4,6,8,10], #default: 6
                'n_estimators': [10, 50, 100, 300], 
                'seed': [0]  
                },
                ]    
        }

    # list some algorithms for HyperoptEstimator, but they raise errors !!!
    #MLA2 = {
        #'rfc':  [
                #random_forest('my_rfc'),
                #],
        
        #'etc':  [
                #extra_trees('my_etc'),
                #],
        
        #'gbc':  [
                #gradient_boosting('my_rgc'),
                #], 
 
        #}  
    # list some algorithms for HyperoptEstimator, but they raise errors !!!
    
    
    def opt(clf):
        est = MLA[clf][0]

        # --------- wanted to use Hyperopt, but it raises errors !!!
        #estim = HyperoptEstimator(classifier=MLA2[clf][0],
                                  #preprocessing=[],
                                  #algo=tpe.suggest,
                                  #max_evals=3,
                                  #trial_timeout=120)
        
        #estim.fit( X_train, y_train )
        
        #est = estim
        
        # --------- wanted to use Hyperopt, but it raises errors !!!
        
        # fall back to GridSearchCV; it works but is slow
        est = model_selection.GridSearchCV(estimator=est, param_grid=MLA[clf][1], cv=5) # --, scoring='roc_auc'
        
        return est
        
    # for StackNetClassifier
    #models=[ 
            ######### First level ########
            #[RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
             #ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
             #GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
             #LogisticRegression(random_state=1)
            #],
            ######### Second level ########
            #[RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)]
            #]
    
    models=[ 
            ######## First level ########
            [
            opt('rfc'),
            opt('etc'),
            #opt('gbc'),
            #opt('lr'),
            ],
            ######## Second level ########
            [
            opt('rfc'),
            ],
           ]
    
    # use StackNet to stacking the models
    StackNetmodel=StackNetClassifier(models, folds=4, # --metric="auc", 
                                     restacking=False, use_retraining=True, use_proba=True, 
                                     random_state=12345, n_jobs=1, verbose=1)
    
    StackNetmodel.fit(X_train, y_train)    
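The commented-out HyperoptEstimator attempts fail because hpsklearn components such as random_forest('my_rfc') are hyperopt search-space expressions, not sklearn estimators, so GridSearchCV and StackNetClassifier cannot clone them. A hedged workaround sketch (an untested assumption, not the author's method): run the hyperopt search separately, then feed the tuned plain sklearn estimator from best_model()['learner'] into the model lists.

from hpsklearn import HyperoptEstimator, random_forest
from hyperopt import tpe

# Tune once with hyperopt, outside GridSearchCV/StackNet.
estim = HyperoptEstimator(classifier=random_forest('my_rfc'),
                          preprocessing=[],
                          algo=tpe.suggest,
                          max_evals=10,
                          trial_timeout=120)
estim.fit(X_train, y_train)
# best_model() returns a dict; 'learner' is a regular sklearn estimator
# that StackNetClassifier can clone and refit.
tuned_rfc = estim.best_model()['learner']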
Example 7

import numpy as np

from hpsklearn import HyperoptEstimator, random_forest
from hyperopt import tpe

# df and `features` are assumed to be defined upstream (not shown in this example)

X = df[features].values
y = df["outcome"]

test_size = int(0.1 * len(y))
np.random.seed(10)
indices = np.random.permutation(len(X))
X_train = np.float64(X[indices[:-test_size]])
y_train = np.float64(y[indices[:-test_size]])
X_test = np.float64(X[indices[-test_size:]])
y_test = np.float64(y[indices[-test_size:]])

estim = HyperoptEstimator(algo=tpe.suggest,
                          trial_timeout=300,
                          classifier=random_forest('my_random_forest'))

estim.fit(X_train, y_train)

print(estim.score(X_test, y_test))
# <<show score here>>
print(estim.best_model())

# In[Another look at error: out-of-bag error plot, originally written by Kian Ho]:

import matplotlib.pyplot as plt

from collections import OrderedDict
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
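These imports set up scikit-learn's well-known out-of-bag error demonstration (the plot originally contributed by Kian Ho). A condensed sketch of that plot, assuming only the imports above:

RANDOM_STATE = 123
X, y = make_classification(n_samples=500, n_features=25, random_state=RANDOM_STATE)

ensemble_clfs = [
    ("RandomForestClassifier, max_features='sqrt'",
     RandomForestClassifier(warm_start=True, oob_score=True,
                            max_features="sqrt", random_state=RANDOM_STATE)),
    ("ExtraTreesClassifier",
     ExtraTreesClassifier(warm_start=True, oob_score=True,
                          bootstrap=True, random_state=RANDOM_STATE)),
]

# Grow each forest incrementally (warm_start) and record the OOB error rate.
error_rate = OrderedDict((label, []) for label, _ in ensemble_clfs)
for label, clf in ensemble_clfs:
    for n_estimators in range(15, 176, 10):
        clf.set_params(n_estimators=n_estimators)
        clf.fit(X, y)
        error_rate[label].append((n_estimators, 1 - clf.oob_score_))

for label, xy_pairs in error_rate.items():
    xs, ys = zip(*xy_pairs)
    plt.plot(xs, ys, label=label)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.legend(loc="upper right")
plt.show()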
Example 8
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from hpsklearn import HyperoptEstimator, svc, random_forest, knn
from hyperopt import tpe


def scorer(yt, yp):
    # HyperoptEstimator minimizes loss_fn, so convert macro-F1 into a loss.
    return 1 - f1_score(yt, yp, average='macro')


if __name__ == '__main__':
    np.random.seed(42)
    train_X = np.load('data/train_X.npy')
    test_X = np.load('data/test_X.npy')
    train_Y = np.load('data/train_Y.npy')
    test_Y = np.load('data/test_Y.npy')

    estim = HyperoptEstimator(classifier=random_forest('rf'),
                              algo=tpe.suggest,
                              loss_fn=scorer,
                              max_evals=200,
                              trial_timeout=1200)
    estim.fit(train_X, train_Y)
    yp = estim.predict(test_X)
    print(f1_score(test_Y, yp, average='macro'))