from hyperopt import hp
from hpsklearn import (gaussian_nb, liblinear_svc, decision_tree, knn,
                       extra_trees, random_forest, gradient_boosting)


def tpe_classifier(name='clf'):
    linear_svc_space = hp.choice('liblinear_combination',
                                 [{'penalty': "l1", 'loss': "squared_hinge", 'dual': False},
                                  {'penalty': "l2", 'loss': "hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': True},
                                  {'penalty': "l2", 'loss': "squared_hinge", 'dual': False}])
    return hp.choice(name,
                     [gaussian_nb('hpsklearn_gaussian_nb'),
                      liblinear_svc('hpsklearn_liblinear_svc',
                                    C=hp.choice('hpsklearn_liblinear_svc_c',
                                                [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.]),
                                    loss=linear_svc_space['loss'],
                                    penalty=linear_svc_space['penalty'],
                                    dual=linear_svc_space['dual'],
                                    tol=hp.choice('hpsklearn_liblinear_svc_tol', [1e-5, 1e-4, 1e-3, 1e-2, 1e-1])
                                    ),
                      decision_tree('decision_tree',
                                    criterion=hp.choice('decision_tree_criterion', ["gini", "entropy"]),
                                    max_depth=hp.randint('decision_tree_max_depth', 10) + 1,
                                    min_samples_split=hp.randint('decision_tree_min_samples_split', 19) + 2,
                                    min_samples_leaf=hp.randint('decision_tree_min_samples_leaf', 20) + 1),
                      knn('knn',
                          n_neighbors=hp.randint('knn_n', 100) + 1,
                          weights=hp.choice('knn_weights', ['uniform', 'distance']),
                          p=hp.choice('knn_p', [1, 2])),
                      extra_trees('et',
                                  n_estimators=100,
                                  criterion=hp.choice('et_criterion', ["gini", "entropy"]),
                                  max_features=hp.randint('et_max_features', 20) * 0.05 + 0.05,
                                  min_samples_split=hp.randint('et_min_samples_split', 19) + 2,
                                  min_samples_leaf=hp.randint('et_min_samples_leaf', 20) + 1,
                                  bootstrap=hp.choice('et_bootstrap', [True, False])),
                      random_forest('rf',
                                    n_estimators=100,
                                    criterion=hp.choice('rf_criterion', ["gini", "entropy"]),
                                    max_features=hp.randint('rf_max_features', 20) * 0.05 + 0.05,
                                    min_samples_split=hp.randint('rf_min_samples_split', 19) + 2,
                                    min_samples_leaf=hp.randint('rf_min_samples_leaf', 20) + 1,
                                    bootstrap=hp.choice('rf_bootstrap', [True, False])),
                      gradient_boosting('gb',
                                        n_estimators=100,
                                        learning_rate=hp.choice('gb_lr', [1e-3, 1e-2, 1e-1, 0.5, 1.]),
                                        max_depth=hp.randint('gb_max_depth', 10) + 1,
                                        min_samples_split=hp.randint('gb_min_samples_split', 19) + 2,
                                        min_samples_leaf=hp.randint('gb_min_samples_leaf', 20) + 1,
                                        subsample=hp.randint('gb_subsample', 20) * 0.05 + 0.05,
                                        max_features=hp.randint('gb_max_features', 20) * 0.05 + 0.05,
                                        )
                      ])
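The search space above is meant to be handed to hpsklearn's HyperoptEstimator as its classifier argument. A minimal, illustrative sketch (X_train / y_train stand for any classification split; max_evals and trial_timeout are arbitrary values chosen here):

from hpsklearn import HyperoptEstimator
from hyperopt import tpe

estim = HyperoptEstimator(classifier=tpe_classifier('clf'),
                          preprocessing=[],
                          algo=tpe.suggest,
                          max_evals=20,
                          trial_timeout=120)
estim.fit(X_train, y_train)
print(estim.best_model())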
Example #2
    def anySample2():
        from hpsklearn import HyperoptEstimator, extra_trees
        from sklearn.datasets import fetch_openml  # fetch_mldata has been removed from scikit-learn
        from hyperopt import tpe
        import numpy as np

        # Download the data and split into training and test sets
        digits = fetch_openml('mnist_784', version=1, as_frame=False)  # replaces the retired fetch_mldata('MNIST original')

        X = digits.data
        y = digits.target

        test_size = int(0.2 * len(y))
        np.random.seed(13)
        indices = np.random.permutation(len(X))
        X_train = X[indices[:-test_size]]
        y_train = y[indices[:-test_size]]
        X_test = X[indices[-test_size:]]
        y_test = y[indices[-test_size:]]

        # Instantiate a HyperoptEstimator with the search space and number of evaluations
        estim = HyperoptEstimator(classifier=extra_trees('my_clf'),
                                  preprocessing=[],
                                  algo=tpe.suggest,
                                  max_evals=10,
                                  trial_timeout=300)

        # Search the hyperparameter space based on the data
        estim.fit(X_train, y_train)

        # Show the results
        print(estim.score(X_test, y_test))
        # 0.962785714286

        print(estim.best_model())
        # {'learner': ExtraTreesClassifier(bootstrap=True, class_weight=None, criterion='entropy',
        #           max_depth=None, max_features=0.959202875857,
        #           max_leaf_nodes=None, min_impurity_decrease=0.0,
        #           min_impurity_split=None, min_samples_leaf=1,
        #           min_samples_split=2, min_weight_fraction_leaf=0.0,
        #           n_estimators=20, n_jobs=1, oob_score=False, random_state=3,
        #           verbose=False, warm_start=False), 'preprocs': (), 'ex_preprocs': ()}
        pass
Example #3
from hpsklearn import HyperoptEstimator, extra_trees
from sklearn.datasets import fetch_openml
from hyperopt import tpe
import numpy as np

digits = fetch_openml('mnist_784', version=1, as_frame=False)  # replaces the retired fetch_mldata('MNIST original')

X = digits.data
y = digits.target

test_size = int(0.2 * len(y))
np.random.seed(13)
indices = np.random.permutation(len(X))
X_train = X[indices[:-test_size]]
y_train = y[indices[:-test_size]]
X_test = X[indices[-test_size:]]
y_test = y[indices[-test_size:]]

# Instantiate a HyperoptEstimator with the search space and number of evaluations

estim = HyperoptEstimator(classifier=extra_trees('my_clf'),
                          preprocessing=[],
                          algo=tpe.suggest,
                          max_evals=1,
                          trial_timeout=300)

# Search the hyperparameter space based on the data

estim.fit(X_train, y_train)

# Show the results

print(estim.score(X_test, y_test))
# 0.962785714286

print(estim.best_model())
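
As the printed output above suggests, best_model() returns a dict with 'learner', 'preprocs' and 'ex_preprocs' keys. An illustrative follow-up, if the winning estimator should be reused on its own (refitting it first to be safe):

best = estim.best_model()
clf = best['learner']
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
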
import numpy as np
from sklearn import svm, model_selection
from sklearn.datasets import load_iris
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from hpsklearn import random_forest, extra_trees, gradient_boosting
from pystacknet.pystacknet import StackNetClassifier  # import path as documented in the pystacknet README


def main():
    
    # Download the data and split into training and test sets

    iris = load_iris()
    
    X = iris.data
    y = iris.target
    
    test_size = int(0.2 * len(y))
    np.random.seed(13)
    indices = np.random.permutation(len(X))
    X_train = X[indices[:-test_size]]
    y_train = y[indices[:-test_size]]
    X_test = X[indices[-test_size:]]
    y_test = y[indices[-test_size:]]
    
    # for other datasets, more complex data cleaning would be needed
    
    
    
    # list the machine learning algorithms for hyperparameter tuning
    MLA = {
        'rfc':  [
                RandomForestClassifier(),
                #RandomForestClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
                {
                'n_estimators': [50,100,200], #default: 100
                'criterion': ['entropy'], #default: 'gini'
                'max_depth': [4,5,6], #default: None
                #'min_samples_split': [5,10,.03,.05,.10],
                'max_features': [.5],
                'random_state': [1],
                },
                random_forest('my_rfc'),
                ],
        
        'etc':  [
                ExtraTreesClassifier(), 
                #ExtraTreesClassifier (n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
                {
                'n_estimators': [50,100,200], #default: 100
                'criterion': ['entropy'], #default: 'gini'
                'max_depth': [4,5,6], #default: None
                'max_features': [.5],
                'random_state': [1],
                },
                extra_trees('my_etc'),
                ],
        
        'gbc':  [
                GradientBoostingClassifier(),
                #GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
                {
                #'loss': ['deviance', 'exponential'],
                'learning_rate': [.1,.25,.5],
                'n_estimators': [50,100,200],
                #'criterion': ['friedman_mse', 'mse', 'mae'],
                'max_depth': [4,5,6],
                'max_features': [.5],
                #'min_samples_split': [5,10,.03,.05,.10],
                #'min_samples_leaf': [5,10,.03,.05,.10],      
                'random_state': [1],
                },
                gradient_boosting('my_rgc'),
                ], 
        
        'lr':  [
                LogisticRegression(),
                #LogisticRegression(random_state=1)
                {
                #'fit_intercept': grid_bool,
                #'penalty': ['l1','l2'],
                #'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                'random_state': [1],
                },
                ], 
        
        'svc':  [
                svm.SVC(),
                {
                #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
                #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
                #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                'C': [1,2,3,4,5], #default=1.0
                'gamma': [.1, .25, .5, .75, 1.0], #default: 'scale' ('auto' in older scikit-learn)
                'decision_function_shape': ['ovo', 'ovr'], #default:ovr
                'probability': [True],
                'random_state': [0]
                },
                ],
    
        'xgb':  [
                XGBClassifier(),
                {
                #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
                'learning_rate': [.01, .03, .05, .1, .25], #default: .3
                'max_depth': [1,2,4,6,8,10], #default: 6
                'n_estimators': [10, 50, 100, 300], 
                'seed': [0]  
                },
                ]    
        }
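    # Each MLA entry is [estimator instance, GridSearchCV param_grid, optional
    # hpsklearn search-space constructor]; opt() below only uses the first two.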

    # some algorithms for HyperoptEstimator are listed below, but they raise errors !!!
    #MLA2 = {
        #'rfc':  [
                #random_forest('my_rfc'),
                #],
        
        #'etc':  [
                #extra_trees('my_etc'),
                #],
        
        #'gbc':  [
                #gradient_boosting('my_rgc'),
                #], 
 
        #}  
    # some algorithms for HyperoptEstimator are listed above, but they raise errors !!!
    
    
    def opt(clf):
        est = MLA[clf][0]

        # --------- wanted to use Hyperopt here, but it raises errors !!!
        #estim = HyperoptEstimator(classifier=MLA2[clf][0],
                                  #preprocessing=[],
                                  #algo=tpe.suggest,
                                  #max_evals=3,
                                  #trial_timeout=120)
        
        #estim.fit( X_train, y_train )
        
        #est = estim
        
        # --------- wanted to use Hyperopt here, but it raises errors !!!
        
        # use GridSearchCV instead; it is slow, though
        est = model_selection.GridSearchCV(estimator=est, param_grid=MLA[clf][1], cv=5) # --, scoring='roc_auc'
        
        return est
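
    # Illustrative note (not in the original code): opt() returns an *unfitted*
    # GridSearchCV wrapper. Because GridSearchCV is itself a scikit-learn
    # estimator, StackNetClassifier below can call fit()/predict_proba() on it
    # directly, so every stacking fold runs its own grid search. A quick
    # standalone check could look like:
    #   gs = opt('svc')
    #   gs.fit(X_train, y_train)
    #   print(gs.best_params_, gs.best_score_)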
        
    # for StackNetClassifier
    #models=[ 
            ######### First level ########
            #[RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
             #ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
             #GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
             #LogisticRegression(random_state=1)
            #],
            ######### Second level ########
            #[RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)]
            #]
    
    models=[ 
            ######## First level ########
            [
            opt('rfc'),
            opt('etc'),
            #opt('gbc'),
            #opt('lr'),
            ],
            ######## Second level ########
            [
            opt('rfc'),
            ],
           ]
    
    # use StackNet to stacking the models
    StackNetmodel=StackNetClassifier(models, folds=4, # --metric="auc", 
                                     restacking=False, use_retraining=True, use_proba=True, 
                                     random_state=12345, n_jobs=1, verbose=1)
    
    StackNetmodel.fit(X_train, y_train)
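
    # Illustrative follow-up (not in the original code): score the fitted stack on
    # the held-out split. pystacknet's StackNetClassifier exposes predict_proba(),
    # and assuming its class columns follow the sorted label order (0..2 for iris),
    # accuracy can be taken from the argmax of the class probabilities.
    proba = StackNetmodel.predict_proba(X_test)
    preds = np.argmax(proba, axis=1)
    print('StackNet accuracy: %.4f' % np.mean(preds == y_test))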