def test_cvbestsearchrefit(self):
    """Check that EPAC's CVBestSearchRefit selects the same predictions
    and best parameters as an equivalent scikit-learn GridSearchCV run
    on the same data and parameter grid."""
    X, y = datasets.make_classification(n_samples=12, n_features=10,
                                        n_informative=2)
    n_folds_nested = 2
    C_values = [.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    # EPAC: one SVC per (C, kernel) pair; nested CV picks the best one.
    candidates = [SVC(C=C, kernel=kernel)
                  for C in C_values for kernel in kernels]
    wf = CVBestSearchRefit(Methods(*candidates), n_folds=n_folds_nested)
    wf.run(X=X, y=y)
    r_epac = wf.reduce().values()[0]  # Python 2: dict.values() is a list
    # scikit-learn reference: grid-search the same grid with the same CV.
    clf = SVC(kernel="linear")
    parameters = {'C': C_values, 'kernel': kernels}
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn = dict()
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
    # Predictions must match exactly.
    self.assertTrue(np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred]),
                    u'Diff CVBestSearchRefit: prediction')
    # Every best parameter known to both sides must agree.
    for name in r_epac[conf.BEST_PARAMS][0]:
        if name not in r_sklearn[conf.BEST_PARAMS]:
            continue
        same = r_sklearn[conf.BEST_PARAMS][name] == \
            r_epac[conf.BEST_PARAMS][0][name]
        self.assertTrue(same, u'Diff CVBestSearchRefit: best parameters')
def test_mem():
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    wf = CVBestSearchRefit(
                Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]),
                n_folds=10)
    wf.run(X=X, y=y) # Top-down process: computing recognition rates, etc.
    print wf.reduce() # Bottom-up process: computing p-values, etc.
    def test_peristence_perm_cv_parmethods_pipe_vs_sklearn(self):
        """Same comparison as test_cvbestsearchrefit, but the EPAC workflow
        is saved to disk and reloaded twice (before running, and again
        before reducing) to exercise tree persistence via StoreFs.
        """
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        X, y = datasets.make_classification(n_samples=12, n_features=10,
                                            n_informative=2)
        n_folds_nested = 2
        #random_state = 0
        C_values = [.1, 0.5, 1, 2, 5]
        kernels = ["linear", "rbf"]
        # With EPAC: one SVC per (C, kernel) pair under nested CV.
        methods = Methods(*[SVC(C=C, kernel=kernel)
            for C in C_values for kernel in kernels])
        wf = CVBestSearchRefit(methods, n_folds=n_folds_nested)
        # Save workflow
        # -------------
        import tempfile
        #store = StoreFs("/tmp/toto", clear=True)
        # NOTE(review): tempfile.mktemp() is deprecated and race-prone;
        # presumably StoreFs creates the path itself -- confirm before
        # switching to mkdtemp().
        store = StoreFs(tempfile.mktemp())
        wf.save_tree(store=store)
        wf = store.load()  # reload the tree before running
        wf.run(X=X, y=y)
        ## Save results, then reload once more before reducing
        wf.save_tree(store=store)
        wf = store.load()
        r_epac = wf.reduce().values()[0]

        # - Without EPAC: equivalent sklearn grid search on the same data.
        r_sklearn = dict()
        clf = SVC(kernel="linear")
        parameters = {'C': C_values, 'kernel': kernels}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        # EPAC also reports the method name among the best parameters.
        r_sklearn[conf.BEST_PARAMS]['name'] = 'SVC'

        # - Comparisons: predictions, then every sklearn best parameter.
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u'Diff CVBestSearchRefit: prediction')
        comp = np.all([r_epac[conf.BEST_PARAMS][0][p] == r_sklearn[conf.BEST_PARAMS][p]
        for p in  r_sklearn[conf.BEST_PARAMS]])
        self.assertTrue(comp, u'Diff CVBestSearchRefit: best parameters')
    def test_cvbestsearchrefit_select_k_best(self):
        """Compare CVBestSearchRefit over SelectKBest+SVC pipelines against
        an equivalent scikit-learn GridSearchCV, repeated for several SVC
        C values."""
        from sklearn.pipeline import Pipeline
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        n_folds_nested = 2
        k_values = [2, 3, 4, 5, 6]
        for C_value in range(2, 10, 1):
            X, y = datasets.make_classification(n_samples=100,
                                                n_features=500,
                                                n_informative=5)
            # EPAC: one pipeline per candidate k; nested CV picks the best.
            pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                       SVC(C=C_value, kernel="linear"))
                                  for k in k_values])
            wf = CVBestSearchRefit(pipelines, n_folds=n_folds_nested)
            wf.run(X=X, y=y)
            r_epac = wf.reduce().values()[0]
            # scikit-learn reference: grid-search the same k values.
            clf = Pipeline([('anova', SelectKBest(k=3)),
                            ('svm', SVC(C=C_value, kernel="linear"))])
            cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, {'anova__k': k_values},
                                            cv=cv_nested)
            gscv.fit(X, y)
            r_sklearn = dict()
            r_sklearn[key_y_pred] = gscv.predict(X)
            r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
            # Mirror EPAC's parameter name ('k' vs sklearn's 'anova__k').
            r_sklearn[conf.BEST_PARAMS]['k'] = \
                r_sklearn[conf.BEST_PARAMS]['anova__k']
            # Predictions must match exactly.
            self.assertTrue(
                np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred]),
                u'Diff CVBestSearchRefit: prediction')
            # Best parameters known to both sides must agree.
            for name in r_epac[conf.BEST_PARAMS][0]:
                if name not in r_sklearn[conf.BEST_PARAMS]:
                    continue
                same = r_sklearn[conf.BEST_PARAMS][name] == \
                    r_epac[conf.BEST_PARAMS][0][name]
                self.assertTrue(same,
                                u'Diff CVBestSearchRefit: best parameters')
# --- Script fragment (Python 2). `cv`, `svms_cv`, `SVM`, `X`, `y`, `pd`
# --- and `Xd` are defined earlier in the original script, outside this
# --- excerpt.
cv_results = cv.reduce()
print cv_results

#[{'key': CVBestSearchRefit, 'y/test/score_f1': [ 0.82352941  0.7       ], 'y/test/recall_pvalues': [ 0.01086887  0.06790736], 'y/test/score_precision': [ 0.77777778  0.77777778], 'y/test/recall_mean_pvalue': 0.0191572904587, 'y/test/score_recall': [ 0.875       0.63636364], 'y/test/score_accuracy': 0.777777777778, 'y/test/score_recall_mean': 0.755681818182}])

# (Translated from French:)
# Out of the 27 subjects, 11 made the transition and 16 did not.
# - Sensitivity (detection rate of the transitions):
#   63.63 %, i.e. 7 / 11 (p = 0.067)
#
# - Specificity (detection rate of those who did not transition,
#   i.e. 1 - false positives):
#   87.5 %, i.e. 14 / 16 (p = 0.01)
#
# Average correct-classification rate: 77 %
#
svms_cv.run(X=X, y=y)
#{'best_params': [{'C': 0.05, 'name': 'LinearSVC'}],
# 'y/pred': array([0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0,
#       0, 0, 0, 0]),
# 'y/true': array([0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0,
#       0, 0, 0, 0])}

# Refit an L1-penalized linear SVM on all data and inspect its sparse
# coefficients (interactive-session residue: bare expressions below only
# display values in a REPL).
svm = SVM(dual=False, class_weight='auto', penalty="l1", C=.05)
svm.fit(X, y)
svm.coef_
len(svm.coef_.squeeze())
# 27
coef = svm.coef_.squeeze()
# Show the variables with non-zero coefficients and their weights.
print pd.DataFrame(dict(var=Xd.columns[coef != 0], coef=coef[coef != 0]))
#       coef   var
#0 -0.084406  @4.3
# --- Scrape artifact: "Beispiel #6" separator ("Beispiel" is German for
# --- "Example"; the bare name would raise NameError if executed).
Beispiel #6
0
#  LDA  SVM    Classifier (Estimator)
from epac import CV, Methods
# Plain cross-validation over two candidate classifiers.
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
#                     Methods          (Splitter)
#               /              \
#            KBest(1)         KBest(5) SelectKBest (Estimator)
#              |
#            Methods                   (Splitter)
#        /          \
#    LDA()          SVM() ...          Classifiers (Estimator)
pipelines = Methods(
    *[Pipe(SelectKBest(k=k), Methods(LDA(), SVM())) for k in [1, 5]])
print([n for n in pipelines.walk_leaves()])
best_cv = CVBestSearchRefit(pipelines)
# --- Scrape artifact: "Beispiel #7" separator. Near-duplicate of the
# --- previous example, but using Python 2 print statements and also
# --- running the best_cv workflow.
Beispiel #7
0
from epac import CV, Methods
# Plain cross-validation over two candidate classifiers.
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print cv.reduce()


# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods       (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)   Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print wf.reduce()

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
#                     Methods          (Splitter)
#               /              \
#            KBest(1)         KBest(5) SelectKBest (Estimator)
#              |
#            Methods                   (Splitter)
#        /          \
#    LDA()          SVM() ...          Classifiers (Estimator)
pipelines = Methods(*[Pipe(SelectKBest(k=k), Methods(LDA(), SVM())) for k in [1, 5]])
print [n for n in pipelines.walk_leaves()]
best_cv = CVBestSearchRefit(pipelines)
best_cv.run(X=X, y=y)
# --- Scrape artifact: "Beispiel #8" separator. `d`, `svms`, `n_folds`,
# --- `X`, `y` and `preprocessing` come from earlier in the original
# --- script, outside this excerpt.
Beispiel #8
0
print pd.DataFrame(d).to_string()

##############################################################################
# Automatic model selection: "CVBestSearchRefit"
from epac import CVBestSearchRefit, Methods, CV

# Wrap the svms grid in automatic model selection, then cross-validate it.
svms_auto = CVBestSearchRefit(svms)
cv = CV(svms_auto, n_folds=n_folds)
cv.run(X=X, y=y)
#
res_cv_svms_auto = cv.reduce()
print res_cv_svms_auto
print res_cv_svms_auto["CVBestSearchRefit"]['y/test/score_recall']

# Re-fit on all data. Warning: biased !!!
svms_auto.run(X=X, y=y)
print svms_auto.best_params
# NOTE(review): "refited" is the attribute name exposed by EPAC (sic).
print svms_auto.refited.estimator.coef_

##############################################################################
# Put everything together
# Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2
from epac import range_log2
from epac import CVBestSearchRefit, Pipe, Methods, CV
# Grid: every k (log2-spaced up to n_features) x C x penalty combination.
k_values = range_log2(X.shape[1], add_n=True)
C_values = [.1, 1, 10, 100]
anova_svms = Methods(*[Pipe(SelectKBest(k=k), preprocessing.StandardScaler(),
                      Methods(*[SVM(C=C, penalty=penalty, class_weight='auto', dual=False) 
                                for C in C_values for penalty in  ['l1', 'l2']]))
                  for k in k_values])