Example #1
0
 def get_workflow(self, n_features=int(1E03)):
     """Build the Perms > CV > CVBestSearchRefit > Methods workflow.

     One ``Pipe(SelectKBest(k), SVC(C, linear))`` is created for every
     (C, k) pair, with C in [1, 10] and k taken from the result of
     ``range_log2(..., add_n=True)``.

     n_features: value handed to ``range_log2`` to build the k grid
         (it only acts as a cap when ``k_max`` is not "auto").
     Returns the ``Perms`` root node.
     """
     seed = 0
     k_max = "auto"
     if k_max == "auto":
         k_cap = n_features
     else:
         k_cap = np.minimum(int(k_max), n_features)
     k_grid = range_log2(k_cap, add_n=True)

     # C is the outer loop and k the inner one
     candidates = []
     for C in [1, 10]:
         for k in k_grid:
             candidates.append(
                 Pipe(SelectKBest(k=k), SVC(C=C, kernel="linear")))

     searcher = CVBestSearchRefit(Methods(*candidates),
                                  n_folds=5,
                                  random_state=seed)
     outer_cv = CV(searcher, n_folds=10)
     return Perms(outer_cv, n_perms=10, permute="y", random_state=seed)
Example #2
0
 def get_workflow(self):
     """Return Perms(CV(Pipe(SelectKBest(k=2), Methods(SVC C=1, SVC C=3))))."""
     ####################################################################
     ## EPAC WORKFLOW
     # -------------------------------------
     #             Perms                      Perm (Splitter)
     #         /     |       \
     #        0      1       2                Samples (Slicer)
     #        |
     #       CV                               CV (Splitter)
     #  /       |       \
     # 0        1       2                     Folds (Slicer)
     # |        |       |
     # Pipeline     Pipeline     Pipeline     Sequence
     # |
     # 2                                      SelectKBest (Estimator)
     # |
     # Methods
     # |                     \
     # SVM(linear,C=1)   SVM(linear,C=3)      Classifiers (Estimator)
     selector = SelectKBest(k=2)
     classifiers = Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]])
     cv_node = CV(Pipe(selector, classifiers), n_folds=3)
     return Perms(cv_node, n_perms=3, permute="y", random_state=1)
    def test_pipeline(self):
        """An EPAC Pipe must predict like the equivalent sklearn Pipeline."""
        X, y = datasets.make_classification(
            n_samples=20, n_features=5, n_informative=2)

        # EPAC side: run the two-step pipe top-down
        epac_pipe = Pipe(SelectKBest(k=2), SVC(kernel="linear"))
        epac_out = epac_pipe.top_down(X=X, y=y)

        # scikit-learn reference
        steps = [("anova", SelectKBest(k=2)), ("svm", SVC(kernel="linear"))]
        expected = sklearn.pipeline.Pipeline(steps).fit(X, y).predict(X)

        prediction_key = "y" + conf.SEP + conf.PREDICTION
        # Output returned directly by top_down
        same_direct = np.all(epac_out[prediction_key] == expected)
        self.assertTrue(same_direct, u"Diff in Pipe: EPAC vs sklearn")
        # Output obtained through reduce
        reduced = epac_pipe.reduce().values()[0][prediction_key]
        same_reduced = np.all(reduced == expected)
        self.assertTrue(same_reduced, u"Diff in Pipe: EPAC reduce")
Example #4
0
 def test_constructor_avoid_collision_level2(self):
     """Leaf keys must stay unique when only the second step (SVC C) differs."""
     pipes = [Pipe(SelectKBest(k=2), SVC(kernel="linear", C=C))
              for C in [1, 10]]
     methods = Methods(*pipes)
     keys = []
     for leaf in methods.walk_leaves():
         keys.append(leaf.get_key())
     # As many distinct keys as leaves means no collision is left
     self.assertTrue(len(keys) == len(set(keys)),
                     u'Collision could not be avoided')
Example #5
0
def do_all(options):
    """Build, run and reduce a Perms/CV/grid-search workflow, printing timings.

    options: object exposing the attributes read below (``k_max``,
        ``n_features``, ``n_samples``, ``n_informative``, ``n_folds``,
        ``n_folds_nested``, ``n_perms``, ``n_cores``, ``trace``).

    Nothing is returned: the reduced results and the time spent in each
    stage are written to stdout.
    """
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    # Single-argument print(...) produces the same text as the former
    # "print a, b" statement and is valid on both Python 2 and Python 3.
    print("Time ellapsed, tree construction: %s"
          % (time.time() - time_start,))

    ## 3) Run Workflow
    ## ===============
    time_fit_predict = time.time()
    ## Run on local machine
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    ## Run on cluster
    #    sfw_engine = SomaWorkflowEngine(
    #                        tree_root=wf,
    #                        num_processes=options.n_cores,
    #                        resource_id="jl237561@gabriel",
    #                        login="******")
    wf = sfw_engine.run(X=X, y=y)
    print("Time ellapsed, fit predict: %s"
          % (time.time() - time_fit_predict,))
    time_reduce = time.time()

    ## 4) Reduce Workflow
    ## ==================
    print(wf.reduce())
    print("Time ellapsed, reduce: %s" % (time.time() - time_reduce,))
Example #6
0
    def test_pipeline(self):
        """Compare Pipe(SelectKBest, SVC) with the matching sklearn Pipeline."""
        X, y = datasets.make_classification(
            n_samples=20, n_features=5, n_informative=2)

        # EPAC pipeline, executed top-down
        workflow = Pipe(SelectKBest(k=2), SVC(kernel="linear"))
        top_down_out = workflow.top_down(X=X, y=y)

        # Reference predictions from scikit-learn
        reference = sklearn.pipeline.Pipeline([
            ('anova', SelectKBest(k=2)),
            ('svm', SVC(kernel="linear")),
        ])
        y_ref = reference.fit(X, y).predict(X)

        pred_key = 'y' + conf.SEP + conf.PREDICTION
        # Predictions returned by top_down
        matches = np.all(top_down_out[pred_key] == y_ref)
        self.assertTrue(matches, u'Diff in Pipe: EPAC vs sklearn')
        # Predictions obtained through reduce
        y_reduced = workflow.reduce().values()[0][pred_key]
        matches = np.all(y_reduced == y_ref)
        self.assertTrue(matches, u'Diff in Pipe: EPAC reduce')
Example #7
0
 def get_workflow(self):
     """Return CV(2 folds) over a parallel best-search (3 folds) on a k x C grid."""

     def make_pipe(k):
         # One feature selector followed by one linear SVC per C value
         selector = SelectKBest(k=k)
         svms = Methods(*[SVC(kernel="linear", C=C) for C in [1, 2]])
         return Pipe(selector, svms)

     grid = Methods(*[make_pipe(k) for k in [1, 2]])
     searcher = CVBestSearchRefitParallel(grid, n_folds=3)
     return CV(searcher, n_folds=2)
    def test_cv_best_search_refit_parallel(self):
        """Run the same tree via SomaWorkflowEngine and directly, then compare."""
        n_cores = 2
        k_grid = [1, 2]
        C_grid = [1, 2]
        X, y = datasets.make_classification(n_samples=500,
                                            n_features=10000,
                                            n_informative=5)

        def build_tree():
            # CV(2 folds) > CVBestSearchRefitParallel(3 folds) > k x C grid
            per_k = []
            for k in k_grid:
                selector = SelectKBest(k=k)
                svms = Methods(*[SVC(kernel="linear", C=C) for C in C_grid])
                per_k.append(Pipe(selector, svms))
            search = CVBestSearchRefitParallel(Methods(*per_k), n_folds=3)
            return CV(search, n_folds=2)

        # EPAC workflow for parallel computing
        engine = SomaWorkflowEngine(tree_root=build_tree(),
                                    num_processes=n_cores,
                                    remove_finished_wf=False,
                                    remove_local_tree=False)
        engine_tree = engine.run(X=X, y=y)

        # EPAC workflow for normal node computing
        local_tree = build_tree()
        local_tree.run(X=X, y=y)

        self.assertTrue(compare_two_node(engine_tree, local_tree))
        self.assertTrue(comp_2wf_reduce_res(engine_tree, local_tree))
Example #9
0
    def test_cvbestsearchrefit_select_k_best(self):
        """Compare CVBestSearchRefitParallel with sklearn GridSearchCV over k.

        The comparison is repeated on a freshly generated dataset for each
        SVC C value in 2..9.
        """
        from sklearn.pipeline import Pipeline
        n_folds_nested = 2
        k_values = [2, 3, 4, 5, 6]
        key_y_pred = 'y' + conf.SEP + conf.PREDICTION
        for C_value in range(2, 10):
            X, y = datasets.make_classification(n_samples=100,
                                                n_features=500,
                                                n_informative=5)
            # With EPAC
            candidates = [Pipe(SelectKBest(k=k),
                               SVC(C=C_value, kernel="linear"))
                          for k in k_values]
            wf = CVBestSearchRefitParallel(Methods(*candidates),
                                           n_folds=n_folds_nested)
            wf.run(X=X, y=y)
            r_epac = wf.reduce().values()[0]

            # Without EPAC
            clf = Pipeline([('anova', SelectKBest(k=3)),
                            ('svm', SVC(C=C_value, kernel="linear"))])
            cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
            gscv = grid_search.GridSearchCV(clf, {'anova__k': k_values},
                                            cv=cv_nested)
            gscv.fit(X, y)
            y_pred = gscv.predict(X)
            sk_best = gscv.best_params_
            # Alias the sklearn parameter name under the plain name 'k'
            sk_best['k'] = sk_best['anova__k']
            r_sklearn = {key_y_pred: y_pred, conf.BEST_PARAMS: sk_best}

            # Comparisons
            same_pred = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
            self.assertTrue(same_pred,
                            u'Diff CVBestSearchRefitParallel: prediction')
            epac_best = r_epac[conf.BEST_PARAMS][0]
            for name in epac_best:
                if name not in r_sklearn[conf.BEST_PARAMS]:
                    continue
                same_param = \
                    r_sklearn[conf.BEST_PARAMS][name] == epac_best[name]
                self.assertTrue(
                    same_param,
                    u'Diff CVBestSearchRefitParallel: best parameters')
Example #10
0
    def test_peristence_load_and_fit_predict(self):
        """Results must agree between an in-memory tree and its StoreFs copies.

        Three result sets are compared: the tree run in memory, the tree
        reloaded from the store and re-run, and the tree reloaded after
        being saved again.
        """
        import tempfile
        X, y = datasets.make_classification(n_samples=20,
                                            n_features=10,
                                            n_informative=2)
        per_k = []
        for k in [1, 2]:
            selector = SelectKBest(k=k)
            svms = Methods(*[SVC(kernel="linear", C=C) for C in [1, 2]])
            per_k.append(Pipe(selector, svms))
        search = CVBestSearchRefitParallel(Methods(*per_k), n_folds=3)
        tree_mem = CV(search,
                      n_folds=2,
                      reducer=ClassificationReport(keep=False))

        # Save the tree, then run it in memory
        store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        res_mem = tree_mem.reduce().values()[0]

        # Reload the saved tree and run it
        tree_reloaded = store.load()
        tree_reloaded.run(X=X, y=y)
        res_fs_noresults = tree_reloaded.reduce().values()[0]

        # Save again, reload, and reduce without running
        tree_reloaded.save_tree(store=store)
        res_fs_withresults = store.load().reduce().values()[0]

        # Compare every key of the in-memory results across the three sets
        per_key = []
        for k in res_mem:
            in_memory = np.asarray(res_mem[k])
            reloaded = np.asarray(res_fs_noresults[k])
            per_key.append(
                np.all(in_memory == reloaded)
                and np.all(reloaded == np.asarray(res_fs_withresults[k])))
        self.assertTrue(np.all(per_key))
Example #11
0
 def get_workflow(self):
     """Return CV(3 folds) of SelectKBest(k=2) followed by SVC C=1 and C=3."""
     ####################################################################
     ## EPAC WORKFLOW
     # -------------------------------------
     #       CV                               CV (Splitter)
     #  /       |       \
     # 0        1       2                     Folds (Slicer)
     # |        |       |
     # Pipeline     Pipeline     Pipeline     Sequence
     # |
     # 2                                      SelectKBest (Estimator)
     # |
     # Methods
     # |                     \
     # SVM(linear,C=1)   SVM(linear,C=3)      Classifiers (Estimator)
     selector = SelectKBest(k=2)
     svms = Methods(*[SVC(kernel="linear", C=C) for C in [1, 3]])
     sequence = Pipe(selector, svms)
     return CV(sequence, n_folds=3, reducer=ClassificationReport(keep=True))
Example #12
0
 def _search_best(self, **Xy):
     """Run the inner CV grid search and select the best-scoring entry.

     Xy: keyword data forwarded unchanged to ``self.cv.top_down``.

     Returns ``(to_refit, best_params)``: a new ``Pipe`` built from the
     ``wrapped_node`` of each node whose signature appears in the best
     key, and a list holding one parameter dict per element of
     ``key_split(best_key, eval=True)``.
     """
     from epac.workflow.pipeline import Pipe
     # Fit/predict CV grid search
     self.cv.store = StoreMem()  # local store erased at each fit
     self.cv.top_down(**Xy)
     #  Pump-up results
     cv_result_set = self.cv.reduce(store_results=False)
     key_val = [(result.key(), result[self.score])
                for result in cv_result_set]
     # Explicit extraction of the scores: zip(*key_val)[1] only works
     # where zip() returns a list.
     scores = np.asarray([score for _, score in key_val])
     scores_opt = np.max(scores) if self.arg_max else np.min(scores)
     # On ties the first matching entry is kept
     idx_best = np.where(scores == scores_opt)[0][0]
     best_key = key_val[idx_best][0]
     # Find nodes that match the best; the key is split once and each
     # node signature is computed once.
     best_signatures = list(key_split(best_key))
     nodes_dict = {}
     for node in self.cv.walk_true_nodes():
         signature = node.get_signature()
         if signature in best_signatures:
             nodes_dict[signature] = node
     to_refit = Pipe(
         *[nodes_dict[sig].wrapped_node for sig in best_signatures])
     best_params = [dict(sig) for sig in key_split(best_key, eval=True)]
     return to_refit, best_params
def do_all(options):
    """Build a Perms/CV/grid-search workflow and export it for soma_workflow_gui.

    options: object exposing the attributes read below (``k_max``,
        ``n_features``, ``n_samples``, ``n_informative``, ``n_folds``,
        ``n_folds_nested``, ``n_perms``, ``n_cores``, ``trace``,
        ``soma_workflow_dir``).

    Side effects: ``options.soma_workflow_dir`` is deleted if it already
    exists, the workflow is exported there, a ``reduce.py`` helper script
    is written into it, and timings plus usage instructions are printed.
    Nothing is returned.
    """
    if options.k_max != "auto":
        k_values = range_log2(np.minimum(int(options.k_max),
                                         options.n_features),
                              add_n=True)
    else:
        k_values = range_log2(options.n_features, add_n=True)
    C_values = [1, 10]
    random_state = 0
    if options.trace:
        from epac import conf
        conf.TRACE_TOPDOWN = True

    ## 1) Build dataset
    ## ================
    X, y = datasets.make_classification(n_samples=options.n_samples,
                                        n_features=options.n_features,
                                        n_informative=options.n_informative)

    ## 2) Build Workflow
    ## =================
    time_start = time.time()
    ## CV + Grid search of a pipeline with a nested grid search
    cls = Methods(*[
        Pipe(SelectKBest(k=k), SVC(kernel="linear", C=C)) for C in C_values
        for k in k_values
    ])
    pipeline = CVBestSearchRefit(cls,
                                 n_folds=options.n_folds_nested,
                                 random_state=random_state)
    wf = Perms(CV(pipeline, n_folds=options.n_folds),
               n_perms=options.n_perms,
               permute="y",
               random_state=random_state)
    # Single-argument print(...) produces the same text as the former
    # "print a, b" statement and is valid on both Python 2 and Python 3.
    print("Time ellapsed, tree construction: %s"
          % (time.time() - time_start,))

    ## 3) Export Workflow to soma_workflow_gui
    ## ===============
    time_fit_predict = time.time()
    if os.path.isdir(options.soma_workflow_dir):
        shutil.rmtree(options.soma_workflow_dir)
    sfw_engine = SomaWorkflowEngine(tree_root=wf,
                                    num_processes=options.n_cores)
    sfw_engine.export_to_gui(options.soma_workflow_dir, X=X, y=y)

    print("Time ellapsed, fit predict: %s"
          % (time.time() - time_fit_predict,))

    ## 4) Write the helper script that loads the Epac tree and reduces it
    ## ==================================================================
    reduce_filename = os.path.join(options.soma_workflow_dir, "reduce.py")
    # NOTE(review): the generated script uses the Python 2 print statement
    # and embeds the directory path unescaped between double quotes.
    reduce_str = """from epac.map_reduce.engine import SomaWorkflowEngine
wf = SomaWorkflowEngine.load_from_gui("%s")
print wf.reduce()
""" % options.soma_workflow_dir
    # The context manager closes the file even if write() raises
    with open(reduce_filename, 'w') as f:
        f.write(reduce_str)
    print("#First run\n"
          "soma_workflow_gui\n"
          "\t(1)Open %s\n"
          "\t(2)Submit\n"
          "\t(3)Transfer Input Files\n"
          "\t...wait...\n"
          "\t(4)Transfer Output Files\n"
          "#When done run:\npython %s" % (
              os.path.join(options.soma_workflow_dir,
                           sfw_engine.open_me_by_soma_workflow_gui),
              reduce_filename))
Example #14
0
 def test_constructor_cannot_avoid_collision_level2(self):
     """Methods must raise ValueError when given two identical pipelines."""
     # Both pipes use k=2 and C=1, so nothing distinguishes them
     identical_pipes = []
     for C in [1, 1]:
         identical_pipes.append(
             Pipe(SelectKBest(k=2), SVC(kernel="linear", C=C)))
     self.assertRaises(ValueError, Methods, *identical_pipes)
Example #15
0
    def todo_perm_cv_grid_vs_sklearn(self):
        """Compare an EPAC Perms/CV/grid-search run with a hand-written sklearn loop.

        The EPAC workflow is Perms(CV(CVBestSearchRefit(Methods(...)))) over
        a k x C grid. The reference re-implements the same nesting with
        ``Permutations``, ``StratifiedKFold`` and ``GridSearchCV``. Only the
        result keys present on both sides are compared.

        NOTE(review): the ``todo_`` prefix presumably keeps the test runner
        from collecting this method -- confirm before renaming.
        """
        X, y = datasets.make_classification(n_samples=100,
                                            n_features=500,
                                            n_informative=5)
        n_perms = 3
        n_folds = 2
        n_folds_nested = 2
        random_state = 0
        k_values = [2, 3]
        C_values = [1, 10]
        # = With EPAC
        pipelines = Methods(*[Pipe(SelectKBest(k=k),
                                   SVC(C=C, kernel="linear"))
                              for C in C_values
                              for k in k_values])
        pipelines_cv = CVBestSearchRefit(pipelines,
                                         n_folds=n_folds_nested,
                                         random_state=random_state)
        wf = Perms(CV(pipelines_cv, n_folds=n_folds,
                      reducer=ClassificationReport(keep=True)),
                   n_perms=n_perms, permute="y",
                   reducer=PvalPerms(keep=True),
                   random_state=random_state)
        wf.fit_predict(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        for key in r_epac:
            print("key=" + repr(key) + ", value=" + repr(r_epac[key]))

        # = With SKLEARN
        from sklearn.cross_validation import StratifiedKFold
        from epac.sklearn_plugins import Permutations
        from sklearn.pipeline import Pipeline
        from sklearn import grid_search

        clf = Pipeline([('anova', SelectKBest(k=3)),
                        ('svm', SVC(kernel="linear"))])
        parameters = {'anova__k': k_values, 'svm__C': C_values}

        # One slot per (permutation, fold) for fold-level values,
        # one slot per permutation for the fold averages.
        r_sklearn = dict()
        r_sklearn['pred_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['true_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['score_tr'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['score_te'] = [[None] * n_folds for i in range(n_perms)]
        r_sklearn['mean_score_te'] = [None] * n_perms
        r_sklearn['mean_score_tr'] = [None] * n_perms

        perms = Permutations(n=y.shape[0],
                             n_perms=n_perms,
                             random_state=random_state)
        for perm_nb, idx in enumerate(perms):
            y_p = y[idx]
            cv = StratifiedKFold(y=y_p, n_folds=n_folds)
            for fold_nb, (idx_train, idx_test) in enumerate(cv):
                X_train = X[idx_train, :]
                X_test = X[idx_test, :]
                # The labels are 1-D: a second subscript (y_p[idx, :])
                # raises IndexError, so index with the fold indices only.
                y_p_train = y_p[idx_train]
                y_p_test = y_p[idx_test]
                # Nested CV
                cv_nested = StratifiedKFold(y=y_p_train,
                                            n_folds=n_folds_nested)
                gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
                gscv.fit(X_train, y_p_train)
                r_sklearn['pred_te'][perm_nb][fold_nb] = gscv.predict(X_test)
                r_sklearn['true_te'][perm_nb][fold_nb] = y_p_test
                r_sklearn['score_tr'][perm_nb][fold_nb] =\
                    gscv.score(X_train, y_p_train)
                r_sklearn['score_te'][perm_nb][fold_nb] =\
                    gscv.score(X_test, y_p_test)
            # Average over folds
            r_sklearn['mean_score_te'][perm_nb] = \
                np.mean(np.asarray(r_sklearn['score_te'][perm_nb]), axis=0)
            r_sklearn['mean_score_tr'][perm_nb] = \
                np.mean(np.asarray(r_sklearn['score_tr'][perm_nb]), axis=0)

        print(repr(r_sklearn))
        # - Comparisons
        shared_keys = set(r_epac.keys()).intersection(set(r_sklearn.keys()))
        comp = {k: np.all(np.asarray(r_epac[k]) == np.asarray(r_sklearn[k]))
                for k in shared_keys}
        print("comp=" + repr(comp))
        for key in comp:
            self.assertTrue(comp[key], u'Diff for attribute: "%s"' % key)
Example #16
0
# Load the dataset file and pull out its "X" and "y" entries.
Xy = np.load(datasets_filepath)
X = Xy["X"]
y = Xy["y"]

from sklearn.svm import LinearSVC as SVM
from sklearn.feature_selection import SelectKBest
from sklearn import preprocessing


##############################################################################
## Pipeline, "Pipe": SelectKBest + StandardScaler + SVM
from epac import Pipe, CV
n_folds = 10

anova_svm = Pipe(SelectKBest(k=5), 
                 preprocessing.StandardScaler(), 
                 SVM(class_weight='auto'))

# Wrap the whole pipeline in a 10-fold CV, run it, then reduce the results.
cv = CV(anova_svm, n_folds=n_folds)
cv.run(X=X, y=y)
#
res_cv_anova_svm = cv.reduce()
# Bare expression (interactive-style inspection): the looked-up recall score
# is discarded when this file is run as a script.
res_cv_anova_svm["SelectKBest/StandardScaler/LinearSVC"]['y/test/score_recall']

##############################################################################
## Multimethods, "Methods": SVM l1 vs l2
from epac import Methods, CV
svms = Methods(SVM(penalty="l1", class_weight='auto', dual=False), 
               SVM(penalty="l2", class_weight='auto', dual=False))

# Rebinds ``cv`` to a new CV node over the two SVM variants.
cv = CV(svms, n_folds=n_folds)
Example #17
0
"""

from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
from sklearn.lda import LDA
from sklearn.feature_selection import SelectKBest
X, y = datasets.make_classification(n_samples=12, n_features=10,
                                    n_informative=2, random_state=1)

# Build sequential Pipeline
# -------------------------
# 2  SelectKBest (Estimator)
# |
# SVM Classifier (Estimator)
from epac import Pipe
pipe = Pipe(SelectKBest(k=2), SVM())
pipe.run(X=X, y=y)

# The downstream data-flow is a set of keyword arguments (dict) containing
# X and y. It will pass through each processing node, SelectKBest(k=2) and SVM.
# Each node calls the "transform" method, which takes a dictionary as input
# and produces a dictionary as output. The output is passed to the next node.

# The return value of the run is simply the aggregation of the outputs (dict)
# of the leaf nodes.

# print(...) with a single argument behaves the same on Python 2 and 3
for leaf in pipe.walk_leaves():
    print(leaf.load_results())

# The result of each branch of the tree is stored in the corresponding leaf.
# An iteration on all the leaves of a tree can return all the results
Example #18
0
"""

from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
from sklearn.lda import LDA
from sklearn.feature_selection import SelectKBest
X, y = datasets.make_classification(n_samples=12, n_features=10,
                                    n_informative=2, random_state=1)

# Build sequential Pipeline
# -------------------------
# 2  SelectKBest (Estimator)
# |
# SVM Classifier (Estimator)
from epac import Pipe
pipe = Pipe(SelectKBest(k=2), SVM())
pipe.run(X=X, y=y)

# The downstream data-flow is a set of keyword arguments (dict) containing
# X and y. It will pass through each processing node, SelectKBest(k=2) and SVM.
# Each node calls the "transform" method, which takes a dictionary as input
# and produces a dictionary as output. The output is passed to the next node.

# The return value of the run is simply the aggregation of the outputs (dict)
# of the leaf nodes.

## Parallelization
## ===============

# Multi-classifiers
# -----------------
Example #19
0
from sklearn import datasets
from sklearn.svm import LinearSVC as SVM
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import SelectKBest
X, y = datasets.make_classification(n_samples=12,
                                    n_features=10,
                                    n_informative=2,
                                    random_state=1)

# Build sequential Pipeline
# -------------------------
# 2  SelectKBest (Estimator)
# |
# SVM Classifier (Estimator)
from epac import Pipe
pipe = Pipe(SelectKBest(k=2), SVM())
pipe.run(X=X, y=y)

# The downstream data-flow is a set of keyword arguments (dict) containing
# X and y. It will pass through each processing node, SelectKBest(k=2) and SVM.
# Each node calls the "transform" method, which takes a dictionary as input
# and produces a dictionary as output. The output is passed to the next node.

# The return value of the run is simply the aggregation of the outputs (dict)
# of the leaf nodes.

for leaf in pipe.walk_leaves():
    print(leaf.load_results())

# The result of each branch of the tree is stored in the corresponding leaf.
# An iteration on all the leaves of a tree can return all the results
# Recode in place: PAS2gr 1 -> -1 then 2 -> 1, and CB_EXPO 0 -> -1.
Xd.PAS2gr[Xd.PAS2gr == 1] = -1
Xd.PAS2gr[Xd.PAS2gr == 2] = 1
Xd.CB_EXPO[Xd.CB_EXPO == 0] = -1

X = np.asarray(Xd)
y = np.asarray(yd)

#k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
C_values = [0.01, 0.05, .1, .5, 1, 5, 10, 100, 1000]

# anova + SVM L1
# ==============

# NOTE(review): the selector uses alpha=1e-1 while the comment and the CSV
# file name below say p<0.05 -- confirm which threshold is intended.
anova_svms = Pipe(
    mylib.SelectPvalue(alpha=1e-1),
    Methods(*[
        SVM(C=C, penalty="l1", class_weight='auto', dual=False)
        for C in C_values
    ]))

# P<0.05
cv = CV(anova_svms, cv_type="stratified", n_folds=10)
a = cv.run(X=X, y=y)
cv_results = cv.reduce()
# print(...) with a single argument behaves the same on Python 2 and 3
print(cv_results)

epac.export_csv(
    cv, cv_results,
    os.path.join(WD, "results",
                 "cv10_caarms+pas+canabis_anova(p<0.05)_svmsl1.csv"))
# recall_mean: 67%
Example #21
0
@author: ed203246
"""

from sklearn import datasets
from sklearn.svm import SVC
from sklearn.lda import LDA
from sklearn.feature_selection import SelectKBest
X, y = datasets.make_classification(n_samples=12,
                                    n_features=10,
                                    n_informative=2)

from epac import Methods, Pipe

# NOTE: this first grid (kernel x C x k) is rebound by the next statement,
# so only the second grid (C x k, default kernel) is fitted below.
self = Methods(*[
    Pipe(SelectKBest(k=k), SVC(kernel=kernel, C=C))
    for kernel in ("linear", "rbf") for C in [1, 10] for k in [1, 2]
])
self = Methods(
    *[Pipe(SelectKBest(k=k), SVC(C=C)) for C in [1, 10] for k in [1, 2]])

import copy
self.fit_predict(X=X, y=y)
self.reduce()
# The workflow is bound to ``self`` in this script; the former name ``svms``
# was never defined and raised NameError. These two expressions are
# interactive-style inspections whose values are discarded.
[l.get_key() for l in self.walk_nodes()]
[l.get_key(2)
 for l in self.walk_nodes()]  # intermediary key collisions: trig aggregation
"""
# Model selection using CV: CV + Grid
# -----------------------------------------
from epac import CVBestSearchRefit