def test_mem(): X, y = datasets.make_classification(n_samples=2000, n_features=10000, n_informative=2, random_state=1) wf = CVBestSearchRefit( Methods(*[SVC(kernel="linear"), SVC(kernel="rbf")]), n_folds=10) wf.run(X=X, y=y) # Top-down process: computing recognition rates, etc. print wf.reduce() # Bottom-up process: computing p-values, etc.
def test_cvbestsearchrefit(self):
    """Check that CVBestSearchRefit matches sklearn's GridSearchCV."""
    X, y = datasets.make_classification(n_samples=12, n_features=10,
                                        n_informative=2)
    n_folds_nested = 2
    #random_state = 0
    C_values = [.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    # With EPAC: one SVC per (C, kernel) combination, refit on the best.
    grid = [SVC(C=C, kernel=kernel)
            for C in C_values for kernel in kernels]
    wf = CVBestSearchRefit(Methods(*grid), n_folds=n_folds_nested)
    wf.run(X=X, y=y)
    r_epac = wf.reduce().values()[0]
    # - Without EPAC: the equivalent plain scikit-learn grid search.
    r_sklearn = dict()
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(SVC(kernel="linear"),
                                    {'C': C_values, 'kernel': kernels},
                                    cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
    # - Comparisons: predictions must be identical...
    same_pred = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
    self.assertTrue(same_pred, u'Diff CVBestSearchRefit: prediction')
    # ...and both must agree on every best parameter sklearn reports.
    epac_best = r_epac[conf.BEST_PARAMS][0]
    sk_best = r_sklearn[conf.BEST_PARAMS]
    for key_param in epac_best:
        if key_param in sk_best:
            self.assertTrue(sk_best[key_param] == epac_best[key_param],
                            u'Diff CVBestSearchRefit: best parameters')
def test_peristence_perm_cv_parmethods_pipe_vs_sklearn(self):
    """Persist a CVBestSearchRefit workflow to disk, reload it (both
    before and after running), then compare against GridSearchCV."""
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    X, y = datasets.make_classification(n_samples=12, n_features=10,
                                        n_informative=2)
    n_folds_nested = 2
    #random_state = 0
    C_values = [.1, 0.5, 1, 2, 5]
    kernels = ["linear", "rbf"]
    # With EPAC: the full C x kernel grid under nested CV.
    grid = [SVC(C=C, kernel=kernel)
            for C in C_values for kernel in kernels]
    wf = CVBestSearchRefit(Methods(*grid), n_folds=n_folds_nested)
    # Save workflow
    # -------------
    import tempfile
    #store = StoreFs("/tmp/toto", clear=True)
    # NOTE(review): tempfile.mktemp() is race-prone and deprecated;
    # presumably StoreFs needs a not-yet-existing path -- confirm the
    # StoreFs contract before switching to mkdtemp().
    store = StoreFs(tempfile.mktemp())
    wf.save_tree(store=store)
    wf = store.load()
    wf.run(X=X, y=y)
    ## Save results, then reload once more before reducing.
    wf.save_tree(store=store)
    wf = store.load()
    r_epac = wf.reduce().values()[0]
    # - Without EPAC: equivalent plain scikit-learn grid search.
    r_sklearn = dict()
    cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
    gscv = grid_search.GridSearchCV(SVC(kernel="linear"),
                                    {'C': C_values, 'kernel': kernels},
                                    cv=cv_nested)
    gscv.fit(X, y)
    r_sklearn[key_y_pred] = gscv.predict(X)
    r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
    r_sklearn[conf.BEST_PARAMS]['name'] = 'SVC'
    # - Comparisons: identical predictions and best parameters.
    same_pred = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
    self.assertTrue(same_pred, u'Diff CVBestSearchRefit: prediction')
    same_params = np.all([r_epac[conf.BEST_PARAMS][0][p] ==
                          r_sklearn[conf.BEST_PARAMS][p]
                          for p in r_sklearn[conf.BEST_PARAMS]])
    self.assertTrue(same_params, u'Diff CVBestSearchRefit: best parameters')
def test_cvbestsearchrefit_select_k_best(self):
    """For several SVC C values, check that CVBestSearchRefit over
    SelectKBest->SVC pipelines matches sklearn's GridSearchCV.

    Fix vs. original: the ``sklearn.pipeline`` import and the
    loop-invariant constants (``n_folds_nested``, ``k_values``,
    ``key_y_pred``) were re-executed on every loop iteration; they are
    hoisted out of the loop. Behavior is otherwise unchanged.
    """
    from sklearn.pipeline import Pipeline
    n_folds_nested = 2
    #random_state = 0
    k_values = [2, 3, 4, 5, 6]
    key_y_pred = 'y' + conf.SEP + conf.PREDICTION
    for C_value in range(2, 10):
        # NOTE(review): no random_state here, so each iteration draws a
        # fresh dataset -- the test is not reproducible run-to-run.
        X, y = datasets.make_classification(n_samples=100,
                                            n_features=500,
                                            n_informative=5)
        # With EPAC: one SelectKBest->SVC pipeline per candidate k.
        methods = Methods(*[Pipe(SelectKBest(k=k),
                                 SVC(C=C_value, kernel="linear"))
                            for k in k_values])
        wf = CVBestSearchRefit(methods, n_folds=n_folds_nested)
        wf.run(X=X, y=y)
        r_epac = wf.reduce().values()[0]
        # - Without EPAC: equivalent sklearn grid search over anova__k.
        r_sklearn = dict()
        clf = Pipeline([('anova', SelectKBest(k=3)),
                        ('svm', SVC(C=C_value, kernel="linear"))])
        parameters = {'anova__k': k_values}
        cv_nested = StratifiedKFold(y=y, n_folds=n_folds_nested)
        gscv = grid_search.GridSearchCV(clf, parameters, cv=cv_nested)
        gscv.fit(X, y)
        r_sklearn[key_y_pred] = gscv.predict(X)
        r_sklearn[conf.BEST_PARAMS] = gscv.best_params_
        # Mirror sklearn's 'anova__k' under the key EPAC reports ('k').
        r_sklearn[conf.BEST_PARAMS]['k'] = \
            r_sklearn[conf.BEST_PARAMS]['anova__k']
        # - Comparisons: identical predictions...
        comp = np.all(r_epac[key_y_pred] == r_sklearn[key_y_pred])
        self.assertTrue(comp, u'Diff CVBestSearchRefit: prediction')
        # ...and matching best parameters on the keys both report.
        for key_param in r_epac[conf.BEST_PARAMS][0]:
            if key_param in r_sklearn[conf.BEST_PARAMS]:
                comp = r_sklearn[conf.BEST_PARAMS][key_param] == \
                    r_epac[conf.BEST_PARAMS][0][key_param]
                self.assertTrue(comp, \
                    u'Diff CVBestSearchRefit: best parameters')
from epac import CV, Methods

# Cross-validate two classifiers side by side.
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods           (Splitter)
#     /       \
# SVM(C=1)  SVM(C=10)    Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods

# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
#      Methods           (Splitter)
#     /       \
# KBest(1)  KBest(5)     SelectKBest (Estimator)
#     |
#  Methods               (Splitter)
#   /    \
# LDA()  SVM()  ...      Classifiers (Estimator)
k_candidates = [1, 5]
pipelines = Methods(*[Pipe(SelectKBest(k=k), Methods(LDA(), SVM()))
                      for k in k_candidates])
print(list(pipelines.walk_leaves()))
best_cv = CVBestSearchRefit(pipelines)
best_cv.run(X=X, y=y)
cv = CV(Methods(LDA(), SVM())) cv.run(X=X, y=y) print cv.reduce() # Model selection using CV # ------------------------ # CVBestSearchRefit # Methods (Splitter) # / \ # SVM(C=1) SVM(C=10) Classifier (Estimator) from epac import Pipe, CVBestSearchRefit, Methods # CV + Grid search of a simple classifier wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10))) wf.run(X=X, y=y) print wf.reduce() # Feature selection combined with SVM and LDA # CVBestSearchRefit # Methods (Splitter) # / \ # KBest(1) KBest(5) SelectKBest (Estimator) # | # Methods (Splitter) # / \ # LDA() SVM() ... Classifiers (Estimator) pipelines = Methods(*[Pipe(SelectKBest(k=k), Methods(LDA(), SVM())) for k in [1, 5]]) print [n for n in pipelines.walk_leaves()] best_cv = CVBestSearchRefit(pipelines) best_cv.run(X=X, y=y) best_cv.reduce()