def func_memm_local():
    """Run a 3-fold CV of a linear and an RBF SVC on converted data.

    Builds a 500 x 5000 synthetic classification dataset, passes ``X`` and
    ``y`` through ``convert2memmap``, runs the EPAC workflow with ``run`` and
    prints the reduced results. A ``"memm_local ptN"`` marker is printed
    before/after each stage so progress can be followed on stdout.
    """
    # print(...) with one argument gives the same output on Python 2 and 3;
    # the print statement is a SyntaxError on Python 3.
    print("memm_local pt1")
    ## 1) Build a dataset and convert to np.memmap (for big matrix)
    ## ============================================================
    X, y = datasets.make_classification(n_samples=500,
                                        n_features=5000,
                                        n_informative=2,
                                        random_state=1)
    print("memm_local pt2")
    X = convert2memmap(X)
    y = convert2memmap(y)
    Xy = dict(X=X, y=y)
    ## 2) Build the workflow: CV over two SVC variants
    ## =======================================================
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    cv_svm_local = CV(Methods(*[SVC(kernel="linear"),
                                SVC(kernel="rbf")]),
                      n_folds=3)
    print("memm_local pt4")
    # Alternative kept for reference (run through LocalEngine instead):
    #   from epac import LocalEngine
    #   local_engine = LocalEngine(cv_svm_local, num_processes=2)
    #   cv_svm = local_engine.run(**Xy)
    cv_svm_local.run(**Xy)
    print(cv_svm_local.reduce())
    print("memm_local pt5")
def test_cv(self):
    """Check that an EPAC ``CV`` workflow matches a hand-written sklearn CV.

    The same linear ``SVC`` is evaluated twice on a small synthetic dataset:
    once through ``CV(...).top_down`` and once through an explicit
    ``StratifiedKFold`` loop. Test-set predictions are compared fold by fold,
    then the reduced EPAC result is compared with all sklearn predictions.
    """
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_folds = 2

    # = With EPAC
    wf = CV(SVC(kernel="linear"), n_folds=n_folds,
            reducer=ClassificationReport(keep=True))
    r_epac = wf.top_down(X=X, y=y)

    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
        X_train = X[idx_train, :]
        X_test = X[idx_test, :]
        # make_classification returns a 1-D label vector, so it takes a
        # single index; ``y[idx_train, :]`` raises IndexError.
        y_train = y[idx_train]
        clf.fit(X_train, y_train)
        r_sklearn.append(clf.predict(X_test))

    # = Comparison
    key2cmp = "y" + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    for icv in range(n_folds):
        # Compare fold ``icv`` on both sides (previously fold 0 was checked
        # on every iteration, leaving the other folds untested).
        comp = np.all(np.asarray(r_epac[icv][key2cmp]) ==
                      np.asarray(r_sklearn[icv]))
        self.assertTrue(comp, u"Diff CV: EPAC vs sklearn")

    # test reduce
    # list(...) keeps the first-value lookup valid on Python 3, where
    # dict.values() is a non-indexable view.
    r_epac_reduce = list(wf.reduce().values())[0][key2cmp]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u"Diff CV: EPAC reduce")
def func_memm_local():
    """Cross-validate a linear and an RBF SVC on converted synthetic data.

    Prints a ``"memm_local ptN"`` marker around each stage, and prints the
    reduced workflow results before the final marker.
    """
    print("memm_local pt1")
    # Stage 1: synthetic dataset, then convert both arrays
    # (np.memmap conversion, intended for big matrices)
    features, labels = datasets.make_classification(
        n_samples=500,
        n_features=5000,
        n_informative=2,
        random_state=1,
    )
    print("memm_local pt2")
    features = convert2memmap(features)
    labels = convert2memmap(labels)
    data = {"X": features, "y": labels}

    # Stage 2: assemble the workflow
    print("memm_local pt3")
    from sklearn.svm import SVC
    from epac import CV, Methods
    classifiers = (SVC(kernel=kernel_name)
                   for kernel_name in ("linear", "rbf"))
    workflow = CV(Methods(*classifiers), n_folds=3)
    print("memm_local pt4")

    # Alternative kept for reference (run through LocalEngine instead):
    #   from epac import LocalEngine
    #   local_engine = LocalEngine(cv_svm_local, num_processes=2)
    #   cv_svm = local_engine.run(**Xy)
    workflow.run(**data)
    print(workflow.reduce())
    print("memm_local pt5")
def test_mem():
    """Run a 10-fold CV of a linear and an RBF SVC on a 2000 x 10000 dataset.

    Exercises the workflow on a large in-memory matrix and prints the
    reduced results.
    """
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10000,
                                        n_informative=2,
                                        random_state=1)
    # Serialized size of X noted by the original author:
    #   pickle.dump(X, f)        =>> 474 MB
    #   np.savez(..., dict(X=X)) ===> 160 MB
    cv_svm = CV(Methods(*[SVC(kernel="linear"),
                          SVC(kernel="rbf")]),
                n_folds=10)
    cv_svm.run(X=X, y=y)  # Top-down process: computing recognition rates, etc.
    # Alternative kept for reference (run through LocalEngine instead):
    #   local_engine = LocalEngine(cv_svm, num_processes=2)
    #   cv_svm = local_engine.run(X=X, y=y)
    # print(...) with one argument gives the same output on Python 2 and 3.
    print(cv_svm.reduce())  # Bottom-up process: computing p-values, etc.
def test_cv(self):
    """Check that an EPAC ``CV`` workflow matches a hand-written sklearn CV.

    The same linear ``SVC`` is evaluated twice on a small synthetic dataset:
    once through ``CV(...).top_down`` and once through an explicit
    ``StratifiedKFold`` loop. Test-set predictions are compared fold by fold,
    then the reduced EPAC result is compared with all sklearn predictions.
    """
    X, y = datasets.make_classification(n_samples=20, n_features=5,
                                        n_informative=2)
    n_folds = 2

    # = With EPAC
    wf = CV(SVC(kernel="linear"), n_folds=n_folds,
            reducer=ClassificationReport(keep=True))
    r_epac = wf.top_down(X=X, y=y)

    # = With SKLEARN
    clf = SVC(kernel="linear")
    r_sklearn = list()
    for idx_train, idx_test in StratifiedKFold(y=y, n_folds=n_folds):
        X_train = X[idx_train, :]
        X_test = X[idx_test, :]
        # make_classification returns a 1-D label vector, so it takes a
        # single index; ``y[idx_train, :]`` raises IndexError.
        y_train = y[idx_train]
        clf.fit(X_train, y_train)
        r_sklearn.append(clf.predict(X_test))

    # = Comparison
    key2cmp = 'y' + conf.SEP + conf.TEST + conf.SEP + conf.PREDICTION
    for icv in range(n_folds):
        # Compare fold ``icv`` on both sides (previously fold 0 was checked
        # on every iteration, leaving the other folds untested).
        comp = np.all(np.asarray(r_epac[icv][key2cmp]) ==
                      np.asarray(r_sklearn[icv]))
        self.assertTrue(comp, u'Diff CV: EPAC vs sklearn')

    # test reduce
    # list(...) keeps the first-value lookup valid on Python 3, where
    # dict.values() is a non-indexable view.
    r_epac_reduce = list(wf.reduce().values())[0][key2cmp]
    comp = np.all(np.asarray(r_epac_reduce) == np.asarray(r_sklearn))
    self.assertTrue(comp, u'Diff CV: EPAC reduce')
def test_peristence_load_and_fit_predict(self):
    """Check that a workflow tree gives equal results after save/load.

    Three result sets are compared key by key:

    * the in-memory tree, run and reduced;
    * the tree reloaded from the store (saved before running), then run
      and reduced;
    * the tree saved again after running, reloaded and reduced without
      being re-run.
    """
    import shutil
    import tempfile

    X, y = datasets.make_classification(n_samples=20, n_features=10,
                                        n_informative=2)
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[
        Pipe(SelectKBest(k=k),
             Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
        for k in k_values])
    pipeline = CVBestSearchRefit(pipelines, n_folds=n_folds_nested)
    tree_mem = CV(pipeline, n_folds=n_folds,
                  reducer=ClassificationReport(keep=False))

    # mkdtemp() leaves deletion to the caller; remove the directory once
    # every load/reduce is done so test runs do not accumulate temp dirs.
    store_dir = tempfile.mkdtemp()
    try:
        # Save Tree
        store = StoreFs(dirpath=store_dir, clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        # list(...) keeps the first-value lookup valid on Python 3, where
        # dict.values() is a non-indexable view.
        res_mem = list(tree_mem.reduce().values())[0]

        # Reload Tree
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]

        # Save with results
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]

        # Compare
        comp = np.all([
            np.all(np.asarray(res_mem[k]) ==
                   np.asarray(res_fs_noresults[k])) and
            np.all(np.asarray(res_fs_noresults[k]) ==
                   np.asarray(res_fs_withresults[k]))
            for k in res_mem])
    finally:
        shutil.rmtree(store_dir, ignore_errors=True)
    self.assertTrue(comp)
def test_peristence_load_and_fit_predict(self):
    """Check that a ``CVBestSearchRefitParallel`` tree survives save/load.

    Three result sets are compared key by key:

    * the in-memory tree, run and reduced;
    * the tree reloaded from the store (saved before running), then run
      and reduced;
    * the tree saved again after running, reloaded and reduced without
      being re-run.
    """
    import shutil
    import tempfile

    X, y = datasets.make_classification(n_samples=20,
                                        n_features=10,
                                        n_informative=2)
    n_folds = 2
    n_folds_nested = 3
    k_values = [1, 2]
    C_values = [1, 2]
    pipelines = Methods(*[
        Pipe(SelectKBest(k=k),
             Methods(*[SVC(kernel="linear", C=C) for C in C_values]))
        for k in k_values
    ])
    pipeline = CVBestSearchRefitParallel(pipelines, n_folds=n_folds_nested)
    tree_mem = CV(pipeline, n_folds=n_folds,
                  reducer=ClassificationReport(keep=False))

    # mkdtemp() leaves deletion to the caller; remove the directory once
    # every load/reduce is done so test runs do not accumulate temp dirs.
    store_dir = tempfile.mkdtemp()
    try:
        # Save Tree
        store = StoreFs(dirpath=store_dir, clear=True)
        tree_mem.save_tree(store=store)
        tree_mem.run(X=X, y=y)
        # list(...) keeps the first-value lookup valid on Python 3, where
        # dict.values() is a non-indexable view.
        res_mem = list(tree_mem.reduce().values())[0]

        # Reload Tree
        tree_fs_noresults = store.load()
        tree_fs_noresults.run(X=X, y=y)
        res_fs_noresults = list(tree_fs_noresults.reduce().values())[0]

        # Save with results
        tree_fs_noresults.save_tree(store=store)
        tree_fs_withresults = store.load()
        res_fs_withresults = list(tree_fs_withresults.reduce().values())[0]

        # Compare
        comp = np.all([
            np.all(np.asarray(res_mem[k]) ==
                   np.asarray(res_fs_noresults[k])) and
            np.all(np.asarray(res_fs_noresults[k]) ==
                   np.asarray(res_fs_withresults[k]))
            for k in res_mem
        ])
    finally:
        shutil.rmtree(store_dir, ignore_errors=True)
    self.assertTrue(comp)
# in the result with key "y/true"
class MySVM:
    """User-defined estimator: an sklearn SVC exposed through fit/predict."""

    def __init__(self, C=1.0):
        self.C = C

    def fit(self, X, y):
        """Create a fresh SVC with the stored ``C`` and fit it on (X, y)."""
        from sklearn.svm import SVC
        self.svc = SVC(C=self.C)
        self.svc.fit(X, y)

    def predict(self, X):
        """Return the fitted SVC's predictions for X."""
        fitted = self.svc
        return fitted.predict(X)


svms = Methods(*(MySVM(C=penalty) for penalty in (1.0, 2.0)))
cv = CV(svms, cv_key="y", cv_type="stratified", n_folds=2, reducer=None)
cv.run(X=X, y=y)  # top-down pass
cv.reduce()  # bottom-up pass

from sklearn.decomposition import PCA


class MyPCA(PCA):
    """PCA extended with a ``predict`` method."""

    def predict(self, X):
        """Project X onto the principal components, then map it back.

        If X is not singular, ``self.fit(X).predict(X) == X``.
        """
        scores = self.transform(X)
        return np.dot(scores, self.components_) + self.mean_


pcas = Methods(*(MyPCA(n_components=n_comp) for n_comp in (1, 2)))
cv = CV(pcas, n_folds=2, reducer=None)
cv.run(X=X, y=y)  # top-down pass
cv.reduce()  # bottom-up pass
# Convert the inputs to numpy arrays before feeding the workflows.
X = np.asarray(Xd)
y = np.asarray(yd)

# Grid of C values, one SVM per value.
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======
svms = Methods(*[
    SVM(dual=False, class_weight='auto', penalty="l1", C=C)
    for C in C_values
])
cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# print(cv_results)
epac.export_csv(
    cv, cv_results,
    os.path.join(WD, "results", "cv10_caarms+pas+canabis_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================
svms_cv = CVBestSearchRefit(svms, n_folds=10, cv_type="stratified")
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# print(...) with one argument gives the same output on Python 2 and 3;
# the print statement is a SyntaxError on Python 3.
print(cv_results)
import tempfile

from epac import ClassificationReport, PvalPerms, StoreFs, CVBestSearchRefit
from epac.sklearn_plugins import Permutations
from epac.configuration import conf

# Small synthetic classification problem.
X, y = datasets.make_classification(
    n_samples=20,
    n_features=10,
    n_informative=2,
)

# Outer / nested fold counts and the two parameter grids.
n_folds = 2
n_folds_nested = 3
k_values = [1, 2]
C_values = [1, 2]

# One SelectKBest(k) -> {linear SVC per C} pipe for every k.
pipelines = Methods(*[
    Pipe(
        SelectKBest(k=k),
        Methods(*[SVC(kernel="linear", C=C) for C in C_values]),
    )
    for k in k_values
])
pipeline = CVBestSearchRefit(pipelines, n_folds=n_folds_nested)
tree_mem = CV(
    pipeline,
    n_folds=n_folds,
    reducer=ClassificationReport(keep=False),
)

# Save the tree to a file-system store in a new temp directory,
# then run and reduce the in-memory tree.
store = StoreFs(dirpath=tempfile.mkdtemp(), clear=True)
tree_mem.save_tree(store=store)
tree_mem.run(X=X, y=y)
tree_mem.reduce()
#k_values = [1, 2, 3, 4, 5, 10, 15, 20, 25, 27]
# Grid of C values, one SVM per value.
C_values = [0.01, 0.05, .1, .5, 1, 5, 10]

# SVM L1
# ======
svms = Methods(*[SVM(dual=False, class_weight='auto', penalty="l1", C=C)
                 for C in C_values])
#
# Disabled alternative (ANOVA feature selection + l1/l2 SVM grid):
#anova_svms = Methods(*[Pipe(SelectKBest(k=k), #preprocessing.StandardScaler(),
#    Methods(*[SVM(C=C, penalty=penalty, class_weight='auto', dual=False) for C in C_values for penalty in ['l1', 'l2']])) for k in k_values])

cv = CV(svms, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# print(cv_results)
epac.export_csv(cv, cv_results,
                os.path.join(WD, "results", "cv10_svmsl1.csv"))

# SVM L1 with CVBestSearchRefit
# =============================
svms_cv = CVBestSearchRefit(svms, n_folds=10)
cv = CV(svms_cv, cv_type="stratified", n_folds=10)
cv.run(X=X, y=y)
cv_results = cv.reduce()
# print(...) with one argument gives the same output on Python 2 and 3;
# the print statement is a SyntaxError on Python 3.
print(cv_results)
# Output recorded by the original author:
#[{'key': CVBestSearchRefit, 'y/test/score_f1': [ 0.84848485 0.76190476], 'y/test/recall_pvalues': [ 0.01086887 0.03000108], 'y/test/score_precision': [ 0.82352941 0.8 ], 'y/test/recall_mean_pvalue': 0.00592461228371, 'y/test/score_recall': [ 0.875 0.72727273], 'y/test/score_accuracy': 0.814814814815, 'y/test/score_recall_mean': 0.801136363636}])
#
# Of the 27, 11 made the transition and 16 did not.
print(anovas_svm.reduce())

# Cross-validation
# ----------------
# CV of LDA
#          CV                 (Splitter)
#      /   |   \
#     0    1    2             Folds (Slicer)
#     |    |
#   Methods                   (Splitter)
#    /   \
#  LDA  SVM                   Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(*(LDA(), SVM())))
cv.run(X=X, y=y)
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods                (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)         Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# Grid search over C for a single classifier, selected by CV
wf = CVBestSearchRefit(Methods(*(SVM(C=c_value) for c_value in (1, 10))))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
# CVBestSearchRefit
# Cross-validation
# ----------------
# CV of LDA
#          CV                 (Splitter)
#      /   |   \
#     0    1    2             Folds (Slicer)
#     |    |
#   Methods                   (Splitter)
#    /   \
#  LDA  SVM                   Classifier (Estimator)
from epac import CV, Methods
cv = CV(Methods(LDA(), SVM()))
cv.run(X=X, y=y)
# print(...) with one argument gives the same output on Python 2 and 3;
# the print statement is a SyntaxError on Python 3.
print(cv.reduce())

# Model selection using CV
# ------------------------
# CVBestSearchRefit
#      Methods                (Splitter)
#      /    \
# SVM(C=1)  SVM(C=10)         Classifier (Estimator)
from epac import Pipe, CVBestSearchRefit, Methods
# CV + Grid search of a simple classifier
wf = CVBestSearchRefit(Methods(SVM(C=1), SVM(C=10)))
wf.run(X=X, y=y)
print(wf.reduce())

# Feature selection combined with SVM and LDA
from sklearn import preprocessing

##############################################################################
## Pipeline, "Pipe": SelectKBest + StandardScaler + SVM l1 vs l2
from epac import Pipe, CV
n_folds = 10
anova_svm = Pipe(SelectKBest(k=5),
                 preprocessing.StandardScaler(),
                 SVM(class_weight='auto'))
cv = CV(anova_svm, n_folds=n_folds)
cv.run(X=X, y=y)
#
res_cv_anova_svm = cv.reduce()
# Bare expression: it only displays a value in an interactive session; in a
# script it just performs the two key lookups.
res_cv_anova_svm["SelectKBest/StandardScaler/LinearSVC"]['y/test/score_recall']

##############################################################################
## Multimethods, "Methods": SVM l1 vs l2
from epac import Methods, CV
svms = Methods(SVM(penalty="l1", class_weight='auto', dual=False),
               SVM(penalty="l2", class_weight='auto', dual=False))
cv = CV(svms, n_folds=n_folds)
cv.run(X=X, y=y)
res_cv_svms = cv.reduce()
# print(res_cv_svms)
# print(...) with one argument gives the same output on Python 2 and 3;
# the print statement is a SyntaxError on Python 3.
print(res_cv_svms["LinearSVC(penalty=l1)"]['y/test/score_recall'])
print(res_cv_svms["LinearSVC(penalty=l2)"]['y/test/score_recall'])