def testCaseA(self):
    pca_n_components = [2, 5]
    svc_c = [.1, 1, 5]
    # svc_kernel = ['rbf']
    svc_kernel = ['rbf', 'linear']

    # SET UP HYPERPIPE
    my_pipe = Hyperpipe('primary_pipe', optimizer='grid_search', optimizer_params={},
                        metrics=['accuracy', 'precision', 'f1_score'],
                        inner_cv=KFold(n_splits=2, random_state=3),
                        eval_final_performance=False)

    my_pipe += PipelineElement.create('standard_scaler')
    my_pipe += PipelineElement.create('pca', {'n_components': pca_n_components})
    my_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})

    # START HYPERPARAMETER SEARCH
    my_pipe.fit(self.__X, self.__y)
    print(my_pipe._test_performances)

    pipe_results = {'train': [], 'test': []}
    for i in range(len(my_pipe._performance_history_list)):
        pipe_results['train'].extend(
            my_pipe._performance_history_list[i]['accuracy_folds']['train'])
        pipe_results['test'].extend(
            my_pipe._performance_history_list[i]['accuracy_folds']['test'])

    print('\n\n')
    print('Running sklearn version...')
    # cv_outer = KFold(n_splits=2, random_state=3)
    cv_inner_1 = KFold(n_splits=2, random_state=3)

    sk_results = {'train': [], 'test': []}

    for n_comp in pca_n_components:
        for c in svc_c:
            for current_kernel in svc_kernel:
                tr_acc = []
                val_acc = []

                for train_2, val_1 in cv_inner_1.split(self.__X):
                    data_train_2 = self.__X[train_2]
                    print(data_train_2.shape)
                    data_val_1 = self.__X[val_1]
                    y_train_2 = self.__y[train_2]
                    y_val_1 = self.__y[val_1]

                    my_scaler = StandardScaler()
                    my_scaler.fit(data_train_2)
                    data_train_2 = my_scaler.transform(data_train_2)
                    data_val_1 = my_scaler.transform(data_val_1)

                    # Run PCA
                    my_pca = PCA(n_components=n_comp)
                    my_pca.fit(data_train_2)
                    data_tr_2_pca = my_pca.transform(data_train_2)
                    data_val_1_pca = my_pca.transform(data_val_1)

                    # Run SVC
                    my_svc = SVC(kernel=current_kernel, C=c)
                    my_svc.fit(data_tr_2_pca, y_train_2)

                    tr_acc.append(my_svc.score(data_tr_2_pca, y_train_2))
                    val_acc.append(my_svc.score(data_val_1_pca, y_val_1))
                    print('n_components: ', n_comp, 'kernel:', current_kernel, 'c:', c)
                    print('Training 2:', tr_acc[-1], 'validation 1:', val_acc[-1])

                sk_results['train'].extend(tr_acc)
                sk_results['test'].extend(val_acc)

    print('\nCompare results of last iteration (outer cv)...')
    print('SkL Train:', sk_results['train'])
    print('Pipe Train:', pipe_results['train'])
    print('SkL test: ', sk_results['test'])
    print('Pipe test: ', pipe_results['test'])
    self.assertEqual(sk_results['test'], pipe_results['test'])
    self.assertEqual(sk_results['train'], pipe_results['train'])
class CVTestsLocalSearchTrue(unittest.TestCase):

    def setUp(self):
        # set up inner pipeline
        self.inner_hyperpipe = Hyperpipe('inner_pipe', KFold(n_splits=2),
                                         local_search=True)
        self.inner_pipeline_test_element = PipelineElement.create('test_wrapper')
        self.inner_hyperpipe += self.inner_pipeline_test_element
        self.pipeline_fusion = PipelineStacking('fusion_element',
                                                [self.inner_hyperpipe])

        # set up outer pipeline
        self.outer_hyperpipe = Hyperpipe('outer_pipe', KFold(n_splits=2))
        self.outer_pipeline_test_element = PipelineElement.create('test_wrapper')
        self.outer_hyperpipe += self.outer_pipeline_test_element
        self.outer_hyperpipe += self.pipeline_fusion

        self.X = np.arange(1, 101)
        self.y = np.ones((100,))

    def test_default_split_fit(self):
        """
        Test the default splitting mode (80% validation, 20% testing):
        make sure that DURING the optimization the optimum pipeline is
        fitted with the correct amount of data.
        """
        self.outer_hyperpipe.debug_cv_mode = True
        self.inner_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2)
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2)
        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        print('local_search true: outer pipeline data:')
        print(sorted(outer_data))
        print('local_search true: inner pipeline data:')
        print(sorted(inner_data))

        # we expect that all items from inner_data exist in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # test that it is only 50% of 80% of the original X (n=100) and that
        # there is a test_x of 20% size
        self.assertEqual(len(outer_data), 40)
        # test that inner data is 50% of 80% of outer
        self.assertEqual(len(inner_data), 16)
        # we also expect that inner_data is 50% of 80% of the length of outer_data
        self.assertEqual(len(inner_data), 0.5 * 0.8 * len(outer_data))

    def test_default_split_predict(self):
        """
        Test the default splitting mode (80% validation, 20% testing):
        make sure that AFTER the optimization the optimum pipeline is fitted
        with the correct amount of data, i.e. that the optimum pipe is fitted
        to the validation data and tested with the test data.
        """
        self.outer_hyperpipe.debug_cv_mode = False
        self.inner_hyperpipe.debug_cv_mode = False
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2, train_size=0.8)
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = ShuffleSplit(
            n_splits=1, test_size=0.2, train_size=0.8)
        self.outer_hyperpipe.fit(self.X, self.y)

        print('local_search true: outer pipeline data:')
        print(self.outer_pipeline_test_element.base_element.data_dict['fit_X'])
        print('local_search true: inner pipeline data:')
        print(self.inner_pipeline_test_element.base_element.data_dict['fit_X'])

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()

        # we expect that all items from inner_data exist in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # test that it is only 80% of the original X (n=100) and that there is
        # a test_x of 20% size
        self.assertEqual(len(outer_data), 80)
        # test that inner data is 80% of 80% of the original
        self.assertEqual(len(inner_data), 64)
        # we also expect that inner_data is 80% of the length of outer_data
        self.assertEqual(len(inner_data), 0.8 * len(outer_data))

    def test_no_split(self):
        """
        Test no-splitting mode: the data is NOT split into test and
        validation set.
        """
        self.outer_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.eval_final_performance = False
        self.inner_hyperpipe.debug_cv_mode = True
        self.inner_hyperpipe.eval_final_performance = False
        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()

        # we expect that all items from inner_data exist in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # test that it is only 50% of the original X (n=100)
        self.assertEqual(len(outer_data), 50)
        # test that inner data is 50% of 50% of outer = 25% of the original
        self.assertEqual(len(inner_data), 25)

    def test_CV_split(self):
        """
        Test CV splitting mode: the entire search for hyperparameters is
        cross-validated.
        """
        self.outer_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.eval_final_performance = False
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = KFold(n_splits=2)
        self.inner_hyperpipe.debug_cv_mode = True
        self.inner_hyperpipe.eval_final_performance = False
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = KFold(n_splits=2)
        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()

        # we expect that all items from inner_data exist in outer_data
        validation = set(inner_data) < set(outer_data)
        self.assertTrue(validation)
        # we use KFold with n_splits=2, so the base set is 100 / 2 = 50 items;
        # test that outer_data is only 50% of that (n=25)
        self.assertEqual(len(outer_data), 25)
        # test that inner data is 25% of outer = 12.5% of the base set
        self.assertTrue(len(inner_data) == 6 or len(inner_data) == 7)
class CVTestsLocalSearchFalse(unittest.TestCase):

    def setUp(self):
        self.outer_hyperpipe = Hyperpipe('outer_pipe', KFold(n_splits=2))

        # set up inner pipeline
        self.inner_hyperpipe = Hyperpipe('inner_pipe', KFold(n_splits=2),
                                         optimizer=self.outer_hyperpipe.optimizer,
                                         local_search=False)
        self.inner_pipeline_test_element = PipelineElement.create('test_wrapper')
        self.inner_hyperpipe += self.inner_pipeline_test_element
        self.pipeline_fusion = PipelineStacking('fusion_element',
                                                [self.inner_hyperpipe])

        # set up outer pipeline
        self.outer_pipeline_test_element = PipelineElement.create('test_wrapper')
        self.outer_hyperpipe += self.outer_pipeline_test_element
        self.outer_hyperpipe += self.pipeline_fusion

        self.X = np.arange(1, 101)
        self.y = np.ones((100,))

        self.inner_hyperpipe.debug_cv_mode = True
        self.outer_hyperpipe.debug_cv_mode = True

    def test_no_split(self):
        self.outer_hyperpipe.eval_final_performance = False
        self.inner_hyperpipe.eval_final_performance = False
        self.outer_hyperpipe.fit(self.X, self.y)

        print('local_search false: outer pipeline data:')
        print(self.outer_pipeline_test_element.base_element.data_dict['fit_X'])
        print('local_search false: inner pipeline data:')
        print(self.inner_pipeline_test_element.base_element.data_dict['fit_X'])

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        self.assertTrue(set(outer_data) == set(inner_data))
        self.assertEqual(len(outer_data), 50)

    def test_default_split(self):
        self.outer_hyperpipe.eval_final_performance = True
        self.inner_hyperpipe.eval_final_performance = True
        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        self.assertTrue(set(outer_data) == set(inner_data))
        self.assertEqual(len(outer_data), 40)

    def test_cv_split(self):
        self.outer_hyperpipe.hyperparameter_fitting_cv_object = KFold(n_splits=2)
        # should be ignored:
        self.inner_hyperpipe.hyperparameter_fitting_cv_object = KFold(n_splits=2)
        self.outer_hyperpipe.fit(self.X, self.y)

        outer_data = self.outer_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        inner_data = self.inner_pipeline_test_element.base_element.data_dict['fit_X'].tolist()
        self.assertTrue(set(outer_data) == set(inner_data))
        self.assertEqual(len(outer_data), 25)
        self.assertEqual(len(outer_data), len(inner_data))
def testCaseA(self):
    pca_n_components = [2, 5]
    svc_c = [.1, 1]
    svc_kernel = ['rbf']
    # svc_kernel = ['rbf', 'linear']

    # SET UP HYPERPIPE
    my_pipe = Hyperpipe('primary_pipe', optimizer='grid_search', optimizer_params={},
                        inner_cv=KFold(n_splits=2, random_state=3),
                        outer_cv=KFold(n_splits=2, random_state=3),
                        verbose=2,
                        eval_final_performance=True)

    my_pipe += PipelineElement.create('standard_scaler')
    my_pipe += PipelineElement.create('pca', {'n_components': pca_n_components})
    my_pipe += PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel})

    # START HYPERPARAMETER SEARCH
    my_pipe.fit(self.__X, self.__y)

    from Framework import LogExtractor
    log_ex = LogExtractor.LogExtractor(my_pipe.result_tree)
    log_ex.extract_csv("test_case_A2.csv")

    print(my_pipe._test_performances)
    # collect the pipeline's fold-wise accuracies so they can be compared
    # with the sklearn reference implementation below
    pipe_results = {'train': [], 'test': []}
    for i in range(len(my_pipe._performance_history_list)):
        pipe_results['train'].extend(
            my_pipe._performance_history_list[i]['accuracy_folds']['train'])
        pipe_results['test'].extend(
            my_pipe._performance_history_list[i]['accuracy_folds']['test'])

    print('\n\n')
    print('Running sklearn version...')
    cv_outer = KFold(n_splits=2, random_state=3)
    cv_inner_1 = KFold(n_splits=2, random_state=3)

    for train_1, test in cv_outer.split(self.__X):
        data_train_1 = self.__X[train_1]
        data_test = self.__X[test]
        y_train_1 = self.__y[train_1]
        y_test = self.__y[test]

        sk_results = {'train': [], 'test': []}

        for n_comp in pca_n_components:
            for current_kernel in svc_kernel:
                for c in svc_c:
                    tr_acc = []
                    val_acc = []

                    for train_2, val_1 in cv_inner_1.split(data_train_1):
                        data_train_2 = data_train_1[train_2]
                        data_val_1 = data_train_1[val_1]
                        y_train_2 = y_train_1[train_2]
                        y_val_1 = y_train_1[val_1]

                        my_scaler = StandardScaler()
                        my_scaler.fit(data_train_2)
                        data_train_2 = my_scaler.transform(data_train_2)
                        data_val_1 = my_scaler.transform(data_val_1)

                        # Run PCA
                        my_pca = PCA(n_components=n_comp)
                        my_pca.fit(data_train_2)
                        data_tr_2_pca = my_pca.transform(data_train_2)
                        data_val_1_pca = my_pca.transform(data_val_1)

                        # Run SVC
                        my_svc = SVC(kernel=current_kernel, C=c)
                        my_svc.fit(data_tr_2_pca, y_train_2)

                        tr_acc.append(my_svc.score(data_tr_2_pca, y_train_2))
                        val_acc.append(my_svc.score(data_val_1_pca, y_val_1))
                        print('n_components: ', n_comp, 'kernel:', current_kernel, 'c:', c)
                        print('Training 2:', tr_acc[-1], 'validation 1:', val_acc[-1])

                    sk_results['train'].extend(tr_acc)
                    sk_results['test'].extend(val_acc)

    print('\nCompare results of last iteration (outer cv)...')
    print('SkL Train:', sk_results['train'])
    print('Pipe Train:', pipe_results['train'])
    print('SkL test: ', sk_results['test'])
    print('Pipe test: ', pipe_results['test'])
    self.assertEqual(sk_results['test'], pipe_results['test'])
    self.assertEqual(sk_results['train'], pipe_results['train'])
def testCaseB(self):
    pca_n_components = [7, 15, 10]
    svc_c = [.1, 1]
    # svc_kernel = ['rbf']
    svc_kernel = ['rbf', 'linear']

    cv_outer = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
    cv_inner_1 = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)
    cv_inner_2 = ShuffleSplit(n_splits=1, test_size=0.2, random_state=3)

    # SET UP HYPERPIPE
    outer_pipe = Hyperpipe('outer_pipe', optimizer='grid_search',
                           metrics=['accuracy'],
                           inner_cv=cv_inner_1,
                           outer_cv=cv_outer,
                           eval_final_performance=True)

    inner_pipe = Hyperpipe('pca_pipe', optimizer='grid_search',
                           inner_cv=cv_inner_2,
                           eval_final_performance=False)
    inner_pipe.add(PipelineElement.create('standard_scaler'))
    inner_pipe.add(PipelineElement.create('ae_pca', {'n_components': pca_n_components}))

    pipeline_fusion = PipelineStacking('fusion_element', [inner_pipe])

    outer_pipe.add(pipeline_fusion)
    outer_pipe.add(PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel}))

    # START HYPERPARAMETER SEARCH
    outer_pipe.fit(self.__X, self.__y)

    print(outer_pipe._test_performances)
    pipe_results = {'train': [], 'test': []}
    for i in range(len(outer_pipe._performance_history_list)):
        pipe_results['train'].extend(
            outer_pipe._performance_history_list[i]['accuracy_folds']['train'])
        pipe_results['test'].extend(
            outer_pipe._performance_history_list[i]['accuracy_folds']['test'])

    print(outer_pipe._test_performances['accuracy'])
    print('\n\n')
    print('Running sklearn version...\n')

    opt_tr_acc = []
    opt_test_acc = []

    for train_1, test in cv_outer.split(self.__X):
        data_train_1 = self.__X[train_1]
        data_test = self.__X[test]
        y_train_1 = self.__y[train_1]
        y_test = self.__y[test]

        config_inner_1 = {'C': [], 'kernel': []}
        sk_results_inner1 = {'train_2': [], 'val_1': [],
                             'train_2_mean': [], 'val_1_mean': []}

        print('Outer Split')
        print('n train_1:', data_train_1.shape[0], '\n')

        for c in svc_c:
            for current_kernel in svc_kernel:
                config_inner_1['C'].extend([c])
                config_inner_1['kernel'].extend([current_kernel])

                print('C:', c, 'Kernel:', current_kernel, '\n')

                svc_score_tr = []
                svc_score_te = []
                fold_cnt = 1

                for train_2, val_1 in cv_inner_1.split(data_train_1):
                    print('\n\nSklearn Outer Pipe FoldMetrics', fold_cnt)

                    data_train_2 = data_train_1[train_2]
                    data_val_1 = data_train_1[val_1]
                    y_train_2 = y_train_1[train_2]
                    y_val_1 = y_train_1[val_1]
                    print('n train_2:', data_train_2.shape[0], '\n')

                    config_inner_2 = {'n_comp': []}

                    print('Sklearn PCA Pipe')
                    sk_results_inner2 = {'train_3': [], 'val_2': [],
                                         'train_3_mean': [], 'val_2_mean': []}

                    for n_comp in pca_n_components:
                        config_inner_2['n_comp'].extend([n_comp])

                        tr_acc = []
                        val_acc = []

                        # print('Some training data:', data_train_2[0:2, 0:2])
                        for train_3, val_2 in cv_inner_2.split(data_train_2):
                            data_train_3 = data_train_2[train_3]
                            data_val_2 = data_train_2[val_2]

                            my_scaler = StandardScaler()
                            my_scaler.fit(data_train_3)
                            data_train_3 = my_scaler.transform(data_train_3)
                            data_val_2 = my_scaler.transform(data_val_2)

                            # Run PCA
                            my_pca = PCA_AE_Wrapper(n_components=n_comp)
                            my_pca.fit(data_train_3)
                            mae_tr = my_pca.score(data_train_3)
                            mae_te = my_pca.score(data_val_2)

                            tr_acc.append(mae_tr)
                            val_acc.append(mae_te)

                        sk_results_inner2['train_3'].extend(tr_acc)
                        sk_results_inner2['val_2'].extend(val_acc)
                        sk_results_inner2['train_3_mean'].extend([np.mean(tr_acc)])
                        sk_results_inner2['val_2_mean'].extend([np.mean(val_acc)])

                        print('n_comp:', n_comp)
                        print('n train_3 fold 1:', data_train_3.shape[0])
                        print('Training 3 mean:', [np.mean(tr_acc)],
                              'validation 2 mean:', [np.mean(val_acc)])

                    # find best config for val 2
                    best_config_id = np.argmin(sk_results_inner2['val_2_mean'])
                    print('Best PCA config:', config_inner_2['n_comp'][best_config_id], '\n')

                    # fit optimum pipe
                    my_scaler = StandardScaler()
                    my_scaler.fit(data_train_2)
                    data_train_2 = my_scaler.transform(data_train_2)
                    data_val_1 = my_scaler.transform(data_val_1)

                    # Run PCA
                    my_pca = PCA_AE_Wrapper(
                        n_components=config_inner_2['n_comp'][best_config_id])
                    my_pca.fit(data_train_2)
                    data_tr_2_pca = my_pca.transform(data_train_2)
                    data_val_1_pca = my_pca.transform(data_val_1)

                    # Run SVC
                    my_svc = SVC(kernel=current_kernel, C=c)
                    my_svc.fit(data_tr_2_pca, y_train_2)

                    svc_score_tr.append(my_svc.score(data_tr_2_pca, y_train_2))
                    svc_score_te.append(my_svc.score(data_val_1_pca, y_val_1))

                    print('Fit Optimum PCA Config and train with SVC')
                    print('n train 2:', data_train_2.shape[0])
                    print('n_comp:', config_inner_2['n_comp'][best_config_id])
                    print('SVC Train:', svc_score_tr[-1])
                    print('SVC test:', svc_score_te[-1], '\n\n')

                    sk_results_inner1['train_2'].append(svc_score_tr[-1])
                    sk_results_inner1['val_1'].append(svc_score_te[-1])

                    fold_cnt += 1

                sk_results_inner1['train_2_mean'].append(np.mean(svc_score_tr))
                sk_results_inner1['val_1_mean'].append(np.mean(svc_score_te))

        print('\nNow find best config for SVC...')
        best_config_id_inner_1 = np.argmax(sk_results_inner1['val_1_mean'])

        print('Some test data:')
        print(data_test.shape)
        print(data_test[0:2, 0:2])

        # fit optimum pipe
        my_scaler = StandardScaler()
        my_scaler.fit(data_train_1)
        data_train_1 = my_scaler.transform(data_train_1)
        data_test = my_scaler.transform(data_test)

        # Run PCA
        my_pca = PCA_AE_Wrapper(n_components=config_inner_2['n_comp'][best_config_id])
        my_pca.fit(data_train_1)
        data_tr_1_pca = my_pca.transform(data_train_1)
        data_test_pca = my_pca.transform(data_test)

        # Run SVC
        my_svc = SVC(kernel=config_inner_1['kernel'][best_config_id_inner_1],
                     C=config_inner_1['C'][best_config_id_inner_1])

        print('Best overall config:...')
        print('C = ', config_inner_1['C'][best_config_id_inner_1])
        print('kernel=', config_inner_1['kernel'][best_config_id_inner_1])
        print('pca_n_comp=', config_inner_2['n_comp'][best_config_id])
        print('n train 1:', data_train_1.shape[0])

        my_svc.fit(data_tr_1_pca, y_train_1)

        opt_tr_acc.append(my_svc.score(data_tr_1_pca, y_train_1))
        opt_test_acc.append(my_svc.score(data_test_pca, y_test))
        print('Train Acc:', opt_tr_acc[-1])
        print('test Acc:', opt_test_acc[-1])

    print('\nCompare results of last iteration (outer cv)...')
    print('SkL Train:', sk_results_inner1['train_2'])
    print('Pipe Train:', pipe_results['train'])
    print('SkL test: ', sk_results_inner1['val_1'])
    print('Pipe test: ', pipe_results['test'])

    print('\nEval final performance:')
    print('Pipe final perf:', outer_pipe._test_performances['accuracy'])
    print('Sklearn final perf:', opt_test_acc)

    self.assertEqual(sk_results_inner1['train_2'], pipe_results['train'])
    self.assertEqual(sk_results_inner1['val_1'], pipe_results['test'])
    self.assertEqual(opt_test_acc, outer_pipe._test_performances['accuracy'])
def testCaseC2(self):
    pca_n_components = [5, 10]
    svc_c = [0.1]
    svc_c_2 = [1]
    # svc_kernel = ['rbf']
    svc_kernel = ['linear']

    # SET UP HYPERPIPE
    outer_pipe = Hyperpipe('outer_pipe', optimizer='grid_search',
                           metrics=['accuracy'],
                           inner_cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=3),
                           outer_cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=3),
                           eval_final_performance=True)

    # Create pipe for the first data source
    pipe_source_1 = Hyperpipe('source_1', optimizer='grid_search',
                              inner_cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=3),
                              eval_final_performance=False)
    pipe_source_1.add(PipelineElement.create('SourceSplitter',
                                             {'column_indices': [np.arange(0, 10)]}))
    pipe_source_1.add(PipelineElement.create('pca', {'n_components': pca_n_components}))
    pipe_source_1.add(PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel}))

    # Create pipe for the second data source
    pipe_source_2 = Hyperpipe('source_2', optimizer='grid_search',
                              inner_cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=3),
                              eval_final_performance=False)
    pipe_source_2.add(PipelineElement.create('SourceSplitter',
                                             {'column_indices': [np.arange(10, 20)]}))
    pipe_source_2.add(PipelineElement.create('pca', {'n_components': pca_n_components}))
    pipe_source_2.add(PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel}))

    # Create pipe for the third data source
    pipe_source_3 = Hyperpipe('source_3', optimizer='grid_search',
                              inner_cv=ShuffleSplit(n_splits=1, test_size=0.2, random_state=3),
                              eval_final_performance=False)
    pipe_source_3.add(PipelineElement.create('SourceSplitter',
                                             {'column_indices': [np.arange(20, 30)]}))
    pipe_source_3.add(PipelineElement.create('pca', {'n_components': pca_n_components}))
    pipe_source_3.add(PipelineElement.create('svc', {'C': svc_c, 'kernel': svc_kernel}))

    # pipeline_fusion = PipelineStacking('multiple_source_pipes',
    #                                    [pipe_source_1, pipe_source_2, pipe_source_3],
    #                                    voting=False)
    pipeline_fusion = PipelineStacking('multiple_source_pipes',
                                       [pipe_source_1, pipe_source_2, pipe_source_3])

    outer_pipe.add(pipeline_fusion)
    # outer_pipe.add(PipelineElement.create('svc', {'C': svc_c_2, 'kernel': svc_kernel}))
    # outer_pipe.add(PipelineElement.create('knn', {'n_neighbors': [15]}))
    outer_pipe.add(PipelineElement.create('kdnn', {'target_dimension': [2], 'nb_epoch': [10]}))

    # START HYPERPARAMETER SEARCH
    outer_pipe.fit(self.__X, self.__y)

    print(outer_pipe._test_performances)
    pipe_results = {'train': [], 'test': []}
    for i in range(int(len(outer_pipe._performance_history_list) / 2)):
        pipe_results['train'].extend(
            outer_pipe._performance_history_list[i]['accuracy_folds']['train'])
        pipe_results['test'].extend(
            outer_pipe._performance_history_list[i]['accuracy_folds']['test'])

    print(outer_pipe._test_performances['accuracy'])
class HyperpipeTests(unittest.TestCase):

    def setUp(self):
        self.pca_pipe_element = PipelineElement.create('pca', {'n_components': [1, 2]},
                                                       test_disabled=True)
        self.svc_pipe_element = PipelineElement.create('svc', {'C': [0.1, 1],
                                                               'kernel': ['rbf', 'sigmoid']})
        self.cv_object = KFold(n_splits=3)
        self.hyperpipe = Hyperpipe('god', self.cv_object)
        self.hyperpipe += self.pca_pipe_element
        self.hyperpipe.add(self.svc_pipe_element)

    def test_init(self):
        self.assertEqual(self.hyperpipe.name, 'god')
        # assure the pipeline has two steps: first the pca, second the svc
        self.assertEqual(len(self.hyperpipe._pipe.steps), 2)
        self.assertIs(self.hyperpipe._pipe.steps[0][1], self.pca_pipe_element)
        self.assertIs(self.hyperpipe._pipe.steps[1][1], self.svc_pipe_element)

    def test_hyperparameters(self):
        # hyperparameters
        self.assertDictEqual(self.hyperpipe.hyperparameters,
                             {'pca': {'n_components': [1, 2], 'test_disabled': True},
                              'svc': {'C': [0.1, 1], 'kernel': ['rbf', 'sigmoid']}})

        # sklearn params
        # Todo: has no sklearn attribute

        # config grid
        # print(self.hyperpipe.config_grid)
        expected_config_grid = [
            {'pca__n_components': 1, 'pca__disabled': False, 'svc__C': 0.1, 'svc__kernel': 'rbf'},
            {'pca__n_components': 1, 'pca__disabled': False, 'svc__C': 0.1, 'svc__kernel': 'sigmoid'},
            {'pca__n_components': 1, 'pca__disabled': False, 'svc__C': 1, 'svc__kernel': 'rbf'},
            {'pca__n_components': 1, 'pca__disabled': False, 'svc__C': 1, 'svc__kernel': 'sigmoid'},
            {'pca__n_components': 2, 'pca__disabled': False, 'svc__C': 0.1, 'svc__kernel': 'rbf'},
            {'pca__n_components': 2, 'pca__disabled': False, 'svc__C': 0.1, 'svc__kernel': 'sigmoid'},
            {'pca__n_components': 2, 'pca__disabled': False, 'svc__C': 1, 'svc__kernel': 'rbf'},
            {'pca__n_components': 2, 'pca__disabled': False, 'svc__C': 1, 'svc__kernel': 'sigmoid'},
            {'pca__disabled': True, 'svc__C': 0.1, 'svc__kernel': 'rbf'},
            {'pca__disabled': True, 'svc__C': 0.1, 'svc__kernel': 'sigmoid'},
            {'pca__disabled': True, 'svc__C': 1, 'svc__kernel': 'rbf'},
            {'pca__disabled': True, 'svc__C': 1, 'svc__kernel': 'sigmoid'}]
        expected_config_grid = [sorted(i) for i in expected_config_grid]
        actual_config_grid = [sorted(i) for i in self.hyperpipe.config_grid]
        self.assertListEqual(actual_config_grid, expected_config_grid)
def testCaseA(self):
    pca_n_components = 10
    svc_c = 1
    svc_kernel = "rbf"

    # SET UP HYPERPIPE
    my_pipe = Hyperpipe('primary_pipe', optimizer='grid_search', optimizer_params={},
                        metrics=['accuracy', 'precision', 'f1_score'],
                        inner_cv=KFold(n_splits=3),
                        outer_cv=KFold(n_splits=3),
                        eval_final_performance=True)

    my_pipe += PipelineElement.create('standard_scaler')
    my_pipe += PipelineElement.create('pca', {'n_components': [pca_n_components]})
    my_pipe += PipelineElement.create('svc', {'C': [svc_c], 'kernel': [svc_kernel]})

    # START HYPERPARAMETER SEARCH
    my_pipe.fit(self.__X, self.__y)
    print(my_pipe._test_performances)

    from Framework import LogExtractor
    log_ex = LogExtractor.LogExtractor(my_pipe.result_tree)
    log_ex.extract_csv("test_case_A.csv")

    # Das muss noch weg! ToDo
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import f1_score, accuracy_score, precision_score

    # Now we are using the native scikit-learn methods
    sk_pipeline = Pipeline([("standard_scaler", StandardScaler()),
                            ("pca", PCA(n_components=pca_n_components)),
                            ("svc", SVC(C=svc_c, kernel=svc_kernel))])

    my_pipe._generate_outer_cv_indices()

    tmp_counter = 0
    for train_idx_arr, test_idx_arr in my_pipe.data_test_cases:
        sk_results = {'accuracy': [], 'precision': [], 'f1_score': [], 'default': []}

        outer_train_X = self.__X[train_idx_arr]
        outer_train_y = self.__y[train_idx_arr]
        outer_test_X = self.__X[test_idx_arr]
        outer_test_y = self.__y[test_idx_arr]

        sk_config_cv = KFold(n_splits=3)
        # Todo: test other configs and select best!
        for sub_train_idx, sub_test_idx in sk_config_cv.split(outer_train_X, outer_train_y):
            inner_train_X = self.__X[sub_train_idx]
            inner_train_y = self.__y[sub_train_idx]
            # test_X = self.__X[sub_test_idx]
            # test_y = self.__y[sub_test_idx]

            # sk_pipeline.fit(inner_train_X, inner_train_y)
            fit_and_predict_score = _fit_and_score(sk_pipeline, outer_train_X, outer_train_y,
                                                   self.score, sub_train_idx, sub_test_idx,
                                                   verbose=0, parameters={}, fit_params={},
                                                   return_train_score=True,
                                                   return_n_test_samples=True,
                                                   return_times=True, return_parameters=True,
                                                   error_score='raise')

            sk_pipeline.fit(outer_train_X, outer_train_y)
            sk_prediction = sk_pipeline.predict(outer_test_X)

            sk_results['default'].append(fit_and_predict_score[1])
            sk_results['accuracy'].append(accuracy_score(outer_test_y, sk_prediction))
            sk_results['precision'].append(precision_score(outer_test_y, sk_prediction))
            sk_results['f1_score'].append(f1_score(outer_test_y, sk_prediction))

        # bestItem = np.argmax(sk_results['default'])
        # print([str(k) + ':' + str(i[bestItem]) for k, i in sk_results.items()])

        self.assertEqual(sk_results['accuracy'],
                         my_pipe._test_performances['accuracy'][tmp_counter])
        self.assertEqual(sk_results['precision'],
                         my_pipe._test_performances['precision'][tmp_counter])
        self.assertEqual(sk_results['f1_score'],
                         my_pipe._test_performances['f1_score'][tmp_counter])
        tmp_counter += 1
""" Test Feature Selection """ from sklearn.datasets import load_breast_cancer from sklearn.model_selection import KFold from Framework.PhotonBase import Hyperpipe, PipelineElement dataset = load_breast_cancer() X = dataset.data y = dataset.target # create cross-validation object first cv_object = KFold(n_splits=3, shuffle=True, random_state=0) # create a hyperPipe manager = Hyperpipe('god', cv_object, optimizer='random_grid_search') manager += PipelineElement.create('f_classif_select_percentile', {'percentile': [10, 20, 30, 100]}, test_disabled=True) # SVMs (linear and rbf) manager += PipelineElement.create('svc', {}, kernel='linear') manager.fit(X, y)
dataset_files = oasis_dataset.gray_matter_maps
targets = oasis_dataset.ext_vars['age'].astype(float)   # age

# # data
# from sklearn.datasets import load_breast_cancer
# dataset = load_breast_cancer()
# dataset_files = dataset.data
# targets = dataset.target

print(BrainAtlas._getAtlasDict())

# set up photonai hyperpipe
my_pipe = Hyperpipe('primary_pipe', optimizer='grid_search', optimizer_params={},
                    metrics=['mean_squared_error', 'mean_absolute_error'],
                    inner_cv=KFold(n_splits=2, shuffle=True, random_state=3),
                    outer_cv=KFold(n_splits=2, shuffle=True, random_state=3),
                    eval_final_performance=True)

my_pipe += PipelineElement.create('SmoothImgs', {'fwhr': [[8, 8, 8], [12, 12, 12]]})
my_pipe += PipelineElement.create('ResampleImgs', {'voxel_size': [[5, 5, 5]]})

atlas_info = AtlasInfo(atlas_name='mni_icbm152_t1_tal_nlin_sym_09a_mask',
                       mask_threshold=.5, roi_names='all', extraction_mode='vec')
# atlas_info = AtlasInfo(atlas_name='AAL', roi_names='all', extraction_mode='box')
my_pipe += PipelineElement.create('BrainAtlas', {}, atlas_info_object=atlas_info)
# -----------> calculate something ------------------- #

# LOAD DATA
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
print(np.sum(y) / len(y))

from pymodm import connect
connect("mongodb://localhost:27017/photon_db")

# BUILD PIPELINE
manager = Hyperpipe('test_manager',
                    optimizer='timeboxed_random_grid_search',
                    optimizer_params={'limit_in_minutes': 1},
                    outer_cv=ShuffleSplit(test_size=0.2, n_splits=3),
                    inner_cv=KFold(n_splits=10, shuffle=True),
                    best_config_metric='accuracy',
                    metrics=['accuracy', 'precision', 'recall', 'f1_score'],
                    logging=False,
                    eval_final_performance=True,
                    calculate_metrics_across_folds=True,
                    verbose=2)

manager.add(PipelineElement.create('standard_scaler', test_disabled=True))
manager += PipelineElement.create('pca', hyperparameters={'n_components': [None, 1, 10000]})
# tmp_lasso = Lasso()
# manager.add(PipelineElement.create('SelectModelWrapper', estimator_obj=tmp_lasso))
svm = PipelineElement.create('svc', hyperparameters={'C': [0.5, 1], 'kernel': ['linear']})
manager.add(svm)

manager.fit(X, y)

# -----------> Result Tree generated ------------------- #
result_tree = manager.result_tree