def test_discrimination_rgr(self, verbose=0, seed=42):
    """
    Determine if Ensemble can pick real regressor over dummy and test
    performance.

    Five DummyRegressors and one LinearRegression are shuffled into a
    single-channel Ensemble that keeps only the top-ranked base model
    (RankScoreSelector(k=1)).  The test passes when the retained model is
    a LinearRegression and the mean cross-validation score of the whole
    pipeline exceeds 0.9.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the screen results and the mean
        cross-validation score.
    seed : int, default=42
        Seed for the synthetic data and for the order of the base
        regressors.
    """
    X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                           random_state=seed)

    base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
    base_regressors.append(LinearRegression())
    # A private, seeded generator keeps the model order reproducible and
    # leaves the global random state untouched.
    random.Random(seed).shuffle(base_regressors)

    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(base_regressors, internal_cv=5,
                            score_selector=RankScoreSelector(k=1)))
    mclf.fit([X], y)

    ensemble = mclf.get_model(1, 0)
    # The base model is unwrapped before its type is inspected.
    selected_model = transform_wrappers.unwrap_model(
        ensemble.get_base_models()[0])

    if verbose > 0:
        print(ensemble.get_screen_results())

    self.assertIsInstance(
        selected_model, LinearRegression,
        'Ensemble failed to pick LinearRegression over dummies')

    acc = np.mean(cross_val_score(mclf, [X], y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(acc))

    self.assertTrue(acc > 0.9, 'Accuracy tolerance failure.')
def test_multi_input_classification(self):
    """
    Check that pipecaster's cross_val_score, run in a single process on a
    one-channel pipeline wrapping self.clf, reproduces the control scores
    stored in self.cls_scores.
    """
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(self.clf)

    scoring_kwargs = dict(score_method='predict_proba',
                          scorer=roc_auc_score,
                          cv=self.cv,
                          n_processes=1)
    scores = pc_cross_validation.cross_val_score(
        pipeline, [self.X_cls], self.y_cls, **scoring_kwargs)

    scores_match = np.array_equal(self.cls_scores, scores)
    self.assertTrue(
        scores_match,
        'classifier scores from pipecaster.cross_validation.cross_val_score '
        'did not match sklearn control (multi input predictor)')
def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
    """
    Determine if ChannelEnsemble works without a meta-predictor.

    Builds one informative and four random input matrices, lets a
    ChannelEnsemble of LinearRegression models keep the single best channel
    (RankScoreSelector(k=1)), and checks that the kept channel is the
    informative one and that the mean cross-validation score exceeds 0.10.
    """
    data_kwargs = dict(n_informative_Xs=1, n_weak_Xs=0, n_random_Xs=4,
                       weak_noise_sd=None, seed=seed, n_samples=500,
                       n_features=20, n_informative=20)
    Xs, y, channel_types = make_multi_input_regression(**data_kwargs)

    pipeline = MultichannelPipeline(n_channels=5)
    pipeline.add_layer(StandardScaler())
    channel_ensemble = ChannelEnsemble(
        LinearRegression(), internal_cv=5,
        score_selector=RankScoreSelector(k=1))
    pipeline.add_layer(channel_ensemble)
    pipeline.fit(Xs, y)

    # Index of the first (and, with k=1, only) channel the ensemble kept.
    kept_index = pipeline.get_model(1, 0).get_support()[0]
    kept_type = channel_types[kept_index]
    self.assertTrue(kept_type == 'informative',
                    'Ensemble failed to pick informative channel')

    fold_scores = cross_val_score(pipeline, Xs, y)
    acc = np.mean(fold_scores)
    if verbose > 0:
        print('cross val accuracy: {}'.format(acc))

    self.assertTrue(acc > 0.10, 'Accuracy tolerance failure.')
def test_discrimination_cls(self, verbose=0, seed=42):
    """
    Determine if Ensemble can pick real classifier over dummy and test
    performance.

    Five stratified DummyClassifiers and one LogisticRegression are
    shuffled into a single-channel Ensemble that keeps only the top-ranked
    base model (RankScoreSelector(k=1)).  The test passes when the retained
    model is a LogisticRegression and the mean cross-validation score of
    the whole pipeline exceeds 0.70.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the mean cross-validation score.
    seed : int, default=42
        Seed for the synthetic data and for the order of the base
        classifiers.
    """
    X, y = make_classification(n_samples=500, n_features=20,
                               n_informative=15, class_sep=1,
                               random_state=seed)

    base_classifiers = [DummyClassifier(strategy='stratified')
                        for _ in range(5)]
    base_classifiers.append(LogisticRegression())
    # A private, seeded generator keeps the model order reproducible and
    # leaves the global random state untouched.
    random.Random(seed).shuffle(base_classifiers)

    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(base_classifiers, internal_cv=5,
                            score_selector=RankScoreSelector(k=1)))
    mclf.fit([X], y)

    # The base model is unwrapped before its type is inspected.
    selected_model = transform_wrappers.unwrap_model(
        mclf.get_model(1, 0).get_base_models()[0])
    self.assertIsInstance(
        selected_model, LogisticRegression,
        'Ensemble failed to pick LogisticRegression over dummies')

    acc = np.mean(cross_val_score(mclf, [X], y))
    if verbose > 0:
        print('cross val accuracy: {}'.format(acc))

    self.assertTrue(acc > 0.70, 'Accuracy tolerance failure.')
def test_multiprocessing_speedup(self, verbose=0):
    """
    Compare the run time of pipecaster's cross_val_score with one process
    against the run time with n_cpus processes.

    The test never fails on timing: it only emits a warning when the
    parallel run is not faster than the serial run.  It does nothing when
    fewer than two CPUs are available.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the mean time per run for both modes.
    """
    if n_cpus <= 1:
        return

    n_runs = 5
    setup_code = 'import pipecaster.cross_validation'
    stmt_template = ('pipecaster.cross_validation.cross_val_score('
                     'mclf, [X], y, cv=5, n_processes={})')

    # catch_warnings restores the previous filters on exit, even when a
    # timing run raises, instead of wiping every filter in the process.
    with warnings.catch_warnings():
        # shut off warnings because ray and redis generate massive numbers
        warnings.filterwarnings("ignore")
        parallel.start_if_needed(n_cpus=n_cpus)

        X, y = make_classification(n_classes=2, n_samples=500,
                                   n_features=40, n_informative=20,
                                   random_state=test_seed)
        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(DummyClassifier(futile_cycles_fit=2000000,
                                       futile_cycles_pred=10))

        # Names the timed statement needs to resolve.
        namespace = {'mclf': mclf, 'X': X, 'y': y}

        # timeit returns the total over `number` runs; divide for the mean.
        t_serial = timeit.timeit(setup=setup_code,
                                 stmt=stmt_template.format(1),
                                 globals=namespace,
                                 number=n_runs) / n_runs
        t_parallel = timeit.timeit(setup=setup_code,
                                   stmt=stmt_template.format(n_cpus),
                                   globals=namespace,
                                   number=n_runs) / n_runs

    if verbose > 0:
        print('serial run mean time = {} s'.format(t_serial))
        print('parallel run mean time = {} s'.format(t_parallel))

    if t_serial <= t_parallel:
        warnings.warn('multiple cpus detected, but parallel cross_val_score '
                      'not faster than serial, possible problem with '
                      'multiprocessing')
def test_multi_input_regression_parallel(self):
    """
    Check that pipecaster's cross_val_score, run with n_cpus processes on a
    one-channel pipeline wrapping self.rgr, reproduces the control scores
    stored in self.rgr_scores.

    Does nothing when fewer than two CPUs are available.
    """
    if n_cpus <= 1:
        return

    # catch_warnings restores the previous filters on exit, so a failing
    # assertion cannot leave the "ignore" filter active for later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        parallel.start_if_needed(n_cpus=n_cpus)

        mrgr = MultichannelPipeline(n_channels=1)
        mrgr.add_layer(self.rgr)
        pc_scores = pc_cross_validation.cross_val_score(
            mrgr, [self.X_rgr], self.y_rgr,
            scorer=explained_variance_score,
            cv=self.cv, n_processes=n_cpus)

        self.assertTrue(
            np.array_equal(self.rgr_scores, pc_scores),
            'regressor scores from '
            'pipecaster.cross_validation.cross_val_score did not match '
            'sklearn control (multi input predictor)')
def test_multi_input_classification_parallel(self):
    """
    Check that pipecaster's cross_val_score, run with n_cpus processes on a
    one-channel pipeline wrapping self.clf, reproduces the control scores
    stored in self.cls_scores.

    Does nothing when fewer than two CPUs are available.
    """
    if n_cpus <= 1:
        return

    # catch_warnings restores the previous filters on exit, so a failing
    # assertion cannot leave the "ignore" filter active for later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        # Request the same number of workers that n_processes asks for
        # below, as the other parallel tests in this file do.
        parallel.start_if_needed(n_cpus=n_cpus)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(self.clf)
        pc_scores = pc_cross_validation.cross_val_score(
            mclf, [self.X_cls], self.y_cls,
            score_method='predict_proba',
            scorer=roc_auc_score,
            cv=self.cv, n_processes=n_cpus)

        self.assertTrue(
            np.array_equal(self.cls_scores, pc_scores),
            'classifier scores from '
            'pipecaster.cross_validation.cross_val_score did not match '
            'sklearn control (multi input predictor)')
def test_multi_input_classification(self):
    """
    Check that pipecaster's cross_val_predict, run in a single process on a
    one-channel pipeline wrapping self.clf, reproduces the control
    predictions stored in self.cls_predictions.
    """
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(self.clf)

    result = pc_cross_validation.cross_val_predict(
        pipeline, [self.X_cls], self.y_cls, cv=self.cv, n_processes=1)
    # The compared values are read from the nested ['predict']['y_pred']
    # entry of the returned mapping.
    predicted = result['predict']['y_pred']

    predictions_match = np.array_equal(self.cls_predictions, predicted)
    self.assertTrue(predictions_match,
                    'pipecaster predictions did not match sklearn control')
def test_multi_input_regression(self):
    """
    Check that pipecaster's cross_val_predict, run in a single process on a
    one-channel pipeline wrapping self.rgr, reproduces the control
    predictions stored in self.rgr_predictions.
    """
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(self.rgr)

    result = pc_cross_validation.cross_val_predict(
        pipeline, [self.X_rgr], self.y_rgr, cv=self.cv, n_processes=1)
    # The compared values are read from the nested ['predict']['y_pred']
    # entry of the returned mapping.
    predicted = result['predict']['y_pred']

    predictions_match = np.array_equal(self.rgr_predictions, predicted)
    self.assertTrue(predictions_match,
                    'pipecaster predictions did not match sklearn control')
def test_multi_input_classification_parallel(self):
    """
    Check that pipecaster's cross_val_predict, run with n_cpus processes on
    a one-channel pipeline wrapping self.clf, reproduces the control
    predictions stored in self.cls_predictions.

    Does nothing when fewer than two CPUs are available.
    """
    if n_cpus <= 1:
        return

    # catch_warnings restores the previous filters on exit, so a failing
    # assertion cannot leave the "ignore" filter active for later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        parallel.start_if_needed(n_cpus=n_cpus)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(self.clf)
        pc_predictions = pc_cross_validation.cross_val_predict(
            mclf, [self.X_cls], self.y_cls, cv=self.cv,
            n_processes=n_cpus)

        self.assertTrue(
            np.array_equal(self.cls_predictions,
                           pc_predictions['predict']['y_pred']),
            'pipecaster predictions did not match sklearn control')
def test_throttled_multiprocessing_speedup(self, verbose=0):
    """
    Compare the run time of pipecaster's cross_val_predict with one process
    against the run time with n_cpus - 1 processes, using n_cpus - 1 folds
    in both cases.

    The test never fails on timing: it only emits a warning when the
    parallel run is not faster than the serial run.  It does nothing unless
    more than two CPUs are available, because with two CPUs the "parallel"
    run would use a single process and a single fold.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the CPU count and both durations.
    """
    if n_cpus <= 2:
        return

    n_jobs = n_cpus - 1
    setup_code = 'import pipecaster.cross_validation'
    stmt_template = ('pipecaster.cross_validation.cross_val_predict('
                     'mclf, [X], y, cv={}, n_processes={})')

    # catch_warnings restores the previous filters on exit, even when a
    # timing run raises, instead of wiping every filter in the process.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        parallel.start_if_needed(n_cpus=n_cpus)

        X, y = make_classification(n_classes=2, n_samples=500,
                                   n_features=40, n_informative=20,
                                   random_state=test_seed)
        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(DummyClassifier(futile_cycles_fit=2000000,
                                       futile_cycles_pred=10))

        # Names the timed statement needs to resolve.
        namespace = {'mclf': mclf, 'X': X, 'y': y}

        t_serial = timeit.timeit(setup=setup_code,
                                 stmt=stmt_template.format(n_jobs, 1),
                                 globals=namespace, number=5)
        t_parallel = timeit.timeit(setup=setup_code,
                                   stmt=stmt_template.format(n_jobs,
                                                             n_jobs),
                                   globals=namespace, number=5)

    if verbose > 0:
        print('number of CPUs detected and parallel jobs requested: {}'
              .format(n_cpus))
        print('duration of serial cross cross_val_predict task: {} s'
              .format(t_serial))
        print('duration of parallel cross cross_val_predict task '
              '(ray pool.starmap): {} s'.format(t_parallel))

    if t_serial <= t_parallel:
        warnings.warn(
            'multiple cpus detected, but parallel cross_val_predict not '
            'faster than serial using ray.multiprocessing.starmap(), '
            'possible problem with multiprocessing')
def test_aggregating_regressor(self, verbose=0, seed=42):
    """
    Check that averaging the outputs of per-channel
    GradientBoostingRegressor models with AggregatingRegressor(np.mean)
    gives a mean explained-variance score above 0.3 on three informative
    input matrices.
    """
    Xs, y, _ = make_multi_input_regression(n_informative_Xs=3,
                                           random_state=seed)

    pipeline = MultichannelPipeline(n_channels=3)
    booster = GradientBoostingRegressor(n_estimators=50)
    pipeline.add_layer(make_transformer(booster))
    pipeline.add_layer(AggregatingRegressor(np.mean))

    # The result of this first call is discarded; presumably it only
    # exercises cross_val_score with its default scoring arguments.
    cross_val_score(pipeline, Xs, y, cv=3)

    fold_scores = cross_val_score(pipeline, Xs, y, score_method='predict',
                                  scorer=explained_variance_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy = {}'.format(score))

    self.assertTrue(score > 0.3)
def test_multi_input_regression_parallel_starmap(self):
    """
    Check that pipecaster's cross_val_predict, run with n_cpus - 1
    processes on a one-channel pipeline wrapping self.rgr, reproduces the
    control predictions stored in self.rgr_predictions.

    Does nothing unless more than two CPUs are available.
    """
    if n_cpus <= 2:
        return

    # catch_warnings restores the previous filters on exit, so a failing
    # assertion cannot leave the "ignore" filter active for later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        parallel.start_if_needed(n_cpus=n_cpus)

        mrgr = MultichannelPipeline(n_channels=1)
        mrgr.add_layer(self.rgr)
        pc_predictions = pc_cross_validation.cross_val_predict(
            mrgr, [self.X_rgr], self.y_rgr, cv=self.cv,
            n_processes=n_cpus - 1)

        self.assertTrue(
            np.array_equal(self.rgr_predictions,
                           pc_predictions['predict']['y_pred']),
            'pipecaster predictions did not match sklearn control')
def _select_synthetic_classification(channel_selector, n_informative_Xs=3,
                                     n_weak_Xs=0, n_random_Xs=0,
                                     weak_noise_sd=1.0, verbose=0, seed=None,
                                     **sklearn_params):
    """
    Run a channel selector on synthetic multi-input classification data
    and count how the channels of each type were treated.

    The data are standardized and passed through ``channel_selector`` in a
    MultichannelPipeline; a channel counts as selected when the pipeline's
    transform output for it is not None.

    Parameters
    ----------
    channel_selector
        Pipeline component added as the second layer, after StandardScaler.
    n_informative_Xs, n_weak_Xs, n_random_Xs : int
        Number of input matrices of each type to generate.
    weak_noise_sd, seed, **sklearn_params
        Forwarded to make_multi_input_classification.
    verbose : int, default=0
        If greater than 0, print a summary of the three counts.

    Returns
    -------
    tuple of int
        (informative channels selected, random channels filtered out,
        weak channels selected).
    """
    n_Xs = n_informative_Xs + n_weak_Xs + n_random_Xs
    Xs, y, X_types = make_multi_input_classification(
        n_informative_Xs, n_weak_Xs, n_random_Xs, weak_noise_sd, seed,
        **sklearn_params)

    pipeline = MultichannelPipeline(n_channels=n_Xs)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(channel_selector)
    pipeline.fit(Xs, y)
    transformed = pipeline.transform(Xs)

    # Pair each channel's "was it kept" flag with its generated type.
    outcomes = [(X_t is not None, X_type)
                for X_t, X_type in zip(transformed, X_types)]
    n_informative_hits = sum(
        1 for kept, X_type in outcomes if kept and X_type == 'informative')
    n_random_hits = sum(
        1 for kept, X_type in outcomes if not kept and X_type == 'random')
    n_weak_hits = sum(
        1 for kept, X_type in outcomes if kept and X_type == 'weak')

    if verbose > 0:
        n_random_total = n_Xs - n_informative_Xs - n_weak_Xs
        print('InputSelector selected {} out of {} informative inputs'
              .format(n_informative_hits, n_informative_Xs))
        print('InputSelector filtered out {} out of {} random inputs'
              .format(n_random_hits, n_random_total))
        print('InputSelector selected out {} out of {} weakly informative '
              'inputs'.format(n_weak_hits, n_weak_Xs))

    return n_informative_hits, n_random_hits, n_weak_hits
def test_compare_to_StackingClassifier(self, verbose=0, seed=42):
    """
    Determine if Ensemble with dummies correctly selects the real
    predictors and gives similar performance to scikit-learn
    StackingClassifier trained without dummies.

    Three real classifiers are mixed with 100 stratified DummyClassifiers
    and given to an Ensemble with an SVC meta-predictor that keeps the top
    three base models.  The test checks that three models are kept, that
    none of them is a DummyClassifier, that the score matches (to two
    decimals) an Ensemble built from the real classifiers only, and that
    it is within 5% of a StackingClassifier built from the real
    classifiers only.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, also score a StackingClassifier built from all
        classifiers and print the four scores.
    seed : int, default=42
        Seed for the synthetic data, the estimators that accept a
        random_state, and the order of the base classifiers.
    """
    X, y = make_classification(n_samples=1000, n_features=20,
                               n_informative=5, class_sep=0.5,
                               random_state=seed)

    classifiers = [LogisticRegression(random_state=seed),
                   KNeighborsClassifier(),
                   RandomForestClassifier(random_state=seed)]
    dummy_classifiers = [
        DummyClassifier(strategy='stratified', random_state=seed)
        for _ in range(100)]
    all_classifiers = classifiers + dummy_classifiers
    # A private, seeded generator keeps the model order reproducible and
    # leaves the global random state untouched.
    random.Random(seed).shuffle(all_classifiers)

    # Ensemble given real and dummy classifiers.
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(all_classifiers, SVC(random_state=seed),
                            internal_cv=5,
                            score_selector=RankScoreSelector(k=3)))
    pc_score_all = np.mean(
        cross_val_score(mclf, [X], y, cv=5, n_processes=5))

    mclf.fit([X], y)
    selected_classifiers = mclf.get_model(1, 0).get_base_models()
    self.assertEqual(
        len(selected_classifiers), 3,
        'Ensemble picked the {} classifiers instead of 3.'.format(
            len(selected_classifiers)))
    # NOTE(review): if get_base_models() returns wrapped models, their
    # __class__ is the wrapper and this check can never fail -- confirm
    # whether the models should be unwrapped before the comparison.
    self.assertNotIn(
        DummyClassifier, [c.__class__ for c in selected_classifiers],
        'Ensemble chose a dummy classifier over a real one')

    # Ensemble given only the real classifiers.
    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(classifiers, SVC(random_state=seed),
                            internal_cv=5,
                            score_selector=RankScoreSelector(k=3)))
    pc_score_informative = np.mean(
        cross_val_score(mclf, [X], y, cv=5, n_processes=5))

    # StackingClassifier control given only the real classifiers.
    base_classifier_arg = [(str(i), c) for i, c in enumerate(classifiers)]
    clf = StackingClassifier(base_classifier_arg, SVC(random_state=seed),
                             cv=StratifiedKFold(n_splits=3))
    sk_score_informative = np.mean(
        cross_val_score(clf, X, y, cv=5, n_processes=5))

    if verbose > 0:
        # Only needed for the report, so only computed when reporting.
        base_classifier_arg = [(str(i), c)
                               for i, c in enumerate(all_classifiers)]
        clf = StackingClassifier(base_classifier_arg,
                                 SVC(random_state=seed),
                                 cv=StratifiedKFold(n_splits=3))
        sk_score_all = np.mean(
            cross_val_score(clf, X, y, cv=5, n_processes=5))
        print('\nBalanced accuracy scores')
        print('Ensemble informative predictors: {}'.format(
            pc_score_informative))
        print('Ensemble all predictors: {}'.format(pc_score_all))
        print('StackingClassifier informative predictors: {}'.format(
            sk_score_informative))
        print('StackingClassifier all predictors: {}'.format(sk_score_all))

    self.assertTrue(
        np.round(pc_score_all, 2) == np.round(pc_score_informative, 2),
        'Ensemble accuracy is not same for all classifiers and '
        'informative classifiers.')

    tolerance_pct = 5
    self.assertTrue(
        pc_score_all >= sk_score_informative * (1 - tolerance_pct / 100.0),
        'Ensemble with random inputs did not perform within accepted '
        'tolerance of StackingClassifier with no dummy classifiers.')
def test_discrimination_rgr(self, verbose=0, seed=42):
    """
    Determine if Ensemble can discriminate between dummy regressors and
    LinearRegression regressors.

    Five DummyRegressors and five LinearRegression models are shuffled
    into a single-channel Ensemble with an SVR meta-predictor that keeps
    the five top-ranked base models.  The test passes when the kept
    indices are exactly those of the LinearRegression models.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print how many of the kept models were
        LinearRegression and how many were dummies.
    seed : int, default=42
        Seed for the synthetic data and for the order of the base
        regressors.
    """
    X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                           random_state=seed)

    base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
    base_regressors.extend(LinearRegression() for _ in range(5))
    # A private, seeded generator keeps the model order reproducible and
    # leaves the global random state untouched.
    random.Random(seed).shuffle(base_regressors)
    informative_mask = [isinstance(c, LinearRegression)
                        for c in base_regressors]

    mclf = MultichannelPipeline(n_channels=1)
    mclf.add_layer(StandardScaler())
    mclf.add_layer(Ensemble(base_regressors, SVR(), internal_cv=5,
                            score_selector=RankScoreSelector(k=5)))
    mclf.fit([X], y)

    selected_indices = mclf.get_model(layer_index=1,
                                      model_index=0).get_support()
    selection_mask = [i in selected_indices
                      for i in range(len(base_regressors))]

    if verbose > 0:
        n_correct = sum(1 for informative, selected
                        in zip(informative_mask, selection_mask)
                        if informative and selected)
        print('\n\ncorrectly selected {}/5 LinearRegression regressors'
              .format(n_correct))
        # Five models are kept (k=5), so the rest of them are dummies.
        print('incorrectly selected {}/5 DummyRegressors\n\n'
              .format(5 - n_correct))

    self.assertTrue(
        np.array_equal(selection_mask, informative_mask),
        'Ensemble failed to discriminate between dummy regressors and '
        'LinearRegression')
def test_soft_voting_decision(self, verbose=0, seed=42):
    """
    Check a nine-channel pipeline in which SVC decision-function outputs
    are combined by SoftVotingDecision in three groups of three channels
    and then stacked with a GradientBoostingClassifier.  The mean balanced
    accuracy across cross-validation folds must exceed 0.85.
    """
    Xs, y, _ = make_multi_input_classification(n_informative_Xs=6,
                                               n_random_Xs=3,
                                               random_state=seed)

    pipeline = MultichannelPipeline(n_channels=9)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(
        make_transformer(SVC(), transform_method='decision_function'))

    # The same voter object is mapped onto each group of three channels.
    voter = SoftVotingDecision()
    pipeline.add_layer(3, voter, 3, voter, 3, voter)

    stacker = MultichannelPredictor(GradientBoostingClassifier())
    pipeline.add_layer(stacker)

    fold_scores = cross_val_score(pipeline, Xs, y, score_method='predict',
                                  scorer=balanced_accuracy_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy = {}'.format(score))

    self.assertTrue(score > 0.85)
def test_soft_voting(self, verbose=0, seed=42):
    """
    Check a seven-channel pipeline in which per-channel
    KNeighborsClassifier outputs are combined by SoftVotingClassifier.
    The mean balanced accuracy across cross-validation folds must exceed
    0.80.
    """
    Xs, y, _ = make_multi_input_classification(n_informative_Xs=5,
                                               n_random_Xs=2,
                                               random_state=seed)

    pipeline = MultichannelPipeline(n_channels=7)
    pipeline.add_layer(StandardScaler())
    pipeline.add_layer(
        transform_wrappers.SingleChannel(KNeighborsClassifier()))
    pipeline.add_layer(SoftVotingClassifier())

    fold_scores = cross_val_score(pipeline, Xs, y, score_method='predict',
                                  scorer=balanced_accuracy_score)
    score = np.mean(fold_scores)
    if verbose > 0:
        print('accuracy = {}'.format(score))

    self.assertTrue(score > 0.80)
def test_single_matrix_hard_voting(self):
    """
    Determine if KNN->ChannelClassifier(hard voting) in a pipecaster
    pipeline gives identical predictions to sklearn KNN on training data.

    The check is made for two constructions: a SingleChannel-wrapped KNN
    layer followed by a MultichannelPredictor, and a single ChannelEnsemble
    layer.
    """
    failure_message = ('hard voting metaclassifier did not reproduce '
                       'sklearn result on single matrix prediction task')

    X, y = make_classification(n_samples=100, n_features=20,
                               n_informative=10, class_sep=5,
                               random_state=42)

    # control: plain sklearn KNN fit and scored on the same data
    control = KNeighborsClassifier(n_neighbors=5, weights='uniform')
    control.fit(X, y)
    expected = control.predict(X)

    # implementation 1: wrapped base layer + voting meta-predictor layer
    pipeline = MultichannelPipeline(n_channels=1)
    wrapped_knn = transform_wrappers.SingleChannel(
        KNeighborsClassifier(n_neighbors=5, weights='uniform'),
        transform_method='predict')
    pipeline.add_layer(wrapped_knn)
    pipeline.add_layer(MultichannelPredictor(HardVotingMetaClassifier()))
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_message)

    # implementation 2: ChannelEnsemble holding both base and meta models
    pipeline = MultichannelPipeline(n_channels=1)
    knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
    ensemble = ChannelEnsemble(knn, HardVotingMetaClassifier(),
                               base_transform_methods='predict')
    pipeline.add_layer(ensemble)
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_message)
def test_single_matrix_mean_voting(self, seed=42):
    """
    Determine if KNN->ChannelRegressor(mean voting) in a pipecaster
    pipeline gives identical predictions to sklearn KNN on training data.

    The check is made for two constructions: a SingleChannel-wrapped KNN
    layer followed by a MultichannelPredictor, and a single ChannelEnsemble
    layer.
    """
    failure_message = ('mean voting ChannelRegressor failed to reproduce '
                       'sklearn result on single matrix prediction task')

    X, y = make_regression(n_samples=100, n_features=20, n_informative=10,
                           random_state=seed)

    # control: plain sklearn KNN fit and scored on the same data
    control = KNeighborsRegressor(n_neighbors=5, weights='uniform')
    control.fit(X, y)
    expected = control.predict(X)

    # implementation 1: wrapped base layer + averaging meta-predictor layer
    pipeline = MultichannelPipeline(n_channels=1)
    wrapped_knn = transform_wrappers.SingleChannel(
        KNeighborsRegressor(n_neighbors=5, weights='uniform'))
    pipeline.add_layer(wrapped_knn, pipe_processes=n_cpus)
    pipeline.add_layer(
        MultichannelPredictor(AggregatingMetaRegressor(np.mean)))
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_message)

    # implementation 2: ChannelEnsemble holding both base and meta models
    pipeline = MultichannelPipeline(n_channels=1)
    knn = KNeighborsRegressor(n_neighbors=5, weights='uniform')
    ensemble = ChannelEnsemble(knn, AggregatingMetaRegressor(np.mean),
                               base_processes='max')
    pipeline.add_layer(ensemble)
    pipeline.fit([X], y)
    observed = pipeline.predict([X])
    self.assertTrue(np.array_equal(expected, observed), failure_message)
def test_multi_matrix_voting(self, verbose=0):
    """
    Test if KNN->ChannelClassifier(soft voting) in a pipecaster pipeline
    gives monotonically increasing accuracy with increasing number of
    inputs in concordance with Condorcet's jury theorem, and also test hard
    voting with same pass criterion. Test if accuracy is > 80%.

    Two constructions are exercised.  Implementation 1 builds the soft
    voter from a SingleChannel-wrapped KNN layer plus a
    MultichannelPredictor; implementation 2 builds it from a single
    ChannelEnsemble layer.  The hard voter is built the same way in both,
    differing only in the pipe_processes argument.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the score obtained for each number of
        informative inputs.
    """
    n_channels = 5
    n_informative = range(0, n_channels + 1)
    sklearn_params = {
        'n_classes': 2,
        'n_samples': 500,
        'n_features': 100,
        'n_informative': 30,
        'n_redundant': 0,
        'n_repeated': 0,
        'class_sep': 3.0
    }

    def make_data(n_informative_Xs):
        # Pad with random matrices so the channel count stays constant.
        Xs, y, _ = make_multi_input_classification(
            n_informative_Xs=n_informative_Xs, n_weak_Xs=0,
            n_random_Xs=n_channels - n_informative_Xs,
            weak_noise_sd=None, seed=42, **sklearn_params)
        return Xs, y

    def mean_score(mclf, Xs, y):
        return np.mean(cross_val_score(mclf, Xs, y, scorer=roc_auc_score,
                                       cv=3, n_processes=1))

    def soft_pipeline_wrapped():
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
        clf = transform_wrappers.SingleChannel(
            KNeighborsClassifier(n_neighbors=5, weights='uniform'))
        mclf.add_layer(clf, pipe_processes=n_cpus)
        mclf.add_layer(MultichannelPredictor(SoftVotingMetaClassifier()))
        return mclf

    def soft_pipeline_ensemble():
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
        base_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        mclf.add_layer(ChannelEnsemble(base_clf,
                                       SoftVotingMetaClassifier(),
                                       base_processes=n_cpus))
        return mclf

    def hard_pipeline(processes):
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes=processes)
        clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        clf = transform_wrappers.SingleChannel(clf,
                                               transform_method='predict')
        mclf.add_layer(clf, pipe_processes=processes)
        mclf.add_layer(MultichannelPredictor(HardVotingMetaClassifier()))
        return mclf

    def collect(make_soft_pipeline, hard_processes):
        # One soft and one hard score per number of informative inputs.
        soft_accuracies, hard_accuracies = [], []
        for i in n_informative:
            Xs, y = make_data(i)
            soft_accuracies.append(
                mean_score(make_soft_pipeline(), Xs, y))
            hard_accuracies.append(
                mean_score(hard_pipeline(hard_processes), Xs, y))
        return soft_accuracies, hard_accuracies

    def report(soft_accuracies, hard_accuracies):
        for label, accuracies in (('soft', soft_accuracies),
                                  ('hard', hard_accuracies)):
            print('{} voting results:'.format(label))
            print('n_informative, accuracy')
            for i in n_informative:
                print(i, accuracies[i])

    def check(label, accuracies):
        # Score with every input informative must clear the threshold,
        # and scores must rise with the number of informative inputs.
        accuracy = accuracies[-1]
        self.assertTrue(
            accuracy > 0.80,
            '{} voting accuracy of {} below acceptable threshold of 0.80'
            .format(label, accuracy))
        linearity = pearsonr(accuracies, n_informative)[0]
        self.assertTrue(
            linearity > 0.80,
            '{} voting linearity of {} below acceptable threshold of '
            '0.80 pearsonr'.format(label, linearity))

    # catch_warnings keeps the filter active for both implementations and
    # restores the previous filters on exit, even when an assertion fails.
    with warnings.catch_warnings():
        if n_cpus > 1:
            # shut off warnings because ray and redis generate massive
            # numbers
            warnings.filterwarnings("ignore")

        implementations = (
            (soft_pipeline_wrapped, n_cpus),    # implementation 1
            (soft_pipeline_ensemble, 'max'),    # implementation 2
        )
        for make_soft_pipeline, hard_processes in implementations:
            soft_accuracies, hard_accuracies = collect(make_soft_pipeline,
                                                       hard_processes)
            if verbose > 0:
                report(soft_accuracies, hard_accuracies)
            check('soft', soft_accuracies)
            check('hard', hard_accuracies)
def test_add_layer_interface_mapping(self, verbose=0, seed=42):
    """
    Functional test of the MultichannelPipeline channel mapping interface.

    For zero to five informative input matrices (the rest random), a
    five-channel SVR stacking pipeline is cross-validated.  The explained
    variance with all inputs informative must exceed 0.1 and the scores
    must correlate positively with the number of informative inputs.

    Two constructions are exercised.  Implementation 1 maps components onto
    channel groups with add_layer(2, ..., 3, ...) and stacks with a
    MultichannelPredictor; implementation 2 uses a single ChannelEnsemble
    layer with one LinearRegression per channel.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the scores and their Pearson correlation
        with the number of informative inputs.
    seed : int, default=42
        Seed passed to make_multi_input_regression.
    """
    n_channels = 5
    n_informatives = range(0, n_channels + 1)
    rgr_params = {'n_samples': 1000, 'n_features': 10, 'n_informative': 10}

    def mapped_pipeline():
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(2, StandardScaler(), 3, StandardScaler(),
                       pipe_processes=n_cpus)
        base_rgr = transform_wrappers.SingleChannelCV(LinearRegression())
        mrgr.add_layer(2, base_rgr, 3, base_rgr, pipe_processes=n_cpus)
        mrgr.add_layer(5, MultichannelPredictor(SVR()))
        return mrgr

    def ensemble_pipeline():
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(2, StandardScaler(), 3, StandardScaler(),
                       pipe_processes='max')
        base_rgrs = [LinearRegression() for _ in range(2)]
        base_rgrs += [LinearRegression() for _ in range(3)]
        mrgr.add_layer(ChannelEnsemble(base_rgrs, SVR(),
                                       base_processes='max',
                                       internal_cv=5))
        return mrgr

    def collect(make_pipeline):
        # One mean score per number of informative inputs.
        accuracies = []
        for i in n_informatives:
            Xs, y, _ = make_multi_input_regression(
                n_informative_Xs=i, n_weak_Xs=0,
                n_random_Xs=n_channels - i, weak_noise_sd=None,
                seed=seed, **rgr_params)
            split_accuracies = cross_val_score(
                make_pipeline(), Xs, y, scorer=explained_variance_score,
                cv=3, n_processes=1)
            accuracies.append(np.mean(split_accuracies))
        return accuracies

    def report_and_check(accuracies):
        final_ev = accuracies[-1]
        linearity = pearsonr(accuracies, n_informatives)[0]
        if verbose > 0:
            print('explained variance scores')
            print('informative Xs\t\t svr stacking')
            for n_informative, ev in zip(n_informatives, accuracies):
                print('{}\t\t {}'.format(n_informative, ev))
            print('SVR stacking pearsonr = {}'.format(linearity))
        # The messages quote the thresholds that are actually enforced.
        self.assertTrue(
            final_ev > 0.1,
            'SVR stacking explained variance of {} is below acceptable '
            'threshold of 0.1'.format(final_ev))
        self.assertTrue(
            linearity > 0.0,
            'SVR stacking linearity of {} below acceptable threshold of '
            '0.0 pearsonr'.format(linearity))

    # catch_warnings keeps the filter active for both implementations and
    # restores the previous filters on exit, even when an assertion fails.
    with warnings.catch_warnings():
        if n_cpus > 1:
            # shut off warnings because ray and redis generate massive
            # numbers
            warnings.filterwarnings("ignore")

        # implementation 1
        report_and_check(collect(mapped_pipeline))

        # implementation 2
        report_and_check(collect(ensemble_pipeline))
def test_multi_input_regression(self):
    """
    Check that pipecaster's cross_val_score, run in a single process on a
    one-channel pipeline wrapping self.rgr, reproduces the control scores
    stored in self.rgr_scores.
    """
    pipeline = MultichannelPipeline(n_channels=1)
    pipeline.add_layer(self.rgr)

    scoring_kwargs = dict(scorer=explained_variance_score,
                          cv=self.cv,
                          n_processes=1)
    scores = pc_cross_validation.cross_val_score(
        pipeline, [self.X_rgr], self.y_rgr, **scoring_kwargs)

    scores_match = np.array_equal(self.rgr_scores, scores)
    self.assertTrue(
        scores_match,
        'regressor scores from pipecaster.cross_validation.cross_val_score '
        'did not match sklearn control (multi input predictor)')
def test_multi_matrices_svm_metaclassifier(self, seed=42, verbose=0):
    """
    Test if KNN classifier->ChannelClassifier(SVC) in a pipecaster
    pipeline gives monotonically increasing accuracy with increasing
    number of inputs, and test if accuracy is > 75%.

    Two constructions are exercised.  Implementation 1 uses a
    SingleChannel-wrapped KNN layer followed by MultichannelPredictor(SVC);
    implementation 2 uses a single ChannelEnsemble layer with internal
    cross-validation.

    Parameters
    ----------
    seed : int, default=42
        Seed passed to make_multi_input_classification.
    verbose : int, default=0
        If greater than 0, print the score obtained for each number of
        informative inputs.
    """
    n_channels = 5
    n_informative = range(0, n_channels + 1)
    sklearn_params = {
        'n_classes': 2,
        'n_samples': 500,
        'n_features': 100,
        'n_informative': 5,
        'n_redundant': 10,
        'n_repeated': 5,
        'class_sep': 1.0
    }

    def wrapped_pipeline():
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes='max')
        clf = transform_wrappers.SingleChannel(
            KNeighborsClassifier(n_neighbors=5, weights='uniform'))
        mclf.add_layer(clf, pipe_processes='max')
        mclf.add_layer(MultichannelPredictor(SVC()))
        return mclf

    def ensemble_pipeline():
        mclf = MultichannelPipeline(n_channels)
        mclf.add_layer(StandardScaler(), pipe_processes='max')
        base_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
        mclf.add_layer(ChannelEnsemble(base_clf, SVC(), internal_cv=5,
                                       base_processes='max'))
        return mclf

    def collect(make_pipeline):
        # One mean score per number of informative inputs.
        accuracies = []
        for i in n_informative:
            Xs, y, _ = make_multi_input_classification(
                n_informative_Xs=i, n_weak_Xs=0,
                n_random_Xs=n_channels - i, weak_noise_sd=None,
                seed=seed, **sklearn_params)
            split_accuracies = cross_val_score(
                make_pipeline(), Xs, y, scorer=roc_auc_score, cv=3,
                n_processes=1)
            accuracies.append(np.mean(split_accuracies))
        return accuracies

    def report_and_check(accuracies):
        if verbose > 0:
            print('SVC meta-classification results:')
            print('n_informative, accuracy')
            for i in n_informative:
                print(i, accuracies[i])
        # Adjacent literals rather than a backslash continuation inside
        # the string, which corrupts the failure message.
        self.assertTrue(
            accuracies[-1] > 0.75,
            'SVC metaclassification accuracy of {} below acceptable '
            'threshold of 0.75'.format(accuracies[-1]))
        linearity = pearsonr(accuracies, n_informative)[0]
        self.assertTrue(
            linearity > 0.75,
            'SVC metaclassification linearity of {} below acceptable '
            'threshold of 0.75 pearsonr'.format(linearity))

    # catch_warnings keeps the filter active for both implementations and
    # restores the previous filters on exit, even when an assertion fails.
    with warnings.catch_warnings():
        if n_cpus > 1:
            # shut off warnings because ray and redis generate massive
            # numbers
            warnings.filterwarnings("ignore")

        # implementation 1
        report_and_check(collect(wrapped_pipeline))

        # implementation 2
        report_and_check(collect(ensemble_pipeline))
def test_architecture_01(self, verbose=0, seed=42):
    """
    Test the accuracy and hygiene (shuffle control) of a complex pipeline
    with feature selection, matrix selection, model selection, and model
    stacking.

    Six channels are built from one random matrix and one informative
    matrix; the informative matrix sits at channel indices 2, 4 and 5.
    The pipeline layers are, in order: SimpleImputer, StandardScaler,
    SelectPercentile, SelectKBestScores (k=2), a cross-validated
    ChannelEnsemble of KNN classifiers alongside a cross-validated
    LogisticRegression, and an SVC meta-predictor.

    The test asserts that:

    * cross-validated balanced accuracy exceeds 0.95,
    * SelectKBestScores reports support [2, 4],
    * the first index reported by the ensemble's support is 2 or 4,
    * balanced accuracy on shuffled labels stays below 0.55.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print scores and selected indices.
    seed : int, default=42
        Seed for the synthetic data, the random channels and the label
        shuffle.
    """
    # Draw the random channels and the shuffle permutation from a seeded
    # generator so that `seed` controls all data generated in this test
    # rather than only make_classification.
    rng = np.random.RandomState(seed)
    X_rand = rng.rand(500, 30)
    X_inf, y = make_classification(n_samples=500, n_features=30,
                                   n_informative=15, class_sep=3,
                                   random_state=seed)
    Xs = [X_rand, X_rand, X_inf, X_rand, X_inf, X_inf]

    clf = MultichannelPipeline(n_channels=6)
    clf.add_layer(SimpleImputer())
    clf.add_layer(StandardScaler())
    clf.add_layer(SelectPercentile(percentile=25))
    clf.add_layer(
        5, SelectKBestScores(feature_scorer=f_classif,
                             aggregator=np.mean, k=2))
    lr_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
    channel_ensemble = ChannelEnsemble(
        base_predictors=KNeighborsClassifier(),
        internal_cv=5,
        score_selector=RankScoreSelector(1))
    ensemble_cv = transform_wrappers.MultichannelCV(channel_ensemble)
    clf.add_layer(5, ensemble_cv, 1, lr_cv)
    clf.add_layer(MultichannelPredictor(SVC()))

    score = np.mean(
        cross_val_score(clf, Xs, y, scorer=balanced_accuracy_score))
    if verbose > 0:
        print('accuracy score: {}'.format(score))
    self.assertTrue(
        score > 0.95,
        'Accuracy score of {} did not exceed '
        'tolerance value of 95%'.format(score))

    clf.fit(Xs, y)

    # layer 3 holds the SelectKBestScores channel selector
    score_selector = transform_wrappers.unwrap_model(clf.get_model(3, 0))
    if verbose > 0:
        print('indices selected by SelectKBestScores: {}'.format(
            score_selector.get_support()))
        print('correct indices: [2, 4]')
    self.assertTrue(np.array_equal(score_selector.get_support(), [2, 4]),
                    'SelectKBestScores selected the wrong channels.')

    # layer 4 holds the cross-validated ChannelEnsemble.
    # NOTE(review): the messages below say "SelectKBestModels" although
    # the layer was built from ChannelEnsemble -- confirm the label.
    model_selector = transform_wrappers.unwrap_model(clf.get_model(4, 0))
    if verbose > 0:
        print('indices selected by SelectKBestModels: {}'.format(
            model_selector.get_support()))
        print('correct indices: [2, 4]')
    self.assertTrue(model_selector.get_support()[0] in [2, 4],
                    'SelectKBestModels selected the wrong model')

    # shuffle control: permuted labels should score near chance
    y_shuffled = y[rng.permutation(len(y))]
    score = np.mean(
        cross_val_score(clf, Xs, y_shuffled,
                        scorer=balanced_accuracy_score))
    if verbose > 0:
        print('shuffle control accuracy score: {}'.format(score))
    self.assertTrue(
        score < 0.55,
        'Accuracy score of shuffle control, {}, '
        'exceeded tolerance value of 55%'.format(score))
def test_multi_matrix_voting(self, verbose=0, seed=42):
    """
    Test KNN base regressors combined by mean and median voting.

    Two pipeline architectures are scored with 3-fold cross validation
    (explained variance) on synthetic data sets in which 0 to 5 of the 5
    input channels are informative and the remaining channels are random:

    1. StandardScaler -> SingleChannel(KNeighborsRegressor) ->
       MultichannelPredictor(AggregatingMetaRegressor)
    2. StandardScaler -> ChannelEnsemble(KNeighborsRegressor,
       AggregatingMetaRegressor)

    Each architecture is run with np.mean and with np.median as the
    aggregator.  For every combination the test asserts that the score
    obtained when every channel is informative exceeds 0.1, and that the
    Pearson correlation between the score and the number of informative
    channels exceeds 0.9.

    Parameters
    ----------
    verbose : int, default=0
        If greater than 0, print the score tables and correlations.
    seed : int, default=42
        Seed passed to make_multi_input_regression.
    """
    n_channels = 5
    n_informatives = list(range(n_channels + 1))
    rgr_params = {'n_samples': 500, 'n_features': 10, 'n_informative': 5}
    # thresholds are also formatted into the failure messages so that the
    # reported value always matches the value enforced
    min_explained_variance = 0.1
    min_linearity = 0.9

    def build_layered_pipeline(aggregator):
        # implementation 1: separate base-regressor and voting layers
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
        rgr = transform_wrappers.SingleChannel(
            KNeighborsRegressor(n_neighbors=20, weights='distance'))
        mrgr.add_layer(rgr, pipe_processes=n_cpus)
        mrgr.add_layer(
            MultichannelPredictor(AggregatingMetaRegressor(aggregator)))
        return mrgr

    def build_ensemble_pipeline(aggregator):
        # implementation 2: base regressor and voting in one ensemble
        mrgr = MultichannelPipeline(n_channels)
        mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
        base_rgr = KNeighborsRegressor(n_neighbors=20, weights='distance')
        mrgr.add_layer(
            ChannelEnsemble(base_rgr, AggregatingMetaRegressor(aggregator),
                            base_processes='max'))
        return mrgr

    for build_pipeline in (build_layered_pipeline,
                           build_ensemble_pipeline):
        mean_accuracies, median_accuracies = [], []
        # catch_warnings restores the previous filters on exit, so the
        # suppression covers both implementations and never leaks into
        # other tests.
        with warnings.catch_warnings():
            if n_cpus > 1:
                # shut off warnings because ray and redis generate
                # massive numbers
                warnings.filterwarnings("ignore")
            for i in n_informatives:
                Xs, y, _ = make_multi_input_regression(
                    n_informative_Xs=i,
                    n_weak_Xs=0,
                    n_random_Xs=n_channels - i,
                    weak_noise_sd=None,
                    seed=seed,
                    **rgr_params)
                for aggregator, accuracies in (
                        (np.mean, mean_accuracies),
                        (np.median, median_accuracies)):
                    split_accuracies = cross_val_score(
                        build_pipeline(aggregator), Xs, y,
                        scorer=explained_variance_score,
                        cv=3, n_processes=1)
                    accuracies.append(np.mean(split_accuracies))

        if verbose > 0:
            print('explained variance scores')
            print('informative Xs\t\t mean voting\t\t median voting')
            for row in zip(n_informatives, mean_accuracies,
                           median_accuracies):
                print('{}\t\t {}\t\t {}'.format(*row))

        mean_ev = mean_accuracies[-1]
        mean_linearity = pearsonr(mean_accuracies, n_informatives)[0]
        median_ev = median_accuracies[-1]
        median_linearity = pearsonr(median_accuracies, n_informatives)[0]

        if verbose > 0:
            print('mean voting pearsonr = {}'.format(mean_linearity))
            print('median voting pearsonr = {}'.format(median_linearity))

        self.assertTrue(
            mean_ev > min_explained_variance,
            'mean voting explained variance of {} is below '
            'acceptable threshold of {}'.format(
                mean_ev, min_explained_variance))
        self.assertTrue(
            mean_linearity > min_linearity,
            'mean voting linearity of {} below acceptable '
            'threshold of {} pearsonr'.format(
                mean_linearity, min_linearity))
        self.assertTrue(
            median_ev > min_explained_variance,
            'median voting explained variance of {} is below '
            'acceptable threshold of {}'.format(
                median_ev, min_explained_variance))
        self.assertTrue(
            median_linearity > min_linearity,
            'median voting linearity of {} below acceptable '
            'threshold of {} pearsonr'.format(
                median_linearity, min_linearity))