コード例 #1
0
 def test_multi_input_classification(self):
     """
     Check serial cross_val_score on a one-channel classifier pipeline.

     The pipeline wraps ``self.clf`` and is scored with predict_proba and
     roc_auc_score using a single process.  The resulting scores must equal
     ``self.cls_scores`` (called the sklearn control in the failure
     message) exactly.
     """
     pipeline = MultichannelPipeline(n_channels=1)
     pipeline.add_layer(self.clf)
     scoring_kwargs = dict(score_method='predict_proba',
                           scorer=roc_auc_score,
                           cv=self.cv,
                           n_processes=1)
     observed = pc_cross_validation.cross_val_score(
         pipeline, [self.X_cls], self.y_cls, **scoring_kwargs)
     scores_match = np.array_equal(self.cls_scores, observed)
     self.assertTrue(
         scores_match,
         'classifier scores from pipecaster.cross_validation.'
         'cross_val_score did not match sklearn control '
         '(multi input predictor)')
コード例 #2
0
    def test_multiprocessing_speedup(self, verbose=0):
        """
        Compare serial and parallel timings of cross_val_score.

        Does nothing unless more than one CPU is available (n_cpus > 1).
        A warning, not a test failure, is issued when the parallel run is
        not faster than the serial run.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the mean duration of one
            cross_val_score call for the serial and the parallel run.
        """
        if n_cpus <= 1:
            return

        n_repeats = 5
        setup_code = 'import pipecaster.cross_validation'
        stmt_template = ('pipecaster.cross_validation.cross_val_score('
                         'mclf, [X], y, cv = 5, n_processes = {})')

        # catch_warnings restores the previous warning filters on exit,
        # even when the timed code raises.
        with warnings.catch_warnings():
            # shut off warnings because ray and redis generate massive numbers
            warnings.filterwarnings("ignore")
            parallel.start_if_needed(n_cpus=n_cpus)
            X, y = self.X_cls, self.y_cls = make_classification(
                n_classes=2, n_samples=500, n_features=40,
                n_informative=20, random_state=test_seed)
            mclf = MultichannelPipeline(n_channels=1)
            mclf.add_layer(DummyClassifier(futile_cycles_fit=2000000,
                                           futile_cycles_pred=10))

            # Only these names are referenced by the timed statement.
            namespace = {'mclf': mclf, 'X': X, 'y': y}

            # timeit returns the total over all repetitions; divide so the
            # value matches the "mean time" label printed below.
            t_serial = timeit.timeit(setup=setup_code,
                                     stmt=stmt_template.format(1),
                                     globals=namespace,
                                     number=n_repeats) / n_repeats
            t_parallel = timeit.timeit(setup=setup_code,
                                       stmt=stmt_template.format(n_cpus),
                                       globals=namespace,
                                       number=n_repeats) / n_repeats

        if verbose > 0:
            print('serial run mean time = {} s'.format(t_serial))
            print('parallel run mean time = {} s'.format(t_parallel))

        if t_serial <= t_parallel:
            warnings.warn('multiple cpus detected, but parallel '
                          'cross_val_score not faster than serial, '
                          'possible problem with multiprocessing')
コード例 #3
0
 def test_multi_input_regression_parallel(self):
     """
     Check parallel cross_val_score on a one-channel regressor pipeline.

     Does nothing unless n_cpus > 1.  The scores computed with
     n_processes=n_cpus must equal ``self.rgr_scores`` exactly.
     """
     if n_cpus <= 1:
         return

     # catch_warnings restores the previous warning filters on exit, even
     # if the scoring call raises.
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore")
         parallel.start_if_needed(n_cpus=n_cpus)
         mrgr = MultichannelPipeline(n_channels=1)
         mrgr.add_layer(self.rgr)
         pc_scores = pc_cross_validation.cross_val_score(
             mrgr, [self.X_rgr], self.y_rgr,
             scorer=explained_variance_score,
             cv=self.cv, n_processes=n_cpus)

     self.assertTrue(
         np.array_equal(self.rgr_scores, pc_scores),
         'regressor scores from pipecaster.cross_validation.'
         'cross_val_score did not match sklearn control '
         '(multi input predictor)')
コード例 #4
0
    def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick real regressor over dummy and
        test performance.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the screen results of the fitted
            ensemble and the mean cross validation score.
        seed : int
            Seed for the synthetic data and for the order in which the
            candidate regressors are presented.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                               random_state=seed)

        base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
        base_regressors.append(LinearRegression())
        # Shuffle with a private seeded generator so the candidate order is
        # reproducible and the global random state is left untouched.
        random.Random(seed).shuffle(base_regressors)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_regressors, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        ensemble = mclf.get_model(1, 0)
        selected_model = ensemble.get_base_models()[0]
        selected_model = transform_wrappers.unwrap_model(selected_model)

        if verbose > 0:
            print(ensemble.get_screen_results())

        # assertIs reports the offending type when the check fails.
        self.assertIs(type(selected_model), LinearRegression,
                      'Ensemble failed to pick LinearRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertGreater(acc, 0.9, 'Accuracy tolerance failure.')
コード例 #5
0
    def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
        """
        Check ChannelEnsemble when it is built without a meta-predictor.

        One informative and four random input matrices are generated.  The
        fitted ensemble must select the channel whose type is
        'informative', and the mean cross validation score of the pipeline
        must exceed 0.10.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the mean cross validation score.
        seed : int
            Seed forwarded to make_multi_input_regression.
        """
        data_kwargs = dict(n_informative_Xs=1,
                           n_weak_Xs=0,
                           n_random_Xs=4,
                           weak_noise_sd=None,
                           seed=seed,
                           n_samples=500,
                           n_features=20,
                           n_informative=20)
        Xs, y, types = make_multi_input_regression(**data_kwargs)

        pipeline = MultichannelPipeline(n_channels=5)
        pipeline.add_layer(StandardScaler())
        channel_ensemble = ChannelEnsemble(
            LinearRegression(),
            internal_cv=5,
            score_selector=RankScoreSelector(k=1))
        pipeline.add_layer(channel_ensemble)
        pipeline.fit(Xs, y)

        # Layer 1 holds the ensemble; its first supported index identifies
        # the selected channel.
        fitted_ensemble = pipeline.get_model(1, 0)
        selected_channel = fitted_ensemble.get_support()[0]
        selected_type = types[selected_channel]

        self.assertTrue(selected_type == 'informative',
                        'Ensemble failed to pick informative channel')

        fold_scores = cross_val_score(pipeline, Xs, y)
        acc = np.mean(fold_scores)
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertTrue(acc > 0.10, 'Accuracy tolerance failure.')
コード例 #6
0
    def test_discrimination_cls(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick real classifier over dummy and
        test performance.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the mean cross validation score.
        seed : int
            Seed for the synthetic data and for the order in which the
            candidate classifiers are presented.
        """
        X, y = make_classification(n_samples=500, n_features=20,
                                   n_informative=15, class_sep=1,
                                   random_state=seed)

        base_classifiers = [DummyClassifier(strategy='stratified')
                            for _ in range(5)]
        base_classifiers.append(LogisticRegression())
        # Shuffle with a private seeded generator so the candidate order is
        # reproducible and the global random state is left untouched.
        random.Random(seed).shuffle(base_classifiers)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_classifiers, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        selected_model = mclf.get_model(1, 0).get_base_models()[0]
        selected_model = transform_wrappers.unwrap_model(selected_model)

        # assertIs reports the offending type when the check fails.
        self.assertIs(type(selected_model), LogisticRegression,
                      'Ensemble failed to pick LogisticRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertGreater(acc, 0.70, 'Accuracy tolerance failure.')
コード例 #7
0
 def test_multi_input_classification_parallel(self):
     """
     Check parallel cross_val_score on a one-channel classifier pipeline.

     Does nothing unless n_cpus > 1.  The pipeline wraps ``self.clf`` and
     is scored with predict_proba and roc_auc_score using n_cpus
     processes; the scores must equal ``self.cls_scores`` exactly.
     """
     if n_cpus <= 1:
         return

     # catch_warnings restores the previous warning filters on exit, even
     # if the scoring call raises.
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore")
         # NOTE(review): called without n_cpus here, unlike the other
         # parallel tests in this file -- confirm the default is intended.
         parallel.start_if_needed()
         mclf = MultichannelPipeline(n_channels=1)
         mclf.add_layer(self.clf)
         pc_scores = pc_cross_validation.cross_val_score(
             mclf, [self.X_cls], self.y_cls, score_method='predict_proba',
             scorer=roc_auc_score, cv=self.cv, n_processes=n_cpus)

     self.assertTrue(
         np.array_equal(self.cls_scores, pc_scores),
         'classifier scores from pipecaster.cross_validation.'
         'cross_val_score did not match sklearn control '
         '(multi input predictor)')
コード例 #8
0
 def test_multi_input_classification(self):
     """
     Check serial cross_val_predict on a one-channel classifier pipeline.

     The 'y_pred' values stored under the 'predict' key of the result must
     equal ``self.cls_predictions`` exactly.
     """
     pipeline = MultichannelPipeline(n_channels=1)
     pipeline.add_layer(self.clf)
     result = pc_cross_validation.cross_val_predict(
         pipeline, [self.X_cls], self.y_cls, cv=self.cv, n_processes=1)
     observed = result['predict']['y_pred']
     predictions_match = np.array_equal(self.cls_predictions, observed)
     self.assertTrue(
         predictions_match,
         'pipecaster predictions did not match sklearn control')
コード例 #9
0
 def test_multi_input_regression(self):
     """
     Check serial cross_val_predict on a one-channel regressor pipeline.

     The 'y_pred' values stored under the 'predict' key of the result must
     equal ``self.rgr_predictions`` exactly.
     """
     pipeline = MultichannelPipeline(n_channels=1)
     pipeline.add_layer(self.rgr)
     result = pc_cross_validation.cross_val_predict(
         pipeline, [self.X_rgr], self.y_rgr, cv=self.cv, n_processes=1)
     observed = result['predict']['y_pred']
     predictions_match = np.array_equal(self.rgr_predictions, observed)
     self.assertTrue(
         predictions_match,
         'pipecaster predictions did not match sklearn control')
コード例 #10
0
 def test_multi_input_classification_parallel(self):
     """
     Check parallel cross_val_predict on a one-channel classifier pipeline.

     Does nothing unless n_cpus > 1.  The 'y_pred' values stored under the
     'predict' key of the result, computed with n_cpus processes, must
     equal ``self.cls_predictions`` exactly.
     """
     if n_cpus <= 1:
         return

     # catch_warnings restores the previous warning filters on exit, even
     # if the prediction call raises.
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore")
         parallel.start_if_needed(n_cpus=n_cpus)
         mclf = MultichannelPipeline(n_channels=1)
         mclf.add_layer(self.clf)
         pc_predictions = pc_cross_validation.cross_val_predict(
             mclf, [self.X_cls], self.y_cls, cv=self.cv, n_processes=n_cpus)

     self.assertTrue(
         np.array_equal(self.cls_predictions,
                        pc_predictions['predict']['y_pred']),
         'pipecaster predictions did not match sklearn control')
コード例 #11
0
    def test_throttled_multiprocessing_speedup(self, verbose=0):
        """
        Compare serial and parallel timings of cross_val_predict when one
        CPU is left unused.

        Both runs use cv = n_cpus - 1; the parallel run also uses
        n_processes = n_cpus - 1.  A warning, not a test failure, is issued
        when the parallel run is not faster than the serial run.

        Does nothing unless n_cpus > 2: with two CPUs the "parallel" run
        would use a single process, making it identical to the serial run.
        NOTE(review): cv would also be 1 in that case, which is presumably
        not a usable number of folds -- confirm against cross_val_predict.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the CPU and job counts and the
            total duration of each timed run.
        """
        if n_cpus <= 2:
            return

        n_jobs = n_cpus - 1
        n_repeats = 5
        setup_code = 'import pipecaster.cross_validation'
        stmt_template = ('pipecaster.cross_validation.cross_val_predict('
                         'mclf, [X], y, cv = {}, n_processes = {})')

        # catch_warnings restores the previous warning filters on exit,
        # even when the timed code raises.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            parallel.start_if_needed(n_cpus=n_cpus)
            X, y = self.X_cls, self.y_cls = make_classification(
                n_classes=2,
                n_samples=500,
                n_features=40,
                n_informative=20,
                random_state=test_seed)
            mclf = MultichannelPipeline(n_channels=1)
            mclf.add_layer(
                DummyClassifier(futile_cycles_fit=2000000,
                                futile_cycles_pred=10))

            # Only these names are referenced by the timed statement.
            namespace = {'mclf': mclf, 'X': X, 'y': y}

            t_serial = timeit.timeit(setup=setup_code,
                                     stmt=stmt_template.format(n_jobs, 1),
                                     globals=namespace,
                                     number=n_repeats)
            t_parallel = timeit.timeit(setup=setup_code,
                                       stmt=stmt_template.format(n_jobs,
                                                                 n_jobs),
                                       globals=namespace,
                                       number=n_repeats)

        if verbose > 0:
            print('number of CPUs detected: {}; parallel jobs requested: {}'.
                  format(n_cpus, n_jobs))
            print('duration of serial cross cross_val_predict task: {} s'.
                  format(t_serial))
            print(
                'duration of parallel cross cross_val_predict task (ray pool.starmap): {} s'
                .format(t_parallel))

        if t_serial <= t_parallel:
            warnings.warn(
                'multiple cpus detected, but parallel cross_val_predict not faster than serial using ray.multiprocessing.starmap(), possible problem with multiprocessing'
            )
コード例 #12
0
 def test_multi_input_regression_parallel_starmap(self):
     """
     Check parallel cross_val_predict on a one-channel regressor pipeline
     when one CPU is left unused.

     Does nothing unless n_cpus > 2.  The 'y_pred' values stored under the
     'predict' key of the result, computed with n_cpus - 1 processes, must
     equal ``self.rgr_predictions`` exactly.
     """
     if n_cpus <= 2:
         return

     # catch_warnings restores the previous warning filters on exit, even
     # if the prediction call raises.
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore")
         parallel.start_if_needed(n_cpus=n_cpus)
         mrgr = MultichannelPipeline(n_channels=1)
         mrgr.add_layer(self.rgr)
         pc_predictions = pc_cross_validation.cross_val_predict(
             mrgr, [self.X_rgr],
             self.y_rgr,
             cv=self.cv,
             n_processes=n_cpus - 1)

     self.assertTrue(
         np.array_equal(self.rgr_predictions,
                        pc_predictions['predict']['y_pred']),
         'pipecaster predictions did not match sklearn '
         'control')
コード例 #13
0
    def _select_synthetic_regression(channel_selector,
                                     n_informative_Xs=5,
                                     n_weak_Xs=0,
                                     n_random_Xs=0,
                                     weak_noise_sd=None,
                                     verbose=0,
                                     seed=None,
                                     **rgr_params):
        """
        Run a channel selector on synthetic multi-input regression data and
        count how the channels of each type were handled.

        The data come from make_multi_input_regression.  A pipeline of
        StandardScaler followed by ``channel_selector`` is fit and applied
        with fit_transform; a channel counts as selected when its
        transformed output is not None.

        Parameters
        ----------
        channel_selector
            Object added as the second pipeline layer.
        n_informative_Xs, n_weak_Xs, n_random_Xs, weak_noise_sd, seed
            Forwarded positionally, in that order, to
            make_multi_input_regression.
        verbose : int
            When greater than 0, print the three counts.
        **rgr_params
            Extra keyword arguments for make_multi_input_regression.

        Returns
        -------
        tuple of int
            (informative channels selected, random channels not selected,
            weak channels selected).
        """
        n_Xs = n_informative_Xs + n_weak_Xs + n_random_Xs
        Xs, y, X_types = make_multi_input_regression(
            n_informative_Xs, n_weak_Xs, n_random_Xs, weak_noise_sd, seed,
            **rgr_params)

        pipeline = MultichannelPipeline(n_channels=n_Xs)
        pipeline.add_layer(StandardScaler())
        pipeline.add_layer(channel_selector)
        transformed = pipeline.fit_transform(Xs, y)

        # Pair each channel's "was it kept" flag with its declared type.
        outcomes = list(zip((X_t is not None for X_t in transformed),
                            X_types))
        n_informative_hits = sum(
            1 for kept, kind in outcomes if kept and kind == 'informative')
        n_random_hits = sum(
            1 for kept, kind in outcomes if not kept and kind == 'random')
        n_weak_hits = sum(
            1 for kept, kind in outcomes if kept and kind == 'weak')

        if verbose > 0:
            n_random_total = n_Xs - n_informative_Xs - n_weak_Xs
            print('InputSelector selected {} out of {} informative inputs'.
                  format(n_informative_hits, n_informative_Xs))
            print('InputSelector filtered out {} out of {} random inputs'.
                  format(n_random_hits, n_random_total))
            print(
                'InputSelector selected out {} out of {} weakly informative inputs'
                .format(n_weak_hits, n_weak_Xs))

        return n_informative_hits, n_random_hits, n_weak_hits
コード例 #14
0
    def test_soft_voting_decision(self, verbose=0, seed=42):
        """
        Check a nine-channel pipeline that stacks SoftVotingDecision under a
        gradient boosting meta-classifier.

        Layers, in order: StandardScaler, SVC wrapped by make_transformer
        with transform_method='decision_function', SoftVotingDecision, and
        MultichannelPredictor(GradientBoostingClassifier()).  The mean
        balanced accuracy from cross_val_score must exceed 0.85.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the mean score.
        seed : int
            Passed as random_state to make_multi_input_classification.
        """
        Xs, y, _ = make_multi_input_classification(n_informative_Xs=6,
                                                   n_random_Xs=3,
                                                   random_state=seed)

        pipeline = MultichannelPipeline(n_channels=9)
        pipeline.add_layer(StandardScaler())
        pipeline.add_layer(
            make_transformer(SVC(), transform_method='decision_function'))

        # The same voter instance is passed for each of the three groups.
        # NOTE(review): each 3 presumably gives the number of channels the
        # following model spans -- confirm against add_layer.
        voter = SoftVotingDecision()
        pipeline.add_layer(3, voter, 3, voter, 3, voter)

        meta_predictor = MultichannelPredictor(GradientBoostingClassifier())
        pipeline.add_layer(meta_predictor)

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      score_method='predict',
                                      scorer=balanced_accuracy_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy = {}'.format(score))

        self.assertTrue(score > 0.85)
コード例 #15
0
    def test_soft_voting(self, verbose=0, seed=42):
        """
        Check SoftVotingClassifier on top of per-channel KNN classifiers.

        Seven channels (five informative, two random) pass through
        StandardScaler, a KNeighborsClassifier wrapped in SingleChannel, and
        SoftVotingClassifier.  The mean balanced accuracy from
        cross_val_score must exceed 0.80.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the mean score.
        seed : int
            Passed as random_state to make_multi_input_classification.
        """
        Xs, y, _ = make_multi_input_classification(
            n_informative_Xs=5, n_random_Xs=2, random_state=seed)

        pipeline = MultichannelPipeline(n_channels=7)
        pipeline.add_layer(StandardScaler())
        knn_layer = transform_wrappers.SingleChannel(KNeighborsClassifier())
        pipeline.add_layer(knn_layer)
        pipeline.add_layer(SoftVotingClassifier())

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      score_method='predict',
                                      scorer=balanced_accuracy_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy = {}'.format(score))

        self.assertTrue(score > 0.80)
コード例 #16
0
    def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Determine if Ensemble can discriminate between DummyRegressor and
        LinearRegression base regressors.

        Five of each are shuffled together and given to an Ensemble with an
        SVR meta-predictor and RankScoreSelector(k=5).  The indices reported
        by get_support() must be exactly those of the LinearRegression
        instances.

        Parameters
        ----------
        verbose : int
            When greater than 0, print how many selected regressors were
            LinearRegression and how many were DummyRegressor.
        seed : int
            Seed for the synthetic data and for the order in which the
            candidate regressors are presented.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                               random_state=seed)

        base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
        base_regressors.extend(LinearRegression() for _ in range(5))
        # Shuffle with a private seeded generator so the candidate order is
        # reproducible and the global random state is left untouched.
        random.Random(seed).shuffle(base_regressors)
        informative_mask = [type(c) is LinearRegression
                            for c in base_regressors]

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_regressors, SVR(), internal_cv=5,
                                score_selector=RankScoreSelector(k=5)))
        mclf.fit([X], y)

        selected_indices = mclf.get_model(layer_index=1,
                                          model_index=0).get_support()
        selection_mask = [i in selected_indices
                          for i in range(len(base_regressors))]

        if verbose > 0:
            pairs = list(zip(informative_mask, selection_mask))
            n_correct = sum(1 for is_lr, picked in pairs
                            if is_lr and picked)
            # Count the selected dummies directly rather than assuming that
            # exactly five models were selected.
            n_incorrect = sum(1 for is_lr, picked in pairs
                              if picked and not is_lr)
            print('\n\ncorrectly selected {}/5 LinearRegression regressors'
                  .format(n_correct))
            print('incorrectly selected {}/5 DummyRegressors\n\n'
                  .format(n_incorrect))

        self.assertTrue(np.array_equal(selection_mask, informative_mask),
                        'Ensemble failed to discriminate between dummy '
                        'regressors and LinearRegression')
コード例 #17
0
    def test_aggregating_regressor(self, verbose=0, seed=42):
        """
        Check AggregatingRegressor(np.mean) on three informative channels.

        Each channel is fed to a GradientBoostingRegressor wrapped by
        make_transformer; the outputs are combined by the aggregating
        regressor.  The mean explained variance from cross_val_score must
        exceed 0.3.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the mean score.
        seed : int
            Passed as random_state to make_multi_input_regression.
        """
        Xs, y, _ = make_multi_input_regression(n_informative_Xs=3,
                                               random_state=seed)

        pipeline = MultichannelPipeline(n_channels=3)
        booster = GradientBoostingRegressor(n_estimators=50)
        pipeline.add_layer(make_transformer(booster))
        pipeline.add_layer(AggregatingRegressor(np.mean))

        # Result intentionally unused.
        # NOTE(review): presumably a smoke test of the default scoring
        # path with cv=3 -- confirm this call is still wanted.
        cross_val_score(pipeline, Xs, y, cv=3)

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      score_method='predict',
                                      scorer=explained_variance_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy = {}'.format(score))

        self.assertTrue(score > 0.3)
コード例 #18
0
 def test_multi_input_regression(self):
     """
     Check serial cross_val_score on a one-channel regressor pipeline.

     The pipeline wraps ``self.rgr`` and is scored with
     explained_variance_score using a single process.  The resulting
     scores must equal ``self.rgr_scores`` (called the sklearn control in
     the failure message) exactly.
     """
     pipeline = MultichannelPipeline(n_channels=1)
     pipeline.add_layer(self.rgr)
     scoring_kwargs = dict(scorer=explained_variance_score,
                           cv=self.cv,
                           n_processes=1)
     observed = pc_cross_validation.cross_val_score(
         pipeline, [self.X_rgr], self.y_rgr, **scoring_kwargs)
     scores_match = np.array_equal(self.rgr_scores, observed)
     self.assertTrue(
         scores_match,
         'regressor scores from pipecaster.cross_validation.'
         'cross_val_score did not match sklearn control '
         '(multi input predictor)')
コード例 #19
0
    def test_architecture_01(self, verbose=0, seed=42):
        """
        Test the accuracy and hygiene (shuffle control) of a complex
        pipeline with feature selection, matrix selection, model selection,
        and model stacking.

        Six channels are used: indices 2, 4 and 5 hold the informative
        matrix and indices 0, 1 and 3 hold the same random matrix.  The
        test checks, in order: cross validated balanced accuracy above
        0.95, the channels kept by SelectKBestScores, the channel kept by
        the ChannelEnsemble, and balanced accuracy below 0.55 when the
        labels are randomly permuted.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the scores and selected indices.
        seed : int
            Passed as random_state to make_classification.  The random
            matrix and the label permutation use numpy's global generator.
        """
        n_samples, n_features = 500, 30
        X_rand = np.random.rand(n_samples, n_features)
        X_inf, y = make_classification(n_samples=n_samples,
                                       n_features=n_features,
                                       n_informative=15,
                                       class_sep=3,
                                       random_state=seed)
        Xs = [X_rand, X_rand, X_inf, X_rand, X_inf, X_inf]

        pipeline = MultichannelPipeline(n_channels=6)
        for preprocessor in (SimpleImputer(), StandardScaler(),
                             SelectPercentile(percentile=25)):
            pipeline.add_layer(preprocessor)

        # Layer 3: channel selection across the first five channels.
        channel_scorer = SelectKBestScores(feature_scorer=f_classif,
                                           aggregator=np.mean,
                                           k=2)
        pipeline.add_layer(5, channel_scorer)

        # Layer 4: model selection on five channels, logistic regression on
        # the remaining one.
        LR_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
        channel_ensemble = ChannelEnsemble(
            base_predictors=KNeighborsClassifier(),
            internal_cv=5,
            score_selector=RankScoreSelector(1))
        CE_cv = transform_wrappers.MultichannelCV(channel_ensemble)
        pipeline.add_layer(5, CE_cv, 1, LR_cv)

        # Layer 5: stacking meta-classifier.
        pipeline.add_layer(MultichannelPredictor(SVC()))

        fold_scores = cross_val_score(pipeline, Xs, y,
                                      scorer=balanced_accuracy_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('accuracy score: {}'.format(score))
        accuracy_message = ('Accuracy score of {} did not exceed '
                            'tolerance value of 95%'.format(score))
        self.assertTrue(score > 0.95, accuracy_message)

        pipeline.fit(Xs, y)

        score_selector = transform_wrappers.unwrap_model(
            pipeline.get_model(3, 0))
        if verbose > 0:
            print('indices selected by SelectKBestScores: {}'.format(
                score_selector.get_support()))
            print('correct indices: [2, 4]')
        channels_ok = np.array_equal(score_selector.get_support(), [2, 4])
        self.assertTrue(channels_ok,
                        'SelectKBestScores selected the wrong channels.')

        model_selector = transform_wrappers.unwrap_model(
            pipeline.get_model(4, 0))
        if verbose > 0:
            print('indices selected by SelectKBestModels: {}'.format(
                model_selector.get_support()))
            print('correct indices: [2, 4]')
        model_ok = model_selector.get_support()[0] in (2, 4)
        self.assertTrue(model_ok,
                        'SelectKBestModels selected the wrong model')

        # Shuffle control: permuted labels should give near-chance accuracy.
        shuffled_y = y[np.random.permutation(len(y))]
        fold_scores = cross_val_score(pipeline, Xs, shuffled_y,
                                      scorer=balanced_accuracy_score)
        score = np.mean(fold_scores)
        if verbose > 0:
            print('shuffle control accuracy score: {}'.format(score))
        shuffle_message = ('Accuracy score of shuffle control, {}, '
                           'exceeded tolerance value of 55%'.format(score))
        self.assertTrue(score < 0.55, shuffle_message)
コード例 #20
0
    def test_multi_matrix_voting(self, verbose=0, seed=42):
        """
        Determine if KNN->ChannelRegressor(voting) in a MultichannelPipeline
        gives monotonically increasing accuracy with increasing number of
        inputs and exceeds an accuracy cutoff.

        Two equivalent pipeline constructions are checked in turn:

        1. SingleChannel(KNeighborsRegressor) followed by
           MultichannelPredictor(AggregatingMetaRegressor(aggregator)).
        2. ChannelEnsemble(KNeighborsRegressor,
           AggregatingMetaRegressor(aggregator)).

        Each is scored with np.mean and np.median aggregation on data with
        0 to 5 informative channels out of 5.  For every combination the
        explained variance with all channels informative must exceed 0.1,
        and the Pearson correlation between score and number of informative
        channels must exceed 0.9.

        Parameters
        ----------
        verbose : int
            When greater than 0, print the score table and correlations.
        seed : int
            Seed forwarded to make_multi_input_regression.
        """
        n_channels = 5
        rgr_params = {'n_samples': 500, 'n_features': 10, 'n_informative': 5}
        n_informatives = list(range(0, n_channels + 1))
        min_explained_variance = 0.1
        min_linearity = 0.9
        aggregators = (('mean', np.mean), ('median', np.median))

        def make_knn():
            # A fresh regressor for every pipeline that is built.
            return KNeighborsRegressor(n_neighbors=20, weights='distance')

        def build_stacked(aggregator):
            # implementation 1: per-channel KNN, then a voting predictor
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
            mrgr.add_layer(transform_wrappers.SingleChannel(make_knn()),
                           pipe_processes=n_cpus)
            mrgr.add_layer(
                MultichannelPredictor(AggregatingMetaRegressor(aggregator)))
            return mrgr

        def build_ensemble(aggregator):
            # implementation 2: the same model expressed as ChannelEnsemble
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(StandardScaler(), pipe_processes=n_cpus)
            mrgr.add_layer(
                ChannelEnsemble(make_knn(),
                                AggregatingMetaRegressor(aggregator),
                                base_processes='max'))
            return mrgr

        def score_implementation(build_pipeline):
            # Mean CV score per aggregator for each informative-channel
            # count, returned as {label: [score, ...]}.
            scores = {label: [] for label, _ in aggregators}
            # catch_warnings restores the previous filters on exit, even
            # when scoring raises.
            with warnings.catch_warnings():
                if n_cpus > 1:
                    # shut off warnings because ray and redis generate
                    # massive numbers
                    warnings.filterwarnings("ignore")
                for n_inf in n_informatives:
                    Xs, y, _ = make_multi_input_regression(
                        n_informative_Xs=n_inf,
                        n_weak_Xs=0,
                        n_random_Xs=n_channels - n_inf,
                        weak_noise_sd=None,
                        seed=seed,
                        **rgr_params)
                    for label, aggregator in aggregators:
                        split_accuracies = cross_val_score(
                            build_pipeline(aggregator),
                            Xs,
                            y,
                            scorer=explained_variance_score,
                            cv=3,
                            n_processes=1)
                        scores[label].append(np.mean(split_accuracies))
            return scores

        def check_implementation(scores):
            # Report (when verbose) and assert on one implementation.
            if verbose > 0:
                print('explained variance scores')
                print('informative Xs\t\t mean voting\t\t median voting')
                for n_informative, mean_ev, median_ev in zip(
                        n_informatives, scores['mean'], scores['median']):
                    print('{}\t\t {}\t\t {}'.format(n_informative, mean_ev,
                                                    median_ev))

            linearities = {
                label: pearsonr(scores[label], n_informatives)[0]
                for label, _ in aggregators
            }
            if verbose > 0:
                for label, _ in aggregators:
                    print('{} voting pearsonr = {}'.format(
                        label, linearities[label]))

            for label, _ in aggregators:
                # Score with every channel informative.
                final_ev = scores[label][-1]
                self.assertGreater(
                    final_ev, min_explained_variance,
                    '{} voting explained variance of {} is below '
                    'acceptable threshold of {}'.format(
                        label, final_ev, min_explained_variance))
                self.assertGreater(
                    linearities[label], min_linearity,
                    '{} voting linearity of {} below acceptable '
                    'threshold of {} pearsonr'.format(
                        label, linearities[label], min_linearity))

        for build_pipeline in (build_stacked, build_ensemble):
            check_implementation(score_implementation(build_pipeline))
コード例 #21
0
    def test_multi_matrices_svm_metaclassifier(self, seed=42, verbose=0):
        """
        Test SVC meta-classification on top of per-channel KNN classifiers.

        Synthetic 5-channel datasets are generated with 0 to 5 informative
        channels (the remaining channels are random) and two pipeline
        constructions are scored with
        cross_val_score(scorer=roc_auc_score, cv=3):

        1. StandardScaler -> SingleChannel(KNN) -> MultichannelPredictor(SVC)
        2. StandardScaler -> ChannelEnsemble(KNN, SVC, internal_cv=5)

        Each construction must reach a mean score > 0.75 when all channels
        are informative, and a Pearson correlation > 0.75 between the mean
        score and the number of informative channels.

        Parameters
        ----------
        seed : int
            Seed passed to make_multi_input_classification.
        verbose : int
            If > 0, print the mean score obtained for each dataset.
        """
        n_channels = 5
        n_informative = range(0, n_channels + 1)

        sklearn_params = {
            'n_classes': 2,
            'n_samples': 500,
            'n_features': 100,
            'n_informative': 5,
            'n_redundant': 10,
            'n_repeated': 5,
            'class_sep': 1.0
        }

        def stacked_pipeline():
            # Layer-by-layer build: scaler, wrapped KNN, then SVC predictor.
            mclf = MultichannelPipeline(n_channels)
            mclf.add_layer(StandardScaler(), pipe_processes='max')
            clf = transform_wrappers.SingleChannel(
                KNeighborsClassifier(n_neighbors=5, weights='uniform'))
            mclf.add_layer(clf, pipe_processes='max')
            mclf.add_layer(MultichannelPredictor(SVC()))
            return mclf

        def ensemble_pipeline():
            # Same model expressed with a single ChannelEnsemble layer.
            mclf = MultichannelPipeline(n_channels)
            mclf.add_layer(StandardScaler(), pipe_processes='max')
            base_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform')
            mclf.add_layer(
                ChannelEnsemble(base_clf,
                                SVC(),
                                internal_cv=5,
                                base_processes='max'))
            return mclf

        implementations = (('implementation 1', stacked_pipeline),
                           ('implementation 2', ensemble_pipeline))

        for label, make_pipeline in implementations:
            # Suppression is (re)applied for every implementation because
            # resetwarnings() below clears it again.
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")
            try:
                accuracies = []
                for i in n_informative:
                    Xs, y, _ = make_multi_input_classification(
                        n_informative_Xs=i,
                        n_weak_Xs=0,
                        n_random_Xs=n_channels - i,
                        weak_noise_sd=None,
                        seed=seed,
                        **sklearn_params)
                    split_accuracies = cross_val_score(make_pipeline(),
                                                       Xs,
                                                       y,
                                                       scorer=roc_auc_score,
                                                       cv=3,
                                                       n_processes=1)
                    accuracies.append(np.mean(split_accuracies))
            finally:
                if n_cpus > 1:
                    warnings.resetwarnings()

            if verbose > 0:
                print('SVC meta-classification results:')
                print('n_informative, accuracy')
                for i, accuracy in zip(n_informative, accuracies):
                    print(i, accuracy)

            self.assertTrue(
                accuracies[-1] > 0.75,
                '{}: SVC metaclassification accuracy of {} below '
                'acceptable threshold of 0.75'.format(label, accuracies[-1]))
            linearity = pearsonr(accuracies, n_informative)[0]
            self.assertTrue(
                linearity > 0.75,
                '{}: SVC metaclassification linearity of {} below '
                'acceptable threshold of 0.75 pearsonr'.format(
                    label, linearity))
コード例 #22
0
    def test_compare_to_StackingRegressor(self, verbose=0, seed=42):
        """
        Compare Ensemble (with dummy regressors) against StackingRegressor.

        Three real regressors are mixed with 100 DummyRegressor instances and
        given to an Ensemble that uses RankScoreSelector(k=3). The test
        passes when:

        * the fitted Ensemble keeps exactly 3 base models, none of which is
          a DummyRegressor;
        * the cross-validated score with the dummies equals, to 2 decimals,
          the score of an Ensemble built from the real regressors only;
        * the score with the dummies is within 5% of a scikit-learn
          StackingRegressor built from the real regressors only.

        Parameters
        ----------
        verbose : int
            If > 0, also score a StackingRegressor on all regressors and
            print every score.
        seed : int
            Seed for the synthetic data, the random forest and the order in
            which the regressors are shuffled.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                               random_state=seed)

        regressors = [LinearRegression(),
                      KNeighborsRegressor(),
                      RandomForestRegressor(random_state=seed)]
        dummy_regressors = [DummyRegressor(strategy='mean')
                            for _ in range(100)]
        all_regressors = regressors + dummy_regressors
        # Shuffle with a seeded generator so the test is reproducible and
        # the global random state is left untouched.
        order = np.random.RandomState(seed).permutation(len(all_regressors))
        all_regressors = [all_regressors[j] for j in order]

        def build_ensemble_pipeline(base_models):
            # StandardScaler followed by a rank-selected SVR-stacked Ensemble.
            pipeline = MultichannelPipeline(n_channels=1)
            pipeline.add_layer(StandardScaler())
            pipeline.add_layer(
                Ensemble(base_models, SVR(), internal_cv=5,
                         score_selector=RankScoreSelector(k=3)))
            return pipeline

        def build_stacking_regressor(base_models):
            # scikit-learn control with the same meta-regressor.
            named_models = [(str(i), c) for i, c in enumerate(base_models)]
            return StackingRegressor(named_models, SVR(),
                                     cv=KFold(n_splits=3))

        mclf = build_ensemble_pipeline(all_regressors)
        pc_score_all = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        mclf.fit([X], y)
        selected_regressors = mclf.get_model(1, 0).get_base_models()
        self.assertEqual(
            len(selected_regressors), 3,
            'Ensemble picked {} regressors instead of 3.'.format(
                len(selected_regressors)))
        self.assertNotIn(
            DummyRegressor, [c.__class__ for c in selected_regressors],
            'Ensemble chose a dummy regressor over a real one')

        mclf = build_ensemble_pipeline(regressors)
        pc_score_informative = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        clf = build_stacking_regressor(regressors)
        sk_score_informative = np.mean(
            cross_val_score(clf, X, y, cv=5, n_processes=5))

        if verbose > 0:
            clf = build_stacking_regressor(all_regressors)
            sk_score_all = np.mean(
                cross_val_score(clf, X, y, cv=5, n_processes=5))
            print('\nExplained variance scores')
            print('Ensemble informative predictors: {}'.format(
                pc_score_informative))
            print('Ensemble all predictors: {}'.format(pc_score_all))
            print('StackingRegressor informative predictors: {}'.format(
                sk_score_informative))
            print('StackingRegressor all predictors: {}'.format(
                sk_score_all))

        self.assertTrue(
            np.round(pc_score_all, 2) == np.round(pc_score_informative, 2),
            'Ensemble accuracy is not same for all regressors and '
            'informative regressors.')
        tolerance_pct = 5
        self.assertTrue(
            pc_score_all >= sk_score_informative * (1 - tolerance_pct / 100.0),
            'Ensemble with dummy regressors did not perform within accepted '
            'tolerance of StackingRegressor with no dummy regressors.')
コード例 #23
0
    def test_add_layer_interface_mapping(self, verbose=0, seed=42):
        """
        Functional test of the MultichannelPipeline channel mapping interface.

        Synthetic 5-channel regression datasets are generated with 0 to 5
        informative channels (the rest random). Two pipelines are built with
        the add_layer(2, pipe, 3, pipe) call form for the scaling layer and
        scored with cross_val_score(scorer=explained_variance_score, cv=3):

        1. mapped StandardScaler -> mapped SingleChannelCV(LinearRegression)
           -> add_layer(5, MultichannelPredictor(SVR()))
        2. mapped StandardScaler -> ChannelEnsemble of 5 LinearRegression
           base models with an SVR meta-regressor (internal_cv=5)

        Each pipeline must give an explained variance > 0.1 when all
        channels are informative and a Pearson correlation > 0.0 between
        score and number of informative channels.

        Parameters
        ----------
        verbose : int
            If > 0, print the scores and the Pearson correlation.
        seed : int
            Seed passed to make_multi_input_regression.
        """
        n_channels = 5
        n_informatives = range(0, n_channels + 1)

        # NOTE(review): these thresholds are very permissive; confirm that
        # they are intentional.
        min_ev = 0.1
        min_linearity = 0.0

        rgr_params = {'n_samples': 1000, 'n_features': 10, 'n_informative': 10}

        def stacked_pipeline():
            # Every layer is added through the (count, pipe, ...) call form.
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(2,
                           StandardScaler(),
                           3,
                           StandardScaler(),
                           pipe_processes=n_cpus)
            base_rgr = transform_wrappers.SingleChannelCV(LinearRegression())
            mrgr.add_layer(2, base_rgr, 3, base_rgr, pipe_processes=n_cpus)
            mrgr.add_layer(5, MultichannelPredictor(SVR()))
            return mrgr

        def ensemble_pipeline():
            # Mapped scaling layer followed by one ChannelEnsemble layer.
            mrgr = MultichannelPipeline(n_channels)
            mrgr.add_layer(2,
                           StandardScaler(),
                           3,
                           StandardScaler(),
                           pipe_processes='max')
            base_rgrs = [LinearRegression() for _ in range(2 + 3)]
            mrgr.add_layer(
                ChannelEnsemble(base_rgrs,
                                SVR(),
                                base_processes='max',
                                internal_cv=5))
            return mrgr

        implementations = (('implementation 1', stacked_pipeline),
                           ('implementation 2', ensemble_pipeline))

        for label, make_pipeline in implementations:
            # Suppression is (re)applied for every implementation because
            # resetwarnings() below clears it again.
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")
            try:
                accuracies = []
                for i in n_informatives:
                    Xs, y, _ = make_multi_input_regression(
                        n_informative_Xs=i,
                        n_weak_Xs=0,
                        n_random_Xs=n_channels - i,
                        weak_noise_sd=None,
                        seed=seed,
                        **rgr_params)
                    split_accuracies = cross_val_score(
                        make_pipeline(),
                        Xs,
                        y,
                        scorer=explained_variance_score,
                        cv=3,
                        n_processes=1)
                    accuracies.append(np.mean(split_accuracies))
            finally:
                if n_cpus > 1:
                    warnings.resetwarnings()

            final_ev = accuracies[-1]
            linearity = pearsonr(accuracies, n_informatives)[0]

            if verbose > 0:
                print('explained variance scores')
                print('informative Xs\t\t svr stacking')
                for n_informative, ev in zip(n_informatives, accuracies):
                    print('{}\t\t {}'.format(n_informative, ev))
                print('SVR stacking pearsonr = {}'.format(linearity))

            self.assertTrue(
                final_ev > min_ev,
                '{}: SVR stacking explained variance of {} is below '
                'acceptable threshold of {}'.format(label, final_ev, min_ev))
            self.assertTrue(
                linearity > min_linearity,
                '{}: SVR stacking linearity of {} below acceptable '
                'threshold of {} pearsonr'.format(label, linearity,
                                                  min_linearity))
コード例 #24
0
    def test_single_matrix_hard_voting(self):
        """
        Verify that hard voting over a single KNN channel reproduces the
        training-set predictions of a plain sklearn KNN classifier, for both
        the layer-by-layer build and the ChannelEnsemble build.
        """
        knn_kwargs = {'n_neighbors': 5, 'weights': 'uniform'}
        failure_msg = ('hard voting metaclassifier did not reproduce sklearn '
                       'result on single matrix prediction task')

        X, y = make_classification(n_samples=100,
                                   n_features=20,
                                   n_informative=10,
                                   class_sep=5,
                                   random_state=42)

        # reference: sklearn KNN fit and evaluated on the same data
        reference = KNeighborsClassifier(**knn_kwargs)
        reference.fit(X, y)
        expected = reference.predict(X)

        # build 1: wrapped KNN layer followed by a hard voting predictor
        stacked = MultichannelPipeline(n_channels=1)
        wrapped_knn = transform_wrappers.SingleChannel(
            KNeighborsClassifier(**knn_kwargs), transform_method='predict')
        stacked.add_layer(wrapped_knn)
        stacked.add_layer(MultichannelPredictor(HardVotingMetaClassifier()))
        stacked.fit([X], y)
        observed = stacked.predict([X])
        self.assertTrue(np.array_equal(expected, observed), failure_msg)

        # build 2: the same model as a single ChannelEnsemble layer
        ensembled = MultichannelPipeline(n_channels=1)
        ensemble_layer = ChannelEnsemble(KNeighborsClassifier(**knn_kwargs),
                                         HardVotingMetaClassifier(),
                                         base_transform_methods='predict')
        ensembled.add_layer(ensemble_layer)
        ensembled.fit([X], y)
        observed = ensembled.predict([X])
        self.assertTrue(np.array_equal(expected, observed), failure_msg)
コード例 #25
0
    def test_multi_matrix_voting(self, verbose=0):
        """
        Test soft and hard voting over per-channel KNN classifiers.

        Synthetic 5-channel datasets are generated with 0 to 5 informative
        channels (the rest random). For each dataset a soft voting and a hard
        voting pipeline are scored with
        cross_val_score(scorer=roc_auc_score, cv=3). This is done for two
        sets of pipeline constructions ("implementations").

        For both voting types and both implementations the test requires a
        mean score > 0.80 when all channels are informative and a Pearson
        correlation > 0.80 between mean score and number of informative
        channels.

        Parameters
        ----------
        verbose : int
            If > 0, print the mean score obtained for each dataset.
        """
        n_channels = 5
        n_informative = range(0, n_channels + 1)

        sklearn_params = {
            'n_classes': 2,
            'n_samples': 500,
            'n_features': 100,
            'n_informative': 30,
            'n_redundant': 0,
            'n_repeated': 0,
            'class_sep': 3.0
        }

        def knn():
            # Fresh, unfitted KNN base classifier.
            return KNeighborsClassifier(n_neighbors=5, weights='uniform')

        def stacked(meta_clf, processes, **wrapper_kwargs):
            # Layer-by-layer build: scaler, wrapped KNN, voting predictor.
            mclf = MultichannelPipeline(n_channels)
            mclf.add_layer(StandardScaler(), pipe_processes=processes)
            clf = transform_wrappers.SingleChannel(knn(), **wrapper_kwargs)
            mclf.add_layer(clf, pipe_processes=processes)
            mclf.add_layer(MultichannelPredictor(meta_clf))
            return mclf

        def soft_ensemble():
            # Soft voting expressed as a single ChannelEnsemble layer.
            mclf = MultichannelPipeline(n_channels)
            mclf.add_layer(StandardScaler(), pipe_processes=n_cpus)
            mclf.add_layer(
                ChannelEnsemble(knn(),
                                SoftVotingMetaClassifier(),
                                base_processes=n_cpus))
            return mclf

        def mean_cv_score(mclf, Xs, y):
            # Mean of the 3-fold cross validation scores.
            return np.mean(cross_val_score(mclf,
                                           Xs,
                                           y,
                                           scorer=roc_auc_score,
                                           cv=3,
                                           n_processes=1))

        def check(label, kind, accuracies):
            # Apply the accuracy and linearity criteria to one score series.
            accuracy = accuracies[-1]
            self.assertTrue(
                accuracy > 0.80,
                '{}: {} voting accuracy of {} below '
                'acceptable threshold of 0.80'.format(label, kind, accuracy))
            linearity = pearsonr(accuracies, n_informative)[0]
            self.assertTrue(
                linearity > 0.80,
                '{}: {} voting linearity of {} below acceptable '
                'threshold of 0.80 pearsonr'.format(label, kind, linearity))

        # NOTE(review): the hard voting pipeline of implementation 2 is built
        # layer by layer like implementation 1 (only pipe_processes differs)
        # rather than with ChannelEnsemble; confirm whether that is intended.
        implementations = (
            ('implementation 1',
             lambda: stacked(SoftVotingMetaClassifier(), n_cpus),
             lambda: stacked(HardVotingMetaClassifier(), n_cpus,
                             transform_method='predict')),
            ('implementation 2',
             soft_ensemble,
             lambda: stacked(HardVotingMetaClassifier(), 'max',
                             transform_method='predict')),
        )

        for label, make_soft, make_hard in implementations:
            # Suppression is (re)applied for every implementation because
            # resetwarnings() below clears it again.
            if n_cpus > 1:
                # shut off warnings because ray and redis generate massive
                # numbers
                warnings.filterwarnings("ignore")
            try:
                soft_accuracies, hard_accuracies = [], []
                for i in n_informative:
                    Xs, y, _ = make_multi_input_classification(
                        n_informative_Xs=i,
                        n_weak_Xs=0,
                        n_random_Xs=n_channels - i,
                        weak_noise_sd=None,
                        seed=42,
                        **sklearn_params)
                    soft_accuracies.append(mean_cv_score(make_soft(), Xs, y))
                    hard_accuracies.append(mean_cv_score(make_hard(), Xs, y))
            finally:
                if n_cpus > 1:
                    warnings.resetwarnings()

            if verbose > 0:
                print('soft voting results:')
                print('n_informative, accuracy')
                for i, accuracy in zip(n_informative, soft_accuracies):
                    print(i, accuracy)
                print('hard voting results:')
                print('n_informative, accuracy')
                for i, accuracy in zip(n_informative, hard_accuracies):
                    print(i, accuracy)

            check(label, 'soft', soft_accuracies)
            check(label, 'hard', hard_accuracies)
コード例 #26
0
    def test_single_matrix_mean_voting(self, seed=42):
        """
        Verify that mean aggregation over a single KNN regressor channel
        gives the same training-set predictions as a plain sklearn
        KNeighborsRegressor, for both the layer-by-layer build and the
        ChannelEnsemble build.
        """
        knn_kwargs = {'n_neighbors': 5, 'weights': 'uniform'}
        failure_msg = ('mean voting ChannelRegressor failed to reproduce '
                       'sklearn result on single matrix prediction task')

        X, y = make_regression(n_samples=100,
                               n_features=20,
                               n_informative=10,
                               random_state=seed)

        # reference: sklearn KNN regressor fit and evaluated on the same data
        reference = KNeighborsRegressor(**knn_kwargs)
        reference.fit(X, y)
        expected = reference.predict(X)

        # build 1: wrapped KNN layer followed by a mean aggregating predictor
        stacked = MultichannelPipeline(n_channels=1)
        wrapped_knn = transform_wrappers.SingleChannel(
            KNeighborsRegressor(**knn_kwargs))
        stacked.add_layer(wrapped_knn, pipe_processes=n_cpus)
        mean_predictor = MultichannelPredictor(
            AggregatingMetaRegressor(np.mean))
        stacked.add_layer(mean_predictor)
        stacked.fit([X], y)
        observed = stacked.predict([X])
        self.assertTrue(np.array_equal(expected, observed), failure_msg)

        # build 2: the same model as a single ChannelEnsemble layer
        ensembled = MultichannelPipeline(n_channels=1)
        ensemble_layer = ChannelEnsemble(KNeighborsRegressor(**knn_kwargs),
                                         AggregatingMetaRegressor(np.mean),
                                         base_processes='max')
        ensembled.add_layer(ensemble_layer)
        ensembled.fit([X], y)
        observed = ensembled.predict([X])
        self.assertTrue(np.array_equal(expected, observed), failure_msg)