def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick real regressor over dummy and
        test performance.

        Five ``DummyRegressor(strategy='mean')`` models and a single
        ``LinearRegression`` are shuffled and handed to an ``Ensemble`` that
        uses ``RankScoreSelector(k=1)``.  The test passes when the first
        base model reported by the fitted ensemble is a ``LinearRegression``
        and the mean cross-validation score of the pipeline exceeds 0.9.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, print the ensemble's screen results and
            the mean cross-validation score.
        seed : int, default=42
            Seed for the synthetic regression data and for the shuffle of
            the base regressors.
        """
        X, y = make_regression(n_samples=500, n_features=20, n_informative=10,
                               random_state=seed)

        base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
        base_regressors.append(LinearRegression())
        # Shuffle with a dedicated seeded generator: the global ``random``
        # state is unseeded here, which made the model order (and hence any
        # failure) irreproducible.
        random.Random(seed).shuffle(base_regressors)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_regressors, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        # Layer 1 is the Ensemble layer added above (layer 0 is the scaler).
        ensemble = mclf.get_model(1, 0)
        selected_model = ensemble.get_base_models()[0]
        # The pipeline hands back a wrapped model; unwrap it before checking
        # its concrete type.
        selected_model = transform_wrappers.unwrap_model(selected_model)

        if verbose > 0:
            print(ensemble.get_screen_results())

        # Exact type match, as in the original ``type(...) ==`` comparison,
        # but reporting the offending type on failure.
        self.assertIs(type(selected_model), LinearRegression,
                      'Ensemble failed to pick LinearRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertGreater(acc, 0.9, 'Accuracy tolerance failure.')
    def test_discrimination_cls(self, verbose=0, seed=42):
        """
        Determine if Ensemble can pick real classifier over dummy and
        test performance.

        Five stratified ``DummyClassifier`` models and a single
        ``LogisticRegression`` are shuffled and handed to an ``Ensemble``
        that uses ``RankScoreSelector(k=1)``.  The test passes when the
        first base model reported by the fitted ensemble is a
        ``LogisticRegression`` and the mean cross-validation score of the
        pipeline exceeds 0.70.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, print the mean cross-validation score.
        seed : int, default=42
            Seed for the synthetic classification data, the dummy
            classifiers, and the shuffle of the base classifiers.
        """
        X, y = make_classification(n_samples=500, n_features=20,
                                   n_informative=15, class_sep=1,
                                   random_state=seed)

        # 'stratified' dummies predict at random, so they are seeded to keep
        # the test reproducible (as test_compare_to_StackingClassifier does).
        base_classifiers = [DummyClassifier(strategy='stratified',
                                            random_state=seed)
                            for _ in range(5)]
        base_classifiers.append(LogisticRegression())
        # Seeded shuffle: the global ``random`` state is not seeded here.
        random.Random(seed).shuffle(base_classifiers)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_classifiers, internal_cv=5,
                                score_selector=RankScoreSelector(k=1)))
        mclf.fit([X], y)

        # Layer 1 is the Ensemble layer; take its first selected base model
        # and unwrap it before checking its concrete type.
        c = mclf.get_model(1, 0).get_base_models()[0]
        c = transform_wrappers.unwrap_model(c)

        self.assertIs(type(c), LogisticRegression,
                      'Ensemble failed to pick LogisticRegression '
                      'over dummies')

        acc = np.mean(cross_val_score(mclf, [X], y))
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertGreater(acc, 0.70, 'Accuracy tolerance failure.')
Beispiel #3
0
    def test_multi_matrices_no_metapredictor(self, verbose=0, seed=42):
        """
        Check ChannelEnsemble when no meta-predictor is supplied.

        The data set is generated with one informative matrix and four
        random ones.  After fitting, the first channel reported by the
        ensemble's ``get_support()`` must be labelled 'informative', and the
        mean cross-validation score of the pipeline must exceed 0.10.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, print the mean cross-validation score.
        seed : int, default=42
            Seed forwarded to the synthetic data generator.
        """
        dataset_options = dict(n_informative_Xs=1,
                               n_weak_Xs=0,
                               n_random_Xs=4,
                               weak_noise_sd=None,
                               seed=seed,
                               n_samples=500,
                               n_features=20,
                               n_informative=20)
        Xs, y, types = make_multi_input_regression(**dataset_options)

        pipeline = MultichannelPipeline(n_channels=5)
        pipeline.add_layer(StandardScaler())
        channel_ensemble = ChannelEnsemble(
            LinearRegression(),
            internal_cv=5,
            score_selector=RankScoreSelector(k=1))
        pipeline.add_layer(channel_ensemble)
        pipeline.fit(Xs, y)

        # Layer 1 is the ChannelEnsemble layer (layer 0 is the scaler).
        fitted_ensemble = pipeline.get_model(1, 0)
        first_selected = fitted_ensemble.get_support()[0]
        selected_type = types[first_selected]

        self.assertTrue(selected_type == 'informative',
                        'Ensemble failed to pick informative channel')

        fold_scores = cross_val_score(pipeline, Xs, y)
        acc = np.mean(fold_scores)
        if verbose > 0:
            print('cross val accuracy: {}'.format(acc))

        self.assertTrue(acc > 0.10, 'Accuracy tolerance failure.')
    def test_compare_to_StackingClassifier(self, verbose=0, seed=42):
        """
        Determine if Ensemble with dummies correctly selects the real
        predictors and gives similar performance to scikit-learn
        StackingClassifier trained without dummies.

        Three real classifiers are mixed with 100 stratified dummy
        classifiers, shuffled, and given to an ``Ensemble`` with an ``SVC``
        meta-predictor and ``RankScoreSelector(k=3)``.  The test checks that

        * exactly three base models are selected and none is reported as a
          ``DummyClassifier``;
        * the cross-validation score with all classifiers equals, to two
          decimals, the score obtained with only the real classifiers;
        * the score with all classifiers is within 5% of the score of a
          ``StackingClassifier`` built from the real classifiers only.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, additionally score a ``StackingClassifier``
            built from all classifiers and print every score.
        seed : int, default=42
            Seed for the synthetic data, the seedable estimators, and the
            shuffle of the classifier list.
        """
        X, y = make_classification(n_samples=1000, n_features=20,
                                   n_informative=5, class_sep=0.5,
                                   random_state=seed)

        classifiers = [LogisticRegression(random_state=seed),
                       KNeighborsClassifier(),
                       RandomForestClassifier(random_state=seed)]
        dummy_classifiers = [DummyClassifier(strategy='stratified',
                                             random_state=seed)
                             for _ in range(100)]
        all_classifiers = classifiers + dummy_classifiers
        # Seeded shuffle: the global ``random`` state is not seeded here, so
        # the original ordering changed from run to run.
        random.Random(seed).shuffle(all_classifiers)

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(all_classifiers, SVC(random_state=seed),
                                internal_cv=5,
                                score_selector=RankScoreSelector(k=3)))
        pc_score_all = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        mclf.fit([X], y)
        selected_classifiers = mclf.get_model(1, 0).get_base_models()
        n_selected = len(selected_classifiers)
        self.assertEqual(
            n_selected, 3,
            'Ensemble picked {} classifiers instead of 3.'.format(n_selected))
        # NOTE(review): other tests pass the selected models through
        # transform_wrappers.unwrap_model() before checking their type.  If
        # get_base_models() returns wrapped models here as well, this check
        # can never see a DummyClassifier - confirm against the Ensemble API.
        self.assertNotIn(DummyClassifier,
                         [c.__class__ for c in selected_classifiers],
                         'Ensemble chose a dummy classifier over a real one')

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(classifiers, SVC(random_state=seed),
                                internal_cv=5,
                                score_selector=RankScoreSelector(k=3)))
        pc_score_informative = np.mean(
            cross_val_score(mclf, [X], y, cv=5, n_processes=5))

        base_classifier_arg = [(str(i), c) for i, c in enumerate(classifiers)]
        clf = StackingClassifier(base_classifier_arg, SVC(random_state=seed),
                                 cv=StratifiedKFold(n_splits=3))
        sk_score_informative = np.mean(
            cross_val_score(clf, X, y, cv=5, n_processes=5))

        if verbose > 0:
            # Reference run of StackingClassifier on dummies + real models;
            # only reported, never asserted on.
            base_classifier_arg = [(str(i), c)
                                   for i, c in enumerate(all_classifiers)]
            clf = StackingClassifier(base_classifier_arg,
                                     SVC(random_state=seed),
                                     cv=StratifiedKFold(n_splits=3))
            sk_score_all = np.mean(
                cross_val_score(clf, X, y, cv=5, n_processes=5))
            print('\nBalanced accuracy scores')
            print('Ensemble informative predictors: {}'.format(
                pc_score_informative))
            print('Ensemble all predictors: {}'.format(pc_score_all))
            print('StackingClassifier informative predictors: {}'.format(
                sk_score_informative))
            print('StackingClassifier all predictors: {}'.format(sk_score_all))

        self.assertEqual(np.round(pc_score_all, 2),
                         np.round(pc_score_informative, 2),
                         'Ensemble accuracy is not same for all classifiers '
                         'and informative classifiers.')
        tolerance_pct = 5
        self.assertGreaterEqual(
            pc_score_all,
            sk_score_informative * (1 - tolerance_pct / 100.0),
            'Ensemble with random inputs did not perform within accepted '
            'tolerance of StackingClassifier with no dummy classifiers.')
    def test_discrimination_rgr(self, verbose=0, seed=42):
        """
        Determine if Ensemble can discriminate between dummy regressors and
        LinearRegression regressors.

        Five ``DummyRegressor(strategy='mean')`` models and five
        ``LinearRegression`` models are shuffled and given to an
        ``Ensemble`` with an ``SVR`` meta-predictor and
        ``RankScoreSelector(k=5)``.  The test passes when the indices
        reported by the fitted ensemble's ``get_support()`` are exactly the
        positions of the ``LinearRegression`` models.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, print how many LinearRegression and
            DummyRegressor models were selected.
        seed : int, default=42
            Seed for the synthetic regression data and for the shuffle of
            the base regressors.
        """
        X, y = make_regression(n_samples=500, n_features=20,
                               n_informative=10, random_state=seed)

        base_regressors = [DummyRegressor(strategy='mean') for _ in range(5)]
        base_regressors.extend(LinearRegression() for _ in range(5))
        # Seeded shuffle: the global ``random`` state is not seeded here.
        random.Random(seed).shuffle(base_regressors)
        # True at every position that holds a real (non-dummy) regressor.
        informative_mask = [type(c) is LinearRegression
                            for c in base_regressors]

        mclf = MultichannelPipeline(n_channels=1)
        mclf.add_layer(StandardScaler())
        mclf.add_layer(Ensemble(base_regressors, SVR(), internal_cv=5,
                                score_selector=RankScoreSelector(k=5)))
        mclf.fit([X], y)
        selected_indices = mclf.get_model(layer_index=1,
                                          model_index=0).get_support()
        # True at every position the ensemble kept.
        selection_mask = [i in selected_indices
                          for i in range(len(base_regressors))]
        if verbose > 0:
            pairs = list(zip(informative_mask, selection_mask))
            n_correct = sum(inf and sel for inf, sel in pairs)
            # Count the selected dummies directly rather than assuming that
            # exactly five models were selected.
            n_incorrect = sum(sel and not inf for inf, sel in pairs)
            print('\n\ncorrectly selected {}/5 LinearRegression '
                  'regressors'.format(n_correct))
            print('incorrectly selected {}/5 DummyRegressors\n\n'.format(
                n_incorrect))
        self.assertEqual(selection_mask, informative_mask,
                         'Ensemble failed to discriminate between dummy '
                         'regressors and LinearRegression')
Beispiel #6
0
    def test_architecture_01(self, verbose=0, seed=42):
        """
        Test the accuracy and hygiene (shuffle control) of a complex pipeline
        with feature selection, matrix selection, model selection, and
        model stacking.

        Six input channels are built from one random matrix and one
        informative matrix (informative at positions 2, 4 and 5).  The test
        checks that

        * the mean cross-validation score exceeds 0.95;
        * the model at layer 3 (unwrapped) reports support ``[2, 4]``;
        * the first support index of the model at layer 4 is 2 or 4;
        * with permuted labels the mean cross-validation score stays
          below 0.55.

        Parameters
        ----------
        verbose : int, default=0
            When greater than 0, print the scores and selected indices.
        seed : int, default=42
            Seed for the synthetic data, the random channels, and the label
            permutation used for the shuffle control.
        """
        # One seeded generator drives every random draw in this test; the
        # original used the unseeded global numpy state and ignored ``seed``.
        rng = np.random.RandomState(seed)
        X_rand = rng.rand(500, 30)
        X_inf, y = make_classification(n_samples=500,
                                       n_features=30,
                                       n_informative=15,
                                       class_sep=3,
                                       random_state=seed)

        Xs = [X_rand, X_rand, X_inf, X_rand, X_inf, X_inf]

        clf = MultichannelPipeline(n_channels=6)
        clf.add_layer(SimpleImputer())
        clf.add_layer(StandardScaler())
        clf.add_layer(SelectPercentile(percentile=25))
        # NOTE(review): the leading integer presumably maps the selector
        # onto the first 5 channels, which would explain why [2, 4] is the
        # expected support below - confirm against add_layer's signature.
        clf.add_layer(
            5,
            SelectKBestScores(feature_scorer=f_classif,
                              aggregator=np.mean,
                              k=2))
        LR_cv = transform_wrappers.SingleChannelCV(LogisticRegression())
        CE = ChannelEnsemble(base_predictors=KNeighborsClassifier(),
                             internal_cv=5,
                             score_selector=RankScoreSelector(1))
        CE_cv = transform_wrappers.MultichannelCV(CE)
        clf.add_layer(5, CE_cv, 1, LR_cv)
        clf.add_layer(MultichannelPredictor(SVC()))

        score = np.mean(
            cross_val_score(clf, Xs, y, scorer=balanced_accuracy_score))
        if verbose > 0:
            print('accuracy score: {}'.format(score))
        self.assertGreater(
            score, 0.95, 'Accuracy score of {} did not exceed '
            'tolerance value of 95%'.format(score))

        clf.fit(Xs, y)
        score_selector = transform_wrappers.unwrap_model(clf.get_model(3, 0))
        if verbose > 0:
            print('indices selected by SelectKBestScores: {}'.format(
                score_selector.get_support()))
            print('correct indices: [2, 4]')
        self.assertTrue(np.array_equal(score_selector.get_support(), [2, 4]),
                        'SelectKBestScores selected the wrong channels.')

        model_selector = transform_wrappers.unwrap_model(clf.get_model(4, 0))
        if verbose > 0:
            print('indices selected by SelectKBestModels: {}'.format(
                model_selector.get_support()))
            print('correct indices: [2, 4]')
        self.assertIn(model_selector.get_support()[0], [2, 4],
                      'SelectKBestModels selected the wrong model')

        # Shuffle control: with permuted labels the pipeline should score
        # close to chance; a high score here would indicate label leakage.
        score = np.mean(
            cross_val_score(clf,
                            Xs,
                            y[rng.permutation(len(y))],
                            scorer=balanced_accuracy_score))
        if verbose > 0:
            print('shuffle control accuracy score: {}'.format(score))
        self.assertLess(
            score, 0.55, 'Accuracy score of shuffle control, {}, '
            'exceeded tolerance value of 55%'.format(score))