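# Assumed imports for this test module, a sketch: the numpy/sklearn imports are
# standard, while the SequentiallyBootstrappedBaggingClassifier, PurgedKFold,
# get_orthogonal_features, feature_importance_mean_decrease_impurity and
# feature_pca_analysis paths follow an mlfinlab-style layout and may differ by
# version. The enclosing unittest.TestCase class definition sits outside this
# excerpt.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, roc_auc_score, accuracy_score

from mlfinlab.ensemble import SequentiallyBootstrappedBaggingClassifier  # assumed path
from mlfinlab.cross_validation import PurgedKFold  # assumed path
from mlfinlab.feature_importance import (  # assumed paths
    get_orthogonal_features,
    feature_importance_mean_decrease_impurity,
    feature_pca_analysis,
)
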
    def _prepare_clf_data_set(self, oob_score):
        """
        Helper function that prepares the classifier and CV generator for feature importance tests

        :param oob_score: (bool) Flag for oob_score in the classifier
        """
        clf_base = RandomForestClassifier(n_estimators=1,
                                          criterion='entropy',
                                          bootstrap=False,
                                          class_weight='balanced_subsample',
                                          random_state=1)

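        # Sequential bootstrap updates draw probabilities so that observations
        # overlapping less with those already drawn are more likely to be
        # picked, reducing redundancy from overlapping labels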
        sb_clf = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf_base,
            max_features=1.0,
            n_estimators=100,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.price_bars_trim,
            oob_score=oob_score,
            random_state=1)
        sb_clf.fit(self.X_train, self.y_train_clf)

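        # PurgedKFold drops training observations whose label intervals overlap
        # the test fold, preventing look-ahead leakage in cross-validation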
        cv_gen = PurgedKFold(n_splits=4,
                             samples_info_sets=self.samples_info_sets)
        return sb_clf, cv_gen

    def test_sb_bagging_with_max_features(self):
        """
        Test SB Bagging with base_estimator bootstrap=True, float max_features and bootstrap_features=True
        """
        clf = RandomForestClassifier(n_estimators=1,
                                     criterion='entropy',
                                     bootstrap=True,
                                     class_weight='balanced_subsample',
                                     max_depth=12)

        sb_clf = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            max_features=0.2,
            n_estimators=2,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.price_bars_trim,
            oob_score=True,
            random_state=1,
            bootstrap_features=True,
            max_samples=30,
            verbose=2)

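        # Uniform sample weights exercise the sample_weight pass-through path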
        sb_clf.fit(self.X_train,
                   self.y_train_clf,
                   sample_weight=np.ones((self.X_train.shape[0], )))
        self.assertTrue((sb_clf.predict(self.X_train)[:10] == np.array(
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 0])).all())

    def test_sb_classifier(self):
        """
        Test the Sequentially Bootstrapped Bagging Classifier: compare its
        oob/oos scores to sklearn bagging's oos scores and check oos predictions
        """

        # Init classifiers
        clf_base = RandomForestClassifier(n_estimators=1, criterion='entropy', bootstrap=False,
                                          class_weight='balanced_subsample')

        sb_clf = SequentiallyBootstrappedBaggingClassifier(base_estimator=clf_base, max_features=1.0, n_estimators=100,
                                                           samples_info_sets=self.samples_info_sets,
                                                           price_bars=self.price_bars_trim, oob_score=True,
                                                           random_state=1)

        # X_train index should be in index mapping
        self.assertTrue(self.X_train.index.isin(sb_clf.timestamp_int_index_mapping.index).all())

        sb_clf.fit(self.X_train, self.y_train_clf)

        self.assertTrue((sb_clf.X_time_index == self.X_train.index).all())  # X_train index == clf X_train index

        oos_sb_predictions = sb_clf.predict(self.X_test)

        sb_precision = precision_score(self.y_test_clf, oos_sb_predictions)
        sb_roc_auc = roc_auc_score(self.y_test_clf, oos_sb_predictions)
        sb_accuracy = accuracy_score(self.y_test_clf, oos_sb_predictions)

        self.assertAlmostEqual(sb_accuracy, 0.66, delta=0.2)
        self.assertEqual(sb_precision, 1.0)
        self.assertAlmostEqual(sb_roc_auc, 0.59, delta=0.2)

    def test_sb_bagging_not_tree_base_estimator(self):
        """
        Test SB Bagging with a non-tree base estimator (KNN)
        """
        clf = KNeighborsClassifier()
        sb_clf = SequentiallyBootstrappedBaggingClassifier(base_estimator=clf,
                                                           samples_info_sets=self.samples_info_sets,
                                                           price_bars=self.price_bars_trim)
        sb_clf.fit(self.X_train, self.y_train_clf)
        self.assertTrue((sb_clf.predict(self.X_train)[:10] == np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0])).all())

    def test_sb_bagging_non_sample_weights_with_verbose(self):
        """
        Test SB Bagging with a classifier that does not support sample_weights, with verbose > 1
        """

        clf = LinearSVC()

        sb_clf = SequentiallyBootstrappedBaggingClassifier(base_estimator=clf, max_features=0.2,
                                                           n_estimators=2,
                                                           samples_info_sets=self.samples_info_sets,
                                                           price_bars=self.price_bars_trim, oob_score=True,
                                                           random_state=1, bootstrap_features=True,
                                                           max_samples=30, verbose=2)

        sb_clf.fit(self.X_train, self.y_train_clf)
        self.assertTrue((sb_clf.predict(self.X_train)[:10] == np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 0])).all())

    def test_orthogonal_features(self):
        """
        Test orthogonal features: PCA feature statistics, MDI importance vs PCA importance analysis
        """

        # Init classifiers
        clf_base = RandomForestClassifier(n_estimators=1,
                                          criterion='entropy',
                                          bootstrap=False,
                                          class_weight='balanced_subsample')

        sb_clf = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf_base,
            max_features=1.0,
            n_estimators=100,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.price_bars_trim,
            oob_score=True,
            random_state=1)

        pca_features = get_orthogonal_features(self.X_train)

        # PCA features are projections of standardized inputs, so each
        # component should have mean 0
        self.assertAlmostEqual(np.mean(pca_features[:, 2]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 5]), 0, delta=1e-7)
        self.assertAlmostEqual(np.mean(pca_features[:, 6]), 0, delta=1e-7)

        # Check particular PCA values std
        self.assertAlmostEqual(np.std(pca_features[:, 1]), 1.499, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 3]), 1.047, delta=0.2)
        self.assertAlmostEqual(np.std(pca_features[:, 4]), 0.948, delta=0.2)

        sb_clf.fit(self.X_train, self.y_train_clf)
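        # MDI importance: each feature's impurity reduction averaged across
        # the ensemble's trees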
        mdi_feat_imp = feature_importance_mean_decrease_impurity(
            sb_clf, self.X_train.columns)
        pca_corr_res = feature_pca_analysis(self.X_train, mdi_feat_imp)

        # Weighted Kendall rank correlation between MDI importance and PCA ranks
        self.assertAlmostEqual(pca_corr_res['Weighted_Kendall_Rank'][0],
                               0.26,
                               delta=1e-1)

    def test_sb_bagging_float_max_samples_warm_start_true(self):
        """
        Test SB Bagging with warm start = True and float max_samples
        """
        clf = RandomForestClassifier(n_estimators=1,
                                     criterion='entropy',
                                     bootstrap=False,
                                     class_weight='balanced_subsample',
                                     max_depth=12)

        sb_clf = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            max_features=7,
            n_estimators=2,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.price_bars_trim,
            oob_score=False,
            random_state=1,
            bootstrap_features=True,
            max_samples=0.3,
            warm_start=True)

        sb_clf.fit(
            self.X_train,
            self.y_train_clf,
            sample_weight=np.ones((self.X_train.shape[0], )),
        )

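        # Refitting a warm-started ensemble with unchanged n_estimators fits
        # no new estimators and should raise a UserWarning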
        sb_clf.n_estimators += 0
        with self.assertWarns(UserWarning):
            sb_clf.fit(
                self.X_train,
                self.y_train_clf,
                sample_weight=np.ones((self.X_train.shape[0], )),
            )
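        # Raising n_estimators by two and refitting trains two additional
        # estimators while keeping the previously fitted ones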
        sb_clf.n_estimators += 2
        sb_clf.fit(
            self.X_train,
            self.y_train_clf,
            sample_weight=np.ones((self.X_train.shape[0], )),
        )

        self.assertTrue((sb_clf.predict(self.X_train)[:10] == np.array(
            [0, 0, 0, 0, 0, 0, 1, 1, 1, 0])).all())

    def test_value_error_raise(self):
        """
        Test that various invalid parameter combinations raise ValueError
        """
        clf = KNeighborsClassifier()
        # KNN does not support sample_weight
        bagging_clf_1 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data)
        # max_samples exceeds the number of training samples
        bagging_clf_2 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data,
            max_samples=2000000)
        # max_features is neither int nor float
        bagging_clf_3 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data,
            max_features='20')
        # max_features exceeds the number of features
        bagging_clf_4 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data,
            max_features=2000000)
        # oob_score and warm_start cannot both be True
        bagging_clf_5 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data,
            oob_score=True,
            warm_start=True)
        # used to test decreasing n_estimators with warm_start=True
        bagging_clf_6 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data,
            warm_start=True)
        # used to test driving n_estimators negative with warm_start=True
        bagging_clf_7 = SequentiallyBootstrappedBaggingClassifier(
            base_estimator=clf,
            samples_info_sets=self.samples_info_sets,
            price_bars=self.data,
            warm_start=True)
        with self.assertRaises(ValueError):
            # ValueError when sample_weight is passed to a base estimator
            # that does not support sample weights
            bagging_clf_1.fit(
                self.X_train,
                self.y_train_clf,
                sample_weight=np.ones((self.X_train.shape[0], )),
            )
        with self.assertRaises(ValueError):
            # ValueError for max_samples > X_train.shape[0]
            bagging_clf_2.fit(
                self.X_train,
                self.y_train_clf,
                sample_weight=np.ones((self.X_train.shape[0], )),
            )
        with self.assertRaises(ValueError):
            # ValueError for non-int/float max_features param
            bagging_clf_3.fit(
                self.X_train,
                self.y_train_clf,
                sample_weight=np.ones((self.X_train.shape[0], )),
            )
        with self.assertRaises(ValueError):
            # ValueError for max_features > X_train.shape[1]
            bagging_clf_4.fit(
                self.X_train,
                self.y_train_clf,
                sample_weight=np.ones((self.X_train.shape[0], )),
            )
        with self.assertRaises(ValueError):
            # ValueError for warm_start and oob_score both being True
            bagging_clf_5.fit(
                self.X_train,
                self.y_train_clf,
                sample_weight=np.ones((self.X_train.shape[0], )),
            )
        # ValueError for decreasing n_estimators when warm_start is True;
        # fit once outside assertRaises so only the refit is expected to raise
        bagging_clf_6.fit(self.X_train, self.y_train_clf)
        bagging_clf_6.n_estimators -= 2
        with self.assertRaises(ValueError):
            bagging_clf_6.fit(self.X_train, self.y_train_clf)
        # ValueError for setting n_estimators to a negative value
        bagging_clf_7.fit(self.X_train, self.y_train_clf)
        bagging_clf_7.n_estimators -= 1000
        with self.assertRaises(ValueError):
            bagging_clf_7.fit(self.X_train, self.y_train_clf)