Example 1
    def test_feature_importance(self):
        """
        Test feature importance on the sequentially bootstrapped classifier:
        MDI, MDA (default scoring and f1) and SFI (default scoring and f1).

        Expected values are regression pins against a fixed data set / seed
        produced by ``_prepare_clf_data_set``.
        """
        sb_clf, cv_gen = self._prepare_clf_data_set(oob_score=False)

        def assert_mean(feat_imp, feature, expected, delta):
            # Helper: check the cross-run averaged importance of one feature.
            self.assertAlmostEqual(feat_imp.loc[feature, 'mean'], expected,
                                   delta=delta)

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(sb_clf, self.X_train.columns)

        # MDA feature importance (no scoring passed -> library default;
        # variable name suggests log_loss — TODO confirm against the API)
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            sb_clf,
            self.X_train,
            self.y_train_clf,
            cv_gen,
            sample_weight_train=np.ones((self.X_train.shape[0], )),
            sample_weight_score=np.ones((self.X_train.shape[0], )))
        mda_feat_imp_f1 = mean_decrease_accuracy(sb_clf,
                                                 self.X_train,
                                                 self.y_train_clf,
                                                 cv_gen,
                                                 scoring=f1_score)

        # SFI feature importance
        # Take only 5 features for faster test run
        sfi_feat_imp_log_loss = single_feature_importance(
            sb_clf,
            self.X_train[self.X_train.columns[:5]],
            self.y_train_clf,
            cv_gen=cv_gen,
            sample_weight_train=np.ones((self.X_train.shape[0], )))
        sfi_feat_imp_f1 = single_feature_importance(
            sb_clf,
            self.X_train[self.X_train.columns[:5]],
            self.y_train_clf,
            cv_gen=cv_gen,
            scoring=f1_score,
            sample_weight_score=np.ones((self.X_train.shape[0], )))

        # MDI assertions: importances are normalised, so means sum to 1
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        assert_mean(mdi_feat_imp, 'label_prob_0.1', 0.19598, 0.01)
        assert_mean(mdi_feat_imp, 'label_prob_0.2', 0.164, 0.01)
        # Noisy feature
        assert_mean(mdi_feat_imp, 'label_prob_0.1_sma_5', 0.08805, 0.01)

        # MDA / SFI assertions.
        # NOTE(review): deltas of 10, 3 and 1 on expected values well below 1
        # make these assertions near-vacuous (they only catch NaN or gross
        # failure) — TODO tighten once the estimates are stable.

        # MDA (default scoring) assertions
        assert_mean(mda_feat_imp_log_loss, 'label_prob_0.1', 0.23685, 10)
        assert_mean(mda_feat_imp_log_loss, 'label_prob_0.2', 0.3222, 10)

        # MDA(f1) assertions
        assert_mean(mda_feat_imp_f1, 'label_prob_0.1', 0.25, 3)
        assert_mean(mda_feat_imp_f1, 'label_prob_0.2', 0.3, 3)

        # SFI (default scoring) assertions
        assert_mean(sfi_feat_imp_log_loss, 'label_prob_0.1', -2.14, 1)
        assert_mean(sfi_feat_imp_log_loss, 'label_prob_0.2', -2.15, 1)

        # SFI(f1) assertions (previously mislabelled "accuracy";
        # the scorer passed above is f1_score)
        assert_mean(sfi_feat_imp_f1, 'label_prob_0.1', 0.81, 1)
        assert_mean(sfi_feat_imp_f1, 'label_prob_0.2', 0.74, 1)
        assert_mean(sfi_feat_imp_f1, 'label_prob_0.5_sma_2', 0.224, 1)
Example 2
    def test_feature_importance(self):
        """
        Test feature importance on the pre-fitted classifiers from the
        fixture: MDI, MDA (log_loss and f1) and SFI (log_loss and f1).

        Expected values are regression pins against the fixture data set.
        """

        def assert_mean(feat_imp, feature, expected, delta):
            # Helper: check the cross-run averaged importance of one feature.
            self.assertAlmostEqual(feat_imp.loc[feature, 'mean'], expected,
                                   delta=delta)

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)

        # MDA feature importance
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        mda_feat_imp_f1 = mean_decrease_accuracy(self.bag_clf,
                                                 self.X,
                                                 self.y,
                                                 self.cv_gen,
                                                 scoring=f1_score)

        # SFI feature importance
        sfi_feat_imp_log_loss = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        sfi_feat_imp_f1 = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=f1_score)

        # MDI assertions: importances are normalised, so means sum to 1
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        assert_mean(mdi_feat_imp, 'I_1', 0.47075, 0.01)
        assert_mean(mdi_feat_imp, 'I_0', 0.09291, 0.01)
        # Redundant feature
        assert_mean(mdi_feat_imp, 'R_0', 0.07436, 0.01)
        # Noisy feature
        assert_mean(mdi_feat_imp, 'N_0', 0.01798, 0.01)

        # MDA(log_loss) assertions
        assert_mean(mda_feat_imp_log_loss, 'I_1', 0.59684, 0.1)
        assert_mean(mda_feat_imp_log_loss, 'R_0', 0.13177, 0.1)

        # MDA(f1) assertions
        assert_mean(mda_feat_imp_f1, 'I_1', 0.52268, 0.1)
        assert_mean(mda_feat_imp_f1, 'I_2', 0.29533, 0.1)

        # SFI(log_loss) assertions
        assert_mean(sfi_feat_imp_log_loss, 'I_0', -6.50385, 0.1)
        assert_mean(sfi_feat_imp_log_loss, 'R_0', -3.27282, 0.1)

        # SFI(f1) assertions (previously mislabelled "accuracy";
        # the scorer passed above is f1_score)
        assert_mean(sfi_feat_imp_f1, 'I_0', 0.48530, 0.1)
        assert_mean(sfi_feat_imp_f1, 'I_1', 0.78778, 0.1)
    def test_feature_importance(self):
        """
        Test feature importance: MDI, MDA, SFI and their clustered (CFI)
        variants.

        Also verifies that clustered feature importance computed with
        singleton clusters (one feature per cluster) reduces to the plain
        MDI / MDA results. Expected values are regression pins against the
        fixture data set.
        """

        def assert_mean(feat_imp, feature, expected, delta):
            # Helper: check the cross-run averaged importance of one feature.
            self.assertAlmostEqual(feat_imp.loc[feature, 'mean'], expected,
                                   delta=delta)

        # Clustered subsets for CFI; the number of clusters is selected by
        # the ONC algorithm (n_clusters=None).
        clustered_subsets_linear = get_feature_clusters(
            self.X,
            dependence_metric='linear',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None)
        # Singleton clusters: with one feature per cluster, CFI should
        # reproduce the plain MDI / MDA results (checked at the end).
        feature_subset_single = [[x] for x in self.X.columns]

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
        # Clustered MDI feature importance
        clustered_mdi = mean_decrease_impurity(
            self.fit_clf,
            self.X.columns,
            clustered_subsets=clustered_subsets_linear)
        mdi_cfi_single = mean_decrease_impurity(
            self.fit_clf,
            self.X.columns,
            clustered_subsets=feature_subset_single)

        # MDA feature importance
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        mda_feat_imp_f1 = mean_decrease_accuracy(self.bag_clf,
                                                 self.X,
                                                 self.y,
                                                 self.cv_gen,
                                                 scoring=f1_score)
        # Clustered MDA feature importance
        clustered_mda = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            clustered_subsets=clustered_subsets_linear)
        mda_cfi_single = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            clustered_subsets=feature_subset_single)

        # SFI feature importance
        sfi_feat_imp_log_loss = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        sfi_feat_imp_f1 = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=f1_score)

        # MDI assertions: importances are normalised, so means sum to 1
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        assert_mean(mdi_feat_imp, 'I_1', 0.48058, 0.01)
        assert_mean(mdi_feat_imp, 'I_0', 0.08214, 0.01)
        # Redundant feature
        assert_mean(mdi_feat_imp, 'R_0', 0.06511, 0.01)
        # Noisy feature
        assert_mean(mdi_feat_imp, 'N_0', 0.02229, 0.01)

        # MDA(log_loss) assertions
        assert_mean(mda_feat_imp_log_loss, 'I_1', 0.65522, 0.1)
        assert_mean(mda_feat_imp_log_loss, 'R_0', 0.00332, 0.1)

        # MDA(f1) assertions
        assert_mean(mda_feat_imp_f1, 'I_1', 0.47751, 0.1)
        assert_mean(mda_feat_imp_f1, 'I_2', 0.33617, 0.1)

        # SFI(log_loss) assertions
        assert_mean(sfi_feat_imp_log_loss, 'I_0', -6.39442, 0.1)
        assert_mean(sfi_feat_imp_log_loss, 'R_0', -5.04315, 0.1)

        # SFI(f1) assertions (previously mislabelled "accuracy";
        # the scorer passed above is f1_score)
        assert_mean(sfi_feat_imp_f1, 'I_0', 0.48915, 0.1)
        assert_mean(sfi_feat_imp_f1, 'I_1', 0.78443, 0.1)

        # Clustered MDI assertions
        assert_mean(clustered_mdi, 'R_0', 0.01912, 0.1)
        assert_mean(clustered_mdi, 'I_0', 0.06575, 0.1)

        # Clustered MDA assertions (no scoring passed -> library default;
        # presumably log_loss — TODO confirm against the API)
        assert_mean(clustered_mda, 'I_0', 0.04154, 0.1)
        assert_mean(clustered_mda, 'R_0', 0.02940, 0.1)

        # CFI with as many clusters as features must match plain MDI & MDA
        for feature in ('I_1', 'R_0'):
            self.assertAlmostEqual(mdi_feat_imp.loc[feature, 'mean'],
                                   mdi_cfi_single.loc[feature, 'mean'],
                                   delta=0.1)
            self.assertAlmostEqual(mda_feat_imp_log_loss.loc[feature, 'mean'],
                                   mda_cfi_single.loc[feature, 'mean'],
                                   delta=0.1)