# Imports and class scaffold assumed for these test snippets: the class name
# is hypothetical, and the mlfinlab-style import paths may need adjusting to
# your package layout.
import unittest

import numpy as np
from sklearn.metrics import f1_score, log_loss
from mlfinlab.clustering.feature_clusters import get_feature_clusters
from mlfinlab.feature_importance import (mean_decrease_accuracy,
                                         mean_decrease_impurity,
                                         single_feature_importance)


class TestFeatureImportance(unittest.TestCase):

    def test_value_error_raise(self):
        """
        Test that get_feature_clusters raises for an invalid number of clusters.
        """
        # Number of clusters larger than the number of features
        with self.assertRaises(ValueError):
            get_feature_clusters(self.X,
                                 dependence_metric='linear',
                                 distance_metric='angular',
                                 linkage_method='single',
                                 n_clusters=41)
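
    # Hedged fixture sketch: the snippets below use self.X, self.y,
    # self.fit_clf, self.bag_clf and self.cv_gen without defining them.
    # Everything here (40 features named I_*/R_*/N_* to match the comments
    # in the assertions, classifier settings, 3-fold CV) is an illustrative
    # assumption, not the authors' actual setUp.
    def setUp(self):
        # Local imports keep this illustrative sketch self-contained
        import pandas as pd
        from sklearn.datasets import make_classification
        from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
        from sklearn.model_selection import KFold

        informative, redundant, total = 5, 30, 40
        data, target = make_classification(n_samples=1000,
                                           n_features=total,
                                           n_informative=informative,
                                           n_redundant=redundant,
                                           shuffle=False,
                                           random_state=0)
        # shuffle=False keeps the informative columns first, then the
        # redundant ones, then noise, so the names match the column roles
        names = (['I_' + str(i) for i in range(informative)]
                 + ['R_' + str(i) for i in range(redundant)]
                 + ['N_' + str(i) for i in range(total - informative - redundant)])
        self.X = pd.DataFrame(data, columns=names)
        self.y = pd.Series(target)
        self.cv_gen = KFold(n_splits=3)
        self.fit_clf = RandomForestClassifier(n_estimators=100,
                                              random_state=0).fit(self.X, self.y)
        self.bag_clf = BaggingClassifier(n_estimators=100, random_state=0)
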
    def test_feature_importance(self):
        """
        Test feature importance: MDI, MDA, SFI and the plot function.
        """
        # Get the clustered subsets for clustered feature importance (CFI),
        # with the number of clusters selected by the ONC algorithm.
        clustered_subsets_linear = get_feature_clusters(
            self.X,
            dependence_metric='linear',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None)
        # Also verify that if the number of clusters equals the number of
        # features, the result matches the plain (non-clustered) MDI/MDA.
        feature_subset_single = [[x] for x in self.X.columns]

        # MDI feature importance
        mdi_feat_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
        # Clustered MDI feature importance
        clustered_mdi = mean_decrease_impurity(
            self.fit_clf,
            self.X.columns,
            clustered_subsets=clustered_subsets_linear)
        mdi_cfi_single = mean_decrease_impurity(
            self.fit_clf,
            self.X.columns,
            clustered_subsets=feature_subset_single)

        # MDA feature importance
        mda_feat_imp_log_loss = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=log_loss)

        mda_feat_imp_f1 = mean_decrease_accuracy(self.bag_clf,
                                                 self.X,
                                                 self.y,
                                                 self.cv_gen,
                                                 scoring=f1_score)
        # Clustered MDA feature importance
        clustered_mda = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            clustered_subsets=clustered_subsets_linear)
        mda_cfi_single = mean_decrease_accuracy(
            self.bag_clf,
            self.X,
            self.y,
            self.cv_gen,
            clustered_subsets=feature_subset_single)

        # SFI feature importance
        sfi_feat_imp_log_loss = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_train=np.ones((self.X.shape[0], )),
            scoring=log_loss)
        sfi_feat_imp_f1 = single_feature_importance(
            self.bag_clf,
            self.X,
            self.y,
            cv_gen=self.cv_gen,
            sample_weight_score=np.ones((self.X.shape[0], )),
            scoring=f1_score)

        # MDI assertions
        self.assertAlmostEqual(mdi_feat_imp['mean'].sum(), 1, delta=0.001)
        # The most informative features
        self.assertAlmostEqual(mdi_feat_imp.loc['I_1', 'mean'],
                               0.48058,
                               delta=0.01)
        self.assertAlmostEqual(mdi_feat_imp.loc['I_0', 'mean'],
                               0.08214,
                               delta=0.01)
        # Redundant feature
        self.assertAlmostEqual(mdi_feat_imp.loc['R_0', 'mean'],
                               0.06511,
                               delta=0.01)
        # Noisy feature
        self.assertAlmostEqual(mdi_feat_imp.loc['N_0', 'mean'],
                               0.02229,
                               delta=0.01)

        # MDA(log_loss) assertions
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['I_1', 'mean'],
                               0.65522,
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['R_0', 'mean'],
                               0.00332,
                               delta=0.1)

        # MDA(f1) assertions
        self.assertAlmostEqual(mda_feat_imp_f1.loc['I_1', 'mean'],
                               0.47751,
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_f1.loc['I_2', 'mean'],
                               0.33617,
                               delta=0.1)

        # SFI(log_loss) assertions
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['I_0', 'mean'],
                               -6.39442,
                               delta=0.1)
        self.assertAlmostEqual(sfi_feat_imp_log_loss.loc['R_0', 'mean'],
                               -5.04315,
                               delta=0.1)

        # SFI(f1) assertions
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['I_0', 'mean'],
                               0.48915,
                               delta=0.1)
        self.assertAlmostEqual(sfi_feat_imp_f1.loc['I_1', 'mean'],
                               0.78443,
                               delta=0.1)

        # Clustered MDI assertions
        self.assertAlmostEqual(clustered_mdi.loc['R_0', 'mean'],
                               0.01912,
                               delta=0.1)
        self.assertAlmostEqual(clustered_mdi.loc['I_0', 'mean'],
                               0.06575,
                               delta=0.1)

        # Clustered MDA (log_loss) assertions
        self.assertAlmostEqual(clustered_mda.loc['I_0', 'mean'],
                               0.04154,
                               delta=0.1)
        self.assertAlmostEqual(clustered_mda.loc['R_0', 'mean'],
                               0.02940,
                               delta=0.1)

        # CFI with one cluster per feature should match the plain MDI and MDA results
        self.assertAlmostEqual(mdi_feat_imp.loc['I_1', 'mean'],
                               mdi_cfi_single.loc['I_1', 'mean'],
                               delta=0.1)
        self.assertAlmostEqual(mdi_feat_imp.loc['R_0', 'mean'],
                               mdi_cfi_single.loc['R_0', 'mean'],
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['I_1', 'mean'],
                               mda_cfi_single.loc['I_1', 'mean'],
                               delta=0.1)
        self.assertAlmostEqual(mda_feat_imp_log_loss.loc['R_0', 'mean'],
                               mda_cfi_single.loc['R_0', 'mean'],
                               delta=0.1)
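
    # The docstring above mentions a plot function that the assertions never
    # exercise. A minimal sketch of plotting an importance frame (the tests
    # read a 'mean' column; the 'std' error bars are an assumption), using
    # matplotlib directly rather than any library-specific helper.
    @staticmethod
    def plot_importance(importance_df):
        import matplotlib.pyplot as plt

        ordered = importance_df.sort_values('mean')
        ordered['mean'].plot(kind='barh',
                             xerr=ordered.get('std'),
                             figsize=(10, 6))
        plt.tight_layout()
        plt.show()
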
    def test_get_feature_clusters(self):
        """
        Test get_feature_clusters arguments.
        """
        # Test the different dependence metrics.
        # Run ONC on the codependence metrics (Variation of Information,
        # Mutual Information and distance correlation scores): ONC decides
        # the number of clusters and builds them from the given dependence
        # matrix.
        onc_clusters_VI = get_feature_clusters(
            self.X,
            dependence_metric='information_variation',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None)
        onc_clusters_MI = get_feature_clusters(
            self.X,
            dependence_metric='mutual_information',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None)
        onc_clusters_DC = get_feature_clusters(
            self.X,
            dependence_metric='distance_correlation',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None)
        # Hierarchical clustering on a codependence metric
        h_clusters = get_feature_clusters(
            self.X,
            dependence_metric='information_variation',
            distance_metric='angular',
            linkage_method='single',
            n_clusters=2)
        # Hierarchical auto clustering
        h_clusters_auto = get_feature_clusters(self.X,
                                               dependence_metric='linear',
                                               distance_metric='angular',
                                               linkage_method='single',
                                               n_clusters=None,
                                               critical_threshold=0.2)
        # Test the optimal number of clusters and _check_for_low_silhouette_scores.
        # The test dataset has no features with a low silhouette score, so we
        # engineer a feature with a near-zero silhouette score and set the
        # critical threshold (0.2) above it. We also need features that trigger
        # the low-degrees-of-freedom condition, so we duplicate existing
        # columns in the dataset.
        self.X['R_5c'] = self.X['R_5']  # Introduces low degrees of freedom in the regressor
        self.X['R_1c'] = self.X['R_1']  # Triggers the LinAlgError exception (singular matrix)
        clustered_subsets_distance = get_feature_clusters(
            self.X,
            dependence_metric='linear',
            distance_metric=None,
            linkage_method=None,
            n_clusters=None,
            critical_threshold=0.2)

        # Assertions
        # Codependence-metric clusters found via ONC
        self.assertAlmostEqual(len(onc_clusters_VI), 7, delta=1)
        self.assertAlmostEqual(len(onc_clusters_MI), 6, delta=1)
        self.assertAlmostEqual(len(onc_clusters_DC), 6, delta=1)
        # Output clusters must be 2 since n_clusters was specified as 2
        self.assertEqual(len(h_clusters), 2)
        # The ONC should detect somewhere around 5 clusters
        self.assertAlmostEqual(len(h_clusters_auto), 5, delta=1)
        self.assertAlmostEqual(len(clustered_subsets_distance), 5, delta=1)
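
    # Hedged illustration of the output shape the length checks above rely
    # on: judging by the `feature_subset_single = [[x] for x in self.X.columns]`
    # usage, get_feature_clusters appears to return a partition of the columns
    # as a list of feature-name lists. This extra check is an assumption-
    # labelled sketch, not part of the original suite.
    def test_clusters_partition_the_features(self):
        clusters = get_feature_clusters(self.X,
                                        dependence_metric='linear',
                                        distance_metric='angular',
                                        linkage_method='single',
                                        n_clusters=2)
        # Every column should appear in exactly one cluster
        flattened = [feature for subset in clusters for feature in subset]
        self.assertCountEqual(flattened, list(self.X.columns))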