def test_value_error_raise(self):
    """
    Test that get_feature_clusters raises ValueError for an invalid
    number-of-clusters argument.
    """
    # Number of clusters (41) larger than the number of features in self.X,
    # which must be rejected. (Dropped the redundant int(41) cast.)
    with self.assertRaises(ValueError):
        get_feature_clusters(self.X, dependence_metric='linear',
                             distance_metric='angular',
                             linkage_method='single', n_clusters=41)
def test_get_feature_clusters_with_threshold(self):
    """
    Test get_feature_clusters with a fixed cluster count, hierarchical auto
    clustering, and the low-silhouette-score / low-degrees-of-freedom path.

    NOTE(review): this method was originally named
    ``test_get_feature_clusters``, the same name as a later method in this
    class; the later definition shadowed it so this test never ran. Renamed
    so the test runner actually collects it.
    """
    # Clustering driven by a non-linear dependence metric with a fixed
    # number of clusters.
    clustered_subsets = get_feature_clusters(
        self.X, dependence_metric='information_variation',
        distance_metric='angular', linkage_method='single', n_clusters=2)
    # Hierarchical auto clustering: the number of clusters is selected
    # automatically (ONC).
    clustered_subsets_ha = get_feature_clusters(
        self.X, dependence_metric='linear', distance_metric='angular',
        linkage_method='single', n_clusters=None, critical_threshold=0.2)
    # To exercise _check_for_low_silhouette_scores the test dataset needs a
    # feature with a near-zero silhouette score and a threshold (0.2) above
    # it. Duplicated columns also trigger the low-degrees-of-freedom
    # condition and the LinAlgError (singular matrix) branch.
    self.X['R_5c'] = self.X['R_5']  # introduces low DF in the regressor
    self.X['R_1c'] = self.X['R_1']  # triggers LinAlgError (singular matrix)
    clustered_subsets_distance = get_feature_clusters(
        self.X, dependence_metric='linear', distance_metric=None,
        linkage_method=None, n_clusters=None, critical_threshold=0.2)
    # n_clusters=2 must yield exactly two clusters.
    self.assertEqual(len(clustered_subsets), 2)
    # ONC should detect somewhere around 5 clusters.
    self.assertAlmostEqual(len(clustered_subsets_ha), 5, delta=1)
    self.assertAlmostEqual(len(clustered_subsets_distance), 5, delta=1)
def test_feature_importance(self):
    """
    Test the feature importance routines: MDI, MDA, SFI and their
    clustered (CFI) variants.
    """
    # Clustered subsets for CFI, with the number of clusters chosen by ONC.
    onc_subsets = get_feature_clusters(
        self.X, dependence_metric='linear', distance_metric=None,
        linkage_method=None, n_clusters=None)
    # With one cluster per feature, CFI must reduce to plain MDI / MDA.
    singleton_subsets = [[col] for col in self.X.columns]

    # MDI feature importance: plain, clustered, and singleton-clustered.
    mdi_imp = mean_decrease_impurity(self.fit_clf, self.X.columns)
    mdi_clustered = mean_decrease_impurity(
        self.fit_clf, self.X.columns, clustered_subsets=onc_subsets)
    mdi_singleton = mean_decrease_impurity(
        self.fit_clf, self.X.columns, clustered_subsets=singleton_subsets)

    # MDA feature importance under two scoring functions.
    mda_log_loss = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        sample_weight_train=np.ones((self.X.shape[0],)),
        sample_weight_score=np.ones((self.X.shape[0],)),
        scoring=log_loss)
    mda_f1 = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen, scoring=f1_score)
    # Clustered MDA feature importance.
    mda_clustered = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        clustered_subsets=onc_subsets)
    mda_singleton = mean_decrease_accuracy(
        self.bag_clf, self.X, self.y, self.cv_gen,
        clustered_subsets=singleton_subsets)

    # SFI feature importance under two scoring functions.
    sfi_log_loss = single_feature_importance(
        self.bag_clf, self.X, self.y, cv_gen=self.cv_gen,
        sample_weight_train=np.ones((self.X.shape[0],)),
        scoring=log_loss)
    sfi_f1 = single_feature_importance(
        self.bag_clf, self.X, self.y, cv_gen=self.cv_gen,
        sample_weight_score=np.ones((self.X.shape[0],)),
        scoring=f1_score)

    # MDI importances are normalized, so they must sum to one.
    self.assertAlmostEqual(mdi_imp['mean'].sum(), 1, delta=0.001)
    # MDI: most informative, redundant and noisy features.
    for feature, expected in [('I_1', 0.48058), ('I_0', 0.08214),
                              ('R_0', 0.06511), ('N_0', 0.02229)]:
        self.assertAlmostEqual(mdi_imp.loc[feature, 'mean'], expected,
                               delta=0.01)
    # MDA (log_loss) assertions.
    for feature, expected in [('I_1', 0.65522), ('R_0', 0.00332)]:
        self.assertAlmostEqual(mda_log_loss.loc[feature, 'mean'], expected,
                               delta=0.1)
    # MDA (f1) assertions.
    for feature, expected in [('I_1', 0.47751), ('I_2', 0.33617)]:
        self.assertAlmostEqual(mda_f1.loc[feature, 'mean'], expected,
                               delta=0.1)
    # SFI (log_loss) assertions.
    for feature, expected in [('I_0', -6.39442), ('R_0', -5.04315)]:
        self.assertAlmostEqual(sfi_log_loss.loc[feature, 'mean'], expected,
                               delta=0.1)
    # SFI (f1) assertions.
    for feature, expected in [('I_0', 0.48915), ('I_1', 0.78443)]:
        self.assertAlmostEqual(sfi_f1.loc[feature, 'mean'], expected,
                               delta=0.1)
    # Clustered MDI assertions.
    for feature, expected in [('R_0', 0.01912), ('I_0', 0.06575)]:
        self.assertAlmostEqual(mdi_clustered.loc[feature, 'mean'], expected,
                               delta=0.1)
    # Clustered MDA (log_loss) assertions.
    for feature, expected in [('I_0', 0.04154), ('R_0', 0.02940)]:
        self.assertAlmostEqual(mda_clustered.loc[feature, 'mean'], expected,
                               delta=0.1)
    # CFI with one cluster per feature must match plain MDI and MDA.
    for feature in ('I_1', 'R_0'):
        self.assertAlmostEqual(mdi_imp.loc[feature, 'mean'],
                               mdi_singleton.loc[feature, 'mean'], delta=0.1)
    for feature in ('I_1', 'R_0'):
        self.assertAlmostEqual(mda_log_loss.loc[feature, 'mean'],
                               mda_singleton.loc[feature, 'mean'], delta=0.1)
def test_get_feature_clusters(self):
    """
    Test get_feature_clusters across dependence metrics, with both ONC and
    hierarchical clustering, and the low-silhouette-score handling.
    """
    # ONC on codependence metrics (variation of information, mutual
    # information, distance correlation): ONC decides the number of clusters
    # from the given dependence matrix.
    # (Locals renamed from onc_clusters_VI/MI/DC to snake_case.)
    onc_clusters_vi = get_feature_clusters(
        self.X, dependence_metric='information_variation',
        distance_metric=None, linkage_method=None, n_clusters=None)
    onc_clusters_mi = get_feature_clusters(
        self.X, dependence_metric='mutual_information',
        distance_metric=None, linkage_method=None, n_clusters=None)
    onc_clusters_dc = get_feature_clusters(
        self.X, dependence_metric='distance_correlation',
        distance_metric=None, linkage_method=None, n_clusters=None)
    # Hierarchical clustering on a codependence metric with a fixed number
    # of clusters.
    h_clusters = get_feature_clusters(
        self.X, dependence_metric='information_variation',
        distance_metric='angular', linkage_method='single', n_clusters=2)
    # Hierarchical auto clustering: the number of clusters is selected
    # automatically.
    h_clusters_auto = get_feature_clusters(
        self.X, dependence_metric='linear', distance_metric='angular',
        linkage_method='single', n_clusters=None, critical_threshold=0.2)
    # To exercise _check_for_low_silhouette_scores the test dataset needs a
    # feature with a near-zero silhouette score and a threshold (0.2) above
    # it. Duplicated columns also trigger the low-degrees-of-freedom
    # condition and the LinAlgError (singular matrix) branch.
    self.X['R_5c'] = self.X['R_5']  # introduces low DF in the regressor
    self.X['R_1c'] = self.X['R_1']  # triggers LinAlgError (singular matrix)
    clustered_subsets_distance = get_feature_clusters(
        self.X, dependence_metric='linear', distance_metric=None,
        linkage_method=None, n_clusters=None, critical_threshold=0.2)
    # Codependence-metric clusterings via ONC.
    self.assertAlmostEqual(len(onc_clusters_vi), 7, delta=1)
    self.assertAlmostEqual(len(onc_clusters_mi), 6, delta=1)
    self.assertAlmostEqual(len(onc_clusters_dc), 6, delta=1)
    # Exactly two clusters, since n_clusters was specified as 2.
    self.assertEqual(len(h_clusters), 2)
    # ONC should detect somewhere around 5 clusters.
    self.assertAlmostEqual(len(h_clusters_auto), 5, delta=1)
    self.assertAlmostEqual(len(clustered_subsets_distance), 5, delta=1)