def test_sensitivity_based_feature_selection(self, clf):
    """Check that sensitivity-based selection drops a fixed number of features.

    Builds a sensitivity analyzer from `clf`, trains a
    SensitivityBasedFeatureSelection on the dataset returned by
    ``self.get_data()`` and applies it to that same dataset.  Verifies that
    the number of features of the input dataset is unchanged, that exactly
    ``Nremove`` features are missing from the result, and that the stored
    sensitivity has as many features as the original dataset.
    """
    # number of features to remove; it also parametrizes the selector below
    # so that the selector and the assertions cannot drift apart
    Nremove = 2

    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    fe = SensitivityBasedFeatureSelection(
        sens_ana,
        feature_selector=FixedNElementTailSelector(Nremove),
        enable_ca=["sensitivity", "selected_ids"])

    data = self.get_data()
    data_nfeatures = data.nfeatures

    fe.train(data)
    resds = fe(data)

    # fail if orig datasets are changed
    self.failUnless(data.nfeatures == data_nfeatures)

    # check that exactly Nremove features got removed
    self.failUnlessEqual(
        data.nfeatures, resds.nfeatures + Nremove,
        msg="We had to remove exactly %d features" % Nremove)

    self.failUnlessEqual(
        fe.ca.sensitivity.nfeatures, data_nfeatures,
        msg="Sensitivity have to have # of features equal to original")
def test_sensitivity_based_feature_selection(self, clf):
    """Check sensitivity-based selection on a training/testing dataset pair.

    Builds a sensitivity analyzer from `clf` and a
    SensitivityBasedFeatureSelection around it, then calls the selection on
    the datasets returned by ``self.get_data()`` and ``self.get_data_t()``.
    Verifies that the feature counts of both inputs are unchanged, that
    exactly ``Nremove`` features are missing from both results, that the
    stored sensitivity has as many features as the original training
    dataset, and that the number of selected ids matches the result.
    """
    # number of features to remove; it also parametrizes the selector below
    # so that the selector and the assertions cannot drift apart
    Nremove = 2

    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    fe = SensitivityBasedFeatureSelection(
        sens_ana,
        feature_selector=FixedNElementTailSelector(Nremove),
        enable_ca=["sensitivity", "selected_ids"])

    wdata = self.get_data()
    tdata = self.get_data_t()

    # XXX for now convert to numeric labels, but should better be taken
    # care of during clf refactoring
    am = AttributeMap()
    wdata.targets = am.to_numeric(wdata.targets)
    tdata.targets = am.to_numeric(tdata.targets)

    wdata_nfeatures = wdata.nfeatures
    tdata_nfeatures = tdata.nfeatures

    sdata, stdata = fe(wdata, tdata)

    # fail if orig datasets are changed
    self.failUnless(wdata.nfeatures == wdata_nfeatures)
    self.failUnless(tdata.nfeatures == tdata_nfeatures)

    # check that exactly Nremove features got removed from both datasets
    self.failUnlessEqual(
        wdata.nfeatures, sdata.nfeatures + Nremove,
        msg="We had to remove exactly %d features" % Nremove)

    self.failUnlessEqual(
        tdata.nfeatures, stdata.nfeatures + Nremove,
        msg="We had to remove exactly %d features in testing as well"
            % Nremove)

    self.failUnlessEqual(
        fe.ca.sensitivity.nfeatures, wdata_nfeatures,
        msg="Sensitivity have to have # of features equal to original")

    self.failUnlessEqual(
        len(fe.ca.selected_ids), sdata.nfeatures,
        msg="# of selected features must be equal the one in the result dataset")
def test_rfe(self, clf):
    """Run RFE with a transfer-error measure and check its bookkeeping.

    The sensitivity analyzer and the TransferError are both built from the
    same `clf`.  RFE is called on the datasets returned by
    ``self.get_data()`` and ``self.get_data_t()``; afterwards the feature
    counts of the inputs, the size of the selected feature set and the
    recorded ``errors``/``nfeatures``/``history`` are verified.
    """
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    analyzer = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    transfer_error = TransferError(clf)

    # because the clf is already trained when computing the sensitivity
    # map, prevent retraining for transfer error calculation
    # Use absolute of the svm weights as sensitivity
    rfe = RFE(analyzer,
              transfer_error,
              feature_selector=FixedNElementTailSelector(1),
              train_clf=False)

    wdata = self.get_data()
    n_train_features = wdata.nfeatures
    tdata = self.get_data_t()
    n_test_features = tdata.nfeatures

    sdata, stdata = rfe(wdata, tdata)

    # fail if orig datasets are changed
    self.failUnless(wdata.nfeatures == n_train_features)
    self.failUnless(tdata.nfeatures == n_test_features)

    # check that the features set with the least error is selected;
    # without any recorded error nothing should have been removed
    if len(rfe.ca.errors):
        n_dropped = np.array(rfe.ca.errors).argmin()
    else:
        n_dropped = 0
    self.failUnless(sdata.nfeatures == n_train_features - n_dropped)

    # silly check if nfeatures is in decreasing order
    ascending = np.array(rfe.ca.nfeatures).copy()
    ascending.sort()
    descending = ascending[::-1]
    self.failUnless((descending == rfe.ca.nfeatures).all())

    # check if history has elements for every step
    expected_steps = set(range(len(np.array(rfe.ca.errors))))
    self.failUnless(set(rfe.ca.history) == expected_steps)

    # Last (the largest number) can be present multiple times even
    # if we remove 1 feature at a time -- just need to stop well
    # in advance when we have more than 1 feature left ;)
    last_step = max(rfe.ca.history)
    survivors = np.where(rfe.ca.history == last_step)[0]
    self.failUnless(rfe.ca.nfeatures[-1] == len(survivors))
def test_rfe(self, clf):
    """Run RFE with a ProxyMeasure-based error and check its bookkeeping.

    The sensitivity analyzer and the ProxyMeasure are both built from the
    same `clf`.  RFE is trained on and applied to the dataset returned by
    ``self.get_data()``; afterwards the feature count of the input, the
    size of the selected feature set and the recorded
    ``errors``/``nfeatures``/``history`` are verified.
    """
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    analyzer = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    error_measure = ProxyMeasure(
        clf, postproc=BinaryFxNode(mean_mismatch_error, 'targets'))

    # because the clf is already trained when computing the sensitivity
    # map, prevent retraining for transfer error calculation
    # Use absolute of the svm weights as sensitivity
    rfe = RFE(analyzer,
              error_measure,
              Splitter('train'),
              fselector=FixedNElementTailSelector(1),
              train_pmeasure=False)

    data = self.get_data()
    n_features = data.nfeatures

    rfe.train(data)
    resds = rfe(data)

    # fail if orig datasets are changed
    self.failUnless(data.nfeatures == n_features)

    # check that the features set with the least error is selected;
    # without any recorded error nothing should have been removed
    if len(rfe.ca.errors):
        n_dropped = np.array(rfe.ca.errors).argmin()
    else:
        n_dropped = 0
    self.failUnless(resds.nfeatures == n_features - n_dropped)

    # silly check if nfeatures is in decreasing order
    ascending = np.array(rfe.ca.nfeatures).copy()
    ascending.sort()
    descending = ascending[::-1]
    self.failUnless((descending == rfe.ca.nfeatures).all())

    # check if history has elements for every step
    expected_steps = set(range(len(np.array(rfe.ca.errors))))
    self.failUnless(set(rfe.ca.history) == expected_steps)

    # Last (the largest number) can be present multiple times even
    # if we remove 1 feature at a time -- just need to stop well
    # in advance when we have more than 1 feature left ;)
    last_step = max(rfe.ca.history)
    survivors = np.where(rfe.ca.history == last_step)[0]
    self.failUnless(rfe.ca.nfeatures[-1] == len(survivors))
def test_sensitivity_based_feature_selection(self, clf):
    """Check sensitivity-based selection on a training/testing dataset pair.

    Builds a sensitivity analyzer from `clf` and a
    SensitivityBasedFeatureSelection around it, then calls the selection on
    the datasets returned by ``self.get_data()`` and ``self.get_data_t()``.
    Verifies that the feature counts of both inputs are unchanged, that
    exactly ``Nremove`` features are missing from both results, that the
    stored sensitivity has as many features as the original training
    dataset, and that the number of selected ids matches the result.
    """
    # number of features to remove; it also parametrizes the selector below
    # so that the selector and the assertions cannot drift apart
    Nremove = 2

    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    fe = SensitivityBasedFeatureSelection(
        sens_ana,
        feature_selector=FixedNElementTailSelector(Nremove),
        enable_ca=["sensitivity", "selected_ids"])

    wdata = self.get_data()
    tdata = self.get_data_t()

    # XXX for now convert to numeric labels, but should better be taken
    # care of during clf refactoring
    am = AttributeMap()
    wdata.targets = am.to_numeric(wdata.targets)
    tdata.targets = am.to_numeric(tdata.targets)

    wdata_nfeatures = wdata.nfeatures
    tdata_nfeatures = tdata.nfeatures

    sdata, stdata = fe(wdata, tdata)

    # fail if orig datasets are changed
    self.failUnless(wdata.nfeatures == wdata_nfeatures)
    self.failUnless(tdata.nfeatures == tdata_nfeatures)

    # check that exactly Nremove features got removed from both datasets
    self.failUnlessEqual(
        wdata.nfeatures, sdata.nfeatures + Nremove,
        msg="We had to remove exactly %d features" % Nremove)

    self.failUnlessEqual(
        tdata.nfeatures, stdata.nfeatures + Nremove,
        msg="We had to remove exactly %d features in testing as well"
            % Nremove)

    self.failUnlessEqual(
        fe.ca.sensitivity.nfeatures, wdata_nfeatures,
        msg="Sensitivity have to have # of features equal to original")

    self.failUnlessEqual(
        len(fe.ca.selected_ids), sdata.nfeatures,
        msg="# of selected features must be equal the one in the result dataset")
descr='skl.LassoLARS()') regrswh += [_lars, _lasso_lars] clfswh += [RegressionAsClassifier(_lars, descr="skl.LARS_C()"), RegressionAsClassifier(_lasso_lars, descr="skl.LassoLARS_C()")] # kNN clfswh += kNN(k=5, descr="kNN(k=5)") clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( SMLRWeights(SMLR(lm=1.0, implementation="C"), postproc=maxofabs_sample()), RangeElementSelector(mode='select')), descr="kNN on SMLR(lm=1) non-0") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( OneWayAnova(), FractionTailSelector(0.05, mode='select', tail='upper')), descr="kNN on 5%(ANOVA)") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection(
def test_analyzer_with_split_classifier(self, clfds):
    """Test sensitivity analyzers of a SplitClassifier.

    `clfds` is a ``(classifier, dataset)`` pair.  The classifier is wrapped
    into a SplitClassifier using an NFoldPartitioner limited to 3 splits,
    its sensitivity analyzer is run on the dataset, and the shape and the
    'splits'/'targets' sample attributes of the result are verified.
    Unless the classifier is skipped, and only if the 'labile' option of
    the 'tests' config section is enabled, it additionally checks the
    confusion statistics and that the strongest sensitivities fall onto
    the non-bogus features.
    """
    clf, ds = clfds             # unroll the tuple

    # We need to skip some LARSes here
    _sclf = str(clf)
    if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
        # ADD KnownToFail thingie from NiPy
        return

    # To don't waste too much time testing lets limit to 3 splits
    nsplits = 3
    partitioner = NFoldPartitioner(count=nsplits)
    mclf = SplitClassifier(clf=clf,
                           partitioner=partitioner,
                           enable_ca=['training_stats', 'stats'])
    # postproc=absolute_features() is deliberately not passed here
    sana = mclf.get_sensitivity_analyzer(enable_ca=["sensitivities"])

    ulabels = ds.uniquetargets
    nlabels = len(ulabels)
    # Can't rely on splitcfg since count-limit is done in __call__
    assert nsplits == len(list(partitioner.generate(ds)))
    sens = sana(ds)

    # It should return either ...
    #  nlabels * nsplits
    req_nsamples = [nlabels * nsplits]
    if nlabels == 2:
        # A single sensitivity in case of binary
        req_nsamples += [nsplits]
    else:
        # and for pairs in case of multiclass; nlabels * (nlabels - 1) is
        # always even, so floor division is exact and keeps the count an int
        req_nsamples += [(nlabels * (nlabels - 1) // 2) * nsplits]
        # and for 1-vs-1 embedded within Multiclass operating on
        # pairs (e.g. SMLR)
        req_nsamples += [req_nsamples[-1] * 2]

        # Also for regression_based -- they can do multiclass
        # but only 1 sensitivity is provided
        if 'regression_based' in clf.__tags__:
            req_nsamples += [nsplits]

    # # of features should correspond
    self.failUnlessEqual(sens.shape[1], ds.nfeatures)
    # # of samples/sensitivities should also be reasonable
    self.failUnless(sens.shape[0] in req_nsamples)

    # Check if labels are present
    self.failUnless('splits' in sens.sa)
    self.failUnless('targets' in sens.sa)
    # should be 1D -- otherwise dtype object
    self.failUnless(sens.sa.targets.ndim == 1)

    sens_ulabels = sens.sa['targets'].unique
    # Some labels might be pairs(tuples) so ndarray would be of
    # dtype object and we would need to get them all.
    # dtypes are compared by equality, not identity.
    if sens_ulabels.dtype == np.dtype('object'):
        sens_ulabels = np.unique(
            reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

    assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

    # lets go through all sensitivities and see if we selected the right
    # features
    #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
    if '5%' in clf.descr \
           or (nlabels > 2 and 'regression_based' in clf.__tags__):
        # Some meta classifiers (5% of ANOVA) are too harsh ;-)
        # if we get less than 2 features with on-zero sensitivities we
        # cannot really test
        # Also -- regression based classifiers performance for multiclass
        # is expected to suck in general
        return

    if cfg.getboolean('tests', 'labile', default='yes'):
        for conf_matrix in [sana.clf.ca.training_stats] \
                          + sana.clf.ca.stats.matrices:
            self.failUnless(
                conf_matrix.percent_correct >= 70,
                msg="We must have trained on each one more or "
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct, nlabels))

    # Since now we have per split and possibly per label -- lets just find
    # mean per each feature per label across splits
    sensm = FxMapper('samples', lambda x: np.sum(x),
                     uattrs=['targets']).forward(sens)
    sensgm = maxofabs_sample().forward(sensm)    # global max of abs of means

    assert_equal(sensgm.shape[0], 1)
    assert_equal(sensgm.shape[1], ds.nfeatures)

    selected = FixedNElementTailSelector(
        len(ds.a.bogus_features))(sensgm.samples[0])

    # NOTE(review): the per-label checks below are assumed to belong under
    # the 'labile' guard together with the global check -- confirm
    if cfg.getboolean('tests', 'labile', default='yes'):

        self.failUnlessEqual(
            set(selected), set(ds.a.nonbogus_features),
            msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s"
                % (selected, ds.a.nonbogus_features))

        # Now test each one per label
        # TODO: collect all failures and spit them out at once --
        #       that would make it easy to see if the sensitivity
        #       just has incorrect order of labels assigned
        for sens1 in sensm:
            labels1 = sens1.targets  # labels (1) for this sensitivity
            lndim = labels1.ndim
            label = labels1[0]      # current label

            # XXX whole lndim comparison should be gone after
            #     things get fixed and we arrive here with a tuple!
            if lndim == 1:  # just a single label
                self.failUnless(label in ulabels)

                ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                # should have just 1 feature for the label
                self.failUnlessEqual(len(ilabel_all), 1)
                ilabel = ilabel_all[0]

                maxsensi = np.argmax(sens1)  # index of max sensitivity
                self.failUnlessEqual(
                    maxsensi, ilabel,
                    "Maximal sensitivity for %s was found in %i whenever"
                    " original feature was %i for nonbogus features %s"
                    % (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
            elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                # we should have highest (in abs) coefficients in
                # those two labels
                maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                ilabel2 = [np.where(ds.fa.nonbogus_targets == lbl)[0][0]
                           for lbl in label]
                self.failUnlessEqual(
                    set(maxsensi2), set(ilabel2),
                    "Maximal sensitivity for %s was found in %s whenever"
                    " original features were %s for nonbogus features %s"
                    % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                # Now test for the sign of each one in pair ;) in
                # all binary problems L1 (-1) -> L2(+1), then
                # weights for L2 should be positive.  to test for
                # L1 -- invert the sign
                # We already know (if we haven't failed in previous test),
                # that those 2 were the strongest -- so check only signs
                self.failUnless(
                    sens1.samples[0, ilabel2[0]] < 0,
                    "With %i classes in pair %s got feature %i for %r >= 0"
                    % (nlabels, label, ilabel2[0], label[0]))
                self.failUnless(
                    sens1.samples[0, ilabel2[1]] > 0,
                    "With %i classes in pair %s got feature %i for %r <= 0"
                    % (nlabels, label, ilabel2[1], label[1]))
            else:
                # yoh could be wrong at this assumption... time will show
                self.fail("Got unknown number labels per sensitivity: %s."
                          " Should be either a single label or a pair"
                          % labels1)
# glmnet from R via RPy if externals.exists('glmnet'): from mvpa.clfs.glmnet import GLMNET_C, GLMNET_R clfswh += GLMNET_C(descr="GLMNET_C()") regrswh += GLMNET_R(descr="GLMNET_R()") # kNN clfswh += kNN(k=5, descr="kNN(k=5)") clfswh += kNN(k=5, voting='majority', descr="kNN(k=5, voting='majority')") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( SMLRWeights(SMLR(lm=1.0, implementation="C"), postproc=maxofabs_sample()), RangeElementSelector(mode='select')), descr="kNN on SMLR(lm=1) non-0") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( OneWayAnova(), FractionTailSelector(0.05, mode='select', tail='upper')), descr="kNN on 5%(ANOVA)") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection(
def test_analyzer_with_split_classifier(self, clfds):
    """Test sensitivity analyzers of a SplitClassifier.

    `clfds` is a ``(classifier, dataset)`` pair.  The classifier is wrapped
    into a SplitClassifier using an NFoldSplitter limited to 3 splits,
    its sensitivity analyzer is run on the dataset, and the shape and the
    'splits'/'targets' sample attributes of the result are verified.
    Unless the classifier is skipped, and only if the 'labile' option of
    the 'tests' config section is enabled, it additionally checks the
    confusion matrices and that the strongest sensitivities fall onto
    the non-bogus features.
    """
    clf, ds = clfds             # unroll the tuple

    # We need to skip some LARSes here
    _sclf = str(clf)
    if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
        # ADD KnownToFail thingie from NiPy
        return

    # To don't waste too much time testing lets limit to 3 splits
    nsplits = 3
    splitter = NFoldSplitter(count=nsplits)
    mclf = SplitClassifier(clf=clf,
                           splitter=splitter,
                           enable_ca=['training_confusion', 'confusion'])
    # postproc=absolute_features() is deliberately not passed here
    sana = mclf.get_sensitivity_analyzer(enable_ca=["sensitivities"])

    ulabels = ds.uniquetargets
    nlabels = len(ulabels)
    # Can't rely on splitcfg since count-limit is done in __call__
    assert nsplits == len(list(splitter(ds)))
    sens = sana(ds)

    # It should return either ...
    #  nlabels * nsplits
    req_nsamples = [nlabels * nsplits]
    if nlabels == 2:
        # A single sensitivity in case of binary
        req_nsamples += [nsplits]
    else:
        # and for pairs in case of multiclass; nlabels * (nlabels - 1) is
        # always even, so floor division is exact and keeps the count an int
        req_nsamples += [(nlabels * (nlabels - 1) // 2) * nsplits]
        # and for 1-vs-1 embedded within Multiclass operating on
        # pairs (e.g. SMLR)
        req_nsamples += [req_nsamples[-1] * 2]

        # Also for regression_based -- they can do multiclass
        # but only 1 sensitivity is provided
        if 'regression_based' in clf.__tags__:
            req_nsamples += [nsplits]

    # # of features should correspond
    self.failUnlessEqual(sens.shape[1], ds.nfeatures)
    # # of samples/sensitivities should also be reasonable
    self.failUnless(sens.shape[0] in req_nsamples)

    # Check if labels are present
    self.failUnless('splits' in sens.sa)
    self.failUnless('targets' in sens.sa)
    # should be 1D -- otherwise dtype object
    self.failUnless(sens.sa.targets.ndim == 1)

    sens_ulabels = sens.sa['targets'].unique
    # Some labels might be pairs(tuples) so ndarray would be of
    # dtype object and we would need to get them all.
    # dtypes are compared by equality, not identity.
    if sens_ulabels.dtype == np.dtype('object'):
        sens_ulabels = np.unique(
            reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

    assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

    # lets go through all sensitivities and see if we selected the right
    # features
    #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
    if '5%' in clf.descr \
           or (nlabels > 2 and 'regression_based' in clf.__tags__):
        # Some meta classifiers (5% of ANOVA) are too harsh ;-)
        # if we get less than 2 features with on-zero sensitivities we
        # cannot really test
        # Also -- regression based classifiers performance for multiclass
        # is expected to suck in general
        return

    if cfg.getboolean('tests', 'labile', default='yes'):
        for conf_matrix in [sana.clf.ca.training_confusion] \
                          + sana.clf.ca.confusion.matrices:
            self.failUnless(
                conf_matrix.percent_correct >= 70,
                msg="We must have trained on each one more or "
                    "less correctly. Got %f%% correct on %d labels" %
                    (conf_matrix.percent_correct, nlabels))

    # Since now we have per split and possibly per label -- lets just find
    # mean per each feature per label across splits
    sensm = FxMapper('samples', lambda x: np.sum(x),
                     uattrs=['targets'])(sens)
    sensgm = maxofabs_sample()(sensm)    # global max of abs of means

    assert_equal(sensgm.shape[0], 1)
    assert_equal(sensgm.shape[1], ds.nfeatures)

    selected = FixedNElementTailSelector(
        len(ds.a.bogus_features))(sensgm.samples[0])

    # NOTE(review): the per-label checks below are assumed to belong under
    # the 'labile' guard together with the global check -- confirm
    if cfg.getboolean('tests', 'labile', default='yes'):

        self.failUnlessEqual(
            set(selected), set(ds.a.nonbogus_features),
            msg="At the end we should have selected the right features. "
                "Chose %s whenever nonbogus are %s"
                % (selected, ds.a.nonbogus_features))

        # Now test each one per label
        # TODO: collect all failures and spit them out at once --
        #       that would make it easy to see if the sensitivity
        #       just has incorrect order of labels assigned
        for sens1 in sensm:
            labels1 = sens1.targets  # labels (1) for this sensitivity
            lndim = labels1.ndim
            label = labels1[0]      # current label

            # XXX whole lndim comparison should be gone after
            #     things get fixed and we arrive here with a tuple!
            if lndim == 1:  # just a single label
                self.failUnless(label in ulabels)

                ilabel_all = np.where(ds.fa.targets == label)[0]
                # should have just 1 feature for the label
                self.failUnlessEqual(len(ilabel_all), 1)
                ilabel = ilabel_all[0]

                maxsensi = np.argmax(sens1)  # index of max sensitivity
                self.failUnlessEqual(
                    maxsensi, ilabel,
                    "Maximal sensitivity for %s was found in %i whenever"
                    " original feature was %i for nonbogus features %s"
                    % (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
            elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                # we should have highest (in abs) coefficients in
                # those two labels
                maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                ilabel2 = [np.where(ds.fa.targets == lbl)[0][0]
                           for lbl in label]
                self.failUnlessEqual(
                    set(maxsensi2), set(ilabel2),
                    "Maximal sensitivity for %s was found in %s whenever"
                    " original features were %s for nonbogus features %s"
                    % (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                # Now test for the sign of each one in pair ;) in
                # all binary problems L1 (-1) -> L2(+1), then
                # weights for L2 should be positive.  to test for
                # L1 -- invert the sign
                # We already know (if we haven't failed in previous test),
                # that those 2 were the strongest -- so check only signs
                self.failUnless(
                    sens1.samples[0, ilabel2[0]] < 0,
                    "With %i classes in pair %s got feature %i for %r >= 0"
                    % (nlabels, label, ilabel2[0], label[0]))
                self.failUnless(
                    sens1.samples[0, ilabel2[1]] > 0,
                    "With %i classes in pair %s got feature %i for %r <= 0"
                    % (nlabels, label, ilabel2[1], label[1]))
            else:
                # yoh could be wrong at this assumption... time will show
                self.fail("Got unknown number labels per sensitivity: %s."
                          " Should be either a single label or a pair"
                          % labels1)
def test_rfe(self, clf):
    """Run several RFE recipes and check the bookkeeping of each.

    All recipes are built around the same `clf`.  Each recipe pairs an RFE
    instance with the dataset it is trained on and applied to; after each
    run the feature count of the input, the position of the minimal error
    and the recorded ``errors``/``nfeatures``/``history`` are verified.
    """
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    analyzer = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    proxy_error = ProxyMeasure(
        clf, postproc=BinaryFxNode(mean_mismatch_error, 'targets'))
    cv_error = CrossValidation(clf, NFoldPartitioner(),
                               errorfx=mean_mismatch_error,
                               postproc=mean_sample())

    split_clf = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    recipes = [
        # because the clf is already trained when computing the sensitivity
        # map, prevent retraining for transfer error calculation
        # Use absolute of the svm weights as sensitivity
        (RFE(analyzer,
             proxy_error,
             Splitter('train'),
             fselector=FixedNElementTailSelector(1),
             train_pmeasure=False),
         self.get_data()),
        # use cross-validation within training to get error for the
        # stopping point but use full training data to derive sensitivity
        (RFE(analyzer,
             cv_error,
             # give the same full dataset to sens_ana and cvmeasure
             Repeater(2),
             fselector=FractionTailSelector(
                 0.70, mode='select', tail='upper'),
             train_pmeasure=True),
         normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5)),
        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        (RFE(split_clf.get_sensitivity_analyzer(
                 postproc=ChainMapper([FxMapper('features', l2_normed),
                                       FxMapper('samples', np.mean),
                                       FxMapper('samples', np.abs)])),
             ConfusionBasedError(split_clf, confusion_state='stats'),
             # we will use the same full cv-training dataset
             Repeater(2),
             fselector=FractionTailSelector(
                 0.50, mode='select', tail='upper'),
             stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
             # we just extract it from existing confusion
             train_pmeasure=False,
             update_sensitivity=True),
         normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                nonbogus_features=[0, 1], snr=1.5)),
    ]

    for rfe, data in recipes:
        # prep data
        # data = datasets['uni2medium']
        n_features = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.failUnless(data.nfeatures == n_features)

        # check that the features set with the least error is selected
        if not len(rfe.ca.errors):
            self.failUnless(resds.nfeatures == n_features)
        else:
            errs = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.failUnless(
                    resds.nfeatures == n_features - errs.argmin())
            else:
                # in this case we can even check if we had actual
                # going down/up trend... although -- why up???
                best = np.argmin(errs)
                self.failUnless(1 < best < len(errs) - 1)

        # silly check if nfeatures is in decreasing order
        ascending = np.array(rfe.ca.nfeatures).copy()
        ascending.sort()
        descending = ascending[::-1]
        self.failUnless((descending == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        expected_steps = set(range(len(np.array(rfe.ca.errors))))
        self.failUnless(set(rfe.ca.history) == expected_steps)

        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        last_step = max(rfe.ca.history)
        survivors = np.where(rfe.ca.history == last_step)[0]
        self.failUnless(rfe.ca.nfeatures[-1] == len(survivors))