def __init__(self,
             fmeasure,
             pmeasure,
             splitter,
             fselector,
             stopping_criterion=None,
             bestdetector=None,
             train_pmeasure=True,
             # XXX should we maybe guard the splitter so we do not end up
             #     with an inappropriate one for the use, i.e. one which
             #     generates more than 2 splits
             # guard_splitter=True,
             **kwargs
             ):
    """Store the components driving the iterative feature selection.

    Parameters
    ----------
    fmeasure : Measure
      Computed for each candidate feature selection. The measure has
      to compute a scalar value.
    pmeasure : Measure
      Compute against a test dataset for each incremental feature
      set.
    splitter : Splitter
      This splitter instance has to generate at least one dataset
      split when called with the input dataset that is used to
      compute the per-feature criterion for feature selection.
    fselector : Functor
      Feature selector, e.g. a `FractionTailSelector` or a
      `FixedNElementTailSelector` instance.
    stopping_criterion : Functor or None
      Given a list of error values it has to return whether the
      criterion is fulfilled. If None (default), a new
      ``NBackHistoryStopCrit(BestDetector())`` is created for this
      instance.
    bestdetector : Functor or None
      Given a list of error values it has to return a boolean that
      signals whether the latest error value is the total minimum.
      If None (default), a new ``BestDetector()`` is created for this
      instance.
    train_pmeasure : bool
      Flag whether `pmeasure` should be trained before computing the
      error. In general this is required, but if `fmeasure` and
      `pmeasure` share and make use of the same classifier it can be
      switched off to save CPU cycles. The flag is only stored here;
      it is consumed by code outside of this constructor.
    **kwargs
      Passed on to `FeatureSelection.__init__`.
    """
    # bases init first
    FeatureSelection.__init__(self, **kwargs)

    # Default helpers are created per instance: default values in a
    # signature are evaluated only once, so building them there would make
    # every instance share (and possibly mutate) the very same objects.
    if stopping_criterion is None:
        stopping_criterion = NBackHistoryStopCrit(BestDetector())
    if bestdetector is None:
        bestdetector = BestDetector()

    self._fmeasure = fmeasure
    self._pmeasure = pmeasure
    self._splitter = splitter
    self._fselector = fselector
    self._stopping_criterion = stopping_criterion
    self._bestdetector = bestdetector
    self._train_pmeasure = train_pmeasure
def test_best_detector(self):
    """Check `BestDetector` verdicts for a set of error histories."""
    detector = BestDetector()
    default_cases = (
        ([], False),            # for empty history -- no best
        ([1], True),            # a single entry is the best
        ([1, 0.9, 0.8], True),  # the last entry is the minimal one
    )
    for history, expected in default_cases:
        self.assertEqual(detector(history), expected,
                         msg='history=%r' % (history,))

    # test for alternative func
    detector = BestDetector(func=max)
    rising = [0.8, 0.9, 1.0]
    max_cases = (
        (list(rising), True),
        (rising + [0.9] * 9, False),
        (rising + [0.9] * 10, False),
    )
    for history, expected in max_cases:
        self.assertEqual(detector(history), expected,
                         msg='history=%r' % (history,))

    # test to detect earliest and latest minimum: the minimal value 1
    # occurs several times, the last time at the very end
    repeated_minimum = (3, 2, 1, 1, 1, 2, 1)
    detector = BestDetector(lastminimum=True)
    self.assertEqual(detector(list(repeated_minimum)), True)
    detector = BestDetector()
    self.assertEqual(detector(list(repeated_minimum)), False)
def test_n_back_history_stop_crit(self):
    """Test stopping criterion"""
    criterion = NBackHistoryStopCrit()
    # for empty history -- no best but just go
    self.assertEqual(criterion([]), False)

    falling = [1, 0.9, 0.8]
    # one entry short of `steps` after the minimal one -- should not stop
    almost_there = falling + [0.9] * (criterion.steps - 1)
    self.assertEqual(criterion(almost_there), False)
    # exactly `steps` entries after the minimal one -- should stop
    enough = falling + [0.9] * criterion.steps
    self.assertEqual(criterion(enough), True)

    # test for alternative func
    criterion = NBackHistoryStopCrit(BestDetector(func=max))
    rising = [0.8, 0.9, 1.0]
    for trailing, expected in ((9, False), (10, True)):
        self.assertEqual(criterion(rising + [0.9] * trailing), expected,
                         msg='trailing=%r' % (trailing,))

    # test to detect earliest and latest minimum
    repeated_minimum = (3, 2, 1, 1, 1, 2, 1)
    criterion = NBackHistoryStopCrit(BestDetector(lastminimum=True))
    self.assertEqual(criterion(list(repeated_minimum)), False)
    criterion = NBackHistoryStopCrit(steps=4)
    self.assertEqual(criterion(list(repeated_minimum)), True)
def test_rfe_sensmap():
    """Smoke test for sensitivities of an RFE-based classifier.

    Compares per-fold results collected manually against the ones obtained
    via `CrossValidation` and `RepeatedMeasure`.  Always ends by raising
    `SkipTest` because of a known failure for repeated measures.

    See http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    """
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample, maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import (
        FractionTailSelector, BestDetector, NBackHistoryStopCrit)
    from mvpa2.datasets import vstack
    from mvpa2.misc.data_generators import normal_feature_dataset

    # Simulated data: 3 labels, 9 samples per label, 3 chunks, and 6
    # features of which the first 3 are non-bogus
    dataset = normal_feature_dataset(
        nlabels=3,
        snr=1,  # 100 would be pure signal! ;)
        perlabel=9,
        nfeatures=6,
        nonbogus_features=range(3),
        nchunks=3)

    svm = LinearCSVMC()
    # The same SVM provides the sensitivities and the cross-validated error
    rfe = RFE(
        svm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
        CrossValidation(svm,
                        NFoldPartitioner(),
                        errorfx=mean_mismatch_error,
                        postproc=mean_sample()),
        Repeater(2),
        fselector=FractionTailSelector(0.70, mode='select', tail='upper'),
        stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
        update_sensitivity=True)
    rfe_clf = FeatureSelectionClassifier(svm, rfe)
    rfe_sensana = rfe_clf.get_sensitivity_analyzer(
        postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and
    # classification
    fold_senses = []
    fold_errors = []
    for partitioned in NFoldPartitioner().generate(dataset):
        # split partitioned dataset
        parts = list(Splitter('partitions').generate(partitioned))
        training, testing = parts[0], parts[1]
        fold_senses.append(rfe_sensana(training))
        # computing the sensitivity should also have trained the
        # classifier, so we can ask it about the error right away
        predictions = rfe_clf.predict(testing)
        fold_errors.append(mean_mismatch_error(predictions, testing.targets))

    senses = vstack(fold_senses)
    errors = vstack(fold_errors)

    # Let's compare against rerunning the beast simply for classification
    # with CV
    cv = CrossValidation(rfe_clf,
                         NFoldPartitioner(),
                         errorfx=mean_mismatch_error)
    errors_cv = cv(dataset)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    repeated_sensana = RepeatedMeasure(rfe_sensana, NFoldPartitioner())
    senses_rm = repeated_sensana(dataset)

    # the sensitivities are expected to differ (hence the known failure)
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: https://github.com/PyMVPA/PyMVPA/issues/117")
def test_rfe(self, clf):
    """Run a few RFE recipes with `clf` and sanity-check the outcome.

    For every recipe the RFE is trained and applied to its dataset; then
    the number of selected features, and the recorded errors, feature
    counts and history are checked for consistency.
    """
    # sensitivity analyzer and transfer error quantifier use the SAME clf!
    sensitivity = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    transfer_error = ProxyMeasure(
        clf, postproc=BinaryFxNode(mean_mismatch_error, 'targets'))
    cv_error = CrossValidation(clf,
                               NFoldPartitioner(),
                               errorfx=mean_mismatch_error,
                               postproc=mean_sample())
    split_clf = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    recipes = [
        # because the clf is already trained when computing the
        # sensitivity map, prevent retraining for transfer error
        # calculation.
        # Use absolute of the svm weights as sensitivity
        (RFE(sensitivity,
             transfer_error,
             Splitter('train'),
             fselector=FixedNElementTailSelector(1),
             train_pmeasure=False),
         self.get_data()),

        # use cross-validation within training to get error for the
        # stopping point but use full training data to derive sensitivity
        (RFE(sensitivity,
             cv_error,
             # give the same full dataset to sensitivity and cv_error
             Repeater(2),
             fselector=FractionTailSelector(
                 0.70, mode='select', tail='upper'),
             train_pmeasure=True),
         normal_feature_dataset(perlabel=20,
                                nchunks=5,
                                nfeatures=200,
                                nonbogus_features=[0, 1],
                                snr=1.5)),

        # use cross-validation (via SplitClassifier) and get mean
        # of normed sensitivities across those splits
        (RFE(split_clf.get_sensitivity_analyzer(
                 postproc=ChainMapper([
                     FxMapper('features', l2_normed),
                     FxMapper('samples', np.mean),
                     FxMapper('samples', np.abs)
                 ])),
             ConfusionBasedError(split_clf, confusion_state='stats'),
             # we will use the same full cv-training dataset
             Repeater(2),
             fselector=FractionTailSelector(
                 0.50, mode='select', tail='upper'),
             stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
             # we just extract it from existing confusion
             train_pmeasure=False,
             update_sensitivity=True),
         normal_feature_dataset(perlabel=28,
                                nchunks=7,
                                nfeatures=200,
                                nonbogus_features=[0, 1],
                                snr=1.5)),
    ]

    for rfe, data in recipes:
        # prep data
        # data = datasets['uni2medium']
        nfeatures_before = data.nfeatures

        rfe.train(data)
        selected = rfe(data)

        # fail if orig datasets are changed
        self.assertEqual(data.nfeatures, nfeatures_before)

        # check that the features set with the least error is selected
        if not len(rfe.ca.errors):
            # no errors recorded -- all features are expected to be kept
            self.assertEqual(selected.nfeatures, nfeatures_before)
        elif isinstance(rfe._fselector, FixedNElementTailSelector):
            errors = np.array(rfe.ca.errors)
            self.assertEqual(selected.nfeatures,
                             nfeatures_before - errors.argmin())
        else:
            errors = np.array(rfe.ca.errors)
            best_step = np.argmin(errors)
            if 'does_feature_selection' in clf.__tags__:
                # if clf is smart it might figure it out right away
                assert_array_less(best_step, len(errors))
            else:
                # in this case we can even check if we had actual
                # going down/up trend... although -- why up???
                self.assertTrue(1 < best_step < len(errors) - 1)

        # silly check if nfeatures is in decreasing order
        ascending = np.sort(np.array(rfe.ca.nfeatures))
        self.assertTrue((ascending[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        recorded_steps = set(rfe.ca.history)
        expected_steps = set(range(len(np.array(rfe.ca.errors))))
        self.assertEqual(recorded_steps, expected_steps)

        # The last step (the largest number in history) can be present
        # multiple times even if we remove 1 feature at a time -- just
        # need to stop well in advance when we have more than 1 feature
        # left ;)
        last_nfeatures = rfe.ca.nfeatures[-1]
        in_last_step = rfe.ca.history == max(rfe.ca.history)
        self.assertEqual(last_nfeatures, len(np.where(in_last_step)[0]))