def test_feature_selection_pipeline(self):
    """Chain two sensitivity-based selections and verify resulting shapes.

    Uses SillySensitivityAnalyzer so the expected counts are deterministic:
    first drop 25% of features, then a fixed 4 more.
    """
    sens_ana = SillySensitivityAnalyzer()
    data = self.get_data()
    data_nfeatures = data.nfeatures

    # test silly one first ;-)
    self.assertEqual(
        sens_ana(data).samples[0, 0], -int(data_nfeatures / 2))

    # OLD: first remove 25% == 6, and then 4, total removing 10
    # NOW: test should be independent of the numerical number of features
    feature_selections = [
        SensitivityBasedFeatureSelection(
            sens_ana,
            FractionTailSelector(0.25)),
        SensitivityBasedFeatureSelection(
            sens_ana,
            FixedNElementTailSelector(4))
    ]

    # create a FeatureSelection pipeline
    feat_sel_pipeline = ChainMapper(feature_selections)

    feat_sel_pipeline.train(data)
    resds = feat_sel_pipeline(data)

    self.assertEqual(len(feat_sel_pipeline), len(feature_selections),
                     msg="Test the property feature_selections")

    # after stage 1: 75% kept; after stage 2: 4 fewer than that
    desired_nfeatures = int(np.ceil(data_nfeatures * 0.75))
    self.assertEqual([fe._oshape[0] for fe in feat_sel_pipeline],
                     [desired_nfeatures, desired_nfeatures - 4])
def __init__(self, sensitivity_analyzer, feature_selector=FractionTailSelector(0.05), train_analyzer=True, **kwargs ): """Initialize feature selection Parameters ---------- sensitivity_analyzer : FeaturewiseMeasure sensitivity analyzer to come up with sensitivity feature_selector : Functor Given a sensitivity map it has to return the ids of those features that should be kept. train_analyzer : bool Flag whether to train the sensitivity analyzer on the input dataset during train(). If False, the employed sensitivity measure has to be already trained before. """ # base init first FeatureSelection.__init__(self, **kwargs) self.__sensitivity_analyzer = sensitivity_analyzer """Sensitivity analyzer to use once""" self.__feature_selector = feature_selector """Functor which takes care about removing some features.""" self.__train_analyzer = train_analyzer
def __test_matthias_question(self):
    """Disabled test (leading underscores): RFE classifier vs. null distribution.

    Builds an RFE-wrapped SVM, cross-validates it with a Monte-Carlo null
    distribution from 1000 target permutations, and checks both the error
    and the null probability.  NOTE(review): presumably disabled for its
    runtime (1000 permutations) -- confirm before re-enabling.
    """
    rfe_clf = LinearCSVMC(C=1)

    rfesvm_split = SplitClassifier(rfe_clf)
    clf = \
        FeatureSelectionClassifier(
            clf = LinearCSVMC(C=1),
            feature_selection = RFE(
                # sensitivities averaged over splits, magnitude only
                sensitivity_analyzer = rfesvm_split.get_sensitivity_analyzer(
                    combiner=first_axis_mean,
                    transformer=np.abs),
                # stopping error comes from the split classifier's confusion
                transfer_error=ConfusionBasedError(
                    rfesvm_split, confusion_state="confusion"),
                stopping_criterion=FixedErrorThresholdStopCrit(0.20),
                feature_selector=FractionTailSelector(
                    0.2, mode='discard', tail='lower'),
                update_sensitivity=True))

    no_permutations = 1000
    permutator = AttributePermutator('targets', count=no_permutations)

    # by default do 'right' tail: accuracy low or error high
    cv = CrossValidation(clf, NFoldPartitioner(),
                         null_dist=MCNullDist(permutator, tail='left'),
                         enable_ca=['stats'])
    error = cv(datasets['uni2small'])

    self.assertTrue(error < 0.4)
    self.assertTrue(cv.ca.null_prob < 0.05)
def test_split_samples_probability_mapper(self):
    """Shape/consistency test for SplitSamplesProbabilityMapper with ANOVA."""
    skip_if_no_external('scipy')
    nf = 10
    ns = 100
    nsubj = 5
    nchunks = 5
    data = np.random.normal(size=(ns, nf))
    # NOTE(review): 'subjects' uses bare '/' -- under Python 3 true
    # division this yields floats rather than ints; confirm downstream
    # label grouping tolerates float sample attributes
    ds = AttrDataset(data,
                     sa=dict(sidx=np.arange(ns),
                             targets=np.arange(ns) % nchunks,
                             chunks=np.floor(np.arange(ns) * nchunks / ns),
                             subjects=np.arange(ns) / (ns / nsubj / nchunks) % nsubj),
                     fa=dict(fidx=np.arange(nf)))
    analyzer = OneWayAnova()
    element_selector = FractionTailSelector(.4, mode='select', tail='upper')
    common = True
    m = SplitSamplesProbabilityMapper(analyzer, 'subjects',
                                      probability_label='fprob',
                                      select_common_features=common,
                                      selector=element_selector)
    m.train(ds)
    y = m(ds)
    # mapping the raw array must agree with mapping the dataset
    z = m(ds.samples)
    assert_array_equal(z, y.samples)
    # 40% of 10 features selected -> 4 features kept
    assert_equal(y.shape, (100, 4))
def __init__(self, sensitivity_analyzer, split_by_labels,
             select_common_features=True,
             probability_label=None,
             probability_combiner=None,
             selector=FractionTailSelector(0.05),
             **kwargs):
    '''
    Parameters
    ----------
    sensitivity_analyzer: FeaturewiseMeasure
        Sensitivity analyzer to come up with sensitivity.
    split_by_labels: str or list of str
        Sample labels on which input datasets are split before
        data is selected.
    select_common_features: bool
        True means that the same features are selected after the
        split.
    probability_label: None or str
        If None, then the output dataset ds from the
        sensitivity_analyzer is taken to select the samples.
        If not None it takes ds.sa['probability_label'].
        For example if sensitivity_analyzer=OneWayAnova then
        probability_label='fprob' is a sensible value.
    probability_combiner: function
        If select_common_features is True, then this function is
        applied to the feature scores across splits. If None,
        it uses lambda x: np.sum(-np.log(x)) which is sensible
        if the scores are probability values.
    selector: Selector
        Function that returns the indices to keep.
    '''
    SliceMapper.__init__(self, None, **kwargs)
    if probability_combiner is None:
        # default combiner: sum of -log(p), robust against NaNs
        def f(x):
            y = -np.log(x.ravel())
            # address potential NaNs
            # set to max value in y
            m = np.isnan(y)
            if np.all(m):
                return 0  # p=1
            y[m] = np.max(y[np.logical_not(m)])
            return np.sum(y)

        probability_combiner = f  # avoid lambda as h5py doesn't like it

    self._sensitivity_analyzer = sensitivity_analyzer
    self._split_by_labels = split_by_labels
    self._select_common_features = select_common_features
    self._probability_label = probability_label
    self._probability_combiner = probability_combiner
    self._selector = selector
def test_james_problem_multiclass(self):
    """Smoke test: CrossValidation must cope with an RFE-wrapped classifier
    on a multiclass (4-label) dataset.

    Fix: ``except Exception, e`` was Python-2-only syntax (a SyntaxError
    under Python 3); now uses ``except Exception as e`` consistently with
    test_james_problem.  Also dropped a stray trailing comma that turned
    the ``storage.append(...)`` statement into a discarded 1-tuple.
    """
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    #FxMapper('features', l2_normed),
                    #FxMapper('samples', np.mean),
                    #FxMapper('samples', np.abs)
                    # rank features by magnitude of their weights
                    FxMapper('features', lambda x: np.argsort(np.abs(x))),
                    #maxofabs_sample()
                    mean_sample()
                ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        """Callback collecting RFE history/errors from each CV fold."""
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e, ))
def test_SplitRFE(self):
    # just a smoke test ATM
    """Smoke test for SplitRFE wrapped in a MappedClassifier.

    Checks that training shrinks the feature set, that at least one
    nonbogus feature survives, and that repr() is stable and complete.
    """
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.misc.data_generators import normal_feature_dataset
    #import mvpa2.featsel.rfe
    #reload(mvpa2.featsel.rfe)
    from mvpa2.featsel.rfe import RFE, SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    from mvpa2.testing import ok_, assert_equal

    clf = LinearCSVMC(C=1)
    dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=30,
                                     snr=1., nonbogus_features=[1, 5])
    # flip one of the meaningful features around to see
    # if we are still getting proper selection
    dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
    # 4 partitions should be enough for testing
    partitioner = NFoldPartitioner(count=4)

    rfeclf = MappedClassifier(
        clf, SplitRFE(clf,
                      partitioner,
                      fselector=FractionTailSelector(0.2,
                                                     mode='discard',
                                                     tail='lower')))
    # capture repr before training to compare afterwards
    r0 = repr(rfeclf)

    ok_(rfeclf.mapper.nfeatures_min == 0)
    rfeclf.train(dataset)
    ok_(rfeclf.mapper.nfeatures_min > 0)
    predictions = rfeclf(dataset).samples

    # at least 1 of the nonbogus-features should be chosen
    ok_(len(set(dataset.a.nonbogus_features).intersection(
        rfeclf.mapper.slicearg)) > 0)

    # check repr to have all needed pieces
    r = repr(rfeclf)
    s = str(rfeclf)
    ok_(('partitioner=NFoldP' in r) or
        ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
    ok_('lrn=' in r)
    ok_(not 'slicearg=' in r)
    assert_equal(r, r0)
def test_james_problem(self):
    """CrossValidation over an RFE-wrapped classifier must run and collect
    per-fold RFE history/errors via the callback.

    Fix: dropped the stray trailing comma after ``storage.append(...)``
    which turned the statement into a discarded 1-tuple expression.
    """
    percent = 80
    dataset = datasets['uni2small']

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0,
                mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        """Callback collecting RFE history/errors from each CV fold."""
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['confusion'])  # TODO -- it is stats
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e, ))

    # one (history, errors) pair per chunk/fold
    assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
    assert (len(cv_storage.storage[0]) == 2)
    assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

    self.assertTrue(error < 0.2)
def test_mapped_classifier_sensitivity_analyzer(self, clf): """Test sensitivity of the mapped classifier """ # Assuming many defaults it is as simple as mclf = FeatureSelectionClassifier( clf, SensitivityBasedFeatureSelection( OneWayAnova(), FractionTailSelector(0.5, mode='select', tail='upper')), enable_ca=['training_stats']) sana = mclf.get_sensitivity_analyzer(postproc=sumofabs_sample(), enable_ca=["sensitivities"]) # and lets look at all sensitivities dataset = datasets['uni2small'] # and we get sensitivity analyzer which works on splits sens = sana(dataset) self.assertEqual(sens.shape, (1, dataset.nfeatures))
def __init__(self,
             fmeasure,
             pmeasure,
             splitter,
             fselector=FractionTailSelector(0.05),
             update_sensitivity=True,
             nfeatures_min=0,
             **kwargs):
    # XXX Allow for multiple stopping criterions, e.g. error not decreasing
    # anymore OR number of features less than threshold
    """Initialize recursive feature elimination

    Parameters
    ----------
    fmeasure : FeaturewiseMeasure
    pmeasure : Measure
      used to compute the transfer error of a classifier based on a
      certain feature set on the test dataset.
      NOTE: If sensitivity analyzer is based on the same classifier as
      transfer_error is using, make sure you initialize transfer_error
      with train=False, otherwise it would train classifier twice
      without any necessity.
    splitter: Splitter
      This splitter instance has to generate at least two dataset splits
      when called with the input dataset. The first split serves as the
      training dataset and the second as the evaluation dataset.
    fselector : Functor
      Given a sensitivity map it has to return the ids of those
      features that should be kept.
    update_sensitivity : bool
      If False the sensitivity map is only computed once and reused
      for each iteration. Otherwise the sensitivities are
      recomputed at each selection step.
    nfeatures_min : int
      Number of features for RFE to stop if reached.
    """
    # bases init first
    IterativeFeatureSelection.__init__(self, fmeasure, pmeasure, splitter,
                                       fselector, **kwargs)

    self.__update_sensitivity = update_sensitivity
    """Flag whether sensitivity map is recomputed for each step."""

    self._nfeatures_min = nfeatures_min
def test_union_feature_selection(self):
    # two methods: 5% highes F-scores, non-zero SMLR weights
    """Combine ANOVA-top-5% and nonzero-SMLR selections by union and
    by intersection, and sanity-check the resulting feature counts."""
    fss = [SensitivityBasedFeatureSelection(
                OneWayAnova(),
                FractionTailSelector(0.05, mode='select', tail='upper')),
           SensitivityBasedFeatureSelection(
                SMLRWeights(SMLR(lm=1, implementation="C"),
                            postproc=sumofabs_sample()),
                RangeElementSelector(mode='select'))]

    fs = CombinedFeatureSelection(fss, method='union')

    od_union = fs(self.dataset)

    self.assertTrue(fs.method == 'union')
    # check output dataset
    self.assertTrue(od_union.nfeatures <= self.dataset.nfeatures)

    # again for intersection
    fs = CombinedFeatureSelection(fss, method='intersection')
    od_intersect = fs(self.dataset)
    # intersection cannot select more than the union did
    assert_true(od_intersect.nfeatures < od_union.nfeatures)
def __test_fspipeline_with_split_classifier(self, basic_clf):
    """Disabled test (leading underscores): RFE driven by a split classifier
    with a two-stage feature-selector pipeline.

    NOTE(review): uses keyword names (sensitivity_analyzer, transfer_error,
    feature_selector) and FeatureSelectionPipeline not seen in the current
    RFE signature elsewhere in this file, and references ``trans_error``
    which is not defined in this block -- presumably stale; confirm
    before re-enabling.
    """
    #basic_clf = LinearNuSVMC()
    multi_clf = MulticlassClassifier(clf=basic_clf)
    #svm_weigths = LinearSVMWeights(svm)

    # Proper RFE: aggregate sensitivities across multiple splits,
    # but also due to multi class those need to be aggregated
    # somehow. Transfer error here should be 'leave-1-out' error
    # of split classifier itself
    sclf = SplitClassifier(clf=basic_clf)
    rfe = RFE(sensitivity_analyzer=sclf.get_sensitivity_analyzer(
                  enable_ca=["sensitivities"]),
              transfer_error=trans_error,
              feature_selector=FeatureSelectionPipeline([
                  FractionTailSelector(0.5),
                  FixedNElementTailSelector(1)
              ]),
              train_pmeasure=True)

    # and we get sensitivity analyzer which works on splits and uses
    # sensitivity
    selected_features = rfe(self.dataset)
def test_rfe(self, clf):
    """Exercise several RFE configurations and validate their conditional
    attributes (errors, nfeatures, history) and selection behavior."""
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf,
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore few recipes
    for rfe, data in [
            # because the clf is already trained when computing the sensitivity
            # map, prevent retraining for transfer error calculation
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the stopping point
            # but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 Repeater(2),  # give the same full dataset to sens_ana and cvmeasure
                 fselector=FractionTailSelector(0.70,
                                                mode='select', tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                     postproc=ChainMapper([FxMapper('features', l2_normed),
                                           FxMapper('samples', np.mean),
                                           FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 Repeater(2),  # we will use the same full cv-training dataset
                 fselector=FractionTailSelector(0.50,
                                                mode='select', tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 train_pmeasure=False,  # we just extract it from existing confusion
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the features set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))
        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history
                                        == max(rfe.ca.history))[0]))
def test_feature_selector(self):
    """Test feature selector"""
    # remove 10% weekest
    selector = FractionTailSelector(0.1)
    data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
    # == rank [4, 5, 6, 7, 0, 3, 2, 9, 1, 8]
    target10 = np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])
    target30 = np.array([0, 1, 2, 3, 7, 8, 9])

    # ndiscarded is a conditional attribute -- unset before first call
    self.assertRaises(UnknownStateError, selector.ca.__getattribute__,
                      'ndiscarded')
    self.assertTrue((selector(data) == target10).all())
    selector.felements = 0.30      # discard 30%
    self.assertTrue(selector.felements == 0.3)
    self.assertTrue((selector(data) == target30).all())
    self.assertTrue(selector.ca.ndiscarded == 3)  # se 3 were discarded

    selector = FixedNElementTailSelector(1)
    #                   0   1   2  3   4    5  6  7  8   9
    data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
    self.assertTrue((selector(data) == target10).all())

    selector.nelements = 3
    self.assertTrue(selector.nelements == 3)
    self.assertTrue((selector(data) == target30).all())
    self.assertTrue(selector.ca.ndiscarded == 3)

    # test range selector
    # simple range 'above'
    self.assertTrue((RangeElementSelector(lower=0)(data) == \
                     np.array([0,1,2,3,7,8,9])).all())

    self.assertTrue((RangeElementSelector(lower=0,
                                          inclusive=True)(data) == \
                     np.array([0,1,2,3,5,6,7,8,9])).all())

    self.assertTrue((RangeElementSelector(lower=0, mode='discard',
                                          inclusive=True)(data) == \
                     np.array([4])).all())

    # simple range 'below'
    self.assertTrue((RangeElementSelector(upper=2)(data) == \
                     np.array([4,5,6])).all())

    self.assertTrue((RangeElementSelector(upper=2,
                                          inclusive=True)(data) == \
                     np.array([4,5,6,7])).all())

    self.assertTrue((RangeElementSelector(upper=2, mode='discard',
                                          inclusive=True)(data) == \
                     np.array([0,1,2,3,8,9])).all())

    # ranges
    self.assertTrue((RangeElementSelector(lower=2, upper=9)(data) == \
                     np.array([0,2,3])).all())

    self.assertTrue((RangeElementSelector(lower=2, upper=9,
                                          inclusive=True)(data) == \
                     np.array([0,2,3,7,9])).all())

    # swapped bounds + discard must equal the straight select
    self.assertTrue((RangeElementSelector(upper=2, lower=9, mode='discard',
                                          inclusive=True)(data) ==
                     RangeElementSelector(lower=2, upper=9,
                                          inclusive=False)(data)).all())

    # non-0 elements -- should be equivalent to np.nonzero()[0]
    self.assertTrue((RangeElementSelector()(data) == \
                     np.nonzero(data)[0]).all())
clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( SMLRWeights(SMLR(lm=1.0, implementation="C"), postproc=maxofabs_sample()), RangeElementSelector(mode='select')), descr="kNN on SMLR(lm=1) non-0") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( OneWayAnova(), FractionTailSelector(0.05, mode='select', tail='upper')), descr="kNN on 5%(ANOVA)") clfswh += \ FeatureSelectionClassifier( kNN(), SensitivityBasedFeatureSelection( OneWayAnova(), FixedNElementTailSelector(50, mode='select', tail='upper')), descr="kNN on 50(ANOVA)") # GNB clfswh += GNB(descr="GNB()") clfswh += GNB(common_variance=True, descr="GNB(common_variance=True)") clfswh += GNB(prior='uniform', descr="GNB(prior='uniform')") clfswh += \
def test_SplitRFE(self, fmeasure):
    # just a smoke test ATM
    """Parameterized SplitRFE smoke test: repr stability, feature survival,
    parallel (joblib) equivalence, and nested-CA collection via callback."""
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.misc.data_generators import normal_feature_dataset
    #import mvpa2.featsel.rfe
    #reload(mvpa2.featsel.rfe)
    from mvpa2.featsel.rfe import RFE, SplitRFE
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.featsel.helpers import FractionTailSelector
    from mvpa2.testing import ok_, assert_equal

    clf = LinearCSVMC(C=1)
    dataset = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=11,
                                     snr=1., nonbogus_features=[1, 5])
    # flip one of the meaningful features around to see
    # if we are still getting proper selection
    dataset.samples[:, dataset.a.nonbogus_features[1]] *= -1
    # 3 partitions should be enough for testing
    partitioner = NFoldPartitioner(count=3)

    rfeclf = MappedClassifier(
        clf, SplitRFE(clf,
                      partitioner,
                      fselector=FractionTailSelector(0.5,
                                                     mode='discard',
                                                     tail='lower'),
                      fmeasure=fmeasure,
                      # need to update only when using clf's sens anal
                      update_sensitivity=fmeasure is None))
    # capture repr before training to compare afterwards
    r0 = repr(rfeclf)

    ok_(rfeclf.mapper.nfeatures_min == 0)
    rfeclf.train(dataset)
    ok_(rfeclf.mapper.nfeatures_min > 0)
    predictions = rfeclf(dataset).samples

    # at least 1 of the nonbogus-features should be chosen
    ok_(len(set(dataset.a.nonbogus_features).intersection(
        rfeclf.mapper.slicearg)) > 0)

    # check repr to have all needed pieces
    r = repr(rfeclf)
    s = str(rfeclf)
    ok_(('partitioner=NFoldP' in r) or
        ('partitioner=mvpa2.generators.partition.NFoldPartitioner' in r))
    ok_('lrn=' in r)
    ok_(not 'slicearg=' in r)
    assert_equal(r, r0)

    if externals.exists('joblib'):
        rfeclf.mapper.nproc = -1
        # compare results against the one ran in parallel
        _slicearg = rfeclf.mapper.slicearg
        _predictions = predictions
        rfeclf.train(dataset)
        predictions = rfeclf(dataset).samples
        assert_array_equal(predictions, _predictions)
        assert_array_equal(_slicearg, rfeclf.mapper.slicearg)

    # Test that we can collect stats from cas within cross-validation
    sensitivities = []
    nested_errors = []
    nested_nfeatures = []

    def store_me(data, node, result):
        # per-fold: sensitivity map plus the nested RFE trajectories
        sens = node.measure.get_sensitivity_analyzer(force_train=False)(data)
        sensitivities.append(sens)
        nested_errors.append(node.measure.mapper.ca.nested_errors)
        nested_nfeatures.append(node.measure.mapper.ca.nested_nfeatures)

    cv = CrossValidation(rfeclf, NFoldPartitioner(count=1),
                         callback=store_me, enable_ca=['stats'])
    _ = cv(dataset)
    # just to make sure we collected them
    assert_equal(len(sensitivities), 1)
    assert_equal(len(nested_errors), 1)
    assert_equal(len(nested_nfeatures), 1)
def test_rfe_sensmap():
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2013q3/002538.html
    # just a smoke test. fails with
    """Regression smoke test: RFE sensitivity maps under manual splitting
    agree with CV errors, while RepeatedMeasure is known to disagree
    (ends in SkipTest pointing at GH issue #117)."""
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import FeatureSelectionClassifier
    from mvpa2.measures.base import CrossValidation, RepeatedMeasure
    from mvpa2.generators.splitters import Splitter
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.misc.errorfx import mean_mismatch_error
    from mvpa2.mappers.fx import mean_sample
    from mvpa2.mappers.fx import maxofabs_sample
    from mvpa2.generators.base import Repeater
    from mvpa2.featsel.rfe import RFE
    from mvpa2.featsel.helpers import FractionTailSelector, BestDetector
    from mvpa2.featsel.helpers import NBackHistoryStopCrit
    from mvpa2.datasets import vstack

    from mvpa2.misc.data_generators import normal_feature_dataset

    # Let's simulate the beast -- 6 categories total groupped into 3
    # super-ordinate, and actually without any 'superordinate' effect
    # since subordinate categories independent
    fds = normal_feature_dataset(nlabels=3,
                                 snr=1,  # 100,  # pure signal! ;)
                                 perlabel=9,
                                 nfeatures=6,
                                 nonbogus_features=range(3),
                                 nchunks=3)
    clfsvm = LinearCSVMC()

    rfesvm = RFE(clfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample()),
                 CrossValidation(
                     clfsvm,
                     NFoldPartitioner(),
                     errorfx=mean_mismatch_error, postproc=mean_sample()),
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 update_sensitivity=True)

    fclfsvm = FeatureSelectionClassifier(clfsvm, rfesvm)

    sensanasvm = fclfsvm.get_sensitivity_analyzer(postproc=maxofabs_sample())

    # manually repeating/splitting so we do both RFE sensitivity and classification
    senses, errors = [], []
    for i, pset in enumerate(NFoldPartitioner().generate(fds)):
        # split partitioned dataset
        split = [d for d in Splitter('partitions').generate(pset)]
        senses.append(sensanasvm(split[0]))
        # and it also should train the classifier so we would ask it about error
        errors.append(mean_mismatch_error(fclfsvm.predict(split[1]),
                                          split[1].targets))

    senses = vstack(senses)
    errors = vstack(errors)

    # Let's compare against rerunning the beast simply for classification with CV
    errors_cv = CrossValidation(fclfsvm, NFoldPartitioner(),
                                errorfx=mean_mismatch_error)(fds)
    # and they should match
    assert_array_equal(errors, errors_cv)

    # buggy!
    cv_sensana_svm = RepeatedMeasure(sensanasvm, NFoldPartitioner())
    senses_rm = cv_sensana_svm(fds)

    #print senses.samples, senses_rm.samples
    #print errors, errors_cv.samples
    assert_raises(AssertionError,
                  assert_array_almost_equal,
                  senses.samples, senses_rm.samples)
    raise SkipTest("Known failure for repeated measures: "
                   "https://github.com/PyMVPA/PyMVPA/issues/117")
def test_feature_selector(self):
    """Test feature selector"""
    # NOTE(review): near-identical copy of another test_feature_selector
    # in this file -- presumably duplicated across test classes; confirm
    # remove 10% weekest
    selector = FractionTailSelector(0.1)
    data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
    # == rank [4, 5, 6, 7, 0, 3, 2, 9, 1, 8]
    target10 = np.array([0, 1, 2, 3, 5, 6, 7, 8, 9])
    target30 = np.array([0, 1, 2, 3, 7, 8, 9])

    # ndiscarded is a conditional attribute -- unset before first call
    self.assertRaises(UnknownStateError, selector.ca.__getattribute__,
                      'ndiscarded')
    self.assertTrue((selector(data) == target10).all())
    selector.felements = 0.30      # discard 30%
    self.assertTrue(selector.felements == 0.3)
    self.assertTrue((selector(data) == target30).all())
    self.assertTrue(selector.ca.ndiscarded == 3)  # se 3 were discarded

    selector = FixedNElementTailSelector(1)
    #                   0   1   2  3   4    5  6  7  8   9
    data = np.array([3.5, 10, 7, 5, -0.4, 0, 0, 2, 10, 9])
    self.assertTrue((selector(data) == target10).all())

    selector.nelements = 3
    self.assertTrue(selector.nelements == 3)
    self.assertTrue((selector(data) == target30).all())
    self.assertTrue(selector.ca.ndiscarded == 3)

    # test range selector
    # simple range 'above'
    self.assertTrue((RangeElementSelector(lower=0)(data) == \
                     np.array([0,1,2,3,7,8,9])).all())

    self.assertTrue((RangeElementSelector(lower=0,
                                          inclusive=True)(data) == \
                     np.array([0,1,2,3,5,6,7,8,9])).all())

    self.assertTrue((RangeElementSelector(lower=0, mode='discard',
                                          inclusive=True)(data) == \
                     np.array([4])).all())

    # simple range 'below'
    self.assertTrue((RangeElementSelector(upper=2)(data) == \
                     np.array([4,5,6])).all())

    self.assertTrue((RangeElementSelector(upper=2,
                                          inclusive=True)(data) == \
                     np.array([4,5,6,7])).all())

    self.assertTrue((RangeElementSelector(upper=2, mode='discard',
                                          inclusive=True)(data) == \
                     np.array([0,1,2,3,8,9])).all())

    # ranges
    self.assertTrue((RangeElementSelector(lower=2, upper=9)(data) == \
                     np.array([0,2,3])).all())

    self.assertTrue((RangeElementSelector(lower=2, upper=9,
                                          inclusive=True)(data) == \
                     np.array([0,2,3,7,9])).all())

    # swapped bounds + discard must equal the straight select
    self.assertTrue((RangeElementSelector(upper=2, lower=9, mode='discard',
                                          inclusive=True)(data) ==
                     RangeElementSelector(lower=2, upper=9,
                                          inclusive=False)(data)).all())

    # non-0 elements -- should be equivalent to np.nonzero()[0]
    self.assertTrue((RangeElementSelector()(data) == \
                     np.nonzero(data)[0]).all())
def custom_tail_selector(seq):
    """Select the upper 5% of scorers, excluding the extreme top 1%.

    Runs two FractionTailSelector passes over the same score sequence and
    returns the ids present in both results.
    """
    # ids surviving removal of the top 1%
    sans_extremes = FractionTailSelector(0.01, mode='discard',
                                         tail='upper')(seq)
    # ids in the top 5%
    top_tail = FractionTailSelector(0.05, mode='select',
                                    tail='upper')(seq)
    return list(set(sans_extremes) & set(top_tail))