def test_split_clf_on_chainpartitioner(self):
    """Smoke test (issue #156): SplitClassifier driven by a chained partitioner.

    Chains an NFoldPartitioner with a Balancer, trains a SplitClassifier on
    the resulting partitions and verifies that the number of splits and of
    per-split classifiers matches the number of generated partitions.  The
    sensitivity analyzer is then run on the untrained classifier and its
    output is required to vary across splits.
    """
    dataset = datasets['uni2small']
    nfold = NFoldPartitioner(cvtype=1)
    balancer = Balancer(attr='targets',
                        count=2,
                        limit='partitions',
                        apply_selection=True)
    chain = ChainNode([nfold, balancer])
    expected_nsplits = len(list(chain.generate(dataset)))

    split_clf = SplitClassifier(sample_clf_lin, chain,
                                enable_ca=['stats', 'splits'])
    split_clf.train(dataset)
    predictions = split_clf.predict(dataset)

    # rudimentary check: one prediction per sample
    assert_equal(len(predictions), len(dataset))
    assert_equal(len(split_clf.ca.splits), expected_nsplits)
    assert_equal(len(split_clf.clfs), expected_nsplits)

    # now exercise the sensitivity analyzer as well, starting untrained
    split_clf.untrain()
    analyzer = split_clf.get_sensitivity_analyzer()
    sensitivities = analyzer(dataset)

    # basic check that sensitivities varied across splits
    from mvpa2.mappers.fx import FxMapper
    std_per_target = FxMapper('samples', np.std, uattrs=['targets'])
    spread = std_per_target(sensitivities)
    assert_true(np.any(spread != 0))
def run(args):
    """Load the requested dataset and emit the selected report.

    The dataset is assembled from ``args.data``.  When ``args.numpy_xfm`` is
    given it is unpacked as ``(fx, axis)`` and applied through an
    ``FxMapper`` before reporting.  The reporting function is looked up in
    ``info_fx`` under the key ``args.report`` and called with the dataset
    and ``args``.
    """
    ds = arg2ds(args.data)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)

    xfm_spec = args.numpy_xfm
    if xfm_spec is not None:
        from mvpa2.mappers.fx import FxMapper
        fx, axis = xfm_spec
        ds = ds.get_mapped(FxMapper(axis, fx))

    reporter = info_fx[args.report]
    reporter(ds, args)
def test_james_problem_multiclass(self):
    """Cross-validate a classifier that does RFE feature selection (multiclass).

    Builds an RFE feature selection around a LinearCSVMC sensitivity
    analyzer, wraps it in a FeatureSelectionClassifier and runs it through
    CrossValidation on the 'uni4large' dataset.  The test fails if the
    cross-validation raises any exception.
    """
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = RFE(
        rfesvm_split.get_sensitivity_analyzer(
            postproc=ChainMapper([
                #FxMapper('features', l2_normed),
                #FxMapper('samples', np.mean),
                #FxMapper('samples', np.abs)
                FxMapper('features', lambda x: np.argsort(np.abs(x))),
                #maxofabs_sample()
                mean_sample()
            ])),
        ProxyMeasure(rfesvm_split,
                     postproc=BinaryFxNode(mean_mismatch_error, 'targets')),
        Splitter('train'),
        fselector=FractionTailSelector(
            percent / 100.0, mode='select', tail='upper'),
        # update sensitivity at each step (since we're not using the
        # same CLF as sensitivity analyzer)
        update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)

    class StoreResults(object):
        """Callback collecting the RFE history and errors of every fold."""

        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            rfe_ca = node.measure.mapper.ca
            self.storage.append((rfe_ca.history, rfe_ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(),
                         postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        # only the absence of an exception is checked here
        cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e, ))
def run(args):
    """Turn a dataset into an event-related dataset and store it as HDF5.

    Events are taken from the first of these sources that is set on
    ``args``: ``event_attrs`` (events detected from sample attributes),
    ``csv_events`` (a CSV table read from a file, or from stdin when the
    value is ``'-'``), ``onsets`` (explicit onsets, read from stdin when the
    list is empty), or ``fsl_ev3`` (FSL EV3 files).

    Returns
    -------
    The event-related dataset, after the common dataset attribute options
    were applied and it was written to ``args.output``.

    Raises
    ------
    ValueError
      If the CSV table has no columns, if no events could be determined, or
      if ``args.event_compression`` is not one of the supported modes.
    """
    ds = arg2ds(args.data)
    verbose(3, 'Concatenation yielded %i samples with %i features' % ds.shape)

    # build list of events
    events = []
    if args.event_attrs is not None:
        def_attrs = dict([(k, ds.sa[k].value) for k in args.event_attrs])
        events = find_events(**def_attrs)
    elif args.csv_events is not None:
        if args.csv_events == '-':
            csv_text = sys.stdin.read()
            import cStringIO
            csvt = _load_csv_table(cStringIO.StringIO(csv_text))
        else:
            csv_file = open(args.csv_events, 'rU')
            try:
                # NOTE(review): assumes the table is fully read before
                # _load_csv_table returns -- confirm against the helper
                csvt = _load_csv_table(csv_file)
            finally:
                # do not leak the file handle, even if parsing fails
                csv_file.close()
        if not len(csvt):
            raise ValueError("no CSV columns found")
        if args.onset_column:
            csvt['onset'] = csvt[args.onset_column]
        # every column contributes one value per event
        nevents = len(csvt[csvt.keys()[0]])
        events = []
        for ev in xrange(nevents):
            events.append(dict([(k, v[ev]) for k, v in csvt.iteritems()]))
    elif args.onsets is not None:
        if not len(args.onsets):
            args.onsets = [i for i in sys.stdin]
        # time or sample-based?
        if args.time_attr is None:
            oconv = int
        else:
            oconv = float
        events = [{'onset': oconv(o)} for o in args.onsets]
    elif args.fsl_ev3 is not None:
        from mvpa2.misc.fsl import FslEV3
        events = []
        for evsrc in args.fsl_ev3:
            events.extend(FslEV3(evsrc).to_events())

    if not len(events):
        raise ValueError("no events defined")
    verbose(2, 'Extracting %i events' % len(events))

    # map the compression mode onto the function used to collapse an event
    compression_fx = {
        'mean': np.mean,
        'median': np.median,
        'min': np.min,
        'max': np.max,
    }
    if args.event_compression is None:
        evmap = None
    elif args.event_compression in compression_fx:
        evmap = FxMapper('features',
                         compression_fx[args.event_compression],
                         attrfx=merge2first)
    else:
        # previously this fell through and failed later with a NameError
        raise ValueError("unknown event compression mode %r (known: %s)"
                         % (args.event_compression,
                            ', '.join(sorted(compression_fx))))

    # convert to event-related ds
    evds = eventrelated_dataset(ds, events,
                                time_attr=args.time_attr,
                                match=args.match_strategy,
                                event_offset=args.offset,
                                event_duration=args.duration,
                                event_mapper=evmap)
    # act on all attribute options
    evds = process_common_dsattr_opts(evds, args)
    # and store
    ds2hdf5(evds, args.output, compression=args.hdf5_compression)
    return evds
def test_analyzer_with_split_classifier(self, clfds):
    """Test sensitivity analyzers obtained from a SplitClassifier.

    Parameters
    ----------
    clfds : tuple
      Pair ``(clf, ds)`` of the classifier under test and the dataset to
      run it on.

    The test wraps ``clf`` into a 3-fold SplitClassifier, computes
    sensitivities and checks their shape, the attached sample/feature
    attributes and the targets they are labeled with.  Unless the
    classifier is exempted, it then checks that the most sensitive
    features are the non-bogus ones, overall and per label (or per pair
    of labels).
    """
    clf, ds = clfds  # unroll the tuple
    # We need to skip some LARSes here
    _sclf = str(clf)
    if 'LARS(' in _sclf and "type='stepwise'" in _sclf:
        # ADD KnownToFail thingie from NiPy
        return

    # To not waste too much time testing, limit to 3 splits
    nsplits = 3
    partitioner = NFoldPartitioner(count=nsplits)
    mclf = SplitClassifier(clf=clf,
                           partitioner=partitioner,
                           enable_ca=['training_stats', 'stats'])
    sana = mclf.get_sensitivity_analyzer(
        # postproc=absolute_features(),
        pass_attr=['fa.nonbogus_targets'],
        enable_ca=["sensitivities"])

    ulabels = ds.uniquetargets
    nlabels = len(ulabels)
    # Can't rely on splitcfg since count-limit is done in __call__
    assert (nsplits == len(list(partitioner.generate(ds))))
    sens = sana(ds)
    assert ('nonbogus_targets' in sens.fa)  # were they passed?
    # TODO: those few do not expose biases
    if not len(set(clf.__tags__).intersection(('lars', 'glmnet', 'gpr'))):
        assert ('biases' in sens.sa)
        # print sens.sa.biases

    # Collect every acceptable number of sensitivity samples.
    # It should return either ...
    #  nlabels * nsplits
    req_nsamples = [nlabels * nsplits]
    if nlabels == 2:
        # A single sensitivity in case of binary
        req_nsamples += [nsplits]
    else:
        # and for pairs in case of multiclass
        req_nsamples += [(nlabels * (nlabels - 1) / 2) * nsplits]
        # and for 1-vs-1 embedded within Multiclass operating on
        # pairs (e.g. SMLR)
        req_nsamples += [req_nsamples[-1] * 2]

    # Also for regression_based -- they can do multiclass
    # but only 1 sensitivity is provided
    if 'regression_based' in clf.__tags__:
        req_nsamples += [nsplits]

    # number of features should correspond
    self.assertEqual(sens.shape[1], ds.nfeatures)
    # number of samples/sensitivities should also be reasonable
    self.assertTrue(sens.shape[0] in req_nsamples)

    # Check if labels are present
    self.assertTrue('splits' in sens.sa)
    self.assertTrue('targets' in sens.sa)
    # should be 1D -- otherwise dtype object
    self.assertTrue(sens.sa.targets.ndim == 1)

    sens_ulabels = sens.sa['targets'].unique
    # Some labels might be pairs (tuples) so ndarray would be of
    # dtype object and we would need to get them all
    if sens_ulabels.dtype is np.dtype('object'):
        sens_ulabels = np.unique(
            reduce(lambda x, y: x + y, [list(x) for x in sens_ulabels]))

    assert_array_equal(sens_ulabels, ds.sa['targets'].unique)

    # NOTE(review): `errors` is not used anywhere below in this test
    errors = [x.percent_correct for x in sana.clf.ca.stats.matrices]

    # lets go through all sensitivities and see if we selected the right
    # features
    #if 'meta' in clf.__tags__ and len(sens.samples[0].nonzero()[0])<2:
    if '5%' in clf.descr \
            or (nlabels > 2 and 'regression_based' in clf.__tags__):
        # Some meta classifiers (5% of ANOVA) are too harsh ;-)
        # if we get less than 2 features with non-zero sensitivities we
        # cannot really test
        # Also -- regression based classifiers performance for multiclass
        # is expected to suck in general
        return

    # Performance checks only run when labile tests are enabled in the config
    if cfg.getboolean('tests', 'labile', default='yes'):
        for conf_matrix in [sana.clf.ca.training_stats] \
                + sana.clf.ca.stats.matrices:
            self.assertTrue(
                conf_matrix.percent_correct>=70,
                msg="We must have trained on each one more or " \
                "less correctly. Got %f%% correct on %d labels" %
                (conf_matrix.percent_correct, nlabels))

    # Since now we have per split and possibly per label -- lets just
    # aggregate (np.sum) per each feature per label across splits
    sensm = FxMapper('samples', lambda x: np.sum(x),
                     uattrs=['targets']).forward(sens)
    sensgm = maxofabs_sample().forward(sensm)  # global max of abs of means

    assert_equal(sensgm.shape[0], 1)
    assert_equal(sensgm.shape[1], ds.nfeatures)

    selected = FixedNElementTailSelector(len(ds.a.bogus_features))(
        sensgm.samples[0])

    if cfg.getboolean('tests', 'labile', default='yes'):

        self.assertEqual(
            set(selected), set(ds.a.nonbogus_features),
            msg="At the end we should have selected the right features. "
            "Chose %s whenever nonbogus are %s" %
            (selected, ds.a.nonbogus_features))

        # Now test each one per label
        # TODO: collect all failures and spit them out at once --
        #       that would make it easy to see if the sensitivity
        #       just has incorrect order of labels assigned
        for sens1 in sensm:
            labels1 = sens1.targets  # labels (1) for this sensitivity
            lndim = labels1.ndim
            label = labels1[0]  # current label

            # XXX whole lndim comparison should be gone after
            #     things get fixed and we arrive here with a tuple!
            if lndim == 1:  # just a single label
                self.assertTrue(label in ulabels)

                ilabel_all = np.where(ds.fa.nonbogus_targets == label)[0]
                # should have just 1 feature for the label
                self.assertEqual(len(ilabel_all), 1)
                ilabel = ilabel_all[0]

                maxsensi = np.argmax(sens1)  # index of max sensitivity
                self.assertEqual(
                    maxsensi, ilabel,
                    "Maximal sensitivity for %s was found in %i whenever"
                    " original feature was %i for nonbogus features %s" %
                    (labels1, maxsensi, ilabel, ds.a.nonbogus_features))
            elif lndim == 2 and labels1.shape[1] == 2:  # pair of labels
                # we should have highest (in abs) coefficients in
                # those two labels
                maxsensi2 = np.argsort(np.abs(sens1))[0][-2:]
                ilabel2 = [
                    np.where(ds.fa.nonbogus_targets == l)[0][0]
                    for l in label
                ]
                self.assertEqual(
                    set(maxsensi2), set(ilabel2),
                    "Maximal sensitivity for %s was found in %s whenever"
                    " original features were %s for nonbogus features %s" %
                    (labels1, maxsensi2, ilabel2, ds.a.nonbogus_features))
                """
                # Now test for the sign of each one in pair ;) in
                # all binary problems L1 (-1) -> L2(+1), then
                # weights for L2 should be positive. to test for
                # L1 -- invert the sign
                # We already know (if we haven't failed in previous test),
                # that those 2 were the strongest -- so check only signs
                """
                self.assertTrue(
                    sens1.samples[0, ilabel2[0]] < 0,
                    "With %i classes in pair %s got feature %i for %r >= 0"
                    % (nlabels, label, ilabel2[0], label[0]))
                self.assertTrue(
                    sens1.samples[0, ilabel2[1]] > 0,
                    "With %i classes in pair %s got feature %i for %r <= 0"
                    % (nlabels, label, ilabel2[1], label[1]))
            else:
                # yoh could be wrong at this assumption... time will show
                self.fail("Got unknown number labels per sensitivity: %s."
                          " Should be either a single label or a pair" %
                          labels1)
def test_rfe(self, clf):
    """Run several RFE recipes with ``clf`` and sanity-check their bookkeeping.

    For each recipe the RFE is trained and applied to its dataset.  The
    test then verifies that the input dataset is left untouched, that the
    selected feature set is consistent with the recorded errors, that the
    recorded feature counts decrease monotonically, and that the history
    contains an entry for every step.
    """
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(
        clf, postproc=BinaryFxNode(mean_mismatch_error, 'targets'))
    cvmeasure = CrossValidation(clf,
                                NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())
    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # Recipe 1: because the clf is already trained when computing the
    # sensitivity map, prevent retraining for transfer error calculation.
    # Use absolute of the svm weights as sensitivity
    recipe_fixed = (
        RFE(sens_ana,
            pmeasure,
            Splitter('train'),
            fselector=FixedNElementTailSelector(1),
            train_pmeasure=False),
        self.get_data())

    # Recipe 2: use cross-validation within training to get error for the
    # stopping point but use full training data to derive sensitivity
    recipe_cv = (
        RFE(sens_ana,
            cvmeasure,
            # give the same full dataset to sens_ana and cvmeasure
            Repeater(2),
            fselector=FractionTailSelector(0.70,
                                           mode='select',
                                           tail='upper'),
            train_pmeasure=True),
        normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                               nonbogus_features=[0, 1], snr=1.5))

    # Recipe 3: use cross-validation (via SplitClassifier) and get mean
    # of normed sensitivities across those splits
    split_postproc = ChainMapper([
        FxMapper('features', l2_normed),
        FxMapper('samples', np.mean),
        FxMapper('samples', np.abs)
    ])
    recipe_split = (
        RFE(rfesvm_split.get_sensitivity_analyzer(postproc=split_postproc),
            ConfusionBasedError(rfesvm_split, confusion_state='stats'),
            # we will use the same full cv-training dataset
            Repeater(2),
            fselector=FractionTailSelector(0.50,
                                           mode='select',
                                           tail='upper'),
            stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
            # we just extract it from existing confusion
            train_pmeasure=False,
            update_sensitivity=True),
        normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                               nonbogus_features=[0, 1], snr=1.5))

    # explore few recipes
    for rfe, data in [recipe_fixed, recipe_cv, recipe_split]:
        # remember the feature count to detect in-place modification
        nfeatures_before = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == nfeatures_before)

        # check that the features set with the least error is selected
        if not len(rfe.ca.errors):
            self.assertTrue(resds.nfeatures == nfeatures_before)
        else:
            errs = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(
                    resds.nfeatures == nfeatures_before - errs.argmin())
            else:
                best_step = np.argmin(errs)
                if 'does_feature_selection' not in clf.__tags__:
                    # in this case we can even check if we had actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < best_step < len(errs) - 1)
                else:
                    # if clf is smart it might figure it out right away
                    assert_array_less(best_step, len(errs))

        # silly check if nfeatures is in decreasing order
        descending = np.sort(np.array(rfe.ca.nfeatures))[::-1]
        self.assertTrue((descending == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        all_steps = set(range(len(np.array(rfe.ca.errors))))
        self.assertTrue(set(rfe.ca.history) == all_steps)

        # Last (the largest number) can be present multiple times even
        # if we remove 1 feature at a time -- just need to stop well
        # in advance when we have more than 1 feature left ;)
        survivors = np.where(rfe.ca.history == max(rfe.ca.history))[0]
        self.assertTrue(rfe.ca.nfeatures[-1] == len(survivors))
def test_erdataset():
    """Exercise ``eventrelated_dataset`` on a small synthetic dataset.

    Covers event detection from targets/chunks, the default boxcar+flatten
    mapping, an alternative compressing event mapper, time-based event
    definitions (default and 'closest' matching) and reverse mapping of
    results back into the original sample space.
    """
    # 3 chunks, 5 targets, blocks of 5 samples each
    nchunks, ntargets, blocklength = 3, 5, 5
    nfeatures = 10
    targets = np.tile(np.repeat(range(ntargets), blocklength), nchunks)
    chunks = np.repeat(np.arange(nchunks), ntargets * blocklength)
    samples = np.repeat(np.arange(nchunks * ntargets * blocklength),
                        nfeatures).reshape(-1, nfeatures)
    ds = dataset_wizard(samples, targets=targets, chunks=chunks)

    # check if events are determined properly
    evs = find_events(targets=ds.sa.targets, chunks=ds.sa.chunks)
    for event in evs:
        assert_equal(event['duration'], blocklength)
    assert_equal(ntargets * nchunks, len(evs))
    for target in range(ntargets):
        matching = [event for event in evs if event['targets'] == target]
        assert_equal(len(matching), nchunks)

    # now turn `ds` into an event-related dataset
    erds = eventrelated_dataset(ds, evs)
    # the only unprefixed sample attributes are
    unprefixed = sorted([a for a in ds.sa if not a.startswith('event')])
    assert_equal(unprefixed, ['chunks', 'targets'])
    # samples as expected?
    first_event = np.repeat(np.arange(blocklength), nfeatures)
    assert_array_equal(erds.samples[0], first_event)
    # that should also be the temporal feature offset
    assert_array_equal(erds.samples[0], erds.fa.event_offsetidx)
    assert_array_equal(erds.sa.event_onsetidx, np.arange(0, 71, 5))
    # finally we should see two mappers
    assert_equal(len(erds.a.mapper), 2)
    assert_true(isinstance(erds.a.mapper[0], BoxcarMapper))
    assert_true(isinstance(erds.a.mapper[1], FlattenMapper))

    # check alternative event mapper
    # this one does temporal compression by averaging
    averaging = FxMapper('features', np.mean)
    erds_compress = eventrelated_dataset(ds, evs, event_mapper=averaging)
    assert_equal(len(erds), len(erds_compress))
    assert_array_equal(erds_compress.samples[:, 0], np.arange(2, 73, 5))

    #
    # now check the same dataset with event discretization
    tr = 2.5
    ds.sa['time'] = np.arange(nchunks * ntargets * blocklength) * tr
    evs = [{'onset': 4.9, 'duration': 6.2}]
    onset = evs[0]['onset']
    duration = evs[0]['duration']
    # doesn't work without conversion
    assert_raises(ValueError, eventrelated_dataset, ds, evs)
    erds = eventrelated_dataset(ds, evs, time_attr='time')
    assert_equal(len(erds), 1)
    assert_array_equal(erds.samples[0],
                       np.repeat(np.arange(1, 5), nfeatures))
    assert_array_equal(erds.sa.orig_onset, [onset])
    assert_array_equal(erds.sa.orig_duration, [duration])
    assert_array_almost_equal(erds.sa.orig_offset, [2.4])
    assert_array_equal(erds.sa.time, [np.arange(2.5, 11, 2.5)])

    # now with closest match
    erds = eventrelated_dataset(ds, evs, time_attr='time', match='closest')
    expected_nsamples = 3
    assert_equal(len(erds), 1)
    closest_samples = np.repeat(np.arange(2, 2 + expected_nsamples),
                                nfeatures)
    assert_array_equal(erds.samples[0], closest_samples)
    assert_array_equal(erds.sa.orig_onset, [onset])
    assert_array_equal(erds.sa.orig_duration, [duration])
    assert_array_almost_equal(erds.sa.orig_offset, [-0.1])
    assert_array_equal(erds.sa.time, [np.arange(5.0, 11, 2.5)])

    # now test the way back
    results = np.arange(erds.nfeatures)
    assert_array_equal(erds.a.mapper.reverse1(results),
                       results.reshape(expected_nsamples, nfeatures))
    # what about multiple results?
    nresults = 5
    results = dataset_wizard([results] * nresults)
    # and let's have an attribute to make it more difficult
    results.sa['myattr'] = np.arange(5)
    rds = erds.a.mapper.reverse(results)
    unrolled = results.samples.reshape(nresults * expected_nsamples,
                                       nfeatures)
    assert_array_equal(rds, unrolled)
    assert_array_equal(rds.sa.myattr,
                       np.repeat(results.sa.myattr, expected_nsamples))