def _train(self, dataset):
    pmeasure = ProxyMeasure(
        self.lrn,
        postproc=BinaryFxNode(self.errorfx, self.lrn.space),
        skip_train=True  # do not train since fmeasure will
    )

    # First we need to replicate our RFE construct but this time
    # with pmeasure for the classifier
    rfe = RFE(
        self.fmeasure,
        pmeasure,
        Splitter('partitions'),
        fselector=self.fselector,
        bestdetector=None,
        train_pmeasure=False,
        stopping_criterion=None,  # full "track"
        update_sensitivity=self.update_sensitivity,
        enable_ca=['errors', 'nfeatures'])

    errors, nfeatures = [], []

    if __debug__:
        debug("RFEC", "Stage 1: initial nested CV/RFE for %s", (dataset,))

    for partition in self.partitioner.generate(dataset):
        rfe.train(partition)
        errors.append(rfe.ca.errors)
        nfeatures.append(rfe.ca.nfeatures)

    # mean errors across splits and find optimal number
    errors_mean = np.mean(errors, axis=0)
    nfeatures_mean = np.mean(nfeatures, axis=0)
    # we will take the "mean location" of the min to stay
    # within the most 'stable' choice
    mins_idx = np.where(errors_mean == np.min(errors_mean))[0]
    min_idx = mins_idx[int(len(mins_idx) / 2)]
    min_error = errors_mean[min_idx]
    assert (min_error == np.min(errors_mean))
    nfeatures_min = nfeatures_mean[min_idx]

    if __debug__:
        debug("RFEC",
              "Choosing among %d choices to have %d features with "
              "mean error=%.2g (initial mean error %.2g)",
              (len(mins_idx), nfeatures_min, min_error, errors_mean[0]))

    self.nfeatures_min = nfeatures_min

    if __debug__:
        debug("RFEC",
              "Stage 2: running RFE on full training dataset to "
              "distil best %d features" % nfeatures_min)

    super(SplitRFE, self)._train(dataset)
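# A minimal, hypothetical sketch (not part of the original source) of what the
# pmeasure constructed above effectively computes for the testing portion of a
# partition: predictions of the already-trained learner, reduced to a scalar
# error by errorfx. The helper name and the direct use of lrn.predict here are
# assumptions for illustration only.
def _proxy_error_sketch(lrn, errorfx, test_ds):
    # predict on the test samples and score the predictions against the
    # target attribute named by the learner's space (e.g. 'targets')
    predictions = lrn.predict(test_ds.samples)
    return errorfx(predictions, test_ds.sa[lrn.space].value)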
def test_james_problem_multiclass(self):
    percent = 80
    dataset = datasets['uni4large']
    #dataset = dataset[:, dataset.a.nonbogus_features]

    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(
                postproc=ChainMapper([
                    #FxMapper('features', l2_normed),
                    #FxMapper('samples', np.mean),
                    #FxMapper('samples', np.abs)
                    FxMapper('features', lambda x: np.argsort(np.abs(x))),
                    #maxofabs_sample()
                    mean_sample()
                ])),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0, mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['stats'])
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))
def test_james_problem(self):
    percent = 80
    dataset = datasets['uni2small']
    rfesvm_split = LinearCSVMC()
    fs = \
        RFE(rfesvm_split.get_sensitivity_analyzer(),
            ProxyMeasure(rfesvm_split,
                         postproc=BinaryFxNode(mean_mismatch_error,
                                               'targets')),
            Splitter('train'),
            fselector=FractionTailSelector(
                percent / 100.0, mode='select', tail='upper'),
            update_sensitivity=True)

    clf = FeatureSelectionClassifier(
        LinearCSVMC(),
        # on features selected via RFE
        fs)
    # update sensitivity at each step (since we're not using the
    # same CLF as sensitivity analyzer)

    class StoreResults(object):
        def __init__(self):
            self.storage = []

        def __call__(self, data, node, result):
            self.storage.append((node.measure.mapper.ca.history,
                                 node.measure.mapper.ca.errors))

    cv_storage = StoreResults()
    cv = CrossValidation(clf, NFoldPartitioner(), postproc=mean_sample(),
                         callback=cv_storage,
                         enable_ca=['confusion'])  # TODO -- it is stats
    #cv = SplitClassifier(clf)
    try:
        error = cv(dataset).samples.squeeze()
    except Exception as e:
        self.fail('CrossValidation cannot handle classifier with RFE '
                  'feature selection. Got exception: %s' % (e,))

    assert (len(cv_storage.storage) == len(dataset.sa['chunks'].unique))
    assert (len(cv_storage.storage[0]) == 2)  # (history, errors) per fold
    assert (len(cv_storage.storage[0][0]) == dataset.nfeatures)

    self.assertTrue(error < 0.2)
def test_ifs(self, svm):
    # measure for feature selection criterion and performance assessment
    # use the SAME clf!
    errorfx = mean_mismatch_error
    fmeasure = CrossValidation(svm, NFoldPartitioner(),
                               postproc=mean_sample())
    pmeasure = ProxyMeasure(svm,
                            postproc=BinaryFxNode(errorfx, 'targets'))

    ifs = IFS(fmeasure,
              pmeasure,
              Splitter('purpose', attr_values=['train', 'test']),
              # go for lower tail selection as data_measure will return
              # errors -> low is good
              fselector=FixedNElementTailSelector(1, tail='lower',
                                                  mode='select'),
              )

    wdata = self.get_data()
    wdata.sa['purpose'] = np.repeat('train', len(wdata))
    tdata = self.get_data()
    tdata.sa['purpose'] = np.repeat('test', len(tdata))
    ds = vstack((wdata, tdata))
    orig_nfeatures = ds.nfeatures

    ifs.train(ds)
    resds = ifs(ds)

    # fail if orig datasets are changed
    self.assertTrue(ds.nfeatures == orig_nfeatures)

    # check that the feature set with the least error is selected
    self.assertTrue(len(ifs.ca.errors))
    e = np.array(ifs.ca.errors)
    self.assertTrue(resds.nfeatures == e.argmin() + 1)

    # repeat with a dataset where the selection order is known
    wsignal = datasets['dumb2'].copy()
    wsignal.sa['purpose'] = np.repeat('train', len(wsignal))
    tsignal = datasets['dumb2'].copy()
    tsignal.sa['purpose'] = np.repeat('test', len(tsignal))
    signal = vstack((wsignal, tsignal))
    ifs.train(signal)
    resds = ifs(signal)
    self.assertTrue((resds.samples[:, 0] == signal.samples[:, 0]).all())
def test_cached_kernel_different_datasets(self):
    skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

    # Inspired by the problem Swaroop ran into
    k = LinearSGKernel(normalizer_cls=False)
    k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
    ck = CachedKernel(k_)

    clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
    clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

    cvte = CrossValidation(clf, NFoldPartitioner())
    cvte_ = CrossValidation(clf_, NFoldPartitioner())

    postproc = BinaryFxNode(mean_mismatch_error, 'targets')
    te = ProxyMeasure(clf, postproc=postproc)
    te_ = ProxyMeasure(clf_, postproc=postproc)

    for r in xrange(2):
        ds1 = datasets['uni2medium']
        errs1 = cvte(ds1)
        ck.compute(ds1)
        ok_(ck._recomputed)
        errs1_ = cvte_(ds1)
        ok_(~ck._recomputed)
        assert_array_equal(errs1, errs1_)

        ds2 = datasets['uni3small']
        errs2 = cvte(ds2)
        ck.compute(ds2)
        ok_(ck._recomputed)
        errs2_ = cvte_(ds2)
        ok_(~ck._recomputed)
        assert_array_equal(errs2, errs2_)

        ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
        te.train(datasets['uni3small'][::2])
        terr = np.asscalar(te(datasets['uni3small'][ssel]))
        te_.train(datasets['uni3small'][::2])
        terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
        ok_(~ck._recomputed)
        ok_(terr == terr_)
def test_rfe(self, clf):
    # sensitivity analyser and transfer error quantifier use the SAME clf!
    sens_ana = clf.get_sensitivity_analyzer(postproc=maxofabs_sample())
    pmeasure = ProxyMeasure(clf,
                            postproc=BinaryFxNode(mean_mismatch_error,
                                                  'targets'))
    cvmeasure = CrossValidation(clf, NFoldPartitioner(),
                                errorfx=mean_mismatch_error,
                                postproc=mean_sample())

    rfesvm_split = SplitClassifier(clf, OddEvenPartitioner())

    # explore a few recipes
    for rfe, data in [
            # because the clf is already trained when computing the
            # sensitivity map, prevent retraining for transfer error
            # calculation.
            # Use absolute of the svm weights as sensitivity
            (RFE(sens_ana,
                 pmeasure,
                 Splitter('train'),
                 fselector=FixedNElementTailSelector(1),
                 train_pmeasure=False),
             self.get_data()),
            # use cross-validation within training to get error for the
            # stopping point, but use full training data to derive sensitivity
            (RFE(sens_ana,
                 cvmeasure,
                 # give the same full dataset to sens_ana and cvmeasure
                 Repeater(2),
                 fselector=FractionTailSelector(0.70, mode='select',
                                                tail='upper'),
                 train_pmeasure=True),
             normal_feature_dataset(perlabel=20, nchunks=5, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5)),
            # use cross-validation (via SplitClassifier) and get mean
            # of normed sensitivities across those splits
            (RFE(rfesvm_split.get_sensitivity_analyzer(
                     postproc=ChainMapper([
                         FxMapper('features', l2_normed),
                         FxMapper('samples', np.mean),
                         FxMapper('samples', np.abs)])),
                 ConfusionBasedError(rfesvm_split, confusion_state='stats'),
                 # we will use the same full cv-training dataset
                 Repeater(2),
                 fselector=FractionTailSelector(0.50, mode='select',
                                                tail='upper'),
                 stopping_criterion=NBackHistoryStopCrit(BestDetector(), 10),
                 # we just extract it from existing confusion
                 train_pmeasure=False,
                 update_sensitivity=True),
             normal_feature_dataset(perlabel=28, nchunks=7, nfeatures=200,
                                    nonbogus_features=[0, 1], snr=1.5))
            ]:
        # prep data
        # data = datasets['uni2medium']
        data_nfeatures = data.nfeatures

        rfe.train(data)
        resds = rfe(data)

        # fail if orig datasets are changed
        self.assertTrue(data.nfeatures == data_nfeatures)

        # check that the feature set with the least error is selected
        if len(rfe.ca.errors):
            e = np.array(rfe.ca.errors)
            if isinstance(rfe._fselector, FixedNElementTailSelector):
                self.assertTrue(resds.nfeatures == data_nfeatures - e.argmin())
            else:
                imin = np.argmin(e)
                if 'does_feature_selection' in clf.__tags__:
                    # if clf is smart it might figure it out right away
                    assert_array_less(imin, len(e))
                else:
                    # in this case we can even check if we had an actual
                    # going down/up trend... although -- why up???
                    self.assertTrue(1 < imin < len(e) - 1)
        else:
            self.assertTrue(resds.nfeatures == data_nfeatures)

        # silly check if nfeatures is in decreasing order
        nfeatures = np.array(rfe.ca.nfeatures).copy()
        nfeatures.sort()
        self.assertTrue((nfeatures[::-1] == rfe.ca.nfeatures).all())

        # check if history has elements for every step
        self.assertTrue(set(rfe.ca.history)
                        == set(range(len(np.array(rfe.ca.errors)))))

        # The last (the largest) history value can be present multiple times
        # even if we remove 1 feature at a time -- we just need to stop well
        # in advance when we have more than 1 feature left ;)
        self.assertTrue(rfe.ca.nfeatures[-1]
                        == len(np.where(rfe.ca.history
                                        == max(rfe.ca.history))[0]))
def _train(self, dataset):
    pmeasure = ProxyMeasure(
        self.lrn,
        postproc=BinaryFxNode(self.errorfx, self.lrn.space),
        skip_train=not self.train_pmeasure  # do not train since fmeasure will
    )

    # First we need to replicate our RFE construct but this time
    # with pmeasure for the classifier
    rfe = RFE(
        self.fmeasure,
        pmeasure,
        Splitter('partitions'),
        fselector=self.fselector,
        bestdetector=None,
        train_pmeasure=self.train_pmeasure,
        stopping_criterion=None,  # full "track"
        update_sensitivity=self.update_sensitivity,
        enable_ca=['errors', 'nfeatures'])

    if __debug__:
        debug("RFEC", "Stage 1: initial nested CV/RFE for %s", (dataset,))

    if self.nproc != 1 and externals.exists('joblib'):
        nested_results = jl.Parallel(self.nproc)(
            jl.delayed(_process_partition)(rfe, partition)
            for partition in self.partitioner.generate(dataset))
    else:
        nested_results = [
            _process_partition(rfe, partition)
            for partition in self.partitioner.generate(dataset)]

    # unzip
    errors = [x[0] for x in nested_results]
    nfeatures = [x[1] for x in nested_results]

    self.ca.nested_nfeatures = nfeatures
    self.ca.nested_errors = errors

    # mean errors across splits and find optimal number
    errors_mean = np.mean(errors, axis=0)
    nfeatures_mean = np.mean(nfeatures, axis=0)
    # we will take the "mean location" of the min to stay
    # within the most 'stable' choice
    mins_idx = np.where(errors_mean == np.min(errors_mean))[0]
    min_idx = mins_idx[int(len(mins_idx) / 2)]
    min_error = errors_mean[min_idx]
    assert (min_error == np.min(errors_mean))
    nfeatures_min = nfeatures_mean[min_idx]

    if __debug__:
        debug("RFEC",
              "Choosing among %d choices to have %d features with "
              "mean error=%.2g (initial mean error %.2g)",
              (len(mins_idx), nfeatures_min, min_error, errors_mean[0]))

    self.nfeatures_min = nfeatures_min

    if __debug__:
        debug("RFEC",
              "Stage 2: running RFE on full training dataset to "
              "obtain the best %d features" % nfeatures_min)

    super(SplitRFE, self)._train(dataset)
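# A minimal sketch (an assumption, not shown in this section) of the
# _process_partition helper referenced above for Stage 1. Judging from the
# sequential variant of _train earlier in this section, it would train the
# nested RFE on one partition and return the tracked errors and feature counts.
def _process_partition(rfe, partition):
    # run the full nested RFE "track" on this partition
    rfe.train(partition)
    # the errors and nfeatures conditional attributes hold one value per RFE step
    return rfe.ca.errors, rfe.ca.nfeatures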