def get_balancer(self, ds, method="pymvpa"):
    # TODO: Make imbalanced-learn methods available as well
    balanc = Balancer(count=self._n_balanced_ds,
                      apply_selection=True,
                      limit=None)
    self.gen = balanc.generate(ds)
    return self.gen

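# A minimal, self-contained sketch (not from the source) of consuming such a
# generator directly; `normal_feature_dataset` is a stock PyMVPA data
# generator, used here only for illustration.
from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.generators.resampling import Balancer

demo_ds = normal_feature_dataset(perlabel=20, nlabels=2, nfeatures=4, nchunks=5)
demo_bal = Balancer(count=3, apply_selection=True, limit=None)
for balanced_ds in demo_bal.generate(demo_ds):
    # each iteration yields an independently drawn, target-balanced dataset
    print(len(balanced_ds))
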
def test_split_featurewise_dataset_measure(self):
    ds = datasets['uni3small']
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        ChainNode([NFoldPartitioner(),
                   Splitter('partitions', attr_values=[1])]))

    sens = sana(ds)
    # a sensitivity for each chunk and each label combination
    assert_equal(sens.shape,
                 (len(ds.sa['chunks'].unique) * len(ds.sa['targets'].unique),
                  ds.nfeatures))

    # Let's try a more complex example with 'boosting'
    ds = datasets['uni3medium']
    ds.init_origids('samples')
    sana = RepeatedMeasure(
        SMLR(fit_all_weights=True).get_sensitivity_analyzer(),
        Balancer(amount=0.25, count=2, apply_selection=True),
        enable_ca=['datasets', 'repetition_results'])
    sens = sana(ds)

    assert_equal(sens.shape, (2 * len(ds.sa['targets'].unique), ds.nfeatures))
    splits = sana.ca.datasets
    self.assertEqual(len(splits), 2)
    self.assertTrue(
        np.all([s.nsamples == ds.nsamples // 4 for s in splits]))
    # should have used different samples
    self.assertTrue(np.any([splits[0].sa.origids != splits[1].sa.origids]))
    # and should have got different sensitivities
    self.assertTrue(np.any(sens[0] != sens[3]))

def test_apply_selection():
    ds = give_data()
    seed = np.random.randint(low=0, high=2 ** 32)
    # Two balancers with the same random seed, one with deferred application
    bal1 = Balancer(apply_selection=True, rng=seed)
    bal2 = Balancer(apply_selection=False, rng=seed)
    # Compare Balancer(apply_selection=True) to Balancer -> ApplySelection
    balanced1 = bal1(ds)
    balanced2 = ApplySelection()(bal2(ds))
    assert_array_equal(balanced1.samples, balanced2.samples)
    assert_array_equal(balanced1.sa['targets'], balanced2.sa['targets'])
    assert_array_equal(balanced1.sa['chunks'], balanced2.sa['chunks'])

def test_log_exclusions():
    ds = give_data()
    ds.sa['time_coords'] = np.arange(len(ds))
    # only mark the selection in an attribute
    bal = Balancer()
    balanced = bal(ds)

    tmpfile = tempfile.mktemp()
    logex = LogExclusions(tmpfile, append=False)
    logged = logex(balanced)
    subds = balanced[~balanced.sa['balanced_set'].value]
    assert_true(logged is balanced)
    with open(tmpfile, 'r') as fobj:
        assert_true(fobj.readline().startswith('# New entry'))
    excluded = np.genfromtxt(tmpfile, dtype='u1', delimiter=',')
    assert_array_equal(excluded[:, 0], subds.sa.chunks)
    assert_array_equal(excluded[:, 1], subds.sa.targets)
    assert_array_equal(excluded[:, 2], subds.sa.time_coords)
    os.unlink(tmpfile)

def test_split_clf_on_chainpartitioner(self):
    # pretty much a smoke test for #156
    ds = datasets['uni2small']
    part = ChainNode([NFoldPartitioner(cvtype=1),
                      Balancer(attr='targets', count=2,
                               limit='partitions', apply_selection=True)])
    partitions = list(part.generate(ds))
    sclf = SplitClassifier(sample_clf_lin, part, enable_ca=['stats', 'splits'])
    sclf.train(ds)
    pred = sclf.predict(ds)
    assert_equal(len(pred), len(ds))  # rudimentary check
    assert_equal(len(sclf.ca.splits), len(partitions))
    assert_equal(len(sclf.clfs), len(partitions))

    # now let's do sensitivity analyzer just in case
    sclf.untrain()
    sensana = sclf.get_sensitivity_analyzer()
    sens = sensana(ds)
    # basic check that sensitivities varied across splits
    from mvpa2.mappers.fx import FxMapper
    sens_stds = FxMapper('samples', np.std, uattrs=['targets'])(sens)
    assert_true(np.any(sens_stds != 0))

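# A sketch (toy data, standard PyMVPA imports) of the chained
# partitioner + balancer pattern tested above: with limit='partitions' each
# fold is balanced within its training/testing partitions rather than
# globally across the whole dataset.
from mvpa2.misc.data_generators import normal_feature_dataset
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.generators.resampling import Balancer
from mvpa2.base.node import ChainNode

toy_ds = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4, nchunks=5)
toy_part = ChainNode([NFoldPartitioner(cvtype=1),
                      Balancer(attr='targets', count=2,
                               limit='partitions', apply_selection=True)])
for fold in toy_part.generate(toy_ds):
    # each generated dataset carries a 'partitions' sample attribute
    print(fold.sa['partitions'].unique, len(fold))
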
def test_balancer():
    ds = give_data()
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
                 [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)

def balance_dataset(**kwargs):
    default_args = {'amount': 'equal',
                    'attr': 'targets',
                    'count': 10,
                    'limit': None,
                    'apply_selection': True}

    for arg in kwargs:
        # accept keys of the form 'balancer__<option>': strip the prefix
        # and override the corresponding default
        if arg.find('balancer') != -1:
            key = arg[arg.find('__') + 2:]
            default_args[key] = kwargs[arg]

    balancer = Balancer(**default_args)
    return balancer

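# Hypothetical call matching the key parsing above: any kwarg whose name
# contains 'balancer' has everything up to and including '__' stripped, and
# the remainder overrides the corresponding Balancer default.
bal = balance_dataset(balancer__count=5, balancer__attr='targets')
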
def setup_classifier(**kwargs):
    '''Build the classifier, feature selection, and cross-validation
    objects from keyword arguments.'''
    for arg in kwargs:
        if arg == 'clf_type':
            clf_type = kwargs[arg]
        if arg == 'fsel':
            f_sel = kwargs[arg]
        if arg == 'cv_type':
            cv_approach = kwargs[arg]
        if arg == 'cv_folds':
            # a value truncating to 0 (e.g. 0.5) is treated as a fraction,
            # otherwise as an integer fold count
            if int(kwargs[arg]) == 0:
                cv_type = float(kwargs[arg])
            else:
                cv_type = int(kwargs[arg])
        if arg == 'permutations':
            permutations = int(kwargs[arg])
        if arg == 'cv_attribute':
            attribute = kwargs[arg]

    cv_n = cv_type

    ################# Classifier #######################
    if clf_type == 'SVM':
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])
    elif clf_type == 'GNB':
        clf = GNB()
    elif clf_type == 'LDA':
        clf = LDA()
    elif clf_type == 'QDA':
        clf = QDA()
    elif clf_type == 'SMLR':
        clf = SMLR()
    elif clf_type == 'RbfSVM':
        sk_clf = SVC(gamma=0.1, C=1)
        clf = SKLLearnerAdapter(sk_clf, enable_ca=['probabilities'])
    elif clf_type == 'GP':
        clf = GPR()
    else:
        clf = LinearCSVMC(C=1, probability=1, enable_ca=['probabilities'])

    ############## Feature Selection #########################
    if f_sel == 'True':
        logger.info('Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FractionTailSelector(0.05, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'Fixed':
        logger.info('Fixed Feature Selection selected.')
        fsel = SensitivityBasedFeatureSelection(
            OneWayAnova(),
            FixedNElementTailSelector(100, mode='select', tail='upper'))
        fclf = FeatureSelectionClassifier(clf, fsel)
    elif f_sel == 'PCA':
        from mvpa2.mappers.skl_adaptor import SKLTransformer
        from sklearn.decomposition import PCA
        logger.info('PCA Feature Selection selected.')
        fsel = SKLTransformer(PCA(n_components=45))
        fclf = FeatureSelectionClassifier(clf, fsel)
    else:
        fclf = clf

    ######################### Permutations #############################
    if permutations != 0:
        if __debug__:
            debug.active += ["STATMC"]
        repeater = Repeater(count=permutations)
        permutator = AttributePermutator('targets',
                                         limit={'partitions': 1},
                                         count=1)
        partitioner = NFoldPartitioner(cvtype=cv_n, attr=attribute)
        null_cv = CrossValidation(clf,
                                  ChainNode([partitioner, permutator],
                                            space=partitioner.get_space()),
                                  errorfx=mean_mismatch_error)
        distr_est = MCNullDist(repeater,
                               tail='left',
                               measure=null_cv,
                               enable_ca=['dist_samples'])
        #postproc = mean_sample()
    else:
        distr_est = None
        #postproc = None

    ########################################################
    if cv_approach == 'n_fold':
        if cv_type != 0:
            splitter_used = NFoldPartitioner(cvtype=cv_type, attr=attribute)
        else:
            splitter_used = NFoldPartitioner(cvtype=1, attr=attribute)
    else:
        splitter_used = HalfPartitioner(attr=attribute)

    # balance targets within each training/testing partition
    chain_splitter = ChainNode([splitter_used,
                                Balancer(attr='targets',
                                         count=1,
                                         limit='partitions',
                                         apply_selection=True)],
                               space='partitions')

    #############################################################
    if distr_est is None:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               enable_ca=['stats', 'repetition_results'])
    else:
        cvte = CrossValidation(fclf,
                               chain_splitter,
                               errorfx=mean_mismatch_error,
                               null_dist=distr_est,
                               enable_ca=['stats', 'repetition_results'])

    logger.info('Classifier set...')

    return [fclf, cvte]

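# Hypothetical invocation; the keyword names mirror the kwargs parsed above,
# and the returned pair is the (possibly feature-selecting) classifier and
# the cross-validation node.
fclf, cvte = setup_classifier(clf_type='SVM',
                              fsel='True',
                              cv_type='n_fold',
                              cv_folds=1,
                              permutations=0,
                              cv_attribute='chunks')
# err = cvte(ds)  # ds is assumed to be a preprocessed PyMVPA dataset
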
def test_balancer():
    ds = give_data()
    ds.sa['ids'] = np.arange(len(ds))  # some sa to ease tracking of samples
    # only mark the selection in an attribute
    bal = Balancer()
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    assert_true(ds.samples is res.samples.base)
    # should kick out 2 samples in each chunk of 10
    assert_almost_equal(np.mean(res.sa.balanced_set), 0.8)
    # same as above, but actually apply the selection
    bal = Balancer(apply_selection=True, count=5)
    # just run it once
    res = bal(ds)
    # we get a new dataset, with shared samples
    assert_false(ds is res)
    # should kick out 2 samples in each chunk of 10
    assert_equal(len(res), int(0.8 * len(ds)))
    # now use it as a generator
    dses = list(bal.generate(ds))
    assert_equal(len(dses), 5)
    # if we rerun again, it would be a different selection
    res2 = bal(ds)
    assert_true(np.any(res.sa.ids != res2.sa.ids))

    # but if we create a balancer providing a seed rng int,
    # results should be identical
    bal = Balancer(apply_selection=True, count=5, rng=1)
    assert_false(np.any(bal(ds).sa.ids != bal(ds).sa.ids))

    # But results should differ if we use .generate to produce those multiple
    # balanced datasets
    b = Balancer(apply_selection=True, count=3, rng=1)
    balanced = list(b.generate(ds))
    assert_false(all(balanced[0].sa.ids == balanced[1].sa.ids))
    assert_false(all(balanced[0].sa.ids == balanced[2].sa.ids))
    assert_false(all(balanced[1].sa.ids == balanced[2].sa.ids))

    # And should be exactly the same
    for ds_a, ds_b in zip(balanced, b.generate(ds)):
        assert_datasets_equal(ds_a, ds_b)

    # Contribution by Chris Markiewicz
    # And interleaving __call__ and generator fetches
    gen1 = b.generate(ds)
    gen2 = b.generate(ds)

    seq1, seq2, seq3 = [], [], []

    for i in xrange(3):
        seq1.append(gen1.next())
        seq2.append(gen2.next())
        seq3.append(b(ds))

    # Produces expected sequences
    for i in xrange(3):
        assert_datasets_equal(balanced[i], seq1[i])
        assert_datasets_equal(balanced[i], seq2[i])

    # And all __call__s return the same result
    ds_a = seq3[0]
    for ds_b in seq3[1:]:
        assert_array_equal(ds_a.sa.ids, ds_b.sa.ids)

    # with limit
    bal = Balancer(limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(res.sa['chunks'].unique, (3,))
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [2] * 4)
    # same but include all offlimit samples
    bal = Balancer(limit={'chunks': 3}, include_offlimit=True,
                   apply_selection=True)
    res = bal(ds)
    assert_array_equal(res.sa['chunks'].unique, range(10))
    # chunk three still balanced, but the rest is not, i.e. all samples included
    assert_equal(get_nelements_per_value(res[res.sa.chunks == 3].sa.targets).values(),
                 [2] * 4)
    assert_equal(get_nelements_per_value(res.sa.chunks).values(),
                 [10, 10, 10, 8, 10, 10, 10, 10, 10, 10])
    # fixed amount
    bal = Balancer(amount=1, limit={'chunks': 3}, apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.sa.targets).values(),
                 [1] * 4)
    # fraction
    bal = Balancer(amount=0.499, limit=None, apply_selection=True)
    res = bal(ds)
    assert_array_equal(
        np.round(np.array(get_nelements_per_value(ds.sa.targets).values()) * 0.5),
        np.array(get_nelements_per_value(res.sa.targets).values()))
    # check on feature attribute
    ds.fa['one'] = np.tile([1, 2], 5)
    ds.fa['chk'] = np.repeat([1, 2], 5)
    bal = Balancer(attr='one', amount=2, limit='chk', apply_selection=True)
    res = bal(ds)
    assert_equal(get_nelements_per_value(res.fa.one).values(), [4] * 2)

def get_crossvalidation_instance(learner, partitioner, errorfx,
                                 sampling_repetitions=1,
                                 learner_space='targets',
                                 balance_training=None,
                                 permutations=0,
                                 avg_datafold_results=True,
                                 prob_tail='left'):
    from mvpa2.base.node import ChainNode
    from mvpa2.measures.base import CrossValidation
    if balance_training is not None:
        # balance training data
        try:
            amount = int(balance_training)
        except ValueError:
            try:
                amount = float(balance_training)
            except ValueError:
                amount = balance_training
        from mvpa2.generators.resampling import Balancer
        balancer = Balancer(amount=amount, attr=learner_space,
                            count=sampling_repetitions,
                            limit={partitioner.get_space(): 1},
                            apply_selection=True,
                            include_offlimit=True)
    else:
        balancer = None
    # set learner space
    learner.set_space(learner_space)
    # setup generator for data folding -- put in a chain node for easy
    # amending
    gennode = ChainNode([partitioner], space=partitioner.get_space())
    if avg_datafold_results:
        from mvpa2.mappers.fx import mean_sample
        postproc = mean_sample()
    else:
        postproc = None
    if balancer is not None:
        # enable balancing step for each partitioning step
        gennode.append(balancer)
    if permutations > 0:
        from mvpa2.generators.base import Repeater
        from mvpa2.generators.permutation import AttributePermutator
        from mvpa2.clfs.stats import MCNullDist
        # how often do we want to shuffle the data
        repeater = Repeater(count=permutations)
        # permute the training part of a dataset exactly ONCE
        permutator = AttributePermutator(learner_space,
                                         limit={partitioner.get_space(): 1},
                                         count=1)
        # CV with null-distribution estimation that permutes the training
        # data for each fold independently
        perm_gen_node = copy.deepcopy(gennode)
        perm_gen_node.append(permutator)
        null_cv = CrossValidation(learner, perm_gen_node, postproc=postproc,
                                  errorfx=errorfx)
        # Monte Carlo distribution estimator
        distr_est = MCNullDist(repeater, tail=prob_tail, measure=null_cv,
                               enable_ca=['dist_samples'])
        # pass the p-values as feature attributes on to the results
        pass_attr = [('ca.null_prob', 'fa', 1)]
    else:
        distr_est = None
        pass_attr = None
    # final CV node
    cv = CrossValidation(learner, gennode, errorfx=errorfx,
                         null_dist=distr_est, postproc=postproc,
                         enable_ca=['stats', 'null_prob'],
                         pass_attr=pass_attr)
    return cv

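# A sketch of wiring this helper up (the dataset `ds` is assumed to exist;
# all imported names are standard PyMVPA):
from mvpa2.clfs.svm import LinearCSVMC
from mvpa2.generators.partition import NFoldPartitioner
from mvpa2.misc.errorfx import mean_mismatch_error

cv = get_crossvalidation_instance(LinearCSVMC(),
                                  NFoldPartitioner(),
                                  mean_mismatch_error,
                                  sampling_repetitions=2,
                                  balance_training='equal',
                                  permutations=0)
# res = cv(ds)  # fold errors averaged via mean_sample() by default
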
'''
conf['label_dropped'] = 'FIX0'
conf['label_included'] = 'NEW'+ev+','+'OLD'+ev
count_ = 5
ds.targets = np.core.defchararray.add(np.array(ds.sa[field_].value, dtype=np.str),
                                      np.array(ds.sa.evidence, dtype=np.str))
'''
ds.targets = ds.sa.memory_status
conf['label_dropped'] = 'None'
conf['label_included'] = 'all'

ds = preprocess_dataset(ds, data_type, **conf)
count_ = 1
field_ = 'memory'

balanc = Balancer(count=count_, apply_selection=True, limit=None)
gen = balanc.generate(ds)

cv_storage = StoreResults()

clf = LinearCSVMC(C=1)

# This is used for the sklearn crossvalidation
y = np.zeros_like(ds.targets, dtype=np.int_)
y[ds.targets == ds.uniquetargets[0]] = 1

# We need to modify the chunks in order to use sklearn
ds.chunks = np.arange(len(ds.chunks))

permut_ = []

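# Self-contained sketch (toy data, not from the source) of the label/chunk
# massaging above: binary targets become 0/1 for sklearn, and chunks are
# renumbered so every sample can act as its own group.
import numpy as np
from mvpa2.misc.data_generators import normal_feature_dataset

demo = normal_feature_dataset(perlabel=10, nlabels=2, nfeatures=4, nchunks=5)
y_demo = np.zeros_like(demo.targets, dtype=np.int_)
y_demo[demo.targets == demo.uniquetargets[0]] = 1
demo.chunks = np.arange(len(demo.chunks))
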