def test_correct_dimensions_order(self, clf):
    """Check that classifiers treat dimension 0 as samples.

    Uses datasets crafted so that flipping the dimension order over the
    same storage would make the problem unseparable (rows would collapse
    into identical samples), hence 100% training accuracy demonstrates
    the correct samples-first orientation.
    """
    # specially crafted dataset -- if dimensions are flipped over
    # the same storage, problem becomes unseparable. Like in this case
    # incorrect order of dimensions lead to equal samples [0, 1, 0]
    traindatas = [
        dataset_wizard(samples=np.array([[0, 0, 1.0],
                                         [1, 0, 0]]),
                       targets=[0, 1]),
        dataset_wizard(samples=np.array([[0, 0.0],
                                         [1, 1]]),
                       targets=[0, 1])]

    clf.ca.change_temporarily(enable_ca=['training_stats'])
    for traindata in traindatas:
        clf.train(traindata)
        # FIX: use repr() instead of the deprecated backtick syntax
        # (`clf`), which was removed in Python 3
        self.failUnlessEqual(
            clf.ca.training_stats.percent_correct, 100.0,
            "Classifier %s must have 100%% correct learning on %s. Has %f"
            % (repr(clf), traindata.samples,
               clf.ca.training_stats.percent_correct))

        # and we must be able to predict every original sample thus
        for i in xrange(traindata.nsamples):
            sample = traindata.samples[i, :]
            predicted = clf.predict([sample])
            self.failUnlessEqual(
                [predicted], traindata.targets[i],
                "We must be able to predict sample %s using " % sample +
                "classifier %s" % repr(clf))
    clf.ca.reset_changed_temporarily()
def test_correct_dimensions_order(self, clf):
    """Check that classifiers treat dimension 0 as samples.

    Uses datasets crafted so that flipping the dimension order over the
    same storage would make the problem unseparable (rows would collapse
    into identical samples), hence 100% training accuracy demonstrates
    the correct samples-first orientation.
    """
    # specially crafted dataset -- if dimensions are flipped over
    # the same storage, problem becomes unseparable. Like in this case
    # incorrect order of dimensions lead to equal samples [0, 1, 0]
    traindatas = [
        dataset_wizard(samples=np.array([[0, 0, 1.0],
                                         [1, 0, 0]]),
                       targets=[-1, 1]),
        dataset_wizard(samples=np.array([[0, 0.0],
                                         [1, 1]]),
                       targets=[-1, 1])]

    clf.ca.change_temporarily(enable_ca=['training_confusion'])
    for traindata in traindatas:
        clf.train(traindata)
        # FIX: use repr() instead of the deprecated backtick syntax
        # (` clf `), which was removed in Python 3
        self.failUnlessEqual(
            clf.ca.training_confusion.percent_correct, 100.0,
            "Classifier %s must have 100%% correct learning on %s. Has %f"
            % (repr(clf), traindata.samples,
               clf.ca.training_confusion.percent_correct))

        # and we must be able to predict every original sample thus
        for i in xrange(traindata.nsamples):
            sample = traindata.samples[i, :]
            predicted = clf.predict([sample])
            self.failUnlessEqual(
                [predicted], traindata.targets[i],
                "We must be able to predict sample %s using " % sample +
                "classifier %s" % repr(clf))
    clf.ca.reset_changed_temporarily()
def test_feature_selection_classifier(self):
    """FeatureSelectionClassifier with sensitivity-based selection.

    Verifies that discarding the lowest/highest-weighted feature leads
    to the expected predictions of the wrapped sign classifier.
    """
    from mvpa.featsel.base import SensitivityBasedFeatureSelection
    from mvpa.featsel.helpers import FixedNElementTailSelector

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # should give lowest weight to the feature with highest index
    sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))
    feat_sel_rev = SensitivityBasedFeatureSelection(
        sens_ana_rev, FixedNElementTailSelector(1))

    samples = np.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1],
                        [-1, 0, 1], [1, -1, 1]])
    testdata3 = dataset_wizard(samples=samples, targets=1)
    # dummy train data so proper mapper gets created
    traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                               targets=[1, 2])

    # targets
    res110 = [1, 1, 1, -1, -1]
    res011 = [-1, 1, -1, 1, -1]

    # first classifier -- 0th feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                        enable_ca=['feature_ids'])
    self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
    clf011.train(traindata)

    self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
    # just silly test if we get values assigned in the 'ProxyClassifier'
    self.failUnless(len(clf011.ca.estimates) == len(res110),
                    msg="We need to pass values into ProxyClassifier")
    self.clf_sign.ca.reset_changed_temporarily()

    # FIX: the explanatory message used to be a no-op bare string
    # statement after the assertion; attach it as msg= so it is shown
    # on failure
    self.failUnlessEqual(
        len(clf011.ca.feature_ids), 2,
        msg="Feature selection classifier had to be trained on 2 features")

    # first classifier -- last feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
    clf011.train(traindata)
    self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
def test_coarsen_chunks(self):
    """Just basic testing for now"""
    chunks = [1, 1, 2, 2, 3, 3, 4, 4]
    ds = dataset_wizard(samples=np.arange(len(chunks)).reshape(
        (len(chunks), 1)), targets=[1] * 8, chunks=chunks)
    coarsen_chunks(ds, nchunks=2)
    chunks1 = coarsen_chunks(chunks, nchunks=2)
    # in-place coarsening of the dataset and coarsening of the plain
    # list must agree
    self.failUnless((chunks1 == ds.chunks).all())
    self.failUnless((chunks1 == np.asarray([0, 0, 0, 0, 1, 1, 1, 1])).all())

    ds2 = dataset_wizard(samples=np.arange(len(chunks)).reshape(
        (len(chunks), 1)), targets=[1] * 8, chunks=range(len(chunks)))
    coarsen_chunks(ds2, nchunks=2)
    # FIX: compare against ds2 (the dataset coarsened just above),
    # not ds -- the original re-checked ds, a copy-paste bug which
    # left ds2's coarsening unverified
    self.failUnless((chunks1 == ds2.chunks).all())
def test_feature_selection_classifier(self):
    """FeatureSelectionClassifier with sensitivity-based selection.

    Verifies that discarding the lowest/highest-weighted feature leads
    to the expected predictions of the wrapped sign classifier.
    """
    from mvpa.featsel.base import SensitivityBasedFeatureSelection
    from mvpa.featsel.helpers import FixedNElementTailSelector

    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()
    # should give lowest weight to the feature with highest index
    sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))
    feat_sel_rev = SensitivityBasedFeatureSelection(
        sens_ana_rev, FixedNElementTailSelector(1))

    samples = np.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1],
                        [-1, 0, 1], [1, -1, 1]])
    testdata3 = dataset_wizard(samples=samples, targets=1)
    # dummy train data so proper mapper gets created
    traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                               targets=[1, 2])

    # targets
    res110 = [1, 1, 1, -1, -1]
    res011 = [-1, 1, -1, 1, -1]

    # first classifier -- 0th feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                                        enable_ca=['feature_ids'])
    self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
    clf011.train(traindata)

    self.failUnlessEqual(clf011.predict(testdata3.samples), res011)
    # just silly test if we get values assigned in the 'ProxyClassifier'
    self.failUnless(len(clf011.ca.estimates) == len(res110),
                    msg="We need to pass values into ProxyClassifier")
    self.clf_sign.ca.reset_changed_temporarily()

    # FIX: the explanatory message used to be a no-op bare string
    # statement after the assertion; attach it as msg= so it is shown
    # on failure
    self.failUnlessEqual(
        clf011.mapper._oshape, (2,),
        msg="Feature selection classifier had to be trained on 2 features")

    # first classifier -- last feature should be discarded
    clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
    clf011.train(traindata)
    self.failUnlessEqual(clf011.predict(testdata3.samples), res110)
def pure_multivariate_signal(patterns, signal2noise=1.5,
                             chunks=None, targets=[0, 1]):
    """Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """
    total = 4 * patterns
    half = 2 * patterns

    # standard-normal noise as the baseline
    data = np.random.normal(size=(total, 2))

    # push the four quadrants apart along both axes
    data[:half, 1] += signal2noise
    data[half:, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[patterns:half, 0] += signal2noise
    data[half:3 * patterns, 0] -= signal2noise
    data[3 * patterns:, 0] += signal2noise

    # label layout: first and last quarter share one target,
    # the middle half gets the other
    regs = np.array(list(targets[0:1]) * patterns
                    + list(targets[1:2]) * (2 * patterns)
                    + list(targets[0:1]) * patterns)

    if chunks is None:
        chunks = range(total)
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
def pure_multivariate_signal(patterns, signal2noise=1.5, chunks=None,
                             targets=[0, 1]):
    """Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%

    Parameters
    ----------
    patterns : int
      Quarter of the total number of samples (total is 4*patterns).
    signal2noise : float
      Offset applied to the standard-normal noise to separate quadrants.
    chunks : sequence or None
      Chunks attribute; defaults to one chunk per sample.
    targets : sequence of two labels
      Labels for the two conditions.  Generalized from the previously
      hard-coded [0, 1]; the default preserves the old behavior.
      Never mutated, so the mutable default is safe here.
    """
    # start with noise
    data = np.random.normal(size=(4 * patterns, 2))

    # add signal
    data[:2 * patterns, 1] += signal2noise
    data[2 * patterns:4 * patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2 * patterns:3 * patterns, 0] -= signal2noise
    data[patterns:2 * patterns, 0] += signal2noise
    data[3 * patterns:4 * patterns, 0] += signal2noise

    # two conditions
    regs = np.array((targets[0:1] * patterns)
                    + (targets[1:2] * 2 * patterns)
                    + (targets[0:1] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
def test_feature_selection_classifier_with_regression(self):
    """FeatureSelectionClassifier wrapping a regression.

    Guards against estimates silently being overwritten with the
    predictions when prediction is run twice through the proxy.
    """
    from mvpa.featsel.base import \
         SensitivityBasedFeatureSelection
    from mvpa.featsel.helpers import \
         FixedNElementTailSelector
    if sample_clf_reg is None:
        # none regression was found, so nothing to test
        return
    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))

    # now test with regression-based classifier. The problem is
    # that it is determining predictions twice from values and
    # then setting the values from the results, which the second
    # time is set to predictions.  The final outcome is that the
    # values are actually predictions...
    dat = dataset_wizard(samples=np.random.randn(4, 10),
                         targets=[-1, -1, 1, 1])
    clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
    clf_reg.train(dat)
    _ = clf_reg.predict(dat.samples)
    # estimates and predictions must differ somewhere
    self.failIf((np.array(clf_reg.ca.estimates)
                 - clf_reg.ca.predictions).sum() == 0,
                msg="Values were set to the predictions in %s."
                    % sample_clf_reg)
def test_feature_selection_classifier_with_regression(self):
    """FeatureSelectionClassifier wrapping a regression.

    Guards against estimates silently being overwritten with the
    predictions when prediction is run twice through the proxy.
    """
    from mvpa.featsel.base import \
         SensitivityBasedFeatureSelection
    from mvpa.featsel.helpers import \
         FixedNElementTailSelector
    if sample_clf_reg is None:
        # none regression was found, so nothing to test
        return
    # should give lowest weight to the feature with lowest index
    sens_ana = SillySensitivityAnalyzer()

    # corresponding feature selections
    feat_sel = SensitivityBasedFeatureSelection(
        sens_ana, FixedNElementTailSelector(1, mode='discard'))

    # now test with regression-based classifier. The problem is
    # that it is determining predictions twice from values and
    # then setting the values from the results, which the second
    # time is set to predictions.  The final outcome is that the
    # values are actually predictions...
    dat = dataset_wizard(samples=np.random.randn(4, 10),
                         targets=[-1, -1, 1, 1])
    clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
    clf_reg.train(dat)
    _ = clf_reg.predict(dat.samples)
    # estimates and predictions must differ somewhere
    self.failIf(
        (np.array(clf_reg.ca.estimates)
         - clf_reg.ca.predictions).sum() == 0,
        msg="Values were set to the predictions in %s." % sample_clf_reg)
def dumb_feature_binary_dataset():
    """Very simple binary (2 labels) dataset

    24 samples over 2 features: first feature runs 1..12, second
    alternates 0/1; first half labeled 0, second half labeled 1.
    """
    data = [[level, toggle]
            for level in range(1, 13)
            for toggle in (0, 1)]
    regs = [0] * 12 + [1] * 12
    return dataset_wizard(samples=np.array(data), targets=regs,
                          chunks=range(len(regs)))
def dumb_feature_dataset():
    """Create a very simple dataset with 2 features and 3 labels

    24 samples: first feature runs 1..12, second alternates 0/1;
    labels 1, 2, 3 are assigned to consecutive thirds of the samples.
    """
    data = [[level, toggle]
            for level in range(1, 13)
            for toggle in (0, 1)]
    regs = [1] * 8 + [2] * 8 + [3] * 8
    return dataset_wizard(samples=np.array(data), targets=regs,
                          chunks=range(len(regs)))
def test_origid_handling():
    """init_origids: generation, preservation on subsetting, and modes."""
    ds = dataset_wizard(np.atleast_2d(np.arange(35)).T)
    ds.init_origids('both')
    ok_(ds.nsamples == 35)
    # one unique origid per sample, a single one for the lone feature
    assert_equal(len(np.unique(ds.sa.origids)), 35)
    assert_equal(len(np.unique(ds.fa.origids)), 1)
    selector = [3, 7, 10, 15]
    subds = ds[selector]
    # origids must follow the selected samples
    assert_array_equal(subds.sa.origids, ds.sa.origids[selector])

    # Now if we request new origids if they are present we could
    # expect different behavior
    # NOTE(review): mode='raises' (with the trailing s) is presumably an
    # invalid mode name that triggers ValueError -- confirm intentional
    assert_raises(ValueError, subds.init_origids, 'both', mode='raises')
    sa_origids = subds.sa.origids.copy()
    fa_origids = subds.fa.origids.copy()
    for s in ('both', 'samples', 'features'):
        # existing origids with mode='raise' must complain
        assert_raises(RuntimeError, subds.init_origids, s, mode='raise')
        subds.init_origids(s, mode='existing')
        # we should have the same origids as before
        assert_array_equal(subds.sa.origids, sa_origids)
        assert_array_equal(subds.fa.origids, fa_origids)

    # Lets now change, which should be default behavior
    subds.init_origids('both')
    assert_equal(len(sa_origids), len(subds.sa.origids))
    assert_equal(len(fa_origids), len(subds.fa.origids))
    # values should change though
    ok_((sa_origids != subds.sa.origids).any())
    ok_((fa_origids != subds.fa.origids).any())
def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0,
                            x_min=-2.0, x_max=3.0, sigma=0.2):
    """A straight line with some Gaussian noise.
    """
    # evenly spaced abscissa over [x_min, x_max]
    x = np.linspace(start=x_min, stop=x_max, num=size)
    # noisy linear response
    y = x * slope + intercept + np.random.randn(size) * sigma
    return dataset_wizard(samples=x[:, None], targets=y)
def test_idhash():
    """idhash must change on data/attribute modification and only then."""
    ds = dataset_wizard(np.arange(12).reshape((4, 3)),
                        targets=1, chunks=1)
    origid = ds.idhash
    #XXX BUG -- no assurance that labels would become an array... for now
    # -- do manually
    ds.targets = np.array([3, 1, 2, 3])   # change all labels
    ok_(origid != ds.idhash,
        msg="Changing all targets should alter dataset's idhash")

    origid = ds.idhash
    # read-only access must leave the hash alone
    z = ds.targets[1]
    assert_equal(origid, ds.idhash,
                 msg="Accessing shouldn't change idhash")
    z = ds.chunks
    assert_equal(origid, ds.idhash,
                 msg="Accessing shouldn't change idhash")
    # ... but mutating through the obtained reference must not go unnoticed
    z[2] = 333
    ok_(origid != ds.idhash,
        msg="Changing value in attribute should change idhash")

    origid = ds.idhash
    ds.samples[1, 1] = 1000
    ok_(origid != ds.idhash,
        msg="Changing value in data should change idhash")

    origid = ds.idhash
    orig_labels = ds.targets #.copy()
    ds.permute_targets()
    ok_(origid != ds.idhash,
        msg="Permutation also changes idhash")

    ds.targets = orig_labels
    ok_(origid == ds.idhash,
        msg="idhash should be restored after reassigning orig targets")
def test_aggregation(self):
    """aggregate_features with np.mean collapses features into one column
    of per-sample means."""
    src = dataset_wizard(np.arange(20).reshape((4, 5)),
                         targets=1, chunks=1)
    aggregated = aggregate_features(src, np.mean)
    ok_(aggregated.nsamples == 4)
    ok_(aggregated.nfeatures == 1)
    # row means of 0..19 laid out 4x5
    assert_array_equal(aggregated.samples[:, 0], [2, 7, 12, 17])
def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0,
                            x_min=-2.0, x_max=3.0, sigma=0.2):
    """A straight line with some Gaussian noise.
    """
    abscissa = np.linspace(start=x_min, stop=x_max, num=size)
    # zero-mean Gaussian jitter scaled by sigma
    jitter = np.random.randn(size) * sigma
    ordinate = abscissa * slope + intercept + jitter
    return dataset_wizard(samples=abscissa[:, None], targets=ordinate)
def setUp(self):
    """Instantiate the toy classifiers and a small binary dataset."""
    self.clf_sign = SameSignClassifier()
    self.clf_less1 = Less1Classifier()

    # simple binary dataset
    self.data_bin_1 = dataset_wizard(
        samples=[[0, 0], [-10, -1], [1, 0.1], [1, -1], [-1, 1]],
        targets=[1, 1, 1, -1, -1],  # labels
        chunks=[0, 1, 2, 2, 3])  # chunks
def setUp(self):
    """Instantiate the toy classifiers and a small binary dataset."""
    self.clf_sign = SameSignClassifier()
    self.clf_less1 = Less1Classifier()

    # simple binary dataset
    self.data_bin_1 = dataset_wizard(
        samples=[[0, 0], [-10, -1], [1, 0.1], [1, -1], [-1, 1]],
        targets=[1, 1, 1, -1, -1],  # labels
        chunks=[0, 1, 2, 2, 3])  # chunks
def test_str():
    """String form of wizard-built datasets with varying attribute counts."""
    samples = np.arange(12, dtype=np.int8).reshape((4, 3))
    args = (samples, range(4), [1, 1, 2, 2])
    # build with 1 then 2 positional args (samples; samples+targets)
    for nargs in range(1, len(args)):
        ds = dataset_wizard(*args[:nargs])
        rendering = str(ds)
        ok_(rendering.startswith('<Dataset: 4x3@int8'))
        ok_(rendering.endswith('>'))
def dumb_feature_binary_dataset():
    """Very simple binary (2 labels) dataset

    24 samples over 2 features: first feature runs 1..12, second
    alternates 0/1; first half labeled 0, second half labeled 1.
    """
    data = []
    for value in range(1, 13):
        data.append([value, 0])
        data.append([value, 1])
    regs = [0] * 12 + [1] * 12
    return dataset_wizard(samples=np.array(data), targets=regs,
                          chunks=range(len(regs)))
def dumb_feature_dataset():
    """Create a very simple dataset with 2 features and 3 labels

    24 samples: first feature runs 1..12, second alternates 0/1;
    labels 1, 2, 3 are assigned to consecutive thirds of the samples.
    """
    data = []
    for value in range(1, 13):
        data.append([value, 0])
        data.append([value, 1])
    regs = [1] * 8 + [2] * 8 + [3] * 8
    return dataset_wizard(samples=np.array(data), targets=regs,
                          chunks=range(len(regs)))
def test_aggregation(self):
    """Mean aggregation over features must yield one column of row means."""
    ds = dataset_wizard(np.arange(20).reshape((4, 5)),
                        targets=1, chunks=1)
    result = aggregate_features(ds, np.mean)
    ok_(result.nsamples == 4)
    ok_(result.nfeatures == 1)
    assert_array_equal(result.samples[:, 0], [2, 7, 12, 17])
def test_coarsen_chunks(self):
    """Just basic testing for now"""
    chunks = [1, 1, 2, 2, 3, 3, 4, 4]
    ds = dataset_wizard(samples=np.arange(len(chunks)).reshape(
        (len(chunks), 1)), targets=[1] * 8, chunks=chunks)
    coarsen_chunks(ds, nchunks=2)
    chunks1 = coarsen_chunks(chunks, nchunks=2)
    # in-place coarsening of the dataset and coarsening of the plain
    # list must agree
    self.failUnless((chunks1 == ds.chunks).all())
    self.failUnless((chunks1 == np.asarray([0, 0, 0, 0, 1, 1, 1, 1])).all())

    ds2 = dataset_wizard(samples=np.arange(len(chunks)).reshape(
        (len(chunks), 1)), targets=[1] * 8, chunks=range(len(chunks)))
    coarsen_chunks(ds2, nchunks=2)
    # FIX: compare against ds2 (the dataset coarsened just above),
    # not ds -- the original re-checked ds, a copy-paste bug which
    # left ds2's coarsening unverified
    self.failUnless((chunks1 == ds2.chunks).all())
def test_mergeds2():
    """Test composition of new datasets by addition of existing ones
    """
    data = dataset_wizard([range(5)], targets=1, chunks=1)
    assert_array_equal(data.UT, [1])

    # simple sequence has to be a single pattern
    assert_equal(data.nsamples, 1)
    # check correct pattern layout (1x5)
    assert_array_equal(data.samples, [[0, 1, 2, 3, 4]])

    # check for single labels and origin
    assert_array_equal(data.targets, [1])
    assert_array_equal(data.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(DatasetError, data.append,
                  dataset_wizard(np.ones((2, 3)), targets=1, chunks=1))

    # now add two real patterns
    dss = datasets['uni2large'].samples
    data.append(dataset_wizard(dss[:2, :5], targets=2, chunks=2))
    assert_equal(data.nfeatures, 5)
    assert_array_equal(data.targets, [1, 2, 2])
    assert_array_equal(data.chunks, [1, 2, 2])

    # test automatic origins
    data.append(dataset_wizard(dss[3:5, :5], targets=3, chunks=[0, 1]))
    assert_array_equal(data.chunks, [1, 2, 2, 0, 1])

    # test unique class labels
    assert_array_equal(data.UT, [1, 2, 3])

    # test wrong label length
    assert_raises(ValueError, dataset_wizard, dss[:4, :5],
                  targets=[1, 2, 3],
                  chunks=2)

    # test wrong origin length
    assert_raises(ValueError, dataset_wizard, dss[:4, :5],
                  targets=[1, 2, 3, 4],
                  chunks=[2, 2, 2])
def test_invar_features_removal(self):
    """remove_invariant_features must drop constant (zero) columns and
    keep the varying one intact."""
    rand_col = np.random.normal(size=(3, 1))
    ds = dataset_wizard(samples=np.hstack((np.zeros((3, 2)), rand_col)),
                        targets=1)
    self.failUnless(ds.nfeatures == 3)
    cleaned = remove_invariant_features(ds)
    self.failUnless(cleaned.nfeatures == 1)
    self.failUnless((cleaned.samples == rand_col).all())
def test_invar_features_removal(self):
    """Constant (all-zero) features must be stripped; variable one kept."""
    varying = np.random.normal(size=(3, 1))
    samples = np.hstack((np.zeros((3, 2)), varying))
    ds = dataset_wizard(samples=samples, targets=1)
    self.failUnless(ds.nfeatures == 3)
    reduced = remove_invariant_features(ds)
    self.failUnless(reduced.nfeatures == 1)
    self.failUnless((reduced.samples == varying).all())
def test_arrayattributes():
    """Plain sequences assigned as attributes must coerce to ndarray."""
    samples = np.arange(12).reshape((4, 3))
    labels = range(4)
    chunks = [1, 1, 2, 2]
    ds = dataset_wizard(samples, labels, chunks)

    # construction already coerced everything
    for attribute in (ds.samples, ds.targets, ds.chunks):
        ok_(isinstance(attribute, np.ndarray))

    # and so must re-assignment after the fact
    for name, value in (('targets', labels), ('chunks', chunks)):
        setattr(ds, name, value)
        ok_(isinstance(getattr(ds, name), np.ndarray))
def sin_modulated(n_instances, n_features, flat=False, noise=0.4):
    """Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. In the data label is a sin of a x^2 +
    uniform noise
    """
    if flat:
        # deterministic ramp over [0, pi), reshaped into the sample matrix
        data = np.arange(0.0, 1.0, 1.0 / n_instances) * np.pi
        data.resize(n_instances, n_features)
    else:
        # uniform random points in [0, pi)
        data = np.random.rand(n_instances, n_features) * np.pi
    label = np.round(np.sin((data ** 2).sum(axis=1)))
    label = label + np.random.rand(label.size) * noise
    return dataset_wizard(samples=data, targets=label)
def sin_modulated(n_instances, n_features, flat=False, noise=0.4):
    """Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. In the data label is a sin of a x^2 +
    uniform noise
    """
    if flat:
        samples = np.arange(0.0, 1.0, 1.0 / n_instances) * np.pi
        samples.resize(n_instances, n_features)
    else:
        samples = np.random.rand(n_instances, n_features) * np.pi
    # rounded sine of the squared-norm, plus uniform noise
    squared_sum = (samples ** 2).sum(1)
    targets = np.sin(squared_sum).round()
    targets += np.random.rand(targets.size) * noise
    return dataset_wizard(samples=samples, targets=targets)
def test_combined_samplesfeature_selection(): data = dataset_wizard(np.arange(20).reshape((4, 5)).view(myarray), targets=[1,2,3,4], chunks=[5,6,7,8]) # array subclass survives ok_(isinstance(data.samples, myarray)) ok_(data.nsamples == 4) ok_(data.nfeatures == 5) sel = data[[0, 3], [1, 2]] ok_(sel.nsamples == 2) ok_(sel.nfeatures == 2) assert_array_equal(sel.targets, [1, 4]) assert_array_equal(sel.chunks, [5, 8]) assert_array_equal(sel.samples, [[1, 2], [16, 17]]) # array subclass survives ok_(isinstance(sel.samples, myarray)) # should yield the same result if done sequentially sel2 = data[:, [1, 2]] sel2 = sel2[[0, 3]] assert_array_equal(sel.samples, sel2.samples) ok_(sel2.nsamples == 2) ok_(sel2.nfeatures == 2) # array subclass survives ok_(isinstance(sel.samples, myarray)) assert_raises(ValueError, data.__getitem__, (1, 2, 3)) # test correct behavior when selecting just single rows/columns single = data[0] ok_(single.nsamples == 1) ok_(single.nfeatures == 5) assert_array_equal(single.samples, [[0, 1, 2, 3, 4]]) single = data[:, 0] ok_(single.nsamples == 4) ok_(single.nfeatures == 1) assert_array_equal(single.samples, [[0], [5], [10], [15]]) single = data[1, 1] ok_(single.nsamples == 1) ok_(single.nfeatures == 1) assert_array_equal(single.samples, [[6]]) # array subclass survives ok_(isinstance(single.samples, myarray))
def test_samplesgroup_mapper():
    """mean_group_sample / mean_sample mappers over a labeled dataset."""
    data = np.arange(24).reshape(8, 3)
    labels = [0, 1] * 4
    chunks = np.repeat(np.array((0, 1)), 4)

    # correct results
    csamples = [[3, 4, 5], [6, 7, 8], [15, 16, 17], [18, 19, 20]]
    clabels = [0, 1, 0, 1]
    cchunks = [0, 0, 1, 1]

    ds = dataset_wizard(samples=data, targets=labels, chunks=chunks)
    # add some feature attribute -- just to check
    ds.fa['checker'] = np.arange(3)
    ds.init_origids('samples')

    m = mean_group_sample(['targets', 'chunks'])
    mds = m.forward(ds)
    assert_array_equal(mds.samples, csamples)
    # FAs should simply remain the same
    assert_array_equal(mds.fa.checker, np.arange(3))

    # now without grouping
    m = mean_sample()
    # forwarding just the samples should yield the same result
    assert_array_equal(m.forward(ds.samples),
                       m.forward(ds).samples)

    # directly apply to dataset
    # using untrained mapper
    m = mean_group_sample(['targets', 'chunks'])
    mapped = ds.get_mapped(m)

    assert_equal(mapped.nsamples, 4)
    assert_equal(mapped.nfeatures, 3)
    assert_array_equal(mapped.samples, csamples)
    assert_array_equal(mapped.targets, clabels)
    assert_array_equal(mapped.chunks, cchunks)
    # make sure origids get regenerated
    assert_array_equal([s.count('+') for s in mapped.sa.origids],
                       [1] * 4)

    # disbalanced dataset -- lets remove 0th sample so there is no target
    # 0 in 0th chunk
    ds_ = ds[[0, 1, 3, 5]]
    mapped = ds_.get_mapped(m)
    ok_(len(mapped) == 3)
    ok_(not None in mapped.sa.origids)
def chirp_linear(n_instances, n_features=4, n_nonbogus_features=2,
                 data_noise=0.4, noise=0.1):
    """Generates simple dataset for linear regressions

    Generates chirp signal, populates n_nonbogus_features out of
    n_features with it with different noise level and then provides
    signal itself with additional noise as labels
    """
    x = np.linspace(0, 1, n_instances)
    y = np.sin(10 * np.pi * x ** 2)     # the chirp
    data = np.random.normal(size=(n_instances, n_features)) * data_noise
    # inject the signal into the leading non-bogus columns
    data[:, :n_nonbogus_features] += y[:, np.newaxis]
    labels = y + np.random.normal(size=(n_instances,)) * noise
    return dataset_wizard(samples=data, targets=labels)
def chirp_linear(n_instances, n_features=4, n_nonbogus_features=2,
                 data_noise=0.4, noise=0.1):
    """Generates simple dataset for linear regressions

    Generates chirp signal, populates n_nonbogus_features out of
    n_features with it with different noise level and then provides
    signal itself with additional noise as labels
    """
    grid = np.linspace(0, 1, n_instances)
    chirp = np.sin(10 * np.pi * grid ** 2)
    samples = np.random.normal(size=(n_instances, n_features)) * data_noise
    # add the chirp on top of the noise in the informative columns
    for col in xrange(n_nonbogus_features):
        samples[:, col] += chirp
    targets = chirp + np.random.normal(size=(n_instances,)) * noise
    return dataset_wizard(samples=samples, targets=targets)
def test_classifier(self):
    """Parameter collections of parametrized classifiers, and reset of
    is_set flags upon training."""
    clf = ParametrizedClassifier()
    self.failUnlessEqual(len(clf.params.items()), 3)  # + targets # retrainable
    self.failUnlessEqual(len(clf.kernel_params.items()), 1)

    clfe = ParametrizedClassifierExtended()
    self.failUnlessEqual(len(clfe.params.items()), 3)
    self.failUnlessEqual(len(clfe.kernel_params.items()), 2)
    self.failUnlessEqual(len(clfe.kernel_params.listing), 2)

    # check assignment once again
    self.failUnlessEqual(clfe.kernel_params.kp2, 200.0)
    clfe.kernel_params.kp2 = 201.0
    self.failUnlessEqual(clfe.kernel_params.kp2, 201.0)
    self.failUnlessEqual(clfe.kernel_params.is_set("kp2"), True)
    # training must clear the is_set bookkeeping
    clfe.train(dataset_wizard(samples=[[0, 0]], targets=[1], chunks=[1]))
    self.failUnlessEqual(clfe.kernel_params.is_set("kp2"), False)
    self.failUnlessEqual(clfe.kernel_params.is_set(), False)
    self.failUnlessEqual(clfe.params.is_set(), False)
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """Generate a dataset from a linear function with AWGN
    (Added White Gaussian Noise).

    It can be multidimensional if 'slope' is a vector. If flat is True
    (in 1 dimesion) generate equally spaces samples instead of random ones.
    This is useful for the test phase.
    """
    dimensions = slope.size if isinstance(slope, np.ndarray) else 1

    if flat and dimensions == 1:
        x = np.linspace(0, 1, size)[:, np.newaxis]
    else:
        x = np.random.rand(size, dimensions)

    gaussian = np.random.randn(x.shape[0], 1) * noise_std
    y = np.dot(x, slope)[:, np.newaxis] + gaussian + intercept

    return dataset_wizard(samples=x, targets=y)
def linear_awgn(size=10, intercept=0.0, slope=0.4, noise_std=0.01, flat=False):
    """Generate a dataset from a linear function with AWGN
    (Added White Gaussian Noise).

    It can be multidimensional if 'slope' is a vector. If flat is True
    (in 1 dimesion) generate equally spaces samples instead of random ones.
    This is useful for the test phase.
    """
    if isinstance(slope, np.ndarray):
        ndim = slope.size
    else:
        ndim = 1

    if flat and ndim == 1:
        # evenly spaced abscissa for deterministic test data
        x = np.linspace(0, 1, size)[:, np.newaxis]
    else:
        x = np.random.rand(size, ndim)

    clean = np.dot(x, slope)[:, np.newaxis]
    y = clean + np.random.randn(x.shape[0], 1) * noise_std + intercept

    return dataset_wizard(samples=x, targets=y)
def wr1996(size=200):
    """Generate '6d robot arm' dataset (Williams and Rasmussen 1996)

    Was originally created in order to test the correctness of the
    implementation of kernel ARD.  For full details see:
    http://www.gaussianprocess.org/gpml/code/matlab/doc/regression.html#ard

    x_1 picked randomly in [-1.932, -0.453]
    x_2 picked randomly in [0.534, 3.142]
    r_1 = 2.0
    r_2 = 1.3
    f(x_1,x_2) = r_1 cos (x_1) + r_2 cos(x_1 + x_2) + N(0,0.0025)
    etc.

    Expected relevances:
    ell_1      1.804377
    ell_2      1.963956
    ell_3      8.884361
    ell_4     34.417657
    ell_5   1081.610451
    ell_6    375.445823
    sigma_f    2.379139
    sigma_n    0.050835
    """
    intervals = np.array([[-1.932, -0.453], [0.534, 3.142]])
    r = np.array([2.0, 1.3])
    # uniform draws scaled and shifted into the two intervals
    x = np.random.rand(size, 2)
    x *= np.array(intervals[:, 1] - intervals[:, 0])
    x += np.array(intervals[:, 0])
    if __debug__:
        for i in xrange(2):
            debug('DG', '%d columnt Min: %g Max: %g' %
                  (i, x[:, i].min(), x[:, i].max()))
    # NOTE(review): this expression computes
    # r_1*cos(x_1 + r_2*cos(x_1+x_2)), not the docstring's
    # r_1*cos(x_1) + r_2*cos(x_1+x_2) -- confirm which is intended
    y = r[0] * np.cos(x[:, 0] + r[1] * np.cos(x.sum(1))) + \
        np.random.randn(size) * np.sqrt(0.0025)
    y -= y.mean()
    # x3,x4: noisy copies of x1,x2; x5,x6: pure-noise (irrelevant) dims
    x34 = x + np.random.randn(size, 2) * 0.02
    x56 = np.random.randn(size, 2)
    x = np.hstack([x, x34, x56])
    return dataset_wizard(samples=x, targets=y)
def test_mapper_vs_zscore(): """Test by comparing to results of elderly z-score function """ # data: 40 sample feature line in 20d space (40x20; samples x features) dss = [ dataset_wizard(np.concatenate( [np.arange(40) for i in range(20)]).reshape(20,-1).T, targets=1, chunks=1), ] + datasets.values() for ds in dss: ds1 = deepcopy(ds) ds2 = deepcopy(ds) zsm = ZScoreMapper(chunks_attr=None) assert_raises(RuntimeError, zsm.forward, ds1.samples) zsm.train(ds1) ds1z = zsm.forward(ds1.samples) zscore(ds2, chunks_attr=None) assert_array_almost_equal(ds1z, ds2.samples) assert_array_equal(ds1.samples, ds.samples)
def test_binary_decorator(self):
    """BinaryClassifier must collapse labels into pos/neg groups and
    leave the training dataset's targets untouched."""
    ds = dataset_wizard(samples=[[0, 0], [0, 1], [1, 100], [-1, 0],
                                 [-1, -3], [0, -10]],
                        targets=['sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
    testdata = [[0, 0], [10, 10], [-10, -1], [0.1, -0.1], [-0.2, 0.2]]
    # labels [s]ame/[d]ifferent (sign), and [p]ositive/[n]egative
    # first element
    clf = SameSignClassifier()
    # lets create classifier to discriminate only between same/different,
    # which is a primary task of SameSignClassifier
    bclf1 = BinaryClassifier(clf=clf,
                             poslabels=['sp', 'sn'],
                             neglabels=['dp', 'dn'])

    orig_labels = ds.targets[:]
    bclf1.train(ds)

    # predictions come back as the label groups, not individual labels
    self.failUnless(bclf1.predict(testdata) ==
                    [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
                     ['dn', 'dp'], ['dn', 'dp']])

    self.failUnless((ds.targets == orig_labels).all(),
                    msg="BinaryClassifier should not alter labels")
def noisy_2d_fx(size_per_fx, dfx, sfx, center, noise_std=1): """Yet another generator of random dataset """ # used in projection example x = [] y = [] labels = [] for fx in sfx: nx = np.random.normal(size=size_per_fx) ny = fx(nx) + np.random.normal(size=nx.shape, scale=noise_std) x.append(nx) y.append(ny) # whenever larger than first function value labels.append(np.array(ny < dfx(nx), dtype='int')) samples = np.array((np.hstack(x), np.hstack(y))).squeeze().T labels = np.hstack(labels).squeeze().T samples += np.array(center) return dataset_wizard(samples=samples, targets=labels)
def test_arrayattributes():
    """Attribute assignment must coerce to ndarray; ragged sequences
    become object arrays; wrong lengths must be rejected."""
    samples = np.arange(12).reshape((4, 3))
    labels = range(4)
    chunks = [1, 1, 2, 2]
    ds = dataset_wizard(samples, labels, chunks)

    for a in (ds.samples, ds.targets, ds.chunks):
        ok_(isinstance(a, np.ndarray))

    ds.targets = labels
    ok_(isinstance(ds.targets, np.ndarray))

    ds.chunks = chunks
    ok_(isinstance(ds.chunks, np.ndarray))

    # we should allow assigning somewhat more complex
    # iterables -- use ndarray of dtype object then
    # and possibly spit out a warning
    ds.sa['complex_list'] = [[], [1], [1, 2], []]
    ok_(ds.sa.complex_list.dtype == object)

    # but incorrect length should still fail
    assert_raises(ValueError, ds.sa.__setitem__,
                  'complex_list2', [[], [1], [1, 2]])
def test_binary_decorator(self):
    """BinaryClassifier must collapse labels into pos/neg groups and
    leave the training dataset's targets untouched."""
    ds = dataset_wizard(samples=[[0, 0], [0, 1], [1, 100], [-1, 0],
                                 [-1, -3], [0, -10]],
                        targets=['sp', 'sp', 'sp', 'dn', 'sn', 'dp'])
    testdata = [[0, 0], [10, 10], [-10, -1], [0.1, -0.1], [-0.2, 0.2]]
    # labels [s]ame/[d]ifferent (sign), and [p]ositive/[n]egative
    # first element
    clf = SameSignClassifier()
    # lets create classifier to discriminate only between same/different,
    # which is a primary task of SameSignClassifier
    bclf1 = BinaryClassifier(clf=clf,
                             poslabels=['sp', 'sn'],
                             neglabels=['dp', 'dn'])

    orig_labels = ds.targets[:]
    bclf1.train(ds)

    # predictions come back as the label groups, not individual labels
    self.failUnless(
        bclf1.predict(testdata) ==
        [['sp', 'sn'], ['sp', 'sn'], ['sp', 'sn'],
         ['dn', 'dp'], ['dn', 'dp']])

    self.failUnless((ds.targets == orig_labels).all(),
                    msg="BinaryClassifier should not alter labels")
def setUp(self):
    """Random dataset: 100 samples x 10 features, 4 interleaved classes,
    10 chunks of 10 consecutive samples each."""
    # NOTE(review): i/10 relies on Python 2 integer (floor) division for
    # ints -- would need i // 10 when porting to Python 3
    self.data = dataset_wizard(np.random.normal(size=(100, 10)),
                               targets=[i % 4 for i in range(100)],
                               chunks=[i / 10 for i in range(100)])
bdot = base + '.' tcomp = [bdot + a for a in attrs] return tcomp def activate(): """Activate the PyMVPA Collections completer. """ ipget().set_hook('complete_command', pymvpa_completer, re_key='.*') ############################################################################# if __name__ == '__main__': # Testing/debugging, can be done only under interactive IPython session from mvpa.datasets.base import dataset_wizard t = dataset_wizard([1, 2, 3], targets=1, chunks=2) ip = ipget().IP assert (not 'targets' in ip.complete('t.sa.')) assert (not 'chunks' in ip.complete('t.sa.')) from ipy_pymvpa_completer import activate activate() # A few simplistic tests assert ip.complete('t.ed') == [] assert ('targets' in ip.complete('t.sa.')) assert ('chunks' in ip.complete('t.sa.')) print 'Tests OK'
def setUp(self):
    """Random dataset: 100 samples x 10 features, 4 interleaved classes,
    10 chunks of 10 consecutive samples each."""
    # NOTE(review): i/10 relies on Python 2 integer (floor) division for
    # ints -- would need i // 10 when porting to Python 3
    self.data = dataset_wizard(np.random.normal(size=(100, 10)),
                               targets=[i % 4 for i in range(100)],
                               chunks=[i / 10 for i in range(100)])
First import a necessary pieces of PyMVPA -- this time each bit individually. """ from mvpa.datasets.base import dataset_wizard from mvpa.datasets.splitters import OddEvenSplitter from mvpa.clfs.svm import LinearCSVMC from mvpa.clfs.transerror import TransferError from mvpa.algorithms.cvtranserror import CrossValidatedTransferError from mvpa.measures.searchlight import Searchlight from mvpa.misc.data_generators import normal_feature_dataset """For the sake of simplicity, let's use a small artificial dataset.""" # overcomplicated way to generate an example dataset ds = normal_feature_dataset(perlabel=10, nlabels=2, nchunks=2, nfeatures=10, nonbogus_features=[3, 7], snr=5.0) dataset = dataset_wizard(samples=ds.samples, targets=ds.targets, chunks=ds.chunks) """Now it only takes three lines for a searchlight analysis.""" # setup measure to be computed in each sphere (cross-validated # generalization error on odd/even splits) cv = CrossValidatedTransferError(TransferError(LinearCSVMC()), OddEvenSplitter()) # setup searchlight with 5 mm radius and measure configured above sl = Searchlight(cv, radius=5) # run searchlight on dataset sl_map = sl(dataset) print "Best performing sphere error:", min(sl_map)
def test_simple(self, oblique):
    """Exercise ProcrusteanMapper on rotated/scaled/translated copies of
    real dataset samples, covering same- and cross-dimensionality
    source/target spaces in both forward and reverse directions.
    """
    d_orig = datasets['uni2large'].samples
    d_orig2 = datasets['uni4large'].samples  # NOTE(review): appears unused here
    # (label, source dims, target dims, whether exact recovery is expected)
    for sdim, nf_s, nf_t, full_test \
            in (('Same 2D', 2, 2, True),
                ('Same 10D', 10, 10, True),
                ('2D -> 3D', 2, 3, True),
                ('3D -> 2D', 3, 2, False)):

        # figure out some "random" rotation
        d = max(nf_s, nf_t)
        R = get_random_rotation(nf_s, nf_t, d_orig)
        if nf_s == nf_t:
            # a square rotation matrix must have det 1 and be orthonormal
            adR = np.abs(1.0 - np.linalg.det(R))
            self.failUnless(adR < 1e-10,
                            "Determinant of rotation matrix should "
                            "be 1. Got it 1+%g" % adR)
            self.failUnless(norm(np.dot(R, R.T)
                                 - np.eye(R.shape[0])) < 1e-10)

        for s, scaling in ((0.3, True), (1.0, False)):
            pm = ProcrusteanMapper(scaling=scaling, oblique=oblique)
            pm2 = ProcrusteanMapper(scaling=scaling, oblique=oblique)

            # arbitrary translations taken from the data itself
            t1, t2 = d_orig[23, 1], d_orig[22, 1]

            # Create source/target data
            d = d_orig[:, :nf_s]
            d_s = d + t1
            d_t = np.dot(s * d, R) + t2

            # train bloody mapper(s)
            ds = dataset_wizard(samples=d_s, targets=d_t)
            pm.train(ds)

            ## not possible with new interface
            #pm2.train(d_s, d_t)
            ## verify that both created the same transformation
            #npm2proj = norm(pm.proj - pm2.proj)
            #self.failUnless(npm2proj <= 1e-10,
            #                msg="Got transformation different by norm %g."
            #                " Had to be less than 1e-10" % npm2proj)
            #self.failUnless(norm(pm._offset_in - pm2._offset_in) <= 1e-10)
            #self.failUnless(norm(pm._offset_out - pm2._offset_out) <= 1e-10)

            # do forward transformation on the same source data
            d_s_f = pm.forward(d_s)

            self.failUnlessEqual(d_s_f.shape, d_t.shape,
                msg="Mapped shape should be identical to the d_t")

            # relative (normed) residual in the target space
            dsf = d_s_f - d_t
            ndsf = norm(dsf)/norm(d_t)
            if full_test:
                dsR = norm(s*R - pm.proj)
                if not oblique:
                    # exact recovery of rotation and scale is only
                    # guaranteed for the non-oblique flavor
                    self.failUnless(dsR <= 1e-12,
                        msg="We should have got reconstructed rotation+scaling "
                            "perfectly. Now got d scale*R=%g" % dsR)
                    self.failUnless(np.abs(s - pm._scale) < 1e-12,
                        msg="We should have got reconstructed scale "
                            "perfectly. Now got %g for %g" % (pm._scale, s))
                self.failUnless(ndsf <= 1e-12,
                    msg="%s: Failed to get to the target space correctly."
                        " normed error=%g" % (sdim, ndsf))

            # Test if we get back
            d_s_f_r = pm.reverse(d_s_f)

            # relative (normed) residual back in the source space
            dsfr = d_s_f_r - d_s
            ndsfr = norm(dsfr)/norm(d_s)
            if full_test:
                self.failUnless(ndsfr <= 1e-12,
                    msg="%s: Failed to reconstruct into source space correctly."
                        " normed error=%g" % (sdim, ndsfr))
def test_zscore():
    """Test z-scoring transformation

    Covers `zscore` (in-place, chunk-wise, and with a target-based
    parameter estimate) as well as the `ZScoreMapper` (global, explicit
    parameters, and chunk-wise).

    Fix: removed a stray debug ``print data_samples`` left over in the
    test body.
    """
    # dataset: mean=2, std=1
    samples = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)).\
        reshape((16, 1))
    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    assert_equal(data.samples.mean(), 2.0)
    assert_equal(data.samples.std(), 1.0)
    data_samples = data.samples.copy()
    zscore(data, chunks_attr='chunks')

    # copy should stay intact
    assert_equal(data_samples.mean(), 2.0)
    assert_equal(data_samples.std(), 1.0)

    # we should be able to operate on ndarrays
    # But we can't change type inplace for an array, can't we?
    assert_raises(TypeError, zscore, data_samples, chunks_attr=None)
    # so lets do manually
    data_samples = data_samples.astype(float)
    zscore(data_samples, chunks_attr=None)
    assert_array_equal(data.samples, data_samples)

    # check z-scoring
    check = np.array([-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0],
                     dtype='float64').reshape(16, 1)
    assert_array_equal(data.samples, check)

    data = dataset_wizard(samples.copy(), targets=range(16), chunks=[0] * 16)
    zscore(data, chunks_attr=None)
    assert_array_equal(data.samples, check)

    # check z-scoring taking set of labels as a baseline: mean/std are
    # estimated from the samples with targets 0 and 1 only (mean=1, std=1)
    data = dataset_wizard(samples.copy(),
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples + 1.0)

    # check that zscore modifies in-place; only guaranteed if no upcasting is
    # necessary
    samples = samples.astype('float')
    data = dataset_wizard(samples,
                          targets=[0, 2, 2, 2, 1] + [2] * 11,
                          chunks=[0] * 16)
    zscore(data, param_est=('targets', [0, 1]))
    assert_array_equal(samples, data.samples)

    # these might be duplicating code above -- but twice is better than nothing
    # dataset: mean=2, std=1
    raw = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2))
    # dataset: mean=12, std=1
    raw2 = np.array((0, 1, 3, 4, 2, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2)) + 10
    # zscore target
    check = [-2, -1, 1, 2, 0, 0, 1, -1, -1, 1, 1, -1, 0, 0, 0, 0]

    ds = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)
    pristine = dataset_wizard(raw.copy(), targets=range(16), chunks=[0] * 16)

    zm = ZScoreMapper()
    # should do global zscore by default
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]))
    # should not modify the source
    assert_array_equal(pristine, ds)

    # if we tell it a different mean it should obey the order
    zm = ZScoreMapper(params=(3, 1))
    zm.train(ds)
    assert_array_almost_equal(zm.forward(ds), np.transpose([check]) - 1)
    assert_array_equal(pristine, ds)

    # let's look at chunk-wise z-scoring
    ds = dataset_wizard(np.hstack((raw.copy(), raw2.copy())),
                        targets=range(32),
                        chunks=[0] * 16 + [1] * 16)
    # by default chunk-wise
    zm = ZScoreMapper()
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
    # we should be able to do that same manually
    zm = ZScoreMapper(params={0: (2, 1), 1: (12, 1)})
    zm.train(ds)                        # train
    assert_array_almost_equal(zm.forward(ds), np.transpose([check + check]))
def test_glm(self):
    """Test GLM betas and zstat estimates on a synthetic fMRI-like dataset.
    """
    # play fmri
    # full-blown HRF with initial dip and undershoot ;-)
    hrf_x = np.linspace(0, 25, 250)
    hrf = double_gamma_hrf(hrf_x) - single_gamma_hrf(hrf_x, 0.8, 1, 0.05)

    # come up with an experimental design
    samples = 1800
    fast_er_onsets = np.array([10, 200, 250, 500, 600, 900, 920, 1400])
    fast_er = np.zeros(samples)
    fast_er[fast_er_onsets] = 1
    # high resolution model of the convolved regressor
    model_hr = np.convolve(fast_er, hrf)[:samples]

    # downsample the regressor to fMRI resolution
    tr = 2.0
    model_lr = signal.resample(model_hr,
                               int(samples / tr / 10),
                               window='ham')

    # generate artifical fMRI data: one voxel carries the signal (at
    # amplitude 2), the others are pure noise around the baseline
    baseline = 800.0
    wsignal = baseline + 2 * model_lr + \
              np.random.randn(int(samples / tr / 10)) * 0.2
    nsignal = baseline + np.random.randn(int(samples / tr / 10)) * 0.5

    # build design matrix: bold-regressor and constant
    X = np.array([model_lr, np.repeat(1, len(model_lr))]).T

    # three 'voxel' dataset (columns: signal, noise, noise)
    data = dataset_wizard(samples=np.array((wsignal, nsignal, nsignal)).T,
                          targets=1)

    # check GLM betas
    glm = GLM(X)
    betas = glm(data)

    # betas for each feature and each regressor
    self.failUnless(betas.shape == (X.shape[1], data.nfeatures))

    # was np.absolute(betas.samples[1] - baseline < 10).all(): absolute()
    # wrapped the *boolean* comparison, so underestimated baselines were
    # never caught -- check |beta - baseline| < 10 as the msg intends
    self.failUnless((np.absolute(betas.samples[1] - baseline) < 10).all(),
        msg="baseline betas should be huge and around 800")

    # use .samples consistently instead of mixing with Dataset indexing
    self.failUnless(betas.samples[0, 0] > betas.samples[0, 1],
        msg="feature (with signal) beta should be larger than for noise")

    if cfg.getboolean('tests', 'labile', default='yes'):
        self.failUnless(np.absolute(betas.samples[0, 1]) < 0.5)
        self.failUnless(np.absolute(betas.samples[0, 0]) > 1.0)

    # check GLM zscores
    glm = GLM(X, voi='zstat')
    zstats = glm(data)

    self.failUnless(zstats.shape == betas.shape)

    self.failUnless((zstats.samples[1] > 1000).all(),
        msg='constant zstats should be huge')

    if cfg.getboolean('tests', 'labile', default='yes'):
        # original expression compared betas although the msg asserts
        # about zstats -- compare zstat magnitudes as the message intends
        self.failUnless(np.absolute(zstats.samples[0, 0])
                        > np.absolute(zstats.samples[0, 1]),
            msg='with signal should have higher zstats')
def normal_feature_dataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                           means=None, nonbogus_features=None, snr=3.0,
                           normalize=True):
    """Generate a univariate dataset with normal noise and specified means.

    Could be considered to be a generalization of `pure_multivariate_signal`
    where means=[ [0,1], [1,0] ].

    Specify either means or `nonbogus_features` so means get assigned
    accordingly.  If neither `means` nor `nonbogus_features` are provided,
    data will be pure noise and no per-label information.

    Parameters
    ----------
    perlabel : int, optional
      Number of samples per each label
    nlabels : int, optional
      Number of labels in the dataset
    nfeatures : int, optional
      Total number of features (including bogus features which carry
      no label-related signal)
    nchunks : int, optional
      Number of chunks (perlabel should be multiple of nchunks)
    means : None or ndarray of (nlabels, nfeatures) shape
      Specified means for each of features (columns) for all labels (rows).
    nonbogus_features : None or list of int
      Indexes of non-bogus features (1 per label).
    snr : float, optional
      Signal-to-noise ration assuming that signal has std 1.0 so we
      just divide random normal noise by snr
    normalize : bool, optional
      Divide by max(abs()) value to bring data into [-1, 1] range.
    """
    # noise gets std 1/sqrt(snr) so a unit-std signal yields the requested SNR
    data = np.random.standard_normal((perlabel*nlabels, nfeatures))/np.sqrt(snr)
    if (means is None) and (not nonbogus_features is None):
        if len(nonbogus_features) > nlabels:
            raise ValueError, "Can't assign simply a feature to a " + \
                  "class: more nonbogus_features than labels"
        # derive means from nonbogus_features:
        means = np.zeros((len(nonbogus_features), nfeatures))
        # pure multivariate -- single bit per feature
        for i in xrange(len(nonbogus_features)):
            means[i, nonbogus_features[i]] = 1.0
    if not means is None:
        # add mean (each label's block of perlabel samples gets its row)
        data += np.repeat(np.array(means, ndmin=2), perlabel, axis=0)
    if normalize:
        # bring it 'under 1', since otherwise some classifiers have difficulties
        # during optimization
        data = 1.0/(np.max(np.abs(data))) * data
    labels = np.concatenate([np.repeat('L%d' % i, perlabel)
                             for i in range(nlabels)])
    chunks = np.concatenate([np.repeat(range(nchunks), perlabel/nchunks)
                             for i in range(nlabels)])
    ds = dataset_wizard(data, targets=labels, chunks=chunks)

    # If nonbogus was provided -- assign .a and .fa accordingly
    if nonbogus_features is not None:
        # NOTE(review): the feature attribute is named 'targets' here, which
        # shadows the conventional sample-attribute name; a sibling
        # definition of this function uses 'nonbogus_targets' instead --
        # confirm which key downstream consumers expect.
        ds.fa['targets'] = np.array([None]*nfeatures)
        ds.fa.targets[nonbogus_features] = ['L%d' % i for i in range(nlabels)]
        ds.a['nonbogus_features'] = nonbogus_features
        ds.a['bogus_features'] = [x for x in range(nfeatures)
                                  if not x in nonbogus_features]
    return ds
def normal_feature_dataset(perlabel=50, nlabels=2, nfeatures=4, nchunks=5,
                           means=None, nonbogus_features=None, snr=3.0,
                           normalize=True):
    """Generate a univariate dataset with normal noise and specified means.

    A generalization of `pure_multivariate_signal` with
    means=[ [0,1], [1,0] ].  Give either `means` or `nonbogus_features`
    to inject label-related signal; with neither, the data is pure noise
    carrying no per-label information.

    Parameters
    ----------
    perlabel : int, optional
      Number of samples per label.
    nlabels : int, optional
      Number of labels in the dataset.
    nfeatures : int, optional
      Total number of features, including bogus ones which carry no
      label-related signal.
    nchunks : int, optional
      Number of chunks (`perlabel` should be a multiple of `nchunks`).
    means : None or ndarray of (nlabels, nfeatures) shape
      Mean per feature (column) for every label (row).
    nonbogus_features : None or list of int
      Indexes of non-bogus features (1 per label).
    snr : float, optional
      Signal-to-noise ratio assuming the signal has std 1.0, i.e. the
      random normal noise is simply divided by `snr`.
    normalize : bool, optional
      Divide by max(abs()) to bring the data into the [-1, 1] range.
    """
    # noise with std 1/sqrt(snr) so a unit-std signal yields the given SNR
    data = np.random.standard_normal((perlabel*nlabels, nfeatures))/np.sqrt(snr)

    if means is None and nonbogus_features is not None:
        if len(nonbogus_features) > nlabels:
            raise ValueError("Can't assign simply a feature to a "
                             "class: more nonbogus_features than labels")
        # pure multivariate -- a single informative bit per feature
        means = np.zeros((len(nonbogus_features), nfeatures))
        for label_idx, feature_idx in enumerate(nonbogus_features):
            means[label_idx, feature_idx] = 1.0

    if means is not None:
        # shift each label's block of perlabel samples by its mean vector
        data += np.repeat(np.array(means, ndmin=2), perlabel, axis=0)

    if normalize:
        # bring it 'under 1' -- some classifiers have difficulties
        # optimizing over large-magnitude data
        data = 1.0/(np.max(np.abs(data))) * data

    labels = np.concatenate([np.repeat('L%d' % label, perlabel)
                             for label in range(nlabels)])
    chunks = np.concatenate([np.repeat(range(nchunks), perlabel // nchunks)
                             for _ in range(nlabels)])
    ds = dataset_wizard(data, targets=labels, chunks=chunks)

    # when ground truth was given, expose it via feature/dataset attributes
    if nonbogus_features is not None:
        nonbogus_targets = np.array([None] * nfeatures)
        nonbogus_targets[nonbogus_features] = ['L%d' % label
                                               for label in range(nlabels)]
        ds.fa['nonbogus_targets'] = nonbogus_targets
        ds.a['nonbogus_features'] = nonbogus_features
        ds.a['bogus_features'] = [f for f in range(nfeatures)
                                  if f not in nonbogus_features]
    return ds