Example #1
    def test_correct_dimensions_order(self, clf):
        """To check if known/present Classifiers are working properly
        with samples being first dimension. Started to worry about
        possible problems while looking at sg where samples are 2nd
        dimension
        """
        # specially crafted dataset -- if dimensions are flipped over
        # the same storage, the problem becomes non-separable. E.g. in this
        # case an incorrect order of dimensions leads to equal samples [0, 1, 0]
        traindatas = [
            dataset_wizard(samples=np.array([[0, 0, 1.0], [1, 0, 0]]),
                           targets=[0, 1]),
            dataset_wizard(samples=np.array([[0, 0.0], [1, 1]]),
                           targets=[0, 1])
        ]

        clf.ca.change_temporarily(enable_ca=['training_stats'])
        for traindata in traindatas:
            clf.train(traindata)
            self.assertEqual(
                clf.ca.training_stats.percent_correct, 100.0,
                "Classifier %s must have 100%% correct learning on %s. Has %f"
                % (`clf`, traindata.samples,
                    clf.ca.training_stats.percent_correct))

            # and we must be able to predict every original sample thus
            for i in xrange(traindata.nsamples):
                sample = traindata.samples[i, :]
                predicted = clf.predict([sample])
                self.assertEqual(
                    [predicted], traindata.targets[i],
                    "We must be able to predict sample %s using " % sample +
                    "classifier %s" % ` clf `)
        clf.ca.reset_changed_temporarily()
Example #2
    def test_correct_dimensions_order(self, clf):
        """To check if known/present Classifiers are working properly
        with samples being first dimension. Started to worry about
        possible problems while looking at sg where samples are 2nd
        dimension
        """
        # specially crafted dataset -- if dimensions are flipped over
        # the same storage, the problem becomes non-separable. E.g. in this
        # case an incorrect order of dimensions leads to equal samples [0, 1, 0]
        traindatas = [
            dataset_wizard(samples=np.array([ [0, 0, 1.0],
                                        [1, 0, 0] ]), targets=[0, 1]),
            dataset_wizard(samples=np.array([ [0, 0.0],
                                      [1, 1] ]), targets=[0, 1])]

        clf.ca.change_temporarily(enable_ca = ['training_stats'])
        for traindata in traindatas:
            clf.train(traindata)
            self.assertEqual(clf.ca.training_stats.percent_correct, 100.0,
                "Classifier %s must have 100%% correct learning on %s. Has %f" %
                (`clf`, traindata.samples, clf.ca.training_stats.percent_correct))

            # and we must be able to predict every original sample thus
            for i in xrange(traindata.nsamples):
                sample = traindata.samples[i,:]
                predicted = clf.predict([sample])
                self.assertEqual([predicted], traindata.targets[i],
                    "We must be able to predict sample %s using " % sample +
                    "classifier %s" % `clf`)
        clf.ca.reset_changed_temporarily()
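Note: a minimal stand-alone sketch of the samples-first convention that the test above exercises (assuming numpy and PyMVPA are installed; dataset_wizard is imported from mvpa2.datasets.base, as in Example #24 below).

import numpy as np
from mvpa2.datasets.base import dataset_wizard

# two samples (rows) by three features (columns) -- samples are always
# the first dimension of the samples array
ds = dataset_wizard(samples=np.array([[0, 0, 1.0], [1, 0, 0]]),
                    targets=[0, 1])
print(ds.nsamples, ds.nfeatures)   # 2 samples, 3 features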
Example #3
    def test_feature_selection_classifier(self):
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector

        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()
        # should give lowest weight to the feature with highest index
        sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        feat_sel_rev = SensitivityBasedFeatureSelection(
            sens_ana_rev, FixedNElementTailSelector(1))

        samples = np.array([[0, 0, -1], [1, 0, 1], [-1, -1, 1], [-1, 0, 1],
                            [1, -1, 1]])

        testdata3 = dataset_wizard(samples=samples, targets=1)
        # dummy train data so proper mapper gets created
        traindata = dataset_wizard(samples=np.array([[0, 0, -1], [1, 0, 1]]),
                                   targets=[1, 2])

        # targets
        res110 = [1, 1, 1, -1, -1]
        res011 = [-1, 1, -1, 1, -1]

        # first classifier -- 0th feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign,
                                            feat_sel,
                                            enable_ca=['feature_ids'])

        self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
        clf011.train(traindata)

        self.assertEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.assertTrue(len(clf011.ca.estimates) == len(res110),
                        msg="We need to pass values into ProxyClassifier")
        self.clf_sign.ca.reset_changed_temporarily()

        self.assertEqual(clf011.mapper._oshape, (2,),
                         "Feature selection classifier had to be trained on 2 features")

        # second classifier -- last feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
        clf011.train(traindata)
        self.assertEqual(clf011.predict(testdata3.samples), res110)
Example #4
    def test_feature_selection_classifier(self):
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector

        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()
        # should give lowest weight to the feature with highest index
        sens_ana_rev = SillySensitivityAnalyzer(mult=-1)

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(sens_ana,
            FixedNElementTailSelector(1, mode='discard'))

        feat_sel_rev = SensitivityBasedFeatureSelection(sens_ana_rev,
            FixedNElementTailSelector(1))

        samples = np.array([ [0, 0, -1], [1, 0, 1], [-1, -1, 1],
                            [-1, 0, 1], [1, -1, 1] ])

        testdata3 = dataset_wizard(samples=samples, targets=1)
        # dummy train data so proper mapper gets created
        traindata = dataset_wizard(samples=np.array([ [0, 0, -1], [1, 0, 1] ]),
                            targets=[1, 2])

        # targets
        res110 = [1, 1, 1, -1, -1]
        res011 = [-1, 1, -1, 1, -1]

        # first classifier -- 0th feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel,
                    enable_ca=['feature_ids'])

        self.clf_sign.ca.change_temporarily(enable_ca=['estimates'])
        clf011.train(traindata)

        self.assertEqual(clf011.predict(testdata3.samples), res011)
        # just silly test if we get values assigned in the 'ProxyClassifier'
        self.assertTrue(len(clf011.ca.estimates) == len(res110),
                        msg="We need to pass values into ProxyClassifier")
        self.clf_sign.ca.reset_changed_temporarily()

        self.assertEqual(clf011.mapper._oshape, (2,),
                         "Feature selection classifier had to be trained on 2 features")

        # second classifier -- last feature should be discarded
        clf011 = FeatureSelectionClassifier(self.clf_sign, feat_sel_rev)
        clf011.train(traindata)
        self.assertEqual(clf011.predict(testdata3.samples), res110)
Example #5
    def test_coarsen_chunks(self):
        """Just basic testing for now"""
        chunks = [1,1,2,2,3,3,4,4]
        ds = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks),1)), targets=[1]*8, chunks=chunks)
        coarsen_chunks(ds, nchunks=2)
        chunks1 = coarsen_chunks(chunks, nchunks=2)
        self.assertTrue((chunks1 == ds.chunks).all())
        self.assertTrue((chunks1 == np.asarray([0,0,0,0,1,1,1,1])).all())

        ds2 = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks),1)), targets=[1]*8, chunks=list(range(len(chunks))))
        coarsen_chunks(ds2, nchunks=2)
        self.assertTrue((chunks1 == ds2.chunks).all())
Example #6
    def test_coarsen_chunks(self):
        """Just basic testing for now"""
        chunks = [1,1,2,2,3,3,4,4]
        ds = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks),1)), targets=[1]*8, chunks=chunks)
        coarsen_chunks(ds, nchunks=2)
        chunks1 = coarsen_chunks(chunks, nchunks=2)
        self.assertTrue((chunks1 == ds.chunks).all())
        self.assertTrue((chunks1 == np.asarray([0,0,0,0,1,1,1,1])).all())

        ds2 = dataset_wizard(samples=np.arange(len(chunks)).reshape(
            (len(chunks),1)), targets=[1]*8, chunks=range(len(chunks)))
        coarsen_chunks(ds2, nchunks=2)
        self.assertTrue((chunks1 == ds2.chunks).all())
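Note: a small stand-alone sketch of what the test above checks. The import location of coarsen_chunks is an assumption (mvpa2.datasets.miscfx in PyMVPA 2.x); adjust it to wherever the helper lives in your version.

import numpy as np
from mvpa2.datasets.base import dataset_wizard
from mvpa2.datasets.miscfx import coarsen_chunks   # assumed location

chunks = [1, 1, 2, 2, 3, 3, 4, 4]
ds = dataset_wizard(samples=np.arange(8).reshape((8, 1)),
                    targets=[1] * 8, chunks=chunks)
coarsen_chunks(ds, nchunks=2)                    # datasets are modified in place
new_chunks = coarsen_chunks(chunks, nchunks=2)   # plain sequences are returned
print(ds.chunks, new_chunks)                     # both should read [0 0 0 0 1 1 1 1]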
Example #7
def test_mergeds2():
    """Test composition of new datasets by addition of existing ones
    """
    data = dataset_wizard([range(5)], targets=1, chunks=1)

    assert_array_equal(data.UT, [1])

    # simple sequence has to be a single pattern
    assert_equal(data.nsamples, 1)
    # check correct pattern layout (1x5)
    assert_array_equal(data.samples, [[0, 1, 2, 3, 4]])

    # check for single labels and origin
    assert_array_equal(data.targets, [1])
    assert_array_equal(data.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(ValueError, vstack,
                  (data, dataset_wizard(np.ones((2, 3)), targets=1, chunks=1)))

    # now add two real patterns
    dss = datasets['uni2large'].samples
    data = vstack((data, dataset_wizard(dss[:2, :5], targets=2, chunks=2)))
    assert_equal(data.nfeatures, 5)
    assert_array_equal(data.targets, [1, 2, 2])
    assert_array_equal(data.chunks, [1, 2, 2])

    # test automatic origins
    data = vstack((data, (dataset_wizard(dss[3:5, :5],
                                         targets=3,
                                         chunks=[0, 1]))))
    assert_array_equal(data.chunks, [1, 2, 2, 0, 1])

    # test unique class labels
    assert_array_equal(data.UT, [1, 2, 3])

    # test wrong label length
    assert_raises(ValueError,
                  dataset_wizard,
                  dss[:4, :5],
                  targets=[1, 2, 3],
                  chunks=2)

    # test wrong origin length
    assert_raises(ValueError,
                  dataset_wizard,
                  dss[:4, :5],
                  targets=[1, 2, 3, 4],
                  chunks=[2, 2, 2])
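Note: a stand-alone sketch of the stacking pattern used above, assuming vstack is importable from mvpa2.base.dataset (the test above relies on it being in scope).

import numpy as np
from mvpa2.base.dataset import vstack            # assumed import location
from mvpa2.datasets.base import dataset_wizard

a = dataset_wizard([range(5)], targets=1, chunks=1)       # 1 sample x 5 features
b = dataset_wizard(np.ones((2, 5)), targets=2, chunks=2)  # 2 samples x 5 features
ab = vstack((a, b))
print(ab.nsamples, ab.nfeatures)   # 3 samples, 5 features
print(ab.targets)                  # [1 2 2]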
Example #8
def test_origid_handling():
    ds = dataset_wizard(np.atleast_2d(np.arange(35)).T)
    ds.init_origids('both')
    ok_(ds.nsamples == 35)
    assert_equal(len(np.unique(ds.sa.origids)), 35)
    assert_equal(len(np.unique(ds.fa.origids)), 1)
    selector = [3, 7, 10, 15]
    subds = ds[selector]
    assert_array_equal(subds.sa.origids, ds.sa.origids[selector])

    # Now if we request new origids if they are present we could
    # expect different behavior
    assert_raises(ValueError, subds.init_origids, 'both', mode='raises')
    sa_origids = subds.sa.origids.copy()
    fa_origids = subds.fa.origids.copy()
    for s in ('both', 'samples', 'features'):
        assert_raises(RuntimeError, subds.init_origids, s, mode='raise')
        subds.init_origids(s, mode='existing')
        # we should have the same origids as before
        assert_array_equal(subds.sa.origids, sa_origids)
        assert_array_equal(subds.fa.origids, fa_origids)

    # Lets now change, which should be default behavior
    subds.init_origids('both')
    assert_equal(len(sa_origids), len(subds.sa.origids))
    assert_equal(len(fa_origids), len(subds.fa.origids))
    # values should change though
    ok_((sa_origids != subds.sa.origids).any())
    ok_((fa_origids != subds.fa.origids).any())
Example #9
def test_idhash():
    ds = dataset_wizard(np.arange(12).reshape((4, 3)),
                 targets=1, chunks=1)
    origid = ds.idhash
    #XXX BUG -- no assurance that labels would become an array... for now -- do manually
    ds.targets = np.array([3, 1, 2, 3])   # change all labels
    ok_(origid != ds.idhash,
                    msg="Changing all targets should alter dataset's idhash")

    origid = ds.idhash

    z = ds.targets[1]
    assert_equal(origid, ds.idhash,
                 msg="Accessing shouldn't change idhash")
    z = ds.chunks
    assert_equal(origid, ds.idhash,
                 msg="Accessing shouldn't change idhash")
    z[2] = 333
    ok_(origid != ds.idhash,
        msg="Changing value in attribute should change idhash")

    origid = ds.idhash
    ds.samples[1, 1] = 1000
    ok_(origid != ds.idhash,
        msg="Changing value in data should change idhash")

    origid = ds.idhash
    orig_labels = ds.targets #.copy()
    ds.sa.targets = range(len(ds))
    ok_(origid != ds.idhash,
        msg="Chaging attribute also changes idhash")

    ds.targets = orig_labels
    ok_(origid == ds.idhash,
        msg="idhash should be restored after reassigning orig targets")
Example #10
def test_DissimilarityConsistencyMeasure():
    targets = np.tile(xrange(3),2)
    chunks = np.repeat(np.array((0,1)),3)
    # correct results
    cres1 = 0.41894348
    cres2 = np.array([[ 0.16137995, 0.73062639, 0.59441713]])
    dc1 = data[0:3,:] - np.mean(data[0:3,:],0)
    dc2 = data[3:6,:] - np.mean(data[3:6,:],0)
    center = squareform(np.corrcoef(pdist(dc1,'correlation'),pdist(dc2,'correlation')), 
                        checks=False).reshape((1,-1))
    dsm1 = stats.rankdata(pdist(data[0:3,:],'correlation').reshape((1,-1)))
    dsm2 = stats.rankdata(pdist(data[3:6,:],'correlation').reshape((1,-1)))

    spearman = squareform(np.corrcoef(np.vstack((dsm1,dsm2))), 
                        checks=False).reshape((1,-1))
    
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    dscm = DissimilarityConsistencyMeasure()
    res1 = dscm(ds)
    dscm_c = DissimilarityConsistencyMeasure(center_data=True)
    res2 = dscm_c(ds)
    dscm_sp = DissimilarityConsistencyMeasure(consistency_metric='spearman')
    res3 = dscm_sp(ds)
    ds.append(ds)
    chunks = np.repeat(np.array((0,1,2,)),4)
    ds.sa['chunks'] = chunks
    res4 = dscm(ds)
    assert_almost_equal(np.mean(res1.samples),cres1)
    assert_array_almost_equal(res2.samples, center)
    assert_array_almost_equal(res3.samples, spearman)
    assert_array_almost_equal(res4.samples,cres2)
Example #11
def dumb_feature_binary_dataset():
    """Very simple binary (2 labels) dataset
    """
    data = [
        [1, 0],
        [1, 1],
        [2, 0],
        [2, 1],
        [3, 0],
        [3, 1],
        [4, 0],
        [4, 1],
        [5, 0],
        [5, 1],
        [6, 0],
        [6, 1],
        [7, 0],
        [7, 1],
        [8, 0],
        [8, 1],
        [9, 0],
        [9, 1],
        [10, 0],
        [10, 1],
        [11, 0],
        [11, 1],
        [12, 0],
        [12, 1],
    ]
    regs = ([0] * 12) + ([1] * 12)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #12
def dumb_feature_dataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    data = [
        [1, 0],
        [1, 1],
        [2, 0],
        [2, 1],
        [3, 0],
        [3, 1],
        [4, 0],
        [4, 1],
        [5, 0],
        [5, 1],
        [6, 0],
        [6, 1],
        [7, 0],
        [7, 1],
        [8, 0],
        [8, 1],
        [9, 0],
        [9, 1],
        [10, 0],
        [10, 1],
        [11, 0],
        [11, 1],
        [12, 0],
        [12, 1],
    ]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #13
def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0, x_min=-2.0, x_max=3.0, sigma=0.2):
    """A straight line with some Gaussian noise.
    """
    x = np.linspace(start=x_min, stop=x_max, num=size)
    noise = np.random.randn(size) * sigma
    y = x * slope + intercept + noise
    return dataset_wizard(samples=x[:, None], targets=y)
Example #14
def pure_multivariate_signal(patterns,
                             signal2noise=1.5,
                             chunks=None,
                             targets=[0, 1]):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """

    # start with noise
    data = np.random.normal(size=(4 * patterns, 2))

    # add signal
    data[:2 * patterns, 1] += signal2noise

    data[2 * patterns:4 * patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2 * patterns:3 * patterns, 0] -= signal2noise
    data[patterns:2 * patterns, 0] += signal2noise
    data[3 * patterns:4 * patterns, 0] += signal2noise

    # two conditions
    regs = np.array((targets[0:1] * patterns) + (targets[1:2] * 2 * patterns) +
                    (targets[0:1] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
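Note: a quick sanity-check sketch for the generator above. It assumes the pure_multivariate_signal function from this example is already defined and numpy is imported as np.

ds = pure_multivariate_signal(patterns=10, signal2noise=1.5)
print(ds.nsamples, ds.nfeatures)   # 40 samples (4 clouds of 10), 2 features
print(np.unique(ds.targets))       # [0 1]
print(len(np.unique(ds.chunks)))   # 40 -- one chunk per sample by default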
Example #15
    def test_feature_selection_classifier_with_regression(self):
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression classifier was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(
            sens_ana, FixedNElementTailSelector(1, mode='discard'))

        # now test with regression-based classifier. The problem is
        # that it is determining predictions twice from values and
        # then setting the values from the results, which the second
        # time is set to predictions.  The final outcome is that the
        # values are actually predictions...
        dat = dataset_wizard(samples=np.random.randn(4, 10),
                             targets=[-1, -1, 1, 1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        _ = clf_reg.predict(dat.samples)
        self.failIf(
            (np.array(clf_reg.ca.estimates) -
             clf_reg.ca.predictions).sum() == 0,
            msg="Values were set to the predictions in %s." % sample_clf_reg)
Example #16
def test_mapper_vs_zscore():
    """Test by comparing to results of elderly z-score function
    """
    # data: 40 sample feature line in 20d space (40x20; samples x features)
    dss = [
        dataset_wizard(np.concatenate(
            [np.arange(40) for i in range(20)]).reshape(20,-1).T,
                targets=1, chunks=1),
        ] + datasets.values()

    for ds in dss:
        ds1 = deepcopy(ds)
        ds2 = deepcopy(ds)

        zsm = ZScoreMapper(chunks_attr=None)
        assert_raises(RuntimeError, zsm.forward, ds1.samples)
        idhashes = (idhash(ds1), idhash(ds1.samples))
        zsm.train(ds1)
        idhashes_train = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_train)

        # forward dataset
        ds1z_ds = zsm.forward(ds1)
        idhashes_forwardds = (idhash(ds1), idhash(ds1.samples))
        # must not modify samples in place!
        assert_equal(idhashes, idhashes_forwardds)

        # forward samples explicitly
        ds1z = zsm.forward(ds1.samples)
        idhashes_forward = (idhash(ds1), idhash(ds1.samples))
        assert_equal(idhashes, idhashes_forward)

        zscore(ds2, chunks_attr=None)
        assert_array_almost_equal(ds1z, ds2.samples)
        assert_array_equal(ds1.samples, ds.samples)
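Note: a condensed sketch of the equivalence being tested above. The import location mvpa2.mappers.zscore for ZScoreMapper and the functional zscore() is an assumption.

import numpy as np
from copy import deepcopy
from mvpa2.datasets.base import dataset_wizard
from mvpa2.mappers.zscore import ZScoreMapper, zscore   # assumed location

ds = dataset_wizard(np.random.randn(10, 4), targets=1, chunks=1)

zsm = ZScoreMapper(chunks_attr=None)    # pool all samples, ignore chunking
zsm.train(ds)
z_mapped = zsm.forward(ds.samples)      # does not modify ds.samples in place

ds2 = deepcopy(ds)
zscore(ds2, chunks_attr=None)           # functional, in-place variant
print(np.allclose(z_mapped, ds2.samples))   # expected: True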
Example #17
def test_PDist():
    targets = np.tile(xrange(3), 2)
    chunks = np.repeat(np.array((0, 1)), 3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    data_c = data - np.mean(data, 0)
    # DSM matrix elements should come out as samples of one feature
    # to be in line with what e.g. a classifier returns -- facilitates
    # collection in a searchlight ...
    euc = pdist(data, 'euclidean')[None].T
    pear = pdist(data, 'correlation')[None].T
    city = pdist(data, 'cityblock')[None].T
    center_sq = squareform(pdist(data_c, 'correlation'))

    # Now center each chunk separately
    dsm1 = PDist()
    dsm2 = PDist(pairwise_metric='euclidean')
    dsm3 = PDist(pairwise_metric='cityblock')
    dsm4 = PDist(center_data=True, square=True)
    assert_array_almost_equal(dsm1(ds).samples, pear)
    assert_array_almost_equal(dsm2(ds).samples, euc)
    dsm_res = dsm3(ds)
    assert_array_almost_equal(dsm_res.samples, city)
    # length corresponds to a single triangular matrix
    assert_equal(len(dsm_res.sa.pairs), len(ds) * (len(ds) - 1) / 2)
    # generated label pairs actually reflect the vector form generated by
    # squareform()
    dsm_res_square = squareform(dsm_res.samples.T[0])
    for i, p in enumerate(dsm_res.sa.pairs):
        assert_equal(dsm_res_square[p[0], p[1]], dsm_res.samples[i, 0])
    dsm_res = dsm4(ds)
    assert_array_almost_equal(dsm_res.samples, center_sq)
    # sample attributes are carried over
    assert_almost_equal(ds.sa.targets, dsm_res.sa.targets)
Example #18
def test_PDist():
    targets = np.tile(xrange(3),2)
    chunks = np.repeat(np.array((0,1)),3)
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    data_c = data - np.mean(data,0)
    # DSM matrix elements should come out as samples of one feature
    # to be in line with what e.g. a classifier returns -- facilitates
    # collection in a searchlight ...
    euc = pdist(data, 'euclidean')[None].T
    pear = pdist(data, 'correlation')[None].T
    city = pdist(data, 'cityblock')[None].T
    center_sq = squareform(pdist(data_c,'correlation'))

    # Now center each chunk separately
    dsm1 = PDist()
    dsm2 = PDist(pairwise_metric='euclidean')
    dsm3 = PDist(pairwise_metric='cityblock')
    dsm4 = PDist(center_data=True,square=True)
    assert_array_almost_equal(dsm1(ds).samples,pear)
    assert_array_almost_equal(dsm2(ds).samples,euc)
    dsm_res = dsm3(ds)
    assert_array_almost_equal(dsm_res.samples,city)
    # length corresponds to a single triangular matrix
    assert_equal(len(dsm_res.sa.pairs), len(ds) * (len(ds) - 1) / 2)
    # generated label pairs actually reflect the vector form generated by
    # squareform()
    dsm_res_square = squareform(dsm_res.samples.T[0])
    for i, p in enumerate(dsm_res.sa.pairs):
        assert_equal(dsm_res_square[p[0], p[1]], dsm_res.samples[i, 0])
    dsm_res = dsm4(ds)
    assert_array_almost_equal(dsm_res.samples,center_sq)
    # sample attributes are carried over
    assert_almost_equal(ds.sa.targets, dsm_res.sa.targets)
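Note: a stand-alone sketch of running PDist on a toy dataset. The import location mvpa2.measures.rsa is an assumption (PyMVPA 2.x).

import numpy as np
from mvpa2.datasets.base import dataset_wizard
from mvpa2.measures.rsa import PDist    # assumed location

toy = dataset_wizard(samples=np.random.randn(6, 10),
                     targets=np.tile(range(3), 2),
                     chunks=np.repeat([0, 1], 3))
dsm = PDist(pairwise_metric='euclidean')(toy)
print(dsm.samples.shape)   # (15, 1): 6*5/2 pairwise distances as a single feature
print(dsm.sa.pairs[:3])    # index pairs matching squareform()'s vector order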
Example #19
def pure_multivariate_signal(patterns, signal2noise = 1.5, chunks=None, targets=[0, 1]):
    """ Create a 2d dataset with a clear multivariate signal, but no
    univariate information.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%
    """

    # start with noise
    data = np.random.normal(size=(4*patterns, 2))

    # add signal
    data[:2*patterns, 1] += signal2noise

    data[2*patterns:4*patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2*patterns:3*patterns, 0] -= signal2noise
    data[patterns:2*patterns, 0] += signal2noise
    data[3*patterns:4*patterns, 0] += signal2noise

    # two conditions
    regs = np.array((targets[0:1] * patterns) + (targets[1:2] * 2 * patterns) + (targets[0:1] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
Example #20
    def test_feature_selection_classifier_with_regression(self):
        from mvpa2.featsel.base import \
             SensitivityBasedFeatureSelection
        from mvpa2.featsel.helpers import \
             FixedNElementTailSelector
        if sample_clf_reg is None:
            # no regression classifier was found, so nothing to test
            return
        # should give lowest weight to the feature with lowest index
        sens_ana = SillySensitivityAnalyzer()

        # corresponding feature selections
        feat_sel = SensitivityBasedFeatureSelection(sens_ana,
            FixedNElementTailSelector(1, mode='discard'))

        # now test with regression-based classifier. The problem is
        # that it is determining predictions twice from values and
        # then setting the values from the results, which the second
        # time is set to predictions.  The final outcome is that the
        # values are actually predictions...
        dat = dataset_wizard(samples=np.random.randn(4, 10),
                      targets=[-1, -1, 1, 1])
        clf_reg = FeatureSelectionClassifier(sample_clf_reg, feat_sel)
        clf_reg.train(dat)
        _ = clf_reg.predict(dat.samples)
        self.failIf((np.array(clf_reg.ca.estimates)
                     - clf_reg.ca.predictions).sum()==0,
                    msg="Values were set to the predictions in %s." %
                    sample_clf_reg)
Example #21
def load_fcmri_dataset(data, subjects, conditions, group, level, n_run=3):

    attributes = []
    samples = []

    for ic, c in enumerate(conditions):
        for isb, s in enumerate(subjects):
            for i in range(n_run):

                matrix = data[ic, isb, i, :]
                fmatrix = flatten_matrix(matrix)

                samples.append(fmatrix)
                attributes.append([c, s, i, group[isb], level[isb]])

    attributes = np.array(attributes)

    ds = dataset_wizard(np.array(samples),
                        targets=attributes.T[0],
                        chunks=attributes.T[1])
    ds.sa['run'] = attributes.T[2]
    ds.sa['group'] = attributes.T[3]
    ds.sa['level'] = np.int_(attributes.T[4])
    ds.sa['meditation'] = attributes.T[0]

    return ds
Example #23
def test_idhash():
    ds = dataset_wizard(np.arange(12).reshape((4, 3)), targets=1, chunks=1)
    origid = ds.idhash
    #XXX BUG -- no assurance that labels would become an array... for now -- do manually
    ds.targets = np.array([3, 1, 2, 3])  # change all labels
    ok_(origid != ds.idhash,
        msg="Changing all targets should alter dataset's idhash")

    origid = ds.idhash

    z = ds.targets[1]
    assert_equal(origid, ds.idhash, msg="Accessing shouldn't change idhash")
    z = ds.chunks
    assert_equal(origid, ds.idhash, msg="Accessing shouldn't change idhash")
    z[2] = 333
    ok_(origid != ds.idhash,
        msg="Changing value in attribute should change idhash")

    origid = ds.idhash
    ds.samples[1, 1] = 1000
    ok_(origid != ds.idhash, msg="Changing value in data should change idhash")

    origid = ds.idhash
    orig_labels = ds.targets  #.copy()
    ds.sa.targets = range(len(ds))
    ok_(origid != ds.idhash, msg="Chaging attribute also changes idhash")

    ds.targets = orig_labels
    ok_(origid == ds.idhash,
        msg="idhash should be restored after reassigning orig targets")
Example #24
def _test_mcasey20120222():  # pragma: no cover
    # http://lists.alioth.debian.org/pipermail/pkg-exppsy-pymvpa/2012q1/002034.html

    # This one is conditioned on allowing # of samples to be changed
    # by the mapper provided to MappedClassifier.  See
    # https://github.com/yarikoptic/PyMVPA/tree/_tent/allow_ch_nsamples

    import numpy as np
    from mvpa2.datasets.base import dataset_wizard
    from mvpa2.generators.partition import NFoldPartitioner
    from mvpa2.mappers.base import ChainMapper
    from mvpa2.mappers.svd import SVDMapper
    from mvpa2.mappers.fx import mean_group_sample
    from mvpa2.clfs.svm import LinearCSVMC
    from mvpa2.clfs.meta import MappedClassifier
    from mvpa2.measures.base import CrossValidation

    mapper = ChainMapper([mean_group_sample(['targets','chunks']),
                          SVDMapper()])
    clf = MappedClassifier(LinearCSVMC(), mapper)
    cvte = CrossValidation(clf, NFoldPartitioner(),
                           enable_ca=['repetition_results', 'stats'])

    ds = dataset_wizard(
        samples=np.arange(32).reshape((8, -1)),
        targets=[1, 1, 2, 2, 1, 1, 2, 2],
        chunks=[1, 1, 1, 1, 2, 2, 2, 2])

    errors = cvte(ds)
Example #25
    def test_aggregation(self):
        data = dataset_wizard(np.arange( 20 ).reshape((4, 5)), targets=1, chunks=1)

        ag_data = aggregate_features(data, np.mean)

        ok_(ag_data.nsamples == 4)
        ok_(ag_data.nfeatures == 1)
        assert_array_equal(ag_data.samples[:, 0], [2, 7, 12, 17])
Example #26
def test_str():
    args = (np.arange(12, dtype=np.int8).reshape(
        (4, 3)), range(4), [1, 1, 2, 2])
    for iargs in range(1, len(args)):
        ds = dataset_wizard(*(args[:iargs]))
        ds_s = str(ds)
        ok_(ds_s.startswith('<Dataset: 4x3@int8'))
        ok_(ds_s.endswith('>'))
Example #28
def linear1d_gaussian_noise(size=100, slope=0.5, intercept=1.0,
                            x_min=-2.0, x_max=3.0, sigma=0.2):
    """A straight line with some Gaussian noise.
    """
    x = np.linspace(start=x_min, stop=x_max, num=size)
    noise = np.random.randn(size)*sigma
    y = x * slope + intercept + noise
    return dataset_wizard(samples=x[:, None], targets=y)
Example #29
    def setUp(self):
        self.clf_sign = SameSignClassifier()
        self.clf_less1 = Less1Classifier()

        # simple binary dataset
        self.data_bin_1 = dataset_wizard(
            samples=[[0, 0], [-10, -1], [1, 0.1], [1, -1], [-1, 1]],
            targets=[1, 1, 1, -1, -1],  # labels
            chunks=[0, 1, 2, 2, 3])  # chunks
Example #30
def test_str():
    args = ( np.arange(12, dtype=np.int8).reshape((4, 3)),
             range(4),
             [1, 1, 2, 2])
    for iargs in range(1, len(args)):
        ds = dataset_wizard(*(args[:iargs]))
        ds_s = str(ds)
        ok_(ds_s.startswith('<Dataset: 4x3@int8'))
        ok_(ds_s.endswith('>'))
Example #31
    def setUp(self):
        self.clf_sign = SameSignClassifier()
        self.clf_less1 = Less1Classifier()

        # simple binary dataset
        self.data_bin_1 = dataset_wizard(
            samples=[[0,0],[-10,-1],[1,0.1],[1,-1],[-1,1]],
            targets=[1, 1, 1, -1, -1], # labels
            chunks=[0, 1, 2,  2, 3])  # chunks
Example #32
    def test_nfold_random_counted_selection_partitioner_huge(self):
        # Just test that it completes in a reasonable time and does
        # not blow up as it would do if it was not limited by count
        kwargs = dict(count=10)
        ds = dataset_wizard(np.arange(1000).reshape((-1, 1)), targets=range(1000), chunks=range(500) * 2)
        split_partitions_random = [
            tuple(x.sa.partitions) for x in NFoldPartitioner(100, selection_strategy="random", **kwargs).generate(ds)
        ]
        assert_equal(len(split_partitions_random), 10)  # we get just 10
Example #33
    def test_nonfinite_features_removal(self):
        r = np.random.normal(size=(4, 5))
        ds = dataset_wizard(r, targets=1, chunks=1)
        ds.samples[2,0]=np.NaN
        ds.samples[3,3]=np.Inf

        dsc = remove_nonfinite_features(ds)

        self.assertTrue(dsc.nfeatures == 3)
        assert_array_equal(ds[:, [1, 2, 4]].samples, dsc.samples)
Example #34
def dumb_feature_binary_dataset():
    """Very simple binary (2 labels) dataset
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([0] * 12) + ([1] * 12)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #35
    def test_invar_features_removal(self):
        r = np.random.normal(size=(3, 1))
        ds = dataset_wizard(samples=np.hstack((np.zeros((3, 2)), r)), targets=1)

        self.failUnless(ds.nfeatures == 3)

        dsc = remove_invariant_features(ds)

        self.failUnless(dsc.nfeatures == 1)
        self.failUnless((dsc.samples == r).all())
Example #37
def dumb_feature_dataset():
    """Create a very simple dataset with 2 features and 3 labels
    """
    data = [[1, 0], [1, 1], [2, 0], [2, 1], [3, 0], [3, 1], [4, 0], [4, 1],
            [5, 0], [5, 1], [6, 0], [6, 1], [7, 0], [7, 1], [8, 0], [8, 1],
            [9, 0], [9, 1], [10, 0], [10, 1], [11, 0], [11, 1], [12, 0],
            [12, 1]]
    regs = ([1] * 8) + ([2] * 8) + ([3] * 8)

    return dataset_wizard(samples=np.array(data), targets=regs, chunks=range(len(regs)))
Example #38
    def get_dataset(self):

        zresults = self.results

        new_shape = list(zresults.shape[-2:])
        new_shape.insert(0, -1)

        zreshaped = zresults.reshape(new_shape)

        upper_mask = np.ones_like(zreshaped[0])
        upper_mask[np.tril_indices(zreshaped[0].shape[0])] = 0
        upper_mask = np.bool_(upper_mask)

        # Reshape data to have samples x features
        ds_data = zreshaped[:, upper_mask]

        labels = []
        n_runs = zresults.shape[2]
        n_subj = zresults.shape[1]

        for l in self.conditions.keys():
            labels += [l for _ in range(n_runs * n_subj)]
        ds_labels = np.array(labels)

        ds_subjects = []

        for s in self.subjects:
            ds_subjects += [s for _ in range(n_runs)]
        ds_subjects = np.array(ds_subjects)

        ds_info = []
        for _ in self.conditions.keys():
            ds_info.append(ds_subjects)
        ds_info = np.vstack(ds_info)

        logger.debug(ds_info)
        logger.debug(ds_info.shape)

        logger.debug(ds_data.shape)

        self.ds = dataset_wizard(ds_data,
                                 targets=ds_labels,
                                 chunks=np.int_(ds_info.T[5]))
        self.ds.sa['subjects'] = ds_info.T[0]
        self.ds.sa['groups'] = ds_info.T[1]
        self.ds.sa['chunks_1'] = ds_info.T[2]
        self.ds.sa['expertise'] = ds_info.T[3]
        self.ds.sa['age'] = ds_info.T[4]
        self.ds.sa['chunks_2'] = ds_info.T[5]
        self.ds.sa['meditation'] = ds_labels

        logger.debug(ds_info.T[4])
        logger.debug(self.ds.sa.keys())

        return self.ds
Example #39
def test_mergeds2():
    """Test composition of new datasets by addition of existing ones
    """
    data = dataset_wizard([range(5)], targets=1, chunks=1)

    assert_array_equal(data.UT, [1])

    # simple sequence has to be a single pattern
    assert_equal(data.nsamples, 1)
    # check correct pattern layout (1x5)
    assert_array_equal(data.samples, [[0, 1, 2, 3, 4]])

    # check for single labels and origin
    assert_array_equal(data.targets, [1])
    assert_array_equal(data.chunks, [1])

    # now try adding pattern with wrong shape
    assert_raises(DatasetError,
                  data.append,
                  dataset_wizard(np.ones((2,3)), targets=1, chunks=1))

    # now add two real patterns
    dss = datasets['uni2large'].samples
    data.append(dataset_wizard(dss[:2, :5], targets=2, chunks=2))
    assert_equal(data.nfeatures, 5)
    assert_array_equal(data.targets, [1, 2, 2])
    assert_array_equal(data.chunks, [1, 2, 2])

    # test automatic origins
    data.append(dataset_wizard(dss[3:5, :5], targets=3, chunks=[0, 1]))
    assert_array_equal(data.chunks, [1, 2, 2, 0, 1])

    # test unique class labels
    assert_array_equal(data.UT, [1, 2, 3])

    # test wrong label length
    assert_raises(ValueError, dataset_wizard, dss[:4, :5], targets=[ 1, 2, 3 ],
                                         chunks=2)

    # test wrong origin length
    assert_raises(ValueError, dataset_wizard, dss[:4, :5],
                  targets=[ 1, 2, 3, 4 ], chunks=[ 2, 2, 2 ])
Example #40
    def test_invar_features_removal(self):
        r = np.random.normal(size=(3,1))
        ds = dataset_wizard(samples=np.hstack((np.zeros((3,2)), r)),
                     targets=1)

        self.assertTrue(ds.nfeatures == 3)

        dsc = remove_invariant_features(ds)

        self.assertTrue(dsc.nfeatures == 1)
        self.assertTrue((dsc.samples == r).all())
Example #41
    def test_nfold_random_counted_selection_partitioner_huge(self):
        # Just test that it completes in a reasonable time and does
        # not blow up as it would do if it was not limited by count
        kwargs = dict(count=10)
        ds = dataset_wizard(np.arange(1000).reshape((-1, 1)),
                            targets=range(1000),
                            chunks=range(500) * 2)
        split_partitions_random = [
            tuple(x.sa.partitions) for x in NFoldPartitioner(
                100, selection_strategy='random', **kwargs).generate(ds)
        ]
        assert_equal(len(split_partitions_random), 10)  # we get just 10
Example #42
    def get_dataset(self):
        
        zresults = self.results
        
        new_shape = list(zresults.shape[-2:])
        new_shape.insert(0, -1)
        
        zreshaped = zresults.reshape(new_shape)
        
        upper_mask = np.ones_like(zreshaped[0])
        upper_mask[np.tril_indices(zreshaped[0].shape[0])] = 0
        upper_mask = np.bool_(upper_mask)
        
        # Reshape data to have samples x features
        ds_data = zreshaped[:,upper_mask]
    
        labels = []
        n_runs = zresults.shape[2]
        n_subj = zresults.shape[1]
        
        for l in self.conditions.keys():
            labels += [l for _ in range(n_runs * n_subj)]
        ds_labels = np.array(labels)
        
        ds_subjects = []

        for s in self.subjects:
            ds_subjects += [s for _ in range(n_runs)]
        ds_subjects = np.array(ds_subjects)
        
        ds_info = []
        for _ in self.conditions.keys():
            ds_info.append(ds_subjects)
        ds_info = np.vstack(ds_info)
        
        logger.debug(ds_info)
        logger.debug(ds_info.shape)
        
        logger.debug(ds_data.shape)
        
        self.ds = dataset_wizard(ds_data, targets=ds_labels, chunks=np.int_(ds_info.T[5]))
        self.ds.sa['subjects'] = ds_info.T[0]
        self.ds.sa['groups'] = ds_info.T[1]
        self.ds.sa['chunks_1'] = ds_info.T[2]
        self.ds.sa['expertise'] = ds_info.T[3]
        self.ds.sa['age'] = ds_info.T[4]
        self.ds.sa['chunks_2'] = ds_info.T[5]
        self.ds.sa['meditation'] = ds_labels
        
        logger.debug(ds_info.T[4])
        logger.debug(self.ds.sa.keys())
        
        return self.ds
Example #43
def load_connectivity_ds(path, subjects, extra_sa_file=None):
    """
    Loads txt connectivity matrices, each in the form of an n_roi x n_roi matrix
    
    Parameters
    ----------
    path : string
        Pathname of the data folder
        
    subjects : list
        List of subjects included in the analysis
        it is always a directory in the path
        
    extra_sa : dictionary
        Dictionary of extra fields to be included.
        The dictionary must be in the form of 
            'field': list of extra fields
        dictionary must include the field 'subject'
        
    Returns
    -------
    ds : pymvpa dataset
        The loaded dataset
    """
    
    data, attributes = load_txt_matrices(path, subjects)
    
    n_roi = data.shape[1]
    indices = np.triu_indices(n_roi, k=1)
    
    
    n_samples = data.shape[0]
    n_features = len(indices[0])
    samples = np.zeros((n_samples, n_features))
    
    for i in range(n_samples):
        samples[i] = data[i][indices]
        
    mask = np.isnan(samples).sum(0)
    
    
    ds = dataset_wizard(samples)
    for key, value in attributes.iteritems():
        ds.sa[key] = value
        
    ds.fa['nan_mask'] = np.bool_(mask)
    
    if extra_sa_file is not None:
        _, fextra_sa = load_subject_file(extra_sa_file)
        ds = add_subject_attributes(ds, fextra_sa)
    
    return ds
Example #44
    def test_reflection(self, rep=10):
        for i in range(rep):
            from mvpa2.testing.datasets import get_random_rotation
            d = np.random.random((100, 2))
            T = get_random_rotation(d.shape[1])
            d2 = np.dot(d, T)
            # scale it up a bit
            d2 *= 1.2
            # add a reflection by flipping the first dimension
            d2[:, 0] *= -1
            ds = dataset_wizard(samples=d, targets=d2)

            norm0 = np.linalg.norm(d - d2)

            mapper = ProcrusteanMapper(scaling=False, reflection=False)
            mapper.train(ds)
            norm1 = np.linalg.norm(d2 - mapper.forward(ds).samples)
            eps = 1e-7
            self.assertLess(norm1,
                            norm0 + eps,
                            msg='Procrustes should reduce difference, '
                            'but %f > %f' % (norm1, norm0))

            mapper = ProcrusteanMapper(scaling=True, reflection=False)
            mapper.train(ds)
            norm2 = np.linalg.norm(d2 - mapper.forward(ds).samples)
            self.assertLess(norm2,
                            norm1 + eps,
                            msg='Procrustes with scaling should work better, '
                            'but %f > %f' % (norm2, norm1))

            mapper = ProcrusteanMapper(scaling=False, reflection=True)
            mapper.train(ds)
            norm3 = np.linalg.norm(d2 - mapper.forward(ds).samples)
            self.assertLess(
                norm3,
                norm1 + eps,
                msg='Procrustes with reflection should work better, '
                'but %f > %f' % (norm3, norm1))

            mapper = ProcrusteanMapper(scaling=True, reflection=True)
            mapper.train(ds)
            norm4 = np.linalg.norm(d2 - mapper.forward(ds).samples)
            self.assertLess(norm4,
                            norm3 + eps,
                            msg='Procrustes with scaling should work better, '
                            'but %f > %f' % (norm4, norm3))
            self.assertLess(
                norm4,
                norm2 + eps,
                msg='Procrustes with reflection should work better, '
                'but %f > %f' % (norm4, norm2))
Example #45
    def test_surface_dset_h5py_io_with_unicode(self, fn):
        skip_if_no_external('h5py')
        from mvpa2.base.hdf5 import h5save, h5load

        ds = dataset_wizard(np.arange(20).reshape((4, 5)), targets=1, chunks=1)
        ds.sa['unicode'] = ['u1', 'uu2', 'uuu3', 'uuuu4']
        ds.sa['str'] = ['s1', 'ss2', 'sss3', 'ssss4']
        ds.fa['node_indices'] = np.arange(5)

        # test h5py I/O
        h5save(fn, ds)
        ds2 = h5load(fn)
        assert_datasets_equal(ds, ds2)
Example #46
    def test_surface_dset_h5py_io_with_unicode(self, fn):
        skip_if_no_external('h5py')
        from mvpa2.base.hdf5 import h5save, h5load

        ds = dataset_wizard(np.arange(20).reshape((4, 5)), targets=1, chunks=1)
        ds.sa['unicode'] = [u'u1', u'uu2', u'uuu3', u'uuuu4']
        ds.sa['str'] = ['s1', 'ss2', 'sss3', 'ssss4']
        ds.fa['node_indices'] = np.arange(5)

        # test h5py I/O
        h5save(fn, ds)
        ds2 = h5load(fn)
        assert_datasets_equal(ds, ds2)
Example #47
def sin_modulated(n_instances, n_features, flat=False, noise=0.4):
    """ Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. The target label is the rounded sine of
    the sum of squared features, plus uniform noise.
    """
    if flat:
        data = np.arange(0.0, 1.0, 1.0 / n_instances) * np.pi
        data.resize(n_instances, n_features)
    else:
        data = np.random.rand(n_instances, n_features) * np.pi
    label = np.sin((data ** 2).sum(1)).round()
    label += np.random.rand(label.size) * noise
    return dataset_wizard(samples=data, targets=label)
Example #48
def sin_modulated(n_instances, n_features, flat=False, noise=0.4):
    """ Generate a (quite) complex multidimensional non-linear dataset

    Used for regression testing. The target label is the rounded sine of
    the sum of squared features, plus uniform noise.
    """
    if flat:
        data = (np.arange(0.0, 1.0, 1.0 / n_instances) * np.pi)
        data.resize(n_instances, n_features)
    else:
        data = np.random.rand(n_instances, n_features) * np.pi
    label = np.sin((data**2).sum(1)).round()
    label += np.random.rand(label.size) * noise
    return dataset_wizard(samples=data, targets=label)
Example #49
def load_connectivity_ds(path, subjects, extra_sa_file=None):
    """
    Loads txt connectivity matrices, each in the form of an n_roi x n_roi matrix
    
    Parameters
    ----------
    path : string
        Pathname of the data folder
        
    subjects : list
        List of subjects included in the analysis
        it is always a directory in the path
        
    extra_sa : dictionary
        Dictionary of extra fields to be included.
        The dictionary must be in the form of 
            'field': list of extra fields
        dictionary must include the field 'subject'
        
    Returns
    -------
    ds : pymvpa dataset
        The loaded dataset
    """

    data, attributes = load_txt_matrices(path, subjects)

    n_roi = data.shape[1]
    indices = np.triu_indices(n_roi, k=1)

    n_samples = data.shape[0]
    n_features = len(indices[0])
    samples = np.zeros((n_samples, n_features))

    for i in range(n_samples):
        samples[i] = data[i][indices]

    mask = np.isnan(samples).sum(0)

    ds = dataset_wizard(samples)
    for key, value in attributes.iteritems():
        ds.sa[key] = value

    ds.fa['nan_mask'] = np.bool_(mask)

    if extra_sa_file is not None:
        _, fextra_sa = load_subject_file(extra_sa_file)
        ds = add_subject_attributes(ds, fextra_sa)

    return ds
Example #50
def pure_multivariate_signal(patterns,
                             signal2noise=1.5,
                             chunks=None,
                             targets=None):
    """ Create a 2d dataset with a clear purely multivariate signal.

    This is known as the XOR problem.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%

    Parameters
    ----------
    patterns: int
      Number of data points in each of the four dot clouds
    signal2noise: float, optional
      Univariate signal pedestal.
    chunks: array, optional
      Vector for chunk labels for all generated samples.
    targets: list, optional
      Length-2 sequence of target values for both classes. If None,
      [0, 1] is used.
    """
    if targets is None:
        targets = [0, 1]

    # start with noise
    data = np.random.normal(size=(4 * patterns, 2))

    # add signal
    data[:2 * patterns, 1] += signal2noise

    data[2 * patterns:4 * patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2 * patterns:3 * patterns, 0] -= signal2noise
    data[patterns:2 * patterns, 0] += signal2noise
    data[3 * patterns:4 * patterns, 0] += signal2noise

    # two conditions
    regs = np.array((targets[0:1] * patterns) + (targets[1:2] * 2 * patterns) +
                    (targets[0:1] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
Example #51
def test_combined_samplesfeature_selection():
    data = dataset_wizard(np.arange(20).reshape((4, 5)).view(myarray),
                   targets=[1,2,3,4],
                   chunks=[5,6,7,8])

    # array subclass survives
    ok_(isinstance(data.samples, myarray))

    ok_(data.nsamples == 4)
    ok_(data.nfeatures == 5)
    sel = data[[0, 3], [1, 2]]
    ok_(sel.nsamples == 2)
    ok_(sel.nfeatures == 2)
    assert_array_equal(sel.targets, [1, 4])
    assert_array_equal(sel.chunks, [5, 8])
    assert_array_equal(sel.samples, [[1, 2], [16, 17]])
    # array subclass survives
    ok_(isinstance(sel.samples, myarray))


    # should yield the same result if done sequentially
    sel2 = data[:, [1, 2]]
    sel2 = sel2[[0, 3]]
    assert_array_equal(sel.samples, sel2.samples)
    ok_(sel2.nsamples == 2)
    ok_(sel2.nfeatures == 2)
    # array subclass survives
    ok_(isinstance(sel.samples, myarray))


    assert_raises(ValueError, data.__getitem__, (1, 2, 3))

    # test correct behavior when selecting just single rows/columns
    single = data[0]
    ok_(single.nsamples == 1)
    ok_(single.nfeatures == 5)
    assert_array_equal(single.samples, [[0, 1, 2, 3, 4]])
    single = data[:, 0]
    ok_(single.nsamples == 4)
    ok_(single.nfeatures == 1)
    assert_array_equal(single.samples, [[0], [5], [10], [15]])
    single = data[1, 1]
    ok_(single.nsamples == 1)
    ok_(single.nfeatures == 1)
    assert_array_equal(single.samples, [[6]])
    # array subclass survives
    ok_(isinstance(single.samples, myarray))
Example #53
def pure_multivariate_signal(patterns, signal2noise=1.5, chunks=None,
                             targets=None):
    """ Create a 2d dataset with a clear purely multivariate signal.

    This is known as the XOR problem.

    ::

      %%%%%%%%%
      % O % X %
      %%%%%%%%%
      % X % O %
      %%%%%%%%%

    Parameters
    ----------
    patterns: int
      Number of data points in each of the four dot clouds
    signal2noise: float, optional
      Univariate signal pedestal.
    chunks: array, optional
      Vector for chunk labels for all generated samples.
    targets: list, optional
      Length-2 sequence of target values for both classes. If None,
      [0, 1] is used.
    """
    if targets is None:
        targets = [0, 1]

    # start with noise
    data = np.random.normal(size=(4 * patterns, 2))

    # add signal
    data[:2 * patterns, 1] += signal2noise

    data[2 * patterns:4 * patterns, 1] -= signal2noise
    data[:patterns, 0] -= signal2noise
    data[2 * patterns:3 * patterns, 0] -= signal2noise
    data[patterns:2 * patterns, 0] += signal2noise
    data[3 * patterns:4 * patterns, 0] += signal2noise

    # two conditions
    regs = np.array((targets[0:1] * patterns) + (targets[1:2] * 2 * patterns) + (targets[0:1] * patterns))

    if chunks is None:
        chunks = range(len(data))
    return dataset_wizard(samples=data, targets=regs, chunks=chunks)
Example #54
def chirp_linear(n_instances, n_features=4, n_nonbogus_features=2, data_noise=0.4, noise=0.1):
    """ Generates simple dataset for linear regressions

    Generates a chirp signal, populates n_nonbogus_features out of
    n_features with it (at the given noise level), and then provides the
    signal itself, with additional noise, as the targets.
    """
    x = np.linspace(0, 1, n_instances)
    y = np.sin((10 * np.pi * x ** 2))

    data = np.random.normal(size=(n_instances, n_features)) * data_noise
    for i in xrange(n_nonbogus_features):
        data[:, i] += y[:]

    labels = y + np.random.normal(size=(n_instances,)) * noise

    return dataset_wizard(samples=data, targets=labels)
Example #55
def test_samplesgroup_mapper():
    data = np.arange(24).reshape(8,3)
    labels = [0, 1] * 4
    chunks = np.repeat(np.array((0,1)),4)

    # correct results
    csamples = [[3, 4, 5], [6, 7, 8], [15, 16, 17], [18, 19, 20]]
    clabels = [0, 1, 0, 1]
    cchunks = [0, 0, 1, 1]

    ds = dataset_wizard(samples=data, targets=labels, chunks=chunks)
    # add some feature attribute -- just to check
    ds.fa['checker'] = np.arange(3)
    ds.init_origids('samples')

    m = mean_group_sample(['targets', 'chunks'])
    mds = m.forward(ds)
    assert_array_equal(mds.samples, csamples)
    # FAs should simply remain the same
    assert_array_equal(mds.fa.checker, np.arange(3))

    # now without grouping
    m = mean_sample()
    # forwarding just the samples should yield the same result
    assert_array_equal(m.forward(ds.samples),
                       m.forward(ds).samples)

    # directly apply to dataset
    # using untrained mapper
    m = mean_group_sample(['targets', 'chunks'])
    mapped = ds.get_mapped(m)

    assert_equal(mapped.nsamples, 4)
    assert_equal(mapped.nfeatures, 3)
    assert_array_equal(mapped.samples, csamples)
    assert_array_equal(mapped.targets, clabels)
    assert_array_equal(mapped.chunks, cchunks)
    # make sure origids get regenerated
    assert_array_equal([s.count('+') for s in mapped.sa.origids], [1] * 4)

    # imbalanced dataset -- let's remove the 0th sample so there is no target
    # 0 in the 0th chunk
    ds_ = ds[[0, 1, 3, 5]]
    mapped = ds_.get_mapped(m)
    ok_(len(mapped) == 3)
    ok_(not None in mapped.sa.origids)
Example #56
    def test_surface_dset_niml_io_with_unicode(self, fn):
        ds = dataset_wizard(np.arange(20).reshape((4, 5)), targets=1, chunks=1)
        ds.sa['unicode'] = [u'u1', u'uu2', u'uuu3', u'uuuu4']
        ds.sa['str'] = ['s1', 'ss2', 'sss3', 'ssss4']
        ds.fa['node_indices'] = np.arange(5)

        # ensure sample attributes are of String type (not array)
        niml_dict = afni_niml_dset.dset2rawniml(niml.to_niml(ds))
        expected_dtypes = dict(PYMVPA_SA_unicode='String',
                               PYMVPA_SA_str='String',
                               PYMVPA_SA_targets='1*int32')


        def assert_has_expected_datatype(name, expected_dtype, niml):
            """helper function"""
            nodes = niml_dict['nodes']

            for node in nodes:
                if node['name'] == name:
                    assert_equal(node['ni_type'], expected_dtype)
                    return

            raise ValueError('not found: %s' % name)


        for name, expected_dtype in expected_dtypes.iteritems():
            assert_has_expected_datatype(name, expected_dtype, niml)


        # test NIML I/O
        niml.write(fn, ds)

        # remove extra fields added when reading the file
        ds2 = niml.from_any(fn)
        ds2.a.pop('history')
        ds2.a.pop('filename')
        ds2.sa.pop('labels')
        ds2.sa.pop('stats')

        # NIML does not support int64, only int32;
        # compare equality of values in samples by setting the
        # datatype the same as in the input (int32 or int64 depending
        # on the platform)
        ds2.samples = np.asarray(ds2.samples, dtype=ds.samples.dtype)
        assert_datasets_equal(ds, ds2)
Example #57
def test_samplesgroup_mapper():
    data = np.arange(24).reshape(8, 3)
    labels = [0, 1] * 4
    chunks = np.repeat(np.array((0, 1)), 4)

    # correct results
    csamples = [[3, 4, 5], [6, 7, 8], [15, 16, 17], [18, 19, 20]]
    clabels = [0, 1, 0, 1]
    cchunks = [0, 0, 1, 1]

    ds = dataset_wizard(samples=data, targets=labels, chunks=chunks)
    # add some feature attribute -- just to check
    ds.fa['checker'] = np.arange(3)
    ds.init_origids('samples')

    m = mean_group_sample(['targets', 'chunks'])
    mds = m.forward(ds)
    assert_array_equal(mds.samples, csamples)
    # FAs should simply remain the same
    assert_array_equal(mds.fa.checker, np.arange(3))

    # now without grouping
    m = mean_sample()
    # forwarding just the samples should yield the same result
    assert_array_equal(m.forward(ds.samples), m.forward(ds).samples)

    # directly apply to dataset
    # using untrained mapper
    m = mean_group_sample(['targets', 'chunks'])
    mapped = ds.get_mapped(m)

    assert_equal(mapped.nsamples, 4)
    assert_equal(mapped.nfeatures, 3)
    assert_array_equal(mapped.samples, csamples)
    assert_array_equal(mapped.targets, clabels)
    assert_array_equal(mapped.chunks, cchunks)
    # make sure origids get regenerated
    assert_array_equal([s.count('+') for s in mapped.sa.origids], [1] * 4)

    # imbalanced dataset -- let's remove the 0th sample so there is no target
    # 0 in the 0th chunk
    ds_ = ds[[0, 1, 3, 5]]
    mapped = ds_.get_mapped(m)
    ok_(len(mapped) == 3)
    ok_(not None in mapped.sa.origids)
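Note: a compact sketch of the averaging performed above (mean_group_sample is imported from mvpa2.mappers.fx, as in Example #24).

import numpy as np
from mvpa2.datasets.base import dataset_wizard
from mvpa2.mappers.fx import mean_group_sample

ds = dataset_wizard(samples=np.arange(24).reshape(8, 3),
                    targets=[0, 1] * 4,
                    chunks=np.repeat([0, 1], 4))
averaged = ds.get_mapped(mean_group_sample(['targets', 'chunks']))
print(averaged.nsamples, averaged.nfeatures)   # 4 mean samples, 3 features
print(averaged.samples[0])                     # mean of targets=0/chunks=0 rows: [3. 4. 5.]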
Example #58
def chirp_linear(n_instances, n_features=4, n_nonbogus_features=2,
                data_noise=0.4, noise=0.1):
    """ Generates simple dataset for linear regressions

    Generates a chirp signal, populates n_nonbogus_features out of
    n_features with it (at the given noise level), and then provides the
    signal itself, with additional noise, as the targets.
    """
    x = np.linspace(0, 1, n_instances)
    y = np.sin((10 * np.pi * x **2))

    data = np.random.normal(size=(n_instances, n_features ))*data_noise
    for i in xrange(n_nonbogus_features):
        data[:, i] += y[:]

    labels = y + np.random.normal(size=(n_instances,))*noise

    return dataset_wizard(samples=data, targets=labels)
Example #59
    def test_classifier(self):
        clf = ParametrizedClassifier()
        self.assertEqual(len(clf.params.items()), 2)  # + retrainable
        self.assertEqual(len(clf.kernel_params.items()), 1)

        clfe = ParametrizedClassifierExtended()
        self.assertEqual(len(clfe.params.items()), 2)
        self.assertEqual(len(clfe.kernel_params.items()), 2)
        self.assertEqual(len(clfe.kernel_params.listing), 2)

        # check assignment once again
        self.assertEqual(clfe.kernel_params.kp2, 200.0)
        clfe.kernel_params.kp2 = 201.0
        self.assertEqual(clfe.kernel_params.kp2, 201.0)
        self.assertEqual(clfe.kernel_params.is_set("kp2"), True)
        clfe.train(dataset_wizard(samples=[[0, 0]], targets=[1], chunks=[1]))
        self.assertEqual(clfe.kernel_params.is_set("kp2"), False)
        self.assertEqual(clfe.kernel_params.is_set(), False)
        self.assertEqual(clfe.params.is_set(), False)