Beispiel #1
0
    def test_custom_split(self):
        #simulate half splitter
        hs = CustomPartitioner([(None,[0,1,2,3,4]),(None,[5,6,7,8,9])])
        spl = Splitter(attr='partitions')
        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
        self.assertTrue(len(splits) == 2)

        for i,p in enumerate(splits):
            self.assertTrue( len(p) == 2 )
            self.assertTrue( p[0].nsamples == 50 )
            self.assertTrue( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])


        # check fully customized split with working and validation set specified
        cs = CustomPartitioner([([0,3,4],[5,9])])
        # we want to discared the unselected partition of the data, hence attr_value
        # these two splitters should do exactly the same thing
        splitters = (Splitter(attr='partitions', attr_values=[1,2]),
                     Splitter(attr='partitions', ignore_values=(0,)))
        for spl in splitters:
            splits = [ list(spl.generate(p)) for p in cs.generate(self.data) ]
            self.assertTrue(len(splits) == 1)

            for i,p in enumerate(splits):
                self.assertTrue( len(p) == 2 )
                self.assertTrue( p[0].nsamples == 30 )
                self.assertTrue( p[1].nsamples == 20 )

            self.assertTrue((splits[0][1].sa['chunks'].unique == [5, 9]).all())
            self.assertTrue((splits[0][0].sa['chunks'].unique == [0, 3, 4]).all())
Beispiel #2
0
    def test_slicing(self):
        hs = HalfPartitioner()
        spl = Splitter(attr="partitions")
        splits = list(hs.generate(self.data))
        for s in splits:
            # partitioned dataset shared the data
            assert_true(s.samples.base is self.data.samples)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        # with numpy 1.7.0b1 "chaining" was deprecated so let's create
        # check function appropriate for the given numpy version
        _a = np.arange(5)
        __a = _a[:4][:3]
        if __a.base is _a:
            # 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base is base

        elif __a.base.base is _a:
            # prior 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base.base is base

        else:
            raise RuntimeError("Uknown handling of .base by numpy")

        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples))
            assert_true(is_the_same_base(s[1].samples))
        spl = Splitter(attr="partitions", noslicing=True)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        for s in splits:
            # we no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        nfs = NFoldPartitioner()
        spl = Splitter(attr="partitions")
        splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
        for i, s in enumerate(splits):
            # training only first and last split
            if i == 0 or i == len(splits) - 1:
                assert_true(is_the_same_base(s[0].samples))
            else:
                assert_true(s[0].samples.base is None)
            # we get slicing all the time
            assert_true(is_the_same_base(s[1].samples))
        step_ds = Dataset(np.random.randn(20, 2), sa={"chunks": np.tile([0, 1], 10)})
        oes = OddEvenPartitioner()
        spl = Splitter(attr="partitions")
        splits = list(oes.generate(step_ds))
        for s in splits:
            # partitioned dataset shared the data
            assert_true(s.samples.base is step_ds.samples)
        splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples, step_ds.samples))
            assert_true(is_the_same_base(s[1].samples, step_ds.samples))
Beispiel #3
0
    def test_custom_split(self):
        #simulate half splitter
        hs = CustomPartitioner([(None,[0,1,2,3,4]),(None,[5,6,7,8,9])])
        spl = Splitter(attr='partitions')
        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
        self.failUnless(len(splits) == 2)

        for i,p in enumerate(splits):
            self.failUnless( len(p) == 2 )
            self.failUnless( p[0].nsamples == 50 )
            self.failUnless( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])


        # check fully customized split with working and validation set specified
        cs = CustomPartitioner([([0,3,4],[5,9])])
        # we want to discared the unselected partition of the data, hence attr_value
        # these two splitters should do exactly the same thing
        splitters = (Splitter(attr='partitions', attr_values=[1,2]),
                     Splitter(attr='partitions', ignore_values=(0,)))
        for spl in splitters:
            splits = [ list(spl.generate(p)) for p in cs.generate(self.data) ]
            self.failUnless(len(splits) == 1)

            for i,p in enumerate(splits):
                self.failUnless( len(p) == 2 )
                self.failUnless( p[0].nsamples == 30 )
                self.failUnless( p[1].nsamples == 20 )

            self.failUnless((splits[0][1].sa['chunks'].unique == [5, 9]).all())
            self.failUnless((splits[0][0].sa['chunks'].unique == [0, 3, 4]).all())
Beispiel #4
0
    def test_odd_even_split(self):
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

        self.assertTrue(len(splits) == 2)

        for i, p in enumerate(splits):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 50)
            self.assertTrue(p[1].nsamples == 50)

        assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

        # check if it works on pure odd and even chunk ids
        moresplits = [
            list(spl.generate(p)) for p in oes.generate(splits[0][0])
        ]

        for split in moresplits:
            self.assertTrue(split[0] != None)
            self.assertTrue(split[1] != None)
Beispiel #5
0
    def test_slicing(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(hs.generate(self.data))
        for s in splits:
            # partitioned dataset shared the data
            assert_true(s.samples.base is self.data.samples)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]

        # with numpy 1.7.0b1 "chaining" was deprecated so let's create
        # check function appropriate for the given numpy version
        _a = np.arange(5)
        __a = _a[:4][:3]
        if __a.base is _a:
            # 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base is base
        elif __a.base.base is _a:
            # prior 1.7.0b1
            def is_the_same_base(x, base=self.data.samples):
                return x.base.base is base
        else:
            raise RuntimeError("Uknown handling of .base by numpy")

        for s in splits:
            # we get slicing all the time
            assert_true(is_the_same_base(s[0].samples))
            assert_true(is_the_same_base(s[1].samples))
        spl = Splitter(attr='partitions', noslicing=True)
        splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
        for s in splits:
            # we no slicing at all
            assert_false(s[0].samples.base is self.data.samples)
            assert_false(s[1].samples.base is self.data.samples)
        nfs = NFoldPartitioner()
        spl = Splitter(attr='partitions')
        splits = [list(spl.generate(p)) for p in nfs.generate(self.data)]
        for i, s in enumerate(splits):
            # training only first and last split
            if i == 0 or i == len(splits) - 1:
                assert_true(is_the_same_base(s[0].samples))
            else:
                assert_true(s[0].samples.base is None)
            # we get slicing all the time
            assert_true(s[1].samples.base.base is self.data.samples)
        step_ds = Dataset(np.random.randn(20, 2),
                          sa={'chunks': np.tile([0, 1], 10)})
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')
        splits = list(oes.generate(step_ds))
        for s in splits:
            # partitioned dataset shared the data
            assert_true(s.samples.base is step_ds.samples)
        splits = [list(spl.generate(p)) for p in oes.generate(step_ds)]
        assert_equal(len(splits), 2)
        for s in splits:
            # we get slicing all the time
            assert_true(s[0].samples.base.base is step_ds.samples)
            assert_true(s[1].samples.base.base is step_ds.samples)
Beispiel #6
0
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have perform basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets',
                    attr_values=[0, 1, 1, 2, 3, 3, 3],
                    count=4,
                    noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have perform basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0, 1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(splits), 80)
Beispiel #7
0
    def test_repeated_features(self):
        class CountFeatures(Measure):
            is_trained = True
            def _call(self, ds):
                return Dataset([ds.nfeatures],
                                fa={'nonbogus_targets': list(ds.fa['nonbogus_targets'].unique)})

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        # due to https://github.com/numpy/numpy/issues/641 we are
        # using list(set(...)) construct and there order of
        # nonbogus_targets.unique can vary from run to run, thus there
        # is no guarantee that we would get 18 first, which is a
        # questionable assumption anyways, thus performing checks
        # which do not require any specific order.
        # And yet due to another issue
        # https://github.com/numpy/numpy/issues/3759
        # we can't just == None for the bool mask
        None_fa = np.array([x == None for x in  res.fa.nonbogus_targets])
        assert_array_equal(res.samples[0, None_fa], [18])
        assert_array_equal(res.samples[0, ~None_fa], [1, 1])

        if sys.version_info[0] < 3:
            # with python2 order seems to be consistent
            assert_array_equal(res.samples[0], [18, 1, 1])
Beispiel #8
0
def test_exclude_targets_combinations():
    partitioner = ChainNode([
        NFoldPartitioner(),
        ExcludeTargetsCombinationsPartitioner(
            k=2, targets_attr='targets', space='partitions')
    ],
                            space='partitions')
    from mvpa2.misc.data_generators import normal_feature_dataset
    ds = normal_feature_dataset(snr=0.,
                                nlabels=4,
                                perlabel=3,
                                nchunks=3,
                                nonbogus_features=[0, 1, 2, 3],
                                nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter('partitions')
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)),
                 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
Beispiel #9
0
    def test_repeated_features(self):
        class CountFeatures(Measure):
            is_trained = True

            def _call(self, ds):
                return Dataset([ds.nfeatures],
                               fa={
                                   'nonbogus_targets':
                                   list(ds.fa['nonbogus_targets'].unique)
                               })

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        # due to https://github.com/numpy/numpy/issues/641 we are
        # using list(set(...)) construct and there order of
        # nonbogus_targets.unique can vary from run to run, thus there
        # is no guarantee that we would get 18 first, which is a
        # questionable assumption anyways, thus performing checks
        # which do not require any specific order.
        # And yet due to another issue
        # https://github.com/numpy/numpy/issues/3759
        # we can't just is None for the bool mask
        None_fa = np.array([x is None for x in res.fa.nonbogus_targets])
        assert_array_equal(res.samples[0, None_fa], [18])
        assert_array_equal(res.samples[0, ~None_fa], [1, 1])

        if sys.version_info[0] < 3:
            # with python2 order seems to be consistent
            assert_array_equal(res.samples[0], [18, 1, 1])
Beispiel #10
0
def test_splitter():
    ds = give_data()
    # split with defaults
    spl1 = Splitter('chunks')
    assert_raises(NotImplementedError, spl1, ds)

    splits = list(spl1.generate(ds))
    assert_equal(len(splits), len(ds.sa['chunks'].unique))

    for split in splits:
        # it should have perform basic slicing!
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.sa['chunks'].unique), 1)
        assert_true('lastsplit' in split.a)
    assert_true(splits[-1].a.lastsplit)

    # now again, more customized
    spl2 = Splitter('targets', attr_values = [0,1,1,2,3,3,3], count=4,
                   noslicing=True)
    splits = list(spl2.generate(ds))
    assert_equal(len(splits), 4)
    for split in splits:
        # it should NOT have perform basic slicing!
        assert_false(split.samples.base is ds.samples)
        assert_equal(len(split.sa['targets'].unique), 1)
        assert_equal(len(split.sa['chunks'].unique), 10)
    assert_true(splits[-1].a.lastsplit)

    # two should be identical
    assert_array_equal(splits[1].samples, splits[2].samples)

    # now go wild and split by feature attribute
    ds.fa['roi'] = np.repeat([0,1], 5)
    # splitter should auto-detect that this is a feature attribute
    spl3 = Splitter('roi')
    splits = list(spl3.generate(ds))
    assert_equal(len(splits), 2)
    for split in splits:
        assert_true(split.samples.base is ds.samples)
        assert_equal(len(split.fa['roi'].unique), 1)
        assert_equal(split.shape, (100, 5))

    # and finally test chained splitters
    cspl = ChainNode([spl2, spl3, spl1])
    splits = list(cspl.generate(ds))
    # 4 target splits and 2 roi splits each and 10 chunks each
    assert_equal(len(splits), 80)
Beispiel #11
0
    def test_label_splitter(self):
        oes = OddEvenPartitioner(attr='targets')
        spl = Splitter(attr='partitions')

        splits = [list(spl.generate(p)) for p in oes.generate(self.data)]

        assert_array_equal(splits[0][0].sa['targets'].unique, [0, 2])
        assert_array_equal(splits[0][1].sa['targets'].unique, [1, 3])
        assert_array_equal(splits[1][0].sa['targets'].unique, [1, 3])
        assert_array_equal(splits[1][1].sa['targets'].unique, [0, 2])
Beispiel #12
0
 def test_slicing(self):
     hs = HalfPartitioner()
     spl = Splitter(attr='partitions')
     splits = list(hs.generate(self.data))
     for s in splits:
         # partitioned dataset shared the data
         assert_true(s.samples.base is self.data.samples)
     splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
     for s in splits:
         # we get slicing all the time
         assert_true(s[0].samples.base.base is self.data.samples)
         assert_true(s[1].samples.base.base is self.data.samples)
     spl = Splitter(attr='partitions', noslicing=True)
     splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
     for s in splits:
         # we no slicing at all
         assert_false(s[0].samples.base is self.data.samples)
         assert_false(s[1].samples.base is self.data.samples)
     nfs = NFoldPartitioner()
     spl = Splitter(attr='partitions')
     splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
     for i, s in enumerate(splits):
         # training only first and last split
         if i == 0 or i == len(splits) - 1:
             assert_true(s[0].samples.base.base is self.data.samples)
         else:
             assert_true(s[0].samples.base is None)
         # we get slicing all the time
         assert_true(s[1].samples.base.base is self.data.samples)
     step_ds = Dataset(np.random.randn(20,2),
                       sa={'chunks': np.tile([0,1], 10)})
     oes = OddEvenPartitioner()
     spl = Splitter(attr='partitions')
     splits = list(oes.generate(step_ds))
     for s in splits:
         # partitioned dataset shared the data
         assert_true(s.samples.base is step_ds.samples)
     splits = [ list(spl.generate(p)) for p in oes.generate(step_ds) ]
     assert_equal(len(splits), 2)
     for s in splits:
         # we get slicing all the time
         assert_true(s[0].samples.base.base is step_ds.samples)
         assert_true(s[1].samples.base.base is step_ds.samples)
Beispiel #13
0
    def test_label_splitter(self):
        oes = OddEvenPartitioner(attr='targets')
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in oes.generate(self.data) ]

        assert_array_equal(splits[0][0].sa['targets'].unique, [0,2])
        assert_array_equal(splits[0][1].sa['targets'].unique, [1,3])
        assert_array_equal(splits[1][0].sa['targets'].unique, [1,3])
        assert_array_equal(splits[1][1].sa['targets'].unique, [0,2])
Beispiel #14
0
    def test_simplest_cv_pat_gen(self):
        # create the generator
        nfs = NFoldPartitioner(cvtype=1)
        spl = Splitter(attr='partitions')
        # now get the xval pattern sets One-Fold CV)
        xvpat = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]

        self.failUnless( len(xvpat) == 10 )

        for i,p in enumerate(xvpat):
            self.failUnless( len(p) == 2 )
            self.failUnless( p[0].nsamples == 90 )
            self.failUnless( p[1].nsamples == 10 )
            self.failUnless( p[1].chunks[0] == i )
Beispiel #15
0
    def test_simplest_cv_pat_gen(self):
        # create the generator
        nfs = NFoldPartitioner(cvtype=1)
        spl = Splitter(attr='partitions')
        # now get the xval pattern sets One-Fold CV)
        xvpat = [list(spl.generate(p)) for p in nfs.generate(self.data)]

        self.assertTrue(len(xvpat) == 10)

        for i, p in enumerate(xvpat):
            self.assertTrue(len(p) == 2)
            self.assertTrue(p[0].nsamples == 90)
            self.assertTrue(p[1].nsamples == 10)
            self.assertTrue(p[1].chunks[0] == i)
Beispiel #16
0
    def test_half_split(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]

        self.assertTrue(len(splits) == 2)

        for i,p in enumerate(splits):
            self.assertTrue( len(p) == 2 )
            self.assertTrue( p[0].nsamples == 50 )
            self.assertTrue( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] is not None)
            self.assertTrue(split[1] is not None)
Beispiel #17
0
    def test_half_split(self):
        hs = HalfPartitioner()
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]

        self.failUnless(len(splits) == 2)

        for i,p in enumerate(splits):
            self.failUnless( len(p) == 2 )
            self.failUnless( p[0].nsamples == 50 )
            self.failUnless( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [0, 1, 2, 3, 4])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [5, 6, 7, 8, 9])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.failUnless(split[0] != None)
            self.failUnless(split[1] != None)
Beispiel #18
0
    def test_odd_even_split(self):
        oes = OddEvenPartitioner()
        spl = Splitter(attr='partitions')

        splits = [ list(spl.generate(p)) for p in oes.generate(self.data) ]

        self.assertTrue(len(splits) == 2)

        for i,p in enumerate(splits):
            self.assertTrue( len(p) == 2 )
            self.assertTrue( p[0].nsamples == 50 )
            self.assertTrue( p[1].nsamples == 50 )

        assert_array_equal(splits[0][1].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[0][0].sa['chunks'].unique, [0, 2, 4, 6, 8])
        assert_array_equal(splits[1][0].sa['chunks'].unique, [1, 3, 5, 7, 9])
        assert_array_equal(splits[1][1].sa['chunks'].unique, [0, 2, 4, 6, 8])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in oes.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] != None)
            self.assertTrue(split[1] != None)
Beispiel #19
0
    def test_repeated_features(self):
        print self.dataset
        print self.dataset.fa.nonbogus_targets
        class CountFeatures(Measure):
            is_trained = True
            def _call(self, ds):
                return ds.nfeatures

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        assert_array_equal(res.samples[0], [18,1,1])
Beispiel #20
0
    def test_repeated_features(self):
        print self.dataset
        print self.dataset.fa.nonbogus_targets
        class CountFeatures(Measure):
            is_trained = True
            def _call(self, ds):
                return ds.nfeatures

        cf = CountFeatures()
        spl = Splitter('fa.nonbogus_targets')
        nsplits = len(list(spl.generate(self.dataset)))
        assert_equal(nsplits, 3)
        rm = RepeatedMeasure(cf, spl, concat_as='features')
        res = rm(self.dataset)
        assert_equal(res.shape, (1, nsplits))
        assert_array_equal(res.samples[0], [18,1,1])
Beispiel #21
0
    def test_counted_splitting(self):
        spl = Splitter(attr='partitions')
        # count > #chunks, should result in 10 splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [ (nchunks*2, nchunks),
                                   (nchunks, nchunks),
                                   (nchunks-1, nchunks-1),
                                   (3, 3),
                                   (0, 0),
                                   (1, 1)
                                   ]:
                nfs = NFoldPartitioner(cvtype=1, count=count,
                                       selection_strategy=strategy)
                splits = [ list(spl.generate(p)) for p in nfs.generate(self.data) ]
                self.failUnless(len(splits) == target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.failUnlessEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.failUnless(ds_.a.lastpartitionset)
                    # test all now
                    for isplit,split in enumerate(splits):
                        for ds_ in split:
                            ds_.a.lastpartitionset == isplit==nsplits-1

                # Check results of different strategies
                if strategy == 'first':
                    self.failUnlessEqual(chosenchunks, range(target))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.failUnlessEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.failUnless(len(set(chosenchunks)) == len(chosenchunks))
                    self.failUnless(target == len(chosenchunks))
                else:
                    raise RuntimeError, "Add unittest for strategy %s" \
                          % strategy
Beispiel #22
0
    def test_counted_splitting(self):
        spl = Splitter(attr='partitions')
        # count > #chunks, should result in 10 splits
        nchunks = len(self.data.sa['chunks'].unique)
        for strategy in Partitioner._STRATEGIES:
            for count, target in [(nchunks * 2, nchunks), (nchunks, nchunks),
                                  (nchunks - 1, nchunks - 1), (3, 3), (0, 0),
                                  (1, 1)]:
                nfs = NFoldPartitioner(cvtype=1,
                                       count=count,
                                       selection_strategy=strategy)
                splits = [
                    list(spl.generate(p)) for p in nfs.generate(self.data)
                ]
                self.assertTrue(len(splits) == target)
                chosenchunks = [int(s[1].uniquechunks) for s in splits]

                # Test if configuration matches as well
                nsplits_cfg = len(nfs.get_partition_specs(self.data))
                self.assertEqual(nsplits_cfg, target)

                # Check if "lastsplit" dsattr was assigned appropriately
                nsplits = len(splits)
                if nsplits > 0:
                    # dummy-proof testing of last split
                    for ds_ in splits[-1]:
                        self.assertTrue(ds_.a.lastpartitionset)
                    # test all now
                    for isplit, split in enumerate(splits):
                        for ds_ in split:
                            ds_.a.lastpartitionset == isplit == nsplits - 1

                # Check results of different strategies
                if strategy == 'first':
                    self.assertEqual(chosenchunks, range(target))
                elif strategy == 'equidistant':
                    if target == 3:
                        self.assertEqual(chosenchunks, [0, 3, 7])
                elif strategy == 'random':
                    # none is selected twice
                    self.assertTrue(
                        len(set(chosenchunks)) == len(chosenchunks))
                    self.assertTrue(target == len(chosenchunks))
                else:
                    raise RuntimeError, "Add unittest for strategy %s" \
                          % strategy
Beispiel #23
0
    def test_svms(self, clf):
        knows_probabilities = \
            'probabilities' in clf.ca.keys() and clf.params.probability
        enable_ca = ['estimates']
        if knows_probabilities:
            enable_ca += ['probabilities']

        clf.ca.change_temporarily(enable_ca=enable_ca)
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual(len(clf.ca.probabilities), len(testdata.samples))
        clf.ca.reset_changed_temporarily()
Beispiel #24
0
    def test_svms(self, clf):
        knows_probabilities = \
            'probabilities' in clf.ca.keys() and clf.params.probability
        enable_ca = ['estimates']
        if knows_probabilities:
            enable_ca += ['probabilities']

        clf.ca.change_temporarily(enable_ca = enable_ca)
        spl = Splitter('train', count=2)
        traindata, testdata = list(spl.generate(datasets['uni2small']))
        clf.train(traindata)
        predicts = clf.predict(testdata.samples)
        # values should be different from predictions for SVMs we have
        self.assertTrue(np.any(predicts != clf.ca.estimates))

        if knows_probabilities and clf.ca.is_set('probabilities'):
            # XXX test more thoroughly what we are getting here ;-)
            self.assertEqual( len(clf.ca.probabilities),
                                  len(testdata.samples)  )
        clf.ca.reset_changed_temporarily()
Beispiel #25
0
 def _forward_dataset(self, ds):
     if self.__chunks_attr is None:
         return self._forward_dataset_helper(ds)
     else:
         # strip down dataset to speedup local processing
         if self.__attr_strategy == "remove":
             keep_sa = []
         else:
             keep_sa = None
         proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
         # process all chunks individually
         # use a customsplitter to speed-up splitting
         spl = Splitter(self.__chunks_attr)
         dses = [self._forward_dataset_helper(d) for d in spl.generate(proc_ds)]
         # and merge them again
         mds = vstack(dses)
         # put back attributes
         mds.fa.update(ds.fa)
         mds.a.update(ds.a)
         return mds
Beispiel #26
0
def test_exclude_targets_combinations():
    partitioner = ChainNode(
        [NFoldPartitioner(), ExcludeTargetsCombinationsPartitioner(k=2, targets_attr="targets", space="partitions")],
        space="partitions",
    )
    from mvpa2.misc.data_generators import normal_feature_dataset

    ds = normal_feature_dataset(snr=0.0, nlabels=4, perlabel=3, nchunks=3, nonbogus_features=[0, 1, 2, 3], nfeatures=4)
    partitions = list(partitioner.generate(ds))
    assert_equal(len(partitions), 3 * 6)
    splitter = Splitter("partitions")
    combs = []
    comb_chunks = []
    for p in partitions:
        trds, teds = list(splitter.generate(p))[:2]
        comb = tuple(np.unique(teds.targets))
        combs.append(comb)
        comb_chunks.append(comb + tuple(np.unique(teds.chunks)))
    assert_equal(len(set(combs)), 6)  # just 6 possible combinations of 2 out of 4
    assert_equal(len(set(comb_chunks)), 3 * 6)  # all unique
Beispiel #27
0
 def _forward_dataset(self, ds):
     if self.__chunks_attr is None:
         return self._forward_dataset_helper(ds)
     else:
         # strip down dataset to speedup local processing
         if self.__attr_strategy == 'remove':
             keep_sa = []
         else:
             keep_sa = None
         proc_ds = ds.copy(deep=False, sa=keep_sa, fa=[], a=[])
         # process all chunks individually
         # use a customsplitter to speed-up splitting
         spl = Splitter(self.__chunks_attr)
         dses = [self._forward_dataset_helper(d)
                     for d in spl.generate(proc_ds)]
         # and merge them again
         mds = vstack(dses)
         # put back attributes
         mds.fa.update(ds.fa)
         mds.a.update(ds.a)
         return mds
Beispiel #28
0
    def _sl_call(self, dataset, roi_ids, nproc):
        """Call to SimpleStatBaseSearchlight
        """
        # Local bindings
        generator = self.generator
        qe = self.queryengine
        errorfx = self.errorfx

        if __debug__:
            time_start = time.time()

        targets_sa_name = self._get_space()
        targets_sa = dataset.sa[targets_sa_name]

        if __debug__:
            debug_slc_ = 'SLC_' in debug.active

        # get the dataset information into easy vars
        X = dataset.samples
        if len(X.shape) != 2:
            raise ValueError(
                'Unlike a classifier, %s (for now) operates on already'
                'flattened datasets' % (self.__class__.__name__))
        labels = targets_sa.value
        ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))
        labels_numeric = np.array([label2index[l] for l in labels])
        self._ulabels_numeric = [label2index[l] for l in ulabels]
        # set the feature dimensions
        nsamples = len(X)
        nrois = len(roi_ids)
        s_shape = X.shape[1:]  # shape of a single sample
        # The shape of results
        r_shape = (nrois, ) + X.shape[2:]

        def assign_ulabels(a):
            out = np.empty(shape=a.shape, dtype=ulabels.dtype)
            it = np.nditer([a, out],
                           flags=['external_loop', 'buffered'],
                           op_flags=[['readonly'],
                                     ['writeonly', 'allocate',
                                      'no_broadcast']])
            for x, y in it:
                y[...] = ulabels[x]
            return it.operands[1]

        #
        # Everything toward optimization ;)
        #
        # Silly Yarik thinks that it might be worth to pre-compute
        # statistics per each feature within a block of the samples
        # which always come together in splits -- most often it is a
        # (chunk, label) combination, but since we simply use a
        # generator -- who knows! Therefore lets figure out what are
        # those blocks and operate on them instead of original samples.
        #
        # After additional thinking about this -- probably it would be
        # just minor additional improvements (ie not worth it) but
        # since it is coded already -- let it be so

        # 1. Query generator for the splits we will have
        if __debug__:
            debug(
                'SLC', 'Phase 1. Initializing partitions using %s on %s' %
                (generator, dataset))

        # Lets just create a dummy ds which will store for us actual sample
        # indicies
        # XXX we could make it even more lightweight I guess...
        dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)

        splitter = Splitter(attr=generator.get_space(), attr_values=[1, 2]) \
            if self._splitter is None \
            else self._splitter

        partitions = list(generator.generate(dataset_indicies)) \
            if generator \
            else [dataset_indicies]

        if __debug__:
            for p in partitions:
                if not (np.all(p.sa[targets_sa_name].value == labels)):
                    raise NotImplementedError(
                        "%s does not yet support partitioners altering the targets "
                        "(e.g. permutators)" % self.__class__)

        nsplits = len(partitions)
        # ATM we need to keep the splits instead since they are used
        # in two places in the code: step 2 and 5
        # We care only about training and testing partitions (i.e. first two)
        splits = list(tuple(splitter.generate(ds_))[:2] for ds_ in partitions)
        del partitions  # not used any longer

        # 2. Figure out the new 'chunks x labels' blocks of combinations
        #    of samples
        if __debug__:
            debug(
                'SLC', 'Phase 2. Blocking data for %i splits and %i labels' %
                (nsplits, nlabels))
        # array of indicies for label, split1, split2, ...
        # through which we will pass later on to figure out
        # unique combinations
        combinations = np.ones((nsamples, 1 + nsplits), dtype=int) * -1
        # labels
        combinations[:, 0] = labels_numeric
        for ipartition, (split1, split2) in enumerate(splits):
            combinations[split1.samples[:, 0], 1 + ipartition] = 1
            combinations[split2.samples[:, 0], 1 + ipartition] = 2
            # Check for over-sampling, i.e. no same sample used twice here
            if not (len(np.unique(split1.samples[:, 0])) == len(split1)
                    and len(np.unique(split2.samples[:, 0])) == len(split2)):
                raise RuntimeError(
                    "%s needs a partitioner which does not reuse "
                    "the same the same samples more than once" %
                    self.__class__)
        # sample descriptions -- should be unique for
        # samples within the same block
        descriptions = [tuple(c) for c in combinations]
        udescriptions = sorted(list(set(descriptions)))
        nblocks = len(udescriptions)
        description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
        # Indices for samples to point to their block
        self.__sample2block = sample2block = \
            np.array([description2block[d] for d in descriptions])

        # 3. Compute statistics per each block
        #
        if __debug__:
            debug('SLC',
                  'Phase 3. Computing statistics for %i blocks' % (nblocks, ))

        self._compute_pb_stats(labels_numeric, X, (nblocks, ) + s_shape)

        # derived classes might decide differently on what they
        # actually need, so defer reserving the space and computing
        # stats to them
        self._reserve_pl_stats_space((nlabels, ) + s_shape)

        # results
        if errorfx is mean_mismatch_error:
            # if we know how it would look like, prepare the storage
            results = np.zeros((nsplits, ) + r_shape)
        else:
            # Otherwise delay assembling the results
            results = []

        all_targets, all_cvfolds = [], []

        # 4. Lets deduce all neighbors... might need to be RF into the
        #    parallel part later on
        # TODO: needs OPT since this is the step consuming 50% of time
        #       or more allow to cache them entirely so this would
        #       not be an unnecessary burden during permutation testing
        if not self.reuse_neighbors or self.__roi_fids is None:
            if __debug__:
                debug(
                    'SLC',
                    'Phase 4. Deducing neighbors information for %i ROIs' %
                    (nrois, ))
            roi_fids = [qe.query_byid(f) for f in roi_ids]

        else:
            if __debug__:
                debug(
                    'SLC',
                    'Phase 4. Reusing neighbors information for %i ROIs' %
                    (nrois, ))
            roi_fids = self.__roi_fids

        self.ca.roi_feature_ids = roi_fids

        roi_sizes = []
        if isinstance(roi_fids, list):
            nroi_fids = len(roi_fids)
            if self.ca.is_enabled('roi_sizes'):
                roi_sizes = [len(x) for x in roi_fids]
        elif externals.exists('scipy') and isinstance(roi_fids, sps.spmatrix):
            nroi_fids = roi_fids.shape[1]
            if self.ca.is_enabled('roi_sizes'):
                # very expensive operation, so better not to ask over again
                # roi_sizes = [roi_fids.getrow(r).nnz for r in range(nroi_fids)]
                warning(
                    "Since 'sparse' trick is used, extracting sizes of "
                    "roi's are expensive at this point.  Get them from the "
                    ".ca value of the original instance before "
                    "calling again and using reuse_neighbors")
        else:
            raise RuntimeError("Should not be reachable")

        # Since this is ad-hoc implementation of the searchlight, we are not passing
        # those via ds.a  but rather assign directly to self.ca
        self.ca.roi_sizes = roi_sizes

        indexsum = self._indexsum
        if indexsum == 'sparse':
            if not self.reuse_neighbors or self.__roi_fids is None:
                if __debug__:
                    debug(
                        'SLC',
                        'Phase 4b. Converting neighbors to sparse matrix '
                        'representation')
                # convert to "sparse representation" where column j contains
                # 1s only at the roi_fids[j] indices
                roi_fids = inds_to_coo(roi_fids,
                                       shape=(dataset.nfeatures, nroi_fids))
            indexsum_fx = lastdim_columnsums_spmatrix
        elif indexsum == 'fancy':
            indexsum_fx = lastdim_columnsums_fancy_indexing
        else:
            raise ValueError("Do not know how to deal with indexsum=%s" %
                             indexsum)
        # Store roi_fids
        if self.reuse_neighbors and self.__roi_fids is None:
            self.__roi_fids = roi_fids

        # 5. Lets do actual "splitting" and "classification"
        if __debug__:
            debug('SLC', 'Phase 5. Major loop')

        for isplit, split in enumerate(splits):
            if __debug__:
                debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
            # figure out for a given splits the blocks we want to work
            # with
            # sample_indicies
            training_sis = split[0].samples[:, 0]
            testing_sis = split[1].samples[:, 0]

            # That is the GNB specificity
            targets, predictions = self._sl_call_on_a_split(
                split,
                X,  # X2 might light to go
                training_sis,
                testing_sis,
                ## training_nsamples,      # GO? == np.sum(pl.nsamples)
                ## training_non0labels,
                ## pl.sums, pl.means, pl.sums2, pl.variances,
                # passing nroi_fids as well since in 'sparse' way it has no 'length'
                nroi_fids,
                roi_fids,
                indexsum_fx,
                labels_numeric,
            )

            # assess the errors
            if __debug__:
                debug('SLC', "  Assessing accuracies")

            if errorfx is mean_mismatch_error:
                results[isplit, :] = \
                    (predictions != targets[:, None]).sum(axis=0) \
                    / float(len(targets))
                all_cvfolds += [isplit]
            elif errorfx:
                # somewhat silly but a way which allows to use pre-crafted
                # error functions without a chance to screw up
                result = np.atleast_2d(
                    np.array([
                        errorfx(fpredictions, targets)
                        for fpredictions in predictions.T
                    ]))
                results.append(result)
                all_cvfolds += [isplit] * result.shape[0]

            else:
                # and if no errorfx -- we just need to assign original
                # labels to the predictions BUT keep in mind that it is a matrix
                results.append(assign_ulabels(predictions))
                all_targets += [ulabels[i] for i in targets]
                all_cvfolds += [isplit] * len(targets)

            pass  # end of the split loop

        if isinstance(results, list):
            # we have just collected them, now they need to be vstacked
            results = np.vstack(results)
            assert (results.ndim >= 2)

        if __debug__:
            debug(
                'SLC', "%s._call() is done in %.3g sec" %
                (self.__class__.__name__, time.time() - time_start))

        out = Dataset(results)
        if all_targets:
            out.sa['targets'] = all_targets
        out.sa['cvfolds'] = all_cvfolds
        out.fa['center_ids'] = roi_ids
        return out
Beispiel #29
0
    def test_n_group_split(self):
        """Test NGroupSplitter alongside with the reversal of the
        order of spit out datasets
        """
        # Test 2 groups like HalfSplitter first
        hs = NGroupPartitioner(2)

        for isreversed, splitter in enumerate((hs, hs)):
            if isreversed:
                spl = Splitter(attr='partitions', reverse=True)
            else:
                spl = Splitter(attr='partitions')
            splits = [list(spl.generate(p)) for p in hs.generate(self.data)]
            self.assertTrue(len(splits) == 2)

            for i, p in enumerate(splits):
                self.assertTrue(len(p) == 2)
                self.assertTrue(p[0].nsamples == 50)
                self.assertTrue(p[1].nsamples == 50)

            assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4])
            assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                               [5, 6, 7, 8, 9])
            assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                               [5, 6, 7, 8, 9])
            assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.assertTrue(split[0] != None)
            self.assertTrue(split[1] != None)

        # now test more groups
        s5 = NGroupPartitioner(5)

        # get the splits
        for isreversed, s5splitter in enumerate((s5, s5)):
            if isreversed:
                spl = Splitter(attr='partitions', reverse=True)
            else:
                spl = Splitter(attr='partitions')
            splits = [
                list(spl.generate(p)) for p in s5splitter.generate(self.data)
            ]

            # must have 10 splits
            self.assertTrue(len(splits) == 5)

            # check split content
            assert_array_equal(splits[0][1 - isreversed].sa['chunks'].unique,
                               [0, 1])
            assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                               [2, 3, 4, 5, 6, 7, 8, 9])
            assert_array_equal(splits[1][1 - isreversed].sa['chunks'].unique,
                               [2, 3])
            assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                               [0, 1, 4, 5, 6, 7, 8, 9])
            # ...
            assert_array_equal(splits[4][1 - isreversed].sa['chunks'].unique,
                               [8, 9])
            assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4, 5, 6, 7])

        # Test for too many groups
        def splitcall(spl, dat):
            return list(spl.generate(dat))

        s20 = NGroupPartitioner(20)
        self.assertRaises(ValueError, splitcall, s20, self.data)
Beispiel #30
0
    def _sl_call(self, dataset, roi_ids, nproc):
        """Call to GNBSearchlight
        """
        # Local bindings
        gnb = self.gnb
        params = gnb.params
        generator = self.generator
        errorfx = self.errorfx
        qe = self.queryengine

        ## if False:
        ##     class A(Learner):
        ##         pass
        ##     self = A()
        ##     import numpy as np
        ##     from mvpa2.clfs.gnb import GNB
        ##     from mvpa2.generators.partition import NFoldPartitioner
        ##     from mvpa2.misc.errorfx import mean_mismatch_error
        ##     from mvpa2.testing.datasets import datasets as tdatasets
        ##     from mvpa2.datasets import Dataset
        ##     from mvpa2.misc.neighborhood import IndexQueryEngine, Sphere
        ##     from mvpa2.clfs.distance import absmin_distance
        ##     import time
        ##     if __debug__:
        ##         from mvpa2.base import debug
        ##         debug.active += ['SLC.*']
        ##         # XXX is it that ugly?
        ##         debug.active.pop(debug.active.index('SLC_'))
        ##         debug.metrics += ['reltime']
        ##     dataset = tdatasets['3dlarge'].copy()
        ##     dataset.fa['voxel_indices'] = dataset.fa.myspace
        ##     sphere = Sphere(radius=1,
        ##                     distance_func=absmin_distance)
        ##     qe = IndexQueryEngine(myspace=sphere)

        ##     # Fracisco's data
        ##     #dataset = ds_fp
        ##     qe = IndexQueryEngine(voxel_indices=sphere)

        ##     qe.train(dataset)
        ##     roi_ids = np.arange(dataset.nfeatures)
        ##     gnb = GNB()
        ##     params = gnb.params
        ##     generator = NFoldPartitioner()
        ##     errorfx = mean_mismatch_error

        if __debug__:
            time_start = time.time()

        targets_sa_name = gnb.get_space()
        targets_sa = dataset.sa[targets_sa_name]

        if __debug__:
            debug_slc_ = 'SLC_' in debug.active

        # get the dataset information into easy vars
        X = dataset.samples
        if len(X.shape) != 2:
            raise ValueError, \
                  'Unlike GNB, GNBSearchlight (for now) operates on already' \
                  'flattened datasets'
        labels = targets_sa.value
        ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))
        labels_numeric = np.array([label2index[l] for l in labels])
        ulabels_numeric = [label2index[l] for l in ulabels]
        # set the feature dimensions
        nsamples = len(X)
        nrois = len(roi_ids)
        s_shape = X.shape[1:]           # shape of a single sample
        # The shape of results
        r_shape = (nrois,) + X.shape[2:]

        #
        # Everything toward optimization ;)
        #
        # Silly Yarik thinks that it might be worth to pre-compute
        # statistics per each feature within a block of the samples
        # which always come together in splits -- most often it is a
        # (chunk, label) combination, but since we simply use a
        # generator -- who knows! Therefore lets figure out what are
        # those blocks and operate on them instead of original samples.
        #
        # After additional thinking about this -- probably it would be
        # just minor additional improvements (ie not worth it) but
        # since it is coded already -- let it be so

        # 1. Query generator for the splits we will have
        if __debug__:
            debug('SLC',
                  'Phase 1. Initializing partitions using %s on %s'
                  % (generator, dataset))

        # Lets just create a dummy ds which will store for us actual sample
        # indicies
        # XXX we could make it even more lightweight I guess...
        dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
        splitter = Splitter(attr=generator.get_space())
        splits = list(tuple(splitter.generate(ds_))
                      for ds_ in generator.generate(dataset_indicies))
        nsplits = len(splits)

        # 2. Figure out the new 'chunks x labels' blocks of combinations
        #    of samples
        if __debug__:
            debug('SLC',
                  'Phase 2. Blocking data for %i splits and %i labels'
                  % (nsplits, nlabels))
        # array of indicies for label, split1, split2, ...
        # through which we will pass later on to figure out
        # unique combinations
        combinations = np.ones((nsamples, 1+nsplits), dtype=int)*-1
        # labels
        combinations[:, 0] = labels_numeric
        for ipartition, (split1, split2) in enumerate(splits):
            combinations[split1.samples[:, 0], 1+ipartition] = 1
            combinations[split2.samples[:, 0], 1+ipartition] = 2
            # Check for over-sampling, i.e. no same sample used twice here
            if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                    len(np.unique(split2.samples[:, 0])) == len(split2)):
                raise RuntimeError(
                    "GNBSearchlight needs a partitioner which does not reuse "
                    "the same the same samples more than once")
        # sample descriptions -- should be unique for
        # samples within the same block
        descriptions = [tuple(c) for c in combinations]
        udescriptions = sorted(list(set(descriptions)))
        nblocks = len(udescriptions)
        description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
        # Indices for samples to point to their block
        sample2block = np.array([description2block[d] for d in descriptions])

        # 3. Compute statistics per each block
        #
        if __debug__:
            debug('SLC',
                  'Phase 3. Computing statistics for %i blocks' % (nblocks,))

        #
        # reusable containers which should stay of the same size
        #

        # sums and sums of squares per each block
        sums = np.zeros((nblocks, ) + s_shape)
        # sums of squares
        sums2 = np.zeros((nblocks, ) + s_shape)

        # per each label:
        means = np.zeros((nlabels, ) + s_shape)
        # means of squares for stddev computation
        means2 = np.zeros((nlabels, ) + s_shape)
        variances = np.zeros((nlabels, ) + s_shape)
        # degenerate dimension are added for easy broadcasting later on
        nsamples_per_class = np.zeros((nlabels,) + (1,)*len(s_shape))

        # results
        results = np.zeros((nsplits,) + r_shape)

        block_counts = np.zeros((nblocks,))
        block_labels = [None] * nblocks

        X2 = np.square(X)
        # silly way for now
        for l, s, s2, ib in zip(labels_numeric, X, X2, sample2block):
            sums[ib] += s
            sums2[ib] += s2
            block_counts[ib] += 1
            if block_labels[ib] is None:
                block_labels[ib] = l
            else:
                assert(block_labels[ib] == l)
        block_labels = np.asanyarray(block_labels)
        # additional silly tests for paranoid
        assert(block_labels.dtype.kind is 'i')

        # 4. Lets deduce all neighbors... might need to be RF into the
        #    parallel part later on
        if __debug__:
            debug('SLC',
                  'Phase 4. Deducing neighbors information for %i ROIs'
                  % (nrois,))
        roi_fids = [qe.query_byid(f) for f in roi_ids]
        nroi_fids = len(roi_fids)
        # makes sense to waste precious ms only if ca is enabled
        if self.ca.is_enabled('roi_sizes'):
            roi_sizes = [len(x) for x in roi_fids]
        else:
            roi_sizes = []

        indexsum = self._indexsum
        if indexsum == 'sparse':
            if __debug__:
                debug('SLC',
                      'Phase 4b. Converting neighbors to sparse matrix '
                      'representation')
            # convert to "sparse representation" where column j contains
            # 1s only at the roi_fids[j] indices
            roi_fids = inds_to_coo(roi_fids,
                                   shape=(dataset.nfeatures, nroi_fids))
            indexsum_fx = lastdim_columnsums_spmatrix
        elif indexsum == 'fancy':
            indexsum_fx = lastdim_columnsums_fancy_indexing
        else:
            raise ValueError, \
                  "Do not know how to deal with indexsum=%s" % indexsum

        # 5. Lets do actual "splitting" and "classification"
        if __debug__:
            debug('SLC', 'Phase 5. Major loop' )

        for isplit, split in enumerate(splits):
            if __debug__:
                debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
            # figure out for a given splits the blocks we want to work
            # with
            # sample_indicies
            training_sis = split[0].samples[:, 0]
            # convert to blocks training split
            training_bis = np.unique(sample2block[training_sis])

            # now lets do our GNB business
            training_nsamples = 0
            for il, l in enumerate(ulabels_numeric):
                bis_il = training_bis[block_labels[training_bis] == l]
                nsamples_per_class[il] = N_float = \
                                         float(np.sum(block_counts[bis_il]))
                training_nsamples += N_float
                if N_float == 0.0:
                    variances[il] = means[il] = means2[il] = 0.
                else:
                    means[il] = np.sum(sums[bis_il], axis=0) / N_float
                    # Not yet normed
                    means2[il] = np.sum(sums2[bis_il], axis=0)

            ## Actually compute the non-0 variances
            non0labels = (nsamples_per_class.squeeze() != 0)
            if np.all(non0labels):
                # For a possible tiny speed up avoiding copying and
                # using (no) slicing
                non0labels = slice(None)

            if params.common_variance:
                variances[:] = \
                    np.sum(means2 - nsamples_per_class*np.square(means),
                           axis=0) \
                    / training_nsamples
            else:
                variances[non0labels] = \
                    (means2 - nsamples_per_class*np.square(means))[non0labels] \
                    / nsamples_per_class[non0labels]

            # assign priors
            priors = gnb._get_priors(
                nlabels, training_nsamples, nsamples_per_class)

            # proceed in a way we have in GNB code with logprob=True,
            # i.e. operating within the exponents -- should lead to some
            # performance advantage
            norm_weight = -0.5 * np.log(2*np.pi*variances)
            # last added dimension would be for ROIs
            logpriors = np.log(priors[:, np.newaxis, np.newaxis])

            if __debug__:
                debug('SLC', "  'Training' is done")

            # Now it is time to "classify" our samples.
            # and for that we first need to compute corresponding
            # probabilities (or may be un
            data = X[split[1].samples[:, 0]]
            targets = labels_numeric[split[1].samples[:, 0]]

            # argument of exponentiation
            scaled_distances = \
                 -0.5 * (((data - means[:, np.newaxis, ...])**2) \
                         / variances[:, np.newaxis, ...])

            # incorporate the normalization from normals
            lprob_csfs = norm_weight[:, np.newaxis, ...] + scaled_distances

            ## First we need to reshape to get class x samples x features
            lprob_csf = lprob_csfs.reshape(lprob_csfs.shape[:2] + (-1,))

            ## Now we come to naive part which requires looping
            ## through all spheres
            if __debug__:
                debug('SLC', "  Doing 'Searchlight'")
            # resultant logprobs for each class x sample x roi
            lprob_cs_sl = np.zeros(lprob_csfs.shape[:2] + (nroi_fids,))
            indexsum_fx(lprob_csf, roi_fids, out=lprob_cs_sl)

            lprob_cs_sl += logpriors
            lprob_cs_cp_sl = lprob_cs_sl
            # for each of the ROIs take the class with maximal (log)probability
            predictions = lprob_cs_cp_sl.argmax(axis=0)
            # no need to map back [self.ulabels[c] for c in winners]
            #predictions = winners
            # assess the errors
            if __debug__:
                debug('SLC', "  Assessing accuracies")

            if errorfx is mean_mismatch_error:
                results[isplit, :] = \
                    (predictions != targets[:, None]).sum(axis=0) \
                    / float(len(targets))
            else:
                # somewhat silly but a way which allows to use pre-crafted
                # error functions without a chance to screw up
                for i, fpredictions in enumerate(predictions.T):
                    results[isplit, i] = errorfx(fpredictions, targets)

        if __debug__:
            debug('SLC', "GNBSearchlight is done in %.3g sec" %
                  (time.time() - time_start))

        return Dataset(results), roi_sizes
    def _sl_call(self, dataset, roi_ids, nproc):
        """Call to SimpleStatBaseSearchlight
        """
        # Local bindings
        generator = self.generator
        qe = self.queryengine
        errorfx = self.errorfx

        if __debug__:
            time_start = time.time()

        targets_sa_name = self._get_space()
        targets_sa = dataset.sa[targets_sa_name]

        if __debug__:
            debug_slc_ = 'SLC_' in debug.active

        # get the dataset information into easy vars
        X = dataset.samples
        if len(X.shape) != 2:
            raise ValueError(
                  'Unlike a classifier, %s (for now) operates on already'
                  'flattened datasets' % (self.__class__.__name__))
        labels = targets_sa.value
        ulabels = targets_sa.unique
        nlabels = len(ulabels)
        label2index = dict((l, il) for il, l in enumerate(ulabels))
        labels_numeric = np.array([label2index[l] for l in labels])
        self._ulabels_numeric = [label2index[l] for l in ulabels]
        # set the feature dimensions
        nsamples = len(X)
        nrois = len(roi_ids)
        s_shape = X.shape[1:]           # shape of a single sample
        # The shape of results
        r_shape = (nrois,) + X.shape[2:]

        #
        # Everything toward optimization ;)
        #
        # Silly Yarik thinks that it might be worth to pre-compute
        # statistics per each feature within a block of the samples
        # which always come together in splits -- most often it is a
        # (chunk, label) combination, but since we simply use a
        # generator -- who knows! Therefore lets figure out what are
        # those blocks and operate on them instead of original samples.
        #
        # After additional thinking about this -- probably it would be
        # just minor additional improvements (ie not worth it) but
        # since it is coded already -- let it be so

        # 1. Query generator for the splits we will have
        if __debug__:
            debug('SLC',
                  'Phase 1. Initializing partitions using %s on %s'
                  % (generator, dataset))

        # Lets just create a dummy ds which will store for us actual sample
        # indicies
        # XXX we could make it even more lightweight I guess...
        dataset_indicies = Dataset(np.arange(nsamples), sa=dataset.sa)
        splitter = Splitter(attr=generator.get_space())
        partitions = list(generator.generate(dataset_indicies))
        if __debug__:
            for p in partitions:
                if not (np.all(p.sa[targets_sa_name].value == labels)):
                    raise NotImplementedError(
                        "%s does not yet support partitioners altering the targets "
                        "(e.g. permutators)" % self.__class__)

        nsplits = len(partitions)
        # ATM we need to keep the splits instead since they are used
        # in two places in the code: step 2 and 5
        splits = list(tuple(splitter.generate(ds_)) for ds_ in partitions)
        del partitions                    # not used any longer

        # 2. Figure out the new 'chunks x labels' blocks of combinations
        #    of samples
        if __debug__:
            debug('SLC',
                  'Phase 2. Blocking data for %i splits and %i labels'
                  % (nsplits, nlabels))
        # array of indicies for label, split1, split2, ...
        # through which we will pass later on to figure out
        # unique combinations
        combinations = np.ones((nsamples, 1+nsplits), dtype=int)*-1
        # labels
        combinations[:, 0] = labels_numeric
        for ipartition, (split1, split2) in enumerate(splits):
            combinations[split1.samples[:, 0], 1+ipartition] = 1
            combinations[split2.samples[:, 0], 1+ipartition] = 2
            # Check for over-sampling, i.e. no same sample used twice here
            if not (len(np.unique(split1.samples[:, 0])) == len(split1) and
                    len(np.unique(split2.samples[:, 0])) == len(split2)):
                raise RuntimeError(
                    "%s needs a partitioner which does not reuse "
                    "the same the same samples more than once"
                    % self.__class__)
        # sample descriptions -- should be unique for
        # samples within the same block
        descriptions = [tuple(c) for c in combinations]
        udescriptions = sorted(list(set(descriptions)))
        nblocks = len(udescriptions)
        description2block = dict([(d, i) for i, d in enumerate(udescriptions)])
        # Indices for samples to point to their block
        self.__sample2block = sample2block = \
            np.array([description2block[d] for d in descriptions])

        # 3. Compute statistics per each block
        #
        if __debug__:
            debug('SLC',
                  'Phase 3. Computing statistics for %i blocks' % (nblocks,))

        self._compute_pb_stats(labels_numeric, X, (nblocks,) + s_shape)

        # derived classes might decide differently on what they
        # actually need, so defer reserving the space and computing
        # stats to them
        self._reserve_pl_stats_space((nlabels, ) + s_shape)

        # results
        results = np.zeros((nsplits,) + r_shape)

        # 4. Lets deduce all neighbors... might need to be RF into the
        #    parallel part later on
        # TODO: needs OPT since this is the step consuming 50% of time
        #       or more allow to cache them entirely so this would
        #       not be an unnecessary burden during permutation testing
        if not self.reuse_neighbors or self.__roi_fids is None:
            if __debug__:
                debug('SLC',
                      'Phase 4. Deducing neighbors information for %i ROIs'
                      % (nrois,))
            roi_fids = [qe.query_byid(f) for f in roi_ids]

        else:
            if __debug__:
                debug('SLC',
                      'Phase 4. Reusing neighbors information for %i ROIs'
                      % (nrois,))
            roi_fids = self.__roi_fids

        self.ca.roi_feature_ids = roi_fids

        roi_sizes = []
        if isinstance(roi_fids, list):
            nroi_fids = len(roi_fids)
            if self.ca.is_enabled('roi_sizes'):
                roi_sizes = [len(x) for x in roi_fids]
        elif externals.exists('scipy') and isinstance(roi_fids, sps.spmatrix):
            nroi_fids = roi_fids.shape[1]
            if self.ca.is_enabled('roi_sizes'):
                # very expensive operation, so better not to ask over again
                # roi_sizes = [roi_fids.getrow(r).nnz for r in range(nroi_fids)]
                warning("Since 'sparse' trick is used, extracting sizes of "
                        "roi's are expensive at this point.  Get them from the "
                        ".ca value of the original instance before "
                        "calling again and using reuse_neighbors")
        else:
            raise RuntimeError("Should not be reachable")

        # Since this is ad-hoc implementation of the searchlight, we are not passing
        # those via ds.a  but rather assign directly to self.ca
        self.ca.roi_sizes = roi_sizes

        indexsum = self._indexsum
        if indexsum == 'sparse':
            if not self.reuse_neighbors or self.__roi_fids is None:
                if __debug__:
                    debug('SLC',
                          'Phase 4b. Converting neighbors to sparse matrix '
                          'representation')
                # convert to "sparse representation" where column j contains
                # 1s only at the roi_fids[j] indices
                roi_fids = inds_to_coo(roi_fids,
                                       shape=(dataset.nfeatures, nroi_fids))
            indexsum_fx = lastdim_columnsums_spmatrix
        elif indexsum == 'fancy':
            indexsum_fx = lastdim_columnsums_fancy_indexing
        else:
            raise ValueError, \
                  "Do not know how to deal with indexsum=%s" % indexsum

        # Store roi_fids
        if self.reuse_neighbors and self.__roi_fids is None:
            self.__roi_fids = roi_fids

        # 5. Lets do actual "splitting" and "classification"
        if __debug__:
            debug('SLC', 'Phase 5. Major loop' )


        for isplit, split in enumerate(splits):
            if __debug__:
                debug('SLC', ' Split %i out of %i' % (isplit, nsplits))
            # figure out for a given splits the blocks we want to work
            # with
            # sample_indicies
            training_sis = split[0].samples[:, 0]
            testing_sis = split[1].samples[:, 0]

            # That is the GNB specificity
            targets, predictions = self._sl_call_on_a_split(
                split, X,               # X2 might light to go
                training_sis, testing_sis,
                ## training_nsamples,      # GO? == np.sum(pl.nsamples)
                ## training_non0labels,
                ## pl.sums, pl.means, pl.sums2, pl.variances,
                # passing nroi_fids as well since in 'sparse' way it has no 'length'
                nroi_fids, roi_fids,
                indexsum_fx,
                labels_numeric,
                )

            # assess the errors
            if __debug__:
                debug('SLC', "  Assessing accuracies")

            if errorfx is mean_mismatch_error:
                results[isplit, :] = \
                    (predictions != targets[:, None]).sum(axis=0) \
                    / float(len(targets))
            else:
                # somewhat silly but a way which allows to use pre-crafted
                # error functions without a chance to screw up
                for i, fpredictions in enumerate(predictions.T):
                    results[isplit, i] = errorfx(fpredictions, targets)


        if __debug__:
            debug('SLC', "%s._call() is done in %.3g sec" %
                  (self.__class__.__name__, time.time() - time_start))

        return Dataset(results)
Beispiel #32
0
    def test_n_group_split(self):
        """Test NGroupSplitter alongside with the reversal of the
        order of spit out datasets
        """
        # Test 2 groups like HalfSplitter first
        hs = NGroupPartitioner(2)

        for isreversed, splitter in enumerate((hs, hs)):
            if isreversed:
                spl = Splitter(attr='partitions', reverse=True)
            else:
                spl = Splitter(attr='partitions')
            splits = [ list(spl.generate(p)) for p in hs.generate(self.data) ]
            self.failUnless(len(splits) == 2)

            for i, p in enumerate(splits):
                self.failUnless( len(p) == 2 )
                self.failUnless( p[0].nsamples == 50 )
                self.failUnless( p[1].nsamples == 50 )

            assert_array_equal(splits[0][1-isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4])
            assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                               [5, 6, 7, 8, 9])
            assert_array_equal(splits[1][1-isreversed].sa['chunks'].unique,
                               [5, 6, 7, 8, 9])
            assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4])

        # check if it works on pure odd and even chunk ids
        moresplits = [ list(spl.generate(p)) for p in hs.generate(splits[0][0])]

        for split in moresplits:
            self.failUnless(split[0] != None)
            self.failUnless(split[1] != None)

        # now test more groups
        s5 = NGroupPartitioner(5)

        # get the splits
        for isreversed, s5splitter in enumerate((s5, s5)):
            if isreversed:
                spl = Splitter(attr='partitions', reverse=True)
            else:
                spl = Splitter(attr='partitions')
            splits = [ list(spl.generate(p)) for p in s5splitter.generate(self.data) ]

            # must have 10 splits
            self.failUnless(len(splits) == 5)

            # check split content
            assert_array_equal(splits[0][1-isreversed].sa['chunks'].unique,
                               [0, 1])
            assert_array_equal(splits[0][isreversed].sa['chunks'].unique,
                               [2, 3, 4, 5, 6, 7, 8, 9])
            assert_array_equal(splits[1][1-isreversed].sa['chunks'].unique,
                               [2, 3])
            assert_array_equal(splits[1][isreversed].sa['chunks'].unique,
                               [0, 1, 4, 5, 6, 7, 8, 9])
            # ...
            assert_array_equal(splits[4][1-isreversed].sa['chunks'].unique,
                               [8, 9])
            assert_array_equal(splits[4][isreversed].sa['chunks'].unique,
                               [0, 1, 2, 3, 4, 5, 6, 7])


        # Test for too many groups
        def splitcall(spl, dat):
            return list(spl.generate(dat))
        s20 = NGroupPartitioner(20)
        self.assertRaises(ValueError,splitcall,s20,self.data)