Esempio n. 1
0
    def test_cached_kernel_different_datasets(self):
        """Compare an SVM on a plain kernel with one on a ``CachedKernel``.

        Two identically configured linear shogun kernels are created, one of
        them wrapped into a ``CachedKernel``.  For two different datasets the
        cache is computed explicitly, and the subsequent cross-validation with
        the cached classifier is required to (a) not recompute the cache and
        (b) give the same errors as the uncached classifier.
        """
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidation(clf, NFoldPartitioner())
        cvte_ = CrossValidation(clf_, NFoldPartitioner())

        postproc = BinaryFxNode(mean_mismatch_error, 'targets')
        te = ProxyMeasure(clf, postproc=postproc)
        te_ = ProxyMeasure(clf_, postproc=postproc)

        # Two passes: on the second one ds1 is computed right after ds2 had
        # been cached, so switching back between datasets is covered too.
        for r in xrange(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            # NB: `not`, not `~` -- bitwise inversion of a Python bool gives
            # -1 or -2, which are both truthy and would make the check vacuous
            ok_(not ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(not ck._recomputed)
            assert_array_equal(errs2, errs2_)

            # Sample indices taken from rounded values of the first feature
            # of the first 5 samples of another dataset
            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            te.train(datasets['uni3small'][::2])
            terr = np.asscalar(te(datasets['uni3small'][ssel]))
            te_.train(datasets['uni3small'][::2])
            terr_ = np.asscalar(te_(datasets['uni3small'][ssel]))
            # training/testing on selections of ds2 must reuse the cache
            ok_(not ck._recomputed)
            ok_(terr == terr_)
Esempio n. 2
0
    def test_cached_kernel_different_datasets(self):
        """Check cached vs. uncached linear kernels across several datasets.

        An SVM using a plain ``LinearSGKernel`` and an SVM using the same
        kind of kernel wrapped into a ``CachedKernel`` are run through
        cross-validated and plain transfer error estimation.  Once the cache
        has been computed for a dataset, further use of that dataset must not
        recompute it, and both classifiers must report equal errors.
        """
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(
            TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(
            TransferError(clf_), NFoldSplitter())

        te = TransferError(clf)
        te_ = TransferError(clf_)

        # Repeat the sequence so that ds1 is also computed after ds2 had
        # replaced it in the cache.
        for r in xrange(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            # `~` on a Python bool evaluates to -1/-2 (always truthy), so
            # logical negation has to be used for the check to mean anything
            ok_(not ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(not ck._recomputed)
            assert_array_equal(errs2, errs2_)

            # Indices derived from rounded values of the first feature of
            # the first 5 samples of another dataset
            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            terr = te(datasets['uni3small_test'][ssel],
                      datasets['uni3small_train'][::2])
            terr_ = te_(datasets['uni3small_test'][ssel],
                        datasets['uni3small_train'][::2])
            ok_(not ck._recomputed)
            ok_(terr == terr_)
Esempio n. 3
0
    def test_vstack_and_origids_issue(self):
        """Regression test for non-unique origids after ``vstack``.

        A dataset gets its ``origids`` assigned by ``CachedKernel.compute``,
        is then stacked on top of a zeroed deep copy of itself (which carries
        the same origids), and the cache is expected to refuse such data until
        the origids are re-initialized.  Afterwards cached and uncached
        cross-validation must agree.
        """
        # That is actually what swaroop hit
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)   # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(
            TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(
            TransferError(clf_), NFoldSplitter())

        ds = datasets['uni2large_test'].copy(deep=True)
        # Assure that there are none yet.  Uses `not in` on the very same
        # 'origids' key checked below: bitwise `~` on a bool is always truthy,
        # and a differently spelled key could never be found anyway.
        ok_('origids' not in ds.sa)
        ck.compute(ds)                  # so we initialize origids
        ok_('origids' in ds.sa)
        ds2 = ds.copy(deep=True)
        ds2.samples = np.zeros(ds2.shape)
        from mvpa.base.dataset import vstack
        ds_vstacked = vstack((ds2, ds))
        # should complain now since samples' origids would not be unique
        if __debug__:
            assert_raises(ValueError, ck.compute, ds_vstacked)

        ds_vstacked.init_origids('samples')      # reset origids
        ck.compute(ds_vstacked)

        errs = cvte(ds_vstacked)
        errs_ = cvte_(ds_vstacked)
        # Following test would have failed since origids
        # were just ints, and then non-unique after vstack
        assert_array_equal(errs.samples, errs_.samples)
    def test_cache_speedup(self):
        """Verify that caching an RBF kernel speeds up cross-validation.

        Times (a) precomputing the cache, (b) cross-validation with the
        cached kernel and (c) cross-validation with the plain kernel on a
        generated dataset with many features.  Errors of both runs must be
        almost equal, and the plain run must be at least twice as slow as
        caching plus cached cross-validation combined.
        """
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1)
        sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1)

        cv_c = CrossValidatedTransferError(TransferError(ck),
                                           splitter=NFoldSplitter())
        cv_s = CrossValidatedTransferError(TransferError(sk),
                                           splitter=NFoldSplitter())

        #data = datasets['uni4large']
        P = 5000
        data = normal_feature_dataset(snr=2,
                                      perlabel=200,
                                      nchunks=10,
                                      means=np.random.randn(2, P),
                                      nfeatures=P)

        # time spent to fill in the cache for the whole dataset
        t0 = time()
        ck.params.kernel.compute(data)
        cachetime = time() - t0

        # cross-validation relying on the cache
        t0 = time()
        cached_err = cv_c(data)
        ccv_time = time() - t0

        # cross-validation computing the kernel from scratch
        t0 = time()
        norm_err = cv_s(data)
        ncv_time = time() - t0

        assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err))
        ok_(cachetime < ncv_time)
        ok_(ccv_time < ncv_time)
        #print 'Regular CV time: %s seconds'%ncv_time
        #print 'Caching time: %s seconds'%cachetime
        #print 'Cached CV time: %s seconds'%ccv_time

        speedup = ncv_time / (ccv_time + cachetime)
        #print 'Speedup factor: %s'%speedup

        # Speedup ideally should be 10, though it's not purely linear
        # (assertFalse instead of the deprecated failIf alias)
        self.assertFalse(speedup < 2, 'Problem caching data - too slow!')
    def test_cached_kernel_different_datasets(self):
        """Ensure a ``CachedKernel`` copes with being fed different datasets.

        Runs an SVM with a plain linear kernel side by side with an SVM whose
        linear kernel is wrapped into a ``CachedKernel``.  After the cache is
        computed for a dataset, transfer errors estimated on that dataset have
        to match between the two and must not cause the cache to be
        recomputed.
        """
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(TransferError(clf_),
                                            NFoldSplitter())

        te = TransferError(clf)
        te_ = TransferError(clf_)

        # Go through the datasets twice, so ds1 gets (re)computed after the
        # cache was last filled from ds2.
        for r in xrange(2):
            ds1 = datasets['uni2medium']
            errs1 = cvte(ds1)
            ck.compute(ds1)
            ok_(ck._recomputed)
            errs1_ = cvte_(ds1)
            # Logical `not` is required here: `~True` is -2 and `~False` is
            # -1, both truthy, so a bitwise inversion could never fail.
            ok_(not ck._recomputed)
            assert_array_equal(errs1, errs1_)

            ds2 = datasets['uni3small']
            errs2 = cvte(ds2)
            ck.compute(ds2)
            ok_(ck._recomputed)
            errs2_ = cvte_(ds2)
            ok_(not ck._recomputed)
            assert_array_equal(errs2, errs2_)

            # Test-sample indices come from rounded values of the first
            # feature of the first 5 samples of 'uni2large'
            ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int)
            terr = te(datasets['uni3small_test'][ssel],
                      datasets['uni3small_train'][::2])
            terr_ = te_(datasets['uni3small_test'][ssel],
                        datasets['uni3small_train'][::2])
            ok_(not ck._recomputed)
            ok_(terr == terr_)
    def test_vstack_and_origids_issue(self):
        """Make sure stacked datasets with clashing origids are handled.

        ``CachedKernel.compute`` is used to assign ``origids`` to a dataset,
        which is then vstacked with a zero-filled deep copy of itself.  The
        stacked dataset must be rejected with ``ValueError`` (in debug mode)
        until ``init_origids`` is called again; after that, cross-validation
        results with and without caching have to be identical.
        """
        # That is actually what swaroop hit
        skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455)

        # Inspired by the problem Swaroop ran into
        k = LinearSGKernel(normalizer_cls=False)
        k_ = LinearSGKernel(normalizer_cls=False)  # to be cached
        ck = CachedKernel(k_)

        clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1)
        clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1)

        cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter())
        cvte_ = CrossValidatedTransferError(TransferError(clf_),
                                            NFoldSplitter())

        ds = datasets['uni2large_test'].copy(deep=True)
        # Precondition: no origids assigned yet.  Checked with `not in` on
        # the 'origids' key itself -- `~(... in ...)` is truthy for either
        # outcome, so it would not verify anything.
        ok_('origids' not in ds.sa)
        ck.compute(ds)  # so we initialize origids
        ok_('origids' in ds.sa)
        ds2 = ds.copy(deep=True)
        ds2.samples = np.zeros(ds2.shape)
        from mvpa.base.dataset import vstack
        ds_vstacked = vstack((ds2, ds))
        # should complain now since samples' origids are no longer unique
        if __debug__:
            assert_raises(ValueError, ck.compute, ds_vstacked)

        ds_vstacked.init_origids('samples')  # reset origids
        ck.compute(ds_vstacked)

        errs = cvte(ds_vstacked)
        errs_ = cvte_(ds_vstacked)
        # Following test would have failed since origids
        # were just ints, and then non-unique after vstack
        assert_array_equal(errs.samples, errs_.samples)
Esempio n. 7
0
    def test_cached_kernel(self):
        """Exercise cache (re)computation logic of ``CachedKernel``.

        Verifies that the cache is computed initially, reused for
        chunk-wise selections of the cached dataset, and recomputed when a
        kernel parameter changes or when a different dataset is provided.
        """
        nchunks = 5
        n = 50 * nchunks
        d = Dataset(np.random.randn(n, 132))
        d.sa.chunks = np.random.randint(nchunks, size=n)

        # We'll compare against an Rbf just because it has a parameter to change
        rk = npK.RbfKernel(sigma=1.5)

        # Assure two kernels are independent for this test
        ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
        ck.compute(d)  # Initial cache of all data

        # assertTrue/assertFalse are used throughout in place of the
        # deprecated failUnless/failIf aliases
        self.assertTrue(ck._recomputed, "CachedKernel was not initially computed")

        # Try some splitting
        for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
            rk.compute(chunk)
            ck.compute(chunk)
            self.kernel_equiv(rk, ck)  # , accuracy=1e-12)
            self.assertFalse(ck._recomputed, "CachedKernel incorrectly recomputed it's kernel")

        # Test what happens when a parameter changes
        ck.params.sigma = 3.5
        ck.compute(d)
        self.assertTrue(ck._recomputed, "CachedKernel doesn't recompute on kernel change")
        rk.params.sigma = 3.5
        rk.compute(d)
        self.assertTrue(np.all(rk._k == ck._k), "Cached and rbf kernels disagree after kernel change")

        # Now test handling new data
        d2 = Dataset(np.random.randn(32, 43))
        ck.compute(d2)
        self.assertTrue(ck._recomputed, "CachedKernel did not automatically recompute new data")
        # Going back to the first dataset: its cache was replaced by d2's
        ck.compute(d)
        self.assertTrue(
            ck._recomputed,
            "CachedKernel did not recompute old data which had\n"
            + "previously been computed, but had the cache overriden",
        )
Esempio n. 8
0
    def test_cached_kernel(self):
        """Test when ``CachedKernel`` reuses and when it recomputes its cache.

        The cache must be filled on the first ``compute``, stay untouched
        while computing on per-chunk selections of the same dataset (giving
        results equivalent to a plain RBF kernel), and be recomputed after a
        change of ``sigma`` or a switch to another dataset.
        """
        nchunks = 5
        n = 50 * nchunks
        d = Dataset(np.random.randn(n, 132))
        d.sa.chunks = np.random.randint(nchunks, size=n)

        # We'll compare against an Rbf just because it has a parameter to change
        rk = npK.RbfKernel(sigma=1.5)

        # Assure two kernels are independent for this test
        ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5))
        ck.compute(d)  # Initial cache of all data

        # NB: failUnless/failIf are deprecated aliases of
        #     assertTrue/assertFalse, hence the latter are used below
        self.assertTrue(ck._recomputed,
                        'CachedKernel was not initially computed')

        # Try some splitting
        for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]:
            rk.compute(chunk)
            ck.compute(chunk)
            self.kernel_equiv(rk, ck)  #, accuracy=1e-12)
            self.assertFalse(ck._recomputed,
                             "CachedKernel incorrectly recomputed it's kernel")

        # Test what happens when a parameter changes
        ck.params.sigma = 3.5
        ck.compute(d)
        self.assertTrue(ck._recomputed,
                        "CachedKernel doesn't recompute on kernel change")
        rk.params.sigma = 3.5
        rk.compute(d)
        self.assertTrue(np.all(rk._k == ck._k),
                        'Cached and rbf kernels disagree after kernel change')

        # Now test handling new data
        d2 = Dataset(np.random.randn(32, 43))
        ck.compute(d2)
        self.assertTrue(
            ck._recomputed,
            "CachedKernel did not automatically recompute new data")
        # The original dataset has to be recomputed, since the cache now
        # holds the kernel for d2
        ck.compute(d)
        self.assertTrue(ck._recomputed,
                        "CachedKernel did not recompute old data which had\n" +\
                        "previously been computed, but had the cache overriden")