def test_cached_kernel_different_datasets(self): skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455) # Inspired by the problem Swaroop ran into k = LinearSGKernel(normalizer_cls=False) k_ = LinearSGKernel(normalizer_cls=False) # to be cached ck = CachedKernel(k_) clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1) clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1) cvte = CrossValidation(clf, NFoldPartitioner()) cvte_ = CrossValidation(clf_, NFoldPartitioner()) postproc=BinaryFxNode(mean_mismatch_error, 'targets') te = ProxyMeasure(clf, postproc=postproc) te_ = ProxyMeasure(clf_, postproc=postproc) for r in xrange(2): ds1 = datasets['uni2medium'] errs1 = cvte(ds1) ck.compute(ds1) ok_(ck._recomputed) errs1_ = cvte_(ds1) ok_(~ck._recomputed) assert_array_equal(errs1, errs1_) ds2 = datasets['uni3small'] errs2 = cvte(ds2) ck.compute(ds2) ok_(ck._recomputed) errs2_ = cvte_(ds2) ok_(~ck._recomputed) assert_array_equal(errs2, errs2_) ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int) te.train(datasets['uni3small'][::2]) terr = np.asscalar(te(datasets['uni3small'][ssel])) te_.train(datasets['uni3small'][::2]) terr_ = np.asscalar(te_(datasets['uni3small'][ssel])) ok_(~ck._recomputed) ok_(terr == terr_)
def test_cached_kernel_different_datasets(self): skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455) # Inspired by the problem Swaroop ran into k = LinearSGKernel(normalizer_cls=False) k_ = LinearSGKernel(normalizer_cls=False) # to be cached ck = CachedKernel(k_) clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1) clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1) cvte = CrossValidatedTransferError( TransferError(clf), NFoldSplitter()) cvte_ = CrossValidatedTransferError( TransferError(clf_), NFoldSplitter()) te = TransferError(clf) te_ = TransferError(clf_) for r in xrange(2): ds1 = datasets['uni2medium'] errs1 = cvte(ds1) ck.compute(ds1) ok_(ck._recomputed) errs1_ = cvte_(ds1) ok_(~ck._recomputed) assert_array_equal(errs1, errs1_) ds2 = datasets['uni3small'] errs2 = cvte(ds2) ck.compute(ds2) ok_(ck._recomputed) errs2_ = cvte_(ds2) ok_(~ck._recomputed) assert_array_equal(errs2, errs2_) ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int) terr = te(datasets['uni3small_test'][ssel], datasets['uni3small_train'][::2]) terr_ = te_(datasets['uni3small_test'][ssel], datasets['uni3small_train'][::2]) ok_(~ck._recomputed) ok_(terr == terr_)
def test_vstack_and_origids_issue(self): # That is actually what swaroop hit skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455) # Inspired by the problem Swaroop ran into k = LinearSGKernel(normalizer_cls=False) k_ = LinearSGKernel(normalizer_cls=False) # to be cached ck = CachedKernel(k_) clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1) clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1) cvte = CrossValidatedTransferError( TransferError(clf), NFoldSplitter()) cvte_ = CrossValidatedTransferError( TransferError(clf_), NFoldSplitter()) ds = datasets['uni2large_test'].copy(deep=True) ok_(~('orig_ids' in ds.sa)) # assure that there are None ck.compute(ds) # so we initialize origids ok_('origids' in ds.sa) ds2 = ds.copy(deep=True) ds2.samples = np.zeros(ds2.shape) from mvpa.base.dataset import vstack ds_vstacked = vstack((ds2, ds)) # should complaint now since there would not be unique # samples' origids if __debug__: assert_raises(ValueError, ck.compute, ds_vstacked) ds_vstacked.init_origids('samples') # reset origids ck.compute(ds_vstacked) errs = cvte(ds_vstacked) errs_ = cvte_(ds_vstacked) # Following test would have failed since origids # were just ints, and then non-unique after vstack assert_array_equal(errs.samples, errs_.samples)
def test_cache_speedup(self): skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455) ck = sgSVM(kernel=CachedKernel(kernel=RbfSGKernel(sigma=2)), C=1) sk = sgSVM(kernel=RbfSGKernel(sigma=2), C=1) cv_c = CrossValidatedTransferError(TransferError(ck), splitter=NFoldSplitter()) cv_s = CrossValidatedTransferError(TransferError(sk), splitter=NFoldSplitter()) #data = datasets['uni4large'] P = 5000 data = normal_feature_dataset(snr=2, perlabel=200, nchunks=10, means=np.random.randn(2, P), nfeatures=P) t0 = time() ck.params.kernel.compute(data) cachetime = time() - t0 t0 = time() cached_err = cv_c(data) ccv_time = time() - t0 t0 = time() norm_err = cv_s(data) ncv_time = time() - t0 assert_almost_equal(np.asanyarray(cached_err), np.asanyarray(norm_err)) ok_(cachetime < ncv_time) ok_(ccv_time < ncv_time) #print 'Regular CV time: %s seconds'%ncv_time #print 'Caching time: %s seconds'%cachetime #print 'Cached CV time: %s seconds'%ccv_time speedup = ncv_time / (ccv_time + cachetime) #print 'Speedup factor: %s'%speedup # Speedup ideally should be 10, though it's not purely linear self.failIf(speedup < 2, 'Problem caching data - too slow!')
def test_cached_kernel_different_datasets(self): skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455) # Inspired by the problem Swaroop ran into k = LinearSGKernel(normalizer_cls=False) k_ = LinearSGKernel(normalizer_cls=False) # to be cached ck = CachedKernel(k_) clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1) clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1) cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter()) cvte_ = CrossValidatedTransferError(TransferError(clf_), NFoldSplitter()) te = TransferError(clf) te_ = TransferError(clf_) for r in xrange(2): ds1 = datasets['uni2medium'] errs1 = cvte(ds1) ck.compute(ds1) ok_(ck._recomputed) errs1_ = cvte_(ds1) ok_(~ck._recomputed) assert_array_equal(errs1, errs1_) ds2 = datasets['uni3small'] errs2 = cvte(ds2) ck.compute(ds2) ok_(ck._recomputed) errs2_ = cvte_(ds2) ok_(~ck._recomputed) assert_array_equal(errs2, errs2_) ssel = np.round(datasets['uni2large'].samples[:5, 0]).astype(int) terr = te(datasets['uni3small_test'][ssel], datasets['uni3small_train'][::2]) terr_ = te_(datasets['uni3small_test'][ssel], datasets['uni3small_train'][::2]) ok_(~ck._recomputed) ok_(terr == terr_)
def test_vstack_and_origids_issue(self): # That is actually what swaroop hit skip_if_no_external('shogun', ver_dep='shogun:rev', min_version=4455) # Inspired by the problem Swaroop ran into k = LinearSGKernel(normalizer_cls=False) k_ = LinearSGKernel(normalizer_cls=False) # to be cached ck = CachedKernel(k_) clf = sgSVM(svm_impl='libsvm', kernel=k, C=-1) clf_ = sgSVM(svm_impl='libsvm', kernel=ck, C=-1) cvte = CrossValidatedTransferError(TransferError(clf), NFoldSplitter()) cvte_ = CrossValidatedTransferError(TransferError(clf_), NFoldSplitter()) ds = datasets['uni2large_test'].copy(deep=True) ok_(~('orig_ids' in ds.sa)) # assure that there are None ck.compute(ds) # so we initialize origids ok_('origids' in ds.sa) ds2 = ds.copy(deep=True) ds2.samples = np.zeros(ds2.shape) from mvpa.base.dataset import vstack ds_vstacked = vstack((ds2, ds)) # should complaint now since there would not be unique # samples' origids if __debug__: assert_raises(ValueError, ck.compute, ds_vstacked) ds_vstacked.init_origids('samples') # reset origids ck.compute(ds_vstacked) errs = cvte(ds_vstacked) errs_ = cvte_(ds_vstacked) # Following test would have failed since origids # were just ints, and then non-unique after vstack assert_array_equal(errs.samples, errs_.samples)
def test_cached_kernel(self): nchunks = 5 n = 50 * nchunks d = Dataset(np.random.randn(n, 132)) d.sa.chunks = np.random.randint(nchunks, size=n) # We'll compare against an Rbf just because it has a parameter to change rk = npK.RbfKernel(sigma=1.5) # Assure two kernels are independent for this test ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5)) ck.compute(d) # Initial cache of all data self.failUnless(ck._recomputed, "CachedKernel was not initially computed") # Try some splitting for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]: rk.compute(chunk) ck.compute(chunk) self.kernel_equiv(rk, ck) # , accuracy=1e-12) self.failIf(ck._recomputed, "CachedKernel incorrectly recomputed it's kernel") # Test what happens when a parameter changes ck.params.sigma = 3.5 ck.compute(d) self.failUnless(ck._recomputed, "CachedKernel doesn't recompute on kernel change") rk.params.sigma = 3.5 rk.compute(d) self.failUnless(np.all(rk._k == ck._k), "Cached and rbf kernels disagree after kernel change") # Now test handling new data d2 = Dataset(np.random.randn(32, 43)) ck.compute(d2) self.failUnless(ck._recomputed, "CachedKernel did not automatically recompute new data") ck.compute(d) self.failUnless( ck._recomputed, "CachedKernel did not recompute old data which had\n" + "previously been computed, but had the cache overriden", )
def test_cached_kernel(self): nchunks = 5 n = 50 * nchunks d = Dataset(np.random.randn(n, 132)) d.sa.chunks = np.random.randint(nchunks, size=n) # We'll compare against an Rbf just because it has a parameter to change rk = npK.RbfKernel(sigma=1.5) # Assure two kernels are independent for this test ck = CachedKernel(kernel=npK.RbfKernel(sigma=1.5)) ck.compute(d) # Initial cache of all data self.failUnless(ck._recomputed, 'CachedKernel was not initially computed') # Try some splitting for chunk in [d[d.sa.chunks == i] for i in range(nchunks)]: rk.compute(chunk) ck.compute(chunk) self.kernel_equiv(rk, ck) #, accuracy=1e-12) self.failIf(ck._recomputed, "CachedKernel incorrectly recomputed it's kernel") # Test what happens when a parameter changes ck.params.sigma = 3.5 ck.compute(d) self.failUnless(ck._recomputed, "CachedKernel doesn't recompute on kernel change") rk.params.sigma = 3.5 rk.compute(d) self.failUnless(np.all(rk._k == ck._k), 'Cached and rbf kernels disagree after kernel change') # Now test handling new data d2 = Dataset(np.random.randn(32, 43)) ck.compute(d2) self.failUnless( ck._recomputed, "CachedKernel did not automatically recompute new data") ck.compute(d) self.failUnless(ck._recomputed, "CachedKernel did not recompute old data which had\n" +\ "previously been computed, but had the cache overriden")