def statistics_kmm (n,d):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel, MSG_DEBUG
    from modshogun import KernelMeanMatching
    from modshogun import Math
    from numpy import random, array, int32

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1);

    data = random.randn(d,n)

    # create shogun feature representation
    features=RealFeatures(data)

    # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
    # which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
    kernel=GaussianKernel(10,8)
    kernel.init(features,features)

    kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
    w = kmm.compute_weights()
    #print w
    return w
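# The width comment above is easy to misread, so here is a minimal, Shogun-free
# sketch of the conversion it describes (sigma=2 is the value used above; the
# helper name is purely illustrative and not part of the original examples):
def _example_shogun_gaussian_width(sigma=2.0):
    import numpy as np
    tau = 2 * sigma ** 2    # SHOGUN's "width": k(x,y)=exp(-||x-y||^2 / tau), 8.0 here
    # sanity check: both parametrizations give the same kernel value for a toy pair
    x, y = np.array([1.0, 0.0]), np.array([0.0, 1.0])
    d2 = np.sum((x - y) ** 2)
    assert np.isclose(np.exp(-d2 / tau), np.exp(-d2 / (2 * sigma ** 2)))
    return tau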
def kernel_io_modular(train_fname=traindat, test_fname=testdat, width=1.9):
    from modshogun import RealFeatures, GaussianKernel, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()
    f = CSVFile("tmp/gaussian_train.csv", "w")
    kernel.save(f)
    del f

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    f = CSVFile("tmp/gaussian_test.csv", "w")
    kernel.save(f)
    del f

    # clean up
    import os
    os.unlink("tmp/gaussian_test.csv")
    os.unlink("tmp/gaussian_train.csv")
    return km_train, km_test, kernel
def kernel_gaussian_modular (train_fname=traindat,test_fname=testdat, width=1.3):
    from modshogun import RealFeatures, GaussianKernel, CSVFile

    feats_train=RealFeatures(CSVFile(train_fname))
    feats_test=RealFeatures(CSVFile(test_fname))

    kernel=GaussianKernel(feats_train, feats_train, width)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
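# A minimal usage sketch of the function above; the CSV file names are placeholders
# (the example suite normally passes its shared traindat/testdat toy files).
def _example_kernel_gaussian_usage():
    km_train, km_test, kernel = kernel_gaussian_modular('fm_train_real.csv',
                                                        'fm_test_real.csv',
                                                        width=1.3)
    # km_train is n_train x n_train, km_test is n_train x n_test
    return km_train, km_test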
def kernel_sparse_gaussian_modular (fm_train_real=traindat,fm_test_real=testdat,width=1.1 ):
    from modshogun import SparseRealFeatures
    from modshogun import GaussianKernel

    feats_train=SparseRealFeatures(fm_train_real)
    feats_test=SparseRealFeatures(fm_test_real)

    kernel=GaussianKernel(feats_train, feats_train, width)
    km_train=kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def classifier_multiclasslibsvm_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import MulticlassLibSVM

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    kernel=GaussianKernel(feats_train, feats_train, width)

    labels=MulticlassLabels(label_train_multiclass)

    svm=MulticlassLibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
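# A hedged toy call of the classifier above with random data; the shapes and class
# labels are illustrative only (the example suite normally uses the shared
# traindat/testdat/label_traindat files).
def _example_multiclasslibsvm_toy_run():
    from numpy import random, array
    X_train = random.randn(2, 30)
    X_test = random.randn(2, 10)
    y_train = array([float(i % 3) for i in range(30)])   # three classes: 0, 1, 2
    return classifier_multiclasslibsvm_modular(X_train, X_test, y_train)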
def regression_kernel_ridge_modular (n=100,n_test=100, \
        x_range=6,x_range_test=10,noise_var=0.5,width=1, tau=1e-6, seed=1):
    from modshogun import RegressionLabels, RealFeatures
    from modshogun import GaussianKernel
    from modshogun import KernelRidgeRegression

    # reproducible results
    random.seed(seed)

    # easy regression data: one dimensional noisy sine wave
    n=15
    n_test=100
    x_range_test=10
    noise_var=0.5;
    X=random.rand(1,n)*x_range

    X_test=array([[float(i)/n_test*x_range_test for i in range(n_test)]])
    Y_test=sin(X_test)
    Y=sin(X)+random.randn(n)*noise_var

    # shogun representation
    labels=RegressionLabels(Y[0])
    feats_train=RealFeatures(X)
    feats_test=RealFeatures(X_test)

    kernel=GaussianKernel(feats_train, feats_train, width)

    krr=KernelRidgeRegression(tau, kernel, labels)
    krr.train(feats_train)

    kernel.init(feats_train, feats_test)
    out = krr.apply().get_labels()

    # plot results
    #plot(X[0],Y[0],'x') # training observations
    #plot(X_test[0],Y_test[0],'-') # ground truth of test
    #plot(X_test[0],out, '-') # mean predictions of test
    #legend(["training", "ground truth", "mean predictions"])
    #show()

    return out,kernel,krr
def regression_libsvr_modular (svm_c=1, svr_param=0.1, n=100,n_test=100, \
        x_range=6,x_range_test=10,noise_var=0.5,width=1, seed=1):
    from modshogun import RegressionLabels, RealFeatures
    from modshogun import GaussianKernel
    from modshogun import LibSVR, LIBSVR_NU_SVR, LIBSVR_EPSILON_SVR

    # reproducible results
    random.seed(seed)

    # easy regression data: one dimensional noisy sine wave
    n=15
    n_test=100
    x_range_test=10
    noise_var=0.5;
    X=random.rand(1,n)*x_range

    X_test=array([[float(i)/n_test*x_range_test for i in range(n_test)]])
    Y_test=sin(X_test)
    Y=sin(X)+random.randn(n)*noise_var

    # shogun representation
    labels=RegressionLabels(Y[0])
    feats_train=RealFeatures(X)
    feats_test=RealFeatures(X_test)

    kernel=GaussianKernel(feats_train, feats_train, width)

    # two svr models: epsilon and nu
    svr_epsilon=LibSVR(svm_c, svr_param, kernel, labels, LIBSVR_EPSILON_SVR)
    svr_epsilon.train()
    svr_nu=LibSVR(svm_c, svr_param, kernel, labels, LIBSVR_NU_SVR)
    svr_nu.train()

    # predictions
    kernel.init(feats_train, feats_test)
    out1_epsilon=svr_epsilon.apply().get_labels()
    out2_epsilon=svr_epsilon.apply(feats_test).get_labels()
    out1_nu=svr_nu.apply().get_labels()
    out2_nu=svr_nu.apply(feats_test).get_labels()

    return out1_epsilon,out2_epsilon,out1_nu,out2_nu ,kernel
def classifier_multiclassmachine_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    feats_train=RealFeatures(fm_train_real)
    feats_test=RealFeatures(fm_test_real)
    kernel=GaussianKernel(feats_train, feats_train, width)

    labels=MulticlassLabels(label_train_multiclass)

    classifier = LibSVM()
    classifier.set_epsilon(epsilon)
    #print labels.get_labels()
    mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(),kernel,classifier,labels)
    mc_classifier.train()

    kernel.init(feats_train, feats_test)
    out = mc_classifier.apply().get_labels()
    return out
from modshogun import GaussianKernel
from modshogun import LibSVM, LDA
from modshogun import ROCEvaluation
import util

util.set_title('ROC example')
util.DISTANCE=0.5
subplots_adjust(hspace=0.3)

pos=util.get_realdata(True)
neg=util.get_realdata(False)
features=util.get_realfeatures(pos, neg)
labels=util.get_labels()

# classifiers
gk=GaussianKernel(features, features, 1.0)
svm = LibSVM(1000.0, gk, labels)
svm.train()
lda=LDA(1,features,labels)
lda.train()

## plot points
subplot(211)
plot(pos[0,:], pos[1,:], "r.")
plot(neg[0,:], neg[1,:], "b.")
grid(True)
title('Data',size=10)

# plot ROC for SVM
subplot(223)
ROC_evaluation=ROCEvaluation()
def statistics_quadratic_time_mmd (m,dim,difference):
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel, CustomKernel
    from modshogun import QuadraticTimeMMD
    from modshogun import PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, BIASED_DEPRECATED
    from modshogun import Statistics, IntVector, RealVector, Math

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(17)

    # number of examples kept low in order to make things fast

    # streaming data generator for mean shift distributions
    gen_p=MeanShiftDataGenerator(0, dim);
    #gen_p.parallel.set_num_threads(1)
    gen_q=MeanShiftDataGenerator(difference, dim);

    # stream some data from generator
    feat_p=gen_p.get_streamed_features(m);
    feat_q=gen_q.get_streamed_features(m);

    # set kernel a-priori. usually one would do some kernel selection. See
    # other examples for this.
    width=10;
    kernel=GaussianKernel(10, width);

    # create quadratic time mmd instance. Note that this constructor
    # copies p and q and does not reference them
    mmd=QuadraticTimeMMD(kernel, feat_p, feat_q);

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    alpha=0.05;

    # using permutation (slow, not the most reliable way. Consider pre-
    # computing the kernel when using it, see below).
    # Also, in practice, use at least 250 iterations
    mmd.set_null_approximation_method(PERMUTATION);
    mmd.set_num_null_samples(3);
    p_value_null=mmd.perform_test();
    # reject if p-value is smaller than test level
    #print "bootstrap: p!=q: ", p_value_null<alpha

    # using spectrum method. Use at least 250 samples from null.
    # This is consistent but sometimes breaks, always monitor type I error.
    # See tutorial for number of eigenvalues to use.
    mmd.set_statistic_type(BIASED);
    mmd.set_null_approximation_method(MMD2_SPECTRUM);
    mmd.set_num_eigenvalues_spectrum(3);
    mmd.set_num_samples_spectrum(250);
    p_value_spectrum=mmd.perform_test();
    # reject if p-value is smaller than test level
    #print "spectrum: p!=q: ", p_value_spectrum<alpha

    # using gamma method. This is a quick hack, which works most of the time
    # but is NOT guaranteed to. See tutorial for details.
    # Only works with BIASED_DEPRECATED statistic
    mmd.set_statistic_type(BIASED_DEPRECATED);
    mmd.set_null_approximation_method(MMD2_GAMMA);
    p_value_gamma=mmd.perform_test();
    # reject if p-value is smaller than test level
    #print "gamma: p!=q: ", p_value_gamma<alpha

    # compute type I and II error (use many more trials in practice).
    # Type I error is not necessary if one uses permutation. We do it here
    # anyway, but note that this is an efficient way of computing it.
    # Also note that testing has to happen on
    # different data than kernel selection, but the linear time mmd does this
    # implicitly and we used a fixed kernel here.
    mmd.set_statistic_type(BIASED);
    mmd.set_null_approximation_method(PERMUTATION);
    mmd.set_num_null_samples(5);
    num_trials=5;
    type_I_errors=RealVector(num_trials);
    type_II_errors=RealVector(num_trials);
    inds=int32(array([x for x in range(2*m)])) # numpy
    p_and_q=mmd.get_p_and_q();

    # use a precomputed kernel to be faster
    kernel.init(p_and_q, p_and_q);
    precomputed=CustomKernel(kernel);
    mmd.set_kernel(precomputed);
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is type I error
        inds=random.permutation(inds) # numpy permutation
        precomputed.add_row_subset(inds);
        precomputed.add_col_subset(inds);
        type_I_errors[i]=mmd.perform_test()>alpha;
        precomputed.remove_row_subset();
        precomputed.remove_col_subset();

        # on normal data, this gives type II error
        type_II_errors[i]=mmd.perform_test()>alpha;

    return type_I_errors.get(),type_II_errors.get(),p_value_null,p_value_spectrum,p_value_gamma,
def serialization_svmlight_modular(num, dist, width, C):
    from modshogun import MSG_DEBUG
    from modshogun import RealFeatures, BinaryLabels, DNA, Alphabet
    from modshogun import WeightedDegreeStringKernel, GaussianKernel
    try:
        from modshogun import SVMLight
    except ImportError:
        print("SVMLight not available")
        exit(0)

    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    import pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """
        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """
        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file
    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check
    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing objectives")
    svm2.train()
    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())

    #print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in range(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    #print("all checks passed.")
    return True
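# The save/load helpers above are plain bz2+pickle; this is a minimal standalone
# sketch of the same round trip (no Shogun involved; file name and toy object are
# illustrative only).
def _example_bz2_pickle_roundtrip(path='roundtrip.bz2'):
    import bz2, pickle, os
    obj = {'weights': [0.1, 0.2], 'bias': -1.0}
    with bz2.BZ2File(path, 'wb') as f:
        pickle.dump(obj, f, protocol=2)
    with bz2.BZ2File(path, 'rb') as f:
        restored = pickle.load(f)
    os.unlink(path)
    assert restored == obj
    return restored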
def quadratic_time_mmd_graphical():

    # parameters, change to get different results
    m=100
    dim=2

    # setting the difference of the first dimension smaller makes a harder test
    difference=0.5

    # number of samples taken from null and alternative distribution
    num_null_samples=500

    # streaming data generator for mean shift distributions
    gen_p=MeanShiftDataGenerator(0, dim)
    gen_q=MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute MMD on joint sample
    # alternative is to call a different constructor of QuadraticTimeMMD
    features=gen_p.get_streamed_features(m)
    features=features.create_merged_copy(gen_q.get_streamed_features(m))

    # use the median kernel selection
    # create combined kernel with Gaussian kernels inside (shogun's Gaussian kernel is
    # different to the standard form, see documentation)
    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    sigmas=[2**x for x in range(-3,10)]
    widths=[x*x*2 for x in sigmas]
    print "kernel widths:", widths
    combined=CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # create MMD instance, use biased statistic
    mmd=QuadraticTimeMMD(combined,features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (this can easily be replaced by the other methods for
    # selecting single kernels)
    selection=MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel=selection.select_kernel()
    kernel=GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel);
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution (new data each trial)
    alt_samples=zeros(num_null_samples)
    for i in range(len(alt_samples)):
        # Stream examples and merge them in order to replace in MMD
        features=gen_p.get_streamed_features(m)
        features=features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i]=mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot=mmd.bootstrap_null()

    # sample from null distribution
    # spectrum, biased statistic
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum=mmd.sample_null_spectrum(num_null_samples, m-10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params=mmd.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

    # to plot data, sample a few examples from stream first
    features=gen_p.get_streamed_features(m)
    features=features.create_merged_copy(gen_q.get_streamed_features(m))
    data=features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2,3,1)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2,3,2)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs=linspace(min(data[0])-1,max(data[0])+1, 50)
    plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
    plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha=0.05
    null_samples_boot.sort()
    null_samples_spectrum.sort()
    null_samples_gamma.sort()
    thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
    thresh_spectrum=null_samples_spectrum[floor(len(null_samples_spectrum)*(1-alpha))];
    thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))];

    type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
    type_one_error_spectrum=sum(null_samples_spectrum<thresh_boot)/float(num_null_samples)
    type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2,3,4)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(alt_samples, 20, normed=True);
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range=[min([min(null_samples_boot), min(null_samples_spectrum), min(null_samples_gamma)]),
                max([max(null_samples_boot), max(null_samples_spectrum), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2,3,3)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True);
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))
    grid(True)

    # plot null distribution spectrum
    subplot(2,3,5)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(null_samples_spectrum, 20, range=hist_range, normed=True);
    axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
    title('Null Dist. Spectrum\nType I error is ' + str(type_one_error_spectrum))

    # plot null distribution gamma
    subplot(2,3,6)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(null_samples_gamma, 20, range=hist_range, normed=True);
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def regression_gaussian_process_modelselection (n=100, n_test=100, \
        x_range=5, x_range_test=10, noise_var=0.4):
    from modshogun import RealFeatures, RegressionLabels
    from modshogun import GaussianKernel
    from modshogun import GradientModelSelection, ModelSelectionParameters
    from modshogun import GaussianLikelihood, ZeroMean, \
        ExactInferenceMethod, GaussianProcessRegression, GradientCriterion, \
        GradientEvaluation

    # easy regression data: one dimensional noisy sine wave
    X_train = random.rand(1, n) * x_range
    X_test = array([[float(i) / n_test * x_range_test for i in range(n_test)]])
    y_test = sin(X_test)
    y_train = sin(X_train) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(y_train[0])
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)

    # GP specification
    kernel = GaussianKernel(10, 0.05)
    mean = ZeroMean()
    likelihood = GaussianLikelihood(0.8)
    inf = ExactInferenceMethod(kernel, feats_train, mean, labels, likelihood)
    inf.set_scale(2.5)
    gp = GaussianProcessRegression(inf)

    means = gp.get_mean_vector(feats_test)
    variances = gp.get_variance_vector(feats_test)

    # plot results
    figure()

    subplot(2, 1, 1)
    title('Initial parameter\'s values')

    plot(X_train[0], y_train[0], 'bx')  # training observations
    plot(X_test[0], y_test[0], 'g-')    # ground truth of test
    plot(X_test[0], means, 'r-')        # mean predictions of test

    fill_between(X_test[0], means - 1.96 * sqrt(variances),
                 means + 1.96 * sqrt(variances), color='grey')

    legend(["training", "ground truth", "mean predictions"])

    # evaluate our inference method for its derivatives
    grad = GradientEvaluation(gp, feats_train, labels, GradientCriterion(), False)
    grad.set_function(inf)

    # handles all of the above structures in memory
    grad_search = GradientModelSelection(grad)

    # search for best parameters
    best_combination = grad_search.select_model(True)

    # outputs all result and information
    best_combination.apply_to_machine(gp)

    means = gp.get_mean_vector(feats_test)
    variances = gp.get_variance_vector(feats_test)

    # plot results
    subplot(2, 1, 2)
    title('Selected by gradient search parameter\'s values')

    plot(X_train[0], y_train[0], 'bx')  # training observations
    plot(X_test[0], y_test[0], 'g-')    # ground truth of test
    plot(X_test[0], means, 'r-')        # mean predictions of test

    fill_between(X_test[0], means - 1.96 * sqrt(variances),
                 means + 1.96 * sqrt(variances), color='grey')

    legend(["training", "ground truth", "mean predictions"])

    show()
def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
    import os
    from numpy import concatenate, zeros, ones
    from numpy.random import randn, seed
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GMNPSVM
    from modshogun import GaussianKernel
    try:
        from modshogun import SerializableHdf5File,SerializableAsciiFile, \
            SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
    except ImportError:
        return
    from modshogun import NormOne, LogPlusOne

    seed(17)

    data=concatenate((randn(dim, num), randn(dim, num) + dist,
                      randn(dim, num) + 2*dist, randn(dim, num) + 3*dist), axis=1)
    lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

    feats=RealFeatures(data)
    #feats.io.set_loglevel(MSG_DEBUG)
    #feats.io.enable_file_and_line()
    kernel=GaussianKernel(feats, feats, width)

    labels=MulticlassLabels(lab)

    svm = GMNPSVM(C, kernel, labels)

    feats.add_preprocessor(NormOne())
    feats.add_preprocessor(LogPlusOne())
    feats.set_preprocessed(1)
    svm.train(feats)
    bias_ref = svm.get_svm(0).get_bias()

    #svm.print_serializable()

    fstream = SerializableHdf5File("blaah.h5", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'h5')

    fstream = SerializableAsciiFile("blaah.asc", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'asc')

    fstream = SerializableJsonFile("blaah.json", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'json')

    fstream = SerializableXmlFile("blaah.xml", "w")
    status = svm.save_serializable(fstream)
    check_status(status,'xml')

    fstream = SerializableHdf5File("blaah.h5", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'h5')
    new_svm.train()
    bias_h5 = new_svm.get_svm(0).get_bias()

    fstream = SerializableAsciiFile("blaah.asc", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'asc')
    new_svm.train()
    bias_asc = new_svm.get_svm(0).get_bias()

    fstream = SerializableJsonFile("blaah.json", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'json')
    new_svm.train()
    bias_json = new_svm.get_svm(0).get_bias()

    fstream = SerializableXmlFile("blaah.xml", "r")
    new_svm=GMNPSVM()
    status = new_svm.load_serializable(fstream)
    check_status(status,'xml')
    new_svm.train()
    bias_xml = new_svm.get_svm(0).get_bias()

    os.unlink("blaah.h5")
    os.unlink("blaah.asc")
    os.unlink("blaah.json")
    os.unlink("blaah.xml")

    return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml
def modelselection_grid_search_libsvr_modular (fm_train=traindat,fm_test=testdat,label_train=label_traindat,\
        width=2.1,C=1,epsilon=1e-5,tube_epsilon=1e-2):
    from modshogun import CrossValidation, CrossValidationResult
    from modshogun import MeanSquaredError
    from modshogun import CrossValidationSplitting
    from modshogun import RegressionLabels
    from modshogun import RealFeatures
    from modshogun import GaussianKernel
    from modshogun import LibSVR
    from modshogun import GridSearchModelSelection
    from modshogun import ModelSelectionParameters, R_EXP
    from modshogun import ParameterCombination

    # training data
    features_train = RealFeatures(traindat)
    labels = RegressionLabels(label_traindat)

    # kernel
    kernel = GaussianKernel(features_train, features_train, width)

    # print all parameters available for model selection
    # Don't worry if yours is not included, but write to the mailing list
    #kernel.print_modsel_params()

    labels = RegressionLabels(label_train)

    # predictor
    predictor = LibSVR(C, tube_epsilon, kernel, labels)
    predictor.set_epsilon(epsilon)

    # splitting strategy for 5 fold cross-validation (for classification it's better
    # to use "StratifiedCrossValidationSplitting", but the standard
    # "CrossValidationSplitting" is also available)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features_train, labels,
                                       splitting_strategy, evaluation_criterium)

    # (optional) repeat x-val (set larger to get better estimates, at least two
    # for confidence intervals)
    cross_validation.set_num_runs(2)

    # (optional) request 95% confidence intervals for results (not actually
    # needed for this toy example)
    cross_validation.set_conf_int_alpha(0.05)

    # print all parameters available for model selection
    # Don't worry if yours is not included, but write to the mailing list
    #predictor.print_modsel_params()

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-1.0, 0.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-1.0, 0.0, R_EXP)

    # model selection instance
    model_selection = GridSearchModelSelection(cross_validation, param_tree_root)

    # perform model selection with selected methods
    #print "performing model selection of"
    #print "parameter tree"
    #param_tree_root.print_tree()

    #print "starting model selection"
    # print the current parameter combination; if no parameter, nothing is printed
    print_state = False

    # lock data before, since model selection will not change the kernel matrix
    # (use with care). This avoids that the kernel matrix is recomputed in every
    # iteration of the model search
    predictor.data_lock(labels, features_train)
    best_parameters = model_selection.select_model(print_state)

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(predictor)
    result = cross_validation.evaluate()
def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat):
    from modshogun import CrossValidation, CrossValidationResult
    from modshogun import CrossValidationPrintOutput
    from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from modshogun import MulticlassAccuracy, F1Measure
    from modshogun import StratifiedCrossValidationSplitting
    from modshogun import MulticlassLabels
    from modshogun import RealFeatures, CombinedFeatures
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import MKLMulticlass
    from modshogun import Statistics, MSG_DEBUG, Math

    Math.init_random(1)

    # training data, combined features all on same data
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm; due to a mem-bug, interleaved is not possible
    svm=MKLMulticlass(1.0,kernel,labels);
    svm.set_kernel(kernel);

    # splitting strategy for cross-validation (for classification it's better
    # to use "StratifiedCrossValidationSplitting", but the standard
    # "CrossValidationSplitting" is also available)
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium=MulticlassAccuracy()

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels,
                                     splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage=CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result=cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
def statistics_linear_time_mmd(n, dim, difference):
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel
    from modshogun import LinearTimeMMD
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different ways to approximate the null-distribution:
    # sampling the null and a gaussian approximation (only for really large samples)
    alpha = 0.05

    #print "computing p-value using sampling null"
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test says p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test says p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(10)  # normally, far more iterations are needed
    null_samples = mmd.sample_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power

    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def RunKPCAShogun(q):
    totalTimer = Timer()

    try:
        # Load input dataset.
        Log.Info("Loading dataset", self.verbose)
        data = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(data.T)

        with totalTimer:
            # Get the new dimensionality, if it is necessary.
            if "new_dimensionality" in options:
                d = int(options.pop("new_dimensionality"))
                if (d > data.shape[1]):
                    Log.Fatal("New dimensionality (" + str(d) + ") cannot be greater "
                        + "than existing dimensionality (" + str(data.shape[1]) + ")!")
                    q.put(-1)
                    return -1
            else:
                d = data.shape[1]

            # Get the kernel type and make sure it is valid.
            if "kernel" in options:
                kernel = str(options.pop("kernel"))
            else:
                Log.Fatal("Choose kernel type, valid choices are 'linear',"
                    + " 'hyptan', 'polynomial' and 'gaussian'.")
                q.put(-1)
                return -1

            if "degree" in options:
                degree = int(options.pop("degree"))

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            if kernel == "polynomial":
                kernel = PolyKernel(dataFeat, dataFeat, degree, True)
            elif kernel == "gaussian":
                kernel = GaussianKernel(dataFeat, dataFeat, 2.0)
            elif kernel == "linear":
                kernel = LinearKernel(dataFeat, dataFeat)
            elif kernel == "hyptan":
                kernel = SigmoidKernel(dataFeat, dataFeat, 2, 1.0, 1.0)
            else:
                Log.Fatal("Invalid kernel type (" + str(kernel) + "); valid "
                    + "choices are 'linear', 'hyptan', 'polynomial' and 'gaussian'.")
                q.put(-1)
                return -1

            # Perform Kernel Principal Components Analysis.
            model = KernelPCA(kernel)
            model.set_target_dim(d)
            model.init(dataFeat)
            model.apply_to_feature_matrix(dataFeat)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def predict_new_data(graph_file, cons_file, tri_file, other_feature_file):
    print 'reading extracted features'
    graph_feature = read_feature_data(graph_file)
    graph_feature = get_normalized_given_max_min(graph_feature, 'models/grtaph_max_size')
    cons_feature = read_feature_data(cons_file)
    cons_feature = get_normalized_given_max_min(cons_feature, 'models/cons_max_size')
    CC_feature = read_feature_data(tri_file)
    CC_feature = get_normalized_given_max_min(CC_feature, 'models/tri_max_size')
    ATOS_feature = read_feature_data(other_feature_file)
    ATOS_feature = get_normalized_given_max_min(ATOS_feature, 'models/alu_max_size')

    width, C, epsilon, num_threads, mkl_epsilon, mkl_norm = 0.5, 1.2, 1e-5, 1, 0.001, 3.5
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()
    #pdb.set_trace()

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(graph_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/graph.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(cons_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/cons.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(CC_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/tri.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures()
    subkfeats_test = RealFeatures(np.transpose(np.array(ATOS_feature)))
    subkernel = GaussianKernel(10, width)
    feats_test.append_feature_obj(subkfeats_test)
    fstream = SerializableAsciiFile("models/alu.dat", "r")
    status = subkfeats_train.load_serializable(fstream)
    feats_train.append_feature_obj(subkfeats_train)
    kernel.append_kernel(subkernel)

    model_file = "models/mkl.dat"
    if not os.path.exists(model_file):
        print 'downloading model file'
        url_add = 'http://rth.dk/resources/mirnasponge/data/mkl.dat'
        urllib.urlretrieve(url_add, model_file)
    print 'loading trained model'
    fstream = SerializableAsciiFile("models/mkl.dat", "r")
    new_mkl = MKLClassification()
    status = new_mkl.load_serializable(fstream)

    print 'model predicting'
    kernel.init(feats_train, feats_test)
    new_mkl.set_kernel(kernel)
    y_out = new_mkl.apply().get_labels()

    return y_out
# of the authors and should not be interpreted as representing official policies,
# either expressed or implied, of the Shogun Development Team.

import argparse
import logging
from contextlib import contextmanager, closing

from modshogun import (LibSVMFile, GaussianKernel, MulticlassLibSVM,
                       SerializableHdf5File, LinearKernel)

from utils import get_features_and_labels, track_execution

LOGGER = logging.getLogger(__file__)

KERNELS = {
    'linear': lambda feats, width: LinearKernel(feats, feats),
    'gaussian': lambda feats, width: GaussianKernel(feats, feats, width),
}

def parse_arguments():
    parser = argparse.ArgumentParser(description="Train a multiclass SVM \
            stored in libsvm format")
    parser.add_argument('--dataset', required=True, type=str,
                        help='Path to training dataset in LibSVM format.')
    parser.add_argument('--capacity', default=1.0, type=float,
                        help='SVM capacity parameter')
    parser.add_argument('--width', default=2.1, type=float,
                        help='Width of the Gaussian Kernel to approximate')
    parser.add_argument('--epsilon', default=0.01, type=float,
                        help='SVMOcas epsilon parameter')
    parser.add_argument('--kernel', type=str, default='linear',
                        choices=['linear', 'gaussian'],
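# Hypothetical helper showing how the KERNELS lookup above is meant to be used;
# `feats` stands in for whatever feature object the (not shown) rest of the script
# builds, and the width value is only passed through to the Gaussian kernel.
def _make_kernel(feats, name='gaussian', width=2.1):
    return KERNELS[name](feats, width)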
def linear_time_mmd_graphical():

    # parameters, change to get different results
    m=1000 # set to 10000 for a good test result
    dim=2

    # setting the difference of the first dimension smaller makes a harder test
    difference=1

    # number of samples taken from null and alternative distribution
    num_null_samples=150

    # streaming data generator for mean shift distributions
    gen_p=MeanShiftDataGenerator(0, dim)
    gen_q=MeanShiftDataGenerator(difference, dim)

    # use the median kernel selection
    # create combined kernel with Gaussian kernels inside (shogun's Gaussian kernel is
    # different to the standard form, see documentation)
    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    sigmas=[2**x for x in range(-3,10)]
    widths=[x*x*2 for x in sigmas]
    print "kernel widths:", widths
    combined=CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, blocksize of 10000
    block_size=1000
    mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other methods for
    # selecting single kernels)
    selection=MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel=selection.select_kernel()
    kernel=GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel);
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution, stream ensures different samples each run
    alt_samples=zeros(num_null_samples)
    for i in range(len(alt_samples)):
        alt_samples[i]=mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(num_null_samples)
    null_samples_boot=mmd.sample_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance=mmd.compute_variance_estimate()
    null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)

    # to plot data, sample a few examples from stream first
    features=gen_p.get_streamed_features(m)
    features=features.create_merged_copy(gen_q.get_streamed_features(m))
    data=features.get_feature_matrix()

    # plot
    figure()

    # plot data of p and q
    subplot(2,3,1)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2,3,2)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs=linspace(min(data[0])-1,max(data[0])+1, 50)
    plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
    plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha=0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
    thresh_gaussian=null_samples_gaussian[floor(len(null_samples_gaussian)*(1-alpha))];

    type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
    type_one_error_gaussian=sum(null_samples_gaussian<thresh_boot)/float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2,3,4)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(alt_samples, 20, normed=True);
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]),
                max([max(null_samples_boot), max(null_samples_gaussian)])]

    # plot null distribution with threshold
    subplot(2,3,3)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True);
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2,3,5)
    grid(True)
    gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
    gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
    hist(null_samples_gaussian, 20, range=hist_range, normed=True);
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def gaussian_process_binary_classification_laplace(X_train, y_train, n_test=50):
    # import all necessary modules from Shogun (some of them require Eigen3)
    try:
        from modshogun import RealFeatures, BinaryLabels, GaussianKernel, \
            LogitLikelihood, ProbitLikelihood, ZeroMean, LaplacianInferenceMethod, \
            EPInferenceMethod, GaussianProcessClassification
    except ImportError:
        print('Eigen3 needed for Gaussian Processes')
        return

    # convert training data into Shogun representation
    train_features = RealFeatures(X_train)
    train_labels = BinaryLabels(y_train)

    # generate all pairs in 2d range of testing data
    x1 = linspace(X_train[0, :].min() - 1, X_train[0, :].max() + 1, n_test)
    x2 = linspace(X_train[1, :].min() - 1, X_train[1, :].max() + 1, n_test)
    X_test = asarray(list(product(x1, x2))).T

    # convert testing features into Shogun representation
    test_features = RealFeatures(X_test)

    # create Gaussian kernel with width = 2.0
    kernel = GaussianKernel(10, 2.0)

    # create zero mean function
    mean = ZeroMean()

    # you can easily switch between probit and logit likelihood models
    # by uncommenting/commenting the following lines:

    # create probit likelihood model
    # lik = ProbitLikelihood()

    # create logit likelihood model
    lik = LogitLikelihood()

    # you can easily switch between Laplace and EP approximation by
    # uncommenting/commenting the following lines:

    # specify Laplace approximation inference method
    # inf = LaplacianInferenceMethod(kernel, train_features, mean, train_labels, lik)

    # specify EP approximation inference method
    inf = EPInferenceMethod(kernel, train_features, mean, train_labels, lik)

    # create and train GP classifier, which uses the chosen approximation
    gp = GaussianProcessClassification(inf)
    gp.train()

    # get probabilities p(y*=1|x*) for each testing feature x*
    p_test = gp.get_probabilities(test_features)

    # create figure
    figure()
    title('Training examples, predictive probability and decision boundary')

    # plot training data
    plot(X_train[0, argwhere(y_train == 1)], X_train[1, argwhere(y_train == 1)], 'ro')
    plot(X_train[0, argwhere(y_train == -1)], X_train[1, argwhere(y_train == -1)], 'bo')

    # plot decision boundary
    contour(x1, x2, reshape(p_test, (n_test, n_test)), levels=[0.5], colors=('black'))

    # plot probabilities
    pcolor(x1, x2, reshape(p_test, (n_test, n_test)))

    # show color bar
    colorbar()

    # show figure
    show()
def statistics_hsic(n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # for reproducible results (the numpy one might not be reproducible across
    # different OS/Python-distributions)
    Math.init_random(1)
    np.random.seed(1)

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(np.array([data[0]]))
    features_y = RealFeatures(np.array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = np.random.permutation(features_x.get_num_vectors()).astype(np.int32)
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2

    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha = 0.05

    #print "computing p-value using sampling null"
    hsic.set_null_approximation_method(PERMUTATION)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_num_null_samples(100)
    # sampling null allows usage of unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot<alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # sampling null, biased statistic
    #print "sampling null distribution using sample_null"
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()
    #print "null mean:", np.mean(null_samples)
    #print "null variance:", np.var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method):
    from modshogun import RealFeatures
    from modshogun import GaussianBlobsDataGenerator
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import LinearTimeMMD
    from modshogun import MMDKernelSelectionMedian
    from modshogun import MMDKernelSelectionMax
    from modshogun import MMDKernelSelectionOpt
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # results for this low number will be bad (unstable, type I error wrong)
    m=1000
    distance=10
    stretch=5
    num_blobs=3
    angle=pi/4

    # streaming data generator
    gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # stream some data and plot
    num_plot=1000
    features=gen_p.get_streamed_features(num_plot)
    features=features.create_merged_copy(gen_q.get_streamed_features(num_plot))
    data=features.get_feature_matrix()

    #figure()
    #subplot(2,2,1)
    #grid(True)
    #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
    #title('$X\sim p$')
    #subplot(2,2,2)
    #grid(True)
    #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
    #title('$Y\sim q$')

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian kernel is
    # different to the standard form, see documentation)
    sigmas=[2**x for x in range(-3,10)]
    widths=[x*x*2 for x in sigmas]
    combined=CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, blocksize of 10000
    block_size=1000
    mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other methods for
    # selecting single kernels)
    if selection_method=="opt":
        selection=MMDKernelSelectionOpt(mmd)
    elif selection_method=="max":
        selection=MMDKernelSelectionMax(mmd)
    elif selection_method=="median":
        selection=MMDKernelSelectionMedian(mmd)

    # print measures (just for information)
    # in case Opt: ratios of MMD and standard deviation
    # in case Max: MMDs for each kernel
    # Does not work for median method
    if selection_method!="median":
        ratios=selection.compute_measures()
        #print "Measures:", ratios

    #subplot(2,2,3)
    #plot(ratios)
    #title('Measures')

    # perform kernel selection
    kernel=selection.select_kernel()
    kernel=GaussianKernel.obtain_from_generic(kernel)
    #print "selected kernel width:", kernel.get_width()

    # compute type I and II error (use many more trials). Type I error is only
    # estimated to check MMD1_GAUSSIAN method for estimating the null
    # distribution. Note that testing has to happen on different data than
    # kernel selecting, but the linear time mmd does this implicitly
    mmd.set_kernel(kernel)
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # number of trials should be larger to compute tight confidence bounds
    num_trials=5;
    alpha=0.05 # test power
    typeIerrors=[0 for x in range(num_trials)]
    typeIIerrors=[0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i]=mmd.perform_test()>alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[i]=mmd.perform_test()>alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return kernel,typeIerrors,typeIIerrors
def modelselection_parameter_tree_modular(dummy):
    from modshogun import ParameterCombination
    from modshogun import ModelSelectionParameters, R_EXP, R_LINEAR
    from modshogun import PowerKernel
    from modshogun import GaussianKernel
    from modshogun import DistantSegmentsKernel
    from modshogun import MinkowskiMetric

    root = ModelSelectionParameters()

    combinations = root.get_combinations()
    combinations.get_num_elements()

    c = ModelSelectionParameters('C')
    root.append_child(c)
    c.build_values(1, 11, R_EXP)

    power_kernel = PowerKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, but write to the mailing list
    #power_kernel.print_modsel_params()

    param_power_kernel = ModelSelectionParameters('kernel', power_kernel)
    root.append_child(param_power_kernel)

    param_power_kernel_degree = ModelSelectionParameters('degree')
    param_power_kernel_degree.build_values(1, 1, R_EXP)
    param_power_kernel.append_child(param_power_kernel_degree)

    metric1 = MinkowskiMetric(10)

    # print all parameters available for model selection
    # Don't worry if yours is not included, but write to the mailing list
    #metric1.print_modsel_params()

    param_power_kernel_metric1 = ModelSelectionParameters('distance', metric1)
    param_power_kernel.append_child(param_power_kernel_metric1)

    param_power_kernel_metric1_k = ModelSelectionParameters('k')
    param_power_kernel_metric1_k.build_values(1, 12, R_LINEAR)
    param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)

    gaussian_kernel = GaussianKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, but write to the mailing list
    #gaussian_kernel.print_modsel_params()

    param_gaussian_kernel = ModelSelectionParameters('kernel', gaussian_kernel)
    root.append_child(param_gaussian_kernel)

    param_gaussian_kernel_width = ModelSelectionParameters('width')
    param_gaussian_kernel_width.build_values(1, 2, R_EXP)
    param_gaussian_kernel.append_child(param_gaussian_kernel_width)

    ds_kernel = DistantSegmentsKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, but write to the mailing list
    #ds_kernel.print_modsel_params()

    param_ds_kernel = ModelSelectionParameters('kernel', ds_kernel)
    root.append_child(param_ds_kernel)

    param_ds_kernel_delta = ModelSelectionParameters('delta')
    param_ds_kernel_delta.build_values(1, 2, R_EXP)
    param_ds_kernel.append_child(param_ds_kernel_delta)

    param_ds_kernel_theta = ModelSelectionParameters('theta')
    param_ds_kernel_theta.build_values(1, 2, R_EXP)
    param_ds_kernel.append_child(param_ds_kernel_theta)

    # root.print_tree()
    combinations = root.get_combinations()
    # for i in range(combinations.get_num_elements()):
    #     combinations.get_element(i).print_tree()

    return
def statistics_quadratic_time_mmd(m, dim, difference):
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel, CustomKernel
    from modshogun import QuadraticTimeMMD
    from modshogun import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from modshogun import Statistics, IntVector, RealVector, Math

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(17)

    # number of examples kept low in order to make things fast

    # streaming data generators for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    #gen_p.parallel.set_num_threads(1)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # stream some data from the generators
    feat_p = gen_p.get_streamed_features(m)
    feat_q = gen_q.get_streamed_features(m)

    # set kernel a-priori. Usually one would do some kernel selection, see
    # other examples for this.
    width = 10
    kernel = GaussianKernel(10, width)

    # create quadratic time mmd instance. Note that this constructor
    # copies p and q and does not reference them
    mmd = QuadraticTimeMMD(kernel, feat_p, feat_q)

    # perform test: compute p-value and test if the null hypothesis is rejected for
    # a test level of 0.05
    alpha = 0.05

    # using bootstrapping (slow, not the most reliable way; consider pre-
    # computing the kernel when using it, see below).
    # Also, in practice, use at least 250 iterations
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(3)
    p_value_boot = mmd.perform_test()
    # reject if p-value is smaller than test level
    #print "bootstrap: p!=q: ", p_value_boot<alpha

    # using the spectrum method. Use at least 250 samples from the null distribution.
    # This is consistent but sometimes breaks, so always monitor the type I error.
    # See the tutorial for the number of eigenvalues to use.
    # Only works with the BIASED statistic
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_num_eigenvalues_spectrum(3)
    mmd.set_num_samples_sepctrum(250)
    p_value_spectrum = mmd.perform_test()
    # reject if p-value is smaller than test level
    #print "spectrum: p!=q: ", p_value_spectrum<alpha

    # using the gamma method. This is a quick hack which works most of the time
    # but is NOT guaranteed to. See the tutorial for details.
    # Only works with the BIASED statistic
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_GAMMA)
    p_value_gamma = mmd.perform_test()
    # reject if p-value is smaller than test level
    #print "gamma: p!=q: ", p_value_gamma<alpha

    # compute type I and II errors (use many more trials in practice).
    # Type I error is not necessary if one uses bootstrapping. We do it here
    # anyway, but note that this is an efficient way of computing it.
    # Also note that testing has to happen on different data than kernel
    # selection, but the linear time mmd does this implicitly and we used a
    # fixed kernel here.
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(5)
    num_trials = 5
    type_I_errors = RealVector(num_trials)
    type_II_errors = RealVector(num_trials)
    inds = int32(array([x for x in range(2 * m)]))  # numpy
    p_and_q = mmd.get_p_and_q()

    # use a precomputed kernel to be faster
    kernel.init(p_and_q, p_and_q)
    precomputed = CustomKernel(kernel)
    mmd.set_kernel(precomputed)
    for i in range(num_trials):
        # this effectively means that p=q - rejecting (p-value below alpha) is a type I error
        inds = random.permutation(inds)  # numpy permutation
        precomputed.add_row_subset(inds)
        precomputed.add_col_subset(inds)
        type_I_errors[i] = mmd.perform_test() < alpha
        precomputed.remove_row_subset()
        precomputed.remove_col_subset()

        # on normal data, failing to reject gives the type II error
        type_II_errors[i] = mmd.perform_test() > alpha

    return type_I_errors.get(), type_II_errors.get(), p_value_boot, p_value_spectrum, p_value_gamma
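# A numpy-only sketch (not Shogun code; names and the Gaussian width are illustrative) of
# what the biased quadratic-time MMD^2 estimate and its permutation ("bootstrap") p-value
# compute, up to Shogun's internal scaling of the statistic. X and Y are (n, d) arrays
# with one sample per row.
import numpy as np

def gaussian_gram(A, B, tau):
    # k(x, y) = exp(-||x-y||^2 / tau), matching Shogun's width parametrization
    d2 = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2 * A.dot(B.T)
    return np.exp(-np.maximum(d2, 0) / tau)

def mmd2_biased(X, Y, tau):
    # biased estimate: mean(Kxx) + mean(Kyy) - 2*mean(Kxy)
    Kxx = gaussian_gram(X, X, tau)
    Kyy = gaussian_gram(Y, Y, tau)
    Kxy = gaussian_gram(X, Y, tau)
    return Kxx.mean() + Kyy.mean() - 2 * Kxy.mean()

def permutation_p_value(X, Y, tau, num_permutations=250):
    # p-value: fraction of permuted statistics that reach the observed one
    stat = mmd2_biased(X, Y, tau)
    Z = np.vstack([X, Y])
    null = np.empty(num_permutations)
    for i in range(num_permutations):
        perm = np.random.permutation(len(Z))
        null[i] = mmd2_biased(Z[perm[:len(X)]], Z[perm[len(X):]], tau)
    return np.mean(null >= stat)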
def quadratic_time_mmd_graphical():
    # NOTE: as in the original graphical example, the numpy/pylab plotting names
    # (figure, plot, hist, zeros, gamma, normpdf, MaxNLocator, ...) and the modshogun
    # classes used below are assumed to be imported at module level.

    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # streaming data generators for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # stream examples and merge them in order to compute the MMD on the joint sample;
    # an alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))

    # kernel selection: instead of the median heuristic (compute the median data distance
    # and use 0.5*2*median_distance^2 as Shogun width on a subset of ~200 elements, see
    # the HSIC example below), build a combined kernel with a range of Gaussian kernels
    # inside and pick the one maximizing the MMD. Note that Shogun's Gaussian kernel is
    # parametrized differently from the standard form: width = 2*sigma^2.
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (this can easily be replaced by the other methods for
    # selecting single kernels)
    selection = MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        # stream examples and merge them in order to replace the data in the MMD instance
        features = gen_p.get_streamed_features(m)
        features = features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # sample from null distribution
    # spectrum, biased statistic
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(num_null_samples, m - 10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()
    # sample gamma with fitted parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # to plot data, sample a few examples from the stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m], data[1][m + 1:2 * m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute rejection thresholds for the test level from the (1-alpha)-quantile
    # of each set of null samples
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_spectrum.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_spectrum = null_samples_spectrum[int(floor(len(null_samples_spectrum) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    # empirical type I error: fraction of null samples that reach the respective threshold
    type_one_error_boot = sum(null_samples_boot >= thresh_boot) / float(num_null_samples)
    type_one_error_spectrum = sum(null_samples_spectrum >= thresh_spectrum) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma >= thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_spectrum), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_spectrum), max(null_samples_gamma)])]

    # plot bootstrapped null distribution with threshold
    subplot(2, 3, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))
    grid(True)

    # plot spectrum null distribution
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_spectrum, 20, range=hist_range, normed=True)
    axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
    title('Null Dist. Spectrum\nType I error is ' + str(type_one_error_spectrum))

    # plot gamma null distribution
    subplot(2, 3, 6)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
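# A small numpy-only sketch (not Shogun code; function names are illustrative) of the
# threshold and error estimates used in the plots above: the rejection threshold is the
# (1-alpha)-quantile of the null samples, the type I error is the fraction of null
# samples at or above it, and the type II error is the fraction of alternative samples
# below it.
import numpy as np

def rejection_threshold(null_samples, alpha=0.05):
    # (1-alpha)-quantile of the sampled null distribution
    s = np.sort(null_samples)
    return s[int(np.floor(len(s) * (1.0 - alpha)))]

def empirical_errors(null_samples, alt_samples, alpha=0.05):
    thresh = rejection_threshold(null_samples, alpha)
    type_one = np.mean(null_samples >= thresh)  # reject although H0 holds
    type_two = np.mean(alt_samples < thresh)    # fail to reject although H1 holds
    return type_one, type_two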
def hsic_graphical():
    # NOTE: as in the original graphical example, the numpy/pylab plotting names and the
    # modshogun classes used below (HSIC, DataGenerator, RealFeatures, GaussianKernel,
    # EuclideanDistance, Statistics, PERMUTATION, HSIC_GAMMA, ...) are assumed to be
    # imported at module level.

    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi / 30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute the median data distance in order to use it for the Gaussian kernel width:
    # 0.5*median_distance normally (factor two in the Gaussian kernel). However, Shogun's
    # kernel width is parametrized differently, therefore use 0.5*2*median_distance^2.
    # Use a subset of the data for that, only 200 elements; the median is stable.
    subset = int32(array([x for x in range(features_x.get_num_vectors())]))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2

    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create hsic instance. Note that this is a convenience constructor which copies the
    # feature data: features_x and features_y are not the objects used inside hsic.
    # This is only for user-friendliness; usually it is fine to do this.
    # Below, the alternative distribution is sampled, which means
    # that new feature objects have to be created in each iteration (slow).
    # However, normally the alternative distribution is not sampled.
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create the hsic instance every time since feature objects are copied due to
        # the usage of the convenience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # permutation, biased statistic
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot = hsic.sample_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with fitted parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute rejection thresholds for the test level from the (1-alpha)-quantile
    # of each set of null samples
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    # empirical type I error: fraction of null samples that reach the respective threshold
    type_one_error_boot = sum(null_samples_boot >= thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma >= thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot sampled null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot gamma null distribution
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
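# A numpy-only sketch (not Shogun code; the function name and subset size are
# illustrative) of the median-distance heuristic used above, including the conversion
# to Shogun's width parametrization k(x,y) = exp(-||x-y||^2 / width), i.e.
# width = 0.5*2*median_distance^2 = median_distance^2 as in the comments above.
import numpy as np

def median_heuristic_shogun_width(X, subset_size=200, seed=1):
    # X: (d, n) data matrix with one sample per column, as in the example above
    rng = np.random.RandomState(seed)
    idx = rng.permutation(X.shape[1])[:subset_size]
    S = X[:, idx]
    d2 = np.sum(S**2, 0)[:, None] + np.sum(S**2, 0)[None, :] - 2 * S.T.dot(S)
    med = np.median(np.sqrt(np.maximum(d2, 0)))
    return med * med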
def RunKPCAShogun(q):
    # NOTE: this appears to be a nested helper from a benchmarking script; `self`,
    # `options`, `Timer`, `Log`, `np`, `re` and the Shogun classes used below are
    # expected to come from the enclosing scope and module-level imports.
    totalTimer = Timer()

    try:
        # Load input dataset.
        Log.Info("Loading dataset", self.verbose)
        data = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(data.T)

        with totalTimer:
            # Get the new dimensionality, if it is necessary.
            dimension = re.search('-d (\d+)', options)
            if not dimension:
                d = data.shape[1]
            else:
                d = int(dimension.group(1))
                if (d > data.shape[1]):
                    Log.Fatal("New dimensionality (" + str(d) + ") cannot be greater "
                              + "than existing dimensionality (" + str(data.shape[1]) + ")!")
                    q.put(-1)
                    return -1

            # Get the kernel type and make sure it is valid.
            kernel = re.search("-k ([^\s]+)", options)
            if not kernel:
                Log.Fatal("Choose kernel type, valid choices are 'linear',"
                          + " 'hyptan', 'polynomial' and 'gaussian'.")
                q.put(-1)
                return -1
            elif kernel.group(1) == "polynomial":
                degree = re.search('-D (\d+)', options)
                degree = 1 if not degree else int(degree.group(1))
                kernel = PolyKernel(dataFeat, dataFeat, degree, True)
            elif kernel.group(1) == "gaussian":
                kernel = GaussianKernel(dataFeat, dataFeat, 2.0)
            elif kernel.group(1) == "linear":
                kernel = LinearKernel(dataFeat, dataFeat)
            elif kernel.group(1) == "hyptan":
                kernel = SigmoidKernel(dataFeat, dataFeat, 2, 1.0, 1.0)
            else:
                Log.Fatal("Invalid kernel type (" + kernel.group(1) + "); valid "
                          + "choices are 'linear', 'hyptan', 'polynomial' and 'gaussian'.")
                q.put(-1)
                return -1

            # Perform Kernel Principal Components Analysis.
            model = KernelPCA(kernel)
            model.set_target_dim(d)
            model.init(dataFeat)
            model.apply_to_feature_matrix(dataFeat)
    except Exception as e:
        q.put(-1)
        return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
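# A minimal numpy sketch (not Shogun code; the function name and the width tau=2.0 are
# illustrative) of what KernelPCA with a Gaussian kernel computes: build the Gram matrix,
# center it, eigendecompose, and project onto the leading components.
import numpy as np

def kpca_gaussian(X, target_dim, tau=2.0):
    # X: (n, d) data with one sample per row; k(x,y) = exp(-||x-y||^2 / tau)
    n = X.shape[0]
    d2 = np.sum(X**2, 1)[:, None] + np.sum(X**2, 1)[None, :] - 2 * X.dot(X.T)
    K = np.exp(-np.maximum(d2, 0) / tau)
    H = np.eye(n) - np.ones((n, n)) / n          # centering matrix
    Kc = H.dot(K).dot(H)
    vals, vecs = np.linalg.eigh(Kc)              # eigenvalues in ascending order
    order = np.argsort(vals)[::-1][:target_dim]  # pick the largest ones
    alphas = vecs[:, order] / np.sqrt(np.maximum(vals[order], 1e-12))
    return Kc.dot(alphas)                        # projected data, shape (n, target_dim)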