def quadratic_time_mmd_graphical(): # parameters, change to get different results m=100 dim=2 # setting the difference of the first dimension smaller makes a harder test difference=0.5 # number of samples taken from null and alternative distribution num_null_samples=500 # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim) gen_q=MeanShiftDataGenerator(difference, dim) # Stream examples and merge them in order to compute MMD on joint sample # alternative is to call a different constructor of QuadraticTimeMMD features=gen_p.get_streamed_features(m) features=features.create_merged_copy(gen_q.get_streamed_features(m)) # use the median kernel selection # create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable sigmas=[2**x for x in range(-3,10)] widths=[x*x*2 for x in sigmas] print "kernel widths:", widths combined=CombinedKernel() for i in range(len(sigmas)): combined.append_kernel(GaussianKernel(10, widths[i])) # create MMD instance, use biased statistic mmd=QuadraticTimeMMD(combined,features, m) mmd.set_statistic_type(BIASED) # kernel selection instance (this can easily replaced by the other methods for selecting # single kernels selection=MMDKernelSelectionMax(mmd) # perform kernel selection kernel=selection.select_kernel() kernel=GaussianKernel.obtain_from_generic(kernel) mmd.set_kernel(kernel); print "selected kernel width:", kernel.get_width() # sample alternative distribution (new data each trial) alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): # Stream examples and merge them in order to replace in MMD features=gen_p.get_streamed_features(m) features=features.create_merged_copy(gen_q.get_streamed_features(m)) mmd.set_p_and_q(features) alt_samples[i]=mmd.compute_statistic() # sample from null distribution # bootstrapping, biased statistic mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_statistic_type(BIASED) mmd.set_bootstrap_iterations(num_null_samples) null_samples_boot=mmd.bootstrap_null() # sample from null distribution # spectrum, biased statistic if "sample_null_spectrum" in dir(QuadraticTimeMMD): mmd.set_null_approximation_method(MMD2_SPECTRUM) mmd.set_statistic_type(BIASED) null_samples_spectrum=mmd.sample_null_spectrum(num_null_samples, m-10) # fit gamma distribution, biased statistic mmd.set_null_approximation_method(MMD2_GAMMA) mmd.set_statistic_type(BIASED) gamma_params=mmd.fit_null_gamma() # sample gamma with parameters null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)]) # to plot data, sample a few examples from stream first features=gen_p.get_streamed_features(m) features=features.create_merged_copy(gen_q.get_streamed_features(m)) data=features.get_feature_matrix() # plot figure() title('Quadratic Time MMD') # plot data of p and q subplot(2,3,1) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks plot(data[0][0:m], data[1][0:m], 'ro', label='$x$') plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5) title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m)) xlabel('$x_1, y_1$') ylabel('$x_2, y_2$') # histogram of first data dimension and pdf subplot(2,3,2) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3 )) # reduce number of x-ticks hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True) hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True) xs=linspace(min(data[0])-1,max(data[0])+1, 50) plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3) plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3) xlabel('$x_1, y_1$') ylabel('$p(x_1), p(y_1)$') title('Data PDF in $x_1, y_1$') # compute threshold for test level alpha=0.05 null_samples_boot.sort() null_samples_spectrum.sort() null_samples_gamma.sort() thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))]; thresh_spectrum=null_samples_spectrum[floor(len(null_samples_spectrum)*(1-alpha))]; thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))]; type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples) type_one_error_spectrum=sum(null_samples_spectrum<thresh_boot)/float(num_null_samples) type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples) # plot alternative distribution with threshold subplot(2,3,4) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(alt_samples, 20, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range=[min([min(null_samples_boot), min(null_samples_spectrum), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_spectrum), max(null_samples_gamma)])] # plot null distribution with threshold subplot(2,3,3) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3 )) # reduce number of x-ticks hist(null_samples_boot, 20, range=hist_range, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) grid(True) # plot null distribution spectrum subplot(2,3,5) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(null_samples_spectrum, 20, range=hist_range, normed=True); axvline(thresh_spectrum, 0, 1, linewidth=2, color='red') title('Null Dist. Spectrum\nType I error is ' + str(type_one_error_spectrum)) # plot null distribution gamma subplot(2,3,6) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(null_samples_gamma, 20, range=hist_range, normed=True); axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)
def statistics_quadratic_time_mmd(): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import QuadraticTimeMMD from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, IntVector # note that the quadratic time mmd has to store kernel matrices # which upper bounds the sample size n = 100 dim = 2 difference = 0.5 # streaming data generator for mean shift distributions gen_p = MeanShiftDataGenerator(0, dim) gen_q = MeanShiftDataGenerator(difference, dim) # Stream examples and merge them in order to compute median on joint sample # alternative is to call a different constructor of QuadraticTimeMMD features = gen_p.get_streamed_features(n) features = features.create_merged_copy(gen_q.get_streamed_features(n)) # use data generator class to produce example data data = features.get_feature_matrix() print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n : 2 * n + 1].T) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Use a permutation set to temporarily merge features in merged examples subset = IntVector.randperm_vec(features.get_num_vectors()) subset = subset[0:200] features.add_subset(subset) dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() features.remove_subset() median_distance = Statistics.matrix_median(distances, True) sigma = median_distance ** 2 print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) mmd = QuadraticTimeMMD(kernel, features, n) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 using different methods to approximate # null-distribution statistic = mmd.compute_statistic() alpha = 0.05 print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) # normally, at least 250 iterations should be done, but that takes long mmd.set_bootstrap_iterations(10) # bootstrapping allows usage of unbiased or biased statistic mmd.set_statistic_type(UNBIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha # only can do this if SHOGUN was compiled with LAPACK so check if "sample_null_spectrum" in dir(QuadraticTimeMMD): print "computing p-value using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) # normally, at least 250 iterations should be done, but that takes long mmd.set_num_samples_sepctrum(50) mmd.set_num_eigenvalues_spectrum(n - 10) # spectrum method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha print "computing p-value using gamma method" mmd.set_null_approximation_method(MMD2_GAMMA) # gamma method computes p-value for biased statistics only mmd.set_statistic_type(BIASED) p_value = mmd.compute_p_value(statistic) print "p_value:", p_value print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # bootstrapping, biased statistic print "sampling null distribution using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_statistic_type(BIASED) mmd.set_bootstrap_iterations(10) null_samples = mmd.bootstrap_null() print "null mean:", mean(null_samples) print "null variance:", var(null_samples) # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel # spectrum, biased statistic print "sampling null distribution using spectrum method" mmd.set_null_approximation_method(MMD2_SPECTRUM) mmd.set_statistic_type(BIASED) # 200 samples using 100 eigenvalues null_samples = mmd.sample_null_spectrum(50, 10) print "null mean:", mean(null_samples) print "null variance:", var(null_samples)
def statistics_quadratic_time_mmd (m,dim,difference): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel, CustomKernel from shogun.Statistics import QuadraticTimeMMD from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED from shogun.Mathematics import Statistics, IntVector, RealVector, Math # init seed for reproducability Math.init_random(1) # number of examples kept low in order to make things fast # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim); gen_q=MeanShiftDataGenerator(difference, dim); # stream some data from generator feat_p=gen_p.get_streamed_features(m); feat_q=gen_q.get_streamed_features(m); # set kernel a-priori. usually one would do some kernel selection. See # other examples for this. width=10; kernel=GaussianKernel(10, width); # create quadratic time mmd instance. Note that this constructor # copies p and q and does not reference them mmd=QuadraticTimeMMD(kernel, feat_p, feat_q); # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 alpha=0.05; # using bootstrapping (slow, not the most reliable way. Consider pre- # computing the kernel when using it, see below). # Also, in practice, use at least 250 iterations mmd.set_null_approximation_method(BOOTSTRAP); mmd.set_bootstrap_iterations(3); p_value_boot=mmd.perform_test(); # reject if p-value is smaller than test level #print "bootstrap: p!=q: ", p_value_boot<alpha # using spectrum method. Use at least 250 samples from null. # This is consistent but sometimes breaks, always monitor type I error. # See tutorial for number of eigenvalues to use . # Only works with BIASED statistic mmd.set_statistic_type(BIASED); mmd.set_null_approximation_method(MMD2_SPECTRUM); mmd.set_num_eigenvalues_spectrum(3); mmd.set_num_samples_sepctrum(250); p_value_spectrum=mmd.perform_test(); # reject if p-value is smaller than test level #print "spectrum: p!=q: ", p_value_spectrum<alpha # using gamma method. This is a quick hack, which works most of the time # but is NOT guaranteed to. See tutorial for details. # Only works with BIASED statistic mmd.set_statistic_type(BIASED); mmd.set_null_approximation_method(MMD2_GAMMA); p_value_gamma=mmd.perform_test(); # reject if p-value is smaller than test level #print "gamma: p!=q: ", p_value_gamma<alpha # compute tpye I and II error (use many more trials in practice). # Type I error is not necessary if one uses bootstrapping. We do it here # anyway, but note that this is an efficient way of computing it. # Also note that testing has to happen on # difference data than kernel selection, but the linear time mmd does this # implicitly and we used a fixed kernel here. mmd.set_null_approximation_method(BOOTSTRAP); mmd.set_bootstrap_iterations(5); num_trials=5; type_I_errors=RealVector(num_trials); type_II_errors=RealVector(num_trials); inds=int32(array([x for x in range(2*m)])) # numpy p_and_q=mmd.get_p_and_q(); # use a precomputed kernel to be faster kernel.init(p_and_q, p_and_q); precomputed=CustomKernel(kernel); mmd.set_kernel(precomputed); for i in range(num_trials): # this effectively means that p=q - rejecting is tpye I error inds=random.permutation(inds) # numpy permutation precomputed.add_row_subset(inds); precomputed.add_col_subset(inds); type_I_errors[i]=mmd.perform_test()>alpha; precomputed.remove_row_subset(); precomputed.remove_col_subset(); # on normal data, this gives type II error type_II_errors[i]=mmd.perform_test()>alpha; return type_I_errors.get(),type_I_errors.get(),p_value_boot,p_value_spectrum,p_value_gamma,
def statistics_linear_time_mmd (n,dim,difference): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # init seed for reproducability Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # so increase to get reasonable results # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim) gen_q=MeanShiftDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features=gen_p.get_streamed_features(100) features=features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist=EuclideanDistance(features, features) distances=dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance=Statistics.matrix_median(distances, True) sigma=median_distance**2 #print "median distance for Gaussian kernel:", sigma kernel=GaussianKernel(10,sigma) # mmd instance using streaming features, blocksize of 10000 mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic=mmd.compute_statistic() #print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha=0.05 #print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed p_value_boot=mmd.compute_p_value(statistic) #print "p_value_boot:", p_value_boot #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha #print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value_gaussian=mmd.compute_p_value(statistic) #print "p_value_gaussian:", p_value_gaussian #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed null_samples=mmd.bootstrap_null() #print "null mean:", mean(null_samples) #print "null variance:", var(null_samples) # compute type I and type II errors for Gaussian approximation # number of trials should be larger to compute tight confidence bounds mmd.set_null_approximation_method(MMD1_GAUSSIAN) num_trials=5; alpha=0.05 # test power typeIerrors=[0 for x in range(num_trials)] typeIIerrors=[0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is tpye I error mmd.set_simulate_h0(True) typeIerrors[i]=mmd.perform_test()>alpha mmd.set_simulate_h0(False) typeIIerrors[i]=mmd.perform_test()>alpha #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def linear_time_mmd_graphical(): # parameters, change to get different results m=1000 # set to 10000 for a good test result dim=2 # setting the difference of the first dimension smaller makes a harder test difference=1 # number of samples taken from null and alternative distribution num_null_samples=150 # streaming data generator for mean shift distributions gen_p=MeanShiftDataGenerator(0, dim) gen_q=MeanShiftDataGenerator(difference, dim) # use the median kernel selection # create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable sigmas=[2**x for x in range(-3,10)] widths=[x*x*2 for x in sigmas] print "kernel widths:", widths combined=CombinedKernel() for i in range(len(sigmas)): combined.append_kernel(GaussianKernel(10, widths[i])) # mmd instance using streaming features, blocksize of 10000 block_size=1000 mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size) # kernel selection instance (this can easily replaced by the other methods for selecting # single kernels selection=MMDKernelSelectionOpt(mmd) # perform kernel selection kernel=selection.select_kernel() kernel=GaussianKernel.obtain_from_generic(kernel) mmd.set_kernel(kernel); print "selected kernel width:", kernel.get_width() # sample alternative distribution, stream ensures different samples each run alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): alt_samples[i]=mmd.compute_statistic() # sample from null distribution # bootstrapping, biased statistic mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(num_null_samples) null_samples_boot=mmd.bootstrap_null() # fit normal distribution to null and sample a normal distribution mmd.set_null_approximation_method(MMD1_GAUSSIAN) variance=mmd.compute_variance_estimate() null_samples_gaussian=normal(0,sqrt(variance),num_null_samples) # to plot data, sample a few examples from stream first features=gen_p.get_streamed_features(m) features=features.create_merged_copy(gen_q.get_streamed_features(m)) data=features.get_feature_matrix() # plot figure() # plot data of p and q subplot(2,3,1) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks plot(data[0][0:m], data[1][0:m], 'ro', label='$x$') plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5) title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m)) xlabel('$x_1, y_1$') ylabel('$x_2, y_2$') # histogram of first data dimension and pdf subplot(2,3,2) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True) hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True) xs=linspace(min(data[0])-1,max(data[0])+1, 50) plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3) plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3) xlabel('$x_1, y_1$') ylabel('$p(x_1), p(y_1)$') title('Data PDF in $x_1, y_1$') # compute threshold for test level alpha=0.05 null_samples_boot.sort() null_samples_gaussian.sort() thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))]; thresh_gaussian=null_samples_gaussian[floor(len(null_samples_gaussian)*(1-alpha))]; type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples) type_one_error_gaussian=sum(null_samples_gaussian<thresh_boot)/float(num_null_samples) # plot alternative distribution with threshold subplot(2,3,4) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(alt_samples, 20, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)])] # plot null distribution with threshold subplot(2,3,3) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(null_samples_boot, 20, range=hist_range, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gaussian subplot(2,3,5) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks hist(null_samples_gaussian, 20, range=hist_range, normed=True); axvline(thresh_gaussian, 0, 1, linewidth=2, color='red') title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian)) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)
def statistics_linear_time_mmd(n, dim, difference): from shogun.Features import RealFeatures from shogun.Features import MeanShiftDataGenerator from shogun.Kernel import GaussianKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN from shogun.Distance import EuclideanDistance from shogun.Mathematics import Statistics, Math # init seed for reproducability Math.init_random(1) # note that the linear time statistic is designed for much larger datasets # so increase to get reasonable results # streaming data generator for mean shift distributions gen_p = MeanShiftDataGenerator(0, dim) gen_q = MeanShiftDataGenerator(difference, dim) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable # Stream examples and merge them in order to compute median on joint sample features = gen_p.get_streamed_features(100) features = features.create_merged_copy(gen_q.get_streamed_features(100)) # compute all pairwise distances dist = EuclideanDistance(features, features) distances = dist.get_distance_matrix() # compute median and determine kernel width (using shogun) median_distance = Statistics.matrix_median(distances, True) sigma = median_distance**2 #print "median distance for Gaussian kernel:", sigma kernel = GaussianKernel(10, sigma) # mmd instance using streaming features, blocksize of 10000 mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000) # perform test: compute p-value and test if null-hypothesis is rejected for # a test level of 0.05 statistic = mmd.compute_statistic() #print "test statistic:", statistic # do the same thing using two different way to approximate null-dstribution # bootstrapping and gaussian approximation (ony for really large samples) alpha = 0.05 #print "computing p-value using bootstrapping" mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations( 50) # normally, far more iterations are needed p_value_boot = mmd.compute_p_value(statistic) #print "p_value_boot:", p_value_boot #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha #print "computing p-value using gaussian approximation" mmd.set_null_approximation_method(MMD1_GAUSSIAN) p_value_gaussian = mmd.compute_p_value(statistic) #print "p_value_gaussian:", p_value_gaussian #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha # sample from null distribution (these may be plotted or whatsoever) # mean should be close to zero, variance stronly depends on data/kernel mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations( 10) # normally, far more iterations are needed null_samples = mmd.bootstrap_null() #print "null mean:", mean(null_samples) #print "null variance:", var(null_samples) # compute type I and type II errors for Gaussian approximation # number of trials should be larger to compute tight confidence bounds mmd.set_null_approximation_method(MMD1_GAUSSIAN) num_trials = 5 alpha = 0.05 # test power typeIerrors = [0 for x in range(num_trials)] typeIIerrors = [0 for x in range(num_trials)] for i in range(num_trials): # this effectively means that p=q - rejecting is tpye I error mmd.set_simulate_h0(True) typeIerrors[i] = mmd.perform_test() > alpha mmd.set_simulate_h0(False) typeIIerrors[i] = mmd.perform_test() > alpha #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors) return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def linear_time_mmd_graphical(): # parameters, change to get different results m = 1000 # set to 10000 for a good test result dim = 2 # setting the difference of the first dimension smaller makes a harder test difference = 1 # number of samples taken from null and alternative distribution num_null_samples = 150 # streaming data generator for mean shift distributions gen_p = MeanShiftDataGenerator(0, dim) gen_q = MeanShiftDataGenerator(difference, dim) # use the median kernel selection # create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable sigmas = [2**x for x in range(-3, 10)] widths = [x * x * 2 for x in sigmas] print "kernel widths:", widths combined = CombinedKernel() for i in range(len(sigmas)): combined.append_kernel(GaussianKernel(10, widths[i])) # mmd instance using streaming features, blocksize of 10000 block_size = 1000 mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size) # kernel selection instance (this can easily replaced by the other methods for selecting # single kernels selection = MMDKernelSelectionOpt(mmd) # perform kernel selection kernel = selection.select_kernel() kernel = GaussianKernel.obtain_from_generic(kernel) mmd.set_kernel(kernel) print "selected kernel width:", kernel.get_width() # sample alternative distribution, stream ensures different samples each run alt_samples = zeros(num_null_samples) for i in range(len(alt_samples)): alt_samples[i] = mmd.compute_statistic() # sample from null distribution # bootstrapping, biased statistic mmd.set_null_approximation_method(BOOTSTRAP) mmd.set_bootstrap_iterations(num_null_samples) null_samples_boot = mmd.bootstrap_null() # fit normal distribution to null and sample a normal distribution mmd.set_null_approximation_method(MMD1_GAUSSIAN) variance = mmd.compute_variance_estimate() null_samples_gaussian = normal(0, sqrt(variance), num_null_samples) # to plot data, sample a few examples from stream first features = gen_p.get_streamed_features(m) features = features.create_merged_copy(gen_q.get_streamed_features(m)) data = features.get_feature_matrix() # plot figure() # plot data of p and q subplot(2, 3, 1) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins=4)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=4)) # reduce number of x-ticks plot(data[0][0:m], data[1][0:m], 'ro', label='$x$') plot(data[0][m + 1:2 * m], data[1][m + 1:2 * m], 'bo', label='$x$', alpha=0.5) title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m)) xlabel('$x_1, y_1$') ylabel('$x_2, y_2$') # histogram of first data dimension and pdf subplot(2, 3, 2) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True) hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True) xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50) plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3) plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3) xlabel('$x_1, y_1$') ylabel('$p(x_1), p(y_1)$') title('Data PDF in $x_1, y_1$') # compute threshold for test level alpha = 0.05 null_samples_boot.sort() null_samples_gaussian.sort() thresh_boot = null_samples_boot[floor( len(null_samples_boot) * (1 - alpha))] thresh_gaussian = null_samples_gaussian[floor( len(null_samples_gaussian) * (1 - alpha))] type_one_error_boot = sum( null_samples_boot < thresh_boot) / float(num_null_samples) type_one_error_gaussian = sum( null_samples_gaussian < thresh_boot) / float(num_null_samples) # plot alternative distribution with threshold subplot(2, 3, 4) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks hist(alt_samples, 20, normed=True) axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range = [ min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)]) ] # plot null distribution with threshold subplot(2, 3, 3) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks hist(null_samples_boot, 20, range=hist_range, normed=True) axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gaussian subplot(2, 3, 5) grid(True) gca().xaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins=3)) # reduce number of x-ticks hist(null_samples_gaussian, 20, range=hist_range, normed=True) axvline(thresh_gaussian, 0, 1, linewidth=2, color='red') title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian)) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)