Python MeanShiftDataGenerator Examples

Programming Language: Python

Namespace/Package Name: shogun.Features

Examples at hotexamples.com: 7

Python MeanShiftDataGenerator - 7 examples found. These are the top rated real world Python examples of shogun.Features.MeanShiftDataGenerator extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

MeanShiftDataGenerator(4)

get_streamed_features(4)

Example #1

Show file

File: statistics_quadratic_time_mmd.py Project: theoryno3/shogun

def quadratic_time_mmd_graphical():
	
	# parameters, change to get different results
	m=100
	dim=2
	
	# setting the difference of the first dimension smaller makes a harder test
	difference=0.5
	
	# number of samples taken from null and alternative distribution
	num_null_samples=500
	
	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)
	
	# Stream examples and merge them in order to compute MMD on joint sample
	# alternative is to call a different constructor of QuadraticTimeMMD
	features=gen_p.get_streamed_features(m)
	features=features.create_merged_copy(gen_q.get_streamed_features(m))
	
	# use the median kernel selection
	# create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is
	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	sigmas=[2**x for x in range(-3,10)]
	widths=[x*x*2 for x in sigmas]
	print "kernel widths:", widths
	combined=CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# create MMD instance, use biased statistic
	mmd=QuadraticTimeMMD(combined,features, m)
	mmd.set_statistic_type(BIASED)
	
	# kernel selection instance (this can easily replaced by the other methods for selecting
	# single kernels
	selection=MMDKernelSelectionMax(mmd)

	# perform kernel selection
	kernel=selection.select_kernel()
	kernel=GaussianKernel.obtain_from_generic(kernel)
	mmd.set_kernel(kernel);
	print "selected kernel width:", kernel.get_width()
	
	# sample alternative distribution (new data each trial)
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		# Stream examples and merge them in order to replace in MMD
		features=gen_p.get_streamed_features(m)
		features=features.create_merged_copy(gen_q.get_streamed_features(m))
		mmd.set_p_and_q(features)
		alt_samples[i]=mmd.compute_statistic()
	
	# sample from null distribution
	# bootstrapping, biased statistic
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_statistic_type(BIASED)
	mmd.set_bootstrap_iterations(num_null_samples)
	null_samples_boot=mmd.bootstrap_null()
	
	# sample from null distribution
	# spectrum, biased statistic
	if "sample_null_spectrum" in dir(QuadraticTimeMMD):
			mmd.set_null_approximation_method(MMD2_SPECTRUM)
			mmd.set_statistic_type(BIASED)
			null_samples_spectrum=mmd.sample_null_spectrum(num_null_samples, m-10)
			
	# fit gamma distribution, biased statistic
	mmd.set_null_approximation_method(MMD2_GAMMA)
	mmd.set_statistic_type(BIASED)
	gamma_params=mmd.fit_null_gamma()
	# sample gamma with parameters
	null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])
	
	# to plot data, sample a few examples from stream first
	features=gen_p.get_streamed_features(m)
	features=features.create_merged_copy(gen_q.get_streamed_features(m))
	data=features.get_feature_matrix()
	
	# plot
	figure()
	title('Quadratic Time MMD')
	
	# plot data of p and q
	subplot(2,3,1)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
	plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
	title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
	xlabel('$x_1, y_1$')
	ylabel('$x_2, y_2$')
	
	# histogram of first data dimension and pdf
	subplot(2,3,2)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3 )) # reduce number of x-ticks
	hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
	hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
	xs=linspace(min(data[0])-1,max(data[0])+1, 50)
	plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
	plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
	xlabel('$x_1, y_1$')
	ylabel('$p(x_1), p(y_1)$')
	title('Data PDF in $x_1, y_1$')
	
	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_spectrum.sort()
	null_samples_gamma.sort()
	thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
	thresh_spectrum=null_samples_spectrum[floor(len(null_samples_spectrum)*(1-alpha))];
	thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))];
	
	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_spectrum=sum(null_samples_spectrum<thresh_boot)/float(num_null_samples)
	type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples)
	
	# plot alternative distribution with threshold
	subplot(2,3,4)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
	
	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_spectrum), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_spectrum), max(null_samples_gamma)])]
	
	# plot null distribution with threshold
	subplot(2,3,3)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3 )) # reduce number of x-ticks
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Bootstrapped Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))
	grid(True)
	
	# plot null distribution spectrum
	subplot(2,3,5)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_spectrum, 20, range=hist_range, normed=True);
	axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
	title('Null Dist. Spectrum\nType I error is '  + str(type_one_error_spectrum))
	
	# plot null distribution gamma
	subplot(2,3,6)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_gamma, 20, range=hist_range, normed=True);
	axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gamma\nType I error is '  + str(type_one_error_gamma))
	
	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)

Example #2

Show file

File: statistics_quadratic_time_mmd.py Project: Kalep/shogun

def statistics_quadratic_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, IntVector

    # note that the quadratic time mmd has to store kernel matrices
    # which upper bounds the sample size
    n = 100
    dim = 2
    difference = 0.5

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute median on joint sample
    # alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(n)
    features = features.create_merged_copy(gen_q.get_streamed_features(n))

    # use data generator class to produce example data
    data = features.get_feature_matrix()

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n : 2 * n + 1].T)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    # Use a permutation set to temporarily merge features in merged examples
    subset = IntVector.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance ** 2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # only can do this if SHOGUN was compiled with LAPACK so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # spectrum, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 200 samples using 100 eigenvalues
    null_samples = mmd.sample_null_spectrum(50, 10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

Example #3

Show file

def statistics_quadratic_time_mmd (m,dim,difference):
	from shogun.Features import RealFeatures
	from shogun.Features import MeanShiftDataGenerator
	from shogun.Kernel import GaussianKernel, CustomKernel
	from shogun.Statistics import QuadraticTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
	from shogun.Mathematics import Statistics, IntVector, RealVector, Math
	
	# init seed for reproducability
	Math.init_random(1)

	# number of examples kept low in order to make things fast

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim);
	gen_q=MeanShiftDataGenerator(difference, dim);

	# stream some data from generator
	feat_p=gen_p.get_streamed_features(m);
	feat_q=gen_q.get_streamed_features(m);

	# set kernel a-priori. usually one would do some kernel selection. See
	# other examples for this.
	width=10;
	kernel=GaussianKernel(10, width);

	# create quadratic time mmd instance. Note that this constructor
	# copies p and q and does not reference them
	mmd=QuadraticTimeMMD(kernel, feat_p, feat_q);

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	alpha=0.05;
	
	# using bootstrapping (slow, not the most reliable way. Consider pre-
	# computing the kernel when using it, see below).
	# Also, in practice, use at least 250 iterations
	mmd.set_null_approximation_method(BOOTSTRAP);
	mmd.set_bootstrap_iterations(3);
	p_value_boot=mmd.perform_test();
	# reject if p-value is smaller than test level
	#print "bootstrap: p!=q: ", p_value_boot<alpha

	# using spectrum method. Use at least 250 samples from null.
	# This is consistent but sometimes breaks, always monitor type I error.
	# See tutorial for number of eigenvalues to use .
	# Only works with BIASED statistic
	mmd.set_statistic_type(BIASED);
	mmd.set_null_approximation_method(MMD2_SPECTRUM);
	mmd.set_num_eigenvalues_spectrum(3);
	mmd.set_num_samples_sepctrum(250);
	p_value_spectrum=mmd.perform_test();
	# reject if p-value is smaller than test level
	#print "spectrum: p!=q: ", p_value_spectrum<alpha

	# using gamma method. This is a quick hack, which works most of the time
	# but is NOT guaranteed to. See tutorial for details.
	# Only works with BIASED statistic
	mmd.set_statistic_type(BIASED);
	mmd.set_null_approximation_method(MMD2_GAMMA);
	p_value_gamma=mmd.perform_test();
	# reject if p-value is smaller than test level
	#print "gamma: p!=q: ", p_value_gamma<alpha

	# compute tpye I and II error (use many more trials in practice).
	# Type I error is not necessary if one uses bootstrapping. We do it here
	# anyway, but note that this is an efficient way of computing it.
	# Also note that testing has to happen on
	# difference data than kernel selection, but the linear time mmd does this
	# implicitly and we used a fixed kernel here.
	mmd.set_null_approximation_method(BOOTSTRAP);
	mmd.set_bootstrap_iterations(5);
	num_trials=5;
	type_I_errors=RealVector(num_trials);
	type_II_errors=RealVector(num_trials);
	inds=int32(array([x for x in range(2*m)])) # numpy
	p_and_q=mmd.get_p_and_q();

	# use a precomputed kernel to be faster
	kernel.init(p_and_q, p_and_q);
	precomputed=CustomKernel(kernel);
	mmd.set_kernel(precomputed);
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is tpye I error
		inds=random.permutation(inds) # numpy permutation
		precomputed.add_row_subset(inds);
		precomputed.add_col_subset(inds);
		type_I_errors[i]=mmd.perform_test()>alpha;
		precomputed.remove_row_subset();
		precomputed.remove_col_subset();

		# on normal data, this gives type II error
		type_II_errors[i]=mmd.perform_test()>alpha;
		
	return type_I_errors.get(),type_I_errors.get(),p_value_boot,p_value_spectrum,p_value_gamma,

Example #4

Show file

File: statistics_linear_time_mmd.py Project: Argram/shogun

def statistics_linear_time_mmd (n,dim,difference):
	from shogun.Features import RealFeatures
	from shogun.Features import MeanShiftDataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# init seed for reproducability
	Math.init_random(1)

	# note that the linear time statistic is designed for much larger datasets
	# so increase to get reasonable results

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	
	# Stream examples and merge them in order to compute median on joint sample
	features=gen_p.get_streamed_features(100)
	features=features.create_merged_copy(gen_q.get_streamed_features(100))
	
	# compute all pairwise distances
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	
	# compute median and determine kernel width (using shogun)
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	#print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	# mmd instance using streaming features, blocksize of 10000
	mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	#print "test statistic:", statistic
	
	# do the same thing using two different way to approximate null-dstribution
	# bootstrapping and gaussian approximation (ony for really large samples)
	alpha=0.05

	#print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
	p_value_boot=mmd.compute_p_value(statistic)
	#print "p_value_boot:", p_value_boot
	#print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha
	
	#print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value_gaussian=mmd.compute_p_value(statistic)
	#print "p_value_gaussian:", p_value_gaussian
	#print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
	null_samples=mmd.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	
	# compute type I and type II errors for Gaussian approximation
	# number of trials should be larger to compute tight confidence bounds
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	num_trials=5;
	alpha=0.05 # test power
	typeIerrors=[0 for x in range(num_trials)]
	typeIIerrors=[0 for x in range(num_trials)]
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is tpye I error
		mmd.set_simulate_h0(True)
		typeIerrors[i]=mmd.perform_test()>alpha
		mmd.set_simulate_h0(False)
		
		typeIIerrors[i]=mmd.perform_test()>alpha
	
	#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)
	
	return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors

Example #5

Show file

File: statistics_linear_time_mmd.py Project: Argram/shogun

def linear_time_mmd_graphical():

	
	# parameters, change to get different results
	m=1000 # set to 10000 for a good test result
	dim=2
	
	# setting the difference of the first dimension smaller makes a harder test
	difference=1
	
	# number of samples taken from null and alternative distribution
	num_null_samples=150
	
	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)
	
	# use the median kernel selection
	# create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is
	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	sigmas=[2**x for x in range(-3,10)]
	widths=[x*x*2 for x in sigmas]
	print "kernel widths:", widths
	combined=CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# mmd instance using streaming features, blocksize of 10000
	block_size=1000
	mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)
	
	# kernel selection instance (this can easily replaced by the other methods for selecting
	# single kernels
	selection=MMDKernelSelectionOpt(mmd)

	# perform kernel selection
	kernel=selection.select_kernel()
	kernel=GaussianKernel.obtain_from_generic(kernel)
	mmd.set_kernel(kernel);
	print "selected kernel width:", kernel.get_width()
	
	# sample alternative distribution, stream ensures different samples each run
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		alt_samples[i]=mmd.compute_statistic()
	
	# sample from null distribution
	# bootstrapping, biased statistic
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(num_null_samples)
	null_samples_boot=mmd.bootstrap_null()
	
	# fit normal distribution to null and sample a normal distribution
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	variance=mmd.compute_variance_estimate()
	null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)
	
	# to plot data, sample a few examples from stream first
	features=gen_p.get_streamed_features(m)
	features=features.create_merged_copy(gen_q.get_streamed_features(m))
	data=features.get_feature_matrix()
	
	# plot
	figure()
	
	# plot data of p and q
	subplot(2,3,1)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
	plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
	title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
	xlabel('$x_1, y_1$')
	ylabel('$x_2, y_2$')
	
	# histogram of first data dimension and pdf
	subplot(2,3,2)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
	hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
	xs=linspace(min(data[0])-1,max(data[0])+1, 50)
	plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
	plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
	xlabel('$x_1, y_1$')
	ylabel('$p(x_1), p(y_1)$')
	title('Data PDF in $x_1, y_1$')
	
	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gaussian.sort()
	thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
	thresh_gaussian=null_samples_gaussian[floor(len(null_samples_gaussian)*(1-alpha))];
	
	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_gaussian=sum(null_samples_gaussian<thresh_boot)/float(num_null_samples)
	
	# plot alternative distribution with threshold
	subplot(2,3,4)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))
	
	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)])]
	
	# plot null distribution with threshold
	subplot(2,3,3)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Bootstrapped Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))
	
	# plot null distribution gaussian
	subplot(2,3,5)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_gaussian, 20, range=hist_range, normed=True);
	axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gaussian\nType I error is '  + str(type_one_error_gaussian))
	
	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)

Example #6

Show file

def statistics_linear_time_mmd(n, dim, difference):
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducability
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # bootstrapping and gaussian approximation (ony for really large samples)
    alpha = 0.05

    #print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(
        10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power
    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is tpye I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)

        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors

Example #7

Show file

File: statistics_linear_time_mmd.py Project: theoryno3/shogun

def linear_time_mmd_graphical():

    # parameters, change to get different results
    m = 1000  # set to 10000 for a good test result
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 1

    # number of samples taken from null and alternative distribution
    num_null_samples = 150

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # use the median kernel selection
    # create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is
    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, blocksize of 10000
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily replaced by the other methods for selecting
    # single kernels
    selection = MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution, stream ensures different samples each run
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance = mmd.compute_variance_estimate()
    null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m],
         data[1][m + 1:2 * m],
         'bo',
         label='$x$',
         alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    thresh_boot = null_samples_boot[floor(
        len(null_samples_boot) * (1 - alpha))]
    thresh_gaussian = null_samples_gaussian[floor(
        len(null_samples_gaussian) * (1 - alpha))]

    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gaussian = sum(
        null_samples_gaussian < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot),
             min(null_samples_gaussian)]),
        max([max(null_samples_boot),
             max(null_samples_gaussian)])
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_gaussian, 20, range=hist_range, normed=True)
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is ' +
          str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)