def statistics_linear_time_mmd(n, dim, difference):
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel
    from modshogun import LinearTimeMMD
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # init seed for reproducability
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different way to approximate null-dstribution
    # sampling null and gaussian approximation (ony for really large samples)
    alpha = 0.05

    #print "computing p-value using sampling null"
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(10)  # normally, far more iterations are needed
    null_samples = mmd.sample_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power
    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is tpye I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)

        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
Esempio n. 2
0
def linear_time_mmd_graphical():


	# parameters, change to get different results
	m=1000 # set to 10000 for a good test result
	dim=2

	# setting the difference of the first dimension smaller makes a harder test
	difference=1

	# number of samples taken from null and alternative distribution
	num_null_samples=150

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)

	# use the median kernel selection
	# create combined kernel with Gaussian kernels inside (shoguns Gaussian kernel is
	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	sigmas=[2**x for x in range(-3,10)]
	widths=[x*x*2 for x in sigmas]
	print "kernel widths:", widths
	combined=CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# mmd instance using streaming features, blocksize of 10000
	block_size=1000
	mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

	# kernel selection instance (this can easily replaced by the other methods for selecting
	# single kernels
	selection=MMDKernelSelectionOpt(mmd)

	# perform kernel selection
	kernel=selection.select_kernel()
	kernel=GaussianKernel.obtain_from_generic(kernel)
	mmd.set_kernel(kernel);
	print "selected kernel width:", kernel.get_width()

	# sample alternative distribution, stream ensures different samples each run
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		alt_samples[i]=mmd.compute_statistic()

	# sample from null distribution
	# bootstrapping, biased statistic
	mmd.set_null_approximation_method(PERMUTATION)
	mmd.set_num_null_samples(num_null_samples)
	null_samples_boot=mmd.sample_null()

	# fit normal distribution to null and sample a normal distribution
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	variance=mmd.compute_variance_estimate()
	null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)

	# to plot data, sample a few examples from stream first
	features=gen_p.get_streamed_features(m)
	features=features.create_merged_copy(gen_q.get_streamed_features(m))
	data=features.get_feature_matrix()

	# plot
	figure()

	# plot data of p and q
	subplot(2,3,1)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
	plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
	title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
	xlabel('$x_1, y_1$')
	ylabel('$x_2, y_2$')

	# histogram of first data dimension and pdf
	subplot(2,3,2)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
	hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
	xs=linspace(min(data[0])-1,max(data[0])+1, 50)
	plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
	plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
	xlabel('$x_1, y_1$')
	ylabel('$p(x_1), p(y_1)$')
	title('Data PDF in $x_1, y_1$')

	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gaussian.sort()
	thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))];
	thresh_gaussian=null_samples_gaussian[floor(len(null_samples_gaussian)*(1-alpha))];

	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_gaussian=sum(null_samples_gaussian<thresh_boot)/float(num_null_samples)

	# plot alternative distribution with threshold
	subplot(2,3,4)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)])]

	# plot null distribution with threshold
	subplot(2,3,3)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Sampled Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))

	# plot null distribution gaussian
	subplot(2,3,5)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_gaussian, 20, range=hist_range, normed=True);
	axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gaussian\nType I error is '  + str(type_one_error_gaussian))

	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
def statistics_linear_time_mmd (n,dim,difference):
	from modshogun import RealFeatures
	from modshogun import MeanShiftDataGenerator
	from modshogun import GaussianKernel
	from modshogun import LinearTimeMMD
	from modshogun import PERMUTATION, MMD1_GAUSSIAN
	from modshogun import EuclideanDistance
	from modshogun import Statistics, Math

	# init seed for reproducability
	Math.init_random(1)

	# note that the linear time statistic is designed for much larger datasets
	# so increase to get reasonable results

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable

	# Stream examples and merge them in order to compute median on joint sample
	features=gen_p.get_streamed_features(100)
	features=features.create_merged_copy(gen_q.get_streamed_features(100))

	# compute all pairwise distances
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()

	# compute median and determine kernel width (using shogun)
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	#print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	# mmd instance using streaming features, blocksize of 10000
	mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	#print "test statistic:", statistic

	# do the same thing using two different way to approximate null-dstribution
	# sampling null and gaussian approximation (ony for really large samples)
	alpha=0.05

	#print "computing p-value using sampling null"
	mmd.set_null_approximation_method(PERMUTATION)
	mmd.set_num_null_samples(50) # normally, far more iterations are needed
	p_value_boot=mmd.compute_p_value(statistic)
	#print "p_value_boot:", p_value_boot
	#print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

	#print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value_gaussian=mmd.compute_p_value(statistic)
	#print "p_value_gaussian:", p_value_gaussian
	#print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	mmd.set_null_approximation_method(PERMUTATION)
	mmd.set_num_null_samples(10) # normally, far more iterations are needed
	null_samples=mmd.sample_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)

	# compute type I and type II errors for Gaussian approximation
	# number of trials should be larger to compute tight confidence bounds
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	num_trials=5;
	alpha=0.05 # test power
	typeIerrors=[0 for x in range(num_trials)]
	typeIIerrors=[0 for x in range(num_trials)]
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is tpye I error
		mmd.set_simulate_h0(True)
		typeIerrors[i]=mmd.perform_test()>alpha
		mmd.set_simulate_h0(False)

		typeIIerrors[i]=mmd.perform_test()>alpha

	#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

	return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors