def statistics_quadratic_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import MeanShiftRealDataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import QuadraticTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the quadratic time mmd has to store kernel matrices
	# which upper bounds the sample size
	n=500
	dim=2
	difference=0.5

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftRealDataGenerator(0, dim)
	gen_q=MeanShiftRealDataGenerator(difference, dim)
	
	# Stream examples and merge them in order to compute median on joint sample
	# alternative is to call a different constructor of QuadraticTimeMMD
	features=gen_p.get_streamed_features(n)
	features=features.create_merged_copy(gen_q.get_streamed_features(n))
	
	# use data generator class to produce example data
	data=features.get_feature_matrix()
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	# Use a permutation set to temporarily merge features in merged examples
	subset=Math.randperm_vec(features.get_num_vectors())
	subset=subset[0:200]
	features.add_subset(subset)
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	features.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	mmd=QuadraticTimeMMD(kernel,features, n)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=mmd.compute_statistic()
	alpha=0.05
	
	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	mmd.set_bootstrap_iterations(10)
	# bootstrapping allows usage of unbiased or biased statistic
	mmd.set_statistic_type(UNBIASED)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# only can do this if SHOGUN was compiled with LAPACK so check
	if "sample_null_spectrum" in dir(QuadraticTimeMMD):
		print "computing p-value using spectrum method"
		mmd.set_null_approximation_method(MMD2_SPECTRUM)
		# normally, at least 250 iterations should be done, but that takes long
		mmd.set_num_samples_sepctrum(50)
		mmd.set_num_eigenvalues_spectrum(n-10)
		# spectrum method computes p-value for biased statistics only
		mmd.set_statistic_type(BIASED)
		p_value=mmd.compute_p_value(statistic)
		print "p_value:", p_value
		print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gamma method"
	mmd.set_null_approximation_method(MMD2_GAMMA)
	# gamma method computes p-value for biased statistics only
	mmd.set_statistic_type(BIASED)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# bootstrapping, biased statistic
	print "sampling null distribution using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_statistic_type(BIASED)
	mmd.set_bootstrap_iterations(10)
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# spectrum, biased statistic
	print "sampling null distribution using spectrum method"
	mmd.set_null_approximation_method(MMD2_SPECTRUM)
	mmd.set_statistic_type(BIASED)
	# 200 samples using 100 eigenvalues
	null_samples=mmd.sample_null_spectrum(50,10)
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_mean_data(m,dim,difference)

# create shogun feature representation
features=RealFeatures(data)

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
# Using all distances here would blow up memory
subset=Math.randperm_vec(features.get_num_vectors())
subset=subset[0:200]
features.add_subset(subset)
dist=EuclideanDistance(features, features)
distances=dist.get_distance_matrix()
features.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma=median_distance**2
print "median distance for Gaussian kernel:", sigma
kernel=GaussianKernel(10,sigma)

# use biased statistic
mmd=LinearTimeMMD(kernel,features, m)

# sample alternative distribution
alt_samples=zeros(num_null_samples)
Ejemplo n.º 3
0
def statistics_linear_time_mmd ():
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the linear time statistic is designed for much larger datasets
	n=10000
	dim=2
	difference=0.5

	# use data generator class to produce example data
	# in pratice, this generate data function could be replaced by a method
	# that obtains data from a stream
	data=DataGenerator.generate_mean_data(n,dim,difference)
	
	print "dimension means of X", mean(data.T[0:n].T)
	print "dimension means of Y", mean(data.T[n:2*n+1].T)

	# create shogun feature representation
	features=RealFeatures(data)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	# Using all distances here would blow up memory
	subset=Math.randperm_vec(features.get_num_vectors())
	subset=subset[0:200]
	features.add_subset(subset)
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()
	features.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	mmd=LinearTimeMMD(kernel,features, n)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	print "test statistic:", statistic
	
	# do the same thing using two different way to approximate null-dstribution
	# bootstrapping and gaussian approximation (ony for really large samples)
	alpha=0.05

	print "computing p-value using bootstrapping"
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(50) # normally, far more iterations are needed
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value=mmd.compute_p_value(statistic)
	print "p_value:", p_value
	print "p_value <", alpha, ", i.e. test sais p!=q:", p_value<alpha
	
	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	mmd.set_null_approximation_method(BOOTSTRAP)
	mmd.set_bootstrap_iterations(10) # normally, far more iterations are needed
	null_samples=mmd.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
Ejemplo n.º 4
0
# number of samples taken from null and alternative distribution
num_null_samples=500

# use data generator class to produce example data
data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

# create shogun feature representation
features_x=RealFeatures(array([data[0]]))
features_y=RealFeatures(array([data[1]]))

# compute median data distance in order to use for Gaussian kernel width
# 0.5*median_distance normally (factor two in Gaussian kernel)
# However, shoguns kernel width is different to usual parametrization
# Therefore 0.5*2*median_distance^2
# Use a subset of data for that, only 200 elements. Median is stable
subset=Math.randperm_vec(features_x.get_num_vectors())
subset=subset[0:200]
features_x.add_subset(subset)
dist=EuclideanDistance(features_x, features_x)
distances=dist.get_distance_matrix()
features_x.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_x=median_distance**2
features_y.add_subset(subset)
dist=EuclideanDistance(features_y, features_y)
distances=dist.get_distance_matrix()
features_y.remove_subset()
median_distance=Statistics.matrix_median(distances, True)
sigma_y=median_distance**2
print "median distance for Gaussian kernel on x:", sigma_x
print "median distance for Gaussian kernel on y:", sigma_y
def statistics_quadratic_time_mmd():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import QuadraticTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the quadratic time mmd has to store kernel matrices
    # which upper bounds the sample size
    n = 500
    dim = 2
    difference = 0.5

    # use data generator class to produce example data
    data = DataGenerator.generate_mean_data(n, dim, difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2 * n + 1].T)

    # create shogun feature representation
    features = RealFeatures(data)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = Math.randperm_vec(features.get_num_vectors())
    subset = subset[0:200]
    features.add_subset(subset)
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()
    features.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    mmd = QuadraticTimeMMD(kernel, features, n)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = mmd.compute_statistic()
    alpha = 0.05

    print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    mmd.set_bootstrap_iterations(10)
    # bootstrapping allows usage of unbiased or biased statistic
    mmd.set_statistic_type(UNBIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # only can do this if SHOGUN was compiled with LAPACK so check
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        print "computing p-value using spectrum method"
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        # normally, at least 250 iterations should be done, but that takes long
        mmd.set_num_samples_sepctrum(50)
        mmd.set_num_eigenvalues_spectrum(n - 10)
        # spectrum method computes p-value for biased statistics only
        mmd.set_statistic_type(BIASED)
        p_value = mmd.compute_p_value(statistic)
        print "p_value:", p_value
        print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    print "computing p-value using gamma method"
    mmd.set_null_approximation_method(MMD2_GAMMA)
    # gamma method computes p-value for biased statistics only
    mmd.set_statistic_type(BIASED)
    p_value = mmd.compute_p_value(statistic)
    print "p_value:", p_value
    print "p_value <", alpha, ", i.e. test sais p!=q:", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(10)
    null_samples = mmd.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # spectrum, biased statistic
    print "sampling null distribution using spectrum method"
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_statistic_type(BIASED)
    # 200 samples using 100 eigenvalues
    null_samples = mmd.sample_null_spectrum(50, 10)
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
Ejemplo n.º 6
0
def statistics_hsic ():
	from shogun.Features import RealFeatures
	from shogun.Features import DataGenerator
	from shogun.Kernel import GaussianKernel
	from shogun.Statistics import HSIC
	from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# note that the HSIC has to store kernel matrices
	# which upper bounds the sample size
	n=250
	difference=3
	angle=pi/3

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=Math.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	print "HSIC:", statistic
	alpha=0.05

	print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value=hsic.compute_p_value(statistic)
	thresh=hsic.compute_threshold(alpha)
	print "p_value:", p_value
	print "threshold for 0.05 alpha:", thresh
	print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value<alpha

	print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value=hsic.compute_p_value(statistic)
	thresh=hsic.compute_threshold(alpha)
	print "p_value:", p_value
	print "threshold for 0.05 alpha:", thresh
	print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# bootstrapping, biased statistic
	print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	print "null mean:", mean(null_samples)
	print "null variance:", var(null_samples)
Ejemplo n.º 7
0
def statistics_hsic():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size
    n = 250
    difference = 3
    angle = pi / 3

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = Math.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    print "HSIC:", statistic
    alpha = 0.05

    print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value < alpha

    print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)