コード例 #1
0
ファイル: statistics_hsic.py プロジェクト: zero1hac/shogun
def statistics_hsic(n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # for reproducable results (the numpy one might not be reproducible across
    # different OS/Python-distributions
    Math.init_random(1)
    np.random.seed(1)

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(np.array([data[0]]))
    features_y = RealFeatures(np.array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shoguns kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = np.random.permutation(features_x.get_num_vectors()).astype(
        np.int32)
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2
    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha = 0.05

    #print "computing p-value using sampling null"
    hsic.set_null_approximation_method(PERMUTATION)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_num_null_samples(100)
    # sampling null allows usage of unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_gamma<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance stronly depends on data/kernel
    # sampling null, biased statistic
    #print "sampling null distribution using sample_null"
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()
    #print "null mean:", np.mean(null_samples)
    #print "null variance:", np.var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
コード例 #2
0
def statistics_hsic (n, difference, angle):
	from modshogun import RealFeatures
	from modshogun import DataGenerator
	from modshogun import GaussianKernel
	from modshogun import HSIC
	from modshogun import BOOTSTRAP, HSIC_GAMMA
	from modshogun import EuclideanDistance
	from modshogun import Math, Statistics, IntVector

	# init seed for reproducability
	Math.init_random(1)

	# note that the HSIC has to store kernel matrices
	# which upper bounds the sample size

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, shoguns kernel width is different to usual parametrization
	# Therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=IntVector.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	#print "median distance for Gaussian kernel on x:", sigma_x
	#print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	#print "HSIC:", statistic
	alpha=0.05

	#print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value_boot=hsic.compute_p_value(statistic)
	thresh_boot=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_boot
	#print "threshold for 0.05 alpha:", thresh_boot
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

	#print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value_gamma=hsic.compute_p_value(statistic)
	thresh_gamma=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_gamma
	#print "threshold for 0.05 alpha:", thresh_gamma
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance stronly depends on data/kernel
	# bootstrapping, biased statistic
	#print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	#hist(null_samples, 100); show()

	return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples