# Example #2
def statistics_mmd_kernel_selection_combined(m, distance, stretch, num_blobs,
                                             angle, selection_method):
    """Learn combined-kernel weights for a linear time MMD test on blob data.

    Two streaming Gaussian blob generators are created, a combined kernel
    holding Gaussian kernels of several widths is built, one of the combined
    kernel selection strategies chooses the sub-kernel weights, and the
    resulting test is run a few times with and without simulating the null
    hypothesis.

    Args:
        m: Sample count handed to ``LinearTimeMMD``.
        distance: Forwarded to both ``GaussianBlobsDataGenerator`` instances.
        stretch: Stretch of the generator for q (p uses 1).
        num_blobs: Forwarded to both ``GaussianBlobsDataGenerator`` instances.
        angle: Rotation angle of the generator for q (p uses 0).
        selection_method: ``"opt"`` for ``MMDKernelSelectionCombOpt`` or
            ``"l2"`` for ``MMDKernelSelectionCombMaxL2``.

    Returns:
        Tuple ``(kernel, typeIerrors, typeIIerrors)``: the selected combined
        kernel and two lists holding ``perform_test() > alpha`` for every
        trial, with H0 simulated and not simulated respectively.

    Raises:
        ValueError: If ``selection_method`` is neither ``"opt"`` nor ``"l2"``.
    """
    # Fail fast on a bad method name instead of hitting an unbound
    # ``selection`` variable after all data has been streamed.
    if selection_method not in ("opt", "l2"):
        raise ValueError(
            "selection_method must be 'opt' or 'l2', got %r"
            % (selection_method,))

    from shogun.Features import GaussianBlobsDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import MMDKernelSelectionCombMaxL2
    from shogun.Statistics import MMDKernelSelectionCombOpt
    from shogun.Statistics import MMD1_GAUSSIAN
    from shogun.Mathematics import Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # results for this low number will be bad (unstable, type I error wrong)

    # streaming data generators: p is unstretched and unrotated, q is not
    gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # stream some data from both distributions and merge it; the matrix is
    # only needed for the optional plots below, but the streaming itself is
    # kept so the generators are in the same state as in the plotting variant
    num_plot = 1000
    features = gen_p.get_streamed_features(num_plot)
    features = features.create_merged_copy(
        gen_q.get_streamed_features(num_plot))
    data = features.get_feature_matrix()

    #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
    #title('$X\sim p$')
    #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
    #title('$Y\sim q$')

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian
    # kernel is parametrized differently from the standard form, see
    # documentation): width = 2 * sigma^2
    sigmas = [2**x for x in range(-3, 10)]
    widths = [2 * sigma * sigma for sigma in sigmas]
    combined = CombinedKernel()
    for width in widths:
        combined.append_kernel(GaussianKernel(10, width))

    # mmd instance using streaming features, blocksize of 10000
    block_size = 10000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (can easily be replaced by the other methods
    # for selecting combined kernels)
    if selection_method == "opt":
        selection = MMDKernelSelectionCombOpt(mmd)
    else:
        selection = MMDKernelSelectionCombMaxL2(mmd)

    # perform kernel selection (kernel is automatically set)
    kernel = selection.select_kernel()
    kernel = CombinedKernel.obtain_from_generic(kernel)
    #print "selected kernel weights:", kernel.get_subkernel_weights()
    #title("Kernel weights")

    # compute type I and II error (use many more trials). Type I error is only
    # estimated to check MMD1_GAUSSIAN method for estimating the null
    # distribution. Note that testing has to happen on different data than
    # kernel selection, but the linear time mmd does this implicitly
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # number of trials should be larger to compute tight confidence bounds
    num_trials = 5
    alpha = 0.05  # test level
    typeIerrors = []
    typeIIerrors = []
    # NOTE(review): perform_test() appears to return a p-value, in which case
    # "> alpha" marks a non-rejection. The comparison is kept as it was so the
    # returned values do not change; confirm the intended direction.
    for _ in range(num_trials):
        # simulating H0 effectively means that p=q - rejecting is type I error
        mmd.set_simulate_h0(True)
        typeIerrors.append(mmd.perform_test() > alpha)
        mmd.set_simulate_h0(False)

        typeIIerrors.append(mmd.perform_test() > alpha)

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return kernel, typeIerrors, typeIIerrors
# Example #4
def statistics_linear_time_mmd(n, dim, difference):
    """Run a linear time MMD two-sample test on streamed mean-shift data.

    A Gaussian kernel is sized with the median pairwise distance of a small
    joint sample, the MMD statistic is computed, and p-values are obtained
    with both the bootstrap and the Gaussian null approximation. Finally the
    test is repeated a few times with and without simulating the null
    hypothesis.

    Args:
        n: Sample count handed to ``LinearTimeMMD``.
        dim: Dimension passed to both ``MeanShiftDataGenerator`` instances.
        difference: Mean shift of the generator for q (p uses 0).

    Returns:
        Tuple ``(statistic, p_value_boot, p_value_gaussian, null_samples,
        typeIerrors, typeIIerrors)``. The two error lists hold
        ``perform_test() > alpha`` for every trial, with H0 simulated and
        not simulated respectively.
    """
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different ways to approximate the
    # null-distribution: bootstrapping and gaussian approximation (only for
    # really large samples)
    alpha = 0.05  # test level

    #print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    typeIerrors = []
    typeIIerrors = []
    # NOTE(review): perform_test() appears to return a p-value, in which case
    # "> alpha" marks a non-rejection. The comparison is kept as it was so the
    # returned values do not change; confirm the intended direction.
    for _ in range(num_trials):
        # simulating H0 effectively means that p=q - rejecting is type I error
        mmd.set_simulate_h0(True)
        typeIerrors.append(mmd.perform_test() > alpha)
        mmd.set_simulate_h0(False)

        typeIIerrors.append(mmd.perform_test() > alpha)

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors