def statistics_kmm (n,d):
	from modshogun import RealFeatures
	from modshogun import DataGenerator
	from modshogun import GaussianKernel, MSG_DEBUG
	from modshogun import KernelMeanMatching
	from modshogun import Math
	from numpy import random, array, int32  # names used below; the original imports them at module level

	# init seed for reproducibility
	Math.init_random(1)
	random.seed(1)

	data = random.randn(d,n)

	# create shogun feature representation
	features=RealFeatures(data)

	# use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
	# which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
	# k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
	kernel=GaussianKernel(10,8)
	kernel.init(features,features)

	kmm = KernelMeanMatching(kernel,array([0,1,2,3,7,8,9],dtype=int32),array([4,5,6],dtype=int32))
	w = kmm.compute_weights()
	#print w
	return w
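# The sigma-to-width conversion that the comment above describes, as a tiny
# helper (a sketch, not part of the original listing): Shogun's GaussianKernel
# expects tau = 2*sigma^2 rather than sigma itself.
def shogun_gaussian_width(sigma):
	# k(x,y)=exp(-||x-y||^2 / tau) with tau = 2*sigma^2, e.g. sigma=2 -> tau=8
	return 2.0 * sigma * sigma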
def evaluation_clustering (features=fea, ground_truth=gnd_raw, ncenters=10):
	from modshogun import ClusteringAccuracy, ClusteringMutualInformation
	from modshogun import MulticlassLabels
	from modshogun import Math

	# reproducible results
	Math.init_random(1)

	centroids = run_clustering(features, ncenters)
	gnd_hat = assign_labels(features, centroids, ncenters)
	gnd = MulticlassLabels(ground_truth)

	AccuracyEval = ClusteringAccuracy()
	AccuracyEval.best_map(gnd_hat, gnd)

	accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
	#print(('Clustering accuracy = %.4f' % accuracy))

	MIEval = ClusteringMutualInformation()
	mutual_info = MIEval.evaluate(gnd_hat, gnd)
	#print(('Clustering mutual information = %.4f' % mutual_info))

	# TODO mutual information does not work with serialization
	#return gnd, gnd_hat, accuracy, MIEval, mutual_info
	return gnd, gnd_hat, accuracy
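# run_clustering() and assign_labels() used above are helpers that are not part
# of this listing. A minimal sketch of what they are assumed to do (KMeans
# clustering followed by nearest-centroid assignment via 1-NN), written with the
# usual modshogun classes:
def run_clustering(features, ncenters):
	from modshogun import KMeans, EuclideanDistance
	distance = EuclideanDistance(features, features)
	kmeans = KMeans(ncenters, distance)
	kmeans.train()
	return kmeans.get_cluster_centers()

def assign_labels(features, centroids, ncenters):
	from modshogun import KNN, EuclideanDistance, RealFeatures, MulticlassLabels
	from numpy import arange
	labels = MulticlassLabels(arange(0., ncenters))
	fea_centroids = RealFeatures(centroids)
	distance = EuclideanDistance(fea_centroids, fea_centroids)
	knn = KNN(1, distance, labels)
	knn.train()
	return knn.apply(features)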
Example #3
def generate_gmm_classification_data(request):
    from modshogun import GMM, Math
    # module-level imports assumed by this Django view (Python 2, hence xrange):
    import json
    import numpy as np
    from django.http import HttpResponse

    num_classes = int(request.POST['num_classes'])
    gmm = GMM(num_classes)
    total = 40
    rng = 4.0
    num = total/num_classes
    for i in xrange(num_classes):
        gmm.set_nth_mean(np.array([Math.random(-rng, rng) for j in xrange(2)]), i)
        cov_tmp = Math.normal_random(0.2, 0.1)
        cov = np.array([[1.0, cov_tmp], [cov_tmp, 1.0]], dtype=float)
        gmm.set_nth_cov(cov, i)

    data=[]
    labels=[]
    for i in xrange(num_classes):
        coef = np.zeros(num_classes)
        coef[i] = 1.0
        gmm.set_coef(coef)
        data.append(np.array([gmm.sample() for j in xrange(num)]).T)
        labels.append(np.array([i for j in xrange(num)]))

    data = np.hstack(data)
    data = data / (2.0 * rng)
    xmin = np.min(data[0,:])
    ymin = np.min(data[1,:])
    labels = np.hstack(labels)
    toy_data = []
    for i in xrange(num_classes*num):
        toy_data.append( {  'x': data[0, i] - xmin,
                            'y': data[1, i] - ymin,
                            'label': float(labels[i])})
    return HttpResponse(json.dumps(toy_data))
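# A hypothetical way to exercise the view above outside a running server, using
# Django's test RequestFactory (URL and field value are illustrative; a
# configured Django settings module is still required):
#from django.test import RequestFactory
#request = RequestFactory().post('/generate_gmm/', {'num_classes': '3'})
#response = generate_gmm_classification_data(request)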
Example #4
def statistics_kmm(n, d):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel, MSG_DEBUG
    from modshogun import KernelMeanMatching
    from modshogun import Math

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    data = random.randn(d, n)

    # create shogun feature representation
    features = RealFeatures(data)

    # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization
    # which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
    kernel = GaussianKernel(10, 8)
    kernel.init(features, features)

    kmm = KernelMeanMatching(kernel, array([0, 1, 2, 3, 7, 8, 9], dtype=int32),
                             array([4, 5, 6], dtype=int32))
    w = kmm.compute_weights()
    #print w
    return w
def evaluation_clustering(features=fea, ground_truth=gnd_raw, ncenters=10):
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels
    from modshogun import Math

    # reproducible results
    Math.init_random(1)

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    AccuracyEval = ClusteringAccuracy()
    AccuracyEval.best_map(gnd_hat, gnd)

    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
    #print(('Clustering accuracy = %.4f' % accuracy))

    MIEval = ClusteringMutualInformation()
    mutual_info = MIEval.evaluate(gnd_hat, gnd)
    #print(('Clustering mutual information = %.4f' % mutual_info))

    # TODO mutual information does not work with serialization
    #return gnd, gnd_hat, accuracy, MIEval, mutual_info
    return gnd, gnd_hat, accuracy
def regression_gaussian_process_modular (n=100,n_test=100, \
		x_range=6,x_range_test=10,noise_var=0.5,width=1, seed=1):

	from modshogun import RealFeatures, RegressionLabels, GaussianKernel, Math
	from numpy import random, array, sin
	try:
		from modshogun import GaussianLikelihood, ZeroMean, \
				ExactInferenceMethod, GaussianProcessRegression
	except ImportError:
		print("Eigen3 needed for Gaussian Processes")
		return

	# reproducible results
	random.seed(seed)
	Math.init_random(17)

	# easy regression data: one dimensional noisy sine wave
	X=random.rand(1,n)*x_range

	X_test=array([[float(i)/n_test*x_range_test for i in range(n_test)]])
	Y_test=sin(X_test)
	Y=sin(X)+random.randn(n)*noise_var

	# shogun representation
	labels=RegressionLabels(Y[0])
	feats_train=RealFeatures(X)
	feats_test=RealFeatures(X_test)

	# GP specification
	shogun_width=width*width*2
	kernel=GaussianKernel(10, shogun_width)
	zmean = ZeroMean()
	lik = GaussianLikelihood()
	lik.set_sigma(noise_var)
	inf = ExactInferenceMethod(kernel, feats_train, zmean, labels, lik)

	# train GP
	gp = GaussianProcessRegression(inf)
	gp.train()

	# some things we can do
	alpha = inf.get_alpha()
	diagonal = inf.get_diagonal_vector()
	cholesky = inf.get_cholesky()

	# get mean and variance vectors
	mean = gp.get_mean_vector(feats_test)
	variance = gp.get_variance_vector(feats_test)

	# plot results
	#plot(X[0],Y[0],'x') # training observations
	#plot(X_test[0],Y_test[0],'-') # ground truth of test
	#plot(X_test[0],mean, '-') # mean predictions of test
	#fill_between(X_test[0],mean-1.96*sqrt(variance),mean+1.96*sqrt(variance),color='grey')  # 95% confidence interval
	#legend(["training", "ground truth", "mean predictions"])

	#show()

	return alpha, diagonal, round(variance,12), round(mean,12), cholesky
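# Hypothetical driver (not part of the original file): run the example with its
# defaults and check that one predictive mean/variance is returned per test point.
#alpha, diagonal, variance, mean, cholesky = regression_gaussian_process_modular()
#assert len(mean) == len(variance) == 100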
def regression_gaussian_process_modular (n=100,n_test=100, \
  x_range=6,x_range_test=10,noise_var=0.5,width=1, seed=1):

    from modshogun import RealFeatures, RegressionLabels, GaussianKernel, Math
    try:
        from modshogun import GaussianLikelihood, ZeroMean, \
          ExactInferenceMethod, GaussianProcessRegression
    except ImportError:
        print("Eigen3 needed for Gaussian Processes")
        return

    # reproducible results
    random.seed(seed)
    Math.init_random(17)

    # easy regression data: one dimensional noisy sine wave
    X = random.rand(1, n) * x_range

    X_test = array([[float(i) / n_test * x_range_test for i in range(n_test)]])
    Y_test = sin(X_test)
    Y = sin(X) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(Y[0])
    feats_train = RealFeatures(X)
    feats_test = RealFeatures(X_test)

    # GP specification
    shogun_width = width * width * 2
    kernel = GaussianKernel(10, shogun_width)
    zmean = ZeroMean()
    lik = GaussianLikelihood()
    lik.set_sigma(noise_var)
    inf = ExactInferenceMethod(kernel, feats_train, zmean, labels, lik)

    # train GP
    gp = GaussianProcessRegression(inf)
    gp.train()

    # some things we can do
    alpha = inf.get_alpha()
    diagonal = inf.get_diagonal_vector()
    cholesky = inf.get_cholesky()

    # get mean and variance vectors
    mean = gp.get_mean_vector(feats_test)
    variance = gp.get_variance_vector(feats_test)

    # plot results
    #plot(X[0],Y[0],'x') # training observations
    #plot(X_test[0],Y_test[0],'-') # ground truth of test
    #plot(X_test[0],mean, '-') # mean predictions of test
    #fill_between(X_test[0],mean-1.96*sqrt(variance),mean+1.96*sqrt(variance),color='grey')  # 95% confidence interval
    #legend(["training", "ground truth", "mean predictions"])

    #show()

    return alpha, diagonal, round(variance, 12), round(mean, 12), cholesky
Example #8
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    from modshogun import Math, RealFeatures, BinaryLabels, LibSVM
    from modshogun import StratifiedCrossValidationSplitting, ContingencyTableEvaluation, ACCURACY
    from modshogun import CrossValidation, CrossValidationResult, GridSearchModelSelection
    from numpy import random
    # create_param_tree() is a helper defined alongside this example in the original file

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    # create some (non-sense) data
    matrix = random.rand(dim_vectors, num_vectors)

    # create num_vectors feature vectors with dim_vectors dimensions each
    features = RealFeatures()
    features.set_feature_matrix(matrix)

    # create labels, two classes
    labels = BinaryLabels(num_vectors)
    for i in range(num_vectors):
        labels.set_label(i, 1 if i % 2 == 0 else -1)

    # create svm
    classifier = LibSVM()

    # splitting strategy
    splitting_strategy = StratifiedCrossValidationSplitting(
        labels, num_subsets)

    # accuracy evaluation
    evaluation_criterion = ContingencyTableEvaluation(ACCURACY)

    # cross validation class for evaluation in model selection
    cross = CrossValidation(classifier, features, labels, splitting_strategy,
                            evaluation_criterion)
    cross.set_num_runs(1)

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #classifier.print_modsel_params()

    # model parameter selection
    param_tree = create_param_tree()
    #param_tree.print_tree()

    grid_search = GridSearchModelSelection(cross, param_tree)

    print_state = False
    best_combination = grid_search.select_model(print_state)
    #print("best parameter(s):")
    #best_combination.print_tree()

    best_combination.apply_to_machine(classifier)

    # larger number of runs to have tighter confidence intervals
    cross.set_num_runs(10)
    cross.set_conf_int_alpha(0.01)
    result = cross.evaluate()
    casted = CrossValidationResult.obtain_from_generic(result)
    #print "result mean:", casted.mean

    return classifier, result, casted.mean
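# create_param_tree() is not shown in this listing. A minimal, hypothetical
# stand-in that only puts the two SVM regularisation parameters on an
# exponential grid could look like this (the original also adds kernels and
# their parameters to the tree):
def create_param_tree():
    from modshogun import ModelSelectionParameters, R_EXP
    root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    c1.build_values(-1.0, 1.0, R_EXP)
    root.append_child(c1)
    c2 = ModelSelectionParameters("C2")
    c2.build_values(-1.0, 1.0, R_EXP)
    root.append_child(c2)
    return root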
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
	# modshogun and numpy names are assumed to be imported at module level;
	# create_param_tree() is defined alongside this example in the original file.
	# init seed for reproducibility
	Math.init_random(1)
	random.seed(1)

	# create some (non-sense) data
	matrix=random.rand(dim_vectors, num_vectors)

	# create num_vectors feature vectors with dim_vectors dimensions each
	features=RealFeatures()
	features.set_feature_matrix(matrix)

	# create labels, two classes
	labels=BinaryLabels(num_vectors)
	for i in range(num_vectors):
		labels.set_label(i, 1 if i%2==0 else -1)

	# create svm
	classifier=LibSVM()

	# splitting strategy
	splitting_strategy=StratifiedCrossValidationSplitting(labels, num_subsets)

	# accuracy evaluation
	evaluation_criterion=ContingencyTableEvaluation(ACCURACY)

	# cross validation class for evaluation in model selection
	cross=CrossValidation(classifier, features, labels, splitting_strategy, evaluation_criterion)
	cross.set_num_runs(1)

	# print all parameters available for model selection
	# Don't worry if yours is not included, simply write to the mailing list
	#classifier.print_modsel_params()

	# model parameter selection
	param_tree=create_param_tree()
	#param_tree.print_tree()

	grid_search=GridSearchModelSelection(cross, param_tree)

	print_state=False
	best_combination=grid_search.select_model(print_state)
	#print("best parameter(s):")
	#best_combination.print_tree()

	best_combination.apply_to_machine(classifier)

	# larger number of runs to have tighter confidence intervals
	cross.set_num_runs(10)
	cross.set_conf_int_alpha(0.01)
	result=cross.evaluate()
	casted=CrossValidationResult.obtain_from_generic(result)
	#print "result mean:", casted.mean

	return classifier,result,casted.mean
Example #10
def evaluation_clustering_simple(n_data=100, sqrt_num_blobs=4, distance=5):
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels, GaussianBlobsDataGenerator
    from modshogun import Math
    from numpy import array  # assumed at module level in the original

    # reproducible results
    Math.init_random(1)

    # produce some Gaussian blobs to cluster
    ncenters = sqrt_num_blobs**2
    stretch = 1
    angle = 1
    gen = GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
    features = gen.get_streamed_features(n_data)
    X = features.get_feature_matrix()

    # compute approximate "ground truth" labels via taking the closest blob mean
    coords = array(range(0, sqrt_num_blobs * distance, distance))
    idx_0 = [abs(coords - x).argmin() for x in X[0]]
    idx_1 = [abs(coords - x).argmin() for x in X[1]]
    ground_truth = array(
        [idx_0[i] * sqrt_num_blobs + idx_1[i] for i in range(n_data)],
        dtype="float64")

    #for label in unique(ground_truth):
    #	indices=ground_truth==label
    #	plot(X[0][indices], X[1][indices], 'o')
    #show()

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    AccuracyEval = ClusteringAccuracy()
    AccuracyEval.best_map(gnd_hat, gnd)

    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
    # in this case we know that the clustering has to be very good
    #print(('Clustering accuracy = %.4f' % accuracy))
    assert (accuracy > 0.8)

    MIEval = ClusteringMutualInformation()
    mutual_info = MIEval.evaluate(gnd_hat, gnd)
    #print(('Clustering mutual information = %.4f' % mutual_info))

    # TODO add multiclass labels and MI once the serialization works
    #return gnd, accuracy, mutual_info
    return accuracy
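# Tiny worked example of the "closest blob mean" labelling above (not from the
# original): with sqrt_num_blobs=4 and distance=5 the candidate coordinates are
# [0, 5, 10, 15], so a point at (x=4.2, y=11.3) maps to indices (1, 2) and gets
# label 1*4 + 2 = 6:
#coords = array([0, 5, 10, 15]); abs(coords - 4.2).argmin()*4 + abs(coords - 11.3).argmin()  # -> 6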
def evaluation_clustering_simple (n_data=100, sqrt_num_blobs=4, distance=5):
	from modshogun import ClusteringAccuracy, ClusteringMutualInformation
	from modshogun import MulticlassLabels, GaussianBlobsDataGenerator
	from modshogun import Math

	# reproducible results
	Math.init_random(1)

	# produce some Gaussian blobs to cluster
	ncenters=sqrt_num_blobs**2
	stretch=1
	angle=1
	gen=GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
	features=gen.get_streamed_features(n_data)
	X=features.get_feature_matrix()

	# compute approximate "ground truth" labels via taking the closest blob mean
	coords=array(range(0,sqrt_num_blobs*distance,distance))
	idx_0=[abs(coords -x).argmin() for x in X[0]]
	idx_1=[abs(coords -x).argmin() for x in X[1]]
	ground_truth=array([idx_0[i]*sqrt_num_blobs + idx_1[i] for i in range(n_data)], dtype="float64")

	#for label in unique(ground_truth):
	#	indices=ground_truth==label
	#	plot(X[0][indices], X[1][indices], 'o')
	#show()

	centroids = run_clustering(features, ncenters)
	gnd_hat = assign_labels(features, centroids, ncenters)
	gnd = MulticlassLabels(ground_truth)

	AccuracyEval = ClusteringAccuracy()
	AccuracyEval.best_map(gnd_hat, gnd)

	accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
	# in this case we know that the clustering has to be very good
	#print(('Clustering accuracy = %.4f' % accuracy))
	assert(accuracy>0.8)

	MIEval = ClusteringMutualInformation()
	mutual_info = MIEval.evaluate(gnd_hat, gnd)
	#print(('Clustering mutual information = %.4f' % mutual_info))

	# TODO add multiclass labels and MI once the serialization works
	#return gnd, accuracy, mutual_info
	return accuracy
def evaluation_multiclassovrevaluation_modular(train_fname=traindat, label_fname=label_traindat):
	from modshogun import MulticlassOVREvaluation,ROCEvaluation
	from modshogun import MulticlassLibLinear,RealFeatures,ContingencyTableEvaluation,ACCURACY
	from modshogun import MulticlassLabels, Math, CSVFile

	Math.init_random(1)
	ground_truth_labels = MulticlassLabels(CSVFile(label_fname))
	svm = MulticlassLibLinear(1.0,RealFeatures(CSVFile(train_fname)),ground_truth_labels)
	svm.parallel.set_num_threads(1)
	svm.train()
	predicted_labels = svm.apply()

	binary_evaluator = ROCEvaluation()
	evaluator = MulticlassOVREvaluation(binary_evaluator)
	mean_roc = evaluator.evaluate(predicted_labels,ground_truth_labels)
	#print mean_roc

	binary_evaluator = ContingencyTableEvaluation(ACCURACY)
	evaluator = MulticlassOVREvaluation(binary_evaluator)
	mean_accuracy = evaluator.evaluate(predicted_labels,ground_truth_labels)
	#print mean_accuracy
	return mean_roc, mean_accuracy, predicted_labels, svm
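# traindat and label_traindat are file names defined at module level in the
# original example; illustrative values only (the actual paths are not part of
# this listing):
#traindat = '../data/fm_train_real.dat'
#label_traindat = '../data/label_train_multiclass.dat'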
Example #13
def generate_gmm_classification_data(request):
    from modshogun import GMM, Math

    num_classes = int(request.POST['num_classes'])
    gmm = GMM(num_classes)
    total = 40
    rng = 4.0
    num = total / num_classes
    for i in xrange(num_classes):
        gmm.set_nth_mean(np.array([Math.random(-rng, rng) for j in xrange(2)]),
                         i)
        cov_tmp = Math.normal_random(0.2, 0.1)
        cov = np.array([[1.0, cov_tmp], [cov_tmp, 1.0]], dtype=float)
        gmm.set_nth_cov(cov, i)

    data = []
    labels = []
    for i in xrange(num_classes):
        coef = np.zeros(num_classes)
        coef[i] = 1.0
        gmm.set_coef(coef)
        data.append(np.array([gmm.sample() for j in xrange(num)]).T)
        labels.append(np.array([i for j in xrange(num)]))

    data = np.hstack(data)
    data = data / (2.0 * rng)
    xmin = np.min(data[0, :])
    ymin = np.min(data[1, :])
    labels = np.hstack(labels)
    toy_data = []
    for i in xrange(num_classes * num):
        toy_data.append({
            'x': data[0, i] - xmin,
            'y': data[1, i] - ymin,
            'label': float(labels[i])
        })
    return HttpResponse(json.dumps(toy_data))
def statistics_quadratic_time_mmd (m,dim,difference):
	from modshogun import RealFeatures
	from modshogun import MeanShiftDataGenerator
	from modshogun import GaussianKernel, CustomKernel
	from modshogun import QuadraticTimeMMD
	from modshogun import PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, BIASED_DEPRECATED
	from modshogun import Statistics, IntVector, RealVector, Math
	from numpy import random, array, int32  # assumed at module level in the original

	# init seed for reproducibility
	Math.init_random(1)
	random.seed(17)

	# number of examples kept low in order to make things fast

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim);
	#gen_p.parallel.set_num_threads(1)
	gen_q=MeanShiftDataGenerator(difference, dim);

	# stream some data from generator
	feat_p=gen_p.get_streamed_features(m);
	feat_q=gen_q.get_streamed_features(m);

	# set kernel a-priori. usually one would do some kernel selection. See
	# other examples for this.
	width=10;
	kernel=GaussianKernel(10, width);

	# create quadratic time mmd instance. Note that this constructor
	# copies p and q and does not reference them
	mmd=QuadraticTimeMMD(kernel, feat_p, feat_q);

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	alpha=0.05;

	# using permutation (slow, not the most reliable way. Consider pre-
	# computing the kernel when using it, see below).
	# Also, in practice, use at least 250 iterations
	mmd.set_null_approximation_method(PERMUTATION);
	mmd.set_num_null_samples(3);
	p_value_null=mmd.perform_test();
	# reject if p-value is smaller than test level
	#print "bootstrap: p!=q: ", p_value_null<alpha

	# using spectrum method. Use at least 250 samples from null.
	# This is consistent but sometimes breaks, always monitor type I error.
	# See tutorial for number of eigenvalues to use .
	mmd.set_statistic_type(BIASED);
	mmd.set_null_approximation_method(MMD2_SPECTRUM);
	mmd.set_num_eigenvalues_spectrum(3);
	mmd.set_num_samples_spectrum(250);
	p_value_spectrum=mmd.perform_test();
	# reject if p-value is smaller than test level
	#print "spectrum: p!=q: ", p_value_spectrum<alpha

	# using gamma method. This is a quick hack, which works most of the time
	# but is NOT guaranteed to. See tutorial for details.
	# Only works with BIASED_DEPRECATED statistic
	mmd.set_statistic_type(BIASED_DEPRECATED);
	mmd.set_null_approximation_method(MMD2_GAMMA);
	p_value_gamma=mmd.perform_test();
	# reject if p-value is smaller than test level
	#print "gamma: p!=q: ", p_value_gamma<alpha

	# compute type I and II errors (use many more trials in practice).
	# Type I error is not necessary if one uses permutation. We do it here
	# anyway, but note that this is an efficient way of computing it.
	# Also note that testing has to happen on
	# different data than kernel selection, but the linear time mmd does this
	# implicitly and we used a fixed kernel here.
	mmd.set_statistic_type(BIASED);
	mmd.set_null_approximation_method(PERMUTATION);
	mmd.set_num_null_samples(5);
	num_trials=5;
	type_I_errors=RealVector(num_trials);
	type_II_errors=RealVector(num_trials);
	inds=int32(array([x for x in range(2*m)])) # numpy
	p_and_q=mmd.get_p_and_q();

	# use a precomputed kernel to be faster
	kernel.init(p_and_q, p_and_q);
	precomputed=CustomKernel(kernel);
	mmd.set_kernel(precomputed);
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is a type I error
		inds=random.permutation(inds) # numpy permutation
		precomputed.add_row_subset(inds);
		precomputed.add_col_subset(inds);
		type_I_errors[i]=mmd.perform_test()>alpha;
		precomputed.remove_row_subset();
		precomputed.remove_col_subset();

		# on normal data, this gives type II error
		type_II_errors[i]=mmd.perform_test()>alpha;

	return type_I_errors.get(), type_II_errors.get(), p_value_null, p_value_spectrum, p_value_gamma
Example #15
def gen_data(ftype, num_samples, show_data=False):
    from modshogun import Math
    from modshogun import FactorType, Factor, TableFactorType, FactorGraph
    from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
    from modshogun import MAPInference, TREE_MAX_PROD
    import numpy as np  # assumed at module level in the original

    Math.init_random(17)

    samples = FactorGraphFeatures(num_samples)
    labels = FactorGraphLabels(num_samples)

    for i in range(num_samples):
        vc = np.array([2, 2, 2], np.int32)
        fg = FactorGraph(vc)

        data1 = np.array([2.0 * Math.random(0.0, 1.0) - 1.0 for i in range(2)])
        vind1 = np.array([0, 1], np.int32)
        fac1 = Factor(ftype[0], vind1, data1)
        fg.add_factor(fac1)

        data2 = np.array([2.0 * Math.random(0.0, 1.0) - 1.0 for i in range(2)])
        vind2 = np.array([1, 2], np.int32)
        fac2 = Factor(ftype[0], vind2, data2)
        fg.add_factor(fac2)

        data3 = np.array([2.0 * Math.random(0.0, 1.0) - 1.0 for i in range(2)])
        vind3 = np.array([0], np.int32)
        fac3 = Factor(ftype[1], vind3, data3)
        fg.add_factor(fac3)

        data4 = np.array([2.0 * Math.random(0.0, 1.0) - 1.0 for i in range(2)])
        vind4 = np.array([1], np.int32)
        fac4 = Factor(ftype[1], vind4, data4)
        fg.add_factor(fac4)

        data5 = np.array([2.0 * Math.random(0.0, 1.0) - 1.0 for i in range(2)])
        vind5 = np.array([2], np.int32)
        fac5 = Factor(ftype[1], vind5, data5)
        fg.add_factor(fac5)

        data6 = np.array([1.0])
        vind6 = np.array([0], np.int32)
        fac6 = Factor(ftype[2], vind6, data6)
        fg.add_factor(fac6)

        data7 = np.array([1.0])
        vind7 = np.array([2], np.int32)
        fac7 = Factor(ftype[2], vind7, data7)
        fg.add_factor(fac7)

        samples.add_sample(fg)
        fg.connect_components()
        fg.compute_energies()

        infer_met = MAPInference(fg, TREE_MAX_PROD)
        infer_met.inference()

        fg_obs = infer_met.get_structured_outputs()
        labels.add_label(fg_obs)

        if show_data:
            state = fg_obs.get_data()
            print(state)

    return samples, labels
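# gen_data() above expects ftype to be a list of three TableFactorType objects.
# A hypothetical construction (the type ids, cardinalities and the weight
# lengths, chosen as number-of-states times feature length, are assumptions,
# not from this listing):
def make_factor_types():
    import numpy as np
    from modshogun import TableFactorType
    cards_pair = np.array([2, 2], np.int32)    # pairwise factor over two binary variables
    cards_unary = np.array([2], np.int32)      # unary factor over one binary variable
    return [TableFactorType(0, cards_pair, np.zeros(8)),    # 4 states x 2 features
            TableFactorType(1, cards_unary, np.zeros(4)),   # 2 states x 2 features
            TableFactorType(2, cards_unary, np.zeros(2))]   # 2 states x 1 (bias) feature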
def statistics_linear_time_mmd (n,dim,difference):
	from modshogun import RealFeatures
	from modshogun import MeanShiftDataGenerator
	from modshogun import GaussianKernel
	from modshogun import LinearTimeMMD
	from modshogun import PERMUTATION, MMD1_GAUSSIAN
	from modshogun import EuclideanDistance
	from modshogun import Statistics, Math

	# init seed for reproducibility
	Math.init_random(1)

	# note that the linear time statistic is designed for much larger datasets
	# so increase to get reasonable results

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, Shogun's kernel width uses a different parametrization,
	# therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable

	# Stream examples and merge them in order to compute median on joint sample
	features=gen_p.get_streamed_features(100)
	features=features.create_merged_copy(gen_q.get_streamed_features(100))

	# compute all pairwise distances
	dist=EuclideanDistance(features, features)
	distances=dist.get_distance_matrix()

	# compute median and determine kernel width (using shogun)
	median_distance=Statistics.matrix_median(distances, True)
	sigma=median_distance**2
	#print "median distance for Gaussian kernel:", sigma
	kernel=GaussianKernel(10,sigma)

	# mmd instance using streaming features, blocksize of 10000
	mmd=LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05
	statistic=mmd.compute_statistic()
	#print "test statistic:", statistic

	# do the same thing using two different ways to approximate the null-distribution:
	# sampling from the null and a Gaussian approximation (only for really large samples)
	alpha=0.05

	#print "computing p-value using sampling null"
	mmd.set_null_approximation_method(PERMUTATION)
	mmd.set_num_null_samples(50) # normally, far more iterations are needed
	p_value_boot=mmd.compute_p_value(statistic)
	#print "p_value_boot:", p_value_boot
	#print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

	#print "computing p-value using gaussian approximation"
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	p_value_gaussian=mmd.compute_p_value(statistic)
	#print "p_value_gaussian:", p_value_gaussian
	#print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance strongly depends on data/kernel
	mmd.set_null_approximation_method(PERMUTATION)
	mmd.set_num_null_samples(10) # normally, far more iterations are needed
	null_samples=mmd.sample_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)

	# compute type I and type II errors for Gaussian approximation
	# number of trials should be larger to compute tight confidence bounds
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	num_trials=5
	alpha=0.05 # test power
	typeIerrors=[0 for x in range(num_trials)]
	typeIIerrors=[0 for x in range(num_trials)]
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is a type I error
		mmd.set_simulate_h0(True)
		typeIerrors[i]=mmd.perform_test()>alpha
		mmd.set_simulate_h0(False)

		typeIIerrors[i]=mmd.perform_test()>alpha

	#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

	return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def statistics_quadratic_time_mmd(m, dim, difference):
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel, CustomKernel
    from modshogun import QuadraticTimeMMD
    from modshogun import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from modshogun import Statistics, IntVector, RealVector, Math

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(17)

    # number of examples kept low in order to make things fast

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    #gen_p.parallel.set_num_threads(1)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # stream some data from generator
    feat_p = gen_p.get_streamed_features(m)
    feat_q = gen_q.get_streamed_features(m)

    # set kernel a-priori. usually one would do some kernel selection. See
    # other examples for this.
    width = 10
    kernel = GaussianKernel(10, width)

    # create quadratic time mmd instance. Note that this constructor
    # copies p and q and does not reference them
    mmd = QuadraticTimeMMD(kernel, feat_p, feat_q)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    alpha = 0.05

    # using bootstrapping (slow, not the most reliable way. Consider pre-
    # computing the kernel when using it, see below).
    # Also, in practice, use at least 250 iterations
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(3)
    p_value_boot = mmd.perform_test()
    # reject if p-value is smaller than test level
    #print "bootstrap: p!=q: ", p_value_boot<alpha

    # using spectrum method. Use at least 250 samples from null.
    # This is consistent but sometimes breaks, always monitor type I error.
    # See tutorial for number of eigenvalues to use .
    # Only works with BIASED statistic
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_num_eigenvalues_spectrum(3)
    mmd.set_num_samples_sepctrum(250)
    p_value_spectrum = mmd.perform_test()
    # reject if p-value is smaller than test level
    #print "spectrum: p!=q: ", p_value_spectrum<alpha

    # using gamma method. This is a quick hack, which works most of the time
    # but is NOT guaranteed to. See tutorial for details.
    # Only works with BIASED statistic
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_GAMMA)
    p_value_gamma = mmd.perform_test()
    # reject if p-value is smaller than test level
    #print "gamma: p!=q: ", p_value_gamma<alpha

    # compute type I and II errors (use many more trials in practice).
    # Type I error is not necessary if one uses bootstrapping. We do it here
    # anyway, but note that this is an efficient way of computing it.
    # Also note that testing has to happen on
    # different data than kernel selection, but the linear time mmd does this
    # implicitly and we used a fixed kernel here.
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(5)
    num_trials = 5
    type_I_errors = RealVector(num_trials)
    type_II_errors = RealVector(num_trials)
    inds = int32(array([x for x in range(2 * m)]))  # numpy
    p_and_q = mmd.get_p_and_q()

    # use a precomputed kernel to be faster
    kernel.init(p_and_q, p_and_q)
    precomputed = CustomKernel(kernel)
    mmd.set_kernel(precomputed)
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is a type I error
        inds = random.permutation(inds)  # numpy permutation
        precomputed.add_row_subset(inds)
        precomputed.add_col_subset(inds)
        type_I_errors[i] = mmd.perform_test() > alpha
        precomputed.remove_row_subset()
        precomputed.remove_col_subset()

        # on normal data, this gives type II error
        type_II_errors[i] = mmd.perform_test() > alpha

    return type_I_errors.get(), type_II_errors.get(), p_value_boot, p_value_spectrum, p_value_gamma
Example #18
def statistics_hsic(n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math
    import numpy as np  # assumed at module level in the original

    # for reproducible results (the numpy seed might not be reproducible across
    # different OS/Python distributions)
    Math.init_random(1)
    np.random.seed(1)

    # note that the HSIC has to store kernel matrices
    # which upper bounds the sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(np.array([data[0]]))
    features_y = RealFeatures(np.array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, Shogun's kernel width uses a different parametrization,
    # therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = np.random.permutation(features_x.get_num_vectors()).astype(
        np.int32)
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2
    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha = 0.05

    #print "computing p-value using sampling null"
    hsic.set_null_approximation_method(PERMUTATION)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_num_null_samples(100)
    # sampling null allows usage of unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_gamma<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # sampling null, biased statistic
    #print "sampling null distribution using sample_null"
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()
    #print "null mean:", np.mean(null_samples)
    #print "null variance:", np.var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method):
	from modshogun import RealFeatures
	from modshogun import GaussianBlobsDataGenerator
	from modshogun import GaussianKernel, CombinedKernel
	from modshogun import LinearTimeMMD
	from modshogun import MMDKernelSelectionMedian
	from modshogun import MMDKernelSelectionMax
	from modshogun import MMDKernelSelectionOpt
	from modshogun import PERMUTATION, MMD1_GAUSSIAN
	from modshogun import EuclideanDistance
	from modshogun import Statistics, Math
	from numpy import pi  # assumed at module level in the original

	# init seed for reproducibility
	Math.init_random(1)

	# note that the linear time statistic is designed for much larger datasets,
	# so results for this low number will be bad (unstable, type I error wrong)
	m=1000
	distance=10
	stretch=5
	num_blobs=3
	angle=pi/4

	# streaming data generator
	gen_p=GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
	gen_q=GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

	# stream some data and plot
	num_plot=1000
	features=gen_p.get_streamed_features(num_plot)
	features=features.create_merged_copy(gen_q.get_streamed_features(num_plot))
	data=features.get_feature_matrix()

	#figure()
	#subplot(2,2,1)
	#grid(True)
	#plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
	#title('$X\sim p$')
	#subplot(2,2,2)
	#grid(True)
	#plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
	#title('$Y\sim q$')


	# create combined kernel with Gaussian kernels inside (Shogun's Gaussian kernel is
	# parametrized differently from the standard form, see documentation)
	sigmas=[2**x for x in range(-3,10)]
	widths=[x*x*2 for x in sigmas]
	combined=CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# mmd instance using streaming features, blocksize of 10000
	block_size=1000
	mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

	# kernel selection instance (this can easily be replaced by the other methods for
	# selecting single kernels)
	if selection_method=="opt":
		selection=MMDKernelSelectionOpt(mmd)
	elif selection_method=="max":
		selection=MMDKernelSelectionMax(mmd)
	elif selection_method=="median":
		selection=MMDKernelSelectionMedian(mmd)

	# print measures (just for information)
	# in case Opt: ratios of MMD and standard deviation
	# in case Max: MMDs for each kernel
	# Does not work for median method
	if selection_method!="median":
		ratios=selection.compute_measures()
		#print "Measures:", ratios

	#subplot(2,2,3)
	#plot(ratios)
	#title('Measures')

	# perform kernel selection
	kernel=selection.select_kernel()
	kernel=GaussianKernel.obtain_from_generic(kernel)
	#print "selected kernel width:", kernel.get_width()

	# compute type I and II errors (use many more trials). Type I error is only
	# estimated to check the MMD1_GAUSSIAN method for estimating the null
	# distribution. Note that testing has to happen on different data than
	# kernel selection, but the linear time mmd does this implicitly
	mmd.set_kernel(kernel)
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)

	# number of trials should be larger to compute tight confidence bounds
	num_trials=5
	alpha=0.05 # test power
	typeIerrors=[0 for x in range(num_trials)]
	typeIIerrors=[0 for x in range(num_trials)]
	for i in range(num_trials):
		# this effectively means that p=q - rejecting is a type I error
		mmd.set_simulate_h0(True)
		typeIerrors[i]=mmd.perform_test()>alpha
		mmd.set_simulate_h0(False)

		typeIIerrors[i]=mmd.perform_test()>alpha

	#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

	return kernel,typeIerrors,typeIIerrors
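# Hypothetical comparison driver (not part of the original): run the example with
# each selection strategy and print the width of the Gaussian kernel it picks.
#from numpy import pi
#for method in ["opt", "max", "median"]:
#	k, tI, tII = statistics_mmd_kernel_selection_single(1000, 10, 5, 3, pi/4, method)
#	print(method, k.get_width())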
def gen_data(ftype, num_samples, show_data = False):
	from modshogun import Math
	from modshogun import FactorType, Factor, TableFactorType, FactorGraph
	from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
	from modshogun import MAPInference, TREE_MAX_PROD

	Math.init_random(17)

	samples = FactorGraphFeatures(num_samples)
	labels = FactorGraphLabels(num_samples)

	for i in xrange(num_samples):
		vc = np.array([2,2,2], np.int32)
		fg = FactorGraph(vc)

		data1 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in xrange(2)])
		vind1 = np.array([0,1], np.int32)
		fac1 = Factor(ftype[0], vind1, data1)
		fg.add_factor(fac1)

		data2 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in xrange(2)])
		vind2 = np.array([1,2], np.int32)
		fac2 = Factor(ftype[0], vind2, data2)
		fg.add_factor(fac2)

		data3 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in xrange(2)])
		vind3 = np.array([0], np.int32)
		fac3 = Factor(ftype[1], vind3, data3)
		fg.add_factor(fac3)

		data4 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in xrange(2)])
		vind4 = np.array([1], np.int32)
		fac4 = Factor(ftype[1], vind4, data4)
		fg.add_factor(fac4)

		data5 = np.array([2.0*Math.random(0.0,1.0)-1.0 for i in xrange(2)])
		vind5 = np.array([2], np.int32)
		fac5 = Factor(ftype[1], vind5, data5)
		fg.add_factor(fac5)

		data6 = np.array([1.0])
		vind6 = np.array([0], np.int32)
		fac6 = Factor(ftype[2], vind6, data6)
		fg.add_factor(fac6)

		data7 = np.array([1.0])
		vind7 = np.array([2], np.int32)
		fac7 = Factor(ftype[2], vind7, data7)
		fg.add_factor(fac7)

		samples.add_sample(fg)
		fg.connect_components()
		fg.compute_energies()

		infer_met = MAPInference(fg, TREE_MAX_PROD)
		infer_met.inference()

		fg_obs = infer_met.get_structured_outputs()
		labels.add_label(fg_obs)

		if show_data:
			state = fg_obs.get_data()
			print state

	return samples, labels
def statistics_linear_time_mmd(n, dim, difference):
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel
    from modshogun import LinearTimeMMD
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # so increase to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, Shogun's kernel width uses a different parametrization,
    # therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different ways to approximate the null-distribution:
    # sampling from the null and a Gaussian approximation (only for really large samples)
    alpha = 0.05

    #print "computing p-value using sampling null"
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test sais p!=q:", p_value_boot<alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test sais p!=q:", p_value_gaussian<alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(10)  # normally, far more iterations are needed
    null_samples = mmd.sample_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power
    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is a type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)

        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
Example #22
def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat):
    from modshogun import CrossValidation, CrossValidationResult
    from modshogun import CrossValidationPrintOutput
    from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from modshogun import MulticlassAccuracy, F1Measure
    from modshogun import StratifiedCrossValidationSplitting
    from modshogun import MulticlassLabels
    from modshogun import RealFeatures, CombinedFeatures
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import MKLMulticlass
    from modshogun import Statistics, MSG_DEBUG, Math

    Math.init_random(1)

    # training data, combined features all on same data
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create MKL using LibSVM; due to a memory bug, interleaved optimization is not possible
    svm=MKLMulticlass(1.0,kernel,labels)
    svm.set_kernel(kernel)

    # splitting strategy for 3-fold cross-validation (for classification it is better
    # to use StratifiedCrossValidationSplitting, but the standard
    # CrossValidationSplitting is also available)
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium=MulticlassAccuracy()

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage=CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result=cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
Example #23
def statistics_hsic (n, difference, angle):
	from modshogun import RealFeatures
	from modshogun import DataGenerator
	from modshogun import GaussianKernel
	from modshogun import HSIC
	from modshogun import BOOTSTRAP, HSIC_GAMMA
	from modshogun import EuclideanDistance
	from modshogun import Math, Statistics, IntVector

	# init seed for reproducibility
	Math.init_random(1)

	# note that the HSIC has to store kernel matrices
	# which upper bounds the sample size

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(n,difference,angle)
	#plot(data[0], data[1], 'x');show()

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use for Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, Shogun's kernel width uses a different parametrization,
	# therefore 0.5*2*median_distance^2
	# Use a subset of data for that, only 200 elements. Median is stable
	subset=IntVector.randperm_vec(features_x.get_num_vectors())
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=Statistics.matrix_median(distances, True)
	sigma_y=median_distance**2
	#print "median distance for Gaussian kernel on x:", sigma_x
	#print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# perform test: compute p-value and test if null-hypothesis is rejected for
	# a test level of 0.05 using different methods to approximate
	# null-distribution
	statistic=hsic.compute_statistic()
	#print "HSIC:", statistic
	alpha=0.05

	#print "computing p-value using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	# normally, at least 250 iterations should be done, but that takes long
	hsic.set_bootstrap_iterations(100)
	# bootstrapping allows usage of unbiased or biased statistic
	p_value_boot=hsic.compute_p_value(statistic)
	thresh_boot=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_boot
	#print "threshold for 0.05 alpha:", thresh_boot
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend:", p_value_boot<alpha

	#print "computing p-value using gamma method"
	hsic.set_null_approximation_method(HSIC_GAMMA)
	p_value_gamma=hsic.compute_p_value(statistic)
	thresh_gamma=hsic.compute_threshold(alpha)
	#print "p_value:", p_value_gamma
	#print "threshold for 0.05 alpha:", thresh_gamma
	#print "p_value <", alpha, ", i.e. test sais p and q are dependend::", p_value_gamma<alpha

	# sample from null distribution (these may be plotted or whatsoever)
	# mean should be close to zero, variance strongly depends on data/kernel
	# bootstrapping, biased statistic
	#print "sampling null distribution using bootstrapping"
	hsic.set_null_approximation_method(BOOTSTRAP)
	hsic.set_bootstrap_iterations(100)
	null_samples=hsic.bootstrap_null()
	#print "null mean:", mean(null_samples)
	#print "null variance:", var(null_samples)
	#hist(null_samples, 100); show()

	return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples