def statistics_kmm (n,d):
    """Compute kernel mean matching weights on a random ``d`` x ``n`` matrix.

    Shogun's generator and the module-level ``random`` generator are both
    seeded, so repeated calls produce the same data.

    Args:
        n: second dimension passed to ``random.randn`` (number of columns).
        d: first dimension passed to ``random.randn`` (number of rows).

    Returns:
        The result of ``KernelMeanMatching.compute_weights()``.
    """
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel, MSG_DEBUG
    from modshogun import KernelMeanMatching
    from modshogun import Math

    # Seed both random sources for reproducibility.
    Math.init_random(1)
    random.seed(1)

    # Wrap the random matrix in Shogun's feature representation.
    samples = RealFeatures(random.randn(d, n))

    # Kernel width sigma=2 corresponds to 8 in Shogun's parametrization,
    # k(x,y)=exp(-||x-y||^2 / tau), as opposed to the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)); hence tau=2*sigma^2.
    tau = 8
    gaussian = GaussianKernel(10, tau)
    gaussian.init(samples, samples)

    # Fixed index sets handed to KernelMeanMatching.
    # NOTE(review): presumably these index feature vectors, which would
    # require n >= 10 -- confirm.
    first_indices = array([0, 1, 2, 3, 7, 8, 9], dtype=int32)
    second_indices = array([4, 5, 6], dtype=int32)
    matcher = KernelMeanMatching(gaussian, first_indices, second_indices)

    return matcher.compute_weights()
def evaluation_clustering (features=fea, ground_truth=gnd_raw, ncenters=10):
    """Cluster ``features`` and score the assignment against ``ground_truth``.

    Clustering and label assignment are delegated to the module-level
    helpers ``run_clustering`` and ``assign_labels``.

    Args:
        features: data handed to ``run_clustering`` / ``assign_labels``.
        ground_truth: raw labels wrapped in ``MulticlassLabels``.
        ncenters: number of cluster centers requested.

    Returns:
        Tuple ``(gnd, gnd_hat, accuracy)``: the wrapped ground truth, the
        assigned labels, and the clustering accuracy.
    """
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels
    from modshogun import Math

    # Reproducible results.
    Math.init_random(1)

    cluster_centers = run_clustering(features, ncenters)
    predicted = assign_labels(features, cluster_centers, ncenters)
    truth = MulticlassLabels(ground_truth)

    # best_map is invoked on the predictions before they are evaluated.
    accuracy_metric = ClusteringAccuracy()
    accuracy_metric.best_map(predicted, truth)
    score = accuracy_metric.evaluate(predicted, truth)

    # Mutual information is still computed, but deliberately not returned.
    # TODO mutual information does not work with serialization
    mi_metric = ClusteringMutualInformation()
    mi_metric.evaluate(predicted, truth)

    return truth, predicted, score
def statistics_kmm(n, d):
    """Return kernel mean matching weights for seeded random data.

    A ``d`` x ``n`` matrix is drawn with ``random.randn`` after seeding both
    Shogun's and the module-level ``random`` generator, so the output is
    reproducible.

    Args:
        n: number of columns of the random matrix.
        d: number of rows of the random matrix.

    Returns:
        The weights produced by ``KernelMeanMatching.compute_weights()``.
    """
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel, MSG_DEBUG
    from modshogun import KernelMeanMatching
    from modshogun import Math

    # Fix the seeds first so the data below is reproducible.
    Math.init_random(1)
    random.seed(1)
    raw = random.randn(d, n)

    # Shogun feature representation.
    feats = RealFeatures(raw)

    # Shogun parametrizes the Gaussian kernel as
    # k(x,y)=exp(-||x-y||^2 / tau) rather than the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so sigma=2 maps to tau=8.
    gauss = GaussianKernel(10, 8)
    gauss.init(feats, feats)

    # NOTE(review): the two hard-coded index arrays presumably select
    # feature vectors 0..9, which would require n >= 10 -- confirm.
    idx_a = array([0, 1, 2, 3, 7, 8, 9], dtype=int32)
    idx_b = array([4, 5, 6], dtype=int32)

    return KernelMeanMatching(gauss, idx_a, idx_b).compute_weights()
def evaluation_clustering(features=fea, ground_truth=gnd_raw, ncenters=10):
    """Evaluate a clustering of ``features`` against known labels.

    Uses the module-level helpers ``run_clustering`` and ``assign_labels``
    to produce predicted labels, then scores them with
    ``ClusteringAccuracy``.

    Args:
        features: data to cluster.
        ground_truth: raw reference labels (wrapped in ``MulticlassLabels``).
        ncenters: number of cluster centers.

    Returns:
        ``(gnd, gnd_hat, accuracy)``.
    """
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels
    from modshogun import Math

    Math.init_random(1)  # reproducible results

    centers = run_clustering(features, ncenters)
    labels_hat = assign_labels(features, centers, ncenters)
    labels_ref = MulticlassLabels(ground_truth)

    acc_eval = ClusteringAccuracy()
    acc_eval.best_map(labels_hat, labels_ref)  # must run before evaluate
    acc = acc_eval.evaluate(labels_hat, labels_ref)

    # The mutual information is evaluated but left out of the return value.
    # TODO mutual information does not work with serialization
    ClusteringMutualInformation().evaluate(labels_hat, labels_ref)

    return labels_ref, labels_hat, acc
def regression_gaussian_process_modular (n=100, n_test=100,
        x_range=6, x_range_test=10, noise_var=0.5, width=1, seed=1):
    """Fit an exact-inference Gaussian process to a noisy 1-D sine wave.

    Args:
        n: number of training points.
        n_test: number of test points.
        x_range: scale applied to the uniform random training inputs.
        x_range_test: the test grid is ``i / n_test * x_range_test`` for
            ``i`` in ``range(n_test)``.
        noise_var: factor applied to the standard-normal noise added to the
            training targets; also passed to ``lik.set_sigma``.
        width: kernel width; the kernel receives ``width * width * 2``.
        seed: seed for the module-level ``random`` generator.

    Returns:
        ``(alpha, diagonal, round(variance, 12), round(mean, 12), cholesky)``,
        or ``None`` if the Gaussian process classes cannot be imported.
    """
    from modshogun import RealFeatures, RegressionLabels, GaussianKernel, Math
    try:
        from modshogun import GaussianLikelihood, ZeroMean, \
            ExactInferenceMethod, GaussianProcessRegression
    except ImportError:
        print("Eigen3 needed for Gaussian Processes")
        return

    # Reproducible results.
    random.seed(seed)
    Math.init_random(17)

    # Easy regression data: one-dimensional noisy sine wave.
    train_x = random.rand(1, n) * x_range
    grid = [float(k) / n_test * x_range_test for k in range(n_test)]
    test_x = array([grid])
    test_y = sin(test_x)  # ground truth; only used by the disabled plotting
    train_y = sin(train_x) + random.randn(n) * noise_var

    # Shogun representation.
    targets = RegressionLabels(train_y[0])
    train_feats = RealFeatures(train_x)
    test_feats = RealFeatures(test_x)

    # GP specification; the kernel is given width * width * 2.
    gauss = GaussianKernel(10, width * width * 2)
    zero_mean = ZeroMean()
    likelihood = GaussianLikelihood()
    likelihood.set_sigma(noise_var)
    inference = ExactInferenceMethod(gauss, train_feats, zero_mean, targets,
                                     likelihood)

    # Train the GP.
    process = GaussianProcessRegression(inference)
    process.train()

    # Quantities exposed by the inference method.
    alpha = inference.get_alpha()
    diagonal = inference.get_diagonal_vector()
    cholesky = inference.get_cholesky()

    # Predictive mean and variance on the test grid.
    pred_mean = process.get_mean_vector(test_feats)
    pred_var = process.get_variance_vector(test_feats)

    return alpha, diagonal, round(pred_var, 12), round(pred_mean, 12), cholesky
def regression_gaussian_process_modular (n=100, n_test=100,
        x_range=6, x_range_test=10, noise_var=0.5, width=1, seed=1):
    """Train a Gaussian process regressor on a noisy sine wave.

    Training inputs are ``random.rand(1, n) * x_range``; test inputs are the
    evenly spaced values ``i / n_test * x_range_test``.

    Args:
        n: number of training samples.
        n_test: number of test samples.
        x_range: multiplier for the random training inputs.
        x_range_test: multiplier for the test grid.
        noise_var: multiplier for the Gaussian noise on the targets; also
            given to ``GaussianLikelihood.set_sigma``.
        width: kernel width (the kernel is built with ``width * width * 2``).
        seed: seed for the module-level ``random`` generator.

    Returns:
        ``(alpha, diagonal, round(variance, 12), round(mean, 12), cholesky)``;
        ``None`` when the GP classes are not importable.
    """
    from modshogun import RealFeatures, RegressionLabels, GaussianKernel, Math
    try:
        from modshogun import GaussianLikelihood, ZeroMean, \
            ExactInferenceMethod, GaussianProcessRegression
    except ImportError:
        print("Eigen3 needed for Gaussian Processes")
        return

    # Seed both generators for reproducible results.
    random.seed(seed)
    Math.init_random(17)

    # One-dimensional noisy sine wave.
    X = random.rand(1, n) * x_range
    test_points = []
    for step in range(n_test):
        test_points.append(float(step) / n_test * x_range_test)
    X_test = array([test_points])
    Y_test = sin(X_test)  # only needed by the disabled plotting code
    noise = random.randn(n) * noise_var
    Y = sin(X) + noise

    # Shogun objects for data and labels.
    labels = RegressionLabels(Y[0])
    feats_train = RealFeatures(X)
    feats_test = RealFeatures(X_test)

    # GP specification.
    shogun_width = width * width * 2
    kernel = GaussianKernel(10, shogun_width)
    mean_function = ZeroMean()
    gauss_lik = GaussianLikelihood()
    gauss_lik.set_sigma(noise_var)
    exact = ExactInferenceMethod(kernel, feats_train, mean_function, labels,
                                 gauss_lik)

    # Train.
    regressor = GaussianProcessRegression(exact)
    regressor.train()

    # Collect inference internals and test-set predictions.
    alpha = exact.get_alpha()
    diagonal = exact.get_diagonal_vector()
    cholesky = exact.get_cholesky()
    mean = regressor.get_mean_vector(feats_test)
    variance = regressor.get_variance_vector(feats_test)

    return alpha, diagonal, round(variance, 12), round(mean, 12), cholesky
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    """Grid-search LibSVM parameters with cross-validation on random data.

    The parameter tree comes from the module-level ``create_param_tree``.

    Args:
        num_subsets: number of subsets for the stratified splitting.
        num_vectors: number of random feature vectors (and labels).
        dim_vectors: dimensionality of each feature vector.

    Returns:
        ``(classifier, result, casted.mean)``: the classifier with the best
        parameters applied, the raw evaluation result, and its mean.
    """
    # Seed both generators for reproducibility.
    Math.init_random(1)
    random.seed(1)

    # Random (meaningless) data: dim_vectors x num_vectors.
    feats = RealFeatures()
    feats.set_feature_matrix(random.rand(dim_vectors, num_vectors))

    # Two classes: +1 on even indices, -1 on odd ones.
    targets = BinaryLabels(num_vectors)
    for idx in range(num_vectors):
        sign = 1 if idx % 2 == 0 else -1
        targets.set_label(idx, sign)

    svm = LibSVM()

    # Stratified splitting and accuracy as the evaluation criterion.
    splitter = StratifiedCrossValidationSplitting(targets, num_subsets)
    criterion = ContingencyTableEvaluation(ACCURACY)

    # Cross-validation used during model selection: a single run.
    cv = CrossValidation(svm, feats, targets, splitter, criterion)
    cv.set_num_runs(1)

    # Model parameter selection (state printing disabled).
    tree = create_param_tree()
    search = GridSearchModelSelection(cv, tree)
    best = search.select_model(False)
    best.apply_to_machine(svm)

    # Larger number of runs to have tighter confidence intervals.
    cv.set_num_runs(10)
    cv.set_conf_int_alpha(0.01)
    outcome = cv.evaluate()
    typed_outcome = CrossValidationResult.obtain_from_generic(outcome)

    return svm, outcome, typed_outcome.mean
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
    """Select LibSVM parameters by grid search and evaluate the winner.

    Random features and alternating binary labels are generated, a
    cross-validated grid search over ``create_param_tree()`` picks the best
    parameters, and the tuned classifier is evaluated again with more runs.

    Args:
        num_subsets: number of cross-validation subsets.
        num_vectors: number of feature vectors.
        dim_vectors: dimension of each feature vector.

    Returns:
        ``(classifier, result, casted.mean)``.
    """
    # Init seeds for reproducibility.
    Math.init_random(1)
    random.seed(1)

    # Some (non-sense) random data.
    data_matrix = random.rand(dim_vectors, num_vectors)
    features = RealFeatures()
    features.set_feature_matrix(data_matrix)

    # Labels alternate between the two classes, starting with +1.
    labels = BinaryLabels(num_vectors)
    for position in range(num_vectors):
        if position % 2 == 0:
            labels.set_label(position, 1)
        else:
            labels.set_label(position, -1)

    classifier = LibSVM()

    # Cross-validation setup: stratified splits, accuracy criterion, one run.
    cross = CrossValidation(
        classifier,
        features,
        labels,
        StratifiedCrossValidationSplitting(labels, num_subsets),
        ContingencyTableEvaluation(ACCURACY),
    )
    cross.set_num_runs(1)

    # Grid search over the parameter tree; apply the best combination.
    model_selection = GridSearchModelSelection(cross, create_param_tree())
    print_state = False
    winner = model_selection.select_model(print_state)
    winner.apply_to_machine(classifier)

    # More runs give tighter confidence intervals.
    cross.set_num_runs(10)
    cross.set_conf_int_alpha(0.01)
    result = cross.evaluate()
    casted = CrossValidationResult.obtain_from_generic(result)

    return classifier, result, casted.mean
def evaluation_clustering_simple(n_data=100, sqrt_num_blobs=4, distance=5):
    """Cluster generated Gaussian blobs and return the clustering accuracy.

    Approximate ground-truth labels are derived by snapping each coordinate
    of a sample to the closest value in
    ``range(0, sqrt_num_blobs * distance, distance)``.

    Args:
        n_data: number of samples streamed from the blob generator.
        sqrt_num_blobs: blobs per axis; ``sqrt_num_blobs**2`` centers are used.
        distance: passed to the generator and used as the grid spacing.

    Returns:
        The clustering accuracy.

    Raises:
        AssertionError: if the accuracy is not greater than 0.8.
    """
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels, GaussianBlobsDataGenerator
    from modshogun import Math

    # Reproducible results.
    Math.init_random(1)

    # Produce some Gaussian blobs to cluster.
    ncenters = sqrt_num_blobs**2
    generator = GaussianBlobsDataGenerator(sqrt_num_blobs, distance, 1, 1)
    features = generator.get_streamed_features(n_data)
    matrix = features.get_feature_matrix()

    # Approximate "ground truth": index of the closest grid coordinate
    # along each of the first two rows of the feature matrix.
    grid = array(range(0, sqrt_num_blobs * distance, distance))

    def closest(value):
        return abs(grid - value).argmin()

    row_idx = [closest(v) for v in matrix[0]]
    col_idx = [closest(v) for v in matrix[1]]
    combined = []
    for k in range(n_data):
        combined.append(row_idx[k] * sqrt_num_blobs + col_idx[k])
    ground_truth = array(combined, dtype="float64")

    centers = run_clustering(features, ncenters)
    predicted = assign_labels(features, centers, ncenters)
    truth = MulticlassLabels(ground_truth)

    scorer = ClusteringAccuracy()
    scorer.best_map(predicted, truth)
    accuracy = scorer.evaluate(predicted, truth)

    # In this case we know that the clustering has to be very good.
    assert (accuracy > 0.8)

    # Mutual information is evaluated but not returned.
    # TODO add multiclass labels and MI once the serialization works
    ClusteringMutualInformation().evaluate(predicted, truth)

    return accuracy
def evaluation_clustering_simple (n_data=100, sqrt_num_blobs=4, distance=5):
    """Generate Gaussian blobs, cluster them, and return the accuracy.

    Ground-truth labels are approximated by assigning every sample to the
    nearest coordinate of the grid
    ``range(0, sqrt_num_blobs * distance, distance)`` on each axis.

    Args:
        n_data: number of samples to stream.
        sqrt_num_blobs: number of blobs per axis.
        distance: generator distance and grid spacing.

    Returns:
        The clustering accuracy.

    Raises:
        AssertionError: when the accuracy does not exceed 0.8.
    """
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels, GaussianBlobsDataGenerator
    from modshogun import Math

    Math.init_random(1)  # reproducible results

    # Blob generator with stretch=1 and angle=1.
    ncenters = sqrt_num_blobs**2
    stretch, angle = 1, 1
    blobs = GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
    features = blobs.get_streamed_features(n_data)
    X = features.get_feature_matrix()

    # Approximate "ground truth" labels via the closest grid coordinate.
    coords = array(range(0, sqrt_num_blobs * distance, distance))
    nearest_0 = []
    for x in X[0]:
        nearest_0.append(abs(coords - x).argmin())
    nearest_1 = []
    for x in X[1]:
        nearest_1.append(abs(coords - x).argmin())
    label_values = [nearest_0[i] * sqrt_num_blobs + nearest_1[i]
                    for i in range(n_data)]
    ground_truth = array(label_values, dtype="float64")

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    accuracy_eval = ClusteringAccuracy()
    accuracy_eval.best_map(gnd_hat, gnd)
    accuracy = accuracy_eval.evaluate(gnd_hat, gnd)

    # The clustering is expected to be very good on this data.
    assert(accuracy>0.8)

    # Evaluated for completeness; the value is not part of the result.
    # TODO add multiclass labels and MI once the serialization works
    mi_eval = ClusteringMutualInformation()
    mi_eval.evaluate(gnd_hat, gnd)

    return accuracy
def evaluation_multiclassovrevaluation_modular(train_fname=traindat, label_fname=label_traindat):
    """Train a multiclass LibLinear SVM and score it one-vs-rest.

    The classifier is trained and applied on the same features, then scored
    with ``MulticlassOVREvaluation`` using ROC and accuracy evaluators.

    Args:
        train_fname: CSV file holding the training features.
        label_fname: CSV file holding the multiclass labels.

    Returns:
        ``(mean_roc, mean_accuracy, predicted_labels, svm)``.
    """
    from modshogun import MulticlassOVREvaluation,ROCEvaluation
    from modshogun import MulticlassLibLinear,RealFeatures,ContingencyTableEvaluation,ACCURACY
    from modshogun import MulticlassLabels, Math, CSVFile

    Math.init_random(1)

    # Labels are loaded before the features, as in the original call order.
    truth = MulticlassLabels(CSVFile(label_fname))
    train_features = RealFeatures(CSVFile(train_fname))

    svm = MulticlassLibLinear(1.0, train_features, truth)
    svm.parallel.set_num_threads(1)
    svm.train()
    predictions = svm.apply()

    # One-vs-rest evaluation with a ROC evaluator ...
    roc_ovr = MulticlassOVREvaluation(ROCEvaluation())
    mean_roc = roc_ovr.evaluate(predictions, truth)

    # ... and again with plain accuracy.
    accuracy_ovr = MulticlassOVREvaluation(ContingencyTableEvaluation(ACCURACY))
    mean_accuracy = accuracy_ovr.evaluate(predictions, truth)

    return mean_roc, mean_accuracy, predictions, svm
def statistics_quadratic_time_mmd (m,dim,difference):
    """Run quadratic-time MMD two-sample tests on mean-shift data.

    Two streaming generators produce ``m`` samples each; the test is run
    with the permutation, spectrum and gamma null approximations, and then
    repeated ``num_trials`` times on a precomputed kernel to collect
    per-trial error indicators.

    Args:
        m: number of samples streamed from each generator.
        dim: dimension passed to both ``MeanShiftDataGenerator`` instances.
        difference: mean shift of the second generator (the first uses 0).

    Returns:
        ``(type_I_errors, type_II_errors, p_value_null, p_value_spectrum,
        p_value_gamma)``; the first two are the contents of the per-trial
        ``RealVector`` objects.
    """
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel, CustomKernel
    from modshogun import QuadraticTimeMMD
    from modshogun import PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, BIASED_DEPRECATED
    from modshogun import Statistics, IntVector, RealVector, Math

    # Init seeds for reproducibility.
    Math.init_random(1)
    random.seed(17)

    # Number of examples kept low in order to make things fast.
    # Streaming data generators for mean shift distributions.
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream some data from the generators.
    feat_p = gen_p.get_streamed_features(m)
    feat_q = gen_q.get_streamed_features(m)

    # Set kernel a-priori. Usually one would do some kernel selection.
    width = 10
    kernel = GaussianKernel(10, width)

    # Create quadratic time mmd instance. Note that this constructor
    # copies p and q and does not reference them.
    mmd = QuadraticTimeMMD(kernel, feat_p, feat_q)

    # Test level used for the per-trial indicators below.
    alpha = 0.05

    # Permutation (slow, not the most reliable way; consider precomputing
    # the kernel, see below). In practice, use at least 250 iterations.
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(3)
    p_value_null = mmd.perform_test()

    # Spectrum method. Use at least 250 samples from null. This is
    # consistent but sometimes breaks, always monitor type I error.
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_num_eigenvalues_spectrum(3)
    mmd.set_num_samples_spectrum(250)
    p_value_spectrum = mmd.perform_test()

    # Gamma method. This is a quick hack, which works most of the time but
    # is NOT guaranteed to. Only works with BIASED_DEPRECATED statistic.
    mmd.set_statistic_type(BIASED_DEPRECATED)
    mmd.set_null_approximation_method(MMD2_GAMMA)
    p_value_gamma = mmd.perform_test()

    # Compute type I and II error (use many more trials in practice).
    # Type I error is not necessary if one uses permutation; it is done
    # here anyway as an efficient way of computing it.
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(5)
    num_trials = 5
    type_I_errors = RealVector(num_trials)
    type_II_errors = RealVector(num_trials)
    inds = int32(array([x for x in range(2 * m)]))  # numpy
    p_and_q = mmd.get_p_and_q()

    # Use a precomputed kernel to be faster.
    kernel.init(p_and_q, p_and_q)
    precomputed = CustomKernel(kernel)
    mmd.set_kernel(precomputed)

    for i in range(num_trials):
        # Shuffling rows and columns with the same permutation effectively
        # means that p=q - rejecting is type I error.
        # NOTE(review): the indicator stored is "p-value > alpha"; if
        # perform_test() returns a p-value, a rejection would be
        # "p-value < alpha" -- confirm the intended comparison.
        inds = random.permutation(inds)  # numpy permutation
        precomputed.add_row_subset(inds)
        precomputed.add_col_subset(inds)
        type_I_errors[i] = mmd.perform_test() > alpha
        precomputed.remove_row_subset()
        precomputed.remove_col_subset()

        # On normal data, this gives type II error.
        type_II_errors[i] = mmd.perform_test() > alpha

    # Fix: the second element used to repeat type_I_errors.get(), so the
    # type II results were computed but never returned.
    return (type_I_errors.get(), type_II_errors.get(), p_value_null,
            p_value_spectrum, p_value_gamma)
def statistics_hsic(n, difference, angle):
    """Run an HSIC independence test on generated two-dimensional data.

    Kernel widths are set to the squared median pairwise distance, computed
    on a random subset of at most 200 samples. The p-value and threshold are
    computed with both the permutation and the gamma null approximation.

    Args:
        n: passed to ``DataGenerator.generate_sym_mix_gauss``.
        difference: passed to ``DataGenerator.generate_sym_mix_gauss``.
        angle: passed to ``DataGenerator.generate_sym_mix_gauss``.

    Returns:
        ``(p_value_boot, thresh_boot, p_value_gamma, thresh_gamma,
        statistic, null_samples)``.
    """
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # For reproducible results (the numpy one might not be reproducible
    # across different OS/Python distributions).
    Math.init_random(1)
    np.random.seed(1)

    # Note that HSIC has to store kernel matrices, which upper-bounds the
    # sample size. Use the data generator class to produce example data.
    samples = DataGenerator.generate_sym_mix_gauss(n, difference, angle)

    # Shogun feature representation: one single-row matrix per coordinate.
    features_x = RealFeatures(np.array([samples[0]]))
    features_y = RealFeatures(np.array([samples[1]]))

    # Median data distance for the Gaussian kernel width: normally
    # 0.5*median_distance, but Shogun's kernel width is parametrized
    # differently, therefore 0.5*2*median_distance^2. Only a subset of 200
    # elements is used; the median is stable.
    order = np.random.permutation(features_x.get_num_vectors())
    chosen = order.astype(np.int32)[0:200]

    def squared_median_distance(feats):
        # Temporarily restrict feats to the subset while measuring.
        feats.add_subset(chosen)
        metric = EuclideanDistance(feats, feats)
        pairwise = metric.get_distance_matrix()
        feats.remove_subset()
        return np.median(pairwise)**2

    sigma_x = squared_median_distance(features_x)
    sigma_y = squared_median_distance(features_y)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # Test statistic and test level.
    statistic = hsic.compute_statistic()
    alpha = 0.05

    # Sampling null via permutation. Normally at least 250 iterations
    # should be done, but that takes long.
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)

    # Gamma approximation of the null distribution.
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)

    # Sample from the null distribution (these may be plotted); the mean
    # should be close to zero, the variance depends on data/kernel.
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()

    return (p_value_boot, thresh_boot, p_value_gamma, thresh_gamma,
            statistic, null_samples)
def statistics_quadratic_time_mmd(m, dim, difference):
    """Run quadratic-time MMD two-sample tests on mean-shift data.

    Two streaming generators produce ``m`` samples each; the test is run
    with the bootstrap, spectrum and gamma null approximations, and then
    repeated ``num_trials`` times on a precomputed kernel to collect
    per-trial error indicators.

    Args:
        m: number of samples streamed from each generator.
        dim: dimension passed to both ``MeanShiftDataGenerator`` instances.
        difference: mean shift of the second generator (the first uses 0).

    Returns:
        ``(type_I_errors, type_II_errors, p_value_boot, p_value_spectrum,
        p_value_gamma)``; the first two are the contents of the per-trial
        ``RealVector`` objects.
    """
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel, CustomKernel
    from modshogun import QuadraticTimeMMD
    from modshogun import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED, UNBIASED
    from modshogun import Statistics, IntVector, RealVector, Math

    # Init seeds for reproducibility.
    Math.init_random(1)
    random.seed(17)

    # Number of examples kept low in order to make things fast.
    # Streaming data generators for mean shift distributions.
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream some data from the generators.
    feat_p = gen_p.get_streamed_features(m)
    feat_q = gen_q.get_streamed_features(m)

    # Set kernel a-priori. Usually one would do some kernel selection.
    width = 10
    kernel = GaussianKernel(10, width)

    # Create quadratic time mmd instance. Note that this constructor
    # copies p and q and does not reference them.
    mmd = QuadraticTimeMMD(kernel, feat_p, feat_q)

    # Test level used for the per-trial indicators below.
    alpha = 0.05

    # Bootstrapping (slow, not the most reliable way; consider precomputing
    # the kernel, see below). In practice, use at least 250 iterations.
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(3)
    p_value_boot = mmd.perform_test()

    # Spectrum method. Use at least 250 samples from null. This is
    # consistent but sometimes breaks, always monitor type I error.
    # Only works with BIASED statistic.
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_SPECTRUM)
    mmd.set_num_eigenvalues_spectrum(3)
    # NOTE(review): the method name is spelled "sepctrum"; confirm it
    # matches the installed modshogun API before renaming it.
    mmd.set_num_samples_sepctrum(250)
    p_value_spectrum = mmd.perform_test()

    # Gamma method. This is a quick hack, which works most of the time but
    # is NOT guaranteed to. Only works with BIASED statistic.
    mmd.set_statistic_type(BIASED)
    mmd.set_null_approximation_method(MMD2_GAMMA)
    p_value_gamma = mmd.perform_test()

    # Compute type I and II error (use many more trials in practice).
    # Type I error is not necessary if one uses bootstrapping; it is done
    # here anyway as an efficient way of computing it.
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(5)
    num_trials = 5
    type_I_errors = RealVector(num_trials)
    type_II_errors = RealVector(num_trials)
    inds = int32(array([x for x in range(2 * m)]))  # numpy
    p_and_q = mmd.get_p_and_q()

    # Use a precomputed kernel to be faster.
    kernel.init(p_and_q, p_and_q)
    precomputed = CustomKernel(kernel)
    mmd.set_kernel(precomputed)

    for i in range(num_trials):
        # Shuffling rows and columns with the same permutation effectively
        # means that p=q - rejecting is type I error.
        # NOTE(review): the indicator stored is "p-value > alpha"; if
        # perform_test() returns a p-value, a rejection would be
        # "p-value < alpha" -- confirm the intended comparison.
        inds = random.permutation(inds)  # numpy permutation
        precomputed.add_row_subset(inds)
        precomputed.add_col_subset(inds)
        type_I_errors[i] = mmd.perform_test() > alpha
        precomputed.remove_row_subset()
        precomputed.remove_col_subset()

        # On normal data, this gives type II error.
        type_II_errors[i] = mmd.perform_test() > alpha

    # Fix: the second element used to repeat type_I_errors.get(), so the
    # type II results were computed but never returned.
    return (type_I_errors.get(), type_II_errors.get(), p_value_boot,
            p_value_spectrum, p_value_gamma)
def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method):
    """Select a single Gaussian kernel for a linear-time MMD test.

    A combined kernel of Gaussian kernels with widths ``2 * sigma**2`` for
    ``sigma`` in ``2**-3 .. 2**9`` is built, one of them is chosen with the
    requested selection method, and the test is then run ``num_trials``
    times with and without simulating the null hypothesis.

    The linear time statistic is designed for large datasets; results for a
    low ``m`` will be bad (unstable, type I error wrong).

    Args:
        m: number of samples used by ``LinearTimeMMD``.
        distance: blob distance for both generators.
        stretch: stretch of the blobs of the second generator.
        num_blobs: number of blobs passed to both generators.
        angle: rotation angle of the blobs of the second generator.
        selection_method: ``"opt"``, ``"max"`` or ``"median"``.

    Returns:
        ``(kernel, typeIerrors, typeIIerrors)``: the selected Gaussian
        kernel and two lists with one indicator per trial.

    Raises:
        ValueError: if ``selection_method`` is not one of the three
            supported names.
    """
    from modshogun import RealFeatures
    from modshogun import GaussianBlobsDataGenerator
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import LinearTimeMMD
    from modshogun import MMDKernelSelectionMedian
    from modshogun import MMDKernelSelectionMax
    from modshogun import MMDKernelSelectionOpt
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # Fix: the arguments used to be overwritten here with hard-coded values
    # (m=1000, distance=10, stretch=5, num_blobs=3, angle=pi/4), so whatever
    # the caller passed was silently ignored. They are now honoured.

    selectors = {
        "opt": MMDKernelSelectionOpt,
        "max": MMDKernelSelectionMax,
        "median": MMDKernelSelectionMedian,
    }
    if selection_method not in selectors:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(
            "unknown selection_method %r; expected one of 'opt', 'max', "
            "'median'" % (selection_method,))

    # Init seed for reproducibility.
    Math.init_random(1)

    # Streaming data generators.
    gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # Stream some data (originally for plotting). The calls are kept
    # because they presumably advance the generators' state, which the
    # test below then continues from -- removing them could change results.
    num_plot = 1000
    features = gen_p.get_streamed_features(num_plot)
    features = features.create_merged_copy(gen_q.get_streamed_features(num_plot))
    data = features.get_feature_matrix()

    # Combined kernel with Gaussian kernels inside (Shogun's Gaussian
    # kernel is different to the standard form, see documentation).
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    combined = CombinedKernel()
    for kernel_width in widths:
        combined.append_kernel(GaussianKernel(10, kernel_width))

    # MMD instance using streaming features, blocksize of 1000.
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # Kernel selection instance for the requested method.
    selection = selectors[selection_method](mmd)

    # Compute measures (just for information); the result is not used.
    # In case Opt: ratios of MMD and standard deviation.
    # In case Max: MMDs for each kernel.
    # Does not work for the median method.
    if selection_method != "median":
        ratios = selection.compute_measures()

    # Perform kernel selection.
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)

    # Compute type I and II error (use many more trials). Type I error is
    # only estimated to check the MMD1_GAUSSIAN method for estimating the
    # null distribution. Note that testing has to happen on different data
    # than kernel selection, but the linear time mmd does this implicitly.
    mmd.set_kernel(kernel)
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # Number of trials should be larger to compute tight confidence bounds.
    num_trials = 5
    alpha = 0.05  # test level

    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # Simulating H0 effectively means that p=q - rejecting is type I error.
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)

        typeIIerrors[i] = mmd.perform_test() > alpha

    return kernel, typeIerrors, typeIIerrors
def gen_data(ftype, num_samples, show_data = False):
    """Generate random factor graphs together with their MAP labelings.

    Each sample is a factor graph built from ``np.array([2, 2, 2])`` with
    seven factors: two using ``ftype[0]`` on variables (0, 1) and (1, 2),
    three using ``ftype[1]`` on variables 0, 1 and 2, and two using
    ``ftype[2]`` on variables 0 and 2. Data for the first five factors is
    drawn from ``Math.random``; the last two use the constant ``[1.0]``.
    The label of a sample is the output of tree max-product MAP inference.

    Args:
        ftype: indexable holding the three factor types used above.
        num_samples: number of factor graphs to generate.
        show_data: if true, print the inferred state of every sample.

    Returns:
        ``(samples, labels)`` as ``FactorGraphFeatures`` and
        ``FactorGraphLabels``.
    """
    from modshogun import Math
    from modshogun import FactorType, Factor, TableFactorType, FactorGraph
    from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
    from modshogun import MAPInference, TREE_MAX_PROD

    Math.init_random(17)

    samples = FactorGraphFeatures(num_samples)
    labels = FactorGraphLabels(num_samples)

    # Fix: `xrange` and the `print` statement were Python 2 only; `range`
    # and `print(...)` behave the same here and also run on Python 3. The
    # comprehension variable no longer reuses the loop variable `i`.
    for i in range(num_samples):
        # NOTE(review): presumably the cardinalities of the three
        # variables -- confirm against FactorGraph.
        vc = np.array([2,2,2], np.int32)
        fg = FactorGraph(vc)

        # Factors of type ftype[0] on variable pairs, random data in
        # 2.0*Math.random(0.0,1.0)-1.0.
        data1 = np.array([2.0*Math.random(0.0,1.0)-1.0 for _ in range(2)])
        vind1 = np.array([0,1], np.int32)
        fac1 = Factor(ftype[0], vind1, data1)
        fg.add_factor(fac1)

        data2 = np.array([2.0*Math.random(0.0,1.0)-1.0 for _ in range(2)])
        vind2 = np.array([1,2], np.int32)
        fac2 = Factor(ftype[0], vind2, data2)
        fg.add_factor(fac2)

        # Factors of type ftype[1] on single variables, random data.
        data3 = np.array([2.0*Math.random(0.0,1.0)-1.0 for _ in range(2)])
        vind3 = np.array([0], np.int32)
        fac3 = Factor(ftype[1], vind3, data3)
        fg.add_factor(fac3)

        data4 = np.array([2.0*Math.random(0.0,1.0)-1.0 for _ in range(2)])
        vind4 = np.array([1], np.int32)
        fac4 = Factor(ftype[1], vind4, data4)
        fg.add_factor(fac4)

        data5 = np.array([2.0*Math.random(0.0,1.0)-1.0 for _ in range(2)])
        vind5 = np.array([2], np.int32)
        fac5 = Factor(ftype[1], vind5, data5)
        fg.add_factor(fac5)

        # Factors of type ftype[2] on variables 0 and 2, constant data.
        data6 = np.array([1.0])
        vind6 = np.array([0], np.int32)
        fac6 = Factor(ftype[2], vind6, data6)
        fg.add_factor(fac6)

        data7 = np.array([1.0])
        vind7 = np.array([2], np.int32)
        fac7 = Factor(ftype[2], vind7, data7)
        fg.add_factor(fac7)

        # The graph is registered as a sample before inference is run on it.
        samples.add_sample(fg)
        fg.connect_components()
        fg.compute_energies()

        infer_met = MAPInference(fg, TREE_MAX_PROD)
        infer_met.inference()

        fg_obs = infer_met.get_structured_outputs()
        labels.add_label(fg_obs)

        if show_data:
            state = fg_obs.get_data()
            print(state)

    return samples, labels
def gen_data(ftype, num_samples, show_data=False):
    """Build random factor graphs and label them by MAP inference.

    Every graph is created from ``np.array([2, 2, 2])`` and receives seven
    factors in a fixed order (see ``factor_plan`` below). The label is the
    structured output of tree max-product MAP inference on the graph.

    Args:
        ftype: indexable holding three factor types.
        num_samples: number of graphs to generate.
        show_data: if true, print the inferred state of every graph.

    Returns:
        ``(samples, labels)`` as ``FactorGraphFeatures`` and
        ``FactorGraphLabels``.
    """
    from modshogun import Math
    from modshogun import FactorType, Factor, TableFactorType, FactorGraph
    from modshogun import FactorGraphObservation, FactorGraphLabels, FactorGraphFeatures
    from modshogun import MAPInference, TREE_MAX_PROD

    Math.init_random(17)

    def random_pair():
        # Two draws of 2.0 * Math.random(0.0, 1.0) - 1.0.
        return np.array([2.0 * Math.random(0.0, 1.0) - 1.0 for _ in range(2)])

    def constant_one():
        return np.array([1.0])

    # (index into ftype, variable indices, data factory), in the order the
    # factors are added; the order also fixes the sequence of random draws.
    factor_plan = (
        (0, [0, 1], random_pair),
        (0, [1, 2], random_pair),
        (1, [0], random_pair),
        (1, [1], random_pair),
        (1, [2], random_pair),
        (2, [0], constant_one),
        (2, [2], constant_one),
    )

    samples = FactorGraphFeatures(num_samples)
    labels = FactorGraphLabels(num_samples)

    for _ in range(num_samples):
        graph = FactorGraph(np.array([2, 2, 2], np.int32))

        for type_index, variables, make_data in factor_plan:
            factor_data = make_data()
            variable_index = np.array(variables, np.int32)
            graph.add_factor(Factor(ftype[type_index], variable_index,
                                    factor_data))

        # Register the graph as a sample, then prepare it for inference.
        samples.add_sample(graph)
        graph.connect_components()
        graph.compute_energies()

        inference = MAPInference(graph, TREE_MAX_PROD)
        inference.inference()

        observation = inference.get_structured_outputs()
        labels.add_label(observation)

        if show_data:
            print(observation.get_data())

    return samples, labels
def statistics_hsic(n, difference, angle):
    """Run an HSIC test on data from ``generate_sym_mix_gauss``.

    The two rows of the generated data are wrapped as separate feature
    objects, a Gaussian kernel width is chosen for each from the median
    pairwise distance, and p-value and threshold are computed with both the
    bootstrap and the gamma null approximation.

    Args:
        n: First argument of ``DataGenerator.generate_sym_mix_gauss``
            (presumably the number of samples -- confirm against shogun).
        difference: Second argument of the data generator.
        angle: Third argument of the data generator.

    Returns:
        Tuple ``(p_value_boot, thresh_boot, p_value_gamma, thresh_gamma,
        statistic, null_samples)``.
    """
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import BOOTSTRAP, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Math, Statistics, IntVector

    # Seed for reproducibility.
    Math.init_random(1)

    # HSIC stores kernel matrices, which bounds the usable sample size.
    sample = DataGenerator.generate_sym_mix_gauss(n, difference, angle)

    # One feature object per row of the generated data.
    features_x = RealFeatures(array([sample[0]]))
    features_y = RealFeatures(array([sample[1]]))

    # The median distance is estimated on the first 200 entries of a random
    # permutation of the vector indices; the same subset serves x and y.
    permutation = IntVector.randperm_vec(features_x.get_num_vectors())
    subset = permutation[0:200]

    def squared_median_distance(feats):
        # Shogun's Gaussian kernel is exp(-||x-y||^2 / tau), so the squared
        # median distance is used directly as the width.
        feats.add_subset(subset)
        metric = EuclideanDistance(feats, feats)
        pairwise = metric.get_distance_matrix()
        feats.remove_subset()
        return Statistics.matrix_median(pairwise, True) ** 2

    width_x = squared_median_distance(features_x)
    width_y = squared_median_distance(features_y)

    kernel_x = GaussianKernel(10, width_x)
    kernel_y = GaussianKernel(10, width_y)

    test = HSIC(kernel_x, kernel_y, features_x, features_y)

    statistic = test.compute_statistic()
    level = 0.05

    # Bootstrap approximation of the null distribution; 100 iterations keep
    # the run time short (more would normally be used).
    test.set_null_approximation_method(BOOTSTRAP)
    test.set_bootstrap_iterations(100)
    p_value_boot = test.compute_p_value(statistic)
    thresh_boot = test.compute_threshold(level)

    # Gamma approximation of the null distribution.
    test.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = test.compute_p_value(statistic)
    thresh_gamma = test.compute_threshold(level)

    # Draw samples from the bootstrapped null distribution.
    test.set_null_approximation_method(BOOTSTRAP)
    test.set_bootstrap_iterations(100)
    null_samples = test.bootstrap_null()

    return (
        p_value_boot,
        thresh_boot,
        p_value_gamma,
        thresh_gamma,
        statistic,
        null_samples,
    )
def statistics_linear_time_mmd(n, dim, difference):
    """Run a linear-time MMD two-sample test on streamed mean-shift data.

    Two ``MeanShiftDataGenerator`` streams are compared with
    ``LinearTimeMMD`` using a Gaussian kernel whose width is chosen by the
    median heuristic. P-values are computed with the permutation and the
    Gaussian null approximation, the null distribution is sampled, and
    type I / type II errors are recorded over a few trials.

    Args:
        n: Passed to ``LinearTimeMMD`` as its fourth argument (presumably
            the number of samples taken from each stream -- confirm).
        dim: Second argument of both data generators.
        difference: First argument of the second data generator; the first
            generator uses 0.

    Returns:
        Tuple ``(statistic, p_value_boot, p_value_gaussian, null_samples,
        typeIerrors, typeIIerrors)``. The last two are lists of booleans,
        one entry per trial, that are true when the respective error
        occurred.
    """
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel
    from modshogun import LinearTimeMMD
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # Seed for reproducibility.
    Math.init_random(1)

    # The linear time statistic is designed for large data sets, so n should
    # be increased to obtain reasonable results.
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Median heuristic for the kernel width: stream 100 examples from each
    # generator and merge them into one joint sample of 200.
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # Shogun's Gaussian kernel is exp(-||x-y||^2 / tau), so the squared
    # median distance is used directly as the width.
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance ** 2
    kernel = GaussianKernel(10, sigma)

    # MMD instance on the streaming generators, block size 10000.
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    statistic = mmd.compute_statistic()

    # Significance level of the test.
    alpha = 0.05

    # P-value from sampling the null distribution (far more null samples
    # are normally needed).
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(50)
    p_value_boot = mmd.compute_p_value(statistic)

    # P-value from the Gaussian approximation (only valid for large samples).
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)

    # Samples from the null distribution.
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(10)
    null_samples = mmd.sample_null()

    # Type I and type II errors for the Gaussian approximation. The number
    # of trials should be larger to obtain tight confidence bounds.
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    typeIerrors = []
    typeIIerrors = []
    for _ in range(num_trials):
        # perform_test() without arguments returns a p-value; the null
        # hypothesis is rejected when that p-value is below alpha.
        # With H0 simulated (p == q) a rejection is a type I error. The
        # comparison used to be `> alpha`, which counted non-rejections.
        mmd.set_simulate_h0(True)
        typeIerrors.append(mmd.perform_test() < alpha)

        # Without simulated H0, failing to reject is a type II error.
        mmd.set_simulate_h0(False)
        typeIIerrors.append(mmd.perform_test() > alpha)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def evaluation_cross_validation_multiclass_storage(traindat=traindat, label_traindat=label_traindat):
    """Cross-validate a multiclass MKL machine and read back stored results.

    A ``CrossValidationMulticlassStorage`` with an ``F1Measure`` binary
    evaluation is attached to the cross-validation; after evaluation the
    entries at indices ``(0, 0, 0)`` are read from it.

    Args:
        traindat: Data handed to ``RealFeatures``.
        label_traindat: Labels handed to ``MulticlassLabels``.

    Returns:
        Tuple of ``get_fold_ROC(0, 0, 0)`` and
        ``get_fold_evaluation_result(0, 0, 0, 0)`` from the storage. The
        second value belongs to the only appended binary evaluation, which
        is ``F1Measure``.
    """
    from modshogun import CrossValidation, CrossValidationResult
    from modshogun import CrossValidationPrintOutput
    from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from modshogun import MulticlassAccuracy, F1Measure
    from modshogun import StratifiedCrossValidationSplitting
    from modshogun import MulticlassLabels
    from modshogun import RealFeatures, CombinedFeatures
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import MKLMulticlass
    from modshogun import Statistics, MSG_DEBUG, Math

    Math.init_random(1)

    # Combined features: the same feature object is appended three times,
    # once for every sub-kernel.
    base_features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    for _ in range(3):
        comb_features.append_feature_obj(base_features)
    labels = MulticlassLabels(label_traindat)

    # Combined kernel made of Gaussians with different widths.
    combined_kernel = CombinedKernel()
    for width in (0.1, 1, 2):
        combined_kernel.append_kernel(GaussianKernel(10, width))

    # Multiclass MKL machine; the kernel is passed to the constructor and
    # set again explicitly afterwards.
    machine = MKLMulticlass(1.0, combined_kernel, labels)
    machine.set_kernel(combined_kernel)

    # Stratified splitting into 3 subsets, evaluated by multiclass accuracy.
    splitting = StratifiedCrossValidationSplitting(labels, 3)
    criterion = MulticlassAccuracy()

    cross_validation = CrossValidation(
        machine, comb_features, labels, splitting, criterion
    )
    cross_validation.set_autolock(False)

    # Storage output that additionally records an F1 measure.
    storage = CrossValidationMulticlassStorage()
    storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(storage)
    cross_validation.set_num_runs(3)

    # The value returned by evaluate() is not needed; the results of
    # interest are read from the storage object.
    cross_validation.evaluate()

    roc_0_0_0 = storage.get_fold_ROC(0, 0, 0)
    eval_0_0_0 = storage.get_fold_evaluation_result(0, 0, 0, 0)

    return roc_0_0_0, eval_0_0_0
def statistics_linear_time_mmd(n, dim, difference):
    """Perform a linear-time MMD two-sample test on two mean-shift streams.

    The kernel width comes from the median heuristic on a merged sample of
    both streams. The function computes the test statistic, p-values under
    the permutation and the Gaussian null approximation, samples of the
    null distribution, and per-trial type I / type II error indicators.

    Args:
        n: Fourth argument of ``LinearTimeMMD`` (presumably the number of
            samples per stream -- confirm against the shogun API).
        dim: Second argument of both ``MeanShiftDataGenerator`` objects.
        difference: First argument of the second generator; the first
            generator is created with 0.

    Returns:
        Tuple ``(statistic, p_value_boot, p_value_gaussian, null_samples,
        typeIerrors, typeIIerrors)``, where the last two are lists holding
        one boolean per trial (true when that error occurred).
    """
    from modshogun import RealFeatures
    from modshogun import MeanShiftDataGenerator
    from modshogun import GaussianKernel
    from modshogun import LinearTimeMMD
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # Seed for reproducibility.
    Math.init_random(1)

    # Streaming data generators. The linear time statistic is meant for
    # much larger data sets, so increase n to get reasonable results.
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Joint sample for the median heuristic: 100 streamed examples from
    # each generator, merged into one feature object.
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # All pairwise distances on the joint sample.
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # Shogun parametrizes the Gaussian kernel as exp(-||x-y||^2 / tau),
    # hence the squared median distance is the width.
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance ** 2
    kernel = GaussianKernel(10, sigma)

    # MMD on the streaming generators with a block size of 10000.
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    statistic = mmd.compute_statistic()

    # Test level (significance level), not the test power.
    alpha = 0.05

    # Null distribution by permutation; normally far more null samples are
    # needed.
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(50)
    p_value_boot = mmd.compute_p_value(statistic)

    # Gaussian approximation of the null distribution (only appropriate
    # for really large samples).
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)

    # Draw samples from the null distribution.
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(10)
    null_samples = mmd.sample_null()

    # Error rates for the Gaussian approximation; use more trials for tight
    # confidence bounds.
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    typeIerrors = [False] * num_trials
    typeIIerrors = [False] * num_trials
    for trial in range(num_trials):
        # Simulating H0 means p == q, so rejecting is a type I error.
        # perform_test() returns a p-value and rejection means the p-value
        # is below alpha; the previous `> alpha` recorded the opposite.
        mmd.set_simulate_h0(True)
        typeIerrors[trial] = mmd.perform_test() < alpha

        # Under the alternative, not rejecting is a type II error.
        mmd.set_simulate_h0(False)
        typeIIerrors[trial] = mmd.perform_test() > alpha

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors