def multiclass_c45classifiertree_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3;
    # integer division keeps the slice bounds valid indices under Python 3
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    vsubset = subset[0:subset.size // 3]
    trsubset = subset[subset.size // 3:subset.size]

    # C4.5 tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)
    c.prune_tree(feats_train, train_labels)
    train_labels.remove_subset()
    feats_train.remove_subset()

    # classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
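# A minimal, hypothetical driver for the example above. The original file
# defines the module-level defaults (traindat, testdat, label_traindat,
# feattypes) before the function definition, since defaults are evaluated at
# def time; the file paths and the feature-type vector below are illustrative
# assumptions, not part of the example itself.
from numpy import array

traindat = '../data/fm_train_real.dat'                 # assumed training features (CSV)
testdat = '../data/fm_test_real.dat'                   # assumed test features (CSV)
label_traindat = '../data/label_train_multiclass.dat'  # assumed multiclass labels (CSV)
feattypes = array([False, False])                      # False marks a continuous attribute

if __name__ == '__main__':
    tree, predictions, certainty = multiclass_c45classifiertree_modular(
        traindat, testdat, label_traindat, feattypes)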
def stochasticgbmachine_modular(train=traindat, train_labels=label_traindat, ft=feat_types):
    try:
        from modshogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
        import numpy as np
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats = RealFeatures(CSVFile(train))
    labels = RegressionLabels(CSVFile(train_labels))

    # divide into training (90%) and test (10%) datasets;
    # the slice bound must be an integer
    p = np.random.permutation(labels.get_num_labels())
    num = int(labels.get_num_labels() * 0.9)

    # weak learner: a depth-1 CART tree (a decision stump)
    cart = CARTree()
    cart.set_feature_types(ft)
    cart.set_max_depth(1)

    # gradient boosting machine: 500 iterations, learning rate 0.01,
    # subset fraction 0.6, squared loss
    loss = SquaredLoss()
    s = StochasticGBMachine(cart, loss, 500, 0.01, 0.6)

    # train on the first 90% of the permuted data
    feats.add_subset(np.int32(p[0:num]))
    labels.add_subset(np.int32(p[0:num]))
    s.set_labels(labels)
    s.train(feats)
    feats.remove_subset()
    labels.remove_subset()

    # apply to the held-out 10%
    feats.add_subset(np.int32(p[num:len(p)]))
    labels.add_subset(np.int32(p[num:len(p)]))
    output = s.apply_regression(feats)
    feats.remove_subset()
    labels.remove_subset()

    return s, output
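# A hypothetical driver, assuming housing-style regression data on disk. The
# paths and the all-continuous feature-type vector are illustrative
# assumptions; in the original file the corresponding assignments precede the
# function definition, since the defaults are evaluated at def time.
import numpy as np

traindat = '../data/housing_features_train.dat'      # assumed feature CSV
label_traindat = '../data/housing_labels_train.dat'  # assumed label CSV
feat_types = np.array([False] * 13)                  # False marks a continuous attribute

if __name__ == '__main__':
    machine, predictions = stochasticgbmachine_modular(traindat, label_traindat, feat_types)
    print(predictions.get_labels())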
import numpy as np
from numpy import array, zeros, pi, int32, floor, random
from numpy.random import gamma
from matplotlib.pyplot import (figure, subplot, gca, grid, plot, title,
                               xlabel, ylabel, hist, axvline, subplots_adjust)
from matplotlib.ticker import MaxNLocator
from modshogun import RealFeatures, DataGenerator, GaussianKernel, HSIC
from modshogun import EuclideanDistance, PERMUTATION, HSIC_GAMMA


def hsic_graphical():
    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi / 30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance to use for the Gaussian kernel width:
    # 0.5 * median_distance normally (factor two in the Gaussian kernel).
    # However, Shogun's kernel width is parametrized differently, therefore
    # use 0.5 * 2 * median_distance^2. A subset of only 200 elements suffices
    # since the median is stable.
    subset = int32(random.permutation(features_x.get_num_vectors()))
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance ** 2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance ** 2

    print("median distance for Gaussian kernel on x:", sigma_x)
    print("median distance for Gaussian kernel on y:", sigma_y)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # Create HSIC instance. Note that this is a convenience constructor which
    # copies feature data: features_x and features_y are not the objects used
    # inside hsic. This is only for user-friendliness and is usually fine.
    # Below, the alternative distribution is sampled, which means new feature
    # objects have to be created in each iteration (slow); normally the
    # alternative distribution is not sampled.
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create the HSIC instance every time since feature objects are
        # copied due to the use of the convenience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution: permutation, biased statistic
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot = hsic.sample_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()

    # sample gamma with fitted parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level; indices must be integers
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    # empirical type I error: fraction of null samples above the threshold,
    # i.e. the rejection rate under H0 (should be close to alpha)
    type_one_error_boot = sum(null_samples_boot > thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma > thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot sampled null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot gamma null distribution with threshold
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
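# Hypothetical driver for the graphical example; show() comes from
# matplotlib.pyplot, assumes an interactive backend, and blocks until the
# figure window is closed.
from matplotlib.pyplot import show

if __name__ == '__main__':
    hsic_graphical()
    show()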
def statistics_hsic(n, difference, angle):
    import numpy as np
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Math

    # for reproducible results (the numpy seed might not be reproducible
    # across different OS/Python distributions)
    Math.init_random(1)
    np.random.seed(1)

    # note that HSIC has to store kernel matrices, which upper-bounds the
    # feasible sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x'); show()

    # create shogun feature representation
    features_x = RealFeatures(np.array([data[0]]))
    features_y = RealFeatures(np.array([data[1]]))

    # compute median data distance to use for the Gaussian kernel width:
    # 0.5 * median_distance normally (factor two in the Gaussian kernel).
    # However, Shogun's kernel width is parametrized differently, therefore
    # use 0.5 * 2 * median_distance^2. A subset of only 200 elements suffices
    # since the median is stable.
    subset = np.random.permutation(features_x.get_num_vectors()).astype(np.int32)
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance ** 2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance ** 2
    #print("median distance for Gaussian kernel on x:", sigma_x)
    #print("median distance for Gaussian kernel on y:", sigma_y)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test whether the null hypothesis is
    # rejected for a test level of 0.05, using different methods to
    # approximate the null distribution
    statistic = hsic.compute_statistic()
    #print("HSIC:", statistic)
    alpha = 0.05

    #print("computing p-value by sampling the null distribution")
    hsic.set_null_approximation_method(PERMUTATION)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_num_null_samples(100)
    # sampling the null allows usage of the unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print("p_value:", p_value_boot)
    #print("threshold for 0.05 alpha:", thresh_boot)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot < alpha)

    #print("computing p-value using gamma method")
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print("p_value:", p_value_gamma)
    #print("threshold for 0.05 alpha:", thresh_gamma)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma < alpha)

    # sample from null distribution (these may be plotted or whatsoever);
    # mean should be close to zero, variance strongly depends on data/kernel.
    # sampled null, biased statistic
    #print("sampling null distribution using sample_null")
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()
    #print("null mean:", np.mean(null_samples))
    #print("null variance:", np.var(null_samples))
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
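# A usage sketch for the test above. The parameter values mirror those used
# in hsic_graphical (m=250, difference=3, angle=pi/30) and are illustrative.
from numpy import pi

if __name__ == '__main__':
    p_boot, t_boot, p_gamma, t_gamma, stat, null_samples = statistics_hsic(250, 3, pi / 30)
    print('HSIC statistic:', stat)
    print('permutation p-value:', p_boot, '  gamma p-value:', p_gamma)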
# An older variant of statistics_hsic, apparently written against the
# pre-3.x Shogun statistics API (BOOTSTRAP, set_bootstrap_iterations,
# bootstrap_null rather than PERMUTATION, set_num_null_samples, sample_null);
# kept for reference.
def statistics_hsic(n, difference, angle):
    from numpy import array
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import BOOTSTRAP, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Math, Statistics, IntVector

    # init seed for reproducibility
    Math.init_random(1)

    # note that HSIC has to store kernel matrices, which upper-bounds the
    # feasible sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x'); show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance to use for the Gaussian kernel width:
    # 0.5 * median_distance normally (factor two in the Gaussian kernel).
    # However, Shogun's kernel width is parametrized differently, therefore
    # use 0.5 * 2 * median_distance^2. A subset of only 200 elements suffices
    # since the median is stable.
    subset = IntVector.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance ** 2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance ** 2
    #print("median distance for Gaussian kernel on x:", sigma_x)
    #print("median distance for Gaussian kernel on y:", sigma_y)

    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test whether the null hypothesis is
    # rejected for a test level of 0.05, using different methods to
    # approximate the null distribution
    statistic = hsic.compute_statistic()
    #print("HSIC:", statistic)
    alpha = 0.05

    #print("computing p-value using bootstrapping")
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of the unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print("p_value:", p_value_boot)
    #print("threshold for 0.05 alpha:", thresh_boot)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot < alpha)

    #print("computing p-value using gamma method")
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print("p_value:", p_value_gamma)
    #print("threshold for 0.05 alpha:", thresh_gamma)
    #print("p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma < alpha)

    # sample from null distribution (these may be plotted or whatsoever);
    # mean should be close to zero, variance strongly depends on data/kernel.
    # bootstrapping, biased statistic
    #print("sampling null distribution using bootstrapping")
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    #print("null mean:", mean(null_samples))
    #print("null variance:", var(null_samples))
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
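# For reference, a minimal NumPy-only sketch of the biased HSIC estimator
# that these examples compute via Shogun: HSIC_b = trace(K H L H) / m^2 with
# centering matrix H = I - (1/m) * 11^T (Gretton et al.). Shogun's
# compute_statistic may apply a different scaling convention, so values are
# comparable only up to a constant factor.
import numpy as np

def hsic_biased(x, y, sigma_x, sigma_y):
    # x, y: 1-d samples of equal length m
    m = len(x)
    # Gaussian kernel matrices; width follows Shogun's k(a,b) = exp(-|a-b|^2 / sigma)
    dx = np.subtract.outer(x, x) ** 2
    dy = np.subtract.outer(y, y) ** 2
    K = np.exp(-dx / sigma_x)
    L = np.exp(-dy / sigma_y)
    H = np.eye(m) - np.ones((m, m)) / m  # centering matrix
    return np.trace(K @ H @ L @ H) / m ** 2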