def feature_function():
    from modshogun import RealFeatures
    from modshogun import CSVFile
    import numpy as np

    # 3x3 random matrix
    feat_arr = np.random.rand(3, 3)

    # initialize RealFeatures from numpy array
    features = RealFeatures(feat_arr)

    # get the full feature matrix
    print features.get_feature_matrix()

    # get selected column of matrix
    print features.get_feature_vector(1)

    # get number of rows (feature dimension)
    print features.get_num_features()

    # get number of columns (number of vectors)
    print features.get_num_vectors()

    # initialize RealFeatures from a csv file
    feats_from_csv = RealFeatures(CSVFile("csv/feature.csv"))
    print "csv is ", feats_from_csv.get_feature_matrix()
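# A minimal sketch (assuming modshogun is importable) of the column-major
# convention used above: RealFeatures treats each COLUMN of the input array
# as one sample, so a d-by-n matrix yields n vectors of dimension d.
from modshogun import RealFeatures
import numpy as np

mat = np.arange(6.0).reshape(2, 3)  # 2 features (rows), 3 vectors (columns)
feats = RealFeatures(mat)
assert feats.get_num_features() == 2
assert feats.get_num_vectors() == 3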
def transfer_multitask_l12_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup
    from numpy import hstack
    try:
        from modshogun import MultitaskL12LogisticRegression
    except ImportError:
        print("MultitaskL12LogisticRegression not available")
        exit(0)

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskL12LogisticRegression(0.1, 0.1, features, labels, task_group)
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.set_max_iter(10)
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply_regression().get_labels()

    return out
def transfer_multitask_leastsquares_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup
    try:
        from modshogun import MultitaskLeastSquaresRegression
    except ImportError:
        print("MultitaskLeastSquaresRegression not available")
        exit(0)

    features = RealFeatures(traindat)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
    mtlsr.set_regularization(1)  # use regularization ratio
    mtlsr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()

    return out
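# A minimal sketch of how Task/TaskGroup split the training vectors above
# (assuming, as the usage suggests, that Task(start, end) covers the
# half-open index range [start, end)):
from modshogun import Task, TaskGroup

group = TaskGroup()
group.append_task(Task(0, 5))   # vectors 0..4 form task one
group.append_task(Task(5, 10))  # vectors 5..9 form task two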
def transfer_multitask_clustered_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    from numpy import hstack, sin, cos
    try:
        from modshogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit(0)

    features = RealFeatures(hstack((traindat, sin(traindat), cos(traindat))))
    labels = BinaryLabels(hstack((label_train, label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 3)
    task_two = Task(n_vectors // 3, 2 * n_vectors // 3)
    task_three = Task(2 * n_vectors // 3, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)
    task_group.append_task(task_three)

    mtlr = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels, task_group, 2)
    #mtlr.io.set_loglevel(MSG_DEBUG)
    mtlr.set_tolerance(1e-3)  # use 1e-3 tolerance
    mtlr.set_max_iter(100)
    mtlr.train()
    mtlr.set_current_task(0)
    #print mtlr.get_w()
    out = mtlr.apply_regression().get_labels()

    return out
def load_data(num_train_samples=7291, m_data_dict=data_dict):
    from modshogun import RealFeatures, MulticlassLabels
    import numpy

    train_vec = m_data_dict['yTr'][0][:num_train_samples].astype(numpy.float64)
    train_labels = MulticlassLabels(train_vec)
    test_vec = m_data_dict['yTe'][0].astype(numpy.float64)
    test_labels = MulticlassLabels(test_vec)
    print "#train_labels = " + str(train_labels.get_num_labels())
    print "#test_labels = " + str(test_labels.get_num_labels())

    train_mat = m_data_dict['xTr'][:, :num_train_samples].astype(numpy.float64)
    train_features = RealFeatures(train_mat)
    test_mat = m_data_dict['xTe'].astype(numpy.float64)
    test_features = RealFeatures(test_mat)
    print "#train_vectors = " + str(train_features.get_num_vectors())
    print "#test_vectors = " + str(test_features.get_num_vectors())
    print "data dimension = " + str(test_features.get_num_features())

    return train_features, train_labels, test_features, test_labels
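# A minimal sketch of how the data_dict default above might be produced
# (the file path is hypothetical; the keys xTr/yTr/xTe/yTe match the usage
# inside load_data):
import scipy.io

data_dict = scipy.io.loadmat('../../data/usps.mat')  # hypothetical path
train_features, train_labels, test_features, test_labels = load_data(1000, data_dict)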
def multiclass_c45classifiertree_modular(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    vsubset = subset[1:subset.size / 3]
    trsubset = subset[1 + subset.size / 3:subset.size]

    # C4.5 tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)
    c.prune_tree(feats_train, train_labels)
    train_labels.remove_subset()
    feats_train.remove_subset()

    # classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
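# A minimal sketch of the ft/feattypes argument above (following the
# convention used in Shogun's decision-tree examples: False marks a
# continuous attribute, True a nominal one):
from numpy import array

feattypes = array([False, False])  # e.g. two continuous features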
def transfer_multitask_group_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import RegressionLabels, RealFeatures, Task, TaskGroup, MultitaskLSRegression

    features = RealFeatures(traindat)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors / 2)
    task_two = Task(n_vectors / 2, n_vectors)
    task_group = TaskGroup()
    task_group.add_task(task_one)
    task_group.add_task(task_two)

    mtlsr = MultitaskLSRegression(0.1, features, labels, task_group)
    mtlsr.train()
    mtlsr.set_current_task(0)
    out = mtlsr.apply_regression().get_labels()

    return out
def transfer_multitask_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    from modshogun import BinaryLabels, RealFeatures, Task, TaskGroup, MultitaskLogisticRegression
    from numpy import hstack

    features = RealFeatures(hstack((traindat, traindat)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors / 2)
    task_two = Task(n_vectors / 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out
def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'):
    try:
        from modshogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        print 'Error importing modshogun or other required modules. Please, verify their installation.'
        return

    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    #print 'number of examples = %d' % features.get_num_vectors()
    #print 'number of features = %d' % features.get_num_features()

    assert features.get_num_vectors() == labels.get_num_labels()

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    #lmnn.io.set_loglevel(MSG_DEBUG)
    print 'Training LMNN, this will take about two minutes...'
    lmnn.train()
    print 'Training done!'

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title('LMNN objective during training for the multiclass digits data set')
    pyplot.show()
def train(self, images, labels):
    """ Train eigenfaces """
    print "Train...",

    # copy labels
    self._labels = labels

    # transform the numpy matrix to shogun structure
    features = RealFeatures(images)

    # PCA
    self.pca = PCA()
    # set dimension
    self.pca.set_target_dim(self._num_components)
    # compute PCA
    self.pca.init(features)

    # project every training sample into the PCA subspace
    for sampleIdx in range(features.get_num_vectors()):
        v = features.get_feature_vector(sampleIdx)
        p = self.pca.apply_to_feature_vector(v)
        self._projections.insert(sampleIdx, p)

    print "ok!"
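# A minimal sketch of a matching prediction step (a hypothetical helper, not
# shown in the original class): project a query image with the trained PCA
# and return the label of the nearest stored projection.
def predict(self, image):
    import numpy as np
    # project the flat float64 image vector into eigenface space
    p = self.pca.apply_to_feature_vector(image)
    # nearest neighbour among the stored training projections
    dists = [np.linalg.norm(p - q) for q in self._projections]
    nearest = int(np.argmin(dists))
    return self._labels.get_label(nearest)  # assumes _labels is a Shogun labels object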
def statistics_hsic(n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import PERMUTATION, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math
    import numpy as np

    # for reproducible results (the numpy seed might not be reproducible across
    # different OS/Python distributions)
    Math.init_random(1)
    np.random.seed(1)

    # note that the HSIC has to store kernel matrices,
    # which upper bounds the sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(np.array([data[0]]))
    features_y = RealFeatures(np.array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, Shogun's kernel width is parametrized differently,
    # therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = np.random.permutation(features_x.get_num_vectors()).astype(np.int32)
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2
    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha = 0.05

    #print "computing p-value using sampling null"
    hsic.set_null_approximation_method(PERMUTATION)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_num_null_samples(100)
    # sampling null allows usage of unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot < alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # sampling null, biased statistic
    #print "sampling null distribution using sample_null"
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(100)
    null_samples = hsic.sample_null()
    #print "null mean:", np.mean(null_samples)
    #print "null variance:", np.var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
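# A minimal usage sketch (the parameter values are illustrative and mirror
# the graphical example elsewhere in this file: 250 samples, difference 3,
# rotation pi/30):
from numpy import pi

p_boot, t_boot, p_gamma, t_gamma, stat, null_samples = statistics_hsic(250, 3, pi / 30)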
def hsic_graphical():
    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi / 30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, Shogun's kernel width is parametrized differently,
    # therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = int32(array([x for x in range(features_x.get_num_vectors())]))  # numpy
    subset = random.permutation(subset)  # numpy permutation
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2
    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create HSIC instance. Note that this is a convenience constructor which copies
    # feature data. features_x and features_y are not the ones used in hsic.
    # This is only for user-friendliness. Usually, it is ok to do this.
    # Below, the alternative distribution is sampled, which means
    # that new feature objects have to be created in each iteration (slow).
    # However, normally, the alternative distribution is not sampled.
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create hsic instance every time since feature objects are copied due to
        # usage of convenience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # permutation, biased statistic
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot = hsic.sample_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    title('Data, rotation=$\pi$/' + str(1 / angle * pi) + '\nm=' + str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma < thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
    acc = evaluator.evaluate(predicted_labels, test_labels)
    err = 1 - acc

    return err

features_file = '../data/fm_ape_gut.txt'
labels_file = '../data/label_ape_gut.txt'

features = RealFeatures(CSVFile(features_file))
labels = MulticlassLabels(CSVFile(labels_file))

# reduce the number of features to use so that the training is faster but still
# the results of feature selection are significant
fm = features.get_feature_matrix()
features = RealFeatures(fm[:500, :])

assert features.get_num_vectors() == labels.get_num_labels()

print('Number of examples = %d, number of features = %d.' % (features.get_num_vectors(), features.get_num_features()))

visualize_tdsne(features, labels)
lmnn = diagonal_lmnn(features, labels, max_iter=1200)

diagonal_transform = lmnn.get_linear_transform()
diagonal = numpy.diag(diagonal_transform)
print('%d out of %d elements are non-zero' % (numpy.sum(diagonal != 0), diagonal.shape[0]))

statistics = lmnn.get_statistics()
pyplot.plot(statistics.obj.get())
pyplot.show()
#!/usr/bin/python

from modshogun import CSVFile, RealFeatures, RescaleFeatures
from scipy.linalg import solve_triangular, cholesky, sqrtm, inv
import matplotlib.pyplot as pyplot
import numpy

# load wine features
features = RealFeatures(CSVFile('../data/fm_wine.dat'))

print('%d vectors with %d features.' % (features.get_num_vectors(), features.get_num_features()))
print('original features mean = ' + str(numpy.mean(features.get_feature_matrix(), axis=1)))

# rescale the features to [0,1]
feature_rescaling = RescaleFeatures()
feature_rescaling.init(features)
features.add_preprocessor(feature_rescaling)
features.apply_preprocessor()

print('mean after rescaling = ' + str(numpy.mean(features.get_feature_matrix(), axis=1)))

# remove mean from data
data = features.get_feature_matrix()
data = data.T
data -= numpy.mean(data, axis=0)
print numpy.mean(data, axis=0)

fig, axarr = pyplot.subplots(1, 2)
axarr[0].matshow(numpy.cov(data.T))
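# A minimal sketch of the whitening step the imports above prepare for
# (one possible variant: Cholesky whitening of the centred data; the
# original script's exact choice is not shown here):
L = cholesky(numpy.cov(data.T), lower=True)        # cov = L L^T
white = solve_triangular(L, data.T, lower=True).T  # cov(white) ~ identity
axarr[1].matshow(numpy.cov(white.T))
pyplot.show()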
# variant of statistics_hsic using the older BOOTSTRAP API
def statistics_hsic(n, difference, angle):
    from modshogun import RealFeatures
    from modshogun import DataGenerator
    from modshogun import GaussianKernel
    from modshogun import HSIC
    from modshogun import BOOTSTRAP, HSIC_GAMMA
    from modshogun import EuclideanDistance
    from modshogun import Math, Statistics, IntVector
    from numpy import array

    # init seed for reproducibility
    Math.init_random(1)

    # note that the HSIC has to store kernel matrices,
    # which upper bounds the sample size

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x');show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, Shogun's kernel width is parametrized differently,
    # therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = IntVector.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]
    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2
    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2
    #print "median distance for Gaussian kernel on x:", sigma_x
    #print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate
    # null-distribution
    statistic = hsic.compute_statistic()
    #print "HSIC:", statistic
    alpha = 0.05

    #print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value_boot = hsic.compute_p_value(statistic)
    thresh_boot = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_boot
    #print "threshold for 0.05 alpha:", thresh_boot
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_boot < alpha

    #print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value_gamma = hsic.compute_p_value(statistic)
    thresh_gamma = hsic.compute_threshold(alpha)
    #print "p_value:", p_value_gamma
    #print "threshold for 0.05 alpha:", thresh_gamma
    #print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value_gamma < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    #print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)
    #hist(null_samples, 100); show()

    return p_value_boot, thresh_boot, p_value_gamma, thresh_gamma, statistic, null_samples
def plot_data(x, y, axis):
    # header reconstructed from the fragment below: scatter each class in its own colour
    for idx, val in enumerate(numpy.unique(y)):
        xi = x[y == val]
        axis.scatter(xi[:, 0], xi[:, 1], s=50, facecolors='none', edgecolors=COLS[idx])

def plot_neighborhood_graph(x, nn, axis):
    for i in xrange(x.shape[0]):
        xs = [x[i, 0], x[nn[1, i], 0]]
        ys = [x[i, 1], x[nn[1, i], 1]]
        axis.plot(xs, ys, COLS[int(y[i])])

figure, axarr = pyplot.subplots(3, 1)
x, y = sandwich_data()

features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features()))
assert features.get_num_vectors() == labels.get_num_labels()

distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)

lmnn = LMNN(features, labels, k)
lmnn.set_maxiter(10000)
lmnn.train()
#!/usr/bin/env python2.7
#
# This software is distributed under BSD 3-clause license (see LICENSE file).
#
# Copyright (C) 2014 Thoralf Klein
#

from modshogun import RealFeatures, BinaryLabels, LibLinear
from numpy import random, mean

# 100 training vectors with 30 features each (Shogun stores samples as columns)
X_train = RealFeatures(random.randn(30, 100))
Y_train = BinaryLabels(random.randn(X_train.get_num_vectors()))

svm = LibLinear(1.0, X_train, Y_train)
svm.train()

Y_pred = svm.apply_binary(X_train)
print "accuracy:", mean(Y_train.get_labels() == Y_pred.get_labels())
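# A minimal sketch of the same check via Shogun's built-in evaluation class
# (AccuracyMeasure, with signature evaluate(predicted, ground_truth)):
from modshogun import AccuracyMeasure

print "accuracy (shogun):", AccuracyMeasure().evaluate(Y_pred, Y_train)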