def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'): try: from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG import matplotlib.pyplot as pyplot except ImportError: print 'Error importing shogun or other required modules. Please, verify their installation.' return features = RealFeatures(load_compressed_features(fname_features).T) labels = MulticlassLabels(CSVFile(fname_labels)) # print 'number of examples = %d' % features.get_num_vectors() # print 'number of features = %d' % features.get_num_features() assert(features.get_num_vectors() == labels.get_num_labels()) # train LMNN lmnn = LMNN(features, labels, k) lmnn.set_correction(100) # lmnn.io.set_loglevel(MSG_DEBUG) print 'Training LMNN, this will take about two minutes...' lmnn.train() print 'Training done!' # plot objective obtained during training statistics = lmnn.get_statistics() pyplot.plot(statistics.obj.get()) pyplot.grid(True) pyplot.xlabel('Iterations') pyplot.ylabel('LMNN objective') pyplot.title('LMNN objective during training for the multiclass digits data set') pyplot.show()
def transfer_multitask_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat): from shogun import BinaryLabels, RealFeatures, Task, TaskGroup try: from shogun import MultitaskLogisticRegression except ImportError: print("MultitaskLogisticRegression not available") exit() features = RealFeatures(hstack((traindat, traindat))) labels = BinaryLabels(hstack((label_train, label_train))) n_vectors = features.get_num_vectors() task_one = Task(0, n_vectors // 2) task_two = Task(n_vectors // 2, n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group) mtlr.set_regularization(1) # use regularization ratio mtlr.set_tolerance(1e-2) # use 1e-2 tolerance mtlr.train() mtlr.set_current_task(0) out = mtlr.apply().get_labels() return out
def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from shogun import RegressionLabels, RealFeatures, Task, TaskGroup try: from shogun import MultitaskLeastSquaresRegression except ImportError: print("MultitaskLeastSquaresRegression not available") exit(0) features = RealFeatures(traindat) labels = RegressionLabels(label_train) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//2) task_two = Task(n_vectors//2,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group) mtlsr.set_regularization(1) # use regularization ratio mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance mtlsr.train() mtlsr.set_current_task(0) out = mtlsr.apply_regression().get_labels() return out
def transfer_multitask_clustered_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat ): from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG try: from shogun import MultitaskClusteredLogisticRegression except ImportError: print("MultitaskClusteredLogisticRegression not available") exit() features = RealFeatures(hstack((traindat, sin(traindat), cos(traindat)))) labels = BinaryLabels(hstack((label_train, label_train, label_train))) n_vectors = features.get_num_vectors() task_one = Task(0, n_vectors // 3) task_two = Task(n_vectors // 3, 2 * n_vectors // 3) task_three = Task(2 * n_vectors // 3, n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) task_group.append_task(task_three) mtlr = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels, task_group, 2) #mtlr.io.set_loglevel(MSG_DEBUG) mtlr.set_tolerance(1e-3) # use 1e-2 tolerance mtlr.set_max_iter(100) mtlr.train() mtlr.set_current_task(0) #print mtlr.get_w() out = mtlr.apply_regression().get_labels() return out
def transfer_multitask_trace_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from shogun import BinaryLabels, RealFeatures, Task, TaskGroup try: from shogun import MultitaskTraceLogisticRegression except ImportError: print("MultitaskTraceLogisticRegression not available") exit(0) features = RealFeatures(hstack((traindat,traindat))) labels = BinaryLabels(hstack((label_train,label_train))) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//2) task_two = Task(n_vectors//2,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) mtlr = MultitaskTraceLogisticRegression(0.1,features,labels,task_group) mtlr.set_tolerance(1e-2) # use 1e-2 tolerance mtlr.set_max_iter(10) mtlr.train() mtlr.set_current_task(0) out = mtlr.apply_regression().get_labels() return out
def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat): from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG try: from shogun import MultitaskClusteredLogisticRegression except ImportError: print("MultitaskClusteredLogisticRegression not available") exit() features = RealFeatures(hstack((traindat,sin(traindat),cos(traindat)))) labels = BinaryLabels(hstack((label_train,label_train,label_train))) n_vectors = features.get_num_vectors() task_one = Task(0,n_vectors//3) task_two = Task(n_vectors//3,2*n_vectors//3) task_three = Task(2*n_vectors//3,n_vectors) task_group = TaskGroup() task_group.append_task(task_one) task_group.append_task(task_two) task_group.append_task(task_three) mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2) #mtlr.io.set_loglevel(MSG_DEBUG) mtlr.set_tolerance(1e-3) # use 1e-2 tolerance mtlr.set_max_iter(100) mtlr.train() mtlr.set_current_task(0) #print mtlr.get_w() out = mtlr.apply_regression().get_labels() return out
def multiclass_c45classifiertree(train=traindat, test=testdat, labels=label_traindat, ft=feattypes): try: from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree from numpy import random, int32 except ImportError: print("Could not import Shogun and/or numpy modules") return # wrap features and labels into Shogun objects feats_train = RealFeatures(CSVFile(train)) feats_test = RealFeatures(CSVFile(test)) train_labels = MulticlassLabels(CSVFile(labels)) # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3 subset = int32(random.permutation(feats_train.get_num_vectors())) vsubset = subset[1:int(subset.size / 3)] trsubset = subset[1 + int(subset.size / 3):subset.size] # C4.5 Tree formation using training subset train_labels.add_subset(trsubset) feats_train.add_subset(trsubset) c = C45ClassifierTree() c.set_labels(train_labels) c.set_feature_types(ft) c.train(feats_train) train_labels.remove_subset() feats_train.remove_subset() # prune tree using validation subset train_labels.add_subset(vsubset) feats_train.add_subset(vsubset) c.prune_tree(feats_train, train_labels) train_labels.remove_subset() feats_train.remove_subset() # Classify test data output = c.apply_multiclass(feats_test).get_labels() output_certainty = c.get_certainty_vector() return c, output, output_certainty
def metric_lmnn_statistics( k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'): try: from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG import matplotlib.pyplot as pyplot except ImportError: print 'Error importing shogun or other required modules. Please, verify their installation.' return features = RealFeatures(load_compressed_features(fname_features).T) labels = MulticlassLabels(CSVFile(fname_labels)) # print 'number of examples = %d' % features.get_num_vectors() # print 'number of features = %d' % features.get_num_features() assert (features.get_num_vectors() == labels.get_num_labels()) # train LMNN lmnn = LMNN(features, labels, k) lmnn.set_correction(100) # lmnn.io.set_loglevel(MSG_DEBUG) print 'Training LMNN, this will take about two minutes...' lmnn.train() print 'Training done!' # plot objective obtained during training statistics = lmnn.get_statistics() pyplot.plot(statistics.obj.get()) pyplot.grid(True) pyplot.xlabel('Iterations') pyplot.ylabel('LMNN objective') pyplot.title( 'LMNN objective during training for the multiclass digits data set') pyplot.show()
def train(self, images, labels): """ Train eigenfaces """ print "Train...", #copy labels self._labels = labels #transform the numpe vector to shogun structure features = RealFeatures(images) #PCA self.pca = PCA() #set dimension self.pca.set_target_dim(self._num_components) #compute PCA self.pca.init(features) for sampleIdx in range(features.get_num_vectors()): v = features.get_feature_vector(sampleIdx) p = self.pca.apply_to_feature_vector(v) self._projections.insert(sampleIdx, p) print "ok!"
def hsic_graphical(): # parameters, change to get different results m=250 difference=3 # setting the angle lower makes a harder test angle=pi/30 # number of samples taken from null and alternative distribution num_null_samples=500 # use data generator class to produce example data data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) # create shogun feature representation features_x=RealFeatures(array([data[0]])) features_y=RealFeatures(array([data[1]])) # compute median data distance in order to use for Gaussian kernel width # 0.5*median_distance normally (factor two in Gaussian kernel) # However, shoguns kernel width is different to usual parametrization # Therefore 0.5*2*median_distance^2 # Use a subset of data for that, only 200 elements. Median is stable subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy subset=random.permutation(subset) # numpy permutation subset=subset[0:200] features_x.add_subset(subset) dist=EuclideanDistance(features_x, features_x) distances=dist.get_distance_matrix() features_x.remove_subset() median_distance=np.median(distances) sigma_x=median_distance**2 features_y.add_subset(subset) dist=EuclideanDistance(features_y, features_y) distances=dist.get_distance_matrix() features_y.remove_subset() median_distance=np.median(distances) sigma_y=median_distance**2 print "median distance for Gaussian kernel on x:", sigma_x print "median distance for Gaussian kernel on y:", sigma_y kernel_x=GaussianKernel(10,sigma_x) kernel_y=GaussianKernel(10,sigma_y) # create hsic instance. Note that this is a convienience constructor which copies # feature data. features_x and features_y are not these used in hsic. # This is only for user-friendlyness. Usually, its ok to do this. # Below, the alternative distribution is sampled, which means # that new feature objects have to be created in each iteration (slow) # However, normally, the alternative distribution is not sampled hsic=HSIC(kernel_x,kernel_y,features_x,features_y) # sample alternative distribution alt_samples=zeros(num_null_samples) for i in range(len(alt_samples)): data=DataGenerator.generate_sym_mix_gauss(m,difference,angle) features_x.set_feature_matrix(array([data[0]])) features_y.set_feature_matrix(array([data[1]])) # re-create hsic instance everytime since feature objects are copied due to # useage of convienience constructor hsic=HSIC(kernel_x,kernel_y,features_x,features_y) alt_samples[i]=hsic.compute_statistic() # sample from null distribution # permutation, biased statistic hsic.set_null_approximation_method(PERMUTATION) hsic.set_num_null_samples(num_null_samples) null_samples_boot=hsic.sample_null() # fit gamma distribution, biased statistic hsic.set_null_approximation_method(HSIC_GAMMA) gamma_params=hsic.fit_null_gamma() # sample gamma with parameters null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)]) # plot figure() # plot data x and y subplot(2,2,1) gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks grid(True) plot(data[0], data[1], 'o') title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m)) xlabel('$x$') ylabel('$y$') # compute threshold for test level alpha=0.05 null_samples_boot.sort() null_samples_gamma.sort() thresh_boot=null_samples_boot[floor(len(null_samples_boot)*(1-alpha))]; thresh_gamma=null_samples_gamma[floor(len(null_samples_gamma)*(1-alpha))]; type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples) type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples) # plot alternative distribution with threshold subplot(2,2,2) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(alt_samples, 20, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples) title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error)) # compute range for all null distribution histograms hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])] # plot null distribution with threshold subplot(2,2,3) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_boot, 20, range=hist_range, normed=True); axvline(thresh_boot, 0, 1, linewidth=2, color='red') title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot)) # plot null distribution gamma subplot(2,2,4) gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks grid(True) hist(null_samples_gamma, 20, range=hist_range, normed=True); axvline(thresh_gamma, 0, 1, linewidth=2, color='red') title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma)) grid(True) # pull plots a bit apart subplots_adjust(hspace=0.5) subplots_adjust(wspace=0.5)