def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat):
    """Cross-validate an MKL classifier while recording its kernel weights.

    The same dense features are registered three times in a
    ``CombinedFeatures`` object and paired with three Gaussian kernels of
    different widths.  An ``MKLClassification`` machine wrapping ``LibSVM``
    is evaluated with 5-fold stratified cross-validation (3 runs, accuracy
    criterion) and a ``CrossValidationMKLStorage`` output collects the
    MKL weights.

    Args:
        traindat: Data handed to ``RealFeatures``.
        label_traindat: Labels handed to ``BinaryLabels``.

    Returns:
        None.  The evaluation result and the stored weights are computed
        but only kept in local variables.
    """
    from modshogun import (
        ACCURACY,
        BinaryLabels,
        CombinedFeatures,
        CombinedKernel,
        ContingencyTableEvaluation,
        CrossValidation,
        CrossValidationMKLStorage,
        CrossValidationPrintOutput,
        CrossValidationResult,
        GaussianKernel,
        LibSVM,
        MKLClassification,
        RealFeatures,
        Statistics,
        StratifiedCrossValidationSplitting,
    )

    # training data: one feature object, registered once per sub-kernel
    dense_feats = RealFeatures(traindat)
    combined_feats = CombinedFeatures()
    for _ in range(3):
        combined_feats.append_feature_obj(dense_feats)
    binary_labels = BinaryLabels(label_traindat)

    # combined kernel made of Gaussians with different widths
    combined_kernel = CombinedKernel()
    for gauss_width in (0.1, 1, 2):
        combined_kernel.append_kernel(GaussianKernel(10, gauss_width))

    # MKL on top of LibSVM; interleaved optimization stays disabled because
    # of a memory bug (as noted by the original author)
    mkl_machine = MKLClassification(LibSVM())
    mkl_machine.set_interleaved_optimization_enabled(False)
    mkl_machine.set_kernel(combined_kernel)

    # stratified 5-fold splitting, accuracy as evaluation criterion
    splitter = StratifiedCrossValidationSplitting(binary_labels, 5)
    criterion = ContingencyTableEvaluation(ACCURACY)

    cv = CrossValidation(mkl_machine, combined_feats, binary_labels,
                         splitter, criterion)
    cv.set_autolock(False)

    # Output listeners: only the MKL weight storage is registered.  A
    # CrossValidationPrintOutput could be added the same way (left disabled).
    weight_store = CrossValidationMKLStorage()
    cv.add_cross_validation_output(weight_store)
    cv.set_num_runs(3)

    # run the evaluation, then fetch the recorded MKL weights
    result = cv.evaluate()
    weights = weight_store.get_mkl_weights()
def runShogunSVRWDKernel(train_xt, train_lt, test_xt, svm_c=1, svr_param=0.1):
    """Train an epsilon-SVR on a combined string kernel and serialize it.

    A ``WeightedDegreePositionStringKernel`` and a
    ``WeightedCommWordStringKernel`` are summed in a ``CombinedKernel``,
    a ``LibSVR`` (``LIBSVR_EPSILON_SVR``) is trained on the training
    features, predictions are made, and the trained machine is written to
    the file named by the module-level ``FNEPSILON``.

    Args:
        train_xt: Training sequences; ``len(train_xt[0])`` sets the length of
            the shift vector.
        train_lt: Training targets handed to ``RegressionLabels``.
        test_xt: Test sequences.
        svm_c: First ``LibSVR`` constructor argument.
        svr_param: Second ``LibSVR`` constructor argument.

    Returns:
        Tuple ``(out1_epsilon, out2_epsilon, kernel)``: the labels from
        ``apply()`` without arguments, the labels from ``apply(feats_test)``,
        and the combined kernel.
    """
    ##################################################
    # set up svr
    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    # weighted degree kernel with the same shift at every position
    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(SIZE, 5)
    shifts_vector = np.ones(max_len, dtype=np.int32) * NUMSHIFTS
    kernel_wdk.set_shifts(shifts_vector)

    # spectrum kernel
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(SIZE, use_sign)

    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)

    labels = RegressionLabels(train_lt)

    # epsilon-SVR model
    svr_epsilon = LibSVR(svm_c, svr_param, kernel, labels, LIBSVR_EPSILON_SVR)
    print("Ready to train!")
    svr_epsilon.train(feats_train)

    # predictions
    print("Making predictions!")
    kernel.init(feats_train, feats_test)
    # NOTE(review): the kernel was just initialised on (train, test), so the
    # argument-less apply() presumably scores the test side as well, not the
    # training data -- confirm which output callers expect.
    out1_epsilon = svr_epsilon.apply().get_labels()
    out2_epsilon = svr_epsilon.apply(feats_test).get_labels()

    ##################################################
    # serialize to file; the context manager closes the handle even when
    # save() raises
    with open(FNEPSILON, 'w+') as epsilon_file:
        svr_epsilon.save(epsilon_file)

    return out1_epsilon, out2_epsilon, kernel
def mkl_regression_modular(n=100, n_test=100,
                           x_range=6, x_range_test=10, noise_var=0.5, width=1, seed=1):
    """Fit an MKL regression to a noisy one-dimensional sine wave.

    Training inputs are drawn uniformly from ``[0, x_range)``, targets are
    ``sin(x)`` plus Gaussian noise, and an ``MKLRegression`` machine with an
    ``SVRLight`` constraint generator is trained on a combined kernel (two
    Gaussian kernels and one polynomial kernel).

    Args:
        n, n_test, x_range_test, noise_var: Accepted, but overwritten inside
            the body with 15, 100, 10 and 0.5 respectively.
        x_range: Upper bound of the training inputs.
        width: Not used by the body.
        seed: Seed passed to ``random.seed``.

    Returns:
        Tuple ``(out, svr_mkl, kernel)``: predicted labels on the test grid,
        the trained machine and the combined kernel.
    """
    from modshogun import RegressionLabels, RealFeatures
    from modshogun import GaussianKernel, PolyKernel, CombinedKernel
    from modshogun import MKLRegression, SVRLight

    # reproducible results
    random.seed(seed)

    # NOTE: these assignments shadow the arguments of the same name, so the
    # values passed by the caller are ignored.
    n = 15
    n_test = 100
    x_range_test = 10
    noise_var = 0.5

    # easy regression data: one dimensional noisy sine wave
    train_x = random.rand(1, n) * x_range
    grid_row = [float(i) / n_test * x_range_test for i in range(n_test)]
    test_x = array([grid_row])
    test_y = sin(test_x)  # noise-free targets on the grid; not used below
    train_y = sin(train_x) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(train_y[0])
    feats_train = RealFeatures(train_x)
    feats_test = RealFeatures(test_x)

    # combined kernel
    kernel = CombinedKernel()
    for sub_kernel in (GaussianKernel(10, 2), GaussianKernel(10, 3), PolyKernel(10, 2)):
        kernel.append_kernel(sub_kernel)
    kernel.init(feats_train, feats_train)

    # constraint generator and MKLRegression
    svr_mkl = MKLRegression(SVRLight())
    svr_mkl.set_kernel(kernel)
    svr_mkl.set_labels(labels)
    svr_mkl.train()

    # predictions: re-initialise the kernel on (train, test) before applying
    kernel.init(feats_train, feats_test)
    out = svr_mkl.apply().get_labels()
    return out, svr_mkl, kernel
def runShogunSVRWDKernel(train_xt, train_lt, test_xt, svm_c=1, svr_param=0.1):
    """Train an epsilon-SVR on a combined string kernel and serialize it.

    A ``WeightedDegreePositionStringKernel`` and a
    ``WeightedCommWordStringKernel`` are summed in a ``CombinedKernel``,
    a ``LibSVR`` (``LIBSVR_EPSILON_SVR``) is trained on the training
    features, predictions are made, and the trained machine is written to
    the file named by the module-level ``FNEPSILON``.

    Args:
        train_xt: Training sequences; ``len(train_xt[0])`` sets the length of
            the shift vector.
        train_lt: Training targets handed to ``RegressionLabels``.
        test_xt: Test sequences.
        svm_c: First ``LibSVR`` constructor argument.
        svr_param: Second ``LibSVR`` constructor argument.

    Returns:
        Tuple ``(out1_epsilon, out2_epsilon, kernel)``: the labels from
        ``apply()`` without arguments, the labels from ``apply(feats_test)``,
        and the combined kernel.
    """
    ##################################################
    # set up svr
    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    # weighted degree kernel with the same shift at every position
    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(SIZE, 5)
    shifts_vector = np.ones(max_len, dtype=np.int32) * NUMSHIFTS
    kernel_wdk.set_shifts(shifts_vector)

    # spectrum kernel
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(SIZE, use_sign)

    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)

    labels = RegressionLabels(train_lt)

    # epsilon-SVR model
    svr_epsilon = LibSVR(svm_c, svr_param, kernel, labels, LIBSVR_EPSILON_SVR)
    print("Ready to train!")
    svr_epsilon.train(feats_train)

    # predictions
    print("Making predictions!")
    kernel.init(feats_train, feats_test)
    # NOTE(review): the kernel was just initialised on (train, test), so the
    # argument-less apply() presumably scores the test side as well, not the
    # training data -- confirm which output callers expect.
    out1_epsilon = svr_epsilon.apply().get_labels()
    out2_epsilon = svr_epsilon.apply(feats_test).get_labels()

    ##################################################
    # serialize to file; the context manager closes the handle even when
    # save() raises
    with open(FNEPSILON, 'w+') as epsilon_file:
        svr_epsilon.save(epsilon_file)

    return out1_epsilon, out2_epsilon, kernel
def runShogunSVMDNACombinedSpectrumKernel(train_xt, train_lt, test_xt):
    """Run a LibSVM with a combined spectrum kernel.

    For every entry of the module-level ``KList`` the train and test
    sequences are converted to sorted word features and one
    ``CommWordStringKernel`` is appended to a ``CombinedKernel``.

    Args:
        train_xt: Training sequences for ``StringCharFeatures``.
        train_lt: Training labels for ``BinaryLabels``.
        test_xt: Test sequences for ``StringCharFeatures``.

    Returns:
        Tuple ``(out1, out2, out1DecisionValues, out2DecisionValues)``: label
        vectors for train and test data, followed by the objects returned by
        ``svm.apply`` that they were read from.
    """
    ##################################################
    # set up svm
    combined_kernel = CombinedKernel()
    train_feats = CombinedFeatures()
    test_feats = CombinedFeatures()

    # one spectrum kernel (plus matching feature objects) per K
    for k in KList:
        train_chars = StringCharFeatures(train_xt, DNA)
        train_words = StringWordFeatures(DNA)
        train_words.obtain_from_char(train_chars, k - 1, k, GAP, False)
        sorter = SortWordString()
        sorter.init(train_words)
        train_words.add_preprocessor(sorter)
        train_words.apply_preprocessor()
        train_feats.append_feature_obj(train_words)

        # the test words reuse the preprocessor initialised on the train words
        test_chars = StringCharFeatures(test_xt, DNA)
        test_words = StringWordFeatures(DNA)
        test_words.obtain_from_char(test_chars, k - 1, k, GAP, False)
        test_words.add_preprocessor(sorter)
        test_words.apply_preprocessor()
        test_feats.append_feature_obj(test_words)

        combined_kernel.append_kernel(CommWordStringKernel(10, False))

    combined_kernel.io.set_loglevel(MSG_DEBUG)

    train_labels = BinaryLabels(train_lt)

    # run svm model
    print("Ready to train!")
    combined_kernel.init(train_feats, train_feats)
    classifier = LibSVM(SVMC, combined_kernel, train_labels)
    classifier.io.set_loglevel(MSG_DEBUG)
    classifier.train()

    # predictions
    print("Making predictions!")
    train_output = classifier.apply(train_feats)
    train_predictions = train_output.get_labels()
    combined_kernel.init(train_feats, test_feats)
    test_output = classifier.apply(test_feats)
    test_predictions = test_output.get_labels()

    return train_predictions, test_predictions, train_output, test_output
def mkl_regression_modular(n=100, n_test=100,
                           x_range=6, x_range_test=10, noise_var=0.5, width=1, seed=1):
    """Fit an MKL regression to a noisy one-dimensional sine wave.

    Training inputs are drawn uniformly from ``[0, x_range)``, targets are
    ``sin(x)`` plus Gaussian noise, and an ``MKLRegression`` machine with an
    ``SVRLight`` constraint generator is trained on a combined kernel (two
    Gaussian kernels and one polynomial kernel).

    Args:
        n, n_test, x_range_test, noise_var: Accepted, but overwritten inside
            the body with 15, 100, 10 and 0.5 respectively.
        x_range: Upper bound of the training inputs.
        width: Not used by the body.
        seed: Seed passed to ``random.seed``.

    Returns:
        Tuple ``(out, svr_mkl, kernel)``: predicted labels on the test grid,
        the trained machine and the combined kernel.
    """
    from modshogun import RegressionLabels, RealFeatures
    from modshogun import GaussianKernel, PolyKernel, CombinedKernel
    from modshogun import MKLRegression, SVRLight

    # reproducible results
    random.seed(seed)

    # NOTE: these assignments shadow the arguments of the same name, so the
    # values passed by the caller are ignored.
    n = 15
    n_test = 100
    x_range_test = 10
    noise_var = 0.5

    # easy regression data: one dimensional noisy sine wave
    train_x = random.rand(1, n) * x_range
    grid_row = [float(i) / n_test * x_range_test for i in range(n_test)]
    test_x = array([grid_row])
    test_y = sin(test_x)  # noise-free targets on the grid; not used below
    train_y = sin(train_x) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(train_y[0])
    feats_train = RealFeatures(train_x)
    feats_test = RealFeatures(test_x)

    # combined kernel
    kernel = CombinedKernel()
    for sub_kernel in (GaussianKernel(10, 2), GaussianKernel(10, 3), PolyKernel(10, 2)):
        kernel.append_kernel(sub_kernel)
    kernel.init(feats_train, feats_train)

    # constraint generator and MKLRegression
    svr_mkl = MKLRegression(SVRLight())
    svr_mkl.set_kernel(kernel)
    svr_mkl.set_labels(labels)
    svr_mkl.train()

    # predictions: re-initialise the kernel on (train, test) before applying
    kernel.init(feats_train, feats_test)
    out = svr_mkl.apply().get_labels()
    return out, svr_mkl, kernel
def evaluation_cross_validation_mkl_weight_storage(
        traindat=traindat, label_traindat=label_traindat):
    """Cross-validate an MKL classifier while recording its kernel weights.

    The same dense features are registered three times in a
    ``CombinedFeatures`` object and paired with three Gaussian kernels of
    different widths.  An ``MKLClassification`` machine wrapping ``LibSVM``
    is evaluated with 5-fold stratified cross-validation (3 runs, accuracy
    criterion) and a ``CrossValidationMKLStorage`` output collects the
    MKL weights.

    Args:
        traindat: Data handed to ``RealFeatures``.
        label_traindat: Labels handed to ``BinaryLabels``.

    Returns:
        None.  The evaluation result and the stored weights are computed
        but only kept in local variables.
    """
    from modshogun import (
        ACCURACY,
        BinaryLabels,
        CombinedFeatures,
        CombinedKernel,
        ContingencyTableEvaluation,
        CrossValidation,
        CrossValidationMKLStorage,
        CrossValidationPrintOutput,
        CrossValidationResult,
        GaussianKernel,
        LibSVM,
        MKLClassification,
        RealFeatures,
        StratifiedCrossValidationSplitting,
    )

    # training data: one feature object, registered once per sub-kernel
    dense_feats = RealFeatures(traindat)
    combined_feats = CombinedFeatures()
    for _ in range(3):
        combined_feats.append_feature_obj(dense_feats)
    binary_labels = BinaryLabels(label_traindat)

    # combined kernel made of Gaussians with different widths
    combined_kernel = CombinedKernel()
    for gauss_width in (0.1, 1, 2):
        combined_kernel.append_kernel(GaussianKernel(10, gauss_width))

    # MKL on top of LibSVM; interleaved optimization stays disabled because
    # of a memory bug (as noted by the original author)
    mkl_machine = MKLClassification(LibSVM())
    mkl_machine.set_interleaved_optimization_enabled(False)
    mkl_machine.set_kernel(combined_kernel)

    # stratified 5-fold splitting, accuracy as evaluation criterion
    splitter = StratifiedCrossValidationSplitting(binary_labels, 5)
    criterion = ContingencyTableEvaluation(ACCURACY)

    cv = CrossValidation(mkl_machine, combined_feats, binary_labels,
                         splitter, criterion)
    cv.set_autolock(False)

    # Output listeners: only the MKL weight storage is registered.  A
    # CrossValidationPrintOutput could be added the same way (left disabled).
    weight_store = CrossValidationMKLStorage()
    cv.add_cross_validation_output(weight_store)
    cv.set_num_runs(3)

    # run the evaluation, then fetch the recorded MKL weights
    result = cv.evaluate()
    weights = weight_store.get_mkl_weights()
def make_combined_kernel(feats_train, raw_train, use_sign=True, minseq=3, maxseq=8):
    """Build and initialise a ``CombinedKernel`` of ``CommUlongStringKernel``s.

    One sub-kernel is created for each value in ``range(minseq, maxseq + 1)``.
    The sub-kernels are constructed from consecutive entries of ``raw_train``
    starting at index 0, so ``raw_train`` must hold at least
    ``maxseq - minseq + 1`` items.

    Args:
        feats_train: Features used on both sides of ``kernel.init``.
        raw_train: Indexable collection with one entry per sub-kernel.
        use_sign: Passed through to every ``CommUlongStringKernel``.
        minseq: First value of the range that fixes the sub-kernel count.
        maxseq: Last value (inclusive) of that range.

    Returns:
        The initialised ``CombinedKernel``.
    """
    from modshogun import CombinedKernel
    from modshogun import CommUlongStringKernel

    combined = CombinedKernel()

    # the range only determines how many sub-kernels are created
    for position in range(maxseq - minseq + 1):
        raw = raw_train[position]
        combined.append_kernel(CommUlongStringKernel(raw, raw, use_sign))

    combined.init(feats_train, feats_train)
    # The kernel matrix is requested exactly as before; its value is not
    # used by this function.
    combined.get_kernel_matrix()
    return combined
def predict_new_data(graph_file, cons_file, tri_file, other_feature_file):
    """Predict labels for new data with the stored MKL model.

    Each of the four feature files is read with ``read_feature_data`` and
    normalised with ``get_normalized_given_max_min``.  For every feature
    group a Gaussian sub-kernel is added to a ``CombinedKernel``; the test
    side uses the freshly read features, the train side is loaded from the
    serialized ``models/*.dat`` files.  The trained ``MKLClassification``
    is loaded from ``models/mkl.dat`` (downloaded first when missing) and
    applied.

    Args:
        graph_file: Feature file paired with ``models/graph.dat``.
        cons_file: Feature file paired with ``models/cons.dat``.
        tri_file: Feature file paired with ``models/tri.dat``.
        other_feature_file: Feature file paired with ``models/alu.dat``.

    Returns:
        The labels from ``new_mkl.apply().get_labels()``.
    """
    print('reading extracted features')
    # (feature file, normalisation file, serialized training features), in
    # the order the sub-kernels are appended.
    # NOTE(review): 'grtaph' looks like a typo but is kept because it may
    # match the file name on disk.
    sources = (
        (graph_file, 'models/grtaph_max_size', "models/graph.dat"),
        (cons_file, 'models/cons_max_size', "models/cons.dat"),
        (tri_file, 'models/tri_max_size', "models/tri.dat"),
        (other_feature_file, 'models/alu_max_size', "models/alu.dat"),
    )
    normalized = [
        get_normalized_given_max_min(read_feature_data(feature_path), bounds_path)
        for feature_path, bounds_path, _ in sources
    ]

    width = 0.5  # width shared by all Gaussian sub-kernels

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    for feature_rows, (_, _, train_dump) in zip(normalized, sources):
        subkfeats_train = RealFeatures()
        # the rows read from file are transposed before being wrapped
        subkfeats_test = RealFeatures(np.transpose(np.array(feature_rows)))
        subkernel = GaussianKernel(10, width)
        feats_test.append_feature_obj(subkfeats_test)

        # training features come from the serialized dump
        fstream = SerializableAsciiFile(train_dump, "r")
        subkfeats_train.load_serializable(fstream)
        feats_train.append_feature_obj(subkfeats_train)
        kernel.append_kernel(subkernel)

    model_file = "models/mkl.dat"
    if not os.path.exists(model_file):
        print('downloading model file')
        # NOTE(review): the model is fetched over plain HTTP without any
        # integrity check and then deserialized -- consider HTTPS/a checksum.
        url_add = 'http://rth.dk/resources/mirnasponge/data/mkl.dat'
        urllib.urlretrieve(url_add, model_file)

    print('loading trained model')
    fstream = SerializableAsciiFile(model_file, "r")
    new_mkl = MKLClassification()
    new_mkl.load_serializable(fstream)

    print('model predicting')
    kernel.init(feats_train, feats_test)
    new_mkl.set_kernel(kernel)
    y_out = new_mkl.apply().get_labels()
    return y_out
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """Serialize an SVM with string kernels and check the restored copy.

    Random toy data is generated, an ``SVMLight`` is trained on a combined
    kernel (one weighted degree position kernel and two weighted spectrum
    kernels), saved to ``serialized_svm.bz2``, loaded again, and the
    predictions of both machines on the test data are compared.

    Args:
        n_data: Passed to ``generate_random_data`` for train and test data.
        num_shifts: Shift used at every position of the weighted degree kernel.
        size: First constructor argument of all three kernels.

    Returns:
        Tuple ``(out, out2)``: test predictions of the original and of the
        restored machine.

    Raises:
        AssertionError: If the two prediction vectors differ in length or
            any pair of outputs differs by 0.000001 or more.
    """
    ##################################################
    # set up toy data and svm
    train_xt, train_lt = generate_random_data(n_data)
    # the test labels are generated but not needed
    test_xt, _ = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32) * num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    # spectrum kernels
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    labels = BinaryLabels(train_lt)
    svm = SVMLight(1.0, kernel, labels)
    svm.train(feats_train)

    ##################################################
    # serialize to file
    fn = "serialized_svm.bz2"
    save(fn, svm)

    ##################################################
    # unserialize and sanity check
    svm2 = load(fn)

    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # Outputs must agree element-wise.  The closing parenthesis of abs()
    # used to sit after the comparison, so abs() was applied to a boolean
    # and arbitrarily large negative differences passed the check.
    assert len(out) == len(out2)
    for original_value, restored_value in zip(out, out2):
        assert abs(original_value - restored_value) < 0.000001

    return out, out2
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    """Train a multiclass MKL machine and predict the test labels.

    Three sub-kernels (Gaussian, linear, polynomial) are combined, each
    paired with its own ``RealFeatures`` wrapper around the same train and
    test matrices.

    Args:
        fm_train_real: Training data for ``RealFeatures``.
        fm_test_real: Test data for ``RealFeatures``.
        label_train_multiclass: Training labels for ``MulticlassLabels``.
        width: Second argument of the ``GaussianKernel``.
        C: First argument of ``MKLMulticlass``.
        epsilon: Value for ``set_epsilon``.
        num_threads: Value for ``parallel.set_num_threads``.
        mkl_epsilon: Value for ``set_mkl_epsilon``.
        mkl_norm: Value for ``set_mkl_norm``.

    Returns:
        The labels from ``mkl.apply().get_labels()`` after the kernel has
        been re-initialised on (train, test).
    """
    from modshogun import CombinedFeatures, RealFeatures, MulticlassLabels
    from modshogun import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from modshogun import MKLMulticlass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # Factories instead of instances, so each sub-kernel is still created
    # after the feature objects of its group.
    kernel_factories = (
        lambda: GaussianKernel(10, width),
        LinearKernel,
        lambda: PolyKernel(10, 2),
    )
    for build_kernel in kernel_factories:
        train_part = RealFeatures(fm_train_real)
        test_part = RealFeatures(fm_test_real)
        part_kernel = build_kernel()
        feats_train.append_feature_obj(train_part)
        feats_test.append_feature_obj(test_part)
        kernel.append_kernel(part_kernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)
    mkl.train()

    # switch the kernel to (train, test) before predicting
    kernel.init(feats_train, feats_test)
    predictions = mkl.apply().get_labels()
    return predictions
def quadratic_time_mmd_graphical():
    """Plot a quadratic-time MMD two-sample test on mean-shifted data.

    Samples are streamed from two ``MeanShiftDataGenerator`` objects (shift 0
    and shift ``difference``), a Gaussian kernel is selected with
    ``MMDKernelSelectionMax``, the MMD statistic is sampled under the
    alternative, and the null distribution is approximated by bootstrapping,
    by the spectrum method (only when ``QuadraticTimeMMD`` provides
    ``sample_null_spectrum``) and by a gamma fit.  The data, the histograms
    and the test thresholds are drawn into a 2x3 subplot grid of a new
    figure.

    Returns:
        None.  The function only draws into the matplotlib figure.
    """
    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # test level used for all thresholds
    alpha = 0.05

    # streaming data generators for the mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    def draw_joint_sample():
        """Stream m examples from p and merge m examples from q into them."""
        merged = gen_p.get_streamed_features(m)
        return merged.create_merged_copy(gen_q.get_streamed_features(m))

    def reduce_ticks(nbins):
        """Limit the number of ticks on both axes of the current subplot."""
        axes = gca()
        axes.xaxis.set_major_locator(MaxNLocator(nbins=nbins))
        axes.yaxis.set_major_locator(MaxNLocator(nbins=nbins))

    def threshold_of(sorted_samples):
        """Return the (1 - alpha) quantile of already sorted samples."""
        # floor() yields a float, which is not a valid index, hence int()
        return sorted_samples[int(floor(len(sorted_samples) * (1 - alpha)))]

    def type_one_error_of(samples, threshold):
        """Fraction of null samples above the threshold (false rejections)."""
        return sum(samples > threshold) / float(num_null_samples)

    # joint sample of p and q to compute the MMD on
    features = draw_joint_sample()

    # candidate Gaussian kernels: widths 2*sigma^2 for sigma = 2^-3 ... 2^9
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print("kernel widths: %s" % (widths,))
    combined = CombinedKernel()
    for kernel_width in widths:
        combined.append_kernel(GaussianKernel(10, kernel_width))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection (can be replaced by the other methods for selecting
    # single kernels)
    selection = MMDKernelSelectionMax(mmd)
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width: %s" % (kernel.get_width(),))

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        features = draw_joint_sample()
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # null distribution: bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # null distribution: spectrum, biased statistic.  The method is optional,
    # so everything that depends on it is skipped when it is missing instead
    # of failing later with a NameError.
    null_samples_spectrum = None
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(num_null_samples, m - 10)

    # null distribution: fit gamma distribution, then sample from it
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # to plot data, sample a few examples from stream first
    features = draw_joint_sample()
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q; columns 0..m-1 come from p, columns m..2m-1
    # from q (the second slice used to start at m+1 and dropped one sample)
    subplot(2, 3, 1)
    grid(True)
    reduce_ticks(4)
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$y$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of the first data dimension of p (red) and q (blue) with the
    # matching pdfs; the second histogram used to show the second dimension
    # of the merged sample, which does not correspond to the plotted pdfs
    subplot(2, 3, 2)
    grid(True)
    reduce_ticks(3)
    hist(data[0][0:m], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[0][m:2 * m], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # available null distributions: (subplot position, heading, samples)
    null_dists = [(3, 'Bootstrapped Null Dist.', null_samples_boot)]
    if null_samples_spectrum is not None:
        null_dists.append((5, 'Null Dist. Spectrum', null_samples_spectrum))
    null_dists.append((6, 'Null Dist. Gamma', null_samples_gamma))

    # thresholds are read from the sorted samples
    for _, _, samples in null_dists:
        samples.sort()
    thresh_boot = threshold_of(null_samples_boot)

    # plot alternative distribution with the bootstrap threshold
    subplot(2, 3, 4)
    grid(True)
    reduce_ticks(3)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # common range for all null distribution histograms
    hist_range = [min([min(samples) for _, _, samples in null_dists]),
                  max([max(samples) for _, _, samples in null_dists])]

    # plot every null distribution with its own threshold; each type I error
    # is measured against that same threshold (all three used thresh_boot
    # and counted the samples below it before)
    for position, heading, samples in null_dists:
        threshold = threshold_of(samples)
        subplot(2, 3, position)
        grid(True)
        reduce_ticks(3)
        hist(samples, 20, range=hist_range, normed=True)
        axvline(threshold, 0, 1, linewidth=2, color='red')
        title(heading + '\nType I error is '
              + str(type_one_error_of(samples, threshold)))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def quadratic_time_mmd_graphical():
    """Plot a quadratic-time MMD two-sample test on mean-shifted data.

    Samples are streamed from two ``MeanShiftDataGenerator`` objects (shift 0
    and shift ``difference``), a Gaussian kernel is selected with
    ``MMDKernelSelectionMax``, the MMD statistic is sampled under the
    alternative, and the null distribution is approximated by bootstrapping,
    by the spectrum method (only when ``QuadraticTimeMMD`` provides
    ``sample_null_spectrum``) and by a gamma fit.  The data, the histograms
    and the test thresholds are drawn into a 2x3 subplot grid of a new
    figure.

    Returns:
        None.  The function only draws into the matplotlib figure.
    """
    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # test level used for all thresholds
    alpha = 0.05

    # streaming data generators for the mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    def draw_joint_sample():
        """Stream m examples from p and merge m examples from q into them."""
        merged = gen_p.get_streamed_features(m)
        return merged.create_merged_copy(gen_q.get_streamed_features(m))

    def reduce_ticks(nbins):
        """Limit the number of ticks on both axes of the current subplot."""
        axes = gca()
        axes.xaxis.set_major_locator(MaxNLocator(nbins=nbins))
        axes.yaxis.set_major_locator(MaxNLocator(nbins=nbins))

    def threshold_of(sorted_samples):
        """Return the (1 - alpha) quantile of already sorted samples."""
        # floor() yields a float, which is not a valid index, hence int()
        return sorted_samples[int(floor(len(sorted_samples) * (1 - alpha)))]

    def type_one_error_of(samples, threshold):
        """Fraction of null samples above the threshold (false rejections)."""
        return sum(samples > threshold) / float(num_null_samples)

    # joint sample of p and q to compute the MMD on
    features = draw_joint_sample()

    # candidate Gaussian kernels: widths 2*sigma^2 for sigma = 2^-3 ... 2^9
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print("kernel widths: %s" % (widths,))
    combined = CombinedKernel()
    for kernel_width in widths:
        combined.append_kernel(GaussianKernel(10, kernel_width))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection (can be replaced by the other methods for selecting
    # single kernels)
    selection = MMDKernelSelectionMax(mmd)
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width: %s" % (kernel.get_width(),))

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        features = draw_joint_sample()
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # null distribution: bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # null distribution: spectrum, biased statistic.  The method is optional,
    # so everything that depends on it is skipped when it is missing instead
    # of failing later with a NameError.
    null_samples_spectrum = None
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(num_null_samples, m - 10)

    # null distribution: fit gamma distribution, then sample from it
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # to plot data, sample a few examples from stream first
    features = draw_joint_sample()
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q; columns 0..m-1 come from p, columns m..2m-1
    # from q (the second slice used to start at m+1 and dropped one sample)
    subplot(2, 3, 1)
    grid(True)
    reduce_ticks(4)
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$y$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of the first data dimension of p (red) and q (blue) with the
    # matching pdfs; the second histogram used to show the second dimension
    # of the merged sample, which does not correspond to the plotted pdfs
    subplot(2, 3, 2)
    grid(True)
    reduce_ticks(3)
    hist(data[0][0:m], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[0][m:2 * m], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # available null distributions: (subplot position, heading, samples)
    null_dists = [(3, 'Bootstrapped Null Dist.', null_samples_boot)]
    if null_samples_spectrum is not None:
        null_dists.append((5, 'Null Dist. Spectrum', null_samples_spectrum))
    null_dists.append((6, 'Null Dist. Gamma', null_samples_gamma))

    # thresholds are read from the sorted samples
    for _, _, samples in null_dists:
        samples.sort()
    thresh_boot = threshold_of(null_samples_boot)

    # plot alternative distribution with the bootstrap threshold
    subplot(2, 3, 4)
    grid(True)
    reduce_ticks(3)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # common range for all null distribution histograms
    hist_range = [min([min(samples) for _, _, samples in null_dists]),
                  max([max(samples) for _, _, samples in null_dists])]

    # plot every null distribution with its own threshold; each type I error
    # is measured against that same threshold (all three used thresh_boot
    # and counted the samples below it before)
    for position, heading, samples in null_dists:
        threshold = threshold_of(samples)
        subplot(2, 3, position)
        grid(True)
        reduce_ticks(3)
        hist(samples, 20, range=hist_range, normed=True)
        axvline(threshold, 0, 1, linewidth=2, color='red')
        title(heading + '\nType I error is '
              + str(type_one_error_of(samples, threshold)))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def runShogunSVMMultipleKernels(train_xt, train_lt, test_xt):
    """
    Train an MKL classifier on several spectrum kernels and predict labels.

    One spectrum (``CommWordStringKernel``) sub-kernel is added for every
    order in ``range(K1, K2, -1)``; the combined kernel is then trained
    with ``MKLClassification``.

    Relies on module-level names: ``np``, the Shogun classes used below and
    the constants ``K1``, ``K2``, ``GAP`` and ``SVMC`` (plus ``SHIFT`` for
    the disabled weighted-degree variant).

    :param train_xt: training sequences (wrapped as DNA string features)
    :param train_lt: training labels aligned with ``train_xt``
    :param test_xt: test sequences (wrapped as DNA string features)
    :return: ``(out1, out2, train_lt)`` -- predicted labels for the
        resampled training set, predicted labels for the test set, and the
        resampled training labels
    """
    ##################################################
    # NOTE(review): the original comment said "take all examples", but this
    # draws 14000 indices *with replacement* from [1, 14000): index 0 is never
    # selected and inputs with fewer than 14000 rows can raise IndexError.
    # Left unchanged because results depend on it -- confirm the intent.
    idxs = np.random.randint(1, 14000, 14000)
    train_xt = np.array(train_xt)[idxs]
    train_lt = np.array(train_lt)[idxs]

    # Initialize kernel and features
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()
    labels = BinaryLabels(train_lt)

    ##################### Multiple Spectrum Kernels #########################
    for order in range(K1, K2, -1):
        # training data: char features -> sorted word features
        charfeat_train = StringCharFeatures(list(train_xt), DNA)
        wordfeat_train = StringWordFeatures(DNA)
        wordfeat_train.obtain_from_char(charfeat_train, order - 1, order, GAP, False)
        preproc = SortWordString()
        preproc.init(wordfeat_train)
        wordfeat_train.add_preprocessor(preproc)
        wordfeat_train.apply_preprocessor()

        # testing data: same conversion, reusing the preprocessor that was
        # initialised on the training features
        charfeat_test = StringCharFeatures(test_xt, DNA)
        wordfeat_test = StringWordFeatures(DNA)
        wordfeat_test.obtain_from_char(charfeat_test, order - 1, order, GAP, False)
        wordfeat_test.add_preprocessor(preproc)
        wordfeat_test.apply_preprocessor()

        # Fix: append the word features built above. The previous code
        # appended the raw char features, leaving the converted/sorted word
        # features unused although the sub-kernel is a word-string kernel.
        feats_train.append_feature_obj(wordfeat_train)
        feats_test.append_feature_obj(wordfeat_test)

        # append spectrum kernel
        # NOTE(review): the second constructor argument receives the order;
        # verify against the CommWordStringKernel signature that this is the
        # intended parameter.
        spectrum_kernel = CommWordStringKernel(10, order)
        spectrum_kernel.io.set_loglevel(MSG_DEBUG)
        kernel.append_kernel(spectrum_kernel)

    '''
    Uncomment this for Multiple Weighted degree kernels and comment
    the multiple spectrum kernel block above instead

    ##################### Multiple Weighted Degree Kernel #########################
    for i in range(K1,K2,-1):
        # append training data to combined feature object
        charfeat_train = StringCharFeatures(list(train_xt), DNA)

        # append testing data to combined feature object
        charfeat_test = StringCharFeatures(test_xt, DNA)

        # append features
        feats_train.append_feature_obj(charfeat_train);
        feats_test.append_feature_obj(charfeat_test);

        # setup weighted degree kernel
        kernel1=WeightedDegreePositionStringKernel(10,i);
        kernel1.io.set_loglevel(MSG_DEBUG);
        kernel1.set_shifts(SHIFT*np.ones(len(train_xt[0]), dtype=np.int32))
        kernel1.set_position_weights(np.ones(len(train_xt[0]), dtype=np.float64));
        kernel.append_kernel(kernel1);
    '''

    ##################### Training #########################
    print("Starting MKL training..")
    mkl = MKLClassification()
    mkl.set_mkl_norm(3)  # 1,2,3
    mkl.set_C(SVMC, SVMC)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)
    mkl.train(feats_train)

    print("Making predictions!")
    out1 = mkl.apply(feats_train).get_labels()
    out2 = mkl.apply(feats_test).get_labels()

    return out1, out2, train_lt
def predict_new_data(graph_file, cons_file, tri_file, other_feature_file):
    """
    Predict labels for newly extracted features with a stored MKL model.

    Each of the four feature files is read with ``read_feature_data`` and
    normalised with ``get_normalized_given_max_min``. For every feature
    group the stored training features are loaded from ``models/*.dat`` and
    paired with a Gaussian sub-kernel. The serialized ``MKLClassification``
    model in ``models/mkl.dat`` is downloaded first if the file is missing.

    :param graph_file: path to the graph feature file
    :param cons_file: path to the conservation ("cons") feature file
    :param tri_file: path to the "tri" feature file
    :param other_feature_file: path to the remaining ("alu") feature file
    :return: the labels returned by the loaded model's ``apply()``
    """
    print("reading extracted features")
    graph_feature = get_normalized_given_max_min(
        read_feature_data(graph_file), "models/grtaph_max_size")
    cons_feature = get_normalized_given_max_min(
        read_feature_data(cons_file), "models/cons_max_size")
    CC_feature = get_normalized_given_max_min(
        read_feature_data(tri_file), "models/tri_max_size")
    ATOS_feature = get_normalized_given_max_min(
        read_feature_data(other_feature_file), "models/alu_max_size")

    # only ``width`` is used in this function; the rest are kept for reference
    width, C, epsilon, num_threads, mkl_epsilon, mkl_norm = 0.5, 1.2, 1e-5, 1, 0.001, 3.5

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # (new test data, serialized training features) per feature group; the
    # order fixes the order of the sub-kernels in the combined kernel
    feature_groups = (
        (graph_feature, "models/graph.dat"),
        (cons_feature, "models/cons.dat"),
        (CC_feature, "models/tri.dat"),
        (ATOS_feature, "models/alu.dat"),
    )
    for new_data, stored_train_path in feature_groups:
        stored_train = RealFeatures()
        new_test = RealFeatures(np.transpose(np.array(new_data)))
        gaussian = GaussianKernel(10, width)
        feats_test.append_feature_obj(new_test)

        # training features are restored from disk rather than recomputed;
        # the load status is not checked (as before)
        stored_train.load_serializable(SerializableAsciiFile(stored_train_path, "r"))
        feats_train.append_feature_obj(stored_train)
        kernel.append_kernel(gaussian)

    model_file = "models/mkl.dat"
    if not os.path.exists(model_file):
        print("downloading model file")
        url_add = "http://rth.dk/resources/mirnasponge/data/mkl.dat"
        urllib.urlretrieve(url_add, model_file)

    print("loading trained model")
    new_mkl = MKLClassification()
    new_mkl.load_serializable(SerializableAsciiFile("models/mkl.dat", "r"))

    print("model predicting")
    kernel.init(feats_train, feats_test)
    new_mkl.set_kernel(kernel)
    return new_mkl.apply().get_labels()
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    """
    Train and apply a binary MKL classifier on a custom + polynomial kernel.

    A degree-3 polynomial kernel is evaluated up front on (train, train) and
    (train, test); the resulting matrices are wrapped in ``CustomKernel``
    objects and combined with a degree-2 ``PolyKernel``.

    :param fm_train_real: real-valued training feature matrix
    :param fm_test_real: real-valued test feature matrix
    :param fm_label_twoclass: binary training labels
    :return: ``(predictions, kernel)`` -- the result of ``mkl.apply()`` on
        the test kernel, and that combined test kernel
    """
    ##################################
    # precompute the degree-3 polynomial kernel matrices
    train_raw = RealFeatures(fm_train_real)
    cubic = PolyKernel(10, 3)
    cubic.init(train_raw, train_raw)
    K_train = cubic.get_kernel_matrix()

    test_raw = RealFeatures(fm_test_real)
    cubic.init(train_raw, test_raw)
    K_test = cubic.get_kernel_matrix()

    ##################################
    # set up and train
    # only one feature object is appended although two sub-kernels follow
    # (presumably the custom kernel needs no feature object of its own)
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()
    mkl.set_mkl_norm(1)  # which norm to use for MKL: 1, 2 or 3
    mkl.set_C(1, 1)      # cost (neg, pos)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)
    mkl.train()
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)

    ##################################
    # test: rebuild the combined kernel on (train, test)
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # classify; the result of this first call is discarded, as before
    mkl.set_kernel(kernel)
    mkl.apply()
    return mkl.apply(), kernel
def kernel_combined_custom_poly_modular(train_fname=traindat, test_fname=testdat, train_label_fname=label_traindat):
    """
    Train a LibSVM on a combined custom + polynomial kernel and apply it.

    A degree-3 polynomial kernel matrix is precomputed and wrapped in a
    ``CustomKernel``; it is combined with a degree-2 ``PolyKernel`` that
    works on features read from the same CSV file.

    :param train_fname: CSV file with the training features
    :param test_fname: CSV file with the test features
    :param train_label_fname: CSV file with the binary training labels
    :return: ``(km_train, kernel)`` -- the matrix of the combined kernel
        after it was initialised on (train, test), and that kernel. Despite
        its name, ``km_train`` is taken after the test initialisation.
    """
    from modshogun import CombinedFeatures, RealFeatures, BinaryLabels
    from modshogun import CombinedKernel, PolyKernel, CustomKernel
    from modshogun import LibSVM, CSVFile

    # ---- training ---------------------------------------------------------
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    train_raw = RealFeatures(CSVFile(train_fname))
    cubic = PolyKernel(10, 3)
    cubic.init(train_raw, train_raw)
    kernel.append_kernel(CustomKernel(cubic.get_kernel_matrix()))

    feats_train.append_feature_obj(RealFeatures(CSVFile(train_fname)))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    labels = BinaryLabels(CSVFile(train_label_fname))
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    # ---- prediction: fresh combined kernel on (train, test) ---------------
    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    test_raw = RealFeatures(CSVFile(test_fname))
    cubic = PolyKernel(10, 3)
    cubic.init(train_raw, test_raw)
    kernel.append_kernel(CustomKernel(cubic.get_kernel_matrix()))

    feats_pred.append_feature_obj(RealFeatures(CSVFile(test_fname)))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.apply()  # predictions are computed but not returned
    km_train = kernel.get_kernel_matrix()
    return km_train, kernel
def evaluation_cross_validation_multiclass_storage(traindat=traindat, label_traindat=label_traindat):
    """
    Cross-validate a multiclass MKL machine and collect per-fold results.

    Three Gaussian kernels (widths 0.1, 1 and 2) are combined over the same
    feature object, and an ``MKLMulticlass`` machine is evaluated with
    3-fold stratified cross-validation repeated over 3 runs. A
    ``CrossValidationMulticlassStorage`` with an ``F1Measure`` binary
    evaluation records the per-fold output.

    :param traindat: real-valued training feature matrix
    :param label_traindat: multiclass training labels
    :return: ``(roc_0_0_0, auc_0_0_0)`` -- ``get_fold_ROC(0, 0, 0)`` and
        ``get_fold_evaluation_result(0, 0, 0, 0)`` of the storage. The
        second value presumably belongs to the ``F1Measure`` appended here
        rather than an AUC, despite the variable name.
    """
    from modshogun import CrossValidation, CrossValidationResult
    from modshogun import CrossValidationPrintOutput
    from modshogun import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from modshogun import MulticlassAccuracy, F1Measure
    from modshogun import StratifiedCrossValidationSplitting
    from modshogun import MulticlassLabels
    from modshogun import RealFeatures, CombinedFeatures
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import MKLMulticlass
    from modshogun import Statistics, MSG_DEBUG, Math

    # fixed seed for reproducible results
    Math.init_random(1)

    # training data: the same feature object, once per sub-kernel
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    for _ in range(3):
        comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel: Gaussians of different widths combined
    kernel = CombinedKernel()
    for width in (0.1, 1, 2):
        kernel.append_kernel(GaussianKernel(10, width))

    # multiclass MKL machine (the kernel is passed in and then set again)
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # stratified splitting into 3 folds, scored by multiclass accuracy
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 3)
    evaluation_criterium = MulticlassAccuracy()

    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage = CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation; results are read from the storage object
    result = cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0, 0, 0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0, 0, 0, 0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
def kernel_combined_modular(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """
    Build a combined kernel over real-valued and DNA string features.

    Three sub-kernels are stacked, each with its own train/test feature
    object: a Gaussian kernel on the real features, and a fixed-degree
    string kernel plus a local-alignment string kernel on the DNA strings.

    :param fm_train_real: real-valued training feature matrix
    :param fm_test_real: real-valued test feature matrix
    :param fm_train_dna: training DNA strings
    :param fm_test_dna: test DNA strings
    :return: ``(km_train, km_test, kernel)`` -- the kernel matrix on
        (train, train), the kernel matrix on (train, test), and the combined
        kernel, which is left initialised on (train, test)
    """
    from modshogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from modshogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    def attach(train_part, test_part, subkernel):
        # register one (train features, test features, sub-kernel) triple
        feats_train.append_feature_obj(train_part)
        feats_test.append_feature_obj(test_part)
        kernel.append_kernel(subkernel)

    # Gaussian kernel on the real-valued features
    attach(RealFeatures(fm_train_real),
           RealFeatures(fm_test_real),
           GaussianKernel(10, 1.1))

    # fixed-degree string kernel on the DNA strings
    degree = 3
    attach(StringCharFeatures(fm_train_dna, DNA),
           StringCharFeatures(fm_test_dna, DNA),
           FixedDegreeStringKernel(10, degree))

    # local-alignment string kernel on fresh copies of the DNA features
    attach(StringCharFeatures(fm_train_dna, DNA),
           StringCharFeatures(fm_test_dna, DNA),
           LocalAlignmentStringKernel(10))

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def mkl(train_features, train_labels, test_features, test_labels, width=5, C=1.2, epsilon=1e-2, mkl_epsilon=0.001, mkl_norm=2):
    """
    Train a multiclass MKL machine and print its training and test error.

    The combined kernel stacks a Gaussian, a linear and a degree-2
    polynomial kernel, all evaluated on the same feature objects.

    :param train_features: Shogun feature object with the training data
    :param train_labels: labels for ``train_features``
    :param test_features: Shogun feature object with the test data
    :param test_labels: labels for ``test_features``
    :param width: width passed to the Gaussian sub-kernel
    :param C: regularisation constant passed to ``MKLMulticlass``
    :param epsilon: value passed to ``set_epsilon``
    :param mkl_epsilon: value passed to ``set_mkl_epsilon``
    :param mkl_norm: value passed to ``set_mkl_norm``
    :return: None; both error rates are printed as percentages
    """
    from modshogun import CombinedKernel, CombinedFeatures
    from modshogun import GaussianKernel, LinearKernel, PolyKernel
    from modshogun import MKLMulticlass, MulticlassAccuracy

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # every sub-kernel gets its own entry in the combined feature objects,
    # all pointing at the same underlying data
    subkernels = (GaussianKernel(10, width), LinearKernel(), PolyKernel(10, 2))
    for subkernel in subkernels:
        feats_train.append_feature_obj(train_features)
        feats_test.append_feature_obj(test_features)
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    machine = MKLMulticlass(C, kernel, train_labels)
    machine.set_epsilon(epsilon)
    machine.set_mkl_epsilon(mkl_epsilon)
    machine.set_mkl_norm(mkl_norm)
    machine.train()
    train_output = machine.apply()

    # re-initialise the kernel on (train, test) before predicting test data
    kernel.init(feats_train, feats_test)
    test_output = machine.apply()

    evaluator = MulticlassAccuracy()
    train_error = (1 - evaluator.evaluate(train_output, train_labels)) * 100
    print('MKL training error is %.4f' % train_error)
    test_error = (1 - evaluator.evaluate(test_output, test_labels)) * 100
    print('MKL test error is %.4f' % test_error)
def kernel_combined_custom_poly_modular(train_fname=traindat, test_fname=testdat, train_label_fname=label_traindat):
    """
    Train a LibSVM on a combined custom + polynomial kernel and apply it.

    The combined kernel has two parts: a ``CustomKernel`` holding a
    precomputed degree-3 polynomial kernel matrix, and a degree-2
    ``PolyKernel`` evaluated on features read from the CSV files.

    :param train_fname: CSV file with the training features
    :param test_fname: CSV file with the test features
    :param train_label_fname: CSV file with the binary training labels
    :return: ``(km_train, kernel)`` -- the combined kernel's matrix and the
        combined kernel, both in the state reached after initialisation on
        (train, test); ``km_train`` is therefore not a train/train matrix
    """
    from modshogun import CombinedFeatures, RealFeatures, BinaryLabels
    from modshogun import CombinedKernel, PolyKernel, CustomKernel
    from modshogun import LibSVM, CSVFile

    # training phase
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    base_train = RealFeatures(CSVFile(train_fname))
    precompute = PolyKernel(10, 3)
    precompute.init(base_train, base_train)
    train_matrix = precompute.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(train_matrix))

    poly_train = RealFeatures(CSVFile(train_fname))
    feats_train.append_feature_obj(poly_train)
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    labels = BinaryLabels(CSVFile(train_label_fname))
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    # prediction phase: a new combined kernel over (train, test)
    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    base_test = RealFeatures(CSVFile(test_fname))
    precompute = PolyKernel(10, 3)
    precompute.init(base_train, base_test)
    test_matrix = precompute.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(test_matrix))

    poly_test = RealFeatures(CSVFile(test_fname))
    feats_pred.append_feature_obj(poly_test)
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.apply()  # output is not used further
    km_train = kernel.get_kernel_matrix()
    return km_train, kernel
def statistics_mmd_kernel_selection_single(m,distance,stretch,num_blobs,angle,selection_method):
    """
    Select a single Gaussian kernel for a linear-time MMD test.

    A combined kernel of Gaussian kernels with different widths is handed to
    a ``LinearTimeMMD`` on streamed Gaussian-blob data, one kernel is picked
    with the requested selection strategy, and the test is repeated a few
    times to collect error indicators.

    NOTE(review): ``m``, ``distance``, ``stretch``, ``num_blobs`` and
    ``angle`` are overwritten with fixed values inside the body, so the
    values passed by the caller are currently ignored. This is kept as-is
    because existing results depend on it; confirm whether it is intended.

    :param m: number of samples (overwritten, see note)
    :param distance: blob distance (overwritten, see note)
    :param stretch: blob stretch for q (overwritten, see note)
    :param num_blobs: number of blobs (overwritten, see note)
    :param angle: blob rotation angle for q (overwritten, see note)
    :param selection_method: one of ``"opt"``, ``"max"`` or ``"median"``
    :return: ``(kernel, typeIerrors, typeIIerrors)`` -- the selected
        Gaussian kernel and two lists with one boolean per trial
    :raises ValueError: if ``selection_method`` is not a known method
    """
    # Fail fast: previously an unknown method left ``selection`` unbound and
    # only surfaced as a NameError after all the set-up work had been done.
    known_methods = ("opt", "max", "median")
    if selection_method not in known_methods:
        raise ValueError("unknown selection_method %r, expected one of %s"
                         % (selection_method, ", ".join(known_methods)))

    from modshogun import RealFeatures
    from modshogun import GaussianBlobsDataGenerator
    from modshogun import GaussianKernel, CombinedKernel
    from modshogun import LinearTimeMMD
    from modshogun import MMDKernelSelectionMedian
    from modshogun import MMDKernelSelectionMax
    from modshogun import MMDKernelSelectionOpt
    from modshogun import PERMUTATION, MMD1_GAUSSIAN
    from modshogun import EuclideanDistance
    from modshogun import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets
    # results for this low number will be bad (unstable, type I error wrong)
    # (these assignments override the function arguments, see docstring)
    m = 1000
    distance = 10
    stretch = 5
    num_blobs = 3
    angle = pi / 4

    # streaming data generators
    gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # stream some data for the (disabled) plots below; the calls are kept
    # because removing them would presumably change what the generators
    # produce afterwards
    num_plot = 1000
    features = gen_p.get_streamed_features(num_plot)
    features = features.create_merged_copy(gen_q.get_streamed_features(num_plot))
    data = features.get_feature_matrix()

    #figure()
    #subplot(2,2,1)
    #grid(True)
    #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
    #title('$X\sim p$')
    #subplot(2,2,2)
    #grid(True)
    #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
    #title('$Y\sim q$')

    # combined kernel with Gaussian kernels inside (Shogun's Gaussian kernel
    # is parametrised differently from the standard form, see documentation)
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    combined = CombinedKernel()
    for width in widths:
        combined.append_kernel(GaussianKernel(10, width))

    # mmd instance using streaming features
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance for the requested strategy
    selectors = {
        "opt": MMDKernelSelectionOpt,
        "max": MMDKernelSelectionMax,
        "median": MMDKernelSelectionMedian,
    }
    selection = selectors[selection_method](mmd)

    # measures (just for information)
    # in case Opt: ratios of MMD and standard deviation
    # in case Max: MMDs for each kernel
    # Does not work for median method
    if selection_method != "median":
        ratios = selection.compute_measures()
        #print "Measures:", ratios

    #subplot(2,2,3)
    #plot(ratios)
    #title('Measures')

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    #print "selected kernel width:", kernel.get_width()

    # compute type I and II error (use many more trials). Type I error is only
    # estimated to check the MMD1_GAUSSIAN method for estimating the null
    # distribution. Testing has to happen on different data than kernel
    # selection, but the linear time mmd does this implicitly.
    mmd.set_kernel(kernel)
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # number of trials should be larger to compute tight confidence bounds
    num_trials = 5
    alpha = 0.05

    typeIerrors = []
    typeIIerrors = []
    for _ in range(num_trials):
        # simulating H0 effectively means p=q - rejecting is a type I error
        # NOTE(review): both indicators use "> alpha"; if perform_test()
        # returns a p-value, the type I indicator looks inverted. Left
        # unchanged because the returned values would differ -- confirm.
        mmd.set_simulate_h0(True)
        typeIerrors.append(mmd.perform_test() > alpha)
        mmd.set_simulate_h0(False)
        typeIIerrors.append(mmd.perform_test() > alpha)

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return kernel, typeIerrors, typeIIerrors
def linear_time_mmd_graphical():
    """
    Run a linear-time MMD two-sample test on mean-shift data and plot it.

    Samples of the alternative distribution and two null-distribution
    approximations (permutation sampling and a fitted Gaussian) are drawn
    and shown together with the data in a 2x3 grid of subplots on a new
    figure. All Shogun and pylab names are taken from module scope.

    :return: None; the function only prints and plots
    """
    # parameters, change to get different results
    m = 1000  # set to 10000 for a good test result
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 1

    # number of samples taken from null and alternative distribution
    num_null_samples = 150

    # streaming data generators for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # combined kernel with Gaussian kernels inside; Shogun's kernel width is
    # parametrised differently from the usual form, hence 2*sigma^2
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print("kernel widths: %s" % (widths,))
    combined = CombinedKernel()
    for width in widths:
        combined.append_kernel(GaussianKernel(10, width))

    # mmd instance using streaming features
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other
    # methods for selecting single kernels)
    selection = MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width: %s" % (kernel.get_width(),))

    # sample alternative distribution, stream ensures different samples each run
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution by permutation
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(num_null_samples)
    null_samples_boot = mmd.sample_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance = mmd.compute_variance_estimate()
    null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    # Fix: the q samples occupy columns m .. 2*m-1; the previous slice
    # started at m+1 and silently dropped the first of them.
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    # Fix: floor() yields a float; convert explicitly because array indices
    # must be integers.
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gaussian = null_samples_gaussian[int(floor(len(null_samples_gaussian) * (1 - alpha)))]

    # NOTE(review): both values count null samples *below* a threshold (about
    # 1-alpha rather than alpha), and the Gaussian one is compared against
    # thresh_boot although thresh_gaussian is what gets drawn on its plot.
    # Left unchanged because the intended definition is ambiguous -- confirm.
    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gaussian = sum(null_samples_gaussian < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot), min(null_samples_gaussian)]),
        max([max(null_samples_boot), max(null_samples_gaussian)]),
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_gaussian, 20, range=hist_range, normed=True)
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def runShogunSVMMultipleKernels(train_xt, train_lt, test_xt):
    """
    Train an MKL classifier on several spectrum kernels and predict labels.

    For every order in ``range(K1, K2, -1)`` the sequences are converted to
    sorted word features and paired with a ``CommWordStringKernel``; the
    combined kernel is trained with ``MKLClassification``.

    Uses module-level names: ``np``, the Shogun classes referenced below
    and the constants ``K1``, ``K2``, ``GAP`` and ``SVMC`` (``SHIFT`` only
    in the disabled weighted-degree variant).

    :param train_xt: training sequences (wrapped as DNA string features)
    :param train_lt: training labels aligned with ``train_xt``
    :param test_xt: test sequences (wrapped as DNA string features)
    :return: ``(out1, out2, train_lt)`` -- predicted labels for the
        resampled training set, predicted labels for the test set, and the
        resampled training labels
    """
    ##################################################
    # NOTE(review): this is a resample, not "all examples": 14000 indices are
    # drawn with replacement from [1, 14000), index 0 is never chosen, and
    # shorter inputs can raise IndexError. Kept unchanged -- confirm intent.
    idxs = np.random.randint(1, 14000, 14000)
    train_xt = np.array(train_xt)[idxs]
    train_lt = np.array(train_lt)[idxs]

    # Initialize kernel and features
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()
    labels = BinaryLabels(train_lt)

    ##################### Multiple Spectrum Kernels #########################
    for order in range(K1, K2, -1):
        # training data: char features converted to sorted word features
        charfeat_train = StringCharFeatures(list(train_xt), DNA)
        feats_train_k1 = StringWordFeatures(DNA)
        feats_train_k1.obtain_from_char(charfeat_train, order - 1, order, GAP, False)
        preproc = SortWordString()
        preproc.init(feats_train_k1)
        feats_train_k1.add_preprocessor(preproc)
        feats_train_k1.apply_preprocessor()

        # testing data: same conversion with the training preprocessor
        charfeat_test = StringCharFeatures(test_xt, DNA)
        feats_test_k1 = StringWordFeatures(DNA)
        feats_test_k1.obtain_from_char(charfeat_test, order - 1, order, GAP, False)
        feats_test_k1.add_preprocessor(preproc)
        feats_test_k1.apply_preprocessor()

        # Fix: append the word features prepared above instead of the raw
        # char features; otherwise the conversion and sorting are dead code
        # and the word-string kernel is paired with char features.
        feats_train.append_feature_obj(feats_train_k1)
        feats_test.append_feature_obj(feats_test_k1)

        # append spectrum kernel
        # NOTE(review): the order is passed as the second constructor
        # argument; verify this matches the CommWordStringKernel signature.
        kernel1 = CommWordStringKernel(10, order)
        kernel1.io.set_loglevel(MSG_DEBUG)
        kernel.append_kernel(kernel1)

    '''
    Uncomment this for Multiple Weighted degree kernels and comment
    the multiple spectrum kernel block above instead

    ##################### Multiple Weighted Degree Kernel #########################
    for i in range(K1,K2,-1):
        # append training data to combined feature object
        charfeat_train = StringCharFeatures(list(train_xt), DNA)

        # append testing data to combined feature object
        charfeat_test = StringCharFeatures(test_xt, DNA)

        # append features
        feats_train.append_feature_obj(charfeat_train);
        feats_test.append_feature_obj(charfeat_test);

        # setup weighted degree kernel
        kernel1=WeightedDegreePositionStringKernel(10,i);
        kernel1.io.set_loglevel(MSG_DEBUG);
        kernel1.set_shifts(SHIFT*np.ones(len(train_xt[0]), dtype=np.int32))
        kernel1.set_position_weights(np.ones(len(train_xt[0]), dtype=np.float64));
        kernel.append_kernel(kernel1);
    '''

    ##################### Training #########################
    print("Starting MKL training..")
    mkl = MKLClassification()
    mkl.set_mkl_norm(3)  # 1,2,3
    mkl.set_C(SVMC, SVMC)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)
    mkl.train(feats_train)

    print("Making predictions!")
    out1 = mkl.apply(feats_train).get_labels()
    out2 = mkl.apply(feats_test).get_labels()

    return out1, out2, train_lt
def mkl(train_features, train_labels, test_features, test_labels, width=5, C=1.2, epsilon=1e-2, mkl_epsilon=0.001, mkl_norm=2):
    """
    Fit a multiclass MKL machine and report training and test error.

    A Gaussian, a linear and a degree-2 polynomial kernel are combined; the
    same feature object is registered once per sub-kernel.

    :param train_features: Shogun feature object with the training data
    :param train_labels: labels for ``train_features``
    :param test_features: Shogun feature object with the test data
    :param test_labels: labels for ``test_features``
    :param width: width passed to the Gaussian sub-kernel
    :param C: regularisation constant passed to ``MKLMulticlass``
    :param epsilon: value passed to ``set_epsilon``
    :param mkl_epsilon: value passed to ``set_mkl_epsilon``
    :param mkl_norm: value passed to ``set_mkl_norm``
    :return: None; the two error percentages are printed
    """
    from modshogun import CombinedKernel, CombinedFeatures
    from modshogun import GaussianKernel, LinearKernel, PolyKernel
    from modshogun import MKLMulticlass, MulticlassAccuracy

    combined_kernel = CombinedKernel()
    combined_train = CombinedFeatures()
    combined_test = CombinedFeatures()

    def register(subkernel):
        # one feature-object slot per sub-kernel, all on the same data
        combined_train.append_feature_obj(train_features)
        combined_test.append_feature_obj(test_features)
        combined_kernel.append_kernel(subkernel)

    register(GaussianKernel(10, width))
    register(LinearKernel())
    register(PolyKernel(10, 2))

    # train on (train, train)
    combined_kernel.init(combined_train, combined_train)
    classifier = MKLMulticlass(C, combined_kernel, train_labels)
    classifier.set_epsilon(epsilon)
    classifier.set_mkl_epsilon(mkl_epsilon)
    classifier.set_mkl_norm(mkl_norm)
    classifier.train()
    train_output = classifier.apply()

    # switch the kernel to (train, test) before predicting the test data
    combined_kernel.init(combined_train, combined_test)
    test_output = classifier.apply()

    evaluator = MulticlassAccuracy()
    print('MKL training error is %.4f' % (
        (1 - evaluator.evaluate(train_output, train_labels)) * 100))
    print('MKL test error is %.4f' % (
        (1 - evaluator.evaluate(test_output, test_labels)) * 100))
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """
    Serialize an SVM with string kernels and check the reloaded copy.

    An ``SVMLight`` is trained on a combined kernel (one weighted-degree
    position kernel and two weighted comm-word kernels), written to
    ``serialized_svm.bz2`` with ``save``, read back with ``load``, and both
    machines are applied to the same test features.

    Uses the module-level helpers ``generate_random_data``,
    ``construct_features``, ``save`` and ``load``.

    :param n_data: passed to ``generate_random_data`` for train and test data
    :param num_shifts: shift value used for every position of the
        weighted-degree position kernel
    :param size: first constructor argument of all three sub-kernels
    :return: ``(out, out2)`` -- labels predicted by the original and by the
        reloaded SVM
    :raises AssertionError: if the two predictions differ by 1e-6 or more
    """
    ##################################################
    # set up toy data and svm
    train_xt, train_lt = generate_random_data(n_data)
    test_xt, _test_lt = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    # one shift entry per position of the first training sequence
    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32) * num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    ########
    # set up spectrum
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    ########
    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    # init kernel
    labels = BinaryLabels(train_lt)

    svm = SVMLight(1.0, kernel, labels)
    #svm.io.set_loglevel(MSG_DEBUG)
    svm.train(feats_train)

    ##################################################
    # serialize to file
    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check
    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    # Fix: the closing parenthesis of abs() was misplaced, so the check took
    # abs() of a boolean and passed whenever out[i] was not smaller than
    # out2[i] (and mismatches in the other direction could still slip through
    # only by accident). Compare the absolute difference instead.
    for i in range(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001, \
            "prediction %d differs after reload: %r vs %r" % (i, out[i], out2[i])

    #print("all checks passed.")

    return out, out2