def mlprocess(task_filename, data_filename, pred_filename, verbose=True):
    """Demo of creating machine learning process."""
    task_type, fidx, lidx, train_idx, test_idx = parse_task(task_filename)
    outputs = init_output(task_type)
    all_data = parse_data(data_filename)
    train_ex, train_lab, test_ex, test_lab = split_data(all_data, fidx, lidx, train_idx, test_idx)
    label_train = outputs.str2label(train_lab)
    if verbose:
        print 'Number of features: %d' % train_ex.shape[0]
        print '%d training examples, %d test examples' % (len(train_lab), len(test_lab))

    feats_train = RealFeatures(train_ex)
    feats_test = RealFeatures(test_ex)
    width = 1.0
    kernel = GaussianKernel(feats_train, feats_train, width)
    labels = Labels(label_train)

    svm = init_svm(task_type, kernel, labels)
    svm.train()

    kernel.init(feats_train, feats_test)
    preds = svm.classify().get_labels()
    pred_label = outputs.label2str(preds)

    pf = open(pred_filename, 'w')
    for pred in pred_label:
        pf.write(pred + '\n')
    pf.close()
def statistics_kmm(n, d):
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel, MSG_DEBUG
    from shogun.Statistics import KernelMeanMatching
    from shogun.Mathematics import Math

    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    data = random.randn(d, n)

    # create shogun feature representation
    features = RealFeatures(data)

    # use a kernel width of sigma=2, which is 8 in SHOGUN's parametrization,
    # which is k(x,y)=exp(-||x-y||^2 / tau), in contrast to the standard
    # k(x,y)=exp(-||x-y||^2 / (2*sigma^2)), so tau=2*sigma^2
    kernel = GaussianKernel(10, 8)
    kernel.init(features, features)

    kmm = KernelMeanMatching(kernel, array([0, 1, 2, 3, 7, 8, 9], dtype=int32),
                             array([4, 5, 6], dtype=int32))
    w = kmm.compute_weights()
    #print w
    return w
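# A minimal sketch (not part of the original example) illustrating SHOGUN's
# Gaussian kernel parametrization k(x,y)=exp(-||x-y||^2 / tau) with
# tau=2*sigma^2, checked against a direct numpy computation:
import numpy
from shogun.Features import RealFeatures
from shogun.Kernel import GaussianKernel

points = numpy.array([[0.0, 1.0]])   # two 1-dimensional points: 0 and 1
feats = RealFeatures(points)
sigma = 2.0
tau = 2 * sigma**2                   # tau=8 for sigma=2, as in statistics_kmm above
kernel = GaussianKernel(feats, feats, tau)
km = kernel.get_kernel_matrix()
# the off-diagonal entry should equal exp(-||0-1||^2 / tau)
print km[0, 1], numpy.exp(-1.0 / tau)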
def classify(classifier, features, labels, C=5, kernel_name=None, kernel_args=None):
    import numpy as np
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel

    sigma = 10000
    kernel = GaussianKernel(features, features, sigma)
    # TODO: select the kernel from kernel_name/kernel_args, e.g.
    # kernel = LinearKernel(features, features)
    # kernel = PolyKernel(features, features, 50, 2)
    # kernel = kernels[kernel_name](features, features, *kernel_args)

    svm = classifier(C, kernel, labels)
    svm.train(features)

    x_size = 640
    y_size = 400
    size = 100
    x1 = np.linspace(0, x_size, size)
    y1 = np.linspace(0, y_size, size)
    x, y = np.meshgrid(x1, y1)

    test = RealFeatures(np.array((np.ravel(x), np.ravel(y))))
    kernel.init(features, test)

    out = svm.apply(test).get_values()
    if not len(out):
        out = svm.apply(test).get_labels()
    z = out.reshape((size, size))
    z = np.transpose(z)
    return x, y, z
def regression_svrlight_modular(fm_train=traindat, fm_test=testdat, label_train=label_traindat, \
                                width=1.2, C=1, epsilon=1e-5, tube_epsilon=1e-2, num_threads=3):
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    try:
        from shogun.Regression import SVRLight
    except ImportError:
        print('No support for SVRLight available.')
        return

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = Labels(label_train)

    svr = SVRLight(C, epsilon, kernel, labels)
    svr.set_tube_epsilon(tube_epsilon)
    svr.parallel.set_num_threads(num_threads)
    svr.train()

    kernel.init(feats_train, feats_test)
    out = svr.apply().get_labels()
    return out, kernel
def kernel_io_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.9):
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Library import AsciiFile, BinaryFile

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()
    f = AsciiFile("gaussian_train.ascii", "w")
    kernel.save(f)
    del f

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    f = AsciiFile("gaussian_test.ascii", "w")
    kernel.save(f)
    del f

    # clean up
    import os
    os.unlink("gaussian_test.ascii")
    os.unlink("gaussian_train.ascii")

    return km_train, km_test, kernel
def createKernel(self, feats_train):
    """Call the corresponding constructor for the kernel"""
    if self.kparam['name'] == 'gauss':
        kernel = GaussianKernel(feats_train, feats_train, self.kparam['width'])
    elif self.kparam['name'] == 'linear':
        kernel = LinearKernel(feats_train, feats_train, self.kparam['scale'])
    elif self.kparam['name'] == 'poly':
        kernel = PolyKernel(feats_train, feats_train, self.kparam['degree'],
                            self.kparam['inhomogene'], self.kparam['normal'])
    elif self.kparam['name'] == 'wd':
        kernel = WeightedDegreePositionStringKernel(feats_train, feats_train, self.kparam['degree'])
        kernel.set_shifts(self.kparam['shift'] *
                          numpy.ones(self.kparam['seqlength'], dtype=numpy.int32))
    elif self.kparam['name'] == 'spec':
        kernel = CommWordStringKernel(feats_train, feats_train)
    elif self.kparam['name'] == 'localalign':
        kernel = LocalAlignmentStringKernel(feats_train, feats_train)
    elif self.kparam['name'] == 'localimprove':
        kernel = LocalityImprovedStringKernel(feats_train, feats_train, self.kparam['length'], \
                                              self.kparam['indeg'], self.kparam['outdeg'])
    else:
        print 'Unknown kernel %s' % self.kparam['name']
        raise ValueError

    self.kernel = kernel
    return kernel
def kernel_gaussian_modular(fm_train_real=traindat, fm_test_real=testdat, width=1.3):
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
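# A hedged usage sketch for kernel_gaussian_modular with random stand-in data
# (the 'traindat'/'testdat' defaults are defined elsewhere in these examples):
from numpy.random import randn

train = randn(2, 11)
test = randn(2, 17)
km_train, km_test, kernel = kernel_gaussian_modular(train, test, width=1.3)
print km_train.shape, km_test.shape   # (11, 11) and (11, 17)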
def create_param_tree():
    root = ModelSelectionParameters()

    c1 = ModelSelectionParameters("C1")
    root.append_child(c1)
    c1.build_values(-1.0, 1.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    root.append_child(c2)
    c2.build_values(-1.0, 1.0, R_EXP)

    gaussian_kernel = GaussianKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    gaussian_kernel.print_modsel_params()

    param_gaussian_kernel = ModelSelectionParameters("kernel", gaussian_kernel)
    gaussian_kernel_width = ModelSelectionParameters("width")
    gaussian_kernel_width.build_values(-1.0, 1.0, R_EXP, 1.0, 2.0)
    param_gaussian_kernel.append_child(gaussian_kernel_width)
    root.append_child(param_gaussian_kernel)

    power_kernel = PowerKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    power_kernel.print_modsel_params()

    param_power_kernel = ModelSelectionParameters("kernel", power_kernel)
    root.append_child(param_power_kernel)

    param_power_kernel_degree = ModelSelectionParameters("degree")
    param_power_kernel_degree.build_values(1.0, 2.0, R_LINEAR)
    param_power_kernel.append_child(param_power_kernel_degree)

    metric = MinkowskiMetric(10)

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    metric.print_modsel_params()

    param_power_kernel_metric1 = ModelSelectionParameters("distance", metric)
    param_power_kernel.append_child(param_power_kernel_metric1)

    param_power_kernel_metric1_k = ModelSelectionParameters("k")
    param_power_kernel_metric1_k.build_values(1.0, 2.0, R_LINEAR)
    param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)

    return root
def gaussian():
    print 'Gaussian'
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    width = 1.9

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
def classifier_libsvm_minimal_modular(fm_train_real=traindat, fm_test_real=testdat,
                                      label_train_twoclass=label_traindat, width=2.1, C=1):
    from shogun.Features import RealFeatures, BinaryLabels
    from shogun.Classifier import LibSVM
    from shogun.Kernel import GaussianKernel

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = BinaryLabels(label_train_twoclass)
    svm = LibSVM(C, kernel, labels)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    testerr = mean(sign(out) != label_train_twoclass)
def create_param_tree():
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP, R_LINEAR
    from shogun.ModelSelection import ParameterCombination
    from shogun.Kernel import GaussianKernel, PolyKernel

    root = ModelSelectionParameters()

    tau = ModelSelectionParameters("tau")
    root.append_child(tau)

    # also R_LINEAR/R_LOG is available as type
    min_value = -1
    max_value = 1
    value_type = R_EXP
    step = 1.5
    base = 2
    tau.build_values(min_value, max_value, value_type, step, base)

    # gaussian kernel with width
    gaussian_kernel = GaussianKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    gaussian_kernel.print_modsel_params()

    param_gaussian_kernel = ModelSelectionParameters("kernel", gaussian_kernel)
    gaussian_kernel_width = ModelSelectionParameters("width")
    gaussian_kernel_width.build_values(5.0, 8.0, R_EXP, 1.0, 2.0)
    param_gaussian_kernel.append_child(gaussian_kernel_width)
    root.append_child(param_gaussian_kernel)

    # polynomial kernel with degree
    poly_kernel = PolyKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    poly_kernel.print_modsel_params()

    param_poly_kernel = ModelSelectionParameters("kernel", poly_kernel)
    root.append_child(param_poly_kernel)

    # note that integers are used here
    param_poly_kernel_degree = ModelSelectionParameters("degree")
    param_poly_kernel_degree.build_values(1, 2, R_LINEAR)
    param_poly_kernel.append_child(param_poly_kernel_degree)

    return root
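# A small sketch of how such a parameter tree can be inspected; it follows the
# pattern used in modelselection_parameter_tree_modular further below:
tree = create_param_tree()
combinations = tree.get_combinations()
print combinations.get_num_elements()   # number of candidate parameter settings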
def mkl_binclass_modular(train_data, test_data, train_labels, test_labels, d1, d2):
    # create some Gaussian train/test matrix
    tfeats = RealFeatures(train_data)
    tkernel = GaussianKernel(128, d1)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(test_data)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(train_data))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(GaussianKernel(128, d2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = Labels(train_labels)
    mkl = MKLClassification()

    # disable interleaved optimization (i.e. do not use SVMLight's variant)
    mkl.set_interleaved_optimization_enabled(0)

    # which norm to use for MKL
    mkl.set_mkl_norm(2)

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()

    # test
    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(test_data))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(GaussianKernel(128, d2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    output = mkl.apply().get_labels()
    output = array([1.0 if i > 0 else -1.0 for i in output])
    accu = len(where(output == test_labels)[0]) / float(len(output))
    return accu
def classifier_multiclassmachine_modular(fm_train_real=traindat, fm_test_real=testdat,
                                         label_train_multiclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM, KernelMulticlassMachine, ONE_VS_REST_STRATEGY

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = Labels(label_train_multiclass)

    classifier = LibSVM(C, kernel, labels)
    classifier.set_epsilon(epsilon)

    mc_classifier = KernelMulticlassMachine(ONE_VS_REST_STRATEGY, kernel, classifier, labels)
    mc_classifier.train()

    kernel.init(feats_train, feats_test)
    out = mc_classifier.apply().get_labels()
    return out
def classifier_libsvmoneclass_modular(fm_train_real=traindat, fm_test_real=testdat,
                                      width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVMOneClass

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = LibSVMOneClass(C, kernel)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
def classifier_multiclasslibsvm_modular(fm_train_real=traindat, fm_test_real=testdat,
                                        label_train_multiclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import MulticlassLibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = Labels(label_train_multiclass)

    svm = MulticlassLibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
def classifier_gmnpsvm_modular(fm_train_real=traindat, fm_test_real=testdat,
                               label_train_multiclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import GMNPSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = MulticlassLabels(label_train_multiclass)

    svm = GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train(feats_train)

    kernel.init(feats_train, feats_test)
    out = svm.apply(feats_test).get_labels()
    return out, kernel
def regression_kernel_ridge_modular(fm_train=traindat, fm_test=testdat, label_train=label_traindat,
                                    width=0.8, tau=1e-6):
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import KernelRidgeRegression

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = Labels(label_train)

    krr = KernelRidgeRegression(tau, kernel, labels)
    krr.train(feats_train)

    kernel.init(feats_train, feats_test)
    out = krr.apply().get_labels()
    return out, kernel, krr
def regression_kernel_ridge_modular(n=100, n_test=100, \
                                    x_range=6, x_range_test=10, noise_var=0.5, width=1, tau=1e-6, seed=1):
    from shogun.Features import RegressionLabels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import KernelRidgeRegression

    # reproducible results
    random.seed(seed)

    # easy regression data: one dimensional noisy sine wave
    # (note: these assignments override the corresponding arguments above)
    n = 15
    n_test = 100
    x_range_test = 10
    noise_var = 0.5
    X = random.rand(1, n) * x_range
    X_test = array([[float(i) / n_test * x_range_test for i in range(n_test)]])
    Y_test = sin(X_test)
    Y = sin(X) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(Y[0])
    feats_train = RealFeatures(X)
    feats_test = RealFeatures(X_test)

    kernel = GaussianKernel(feats_train, feats_train, width)

    krr = KernelRidgeRegression(tau, kernel, labels)
    krr.train(feats_train)

    kernel.init(feats_train, feats_test)
    out = krr.apply().get_labels()

    # plot results
    #plot(X[0], Y[0], 'x')            # training observations
    #plot(X_test[0], Y_test[0], '-')  # ground truth of test
    #plot(X_test[0], out, '-')        # mean predictions of test
    #legend(["training", "ground truth", "mean predictions"])
    #show()

    return out, kernel, krr
def regression_libsvr_modular(svm_c=1, svr_param=0.1, n=100, n_test=100, \
                              x_range=6, x_range_test=10, noise_var=0.5, width=1, seed=1):
    from shogun.Features import RegressionLabels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import LibSVR, LIBSVR_NU_SVR, LIBSVR_EPSILON_SVR

    # reproducible results
    random.seed(seed)

    # easy regression data: one dimensional noisy sine wave
    # (note: these assignments override the corresponding arguments above)
    n = 15
    n_test = 100
    x_range_test = 10
    noise_var = 0.5
    X = random.rand(1, n) * x_range
    X_test = array([[float(i) / n_test * x_range_test for i in range(n_test)]])
    Y_test = sin(X_test)
    Y = sin(X) + random.randn(n) * noise_var

    # shogun representation
    labels = RegressionLabels(Y[0])
    feats_train = RealFeatures(X)
    feats_test = RealFeatures(X_test)

    kernel = GaussianKernel(feats_train, feats_train, width)

    # two svr models: epsilon and nu
    svr_epsilon = LibSVR(svm_c, svr_param, kernel, labels, LIBSVR_EPSILON_SVR)
    svr_epsilon.train()
    svr_nu = LibSVR(svm_c, svr_param, kernel, labels, LIBSVR_NU_SVR)
    svr_nu.train()

    # predictions
    kernel.init(feats_train, feats_test)
    out1_epsilon = svr_epsilon.apply().get_labels()
    out2_epsilon = svr_epsilon.apply(feats_test).get_labels()
    out1_nu = svr_nu.apply().get_labels()
    out2_nu = svr_nu.apply(feats_test).get_labels()

    return out1_epsilon, out2_epsilon, out1_nu, out2_nu, kernel
def krr():
    print 'KRR'
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import KRR

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)
    width = 0.8

    kernel = GaussianKernel(feats_train, feats_train, width)

    tau = 1e-6
    labels = Labels(label_train)

    krr = KRR(tau, kernel, labels)
    krr.train(feats_train)

    kernel.init(feats_train, feats_test)
    out = krr.classify().get_labels()
    return out
def classifier_multiclassmachine_modular(fm_train_real=traindat, fm_test_real=testdat,
                                         label_train_multiclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, MulticlassLabels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = MulticlassLabels(label_train_multiclass)

    classifier = LibSVM()
    classifier.set_epsilon(epsilon)
    #print labels.get_labels()

    mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(), kernel, classifier, labels)
    mc_classifier.train()

    kernel.init(feats_train, feats_test)
    out = mc_classifier.apply().get_labels()
    return out
def classifier_libsvm_modular(fm_train_real=traindat, fm_test_real=testdat,
                              label_train_twoclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    labels = Labels(label_train_twoclass)

    svm = LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    sv_idx = svm.get_support_vectors()
    alphas = svm.get_alphas()

    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
def libsvm_oneclass():
    print 'LibSVMOneClass'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVMOneClass

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    width = 2.1

    kernel = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    svm = LibSVMOneClass(C, kernel)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()
def mpdsvm():
    print 'MPDSVM'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import MPDSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    width = 2.1

    kernel = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    labels = Labels(label_train_twoclass)

    svm = MPDSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()
def regression_libsvr_modular(fm_train=traindat, fm_test=testdat, label_train=label_traindat, \
                              width=2.1, C=1, epsilon=1e-5, tube_epsilon=1e-2):
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import LibSVR

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    kernel = GaussianKernel(feats_train, feats_train, width)
    labels = Labels(label_train)

    svr = LibSVR(C, tube_epsilon, kernel, labels)
    svr.set_epsilon(epsilon)
    svr.train()

    kernel.init(feats_train, feats_test)
    out1 = svr.apply().get_labels()
    out2 = svr.apply(feats_test).get_labels()

    return out1, out2, kernel
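# A hedged usage sketch for regression_libsvr_modular, using hypothetical noisy
# sine data in place of the 'traindat'/'testdat'/'label_traindat' defaults
# (mirroring the toy data used in the sine-wave examples above):
from numpy import sin
from numpy.random import rand, randn

X = rand(1, 30) * 6
X_test = rand(1, 30) * 6
Y = sin(X)[0] + randn(30) * 0.1
out1, out2, kernel = regression_libsvr_modular(X, X_test, Y, width=2.1, C=1)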
def libsvr():
    print 'LibSVR'
    from shogun.Features import Labels, RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import LibSVR

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)
    width = 2.1

    kernel = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    tube_epsilon = 1e-2
    labels = Labels(label_train)

    svr = LibSVR(C, epsilon, kernel, labels)
    svr.set_tube_epsilon(tube_epsilon)
    svr.train()

    kernel.init(feats_train, feats_test)
    out1 = svr.classify().get_labels()
    out2 = svr.classify(feats_test).get_labels()
def libsvm():
    print 'LibSVM'
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    width = 2.1

    kernel = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    labels = Labels(label_train_twoclass)

    svm = LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()
    sv_idx = svm.get_support_vectors()
    alphas = svm.get_alphas()
def __init__(self, traininput, testinput, traintarget, width=0.5, C=1, epsilon=0.1, tube_epsilon=0.1):
    train = matrix(traininput, dtype=float64)
    test = matrix(testinput, dtype=float64)
    label_train = array(traintarget, dtype=float64)

    self.feats_train = RealFeatures(train)
    feats_test = RealFeatures(test)

    trainstart = time.time()
    self.kernel = GaussianKernel(self.feats_train, self.feats_train, width)
    # self.kernel = PolyKernel(self.feats_train, self.feats_train, 2, False)

    labels = Labels(label_train)
    self.svr = LibSVR(C, epsilon, self.kernel, labels)
    self.svr.set_tube_epsilon(tube_epsilon)
    self.svr.train()
    trainend = time.time()
    print 'SVR train time'
    print trainend - trainstart
def linear_time_mmd_graphical():
    # parameters, change to get different results
    m = 1000  # set to 10000 for a good test result
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 1

    # number of samples taken from null and alternative distribution
    num_null_samples = 150

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian
    # kernel is different to the standard form, see documentation): it is
    # k(x,y)=exp(-||x-y||^2 / tau) with tau=2*sigma^2
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other methods
    # for selecting single kernels)
    selection = MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution, stream ensures different samples each run
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance = mmd.compute_variance_estimate()
    null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m], data[1][m + 1:2 * m], 'bo', label='$y$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gaussian = null_samples_gaussian[int(floor(len(null_samples_gaussian) * (1 - alpha)))]

    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gaussian = sum(null_samples_gaussian < thresh_gaussian) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gaussian)]),
                  max([max(null_samples_boot), max(null_samples_gaussian)])]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_gaussian, 20, range=hist_range, normed=True)
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def create_kernel(kname, kparam, feats_train):
    """Call the corresponding constructor for the kernel"""
    if kname == 'gauss':
        kernel = GaussianKernel(feats_train, feats_train, kparam['width'])
    elif kname == 'linear':
        kernel = LinearKernel(feats_train, feats_train)
        kernel.set_normalizer(AvgDiagKernelNormalizer(kparam['scale']))
    elif kname == 'poly':
        kernel = PolyKernel(feats_train, feats_train, kparam['degree'],
                            kparam['inhomogene'], kparam['normal'])
    elif kname == 'wd':
        kernel = WeightedDegreePositionStringKernel(feats_train, feats_train, kparam['degree'])
        kernel.set_normalizer(AvgDiagKernelNormalizer(float(kparam['seqlength'])))
        kernel.set_shifts(kparam['shift'] * numpy.ones(kparam['seqlength'], dtype=numpy.int32))
        #kernel=WeightedDegreeStringKernel(feats_train, feats_train, kparam['degree'])
    elif kname == 'spec':
        kernel = CommUlongStringKernel(feats_train, feats_train)
    elif kname == 'cumspec':
        kernel = WeightedCommWordStringKernel(feats_train, feats_train)
        kernel.set_weights(numpy.ones(kparam['degree']))
    elif kname == 'spec2':
        kernel = CombinedKernel()
        k0 = CommWordStringKernel(feats_train['f0'], feats_train['f0'])
        k0.io.disable_progress()
        kernel.append_kernel(k0)
        k1 = CommWordStringKernel(feats_train['f1'], feats_train['f1'])
        k1.io.disable_progress()
        kernel.append_kernel(k1)
    elif kname == 'cumspec2':
        kernel = CombinedKernel()
        k0 = WeightedCommWordStringKernel(feats_train['f0'], feats_train['f0'])
        k0.set_weights(numpy.ones(kparam['degree']))
        k0.io.disable_progress()
        kernel.append_kernel(k0)
        k1 = WeightedCommWordStringKernel(feats_train['f1'], feats_train['f1'])
        k1.set_weights(numpy.ones(kparam['degree']))
        k1.io.disable_progress()
        kernel.append_kernel(k1)
    elif kname == 'localalign':
        kernel = LocalAlignmentStringKernel(feats_train, feats_train)
    elif kname == 'localimprove':
        kernel = LocalityImprovedStringKernel(feats_train, feats_train, kparam['length'], \
                                              kparam['indeg'], kparam['outdeg'])
    else:
        print 'Unknown kernel %s' % kname
        raise ValueError

    kernel.set_cache_size(32)
    return kernel
train_feats = train_data[:, :-1]
test_feats = test_data[:, :-1]
train_label = train_data[:, -1]
test_label = test_data[:, -1]

features_train = RealFeatures(train_feats.T)
features_test = RealFeatures(test_feats.T)
labels_train = BinaryLabels(train_label.T)
labels_test = BinaryLabels(test_label.T)

epsilon = 0.001
C = 100000

combined_kernel = CombinedKernel()
#gauss_kernel_1 = GaussianKernel(features_train, features_train, 15)
gauss_kernel_1 = GaussianKernel(features_train, features_train, 2)
gauss_kernel_2 = GaussianKernel(features_train, features_train, 0.5)
combined_kernel.append_kernel(gauss_kernel_1)
combined_kernel.append_kernel(gauss_kernel_2)
combined_kernel.init(features_train, features_train)

libsvm = LibSVM()
svm = MKLClassification(libsvm)
svm.set_interleaved_optimization_enabled(False)
svm.set_kernel(combined_kernel)
svm.set_labels(labels_train)
svm.train()
beta = combined_kernel.get_subkernel_weights()
def modelselection_grid_search_libsvr_modular(fm_train=traindat, fm_test=testdat, label_train=label_traindat, \
                                              width=2.1, C=1, epsilon=1e-5, tube_epsilon=1e-2):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import MeanSquaredError
    from shogun.Evaluation import CrossValidationSplitting
    from shogun.Features import RegressionLabels
    from shogun.Features import RealFeatures
    from shogun.Kernel import GaussianKernel
    from shogun.Regression import LibSVR
    from shogun.ModelSelection import GridSearchModelSelection
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP
    from shogun.ModelSelection import ParameterCombination

    # training data
    features_train = RealFeatures(fm_train)
    labels = RegressionLabels(label_train)

    # kernel
    kernel = GaussianKernel(features_train, features_train, width)

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #kernel.print_modsel_params()

    # predictor
    predictor = LibSVR(C, tube_epsilon, kernel, labels)
    predictor.set_epsilon(epsilon)

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use "StratifiedCrossValidationSplitting", but the standard
    # "CrossValidationSplitting" is also available)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features_train, labels,
                                       splitting_strategy, evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) request 95% confidence intervals for results (not actually needed
    # for this toy example)
    cross_validation.set_conf_int_alpha(0.05)

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #predictor.print_modsel_params()

    # build parameter tree to select C1 and C2
    param_tree_root = ModelSelectionParameters()
    c1 = ModelSelectionParameters("C1")
    param_tree_root.append_child(c1)
    c1.build_values(-2.0, 2.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    param_tree_root.append_child(c2)
    c2.build_values(-2.0, 2.0, R_EXP)

    # model selection instance
    model_selection = GridSearchModelSelection(param_tree_root, cross_validation)

    # perform model selection with selected methods
    #print "performing model selection of"
    #print "parameter tree"
    #param_tree_root.print_tree()
    #print "starting model selection"

    # print the current parameter combination; if False, nothing is printed
    print_state = False

    # lock data before model selection since it will not change the kernel matrix
    # (use with care). This avoids that the kernel matrix is recomputed in every
    # iteration of the model search
    predictor.data_lock(labels, features_train)
    best_parameters = model_selection.select_model(print_state)

    # print best parameters
    #print "best parameters:"
    #best_parameters.print_tree()

    # apply them and print result
    best_parameters.apply_to_machine(predictor)
    result = cross_validation.evaluate()
def modelselection_parameter_tree_modular(dummy):
    from shogun.ModelSelection import ParameterCombination
    from shogun.ModelSelection import ModelSelectionParameters, R_EXP, R_LINEAR
    from shogun.Kernel import PowerKernel
    from shogun.Kernel import GaussianKernel
    from shogun.Kernel import DistantSegmentsKernel
    from shogun.Distance import MinkowskiMetric

    root = ModelSelectionParameters()

    combinations = root.get_combinations()
    combinations.get_num_elements()

    c = ModelSelectionParameters('C')
    root.append_child(c)
    c.build_values(1, 11, R_EXP)

    power_kernel = PowerKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #power_kernel.print_modsel_params()

    param_power_kernel = ModelSelectionParameters('kernel', power_kernel)
    root.append_child(param_power_kernel)

    param_power_kernel_degree = ModelSelectionParameters('degree')
    param_power_kernel_degree.build_values(1, 1, R_EXP)
    param_power_kernel.append_child(param_power_kernel_degree)

    metric1 = MinkowskiMetric(10)

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #metric1.print_modsel_params()

    param_power_kernel_metric1 = ModelSelectionParameters('distance', metric1)
    param_power_kernel.append_child(param_power_kernel_metric1)

    param_power_kernel_metric1_k = ModelSelectionParameters('k')
    param_power_kernel_metric1_k.build_values(1, 12, R_LINEAR)
    param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)

    gaussian_kernel = GaussianKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #gaussian_kernel.print_modsel_params()

    param_gaussian_kernel = ModelSelectionParameters('kernel', gaussian_kernel)
    root.append_child(param_gaussian_kernel)

    param_gaussian_kernel_width = ModelSelectionParameters('width')
    param_gaussian_kernel_width.build_values(1, 2, R_EXP)
    param_gaussian_kernel.append_child(param_gaussian_kernel_width)

    ds_kernel = DistantSegmentsKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included, simply write to the mailing list
    #ds_kernel.print_modsel_params()

    param_ds_kernel = ModelSelectionParameters('kernel', ds_kernel)
    root.append_child(param_ds_kernel)

    param_ds_kernel_delta = ModelSelectionParameters('delta')
    param_ds_kernel_delta.build_values(1, 2, R_EXP)
    param_ds_kernel.append_child(param_ds_kernel_delta)

    param_ds_kernel_theta = ModelSelectionParameters('theta')
    param_ds_kernel_theta.build_values(1, 2, R_EXP)
    param_ds_kernel.append_child(param_ds_kernel_theta)

    # root.print_tree()
    combinations = root.get_combinations()
    # for i in range(combinations.get_num_elements()):
    #     combinations.get_element(i).print_tree()

    return
def quadratic_time_mmd_graphical():
    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute MMD on joint sample;
    # alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian
    # kernel is different to the standard form, see documentation): it is
    # k(x,y)=exp(-||x-y||^2 / tau) with tau=2*sigma^2
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (this can easily be replaced by the other methods
    # for selecting single kernels)
    selection = MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        # Stream examples and merge them in order to replace in MMD
        features = gen_p.get_streamed_features(m)
        features = features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # sample from null distribution
    # spectrum, biased statistic
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(num_null_samples, m - 10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()

    # sample gamma with parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m], data[1][m + 1:2 * m], 'bo', label='$y$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_spectrum.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_spectrum = null_samples_spectrum[int(floor(len(null_samples_spectrum) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_spectrum = sum(null_samples_spectrum < thresh_spectrum) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma < thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_spectrum), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_spectrum), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution spectrum
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_spectrum, 20, range=hist_range, normed=True)
    axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
    title('Null Dist. Spectrum\nType I error is ' + str(type_one_error_spectrum))

    # plot null distribution gamma
    subplot(2, 3, 6)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed
    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """
        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """
        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    #print("#################################")
    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")
    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())
    return svm, svm.get_objective(), svm2, svm2.get_objective()
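# A hedged usage sketch: train an SVMLight model on a small toy problem,
# serialize it, reload it, and compare objectives (assumes shogun was compiled
# with SVMLight support; the argument values here are illustrative):
svm, obj_before, svm2, obj_after = serialization_svmlight_modular(num=10, dist=1.0, width=2.1, C=1.0)
print obj_before, obj_after   # the two objectives should agree closely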
def statistics_linear_time_mmd(n, dim, difference):
    from shogun.Features import RealFeatures
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets,
    # so increase n to get reasonable results

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # compute median data distance in order to use for Gaussian kernel width:
    # 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width is different to the usual parametrization,
    # therefore 0.5*2*median_distance^2.
    # Use a subset of data for that, only 200 elements. Median is stable.

    # Stream examples and merge them in order to compute median on joint sample
    features = gen_p.get_streamed_features(100)
    features = features.create_merged_copy(gen_q.get_streamed_features(100))

    # compute all pairwise distances
    dist = EuclideanDistance(features, features)
    distances = dist.get_distance_matrix()

    # compute median and determine kernel width (using shogun)
    median_distance = Statistics.matrix_median(distances, True)
    sigma = median_distance**2
    #print "median distance for Gaussian kernel:", sigma
    kernel = GaussianKernel(10, sigma)

    # mmd instance using streaming features, blocksize of 10000
    mmd = LinearTimeMMD(kernel, gen_p, gen_q, n, 10000)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05
    statistic = mmd.compute_statistic()
    #print "test statistic:", statistic

    # do the same thing using two different ways to approximate the
    # null-distribution: bootstrapping and gaussian approximation
    # (the latter only for really large samples)
    alpha = 0.05

    #print "computing p-value using bootstrapping"
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(50)  # normally, far more iterations are needed
    p_value_boot = mmd.compute_p_value(statistic)
    #print "p_value_boot:", p_value_boot
    #print "p_value_boot <", alpha, ", i.e. test says p!=q:", p_value_boot < alpha

    #print "computing p-value using gaussian approximation"
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    p_value_gaussian = mmd.compute_p_value(statistic)
    #print "p_value_gaussian:", p_value_gaussian
    #print "p_value_gaussian <", alpha, ", i.e. test says p!=q:", p_value_gaussian < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(10)  # normally, far more iterations are needed
    null_samples = mmd.bootstrap_null()
    #print "null mean:", mean(null_samples)
    #print "null variance:", var(null_samples)

    # compute type I and type II errors for Gaussian approximation
    # number of trials should be larger to compute tight confidence bounds
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    num_trials = 5
    alpha = 0.05  # test power

    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return statistic, p_value_boot, p_value_gaussian, null_samples, typeIerrors, typeIIerrors
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()

    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])

    # Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()

    # Adding Kmer Kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, \
                                                        wds_kparams['degree'])
        wds_kernel.set_shifts(wds_kparams['shift'] *
                              ones(wds_kparams['seqlength'], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)

    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)

    rbf_kernel = GaussianKernel(position_features, position_features, \
                                kirmes_ini.RBF_KERNEL_PARAMETERS['width'])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)

    # Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)

    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)

    # Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)

    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array, motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs, options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
import expenv
import numpy
import helper
from numpy import array, float64
from shogun.Features import RealFeatures, Labels
from shogun.Kernel import GaussianKernel, MSG_DEBUG
from shogun.Classifier import SVMLight
import sys

# create dense matrices A, B
A = array([[1, 2, 3], [4, 0, 0], [0, 0, 0], [0, 5, 0], [0, 0, 6], [9, 9, 9]], dtype=float64)
B = array([1, 1, 1, -1, -1, -1], dtype=float64)

feats_train = RealFeatures(A.transpose())
kernel = GaussianKernel(feats_train, feats_train, 1.0)
kernel.io.set_loglevel(MSG_DEBUG)
lab = Labels(B)

svm = SVMLight(1, kernel, lab)
svm.train()

helper.save("/tmp/awesome_svm", svm)
svm = helper.load("/tmp/awesome_svm")

svm.train()

#sys.exit(0)

run = expenv.Run.get(1010)
from shogun.Converter import MultidimensionalScaling
lmds = MultidimensionalScaling()
lmds.set_landmark(True)
lmds.set_landmark_number(20)
converters.append((lmds, "Landmark MDS with %d landmarks" % lmds.get_landmark_number()))

from shogun.Converter import Isomap
cisomap = Isomap()
cisomap.set_k(9)
converters.append((cisomap, "Isomap with k=%d" % cisomap.get_k()))

from shogun.Converter import DiffusionMaps
from shogun.Kernel import GaussianKernel
dm = DiffusionMaps()
dm.set_t(1)
kernel = GaussianKernel(100, 10.0)
dm.set_kernel(kernel)
converters.append((dm, "Diffusion Maps with t=%d, sigma=%f" % (dm.get_t(), kernel.get_width())))

from shogun.Converter import HessianLocallyLinearEmbedding
hlle = HessianLocallyLinearEmbedding()
hlle.set_k(6)
converters.append((hlle, "Hessian LLE with k=%d" % (hlle.get_k())))

from shogun.Converter import LocalTangentSpaceAlignment
ltsa = LocalTangentSpaceAlignment()
ltsa.set_k(6)
converters.append((ltsa, "LTSA with k=%d" % (ltsa.get_k())))

from shogun.Converter import LaplacianEigenmaps
le = LaplacianEigenmaps()
import numpy as np
import os
import csv
from shogun.Features import RealFeatures
from shogun.Classifier import KernelPCA
from shogun.Distance import EuclideanDistance
from shogun.Kernel import GaussianKernel

# Load the data.
f = open(os.path.join(os.path.dirname(__file__), '../data/circle_data.txt'))
data = np.fromfile(f, dtype=np.float64, sep=' ')
data = data.reshape(-1, 2)
f.close()

# Perform Kernel PCA.
feat = RealFeatures(data.T)
kernel = GaussianKernel(feat, feat, 2.0)
preprocessor = KernelPCA(kernel)
preprocessor.set_target_dim(2)
preprocessor.init(feat)
transformedData = preprocessor.apply_to_feature_matrix(feat)

# Show transformed data.
print np.matrix(transformedData.T)
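# A hedged, self-contained variant of the pipeline above on synthetic Gaussian
# data, in case circle_data.txt is not available:
import numpy as np
from shogun.Features import RealFeatures
from shogun.Classifier import KernelPCA
from shogun.Kernel import GaussianKernel

data = np.random.randn(100, 2)
feat = RealFeatures(data.T)
kernel = GaussianKernel(feat, feat, 2.0)
preprocessor = KernelPCA(kernel)
preprocessor.set_target_dim(2)
preprocessor.init(feat)
print np.matrix(preprocessor.apply_to_feature_matrix(feat).T)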
from pylab import plot, subplot, grid, title, subplots_adjust
from shogun.Kernel import GaussianKernel
from shogun.Classifier import LibSVM, LDA
from shogun.Evaluation import PRCEvaluation
import util

util.set_title('PRC example')
util.DISTANCE = 0.5
subplots_adjust(hspace=0.3)

pos = util.get_realdata(True)
neg = util.get_realdata(False)
features = util.get_realfeatures(pos, neg)
labels = util.get_labels()

# classifiers
gk = GaussianKernel(features, features, 1.0)
svm = LibSVM(1000.0, gk, labels)
svm.train()
lda = LDA(1, features, labels)
lda.train()

# plot points
subplot(211)
plot(pos[0, :], pos[1, :], "r.")
plot(neg[0, :], neg[1, :], "b.")
grid(True)
title('Data', size=10)

# plot PRC for SVM
subplot(223)
PRC_evaluation = PRCEvaluation()
# #Test Train Swap
# train_feats = test_data[:, :-1]
# test_feats = train_data[:, :-1]
# train_label = test_data[:, -1]
# test_label = train_data[:, -1]

features_train = RealFeatures(train_feats.T)
features_test = RealFeatures(test_feats.T)
labels_train = BinaryLabels(train_label.T)
labels_test = BinaryLabels(test_label.T)

epsilon = 0.001
# C = 1.0
C = 100000

# Gaussian
gauss_kernel = GaussianKernel(features_train, features_train, 1)
svm = LibSVM(C, gauss_kernel, labels_train)

# #Linear
# linear_kernel = LinearKernel(features_train, features_train)
# svm = LibSVM(C, linear_kernel, labels_train)

# svm.set_epsilon(epsilon)
svm.train()

labels_predict = svm.apply(features_test)
train_pred = svm.apply(features_train)
alphas = svm.get_alphas()
b = svm.get_bias()
def statistics_hsic():
    from shogun.Features import RealFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel
    from shogun.Statistics import HSIC
    from shogun.Statistics import BOOTSTRAP, HSIC_GAMMA
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # note that the HSIC has to store kernel matrices,
    # which upper bounds the sample size
    n = 250
    difference = 3
    angle = pi / 3

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(n, difference, angle)
    #plot(data[0], data[1], 'x'); show()

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width:
    # 0.5*median_distance normally (factor two in Gaussian kernel).
    # However, shogun's kernel width is different to the usual parametrization,
    # therefore 0.5*2*median_distance^2.
    # Use a subset of data for that, only 200 elements. Median is stable.
    subset = Math.randperm_vec(features_x.get_num_vectors())
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_x = median_distance**2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = Statistics.matrix_median(distances, True)
    sigma_y = median_distance**2

    print "median distance for Gaussian kernel on x:", sigma_x
    print "median distance for Gaussian kernel on y:", sigma_y
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # perform test: compute p-value and test if null-hypothesis is rejected for
    # a test level of 0.05 using different methods to approximate the
    # null-distribution
    statistic = hsic.compute_statistic()
    print "HSIC:", statistic
    alpha = 0.05

    print "computing p-value using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    # normally, at least 250 iterations should be done, but that takes long
    hsic.set_bootstrap_iterations(100)
    # bootstrapping allows usage of unbiased or biased statistic
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value < alpha

    print "computing p-value using gamma method"
    hsic.set_null_approximation_method(HSIC_GAMMA)
    p_value = hsic.compute_p_value(statistic)
    thresh = hsic.compute_threshold(alpha)
    print "p_value:", p_value
    print "threshold for 0.05 alpha:", thresh
    print "p_value <", alpha, ", i.e. test says p and q are dependent:", p_value < alpha

    # sample from null distribution (these may be plotted or whatsoever)
    # mean should be close to zero, variance strongly depends on data/kernel
    # bootstrapping, biased statistic
    print "sampling null distribution using bootstrapping"
    hsic.set_null_approximation_method(BOOTSTRAP)
    hsic.set_bootstrap_iterations(100)
    null_samples = hsic.bootstrap_null()
    print "null mean:", mean(null_samples)
    print "null variance:", var(null_samples)
from shogun.Features import RealFeatures, BinaryLabels
from shogun.Kernel import GaussianKernel, CombinedKernel
from shogun.Classifier import LibSVM

# second data fold: the last column holds the label, the rest are features
train_feats = train_data[1, :, :-1]
test_feats = test_data[1, :, :-1]
train_label = train_data[1, :, -1]
test_label = test_data[1, :, -1]

features_train = RealFeatures(train_feats.T)
features_test = RealFeatures(test_feats.T)
labels_train = BinaryLabels(train_label)
labels_test = BinaryLabels(test_label)

epsilon = 0.001
C = 1.0

# combine two Gaussian kernels of different widths
combined_kernel = CombinedKernel()
#gauss_kernel_1 = GaussianKernel(features_train, features_train, 15)
gauss_kernel_1 = GaussianKernel(features_train, features_train, 1.0)
gauss_kernel_2 = GaussianKernel(features_train, features_train, 2.0)
combined_kernel.append_kernel(gauss_kernel_1)
combined_kernel.append_kernel(gauss_kernel_2)
combined_kernel.init(features_train, features_train)

# train a binary SVM on the combined kernel
# (single-kernel alternative: svm = LibSVM(C, gauss_kernel_1, labels_train))
svm = LibSVM(C, combined_kernel, labels_train)
svm.set_epsilon(epsilon)
svm.train()
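# With a CombinedKernel in place, a natural follow-up is to learn the
# subkernel weights instead of fixing them. A minimal sketch using Shogun's
# MKLClassification, reusing combined_kernel and labels_train from above:
from shogun.Classifier import MKLClassification

mkl = MKLClassification()
mkl.set_mkl_norm(1)  # L1 norm encourages sparse kernel weights
mkl.set_kernel(combined_kernel)
mkl.set_labels(labels_train)
mkl.train()
print "learned subkernel weights:", combined_kernel.get_subkernel_weights()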
def statistics_mmd_kernel_selection_combined(m, distance, stretch, num_blobs, angle, selection_method):
	from shogun.Features import RealFeatures
	from shogun.Features import GaussianBlobsDataGenerator
	from shogun.Kernel import GaussianKernel, CombinedKernel
	from shogun.Statistics import LinearTimeMMD
	from shogun.Statistics import MMDKernelSelectionCombMaxL2
	from shogun.Statistics import MMDKernelSelectionCombOpt
	from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
	from shogun.Distance import EuclideanDistance
	from shogun.Mathematics import Statistics, Math

	# init seed for reproducibility
	Math.init_random(1)

	# note that the linear time statistic is designed for much larger datasets;
	# results for this low sample size will be bad (unstable, wrong type I error)

	# streaming data generators
	gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
	gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

	# stream some data and plot
	num_plot = 1000
	features = gen_p.get_streamed_features(num_plot)
	features = features.create_merged_copy(
		gen_q.get_streamed_features(num_plot))
	data = features.get_feature_matrix()

	#figure()
	#subplot(2,2,1)
	#grid(True)
	#plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
	#title('$X\sim p$')
	#subplot(2,2,2)
	#grid(True)
	#plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
	#title('$Y\sim q$')

	# create a combined kernel with Gaussian kernels inside (Shogun's Gaussian
	# kernel is parametrized differently from the standard form, see documentation)
	sigmas = [2**x for x in range(-3, 10)]
	widths = [x * x * 2 for x in sigmas]
	combined = CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# mmd instance using streaming features, block size of 10000
	block_size = 10000
	mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

	# kernel selection instance (this can easily be replaced by the other
	# methods for selecting combined kernels)
	if selection_method == "opt":
		selection = MMDKernelSelectionCombOpt(mmd)
	elif selection_method == "l2":
		selection = MMDKernelSelectionCombMaxL2(mmd)

	# perform kernel selection (the kernel is automatically set on the mmd instance)
	kernel = selection.select_kernel()
	kernel = CombinedKernel.obtain_from_generic(kernel)
	#print "selected kernel weights:", kernel.get_subkernel_weights()
	#subplot(2,2,3)
	#plot(kernel.get_subkernel_weights())
	#title("Kernel weights")

	# compute type I and II errors (use many more trials in practice). The type I
	# error is only estimated to check the MMD1_GAUSSIAN method for approximating
	# the null distribution. Note that testing has to happen on different data
	# than kernel selection, but the linear time mmd does this implicitly
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)

	# number of trials should be larger to compute tight confidence bounds
	num_trials = 5
	alpha = 0.05  # test level
	typeIerrors = [0 for x in range(num_trials)]
	typeIIerrors = [0 for x in range(num_trials)]
	for i in range(num_trials):
		# simulating H0 effectively means that p=q, so rejecting
		# (p-value below alpha) is a type I error
		mmd.set_simulate_h0(True)
		typeIerrors[i] = mmd.perform_test() < alpha
		mmd.set_simulate_h0(False)
		# failing to reject although p!=q is a type II error
		typeIIerrors[i] = mmd.perform_test() > alpha

	#print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

	return kernel, typeIerrors, typeIIerrors
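# Illustrative invocation of the function above; the parameter values are
# made up (any sensible blob geometry works), and "opt" picks the
# optimization-based combined-kernel selection branch:
from numpy import pi, mean

kernel, t1, t2 = statistics_mmd_kernel_selection_combined(
	1000, 10, 5, 3, pi / 4, "opt")
print "type I error:", mean(t1), ", type II error:", mean(t2)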
def create_kernel(kname, kparam, feats_train):
	"""Call the corresponding constructor for the kernel"""
	if kname == 'gauss':
		kernel = GaussianKernel(feats_train, feats_train, kparam['width'])
	elif kname == 'linear':
		kernel = LinearKernel(feats_train, feats_train)
		kernel.set_normalizer(AvgDiagKernelNormalizer(kparam['scale']))
	elif kname == 'poly':
		kernel = PolyKernel(feats_train, feats_train,
			kparam['degree'], kparam['inhomogene'], kparam['normal'])
	elif kname == 'wd':
		kernel = WeightedDegreePositionStringKernel(feats_train, feats_train, kparam['degree'])
		kernel.set_normalizer(AvgDiagKernelNormalizer(float(kparam['seqlength'])))
		kernel.set_shifts(kparam['shift']*numpy.ones(kparam['seqlength'], dtype=numpy.int32))
		#kernel=WeightedDegreeStringKernel(feats_train, feats_train, kparam['degree'])
	elif kname == 'spec':
		kernel = CommUlongStringKernel(feats_train, feats_train)
	elif kname == 'cumspec':
		kernel = WeightedCommWordStringKernel(feats_train, feats_train)
		kernel.set_weights(numpy.ones(kparam['degree']))
	elif kname == 'spec2':
		kernel = CombinedKernel()
		k0 = CommWordStringKernel(feats_train['f0'], feats_train['f0'])
		k0.io.disable_progress()
		kernel.append_kernel(k0)
		k1 = CommWordStringKernel(feats_train['f1'], feats_train['f1'])
		k1.io.disable_progress()
		kernel.append_kernel(k1)
	elif kname == 'cumspec2':
		kernel = CombinedKernel()
		k0 = WeightedCommWordStringKernel(feats_train['f0'], feats_train['f0'])
		k0.set_weights(numpy.ones(kparam['degree']))
		k0.io.disable_progress()
		kernel.append_kernel(k0)
		k1 = WeightedCommWordStringKernel(feats_train['f1'], feats_train['f1'])
		k1.set_weights(numpy.ones(kparam['degree']))
		k1.io.disable_progress()
		kernel.append_kernel(k1)
	elif kname == 'localalign':
		kernel = LocalAlignmentStringKernel(feats_train, feats_train)
	elif kname == 'localimprove':
		kernel = LocalityImprovedStringKernel(feats_train, feats_train, kparam['length'], \
			kparam['indeg'], kparam['outdeg'])
	else:
		# fail loudly instead of falling through with kernel undefined
		raise ValueError('Unknown kernel %s' % kname)

	kernel.set_cache_size(32)
	return kernel
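# A quick usage sketch showing the expected shape of the kparam dictionary
# (the values are illustrative; feats_train would be a feature object
# prepared as in the surrounding examples):
kparam = {'width': 1.9}
kernel = create_kernel('gauss', kparam, feats_train)

# string kernels instead read keys such as 'degree', 'seqlength' or 'shift',
# as the branches above show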
from numpy import zeros
from shogun.Kernel import GaussianKernel
from shogun.Statistics import LinearTimeMMD, BOOTSTRAP
from shogun.Distance import EuclideanDistance
from shogun.Mathematics import Statistics

# gen_p, gen_q (streaming generators), m and num_null_samples are assumed to
# have been set up earlier, as in the kernel-selection example above.

# Median heuristic for the kernel width: Shogun's parametrization differs
# from the standard one, so the width to pass is 0.5*2*median_distance^2.
# Use a small subset of the data for this (200 merged examples); the median
# is stable. Stream examples from both distributions and merge them so the
# median is computed on the joint sample
features = gen_p.get_streamed_features(100)
features = features.create_merged_copy(gen_q.get_streamed_features(100))

# compute all pairwise distances
dist = EuclideanDistance(features, features)
distances = dist.get_distance_matrix()

# compute median and determine kernel width (using shogun)
median_distance = Statistics.matrix_median(distances, True)
sigma = median_distance**2
print "Gaussian kernel width (median heuristic):", sigma
kernel = GaussianKernel(10, sigma)

# mmd instance using streaming features, block size of 10000
mmd = LinearTimeMMD(kernel, gen_p, gen_q, m, 10000)

# sample the alternative distribution; streaming ensures different samples each run
alt_samples = zeros(num_null_samples)
for i in range(len(alt_samples)):
	alt_samples[i] = mmd.compute_statistic()

# sample from the null distribution
# bootstrapping, biased statistic
mmd.set_null_approximation_method(BOOTSTRAP)
mmd.set_bootstrap_iterations(num_null_samples)
null_samples_boot = mmd.bootstrap_null()
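# With null and alternative samples in hand, a hedged sketch of turning them
# into an empirical test: estimate the rejection threshold from the bootstrap
# null samples and the test power from the alternative samples (the value of
# alpha is chosen here for illustration):
from numpy import sort, mean

alpha = 0.05
sorted_null = sort(null_samples_boot)
thresh = sorted_null[int((1 - alpha) * len(sorted_null))]
power = mean(alt_samples > thresh)
print "empirical threshold:", thresh, ", estimated test power:", power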
def regression_gaussian_process_modelselection (n=100, n_test=100, \
		x_range=6, x_range_test=10, noise_var=0.5, width=1, seed=1):
	from numpy import array, sin, random
	from pylab import plot, legend, show
	from shogun.Features import RealFeatures, RegressionLabels
	from shogun.Kernel import GaussianKernel
	from shogun.ModelSelection import GradientModelSelection, ModelSelectionParameters, R_LINEAR
	from shogun.Regression import GaussianLikelihood, ZeroMean, \
		ExactInferenceMethod, GaussianProcessRegression, GradientCriterion, \
		GradientEvaluation

	# Reproducible results
	random.seed(seed)

	# Easy regression data: one-dimensional noisy sine wave
	X_train = random.rand(1, n)*x_range
	X_test = array([[float(i)/n_test*x_range_test for i in range(n_test)]])
	Y_test = sin(X_test)
	Y_train = sin(X_train)+random.randn(n)*noise_var

	# shogun representation
	labels = RegressionLabels(Y_train[0])
	feats_train = RealFeatures(X_train)
	feats_test = RealFeatures(X_test)

	# GP specification; Shogun's kernel width is tau=2*sigma^2
	shogun_width = width*width*2
	kernel = GaussianKernel(10, shogun_width)
	kernel.init(feats_train, feats_train)
	zmean = ZeroMean()
	likelihood = GaussianLikelihood()
	inf = ExactInferenceMethod(kernel, feats_train, zmean, labels, likelihood)
	gp = GaussianProcessRegression(inf, feats_train, labels)

	# Parameter tree for model selection
	root = ModelSelectionParameters()
	c1 = ModelSelectionParameters("inference_method", inf)
	root.append_child(c1)

	c2 = ModelSelectionParameters("scale")
	c1.append_child(c2)
	c2.build_values(0.01, 4.0, R_LINEAR)

	c3 = ModelSelectionParameters("likelihood_model", likelihood)
	c1.append_child(c3)

	c4 = ModelSelectionParameters("sigma")
	c3.append_child(c4)
	c4.build_values(0.001, 4.0, R_LINEAR)

	c5 = ModelSelectionParameters("kernel", kernel)
	c1.append_child(c5)

	c6 = ModelSelectionParameters("width")
	c5.append_child(c6)
	c6.build_values(0.001, 4.0, R_LINEAR)

	# Criterion for gradient search
	crit = GradientCriterion()

	# Evaluate our inference method for its derivatives
	grad = GradientEvaluation(gp, feats_train, labels, crit)
	grad.set_function(inf)

	gp.print_modsel_params()
	root.print_tree()

	# Handles all of the above structures in memory
	grad_search = GradientModelSelection(root, grad)

	# Set autolocking to false to get rid of warnings
	grad.set_autolock(False)

	# Search for the best parameters
	best_combination = grad_search.select_model(True)

	# Output all results and information
	best_combination.print_tree()
	best_combination.apply_to_machine(gp)
	result = grad.evaluate()
	result.print_result()

	# inference: predictive covariances and means on the test inputs
	gp.set_return_type(GaussianProcessRegression.GP_RETURN_COV)
	covariance = gp.apply_regression(feats_test)
	covariance = covariance.get_labels()
	gp.set_return_type(GaussianProcessRegression.GP_RETURN_MEANS)
	mean = gp.apply_regression(feats_test)
	mean = mean.get_labels()

	# some quantities the inference method exposes
	alpha = inf.get_alpha()
	diagonal = inf.get_diagonal_vector()
	cholesky = inf.get_cholesky()

	# plot results
	plot(X_train[0], Y_train[0], 'x')  # training observations
	plot(X_test[0], Y_test[0], '-')    # ground truth of test
	plot(X_test[0], mean, '-')         # mean predictions on test
	legend(["training", "ground truth", "mean predictions"])
	show()

	return gp, alpha, labels, diagonal, covariance, mean, cholesky
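# The serialization example below calls a check_status helper that is not
# defined in this excerpt; a minimal sketch of what it is assumed to do:
def check_status(status):
	"""Assert that a Shogun (de)serialization call reported success."""
	assert status, "serialization error: status flag is False"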
def serialization_complex_example(num=5, dist=1, dim=10, C=2.0, width=10):
	import os
	from numpy import concatenate, zeros, ones
	from numpy.random import randn, seed
	from shogun.Features import RealFeatures, Labels
	from shogun.Classifier import GMNPSVM
	from shogun.Kernel import GaussianKernel
	from shogun.IO import SerializableHdf5File, SerializableAsciiFile, \
		SerializableJsonFile, SerializableXmlFile, MSG_DEBUG
	from shogun.Preprocessor import NormOne, LogPlusOne

	seed(17)

	# four Gaussian clusters, one per class
	data = concatenate((randn(dim, num), randn(dim, num) + dist,
		randn(dim, num) + 2*dist, randn(dim, num) + 3*dist), axis=1)
	lab = concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

	feats = RealFeatures(data)
	#feats.io.set_loglevel(MSG_DEBUG)
	kernel = GaussianKernel(feats, feats, width)

	labels = Labels(lab)

	svm = GMNPSVM(C, kernel, labels)

	feats.add_preprocessor(NormOne())
	feats.add_preprocessor(LogPlusOne())
	feats.set_preprocessed(1)
	svm.train(feats)
	#svm.print_serializable()

	# save the trained machine in four serialization formats
	fstream = SerializableHdf5File("blaah.h5", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableAsciiFile("blaah.asc", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableJsonFile("blaah.json", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	fstream = SerializableXmlFile("blaah.xml", "w")
	status = svm.save_serializable(fstream)
	check_status(status)

	# load each machine back and retrain it (features are restored with the model)
	fstream = SerializableHdf5File("blaah.h5", "r")
	new_svm = GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	fstream = SerializableAsciiFile("blaah.asc", "r")
	new_svm = GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	# JSON loading is left disabled here
	#fstream = SerializableJsonFile("blaah.json", "r")
	#new_svm=GMNPSVM()
	#status = new_svm.load_serializable(fstream)
	#check_status(status)
	#new_svm.train()

	fstream = SerializableXmlFile("blaah.xml", "r")
	new_svm = GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status)
	new_svm.train()

	# clean up the files written above
	os.unlink("blaah.h5")
	os.unlink("blaah.asc")
	os.unlink("blaah.json")
	os.unlink("blaah.xml")
	return svm, new_svm
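# Illustrative smoke test of the round trip above: both returned machines are
# trained, so any downstream comparison (e.g. applying them to the same
# features) is possible; here we only confirm the call completes:
svm, new_svm = serialization_complex_example()
print "serialization round trip finished:", svm.get_name(), "==", new_svm.get_name()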