def regression_svrlight(fm_train=traindat, fm_test=testdat,
                        label_train=label_traindat, width=1.2, C=1,
                        epsilon=1e-5, tube_epsilon=1e-2, num_threads=3):
    """Fit an SVRLight regressor on a Gaussian kernel and predict the test set.

    Args:
        fm_train: Training feature matrix wrapped in ``RealFeatures``.
        fm_test: Test feature matrix wrapped in ``RealFeatures``.
        label_train: Training targets wrapped in ``RegressionLabels``.
        width: Width handed to ``GaussianKernel``.
        C: First positional argument of ``SVRLight``.
        epsilon: Second positional argument of ``SVRLight``.
        tube_epsilon: Value passed to ``set_tube_epsilon``.
        num_threads: Thread count set on ``svr.parallel``.

    Returns:
        ``(out, kernel)`` where ``out`` comes from ``svr.apply().get_labels()``
        after the kernel was re-initialised on (train, test); ``None`` when
        ``SVRLight`` cannot be imported.
    """
    from shogun import GaussianKernel
    from shogun import RegressionLabels, RealFeatures

    # SVRLight is an optional component; bail out quietly when it is missing.
    try:
        from shogun import SVRLight
    except ImportError:
        print('No support for SVRLight available.')
        return None

    train_features = RealFeatures(fm_train)
    test_features = RealFeatures(fm_test)
    gaussian = GaussianKernel(train_features, train_features, width)
    targets = RegressionLabels(label_train)

    regressor = SVRLight(C, epsilon, gaussian, targets)
    regressor.set_tube_epsilon(tube_epsilon)
    regressor.parallel.set_num_threads(num_threads)
    regressor.train()

    # Point the kernel's right-hand side at the test data before applying.
    gaussian.init(train_features, test_features)
    predicted = regressor.apply().get_labels()
    return predicted, gaussian
def kernel_gaussian(train_fname=traindat, test_fname=testdat, width=1.3):
    """Compute Gaussian kernel matrices for train/train and train/test data.

    Args:
        train_fname: Path of the CSV file holding the training features.
        test_fname: Path of the CSV file holding the test features.
        width: Width handed to ``GaussianKernel``.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel object is left
        initialised on (train, test).
    """
    from shogun import CSVFile, GaussianKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    gaussian = GaussianKernel(train_features, train_features, width)
    train_matrix = gaussian.get_kernel_matrix()

    # Re-initialise with the test features on the right-hand side.
    gaussian.init(train_features, test_features)
    test_matrix = gaussian.get_kernel_matrix()

    return train_matrix, test_matrix, gaussian
def preprocessor_kernelpca_modular(data, threshold, width):
    """Run Gaussian kernel PCA with target dimension 2 and plot the result.

    The projected matrix is split column-wise into two groups using the
    module-level ``number_of_points_for_circle1`` and
    ``number_of_points_for_circle2`` counts, and the first two rows of each
    group are plotted against each other via the module-level ``p`` plotting
    object.

    Args:
        data: Feature matrix wrapped in ``RealFeatures``.
        threshold: Accepted for interface compatibility; not used here.
        width: Width handed to ``GaussianKernel``.

    Returns:
        The ``RealFeatures`` object built from ``data``.
    """
    from shogun import RealFeatures
    from shogun import KernelPCA
    from shogun import GaussianKernel

    features = RealFeatures(data)
    kernel = GaussianKernel(features, features, width)

    preprocessor = KernelPCA(kernel)
    preprocessor.init(features)
    preprocessor.set_target_dim(2)

    projected = preprocessor.apply_to_feature_matrix(features)

    # The former zeros(...) pre-allocations were overwritten immediately, so
    # the two groups are built directly from the projected rows.
    first_end = number_of_points_for_circle1
    second_end = number_of_points_for_circle1 + number_of_points_for_circle2
    circle1 = [row[0:first_end] for row in projected]
    circle2 = [row[first_end:second_end] for row in projected]

    p.plot(circle1[0][:], circle1[1][:], 'o',
           circle2[0][:], circle2[1][:], 'x')
    p.title('final data')
    p.show()

    return features
def classifier_gpbtsvm(train_fname=traindat, test_fname=testdat,
                       label_fname=label_traindat, width=2.1, C=1,
                       epsilon=1e-5):
    """Train a GPBTSVM with a Gaussian kernel and classify the test data.

    Args:
        train_fname: Path of the CSV file with training features.
        test_fname: Path of the CSV file with test features.
        label_fname: Path of the CSV file with binary training labels.
        width: Width handed to ``GaussianKernel``.
        C: First positional argument of ``GPBTSVM``.
        epsilon: Value passed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.

    Raises:
        SystemExit: With status 0 when ``GPBTSVM`` cannot be imported.
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile

    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        # Same exit status as before, but without relying on the `exit`
        # helper that only exists when the `site` module has been loaded.
        raise SystemExit(0)

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))
    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def kernel_combined(fm_train_real=traindat, fm_test_real=testdat,
                    fm_train_dna=traindna, fm_test_dna=testdna):
    """Build a combined kernel over real-valued and DNA string features.

    Three sub-kernels are appended in this order: a Gaussian kernel on the
    real features, a fixed-degree string kernel and a local-alignment string
    kernel, both on the DNA strings.

    Returns:
        ``(km_train, km_test, kernel)``; the combined kernel is left
        initialised on (train, test).
    """
    from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    combined = CombinedKernel()
    train_stack = CombinedFeatures()
    test_stack = CombinedFeatures()

    def attach(train_part, test_part, sub_kernel):
        # Register one feature pair together with the kernel that consumes it.
        train_stack.append_feature_obj(train_part)
        test_stack.append_feature_obj(test_part)
        combined.append_kernel(sub_kernel)

    attach(RealFeatures(fm_train_real),
           RealFeatures(fm_test_real),
           GaussianKernel(10, 1.1))

    attach(StringCharFeatures(fm_train_dna, DNA),
           StringCharFeatures(fm_test_dna, DNA),
           FixedDegreeStringKernel(10, 3))

    attach(StringCharFeatures(fm_train_dna, DNA),
           StringCharFeatures(fm_test_dna, DNA),
           LocalAlignmentStringKernel(10))

    combined.init(train_stack, train_stack)
    train_matrix = combined.get_kernel_matrix()

    combined.init(train_stack, test_stack)
    test_matrix = combined.get_kernel_matrix()

    return train_matrix, test_matrix, combined
def mkl_multiclass_1(fm_train_real, fm_test_real, label_train_multiclass, C):
    """Train multiclass MKL over 21 Gaussian kernels and predict the test set.

    One Gaussian sub-kernel is created per power of two from 2**-9 to 2**11,
    each paired with its own copy of the train/test features. Reads
    ``num_threads`` and ``mkl_epsilon`` from the enclosing module.

    Returns:
        The labels produced by ``mkl.apply().get_labels()`` after the kernel
        was re-initialised on (train, test).
    """
    combined = CombinedKernel()
    train_stack = CombinedFeatures()
    test_stack = CombinedFeatures()

    # Exponents -9..11 are exactly pow(2, i + 1) for i in -10..10.
    for exponent in range(-9, 12):
        train_part = RealFeatures(fm_train_real)
        test_part = RealFeatures(fm_test_real)
        gaussian = GaussianKernel(pow(2, exponent))
        train_stack.append_feature_obj(train_part)
        test_stack.append_feature_obj(test_part)
        combined.append_kernel(gaussian)

    combined.init(train_stack, train_stack)

    targets = MulticlassLabels(label_train_multiclass)
    learner = MKLMulticlass(C, combined, targets)
    learner.set_epsilon(1e-2)
    learner.parallel.set_num_threads(num_threads)
    learner.set_mkl_epsilon(mkl_epsilon)
    learner.set_mkl_norm(1)
    learner.train()

    combined.init(train_stack, test_stack)
    return learner.apply().get_labels()
def classifier_gmnpsvm(fm_train_real, fm_test_real, label_train_multiclass, C):
    """Train a GMNPSVM with a Gaussian kernel and predict the test labels.

    Reads ``width``, ``epsilon`` and ``num_threads`` from the enclosing
    module.

    Args:
        fm_train_real: Training feature matrix wrapped in ``RealFeatures``.
        fm_test_real: Test feature matrix wrapped in ``RealFeatures``.
        label_train_multiclass: Training labels for ``MulticlassLabels``.
        C: First positional argument of ``GMNPSVM``.

    Returns:
        The labels from ``svm.apply(feats_test).get_labels()``.
    """
    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    # The earlier timing scaffold (start/end around an explicit
    # get_kernel_matrix() call) was removed: none of its results were read.
    labels = MulticlassLabels(label_train_multiclass)

    svm = GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train(feats_train)

    return svm.apply(feats_test).get_labels()
def _svm_new(self, kernel_width, c, epsilon):
    """Create a fresh GMNPSVM on the loaded data and store it in ``self.svm``.

    Args:
        kernel_width: Width handed to ``GaussianKernel``.
        c: First positional argument of ``GMNPSVM``.
        epsilon: Value passed to ``set_epsilon``.

    Raises:
        Exception: When ``self.x`` or ``self.y`` is ``None``.
    """
    # Identity test, not equality: `array == None` compares elementwise and
    # the resulting array has no single truth value for `or`.
    if self.x is None or self.y is None:
        raise Exception("No training data loaded.")

    x = RealFeatures(self.x)
    y = MulticlassLabels(self.y)

    self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y)
    self.svm.set_epsilon(epsilon)
def create_kernel(kname, features, kparam=None):
    """Build a kernel on ``features`` selected by name.

    Args:
        kname: One of ``'gauss'``, ``'linear'`` or ``'poly'``.
        features: Feature object used for both sides of the kernel.
        kparam: Third constructor argument for the Gaussian and polynomial
            kernels; ignored for the linear kernel.

    Returns:
        The constructed kernel object.

    Raises:
        ValueError: When ``kname`` is not a supported kernel name.
    """
    if kname == 'gauss':
        return GaussianKernel(features, features, kparam)
    if kname == 'linear':
        return LinearKernel(features, features)
    if kname == 'poly':
        return PolyKernel(features, features, kparam, True, False)

    # Previously an unknown name fell through to `return kernel` and surfaced
    # as an UnboundLocalError; report the actual problem instead.
    raise ValueError("unknown kernel name: %r" % (kname,))
def kernel_auc(train_fname=traindat, label_fname=label_traindat, width=1.7):
    """Set up an AUC kernel on top of a Gaussian kernel.

    Args:
        train_fname: Path of the CSV file with training features.
        label_fname: Path of the CSV file with binary labels.
        width: Width handed to the Gaussian sub-kernel.

    Returns:
        The ``AUCKernel`` after AUC maximisation setup.
    """
    from shogun import BinaryLabels, CSVFile
    from shogun import GaussianKernel, AUCKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    gaussian = GaussianKernel(train_features, train_features, width)

    auc_kernel = AUCKernel(0, gaussian)
    binary_labels = BinaryLabels(CSVFile(label_fname))
    auc_kernel.setup_auc_maximization(binary_labels)

    # The kernel matrix is evaluated but not part of the return value.
    auc_kernel.get_kernel_matrix()
    return auc_kernel
def classifier_multiclassmachine(fm_train_real=traindat, fm_test_real=testdat,
                                 label_train_multiclass=label_traindat,
                                 width=2.1, C=1, epsilon=1e-5):
    """Train a one-vs-rest kernel multiclass machine backed by LibSVM.

    Args:
        fm_train_real: Training feature matrix wrapped in ``RealFeatures``.
        fm_test_real: Test feature matrix wrapped in ``RealFeatures``.
        label_train_multiclass: Training labels for ``MulticlassLabels``.
        width: Width handed to ``GaussianKernel``.
        C: Accepted for interface compatibility; not used here.
        epsilon: Value passed to the base classifier's ``set_epsilon``.

    Returns:
        Labels from ``apply().get_labels()`` after the kernel was
        re-initialised on (train, test).
    """
    from shogun import GaussianKernel
    from shogun import RealFeatures, MulticlassLabels
    from shogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    train_features = RealFeatures(fm_train_real)
    test_features = RealFeatures(fm_test_real)
    gaussian = GaussianKernel(train_features, train_features, width)
    targets = MulticlassLabels(label_train_multiclass)

    base_svm = LibSVM()
    base_svm.set_epsilon(epsilon)

    machine = KernelMulticlassMachine(
        MulticlassOneVsRestStrategy(), gaussian, base_svm, targets)
    machine.train()

    gaussian.init(train_features, test_features)
    return machine.apply().get_labels()
def kernel_io(train_fname=traindat, test_fname=testdat, width=1.9):
    """Compute train and test Gaussian kernel matrices and save each to CSV.

    The kernel is written to a named temporary file after each matrix is
    computed (suffixes ``train.csv`` and ``test.csv``).

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from tempfile import NamedTemporaryFile
    from shogun import RealFeatures, GaussianKernel, CSVFile

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    gaussian = GaussianKernel(train_features, train_features, width)

    # Temporary files are kept referenced until the function returns.
    scratch_files = []

    def dump_kernel(suffix):
        # Save the kernel in its current state to a fresh temporary CSV file;
        # the writer is released when this helper returns.
        scratch = NamedTemporaryFile(suffix=suffix)
        scratch_files.append(scratch)
        writer = CSVFile(scratch.name, "w")
        gaussian.save(writer)

    train_matrix = gaussian.get_kernel_matrix()
    dump_kernel('train.csv')

    gaussian.init(train_features, test_features)
    test_matrix = gaussian.get_kernel_matrix()
    dump_kernel('test.csv')

    return train_matrix, test_matrix, gaussian
def create_param_tree():
    """Assemble the model-selection parameter tree and return its root.

    The tree contains the two parameters "C1" and "C2", a Gaussian kernel
    with a "log_width" range, and a power kernel with a "degree" range and a
    Minkowski metric whose "k" is varied.
    """
    tree = ModelSelectionParameters()

    # C1 and C2 share the same exponential range.
    for c_name in ("C1", "C2"):
        c_node = ModelSelectionParameters(c_name)
        tree.append_child(c_node)
        c_node.build_values(-1.0, 1.0, R_EXP)

    # Gaussian kernel; call print_modsel_params() on the kernel to list the
    # parameters that are available for model selection.
    gauss = GaussianKernel()
    gauss_node = ModelSelectionParameters("kernel", gauss)
    log_width_node = ModelSelectionParameters("log_width")
    log_width_node.build_values(-math.log(2.0), 0.0, R_EXP, 1.0, 2.0)
    gauss_node.append_child(log_width_node)
    tree.append_child(gauss_node)

    # Power kernel with its degree range.
    power = PowerKernel()
    power_node = ModelSelectionParameters("kernel", power)
    tree.append_child(power_node)

    degree_node = ModelSelectionParameters("degree")
    degree_node.build_values(1.0, 2.0, R_LINEAR)
    power_node.append_child(degree_node)

    # Distance used by the power kernel, with its own "k" range.
    minkowski = MinkowskiMetric(10)
    distance_node = ModelSelectionParameters("distance", minkowski)
    power_node.append_child(distance_node)

    k_node = ModelSelectionParameters("k")
    k_node.build_values(1.0, 2.0, R_LINEAR)
    distance_node.append_child(k_node)

    return tree
def classifier_libsvmoneclass(train_fname=traindat, test_fname=testdat,
                              width=2.1, C=1, epsilon=1e-5):
    """Train a one-class LibSVM with a Gaussian kernel and apply it to test data.

    Args:
        train_fname: Path of the CSV file with training features.
        test_fname: Path of the CSV file with test features.
        width: Width handed to ``GaussianKernel``.
        C: First positional argument of ``LibSVMOneClass``.
        epsilon: Value passed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.
    """
    from shogun import CSVFile, GaussianKernel, LibSVMOneClass, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    gaussian = GaussianKernel(train_features, train_features, width)

    one_class = LibSVMOneClass(C, gaussian)
    one_class.set_epsilon(epsilon)
    one_class.train()

    result = one_class.apply(test_features)
    return result, one_class, result.get_labels()
def mkl_multiclass(fm_train_real, fm_test_real, label_train_multiclass, width,
                   C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    """Train multiclass MKL over Gaussian, linear and polynomial kernels.

    Each sub-kernel gets its own copy of the train and test features.

    Args:
        fm_train_real: Training feature matrix.
        fm_test_real: Test feature matrix.
        label_train_multiclass: Training labels for ``MulticlassLabels``.
        width: Width of the Gaussian sub-kernel.
        C: First positional argument of ``MKLMulticlass``.
        epsilon: Value passed to ``set_epsilon``.
        num_threads: Thread count set on ``mkl.parallel``.
        mkl_epsilon: Value passed to ``set_mkl_epsilon``.
        mkl_norm: Value passed to ``set_mkl_norm``.

    Returns:
        Labels from ``mkl.apply().get_labels()`` after the kernel was
        re-initialised on (train, test).
    """
    from shogun import CombinedFeatures, RealFeatures, MulticlassLabels
    from shogun import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun import MKLMulticlass

    combined = CombinedKernel()
    train_stack = CombinedFeatures()
    test_stack = CombinedFeatures()

    def attach(sub_kernel_factory):
        # Fresh feature wrappers first, then the kernel that will use them.
        train_part = RealFeatures(fm_train_real)
        test_part = RealFeatures(fm_test_real)
        sub_kernel = sub_kernel_factory()
        train_stack.append_feature_obj(train_part)
        test_stack.append_feature_obj(test_part)
        combined.append_kernel(sub_kernel)

    attach(lambda: GaussianKernel(10, width))
    attach(lambda: LinearKernel())
    attach(lambda: PolyKernel(10, 2))

    combined.init(train_stack, train_stack)

    targets = MulticlassLabels(label_train_multiclass)
    learner = MKLMulticlass(C, combined, targets)
    learner.set_epsilon(epsilon)
    learner.parallel.set_num_threads(num_threads)
    learner.set_mkl_epsilon(mkl_epsilon)
    learner.set_mkl_norm(mkl_norm)
    learner.train()

    combined.init(train_stack, test_stack)
    return learner.apply().get_labels()
def create_param_tree():
    """Assemble a model-selection tree over tau and two kernels.

    The tree holds a "tau" range, a Gaussian kernel with a "log_width" range
    and a polynomial kernel with an integer "degree" range.

    Returns:
        The root ``ModelSelectionParameters`` node.
    """
    from shogun import ModelSelectionParameters, R_EXP, R_LINEAR
    from shogun import ParameterCombination
    from shogun import GaussianKernel, PolyKernel
    import math

    tree = ModelSelectionParameters()

    # tau: exponential range (R_LINEAR / R_LOG are the other range types),
    # given as lower bound, upper bound, type, step, base.
    tau_node = ModelSelectionParameters("tau")
    tree.append_child(tau_node)
    tau_node.build_values(-1, 1, R_EXP, 1.5, 2)

    # Gaussian kernel; print_modsel_params() on the kernel lists what is
    # available for model selection.
    gauss = GaussianKernel()
    gauss_node = ModelSelectionParameters("kernel", gauss)
    log_width_node = ModelSelectionParameters("log_width")
    log_two = math.log(2.0)
    log_width_node.build_values(2.0 * log_two, 2.5 * log_two, R_LINEAR, 1.0)
    gauss_node.append_child(log_width_node)
    tree.append_child(gauss_node)

    # Polynomial kernel; its degree range is given as integers.
    poly = PolyKernel()
    poly_node = ModelSelectionParameters("kernel", poly)
    tree.append_child(poly_node)

    degree_node = ModelSelectionParameters("degree")
    degree_node.build_values(1, 2, R_LINEAR)
    poly_node.append_child(degree_node)

    return tree
def converter_diffusionmaps(data_fname, t):
    """Apply a DiffusionMaps converter to features loaded from a CSV file.

    Args:
        data_fname: Path of the CSV file with the input features.
        t: Value passed to ``set_t``.

    Returns:
        The input ``RealFeatures`` object (the converter output itself is
        not returned), or ``None`` after printing a notice when an
        ``ImportError`` is raised anywhere in the body.
    """
    try:
        from shogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile

        loaded = RealFeatures(CSVFile(data_fname))

        diffusion = DiffusionMaps()
        diffusion.set_target_dim(1)
        diffusion.set_kernel(GaussianKernel(10, 10.0))
        diffusion.set_t(t)
        diffusion.apply(loaded)
    except ImportError:
        print('No Eigen3 available')
        return None
    else:
        return loaded
def preprocessor_kernelpca(data, threshold, width):
    """Run Gaussian kernel PCA with target dimension 2 over ``data``.

    Args:
        data: Feature matrix wrapped in ``RealFeatures``.
        threshold: Accepted for interface compatibility; not used here.
        width: Width handed to ``GaussianKernel``.

    Returns:
        The ``RealFeatures`` object built from ``data``.
    """
    from shogun import GaussianKernel
    from shogun import KernelPCA
    from shogun import RealFeatures

    wrapped = RealFeatures(data)
    gaussian = GaussianKernel(wrapped, wrapped, width)

    kpca = KernelPCA(gaussian)
    kpca.init(wrapped)
    kpca.set_target_dim(2)
    kpca.apply_to_feature_matrix(wrapped)

    return wrapped
def BuildModel(self, data, labels, options):
    """Create and train a LibSvm classifier with the kernel named in options.

    Recognised option keys are consumed from ``options``: "kernel"
    (required), "c", "gamma", and "degree" (only for the polynomial kernel).
    Any key left over is reported as an error.

    Args:
        data: Feature object used for both sides of the kernel.
        labels: Training labels handed to ``LibSvm``.
        options: Mutable mapping of option names to values.

    Returns:
        The trained classifier.

    Raises:
        Exception: When "kernel" is missing or unknown options remain.
    """
    if "kernel" not in options:
        Log.Fatal("Required parameter 'kernel' not specified!")
        raise Exception("missing parameter")
    kernel_name = str(options.pop("kernel"))

    if "c" in options:
        self.C = float(options.pop("c"))
    # NOTE(review): gamma is stored on the instance but none of the kernels
    # below receive it - confirm whether that is intended.
    if "gamma" in options:
        self.gamma = float(options.pop("gamma"))

    if kernel_name == "polynomial":
        degree = int(options.pop("degree")) if "degree" in options else 1
        self.kernel = PolyKernel(data, data, degree, True)
    elif kernel_name == "linear":
        self.kernel = LinearKernel(data, data)
    elif kernel_name == "hyptan":
        self.kernel = SigmoidKernel(data, data, 2, 1.0, 1.0)
    else:
        # "gaussian" and every unrecognised name share the same kernel.
        self.kernel = GaussianKernel(data, data, 1)

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    # Create and train the classifier.
    classifier = LibSvm(self.C, self.kernel, labels)
    classifier.train()
    return classifier
def kernel_io(train_fname=traindat, test_fname=testdat, width=1.9):
    """Compute Gaussian kernel matrices and save the kernel to temporary CSVs.

    Args:
        train_fname: Path of the CSV file with training features.
        test_fname: Path of the CSV file with test features.
        width: Width handed to ``GaussianKernel``.

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from tempfile import NamedTemporaryFile
    from shogun import CSVFile, GaussianKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    gaussian = GaussianKernel(train_features, train_features, width)

    # Train/train matrix, then save the kernel in that state.
    train_matrix = gaussian.get_kernel_matrix()
    train_scratch = NamedTemporaryFile(suffix='train.csv')
    writer = CSVFile(train_scratch.name, "w")
    gaussian.save(writer)
    del writer

    # Train/test matrix, then save again.
    gaussian.init(train_features, test_features)
    test_matrix = gaussian.get_kernel_matrix()
    test_scratch = NamedTemporaryFile(suffix='test.csv')
    writer = CSVFile(test_scratch.name, "w")
    gaussian.save(writer)
    del writer

    return train_matrix, test_matrix, gaussian
def kernel_gaussian(train_fname=traindat, test_fname=testdat, width=1.3):
    """Return Gaussian kernel matrices on train/train and train/test features.

    Args:
        train_fname: Path of the CSV file with training features.
        test_fname: Path of the CSV file with test features.
        width: Width handed to ``GaussianKernel``.

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from shogun import RealFeatures, GaussianKernel, CSVFile

    lhs = RealFeatures(CSVFile(train_fname))
    rhs_test = RealFeatures(CSVFile(test_fname))
    gaussian = GaussianKernel(lhs, lhs, width)

    matrices = [gaussian.get_kernel_matrix()]
    # Switch the right-hand side to the test features for the second matrix.
    gaussian.init(lhs, rhs_test)
    matrices.append(gaussian.get_kernel_matrix())

    return matrices[0], matrices[1], gaussian
def classifier_larank(num_vec, num_class, distance, C=0.9, num_threads=1,
                      num_iter=5, seed=1):
    """Train LaRank on synthetic multiclass data with a Gaussian kernel.

    Random train and test matrices of shape (num_class, num_vec) are drawn,
    and for every sample the row matching its label is shifted by
    ``distance``.

    Args:
        num_vec: Number of samples to generate.
        num_class: Number of classes (also the feature dimension).
        distance: Offset added to the feature row matching each label.
        C: First positional argument of ``LaRank``.
        num_threads: Accepted for interface compatibility; not used here.
        num_iter: Accepted for interface compatibility; not used here.
        seed: Seed for both Shogun's and the module-level ``random`` generator.

    Returns:
        ``(predictions, svm, predictions.get_labels())`` where
        ``predictions`` comes from ``svm.apply()`` without arguments.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LaRank
    from shogun import Math_init_random

    # Seed both generators for reproducible results.
    Math_init_random(seed)
    random.seed(seed)

    # Labels cycle through the classes; train and test share the pattern.
    class_cycle = [mod(x, num_class) for x in range(num_vec)]
    train_labels = array(class_cycle, dtype="float64")
    test_labels = array(class_cycle, dtype="float64")

    train_matrix = array(random.randn(num_class, num_vec))
    test_matrix = array(random.randn(num_class, num_vec))

    # Push each sample away from the others along its own class axis.
    for column, (train_cls, test_cls) in enumerate(zip(train_labels, test_labels)):
        train_matrix[int(train_cls), column] += distance
        test_matrix[int(test_cls), column] += distance

    train_features = RealFeatures(train_matrix)
    test_features = RealFeatures(test_matrix)
    gaussian = GaussianKernel(train_features, train_features, 2.1)
    targets = MulticlassLabels(train_labels)

    larank = LaRank(C, gaussian, targets)
    larank.set_batch_mode(False)
    larank.set_epsilon(1e-5)
    larank.train()

    # Applied to the test features first; these labels are not returned.
    larank.apply(test_features).get_labels()

    result = larank.apply()
    return result, larank, result.get_labels()
def kernel_sparse_gaussian(fm_train_real=traindat, fm_test_real=testdat,
                           width=1.1):
    """Compute Gaussian kernel matrices over sparse real-valued features.

    Args:
        fm_train_real: Training matrix wrapped in ``SparseRealFeatures``.
        fm_test_real: Test matrix wrapped in ``SparseRealFeatures``.
        width: Width handed to ``GaussianKernel``.

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from shogun import GaussianKernel
    from shogun import SparseRealFeatures

    sparse_train = SparseRealFeatures(fm_train_real)
    sparse_test = SparseRealFeatures(fm_test_real)

    gaussian = GaussianKernel(sparse_train, sparse_train, width)
    train_matrix = gaussian.get_kernel_matrix()

    gaussian.init(sparse_train, sparse_test)
    test_matrix = gaussian.get_kernel_matrix()

    return train_matrix, test_matrix, gaussian
def classifier_gmnpsvm(train_fname=traindat, test_fname=testdat,
                       label_fname=label_traindat, width=2.1, C=1,
                       epsilon=1e-5):
    """Train a GMNPSVM from CSV data and predict labels for the test file.

    Args:
        train_fname: Path of the CSV file with training features.
        test_fname: Path of the CSV file with test features.
        label_fname: Path of the CSV file with multiclass labels.
        width: Width handed to ``GaussianKernel``.
        C: First positional argument of ``GMNPSVM``.
        epsilon: Value passed to ``set_epsilon``.

    Returns:
        ``(out, kernel)`` with ``out`` the predicted test labels.
    """
    from shogun import GaussianKernel, GMNPSVM, CSVFile
    from shogun import RealFeatures, MulticlassLabels

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    targets = MulticlassLabels(CSVFile(label_fname))
    gaussian = GaussianKernel(train_features, train_features, width)

    machine = GMNPSVM(C, gaussian, targets)
    machine.set_epsilon(epsilon)
    machine.train(train_features)

    predicted = machine.apply(test_features).get_labels()
    return predicted, gaussian
def classifier_mpdsvm(train_fname=traindat, test_fname=testdat,
                      label_fname=label_traindat, C=1, epsilon=1e-5):
    """Train an MPDSVM with a fixed-width Gaussian kernel and classify test data.

    The kernel width is fixed at 2.1.

    Args:
        train_fname: Path of the CSV file with training features.
        test_fname: Path of the CSV file with test features.
        label_fname: Path of the CSV file with binary labels.
        C: First positional argument of ``MPDSVM``.
        epsilon: Value passed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.
    """
    from shogun import MPDSVM, CSVFile
    from shogun import GaussianKernel
    from shogun import RealFeatures, BinaryLabels

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))
    gaussian = GaussianKernel(train_features, train_features, 2.1)

    machine = MPDSVM(C, gaussian, targets)
    machine.set_epsilon(epsilon)
    machine.train()

    result = machine.apply(test_features)
    return result, machine, result.get_labels()
def evaluation_cross_validation_regression(train_fname=traindat,
                                           label_fname=label_traindat,
                                           width=0.8, tau=1e-6):
    """Cross-validate kernel ridge regression with a Gaussian kernel.

    Runs 5-fold cross-validation, repeated 10 times, scored by mean squared
    error.

    Args:
        train_fname: Path of the CSV file with training features.
        label_fname: Path of the CSV file with regression targets.
        width: Width of the Gaussian kernel.
        tau: First positional argument of ``KernelRidgeRegression``.

    Returns:
        The object returned by ``CrossValidation.evaluate()``.
    """
    from shogun import CrossValidation, CrossValidationResult
    from shogun import MeanSquaredError, CrossValidationSplitting
    from shogun import RegressionLabels, RealFeatures
    from shogun import GaussianKernel, KernelRidgeRegression, CSVFile

    # training data
    features = RealFeatures(CSVFile(train_fname))
    labels = RegressionLabels(CSVFile(label_fname))

    # Kernel and predictor. The width argument used to be ignored because the
    # kernel was default-constructed; it is now passed as (cache size, width).
    kernel = GaussianKernel(10, width)
    predictor = KernelRidgeRegression(tau, kernel, labels)

    # Plain (non-stratified) splitting into 5 folds.
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) tell the machine to precompute the kernel matrix; speeds
    # things up but may not work for every machine.
    predictor.data_lock(labels, features)

    # Perform cross-validation and hand the result back to the caller
    # (it was previously computed and then dropped).
    result = cross_validation.evaluate()
    return result
def RunSVRShogun():
    """Train a LibSVR model on the configured dataset and time the training.

    Reads ``self`` and ``options`` from the enclosing scope. Option keys
    "c", "epsilon" and "gamma" are consumed from ``options``; the kernel
    width is set to ``1 / gamma`` when gamma is given.

    Returns:
        The elapsed time reported by the timer, or ``-1`` when training
        raised an exception.

    Raises:
        Exception: When unrecognised options remain.
    """
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    # The last row of the training set supplies the responses.
    train_matrix, responses = SplitTrainData(self.dataset)

    # Defaults first, then per-option overrides.
    self.C, self.epsilon, self.width = 1.0, 1.0, 0.1
    if "c" in options:
        self.C = float(options.pop("c"))
    if "epsilon" in options:
        self.epsilon = float(options.pop("epsilon"))
    if "gamma" in options:
        self.width = np.true_divide(1, float(options.pop("gamma")))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    features = RealFeatures(train_matrix.T)
    targets = RegressionLabels(responses)
    self.kernel = GaussianKernel(features, features, self.width)

    try:
        with totalTimer:
            # Perform SVR.
            regressor = LibSVR(self.C, self.epsilon, self.kernel, targets)
            regressor.train()
    except Exception:
        return -1

    return totalTimer.ElapsedTime()
def metric(self):
    """Train a Gaussian-process classifier and collect runtime and quality metrics.

    Options are read from ``self.method_param``: "distance", "kernel",
    "cache-size", "degree", "gamma", "coef0", "order", "width", "sigma" and
    "constant". Each falls back to a default when absent.

    Returns:
        A dict with "runtime" and, when ``self.data`` has at least three
        entries, "ACC", "MCC", "Precision", "Recall" and "MSE".

    Raises:
        ValueError: When the distance or kernel name is not supported.
    """
    params = self.method_param

    def _param(key, default, cast):
        # Cast the supplied value when the key is present, else use default.
        return cast(params[key]) if key in params else default

    distance = _param("distance", "Euclidean", str)
    kernel = _param("kernel", "Gaussian", str)
    cache_size = _param("cache-size", 10, int)
    degree = _param("degree", 2, int)
    gamma = _param("gamma", 2.0, float)
    coef0 = _param("coef0", 1.0, float)
    order = _param("order", 2.0, float)
    # Fixed: the width used to be read from the "order" entry.
    width = _param("width", 2.0, float)
    sigma = _param("sigma", 1.5, float)
    const = _param("constant", 2.0, float)

    # Choosing a distance function required by some kernels.
    if distance == "Euclidean":
        distanceMethod = EuclideanDistance()
    elif distance == "Chi-Square":
        distanceMethod = ChiSquareDistance()
    elif distance == "Tanimoto":
        distanceMethod = TanimotoDistance()
    elif distance == "Minkowski":
        distanceMethod = MinkowskiMetric()
    elif distance == "Manhattan":
        distanceMethod = ManhattanMetric()
    elif distance == "Jensen":
        distanceMethod = JensenMetric()
    elif distance == "Canberra":
        distanceMethod = CanberraMetric()
    else:
        raise ValueError(
            "distance function not supported by the benchmarks")

    totalTimer = Timer()
    with totalTimer:
        # Choosing a kernel for the Gaussian process classification.
        if kernel == "Gaussian":
            kernelMethod = GaussianKernel(width)
        elif kernel == "Polynomial":
            kernelMethod = PolyKernel(cache_size, degree)
        elif kernel == "Sigmoid":
            kernelMethod = SigmoidKernel(cache_size, gamma, coef0)
        elif kernel == "Bessel":
            kernelMethod = BesselKernel(cache_size, order, width, degree,
                                        distanceMethod)
        elif kernel == "Power":
            kernelMethod = PowerKernel(cache_size, degree, distanceMethod)
        elif kernel == "Log":
            kernelMethod = LogKernel(cache_size, degree, distanceMethod)
        elif kernel == "Cauchy":
            kernelMethod = CauchyKernel(cache_size, sigma, distanceMethod)
        elif kernel == "Constant":
            kernelMethod = ConstKernel(const)
        elif kernel == "Diagonal":
            kernelMethod = DiagKernel(cache_size, const)
        else:
            raise ValueError("kernel not supported by the benchmarks")

        mean_function = ConstMean()
        likelihood = SoftMaxLikelihood()
        inference_method = MultiLaplaceInferenceMethod(
            kernelMethod, self.train_features, mean_function,
            self.train_labels, likelihood)

        # Create the model.
        model = SGPC(inference_method)
        # Train the model.
        model.train()

        # Predict only when test data was supplied.
        if len(self.data) >= 2:
            predictions = model.apply_multiclass(
                self.test_features).get_labels()

    metric = {}
    metric["runtime"] = totalTimer.ElapsedTime()

    if len(self.data) >= 2:
        predictions = label_decoder(predictions, self.label_map)

    # Quality metrics need the true test labels in self.data[2].
    if len(self.data) >= 3:
        confusionMatrix = Metrics.ConfusionMatrix(self.data[2], predictions)
        metric['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
        metric['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
        metric['Precision'] = Metrics.AvgPrecision(confusionMatrix)
        metric['Recall'] = Metrics.AvgRecall(confusionMatrix)
        metric['MSE'] = Metrics.SimpleMeanSquaredError(
            self.data[2], predictions)

    return metric
# ROC example script: trains an SVM and an LDA classifier on two point
# clouds, plots the data and prepares a ROC evaluation for the SVM.
# NOTE(review): subplots_adjust, subplot, plot, grid and title are not
# imported in this span - presumably a pylab-style import precedes it; confirm.
from shogun import GaussianKernel
from shogun import LibSVM, LDA
from shogun import ROCEvaluation
import util

util.set_title('ROC example')
util.DISTANCE = 0.5
subplots_adjust(hspace=0.3)

# Two sample sets from the util helper (selected by the True/False flag),
# combined into features with matching labels.
pos = util.get_realdata(True)
neg = util.get_realdata(False)
features = util.get_realfeatures(pos, neg)
labels = util.get_labels()

# classifiers
gk = GaussianKernel(features, features, 1.0)
svm = LibSVM(1000.0, gk, labels)
svm.train()
lda = LDA(1, features, labels)
lda.train()

## plot points
subplot(211)
plot(pos[0, :], pos[1, :], "r.")
plot(neg[0, :], neg[1, :], "b.")
grid(True)
title('Data', size=10)

# plot ROC for SVM
subplot(223)
ROC_evaluation = ROCEvaluation()
def quadratic_time_mmd_graphical():
    """Run a quadratic-time MMD two-sample test on mean-shift data and plot it.

    Samples from two generators that differ by a mean shift are merged, a
    Gaussian kernel is selected from a set of candidate widths, and the
    alternative and null distributions of the statistic are sampled
    (permutation, spectrum when available, and gamma fit). The data and the
    resulting histograms with their thresholds are drawn into a 2x3 grid.
    """
    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute MMD on the joint
    # sample (p samples first, then q samples).
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))

    # Candidate kernels: Shogun's Gaussian kernel width is parametrised
    # differently from the usual sigma, hence width = 2 * sigma^2.
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    # Single-argument print works as statement and as function.
    print("kernel widths: %s" % (widths,))

    combined = CombinedKernel()
    for kernel_width in widths:
        combined.append_kernel(GaussianKernel(10, kernel_width))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (can be replaced by the other methods for
    # selecting single kernels)
    selection = MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width: %s" % (kernel.get_width(),))

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        # Stream examples and merge them in order to replace in MMD
        features = gen_p.get_streamed_features(m)
        features = features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution: permutation, biased statistic
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_statistic_type(BIASED)
    mmd.set_num_null_samples(num_null_samples)
    null_samples_boot = mmd.sample_null()

    # sample from null distribution: spectrum, biased statistic.
    # The spectrum sampler is optional; everything that depends on it is
    # skipped when it is missing instead of failing later on.
    have_spectrum = "sample_null_spectrum" in dir(QuadraticTimeMMD)
    null_samples_spectrum = None
    if have_spectrum:
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(
            num_null_samples, m - 10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()

    # sample gamma with parameters
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # fewer x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # fewer y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    # q occupies columns m..2m-1; the slice used to start at m + 1 and
    # silently dropped the first q sample.
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$y$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf: p in red, q in blue, so the
    # histograms match the two density curves drawn on top of them.
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer y-ticks
    hist(data[0][0:m], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[0][m:2 * m], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05

    def quantile_threshold(samples):
        # Sort in place and pick the (1 - alpha) quantile; the index must be
        # an int because float indices are rejected.
        samples.sort()
        return samples[int(floor(len(samples) * (1 - alpha)))]

    thresh_boot = quantile_threshold(null_samples_boot)
    thresh_gamma = quantile_threshold(null_samples_gamma)

    # NOTE(review): all three estimates count samples *below* the
    # permutation threshold; confirm that this is the intended definition.
    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(
        null_samples_gamma < thresh_boot) / float(num_null_samples)

    null_sets = [null_samples_boot, null_samples_gamma]
    if have_spectrum:
        thresh_spectrum = quantile_threshold(null_samples_spectrum)
        type_one_error_spectrum = sum(
            null_samples_spectrum < thresh_boot) / float(num_null_samples)
        null_sets.append(null_samples_spectrum)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(samples) for samples in null_sets]),
        max([max(samples) for samples in null_sets]),
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))
    grid(True)

    # plot null distribution spectrum
    if have_spectrum:
        subplot(2, 3, 5)
        grid(True)
        gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer x-ticks
        gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer y-ticks
        hist(null_samples_spectrum, 20, range=hist_range, normed=True)
        axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
        title('Null Dist. Spectrum\nType I error is ' +
              str(type_one_error_spectrum))

    # plot null distribution gamma
    subplot(2, 3, 6)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # fewer y-ticks
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def evaluation_cross_validation_multiclass_storage(
        traindat=traindat, label_traindat=label_traindat):
    """Cross-validate a multiclass MKL machine and inspect one stored fold.

    Builds combined features (the same feature object appended three times)
    and a combined kernel of three Gaussian kernels, runs stratified
    cross-validation on an ``MKLMulticlass`` machine with a
    ``ParameterObserverCV`` subscribed, and then, for the first fold of the
    first observation, prints the ROC and the F1 measure of the class-0
    binary labels.

    Args:
        traindat: feature matrix handed to ``RealFeatures``.
        label_traindat: label vector handed to ``MulticlassLabels``.

    Returns:
        None; the ROC and the F1 measure are printed.
    """
    from shogun import CrossValidation
    from shogun import ParameterObserverCV
    from shogun import MulticlassAccuracy, F1Measure
    from shogun import StratifiedCrossValidationSplitting
    from shogun import MulticlassLabels
    from shogun import RealFeatures, CombinedFeatures
    from shogun import GaussianKernel, CombinedKernel
    from shogun import MKLMulticlass
    from shogun import Math
    from shogun import ROCEvaluation

    # fixed seed so repeated runs use the same random state
    Math.init_random(1)

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    for _ in range(3):
        comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined (one per appended feature object)
    kernel = CombinedKernel()
    for width in (0.1, 1, 2):
        kernel.append_kernel(GaussianKernel(10, width))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # stratified splitting strategy; 3 is passed as the number of folds
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross validation parameter observer
    multiclass_storage = ParameterObserverCV()
    cross_validation.subscribe_to_parameters(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation; the per-fold data is read back through the
    # observer below, so the aggregate result object is not kept
    cross_validation.evaluate()

    # get first observation and first fold
    obs = multiclass_storage.get_observations()[0]
    fold = obs.get_folds_results()[0]

    # get fold ROC for first class
    eval_ROC = ROCEvaluation()
    pred_lab_binary = MulticlassLabels.obtain_from_generic(
        fold.get_test_result()).get_binary_for_class(0)
    true_lab_binary = MulticlassLabels.obtain_from_generic(
        fold.get_test_true_result()).get_binary_for_class(0)
    eval_ROC.evaluate(pred_lab_binary, true_lab_binary)
    # single-argument print(...) produces the same output on Python 2 and 3
    print(eval_ROC.get_ROC())

    # get fold evaluation result
    acc_measure = F1Measure()
    print(acc_measure.evaluate(pred_lab_binary, true_lab_binary))
def linear_time_mmd_graphical():
    """Plot a linear-time MMD two-sample test on streamed mean-shift data.

    Creates two ``MeanShiftDataGenerator`` streams (p with shift 0, q with
    shift ``difference``), selects one Gaussian kernel out of a combined
    kernel with ``MMDKernelSelectionOpt``, samples the statistic under the
    alternative, approximates the null distribution by permutation and by a
    fitted Gaussian, and draws the data, the alternative distribution and
    both null distributions into a new 2x3 figure.

    Takes no arguments and returns None. The figure is created but not
    shown; displaying it is left to the caller.
    """
    def _reduce_ticks(nbins):
        # limit the number of ticks on both axes of the current subplot
        gca().xaxis.set_major_locator(MaxNLocator(nbins=nbins))
        gca().yaxis.set_major_locator(MaxNLocator(nbins=nbins))

    # parameters, change to get different results
    m = 1000  # set to 10000 for a good test result
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 1

    # number of samples taken from null and alternative distribution
    num_null_samples = 150

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Candidate Gaussian kernels for the selection step below. The value
    # handed to GaussianKernel is 2*sigma^2 for sigma = 2^-3 ... 2^9
    # (shogun's kernel width differs from the usual parametrisation).
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    # %-formatting prints identically on Python 2 and Python 3
    print("kernel widths: %s" % (widths,))
    combined = CombinedKernel()
    for width in widths:
        combined.append_kernel(GaussianKernel(10, width))

    # mmd instance using streaming features, blocksize of 1000
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other
    # methods for selecting single kernels)
    selection = MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width: %s" % (kernel.get_width(),))

    # sample alternative distribution, stream ensures different samples each run
    alt_samples = zeros(num_null_samples)
    for i in range(num_null_samples):
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution via permutation
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_num_null_samples(num_null_samples)
    null_samples_boot = mmd.sample_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance = mmd.compute_variance_estimate()
    null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

    # to plot data, sample a few examples from stream first; after merging,
    # columns 0..m-1 are the p samples and columns m..2m-1 the q samples
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # NOTE(review): hist(..., normed=True) is rejected by Matplotlib >= 3.1
    # (the keyword there is density=True); kept as-is so older Matplotlib
    # versions keep working - confirm the targeted version.
    figure()

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    _reduce_ticks(4)
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    # q starts at column m; the former m+1 skipped its first sample
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf: p in red, q in blue, so
    # that each histogram matches the density curve drawn in its colour
    subplot(2, 3, 2)
    grid(True)
    _reduce_ticks(3)
    hist(data[0][0:m], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[0][m:2 * m], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    # floor() yields a float; array indices have to be integers
    thresh_boot = null_samples_boot[
        int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gaussian = null_samples_gaussian[
        int(floor(len(null_samples_gaussian) * (1 - alpha)))]

    # NOTE(review): both values count null samples *below* a threshold, and
    # the Gaussian one is compared against thresh_boot rather than
    # thresh_gaussian. Left unchanged - confirm the intended definition.
    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gaussian = sum(
        null_samples_gaussian < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    _reduce_ticks(3)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([min(null_samples_boot), min(null_samples_gaussian)]),
        max([max(null_samples_boot), max(null_samples_gaussian)]),
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    _reduce_ticks(3)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is '
          + str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2, 3, 5)
    grid(True)
    _reduce_ticks(3)
    hist(null_samples_gaussian, 20, range=hist_range, normed=True)
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is '
          + str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def create_kernel(kname, kparam, feats_train):
    """Call the corresponding constructor for the kernel.

    Args:
        kname: kernel identifier; one of 'gauss', 'linear', 'poly', 'wd',
            'spec', 'cumspec', 'spec2', 'cumspec2', 'localalign' or
            'localimprove'.
        kparam: mapping with the parameters the chosen kernel reads:
            'width' (gauss); 'scale' (linear); 'degree', 'inhomogene',
            'normal' (poly); 'degree', 'seqlength', 'shift' (wd);
            'degree' (cumspec, cumspec2); 'length', 'indeg', 'outdeg'
            (localimprove).
        feats_train: training features, used for both sides of the kernel.
            For 'spec2' and 'cumspec2' it is indexed with 'f0' and 'f1'.

    Returns:
        The constructed kernel with its cache size set to 32.

    Raises:
        ValueError: if ``kname`` is not one of the supported identifiers.
            (Previously this case printed a message and then failed on the
            unbound ``kernel`` variable.)
    """
    if kname == 'gauss':
        kernel = GaussianKernel(feats_train, feats_train, kparam['width'])
    elif kname == 'linear':
        kernel = LinearKernel(feats_train, feats_train)
        kernel.set_normalizer(AvgDiagKernelNormalizer(kparam['scale']))
    elif kname == 'poly':
        kernel = PolyKernel(feats_train, feats_train, kparam['degree'],
                            kparam['inhomogene'], kparam['normal'])
    elif kname == 'wd':
        kernel = WeightedDegreePositionStringKernel(
            feats_train, feats_train, kparam['degree'])
        kernel.set_normalizer(
            AvgDiagKernelNormalizer(float(kparam['seqlength'])))
        # the same shift value for each of the 'seqlength' positions
        kernel.set_shifts(
            kparam['shift'] * numpy.ones(kparam['seqlength'],
                                         dtype=numpy.int32))
        #kernel=WeightedDegreeStringKernel(feats_train, feats_train, kparam['degree'])
    elif kname == 'spec':
        kernel = CommUlongStringKernel(feats_train, feats_train)
    elif kname == 'cumspec':
        kernel = WeightedCommWordStringKernel(feats_train, feats_train)
        kernel.set_weights(numpy.ones(kparam['degree']))
    elif kname == 'spec2':
        # one sub-kernel per feature set, appended in the order f0, f1
        kernel = CombinedKernel()
        for key in ('f0', 'f1'):
            sub_kernel = CommWordStringKernel(feats_train[key],
                                              feats_train[key])
            sub_kernel.io.disable_progress()
            kernel.append_kernel(sub_kernel)
    elif kname == 'cumspec2':
        kernel = CombinedKernel()
        for key in ('f0', 'f1'):
            sub_kernel = WeightedCommWordStringKernel(feats_train[key],
                                                      feats_train[key])
            sub_kernel.set_weights(numpy.ones(kparam['degree']))
            sub_kernel.io.disable_progress()
            kernel.append_kernel(sub_kernel)
    elif kname == 'localalign':
        kernel = LocalAlignmentStringKernel(feats_train, feats_train)
    elif kname == 'localimprove':
        kernel = LocalityImprovedStringKernel(
            feats_train, feats_train, kparam['length'],
            kparam['indeg'], kparam['outdeg'])
    else:
        # fail here with a clear error instead of falling through to
        # kernel.set_cache_size() with `kernel` unbound
        raise ValueError('Unknown kernel %s' % (kname,))

    kernel.set_cache_size(32)
    return kernel
def quadratic_time_mmd_graphical():
    """Plot a quadratic-time MMD two-sample test on mean-shift data.

    Streams ``m`` samples from each of two ``MeanShiftDataGenerator``
    objects (p with shift 0, q with shift ``difference``), selects one
    Gaussian kernel out of a combined kernel with ``MMDKernelSelectionMax``,
    samples the biased statistic under the alternative, approximates the
    null distribution by permutation, by the spectrum method (only when
    ``QuadraticTimeMMD`` offers ``sample_null_spectrum``) and by a fitted
    gamma distribution, and draws everything into a new 2x3 figure.

    Takes no arguments and returns None. The figure is created but not
    shown; displaying it is left to the caller.
    """
    def _reduce_ticks(nbins):
        # limit the number of ticks on both axes of the current subplot
        gca().xaxis.set_major_locator(MaxNLocator(nbins=nbins))
        gca().yaxis.set_major_locator(MaxNLocator(nbins=nbins))

    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute MMD on joint sample
    # alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))

    # Candidate Gaussian kernels for the selection step below. The value
    # handed to GaussianKernel is 2*sigma^2 for sigma = 2^-3 ... 2^9
    # (shogun's kernel width differs from the usual parametrisation).
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    # %-formatting prints identically on Python 2 and Python 3
    print("kernel widths: %s" % (widths,))
    combined = CombinedKernel()
    for width in widths:
        combined.append_kernel(GaussianKernel(10, width))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (this can easily be replaced by the other
    # methods for selecting single kernels)
    selection = MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width: %s" % (kernel.get_width(),))

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(num_null_samples):
        # Stream examples and merge them in order to replace in MMD
        features = gen_p.get_streamed_features(m)
        features = features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # permutation, biased statistic
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_statistic_type(BIASED)
    mmd.set_num_null_samples(num_null_samples)
    null_samples_boot = mmd.sample_null()

    # sample from null distribution
    # spectrum, biased statistic; not every build offers this method, so
    # every later use of the spectrum samples is guarded by has_spectrum
    has_spectrum = "sample_null_spectrum" in dir(QuadraticTimeMMD)
    null_samples_spectrum = None
    if has_spectrum:
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(
            num_null_samples, m - 10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # to plot data, sample a few examples from stream first; after merging,
    # columns 0..m-1 are the p samples and columns m..2m-1 the q samples
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # NOTE(review): hist(..., normed=True) is rejected by Matplotlib >= 3.1
    # (the keyword there is density=True); kept as-is so older Matplotlib
    # versions keep working - confirm the targeted version.
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    _reduce_ticks(4)
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    # q starts at column m; the former m+1 skipped its first sample
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf: p in red, q in blue, so
    # that each histogram matches the density curve drawn in its colour
    subplot(2, 3, 2)
    grid(True)
    _reduce_ticks(3)
    hist(data[0][0:m], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[0][m:2 * m], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    # floor() yields a float; array indices have to be integers
    thresh_boot = null_samples_boot[
        int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[
        int(floor(len(null_samples_gamma) * (1 - alpha)))]

    # NOTE(review): these values count null samples *below* a threshold,
    # and the spectrum/gamma ones are compared against thresh_boot rather
    # than their own thresholds. Left unchanged - confirm the intended
    # definition.
    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gamma = sum(
        null_samples_gamma < thresh_boot) / float(num_null_samples)
    if has_spectrum:
        null_samples_spectrum.sort()
        thresh_spectrum = null_samples_spectrum[
            int(floor(len(null_samples_spectrum) * (1 - alpha)))]
        type_one_error_spectrum = sum(
            null_samples_spectrum < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    _reduce_ticks(3)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms; plain lists are
    # passed to min/max so the call works whichever min/max is in scope
    null_sample_sets = [null_samples_boot, null_samples_gamma]
    if has_spectrum:
        null_sample_sets.append(null_samples_spectrum)
    hist_range = [
        min([min(samples) for samples in null_sample_sets]),
        max([max(samples) for samples in null_sample_sets]),
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    _reduce_ticks(3)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is '
          + str(type_one_error_boot))
    grid(True)

    # plot null distribution spectrum (skipped when it was not sampled)
    if has_spectrum:
        subplot(2, 3, 5)
        grid(True)
        _reduce_ticks(3)
        hist(null_samples_spectrum, 20, range=hist_range, normed=True)
        axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
        title('Null Dist. Spectrum\nType I error is '
              + str(type_one_error_spectrum))

    # plot null distribution gamma
    subplot(2, 3, 6)
    grid(True)
    _reduce_ticks(3)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def RunKPCAShogun():
    """Time shogun's Kernel PCA on the configured dataset.

    Reads ``self.dataset``, ``self.verbose`` and ``options`` from the
    surrounding scope (not defined in this function). Options consumed:
    ``new_dimensionality`` (optional target dimension), ``kernel``
    (required: 'linear', 'hyptan', 'polynomial' or 'gaussian') and
    ``degree`` (needed for the polynomial kernel). Consumed options are
    popped from ``options``.

    Returns:
        ``totalTimer.ElapsedTime()`` on success, or -1 on any failure
        (bad or unknown options, or any exception raised while loading or
        transforming the data).
    """
    totalTimer = Timer()

    try:
        # Load input dataset.
        Log.Info("Loading dataset", self.verbose)
        data = np.genfromtxt(self.dataset, delimiter=',')
        dataFeat = RealFeatures(data.T)

        with totalTimer:
            # Get the new dimensionality, if it is necessary.
            if "new_dimensionality" in options:
                d = int(options.pop("new_dimensionality"))
                if d > data.shape[1]:
                    Log.Fatal("New dimensionality (" + str(d)
                              + ") cannot be greater "
                              + "than existing dimensionality ("
                              + str(data.shape[1]) + ")!")
                    return -1
            else:
                d = data.shape[1]

            # Get the kernel type and make sure it is valid.
            if "kernel" in options:
                kernel_name = str(options.pop("kernel"))
            else:
                Log.Fatal(
                    "Choose kernel type, valid choices are 'linear',"
                    + " 'hyptan', 'polynomial' and 'gaussian'.")
                return -1

            # Only the polynomial kernel reads the degree; None marks
            # "not supplied" so the omission can be reported explicitly.
            degree = None
            if "degree" in options:
                degree = int(options.pop("degree"))

            if len(options) > 0:
                # Every known option has been popped by now; the former
                # raise here was caught below and turned into -1 anyway.
                Log.Fatal("Unknown parameters: " + str(options))
                return -1

            if kernel_name == "polynomial":
                if degree is None:
                    Log.Fatal("The polynomial kernel requires the 'degree' "
                              + "parameter.")
                    return -1
                kernel = PolyKernel(dataFeat, dataFeat, degree, True)
            elif kernel_name == "gaussian":
                kernel = GaussianKernel(dataFeat, dataFeat, 2.0)
            elif kernel_name == "linear":
                kernel = LinearKernel(dataFeat, dataFeat)
            elif kernel_name == "hyptan":
                kernel = SigmoidKernel(dataFeat, dataFeat, 2, 1.0, 1.0)
            else:
                # kernel_name is a plain string; the former
                # kernel.group(1) raised AttributeError and hid this
                # message.
                Log.Fatal(
                    "Invalid kernel type (" + kernel_name + "); valid "
                    + "choices are 'linear', 'hyptan', 'polynomial' and "
                    + "'gaussian'.")
                return -1

            # Perform Kernel Principal Components Analysis.
            model = KernelPCA(kernel)
            model.set_target_dim(d)
            model.init(dataFeat)
            model.apply_to_feature_matrix(dataFeat)
    except Exception as e:
        # Still report failure as -1, but leave a trace of the cause.
        Log.Fatal("Exception: " + str(e))
        return -1

    return totalTimer.ElapsedTime()