def regression_svrlight (fm_train=traindat,fm_test=testdat,label_train=label_traindat, \
				    width=1.2,C=1,epsilon=1e-5,tube_epsilon=1e-2,num_threads=3):


	from shogun import RegressionLabels, RealFeatures
	from shogun import GaussianKernel
	try:
		from shogun import SVRLight
	except ImportError:
		print('No support for SVRLight available.')
		return

	feats_train=RealFeatures(fm_train)
	feats_test=RealFeatures(fm_test)

	kernel=GaussianKernel(feats_train, feats_train, width)

	labels=RegressionLabels(label_train)

	svr=SVRLight(C, epsilon, kernel, labels)
	svr.set_tube_epsilon(tube_epsilon)
	svr.parallel.set_num_threads(num_threads)
	svr.train()

	kernel.init(feats_train, feats_test)
	out = svr.apply().get_labels()

	return out, kernel
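
# Usage sketch for the function above, a minimal hedged example (assumptions:
# shogun is importable and was built with SVRLight; traindat/testdat/
# label_traindat are defined earlier in the source script; feature matrices
# are float64 with one column per sample):
import numpy as np

X_train = np.random.rand(2, 60)
X_test = np.random.rand(2, 30)
y_train = np.sin(X_train.sum(axis=0))
out, kernel = regression_svrlight(X_train, X_test, y_train, width=1.2, C=1)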
def kernel_gaussian (train_fname=traindat,test_fname=testdat, width=1.3):
	from shogun import RealFeatures, GaussianKernel, CSVFile

	feats_train=RealFeatures(CSVFile(train_fname))
	feats_test=RealFeatures(CSVFile(test_fname))

	kernel=GaussianKernel(feats_train, feats_train, width)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
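
# Usage sketch (assumptions: CSVFile parses a plain comma-separated matrix;
# the file names below are illustrative temporaries, not shipped data):
import numpy as np
from tempfile import NamedTemporaryFile

tmp_train = NamedTemporaryFile(suffix='.csv')
tmp_test = NamedTemporaryFile(suffix='.csv')
np.savetxt(tmp_train.name, np.random.rand(2, 50), delimiter=',')
np.savetxt(tmp_test.name, np.random.rand(2, 20), delimiter=',')
km_train, km_test, kernel = kernel_gaussian(tmp_train.name, tmp_test.name, width=1.3)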
def preprocessor_kernelpca_modular(data, threshold, width):

    from shogun import RealFeatures
    from shogun import KernelPCA
    from shogun import GaussianKernel
    import pylab as p
    features = RealFeatures(data)
    kernel = GaussianKernel(features, features, width)
    preprocessor = KernelPCA(kernel)
    preprocessor.init(features)
    preprocessor.set_target_dim(2)
    #X=preprocessor.get_transformation_matrix()
    X2 = preprocessor.apply_to_feature_matrix(features)
    lx0 = len(X2)
    # number_of_points_for_circle1/2 are expected as module-level globals
    modified_d1 = [X2[i][0:number_of_points_for_circle1] for i in range(lx0)]
    modified_d2 = [
        X2[i][number_of_points_for_circle1:(number_of_points_for_circle1 +
                                            number_of_points_for_circle2)]
        for i in range(lx0)
    ]
    p.plot(modified_d1[0][:], modified_d1[1][:], 'o', modified_d2[0][:],
           modified_d2[1][:], 'x')
    p.title('final data')
    p.show()
    return features
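
# The snippet above relies on module-level globals; a hedged setup that
# provides them (assumption: the intended input is two concentric circles,
# one column per point):
import numpy as np

number_of_points_for_circle1 = 42
number_of_points_for_circle2 = 93

def _circle(n, radius):
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)
    return radius * np.vstack((np.cos(theta), np.sin(theta)))

circles = np.hstack((_circle(number_of_points_for_circle1, 1.0),
                     _circle(number_of_points_for_circle2, 2.0)))
preprocessor_kernelpca_modular(circles, threshold=0.05, width=2.0)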
def classifier_gpbtsvm(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       width=2.1,
                       C=1,
                       epsilon=1e-5):
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile
    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        return

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))
    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def kernel_combined (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
	from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
	from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

	kernel=CombinedKernel()
	feats_train=CombinedFeatures()
	feats_test=CombinedFeatures()

	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=GaussianKernel(10, 1.1)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	degree=3
	subkernel=FixedDegreeStringKernel(10, degree)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	subkernel=LocalAlignmentStringKernel(10)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	kernel.init(feats_train, feats_train)
	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
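
# Usage sketch (assumptions: real features are float64 matrices with one
# column per sample; FixedDegreeStringKernel expects DNA strings of equal
# length):
import numpy as np

fm_train_real = np.random.rand(2, 10)
fm_test_real = np.random.rand(2, 6)
fm_train_dna = ['ACGTACGTAC'] * 10
fm_test_dna = ['ACGTTTGGAA'] * 6
km_train, km_test, kernel = kernel_combined(fm_train_real, fm_test_real,
                                            fm_train_dna, fm_test_dna)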
def mkl_multiclass_1(fm_train_real, fm_test_real, label_train_multiclass, C,
                     num_threads=1, mkl_epsilon=0.001):
    from shogun import CombinedFeatures, RealFeatures, MulticlassLabels
    from shogun import CombinedKernel, GaussianKernel, MKLMulticlass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    for i in range(-10, 11):
        subkfeats_train = RealFeatures(fm_train_real)
        subkfeats_test = RealFeatures(fm_test_real)
        subkernel = GaussianKernel(pow(2, i + 1))
        feats_train.append_feature_obj(subkfeats_train)
        feats_test.append_feature_obj(subkfeats_test)
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)

    mkl.set_epsilon(1e-2)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(1)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
def classifier_gmnpsvm(fm_train_real, fm_test_real, label_train_multiclass, C,
                       width=2.1, epsilon=1e-5, num_threads=1):
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel, GMNPSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    # time the kernel-matrix computation (the matrix itself is unused)
    import time
    start = time.time()
    tmp = kernel.get_kernel_matrix()
    end = time.time()

    labels = MulticlassLabels(label_train_multiclass)

    svm = GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train(feats_train)

    out = svm.apply(feats_test).get_labels()
    return out
    def _svm_new(self, kernel_width, c, epsilon):
        from shogun import RealFeatures, MulticlassLabels, GaussianKernel, GMNPSVM

        if self.x is None or self.y is None:
            raise Exception("No training data loaded.")

        x = RealFeatures(self.x)
        y = MulticlassLabels(self.y)

        self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y)
        self.svm.set_epsilon(epsilon)
def create_kernel(kname, features, kparam=None):
    from shogun import GaussianKernel, LinearKernel, PolyKernel

    if kname == 'gauss':
        kernel = GaussianKernel(features, features, kparam)
    elif kname == 'linear':
        kernel = LinearKernel(features, features)
    elif kname == 'poly':
        kernel = PolyKernel(features, features, kparam, True, False)
    else:
        raise ValueError('unknown kernel name: %s' % kname)

    return kernel
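
# Usage sketch (assumption: kparam is the Gaussian width for 'gauss' and the
# polynomial degree for 'poly'):
import numpy as np
from shogun import RealFeatures

feats = RealFeatures(np.random.rand(2, 30))
km = create_kernel('gauss', feats, kparam=1.5).get_kernel_matrix()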
def kernel_auc (train_fname=traindat,label_fname=label_traindat,width=1.7):
	from shogun import GaussianKernel, AUCKernel, RealFeatures
	from shogun import BinaryLabels, CSVFile

	feats_train=RealFeatures(CSVFile(train_fname))
	subkernel=GaussianKernel(feats_train, feats_train, width)

	kernel=AUCKernel(0, subkernel)
	kernel.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
	km_train=kernel.get_kernel_matrix()
	return kernel
def classifier_multiclassmachine (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
	from shogun import RealFeatures, MulticlassLabels
	from shogun import GaussianKernel
	from shogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	kernel=GaussianKernel(feats_train, feats_train, width)

	labels=MulticlassLabels(label_train_multiclass)

	classifier = LibSVM()
	classifier.set_epsilon(epsilon)
	#print labels.get_labels()
	mc_classifier = KernelMulticlassMachine(MulticlassOneVsRestStrategy(),kernel,classifier,labels)
	mc_classifier.train()

	kernel.init(feats_train, feats_test)
	out = mc_classifier.apply().get_labels()
	return out
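
# Usage sketch with synthetic, well-separated 3-class data (labels are the
# float64 values 0..num_classes-1 that MulticlassLabels expects):
import numpy as np

num_classes, num_vec = 3, 30
y = np.array([i % num_classes for i in range(num_vec)], dtype='float64')
X = np.random.randn(num_classes, num_vec)
X[y.astype(int), np.arange(num_vec)] += 4.0  # push each class apart
out = classifier_multiclassmachine(X, X, y)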
def kernel_io(train_fname=traindat, test_fname=testdat, width=1.9):
    from shogun import RealFeatures, GaussianKernel, CSVFile
    from tempfile import NamedTemporaryFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()
    tmp_train_csv = NamedTemporaryFile(suffix='train.csv')
    f = CSVFile(tmp_train_csv.name, "w")
    kernel.save(f)
    del f

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    tmp_test_csv = NamedTemporaryFile(suffix='test.csv')
    f = CSVFile(tmp_test_csv.name, "w")
    kernel.save(f)
    del f

    return km_train, km_test, kernel
def create_param_tree():
    from shogun import ModelSelectionParameters, R_EXP, R_LINEAR
    from shogun import GaussianKernel, PowerKernel, MinkowskiMetric
    import math

    root = ModelSelectionParameters()

    c1 = ModelSelectionParameters("C1")
    root.append_child(c1)
    c1.build_values(-1.0, 1.0, R_EXP)

    c2 = ModelSelectionParameters("C2")
    root.append_child(c2)
    c2.build_values(-1.0, 1.0, R_EXP)

    gaussian_kernel = GaussianKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #gaussian_kernel.print_modsel_params()

    param_gaussian_kernel = ModelSelectionParameters("kernel", gaussian_kernel)
    gaussian_kernel_width = ModelSelectionParameters("log_width")
    gaussian_kernel_width.build_values(-math.log(2.0), 0.0, R_EXP, 1.0, 2.0)
    param_gaussian_kernel.append_child(gaussian_kernel_width)
    root.append_child(param_gaussian_kernel)

    power_kernel = PowerKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #power_kernel.print_modsel_params()

    param_power_kernel = ModelSelectionParameters("kernel", power_kernel)
    root.append_child(param_power_kernel)

    param_power_kernel_degree = ModelSelectionParameters("degree")
    param_power_kernel_degree.build_values(1.0, 2.0, R_LINEAR)
    param_power_kernel.append_child(param_power_kernel_degree)

    metric = MinkowskiMetric(10)

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #metric.print_modsel_params()

    param_power_kernel_metric1 = ModelSelectionParameters("distance", metric)

    param_power_kernel.append_child(param_power_kernel_metric1)

    param_power_kernel_metric1_k = ModelSelectionParameters("k")
    param_power_kernel_metric1_k.build_values(1.0, 2.0, R_LINEAR)
    param_power_kernel_metric1.append_child(param_power_kernel_metric1_k)

    return root
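
# A quick way to inspect the resulting tree (assumption: the legacy
# ModelSelectionParameters API exposes print_tree(), as in shogun's
# model-selection examples):
param_tree = create_param_tree()
param_tree.print_tree()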
def classifier_libsvmoneclass (train_fname=traindat,test_fname=testdat,width=2.1,C=1,epsilon=1e-5):
	from shogun import RealFeatures, GaussianKernel, LibSVMOneClass, CSVFile

	feats_train=RealFeatures(CSVFile(train_fname))
	feats_test=RealFeatures(CSVFile(test_fname))

	kernel=GaussianKernel(feats_train, feats_train, width)

	svm=LibSVMOneClass(C, kernel)
	svm.set_epsilon(epsilon)
	svm.train()

	predictions = svm.apply(feats_test)
	return predictions, svm, predictions.get_labels()
def mkl_multiclass(fm_train_real, fm_test_real, label_train_multiclass, width,
                   C, epsilon, num_threads, mkl_epsilon, mkl_norm):

    from shogun import CombinedFeatures, RealFeatures, MulticlassLabels
    from shogun import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun import MKLMulticlass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, width)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = LinearKernel()
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = PolyKernel(10, 2)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)

    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
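
# Usage sketch (synthetic 3-class data; the MKL tolerances and thread count
# below are illustrative values, not tuned):
import numpy as np

y = np.array([i % 3 for i in range(30)], dtype='float64')
X = np.random.randn(3, 30)
X[y.astype(int), np.arange(30)] += 4.0
out = mkl_multiclass(X, X, y, width=1.2, C=1.0, epsilon=1e-5,
                     num_threads=1, mkl_epsilon=0.001, mkl_norm=1.0)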
def create_param_tree():
    from shogun import ModelSelectionParameters, R_EXP, R_LINEAR
    from shogun import ParameterCombination
    from shogun import GaussianKernel, PolyKernel
    import math
    root = ModelSelectionParameters()

    tau = ModelSelectionParameters("tau")
    root.append_child(tau)

    # R_LINEAR / R_LOG are also available as range types
    min_value = -1
    max_value = 1
    range_type = R_EXP
    step = 1.5
    base = 2
    tau.build_values(min_value, max_value, range_type, step, base)

    # gaussian kernel with width
    gaussian_kernel = GaussianKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #gaussian_kernel.print_modsel_params()

    param_gaussian_kernel = ModelSelectionParameters("kernel", gaussian_kernel)
    gaussian_kernel_width = ModelSelectionParameters("log_width")
    gaussian_kernel_width.build_values(2.0 * math.log(2.0),
                                       2.5 * math.log(2.0), R_LINEAR, 1.0)
    param_gaussian_kernel.append_child(gaussian_kernel_width)
    root.append_child(param_gaussian_kernel)

    # polynomial kernel with degree
    poly_kernel = PolyKernel()

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #poly_kernel.print_modsel_params()

    param_poly_kernel = ModelSelectionParameters("kernel", poly_kernel)

    root.append_child(param_poly_kernel)

    # note that integers are used here
    param_poly_kernel_degree = ModelSelectionParameters("degree")
    param_poly_kernel_degree.build_values(1, 2, R_LINEAR)
    param_poly_kernel.append_child(param_poly_kernel_degree)

    return root
def converter_diffusionmaps(data_fname, t):
    try:
        from shogun import RealFeatures, DiffusionMaps, GaussianKernel, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        converter = DiffusionMaps()
        converter.set_target_dim(1)
        converter.set_kernel(GaussianKernel(10, 10.0))
        converter.set_t(t)
        converter.apply(features)

        return features
    except ImportError:
        print('No Eigen3 available')
def preprocessor_kernelpca (data, threshold, width):
	from shogun import RealFeatures
	from shogun import KernelPCA
	from shogun import GaussianKernel

	features = RealFeatures(data)

	kernel = GaussianKernel(features,features,width)

	preprocessor = KernelPCA(kernel)
	preprocessor.init(features)
	preprocessor.set_target_dim(2)
	preprocessor.apply_to_feature_matrix(features)

	return features
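
# Usage sketch (assumption: `data` is a float64 matrix with one column per
# sample; `threshold` is accepted but unused by the snippet itself):
import numpy as np

projected = preprocessor_kernelpca(np.random.rand(3, 40), threshold=0.01, width=2.0)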
    def BuildModel(self, data, labels, options):
        if "kernel" in options:
            k = str(options.pop("kernel"))
        else:
            Log.Fatal("Required parameter 'kernel' not specified!")
            raise Exception("missing parameter")

        if "c" in options:
            self.C = float(options.pop("c"))
        if "gamma" in options:
            self.gamma = float(options.pop("gamma"))

        if k == "gaussian":
            self.kernel = GaussianKernel(data, data, 1)
        elif k == "polynomial":
            if "degree" in options:
                d = int(options.pop("degree"))
            else:
                d = 1

            self.kernel = PolyKernel(data, data, d, True)
        elif k == "linear":
            self.kernel = LinearKernel(data, data)
        elif k == "hyptan":
            self.kernel = SigmoidKernel(data, data, 2, 1.0, 1.0)
        else:
            self.kernel = GaussianKernel(data, data, 1)

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        # Create and train the classifier.
        svm = LibSVM(self.C, self.kernel, labels)
        svm.train()
        return svm
def classifier_larank(num_vec,
                      num_class,
                      distance,
                      C=0.9,
                      num_threads=1,
                      num_iter=5,
                      seed=1):
    from numpy import array, mod, random
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LaRank
    from shogun import Math_init_random

    # reproducible results
    Math_init_random(seed)
    random.seed(seed)

    # generate some training data where each class pair is linearly separable
    label_train = array([mod(x, num_class) for x in range(num_vec)],
                        dtype="float64")
    label_test = array([mod(x, num_class) for x in range(num_vec)],
                       dtype="float64")
    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))
    for i in range(len(label_train)):
        fm_train[int(label_train[i]), i] += distance
        fm_test[int(label_test[i]), i] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    width = 2.1
    kernel = GaussianKernel(feats_train, feats_train, width)

    epsilon = 1e-5
    labels = MulticlassLabels(label_train)

    svm = LaRank(C, kernel, labels)
    #svm.set_tau(1e-3)
    svm.set_batch_mode(False)
    #svm.io.enable_progress()
    svm.set_epsilon(epsilon)
    svm.train()
    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def kernel_sparse_gaussian(fm_train_real=traindat,
                           fm_test_real=testdat,
                           width=1.1):
    from shogun import SparseRealFeatures
    from shogun import GaussianKernel

    feats_train = SparseRealFeatures(fm_train_real)
    feats_test = SparseRealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
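
# Usage sketch (assumption: in the legacy modular interface,
# SparseRealFeatures accepts a scipy.sparse csc_matrix):
import numpy as np
from scipy.sparse import csc_matrix

fm_train = csc_matrix(np.random.rand(2, 20))
fm_test = csc_matrix(np.random.rand(2, 10))
km_train, km_test, kernel = kernel_sparse_gaussian(fm_train, fm_test, width=1.1)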
def classifier_gmnpsvm(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       width=2.1,
                       C=1,
                       epsilon=1e-5):
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel, GMNPSVM, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = MulticlassLabels(CSVFile(label_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train(feats_train)

    out = svm.apply(feats_test).get_labels()
    return out, kernel
def classifier_mpdsvm(train_fname=traindat,
                      test_fname=testdat,
                      label_fname=label_traindat,
                      C=1,
                      epsilon=1e-5):

    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import MPDSVM, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))
    width = 2.1
    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = MPDSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def evaluation_cross_validation_regression(train_fname=traindat,
                                           label_fname=label_traindat,
                                           width=0.8,
                                           tau=1e-6):
    from shogun import CrossValidation, CrossValidationResult
    from shogun import MeanSquaredError, CrossValidationSplitting
    from shogun import RegressionLabels, RealFeatures
    from shogun import GaussianKernel, KernelRidgeRegression, CSVFile

    # training data
    features = RealFeatures(CSVFile(train_fname))
    labels = RegressionLabels(CSVFile(label_fname))

    # kernel and predictor
    kernel = GaussianKernel()
    predictor = KernelRidgeRegression(tau, kernel, labels)

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use stratified splitting; here the standard splitting is used)
    splitting_strategy = CrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MeanSquaredError()

    # cross-validation instance
    cross_validation = CrossValidation(predictor, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # (optional) tell the machine to precompute the kernel matrix; this speeds
    # things up but is not supported by every machine
    predictor.data_lock(labels, features)

    # perform cross-validation and return the result
    result = cross_validation.evaluate()
    return result
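
# Reading the aggregated result, a hedged sketch (assumptions: train.csv /
# labels.csv exist; on recent shogun versions the returned object can be
# cast to CrossValidationResult and queried for its mean):
from shogun import CrossValidationResult

result = evaluation_cross_validation_regression('train.csv', 'labels.csv')
result = CrossValidationResult.obtain_from_generic(result)
print("mean MSE over runs:", result.get_mean())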
        def RunSVRShogun():
            totalTimer = Timer()
            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            # Use the last row of the training set as the responses.
            X, y = SplitTrainData(self.dataset)

            # Get all the parameters.
            self.C = 1.0
            self.epsilon = 1.0
            self.width = 0.1
            if "c" in options:
                self.C = float(options.pop("c"))
            if "epsilon" in options:
                self.epsilon = float(options.pop("epsilon"))
            if "gamma" in options:
                self.width = np.true_divide(1, float(options.pop("gamma")))

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            data = RealFeatures(X.T)
            labels_train = RegressionLabels(y)
            self.kernel = GaussianKernel(data, data, self.width)

            try:
                with totalTimer:
                    # Perform SVR.
                    model = LibSVR(self.C, self.epsilon, self.kernel,
                                   labels_train)
                    model.train()
            except Exception as e:
                return -1

            return totalTimer.ElapsedTime()
    def metric(self):

        distance = "Euclidean"
        if "distance" in self.method_param:
            distance = str(self.method_param["distance"])

        kernel = "Gaussian"
        if "kernel" in self.method_param:
            kernel = str(self.method_param["kernel"])

        cache_size = 10
        if "cache-size" in self.method_param:
            cache_size = int(self.method_param["cache-size"])

        degree = 2
        if "degree" in self.method_param:
            degree = int(self.method_param["degree"])

        gamma = 2.0
        if "gamma" in self.method_param:
            gamma = float(self.method_param["gamma"])

        coef0 = 1.0
        if "coef0" in self.method_param:
            coef0 = float(self.method_param["coef0"])

        order = 2.0
        if "order" in self.method_param:
            order = float(self.method_param["order"])

        width = 2.0
        if "width" in self.method_param:
            width = float(self.method_param["width"])

        sigma = 1.5
        if "sigma" in self.method_param:
            sigma = float(self.method_param["sigma"])

        const = 2.0
        if "constant" in self.method_param:
            const = float(self.method_param["constant"])

        #Choosing a Distance Function required by some Kernels
        if distance == "Euclidean":
            distanceMethod = EuclideanDistance()

        elif distance == "Chi-Square":
            distanceMethod = ChiSquareDistance()

        elif distance == "Tanimoto":
            distanceMethod = TanimotoDistance()

        elif distance == "Minkowski":
            distanceMethod = MinkowskiMetric()

        elif distance == "Manhattan":
            distanceMethod = ManhattanMetric()

        elif distance == "Jensen":
            distanceMethod = JensenMetric()

        elif distance == "Canberra":
            distanceMethod = CanberraMetric()

        else:
            raise ValueError(
                "distance function not supported by the benchmarks")

        totalTimer = Timer()
        with totalTimer:
            #Choosing a Kernel for the Gaussian Process Classification
            if kernel == "Gaussian":
                kernelMethod = GaussianKernel(width)

            elif kernel == "Polynomial":
                kernelMethod = PolyKernel(cache_size, degree)

            elif kernel == "Sigmoid":
                kernelMethod = SigmoidKernel(cache_size, gamma, coef0)

            elif kernel == "Bessel":
                kernelMethod = BesselKernel(cache_size, order, width, degree,
                                            distanceMethod)

            elif kernel == "Power":
                kernelMethod = PowerKernel(cache_size, degree, distanceMethod)

            elif kernel == "Log":
                kernelMethod = LogKernel(cache_size, degree, distanceMethod)

            elif kernel == "Cauchy":
                kernelMethod = CauchyKernel(cache_size, sigma, distanceMethod)

            elif kernel == "Constant":
                kernelMethod = ConstKernel(const)

            elif kernel == "Diagonal":
                kernelMethod = DiagKernel(cache_size, const)

            else:
                raise ValueError("kernel not supported by the benchmarks")

            mean_function = ConstMean()
            likelihood = SoftMaxLikelihood()
            inference_method = MultiLaplaceInferenceMethod(
                kernelMethod, self.train_features, mean_function,
                self.train_labels, likelihood)

            #Create the model
            model = SGPC(inference_method)

            #Train model
            model.train()
            if len(self.data) >= 2:
                predictions = model.apply_multiclass(
                    self.test_features).get_labels()

        metric = {}
        metric["runtime"] = totalTimer.ElapsedTime()

        if len(self.data) >= 2:
            predictions = label_decoder(predictions, self.label_map)

        if len(self.data) >= 3:
            confusionMatrix = Metrics.ConfusionMatrix(self.data[2],
                                                      predictions)
            metric['ACC'] = Metrics.AverageAccuracy(confusionMatrix)
            metric['MCC'] = Metrics.MCCMultiClass(confusionMatrix)
            metric['Precision'] = Metrics.AvgPrecision(confusionMatrix)
            metric['Recall'] = Metrics.AvgRecall(confusionMatrix)
            metric['MSE'] = Metrics.SimpleMeanSquaredError(
                self.data[2], predictions)

        return metric
from pylab import subplot, subplots_adjust, plot, grid, title
from shogun import GaussianKernel
from shogun import LibSVM, LDA
from shogun import ROCEvaluation
import util

util.set_title('ROC example')
util.DISTANCE = 0.5
subplots_adjust(hspace=0.3)

pos = util.get_realdata(True)
neg = util.get_realdata(False)
features = util.get_realfeatures(pos, neg)
labels = util.get_labels()

# classifiers
gk = GaussianKernel(features, features, 1.0)
svm = LibSVM(1000.0, gk, labels)
svm.train()
lda = LDA(1, features, labels)
lda.train()

## plot points
subplot(211)
plot(pos[0, :], pos[1, :], "r.")
plot(neg[0, :], neg[1, :], "b.")
grid(True)
title('Data', size=10)

# plot ROC for SVM
subplot(223)
ROC_evaluation = ROCEvaluation()
def quadratic_time_mmd_graphical():
    # imports for this snippet (legacy shogun statistics API, numpy, pylab)
    from shogun import MeanShiftDataGenerator, CombinedKernel, GaussianKernel
    from shogun import QuadraticTimeMMD, MMDKernelSelectionMax
    from shogun import BIASED, PERMUTATION, MMD2_SPECTRUM, MMD2_GAMMA
    from numpy import zeros, array, linspace, floor
    from numpy.random import gamma
    from pylab import (figure, subplot, grid, gca, plot, hist, title, xlabel,
                       ylabel, axvline, subplots_adjust, MaxNLocator, normpdf)

    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # Stream examples and merge them in order to compute MMD on joint sample
    # alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))

    # create a combined kernel with Gaussian subkernels for kernel selection.
    # The usual median heuristic sets sigma to 0.5*median_distance; shogun's
    # Gaussian kernel is parametrized as exp(-||x-y||^2 / width) with
    # width = 2*sigma^2, so the widths below are 2*sigma^2 over a grid of sigmas
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print("kernel widths:", widths)
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (this can easily be replaced by the other
    # methods for selecting single kernels)
    selection = MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print("selected kernel width:", kernel.get_width())

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        # Stream examples and merge them in order to replace in MMD
        features = gen_p.get_streamed_features(m)
        features = features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(PERMUTATION)
    mmd.set_statistic_type(BIASED)
    mmd.set_num_null_samples(num_null_samples)
    null_samples_boot = mmd.sample_null()

    # sample from null distribution
    # spectrum, biased statistic
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(
            num_null_samples, m - 10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([
        gamma(gamma_params[0], gamma_params[1])
        for _ in range(num_null_samples)
    ])

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=4))  # reduce number of x-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m],
         data[1][m + 1:2 * m],
         'bo',
         label='$x$',
         alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_spectrum.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(
        len(null_samples_boot) * (1 - alpha)))]
    thresh_spectrum = null_samples_spectrum[int(floor(
        len(null_samples_spectrum) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(
        len(null_samples_gamma) * (1 - alpha)))]

    type_one_error_boot = sum(
        null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_spectrum = sum(
        null_samples_spectrum < thresh_spectrum) / float(num_null_samples)
    type_one_error_gamma = sum(
        null_samples_gamma < thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [
        min([
            min(null_samples_boot),
            min(null_samples_spectrum),
            min(null_samples_gamma)
        ]),
        max([
            max(null_samples_boot),
            max(null_samples_spectrum),
            max(null_samples_gamma)
        ])
    ]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' +
          str(type_one_error_boot))
    grid(True)

    # plot null distribution spectrum
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_spectrum, 20, range=hist_range, normed=True)
    axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
    title('Null Dist. Spectrum\nType I error is ' +
          str(type_one_error_spectrum))

    # plot null distribution gamma
    subplot(2, 3, 6)
    grid(True)
    gca().xaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(
        MaxNLocator(nbins=3))  # reduce number of x-ticks
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def evaluation_cross_validation_multiclass_storage(
        traindat=traindat, label_traindat=label_traindat):
    from shogun import CrossValidation, CrossValidationResult
    from shogun import ParameterObserverCV
    from shogun import MulticlassAccuracy, F1Measure
    from shogun import StratifiedCrossValidationSplitting
    from shogun import MulticlassLabels
    from shogun import RealFeatures, CombinedFeatures
    from shogun import GaussianKernel, CombinedKernel
    from shogun import MKLMulticlass
    from shogun import Statistics, MSG_DEBUG, Math
    from shogun import ROCEvaluation

    Math.init_random(1)

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create MKL machine (due to a memory bug, interleaved optimization is not possible)
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # splitting strategy for stratified 3-fold cross-validation
    # (stratified splitting is preferable for classification)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross validation parameter observer
    multiclass_storage = ParameterObserverCV()
    cross_validation.subscribe_to_parameters(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    # get first observation and first fold
    obs = multiclass_storage.get_observations()[0]
    fold = obs.get_folds_results()[0]

    # get fold ROC for first class
    eval_ROC = ROCEvaluation()
    pred_lab_binary = MulticlassLabels.obtain_from_generic(
        fold.get_test_result()).get_binary_for_class(0)
    true_lab_binary = MulticlassLabels.obtain_from_generic(
        fold.get_test_true_result()).get_binary_for_class(0)
    eval_ROC.evaluate(pred_lab_binary, true_lab_binary)
    print(eval_ROC.get_ROC())

    # get fold evaluation result
    acc_measure = F1Measure()
    print(acc_measure.evaluate(pred_lab_binary, true_lab_binary))
def linear_time_mmd_graphical():
	# imports for this snippet (legacy shogun statistics API, numpy, pylab)
	from shogun import MeanShiftDataGenerator, CombinedKernel, GaussianKernel
	from shogun import LinearTimeMMD, MMDKernelSelectionOpt
	from shogun import PERMUTATION, MMD1_GAUSSIAN
	from numpy import zeros, linspace, floor, sqrt
	from numpy.random import normal
	from pylab import (figure, subplot, grid, gca, plot, hist, title, xlabel,
	                   ylabel, axvline, subplots_adjust, MaxNLocator, normpdf)

	# parameters, change to get different results
	m=1000 # set to 10000 for a good test result
	dim=2

	# setting the difference of the first dimension smaller makes a harder test
	difference=1

	# number of samples taken from null and alternative distribution
	num_null_samples=150

	# streaming data generator for mean shift distributions
	gen_p=MeanShiftDataGenerator(0, dim)
	gen_q=MeanShiftDataGenerator(difference, dim)

	# create combined kernel with Gaussian subkernels for kernel selection.
	# The usual median heuristic sets sigma to 0.5*median_distance; shogun's
	# Gaussian kernel is parametrized as exp(-||x-y||^2 / width) with
	# width = 2*sigma^2, so the widths below are 2*sigma^2 over a grid of sigmas
	sigmas=[2**x for x in range(-3,10)]
	widths=[x*x*2 for x in sigmas]
	print("kernel widths:", widths)
	combined=CombinedKernel()
	for i in range(len(sigmas)):
		combined.append_kernel(GaussianKernel(10, widths[i]))

	# mmd instance using streaming features, blocksize of 1000
	block_size=1000
	mmd=LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

	# kernel selection instance (this can easily be replaced by the other
	# methods for selecting single kernels)
	selection=MMDKernelSelectionOpt(mmd)

	# perform kernel selection
	kernel=selection.select_kernel()
	kernel=GaussianKernel.obtain_from_generic(kernel)
	mmd.set_kernel(kernel)
	print("selected kernel width:", kernel.get_width())

	# sample alternative distribution, stream ensures different samples each run
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		alt_samples[i]=mmd.compute_statistic()

	# sample from null distribution
	# bootstrapping, biased statistic
	mmd.set_null_approximation_method(PERMUTATION)
	mmd.set_num_null_samples(num_null_samples)
	null_samples_boot=mmd.sample_null()

	# fit normal distribution to null and sample a normal distribution
	mmd.set_null_approximation_method(MMD1_GAUSSIAN)
	variance=mmd.compute_variance_estimate()
	null_samples_gaussian=normal(0,sqrt(variance),num_null_samples)

	# to plot data, sample a few examples from stream first
	features=gen_p.get_streamed_features(m)
	features=features.create_merged_copy(gen_q.get_streamed_features(m))
	data=features.get_feature_matrix()

	# plot
	figure()

	# plot data of p and q
	subplot(2,3,1)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
	plot(data[0][m+1:2*m], data[1][m+1:2*m], 'bo', label='$x$', alpha=0.5)
	title('Data, shift in $x_1$='+str(difference)+'\nm='+str(m))
	xlabel('$x_1, y_1$')
	ylabel('$x_2, y_2$')

	# histogram of first data dimension and pdf
	subplot(2,3,2)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
	hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
	xs=linspace(min(data[0])-1,max(data[0])+1, 50)
	plot(xs,normpdf( xs, 0, 1), 'r', linewidth=3)
	plot(xs,normpdf( xs, difference, 1), 'b', linewidth=3)
	xlabel('$x_1, y_1$')
	ylabel('$p(x_1), p(y_1)$')
	title('Data PDF in $x_1, y_1$')

	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gaussian.sort()
	thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
	thresh_gaussian=null_samples_gaussian[int(floor(len(null_samples_gaussian)*(1-alpha)))]

	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_gaussian=sum(null_samples_gaussian<thresh_gaussian)/float(num_null_samples)

	# plot alternative distribution with threshold
	subplot(2,3,4)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gaussian)]), max([max(null_samples_boot), max(null_samples_gaussian)])]

	# plot null distribution with threshold
	subplot(2,3,3)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Sampled Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))

	# plot null distribution gaussian
	subplot(2,3,5)
	grid(True)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	hist(null_samples_gaussian, 20, range=hist_range, normed=True);
	axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gaussian\nType I error is '  + str(type_one_error_gaussian))

	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
def create_kernel(kname,kparam,feats_train):
    """Call the corresponding constructor for the kernel"""
    import numpy
    from shogun import (GaussianKernel, LinearKernel, PolyKernel,
                        AvgDiagKernelNormalizer, WeightedDegreePositionStringKernel,
                        CommUlongStringKernel, WeightedCommWordStringKernel,
                        CommWordStringKernel, CombinedKernel,
                        LocalAlignmentStringKernel, LocalityImprovedStringKernel)

    if kname == 'gauss':
        kernel = GaussianKernel(feats_train, feats_train, kparam['width'])
    elif kname == 'linear':
        kernel = LinearKernel(feats_train, feats_train)
        kernel.set_normalizer(AvgDiagKernelNormalizer(kparam['scale']))
    elif kname == 'poly':
        kernel = PolyKernel(feats_train, feats_train, kparam['degree'], kparam['inhomogene'], kparam['normal'])
    elif kname == 'wd':
        kernel=WeightedDegreePositionStringKernel(feats_train, feats_train, kparam['degree'])
        kernel.set_normalizer(AvgDiagKernelNormalizer(float(kparam['seqlength'])))
        kernel.set_shifts(kparam['shift']*numpy.ones(kparam['seqlength'],dtype=numpy.int32))
        #kernel=WeightedDegreeStringKernel(feats_train, feats_train, kparam['degree'])
    elif kname == 'spec':
        kernel = CommUlongStringKernel(feats_train, feats_train)
    elif kname == 'cumspec':
        kernel = WeightedCommWordStringKernel(feats_train, feats_train)
        kernel.set_weights(numpy.ones(kparam['degree']))
    elif kname == 'spec2':
        kernel = CombinedKernel()
        k0 = CommWordStringKernel(feats_train['f0'], feats_train['f0'])
        k0.io.disable_progress()
        kernel.append_kernel(k0)
        k1 = CommWordStringKernel(feats_train['f1'], feats_train['f1'])
        k1.io.disable_progress()
        kernel.append_kernel(k1)
    elif kname == 'cumspec2':
        kernel = CombinedKernel()
        k0 = WeightedCommWordStringKernel(feats_train['f0'], feats_train['f0'])
        k0.set_weights(numpy.ones(kparam['degree']))
        k0.io.disable_progress()
        kernel.append_kernel(k0)
        k1 = WeightedCommWordStringKernel(feats_train['f1'], feats_train['f1'])
        k1.set_weights(numpy.ones(kparam['degree']))
        k1.io.disable_progress()
        kernel.append_kernel(k1)
    elif kname == 'localalign':
        kernel = LocalAlignmentStringKernel(feats_train, feats_train)
    elif kname == 'localimprove':
        kernel = LocalityImprovedStringKernel(feats_train, feats_train, kparam['length'],\
                                              kparam['indeg'], kparam['outdeg'])
    else:
        raise ValueError('Unknown kernel %s' % kname)

    kernel.set_cache_size(32)
    return kernel
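
# Usage sketch for the dispatcher above (assumption: the 'gauss' branch with
# plain real features; string-kernel branches need string features and the
# matching kparam entries):
import numpy
from shogun import RealFeatures

feats = RealFeatures(numpy.random.rand(2, 25))
kernel = create_kernel('gauss', {'width': 1.5}, feats)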
        def RunKPCAShogun():
            totalTimer = Timer()

            try:
                # Load input dataset.
                Log.Info("Loading dataset", self.verbose)
                data = np.genfromtxt(self.dataset, delimiter=',')
                dataFeat = RealFeatures(data.T)

                with totalTimer:
                    # Get the new dimensionality, if it is necessary.
                    if "new_dimensionality" in options:
                        d = int(options.pop("new_dimensionality"))
                        if (d > data.shape[1]):
                            Log.Fatal("New dimensionality (" + str(d) +
                                      ") cannot be greater " +
                                      "than existing dimensionality (" +
                                      str(data.shape[1]) + ")!")
                            return -1
                    else:
                        d = data.shape[1]

                    # Get the kernel type and make sure it is valid.
                    if "kernel" in options:
                        kernel = str(options.pop("kernel"))
                    else:
                        Log.Fatal(
                            "Choose kernel type, valid choices are 'linear'," +
                            " 'hyptan', 'polynomial' and 'gaussian'.")
                        return -1

                    if "degree" in options:
                        degree = int(options.pop("degree"))

                    if len(options) > 0:
                        Log.Fatal("Unknown parameters: " + str(options))
                        raise Exception("unknown parameters")

                    if kernel == "polynomial":
                        kernel = PolyKernel(dataFeat, dataFeat, degree, True)
                    elif kernel == "gaussian":
                        kernel = GaussianKernel(dataFeat, dataFeat, 2.0)
                    elif kernel == "linear":
                        kernel = LinearKernel(dataFeat, dataFeat)
                    elif kernel == "hyptan":
                        kernel = SigmoidKernel(dataFeat, dataFeat, 2, 1.0, 1.0)
                    else:
                        Log.Fatal(
                            "Invalid kernel type (" + kernel + "); valid " +
                            "choices are 'linear', 'hyptan', 'polynomial' and 'gaussian'."
                        )
                        return -1

                    # Perform Kernel Principal Components Analysis.
                    model = KernelPCA(kernel)
                    model.set_target_dim(d)
                    model.init(dataFeat)
                    model.apply_to_feature_matrix(dataFeat)
            except Exception as e:
                return -1

            return totalTimer.ElapsedTime()