def classifier_svmocas(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       C=0.9,
                       epsilon=1e-5,
                       num_threads=1):
    """Train an SVMOcas classifier on CSV data and apply it to the test set.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        label_fname: CSV file read into the training ``BinaryLabels``.
        C: first argument handed to the ``SVMOcas`` constructor.
        epsilon: value passed to ``set_epsilon``.
        num_threads: value passed to ``parallel.set_num_threads``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``, or ``None`` when
        ``SVMOcas`` cannot be imported from shogun.
    """
    from shogun import BinaryLabels, CSVFile, RealFeatures
    try:
        from shogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    classifier = SVMOcas(C, train_features, train_labels)
    classifier.set_epsilon(epsilon)
    classifier.parallel.set_num_threads(num_threads)
    classifier.set_bias_enabled(False)
    classifier.train()

    # The bias and weight vector are queried after training but are not
    # part of the returned tuple.
    classifier.get_bias()
    classifier.get_w()

    predicted = classifier.apply(test_features)
    return predicted, classifier, predicted.get_labels()
Exemple #2
0
def distance_manhattenword(train_fname=traindna,
                           test_fname=testdna,
                           order=3,
                           gap=0,
                           reverse=False):
    """Compute ManhattanWordDistance matrices on DNA word features.

    Both files are read as ``StringCharFeatures`` over the ``DNA`` alphabet,
    converted to ``StringWordFeatures`` via ``obtain_from_char`` and run
    through one shared ``SortWordString`` preprocessor (initialised on the
    training features only).

    Args:
        train_fname: CSV file with the training strings.
        test_fname: CSV file with the test strings.
        order: passed to ``obtain_from_char`` as ``order - 1`` and ``order``.
        gap: passed through to ``obtain_from_char``.
        reverse: passed through to ``obtain_from_char``.

    Returns:
        ``(dm_train, dm_test)``: the train/train and train/test distance
        matrices.
    """
    from shogun import CSVFile, DNA, ManhattanWordDistance, SortWordString
    from shogun import StringCharFeatures, StringWordFeatures

    # Training side: chars -> words, then fit and apply the sorter.
    train_chars = StringCharFeatures(CSVFile(train_fname), DNA)
    train_words = StringWordFeatures(train_chars.get_alphabet())
    train_words.obtain_from_char(train_chars, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preprocessor(sorter)
    train_words.apply_preprocessor()

    # Test side: same conversion, reusing the already initialised sorter.
    test_chars = StringCharFeatures(CSVFile(test_fname), DNA)
    test_words = StringWordFeatures(test_chars.get_alphabet())
    test_words.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    test_words.add_preprocessor(sorter)
    test_words.apply_preprocessor()

    metric = ManhattanWordDistance(train_words, train_words)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_words, test_words)
    test_matrix = metric.get_distance_matrix()
    return train_matrix, test_matrix
Exemple #3
0
def features_dense_io():
    """Load ``../data/fm_train_real.dat`` into a ``RealFeatures`` object.

    The file is opened for reading through ``CSVFile`` with a single space
    as delimiter.

    Returns:
        The populated ``RealFeatures`` instance.
    """
    from shogun import CSVFile, RealFeatures

    dense = RealFeatures()
    source = CSVFile("../data/fm_train_real.dat", "r")
    source.set_delimiter(" ")
    dense.load(source)
    return dense
Exemple #4
0
def classifier_gpbtsvm(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       width=2.1,
                       C=1,
                       epsilon=1e-5):
    """Train a GPBTSVM with a Gaussian kernel and classify the test set.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        label_fname: CSV file read into the training ``BinaryLabels``.
        width: third argument handed to the ``GaussianKernel`` constructor.
        C: first argument handed to the ``GPBTSVM`` constructor.
        epsilon: value passed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``, or ``None`` when
        ``GPBTSVM`` cannot be imported from shogun.
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile
    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        # Return instead of exit(0): a missing optional solver must not
        # terminate the calling interpreter, and the other examples in this
        # file handle the same situation with a plain return.
        return

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))
    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
Exemple #5
0
def metric_lmnn(train_fname=traindat,
                test_fname=testdat,
                label_train_fname=label_traindat,
                k=3):
    """Learn an LMNN distance and use it for KNN classification.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        label_train_fname: CSV file read into the ``MulticlassLabels``.
        k: passed both to ``LMNN`` and to ``KNN``.

    Returns:
        ``(lmnn, output)`` where ``output`` holds the labels KNN assigns to
        the test features, or ``None`` if the shogun import fails.
    """
    try:
        from shogun import CSVFile, KNN, LMNN, MulticlassLabels, RealFeatures
    except ImportError:
        return

    # Wrap the CSV data in Shogun containers.
    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    # Fit LMNN and take the distance it produces.
    metric_learner = LMNN(train_features, train_labels, k)
    metric_learner.train()
    learned_distance = metric_learner.get_distance()

    # Classify the test data with KNN on top of that distance.
    neighbours = KNN(k, learned_distance, train_labels)
    neighbours.train()
    predicted_labels = neighbours.apply(test_features).get_labels()

    return metric_learner, predicted_labels
def features_dense_io():
    """Read ``../data/fm_train_real.dat`` into dense real-valued features.

    Returns:
        A ``RealFeatures`` object loaded from the space-delimited file.
    """
    from shogun import CSVFile, RealFeatures

    loaded = RealFeatures()
    reader = CSVFile("../data/fm_train_real.dat", "r")
    reader.set_delimiter(" ")
    loaded.load(reader)
    return loaded
Exemple #7
0
def labels_io():
    """Load ``../data/label_train_regression.dat`` into ``RegressionLabels``.

    The file is opened for reading through ``CSVFile`` with a single space
    as delimiter.

    Returns:
        The populated ``RegressionLabels`` instance.
    """
    from shogun import CSVFile, RegressionLabels

    regression_labels = RegressionLabels()
    source = CSVFile("../data/label_train_regression.dat", "r")
    source.set_delimiter(" ")
    regression_labels.load(source)
    return regression_labels
Exemple #8
0
def kernel_auc(train_fname=traindat, label_fname=label_traindat, width=1.7):
    """Build an AUCKernel on top of a Gaussian sub-kernel.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        label_fname: CSV file read into the ``BinaryLabels`` handed to
            ``setup_auc_maximization``.
        width: third argument given to the ``GaussianKernel`` constructor.

    Returns:
        The configured ``AUCKernel``.
    """
    from shogun import AUCKernel, BinaryLabels, CSVFile
    from shogun import GaussianKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    gaussian = GaussianKernel(train_features, train_features, width)

    auc_kernel = AUCKernel(0, gaussian)
    auc_kernel.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
    # The kernel matrix is computed here but only the kernel is returned.
    auc_kernel.get_kernel_matrix()
    return auc_kernel
Exemple #9
0
def distance_chisquare(train_fname=traindat, test_fname=testdat):
    """Compute ChiSquareDistance matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import ChiSquareDistance, CSVFile, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    metric = ChiSquareDistance(train_features, train_features)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_features, test_features)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
Exemple #10
0
def kernel_cauchy(train_fname=traindat, test_fname=testdat, sigma=1.0):
    """Compute CauchyKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        sigma: third argument given to the ``CauchyKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CauchyKernel, CSVFile, EuclideanDistance, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    # The kernel is built around a Euclidean distance on the training data.
    euclidean = EuclideanDistance(train_features, train_features)
    cauchy = CauchyKernel(train_features, train_features, sigma, euclidean)
    train_matrix = cauchy.get_kernel_matrix()

    cauchy.init(train_features, test_features)
    test_matrix = cauchy.get_kernel_matrix()

    return train_matrix, test_matrix, cauchy
Exemple #11
0
def distance_canberra(train_fname=traindat, test_fname=testdat):
    """Compute CanberraMetric matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import CanberraMetric, CSVFile, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    metric = CanberraMetric(train_features, train_features)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_features, test_features)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
Exemple #12
0
def kernel_gaussian(train_fname=traindat, test_fname=testdat, width=1.3):
    """Compute GaussianKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        width: third argument given to the ``GaussianKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CSVFile, GaussianKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    gaussian = GaussianKernel(train_features, train_features, width)
    train_matrix = gaussian.get_kernel_matrix()

    gaussian.init(train_features, test_features)
    test_matrix = gaussian.get_kernel_matrix()

    return train_matrix, test_matrix, gaussian
Exemple #13
0
def kernel_linear_byte(train_fname=traindat, test_fname=testdat):
    """Evaluate a LinearKernel on ``ByteFeatures`` loaded from CSV files.

    The train/train and train/test kernel matrices are both computed, but
    only the kernel object is returned.

    Args:
        train_fname: CSV file read into the training ``ByteFeatures``.
        test_fname: CSV file read into the test ``ByteFeatures``.

    Returns:
        The ``LinearKernel``, left initialised on (train, test).
    """
    from shogun import ByteFeatures, CSVFile, LinearKernel

    train_features = ByteFeatures(CSVFile(train_fname))
    test_features = ByteFeatures(CSVFile(test_fname))

    linear = LinearKernel(train_features, train_features)
    linear.get_kernel_matrix()

    linear.init(train_features, test_features)
    linear.get_kernel_matrix()

    return linear
Exemple #14
0
def kernel_sigmoid(train_fname=traindat, test_fname=testdat, size_cache=10,
                   gamma=1.2, coef0=1.3):
    """Compute SigmoidKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        size_cache: third argument given to the ``SigmoidKernel`` constructor.
        gamma: fourth argument given to the ``SigmoidKernel`` constructor.
        coef0: fifth argument given to the ``SigmoidKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CSVFile, RealFeatures, SigmoidKernel

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    sigmoid = SigmoidKernel(train_features, train_features,
                            size_cache, gamma, coef0)
    train_matrix = sigmoid.get_kernel_matrix()

    sigmoid.init(train_features, test_features)
    test_matrix = sigmoid.get_kernel_matrix()

    return train_matrix, test_matrix, sigmoid
Exemple #15
0
def distance_minkowski(train_fname=traindat, test_fname=testdat, k=3):
    """Compute MinkowskiMetric matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        k: third argument given to the ``MinkowskiMetric`` constructor.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import CSVFile, MinkowskiMetric, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    metric = MinkowskiMetric(train_features, train_features, k)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_features, test_features)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
Exemple #16
0
def kernel_distance(train_fname=traindat, test_fname=testdat, width=1.7):
    """Compute DistanceKernel matrices for train/train and train/test.

    The kernel wraps a ``EuclideanDistance`` and is first evaluated on the
    training features against themselves, then re-initialised on
    (train, test).

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        width: third argument given to the ``DistanceKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance()
    # Build on (train, train): constructing on (train, test) made km_train a
    # duplicate of km_test instead of the training kernel matrix.
    kernel = DistanceKernel(feats_train, feats_train, width, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_gaussian_shift(train_fname=traindat, test_fname=testdat,
                          width=1.8, max_shift=2, shift_step=1):
    """Compute GaussianShiftKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        width: third argument given to the ``GaussianShiftKernel`` constructor.
        max_shift: fourth argument given to the constructor.
        shift_step: fifth argument given to the constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CSVFile, GaussianShiftKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    shifted = GaussianShiftKernel(train_features, train_features,
                                  width, max_shift, shift_step)
    train_matrix = shifted.get_kernel_matrix()

    shifted.init(train_features, test_features)
    test_matrix = shifted.get_kernel_matrix()

    return train_matrix, test_matrix, shifted
Exemple #18
0
def kernel_exponential(train_fname=traindat, test_fname=testdat, tau_coef=1.0):
    """Compute ExponentialKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        tau_coef: third argument given to the ``ExponentialKernel``
            constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CSVFile, EuclideanDistance, ExponentialKernel
    from shogun import RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_features, train_features)
    # The trailing literal 10 is the constructor's fifth positional argument.
    exponential = ExponentialKernel(
        train_features, train_features, tau_coef, euclidean, 10)
    train_matrix = exponential.get_kernel_matrix()

    exponential.init(train_features, test_features)
    test_matrix = exponential.get_kernel_matrix()

    return train_matrix, test_matrix, exponential
def classifier_libsvmoneclass(train_fname=traindat, test_fname=testdat,
                              width=2.1, C=1, epsilon=1e-5):
    """Train a LibSVMOneClass model with a Gaussian kernel and apply it.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        width: third argument given to the ``GaussianKernel`` constructor.
        C: first argument given to the ``LibSVMOneClass`` constructor.
        epsilon: value passed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.
    """
    from shogun import CSVFile, GaussianKernel, LibSVMOneClass, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    gaussian = GaussianKernel(train_features, train_features, width)

    one_class = LibSVMOneClass(C, gaussian)
    one_class.set_epsilon(epsilon)
    one_class.train()

    predicted = one_class.apply(test_features)
    return predicted, one_class, predicted.get_labels()
Exemple #20
0
def kernel_rationalquadratic(train_fname=traindat, test_fname=testdat,
                             shift_coef=1.0):
    """Compute RationalQuadraticKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        shift_coef: third argument given to the ``RationalQuadraticKernel``
            constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CSVFile, EuclideanDistance, RationalQuadraticKernel
    from shogun import RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_features, train_features)

    rational = RationalQuadraticKernel(
        train_features, train_features, shift_coef, euclidean)
    train_matrix = rational.get_kernel_matrix()

    rational.init(train_features, test_features)
    test_matrix = rational.get_kernel_matrix()

    return train_matrix, test_matrix, rational
Exemple #21
0
def kernel_power(train_fname=traindat, test_fname=testdat, degree=2.0):
    """Compute PowerKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        degree: third argument given to the ``PowerKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import CSVFile, EuclideanDistance, PowerKernel, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_features, train_features)

    power = PowerKernel(train_features, train_features, degree, euclidean)
    train_matrix = power.get_kernel_matrix()

    power.init(train_features, test_features)
    test_matrix = power.get_kernel_matrix()

    return train_matrix, test_matrix, power
def distance_normsquared(train_fname=traindat, test_fname=testdat):
    """Compute EuclideanDistance matrices with the square root disabled.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import CSVFile, EuclideanDistance, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    metric = EuclideanDistance(train_features, train_features)
    metric.set_disable_sqrt(True)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_features, test_features)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
Exemple #23
0
def kernel_chi2(train_fname=traindat,
                test_fname=testdat,
                width=1.4,
                size_cache=10):
    """Compute Chi2Kernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        width: third argument given to the ``Chi2Kernel`` constructor.
        size_cache: fourth argument given to the ``Chi2Kernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    # NormOne is imported alongside the others although nothing below uses it.
    from shogun import Chi2Kernel, CSVFile, NormOne, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    chi2 = Chi2Kernel(train_features, train_features, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(train_features, test_features)
    test_matrix = chi2.get_kernel_matrix()

    return train_matrix, test_matrix, chi2
Exemple #24
0
def kernel_anova(train_fname=traindat,
                 test_fname=testdat,
                 cardinality=2,
                 size_cache=10):
    """Compute ANOVAKernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        cardinality: third argument given to the ``ANOVAKernel`` constructor.
        size_cache: fourth argument given to the ``ANOVAKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import ANOVAKernel, CSVFile, RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    anova = ANOVAKernel(train_features, train_features,
                        cardinality, size_cache)
    train_matrix = anova.get_kernel_matrix()

    anova.init(train_features, test_features)
    test_matrix = anova.get_kernel_matrix()

    return train_matrix, test_matrix, anova
Exemple #25
0
def kernel_linear(train_fname=traindat, test_fname=testdat, scale=1.2):
    """Compute normalised LinearKernel matrices for train/train and train/test.

    An ``AvgDiagKernelNormalizer(scale)`` is attached before the kernel is
    first initialised.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        scale: argument given to the ``AvgDiagKernelNormalizer`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import AvgDiagKernelNormalizer, CSVFile, LinearKernel
    from shogun import RealFeatures

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    linear = LinearKernel()
    linear.set_normalizer(AvgDiagKernelNormalizer(scale))

    linear.init(train_features, train_features)
    train_matrix = linear.get_kernel_matrix()

    linear.init(train_features, test_features)
    test_matrix = linear.get_kernel_matrix()

    return train_matrix, test_matrix, linear
def converter_multidimensionalscaling(data_fname):
    """Check that a 2-D MultidimensionalScaling embedding preserves distances.

    The data is embedded with ``MultidimensionalScaling`` (target dimension
    2, landmark mode off) and the Euclidean distance matrices before and
    after the embedding are compared.

    Args:
        data_fname: CSV file read into the ``RealFeatures`` to embed.

    Returns:
        A boolean-like value that is true when the norm of the difference
        between the two distance matrices, relative to the norm of the
        original matrix, is below ``1e-6``. ``None`` is returned when numpy
        or the required shogun classes cannot be imported.
    """
    # Guard only the imports, so that an ImportError raised while computing
    # is not silently reported as a missing dependency.
    try:
        import numpy
        from shogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile
    except ImportError:
        print('No Eigen3 available')
        return None

    features = RealFeatures(CSVFile(data_fname))

    distance_before = EuclideanDistance()
    distance_before.init(features, features)

    converter = MultidimensionalScaling()
    converter.set_target_dim(2)
    converter.set_landmark(False)
    embedding = converter.apply(features)

    distance_after = EuclideanDistance()
    distance_after.init(embedding, embedding)

    distance_matrix_after = distance_after.get_distance_matrix()
    distance_matrix_before = distance_before.get_distance_matrix()

    relative_error = (
        numpy.linalg.norm(distance_matrix_after - distance_matrix_before)
        / numpy.linalg.norm(distance_matrix_before))
    return relative_error < 1e-6
def features_string_file(directory, fname):
    """Load RAWBYTE string features from a directory, then from a file.

    The same ``StringCharFeatures`` object is first filled through
    ``load_from_directory`` and afterwards through ``load`` with a
    ``CSVFile`` (one string per line).

    Args:
        directory: directory handed to ``load_from_directory``.
        fname: file opened through ``CSVFile`` and handed to ``load``.

    Returns:
        ``(f.get_features(), f)`` for the loaded feature object ``f``.
    """
    from shogun import CSVFile, RAWBYTE, StringCharFeatures

    # First source: every string found in the directory.
    strings = StringCharFeatures(RAWBYTE)
    strings.load_from_directory(directory)

    # Statistics that can be inspected at this point include
    # get_max_vector_length(), get_num_vectors(), get_vector_length(0),
    # get_feature(0, i) and get_feature_vector(0).

    # Second source: a file holding one string per line.
    line_file = CSVFile(fname)
    strings.load(line_file)

    # A FASTA file could be read instead via load_fasta('fasta.fa').
    return strings.get_features(), strings
Exemple #28
0
def multiclass_c45classifiertree(train=traindat,
                                 test=testdat,
                                 labels=label_traindat,
                                 ft=feattypes):
    """Train, prune and apply a C4.5 classifier tree.

    The training data is randomly split: one third is held out as a
    validation subset for pruning, the remaining two thirds are used to
    grow the tree.

    Args:
        train: CSV file read into the training ``RealFeatures``.
        test: CSV file read into the test ``RealFeatures``.
        labels: CSV file read into the training ``MulticlassLabels``.
        ft: value passed to ``set_feature_types``.

    Returns:
        ``(c, output, output_certainty)``: the tree, the labels predicted
        for the test features and the result of ``get_certainty_vector``.
        ``None`` is returned when shogun or numpy cannot be imported.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    # Half-open slices that share one boundary: every index lands in exactly
    # one subset (the previous 1-based offsets dropped two samples).
    num_validation = subset.size // 3
    vsubset = subset[:num_validation]
    trsubset = subset[num_validation:]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
Exemple #29
0
def classifier_svmsgd(train_fname=traindat, test_fname=testdat,
                      label_fname=label_traindat, C=0.9, num_threads=1,
                      num_iter=5):
    """Train an SVMSGD classifier on CSV data and apply it to the test set.

    Args:
        train_fname: CSV file read into the training ``RealFeatures``.
        test_fname: CSV file read into the test ``RealFeatures``.
        label_fname: CSV file read into the training ``BinaryLabels``.
        C: first argument given to the ``SVMSGD`` constructor.
        num_threads: accepted but not used by this function.
        num_iter: value passed to ``set_epochs``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.
    """
    # SparseRealFeatures is imported alongside the others although nothing
    # below uses it.
    from shogun import BinaryLabels, RealFeatures, SparseRealFeatures
    from shogun import CSVFile, SVMSGD

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    sgd = SVMSGD(C, train_features, train_labels)
    sgd.set_epochs(num_iter)
    sgd.train()

    # The bias and weight vector are queried after training but are not
    # part of the returned tuple.
    sgd.get_bias()
    sgd.get_w()

    predicted = sgd.apply(test_features)
    return predicted, sgd, predicted.get_labels()
def distance_sparseeuclidean(train_fname=traindat, test_fname=testdat):
    """Compute SparseEuclideanDistance matrices for train/train and train/test.

    Each CSV file is read as dense ``RealFeatures`` and converted into
    ``SparseRealFeatures`` through ``obtain_from_simple``.

    Args:
        train_fname: CSV file holding the training data.
        test_fname: CSV file holding the test data.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import CSVFile, RealFeatures
    from shogun import SparseEuclideanDistance, SparseRealFeatures

    dense_train = RealFeatures(CSVFile(train_fname))
    sparse_train = SparseRealFeatures()
    sparse_train.obtain_from_simple(dense_train)

    dense_test = RealFeatures(CSVFile(test_fname))
    sparse_test = SparseRealFeatures()
    sparse_test.obtain_from_simple(dense_test)

    metric = SparseEuclideanDistance(sparse_train, sparse_train)
    train_matrix = metric.get_distance_matrix()

    metric.init(sparse_train, sparse_test)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
Exemple #31
0
def kernel_combined_custom_poly(train_fname=traindat,
                                test_fname=testdat,
                                train_label_fname=label_traindat):
    """Train and apply a LibSVM on a CombinedKernel (custom + poly).

    Two combined kernels are assembled, one for training and one for
    prediction. Each consists of a ``CustomKernel`` wrapping the matrix of a
    ``PolyKernel(10, 3)`` followed by a ``PolyKernel(10, 2)`` sub-kernel that
    works on the appended ``RealFeatures``.

    Args:
        train_fname: CSV file with the training features.
        test_fname: CSV file with the test features.
        train_label_fname: CSV file read into the ``BinaryLabels``.

    Returns:
        ``(km_train, kernel)`` where ``kernel`` is the second (prediction)
        combined kernel, initialised on (train, test) features, and
        ``km_train`` is the matrix obtained from that kernel.
    """
    from shogun import BinaryLabels, CombinedFeatures, RealFeatures
    from shogun import CombinedKernel, CustomKernel, PolyKernel
    from shogun import CSVFile, LibSVM

    # ---- training kernel -------------------------------------------------
    train_kernel = CombinedKernel()
    combined_train = CombinedFeatures()

    # Precompute a poly kernel matrix and wrap it as a custom kernel.
    custom_train_feats = RealFeatures(CSVFile(train_fname))
    precomputed = PolyKernel(10, 3)
    precomputed.init(custom_train_feats, custom_train_feats)
    train_kernel.append_kernel(CustomKernel(precomputed.get_kernel_matrix()))

    # Second sub-kernel operates directly on the appended features.
    poly_train_feats = RealFeatures(CSVFile(train_fname))
    combined_train.append_feature_obj(poly_train_feats)
    train_kernel.append_kernel(PolyKernel(10, 2))

    train_kernel.init(combined_train, combined_train)

    binary_labels = BinaryLabels(CSVFile(train_label_fname))
    machine = LibSVM(1.0, train_kernel, binary_labels)
    machine.train()

    # ---- prediction kernel -----------------------------------------------
    test_kernel = CombinedKernel()
    combined_test = CombinedFeatures()

    custom_test_feats = RealFeatures(CSVFile(test_fname))
    precomputed = PolyKernel(10, 3)
    precomputed.init(custom_train_feats, custom_test_feats)
    test_kernel.append_kernel(CustomKernel(precomputed.get_kernel_matrix()))

    poly_test_feats = RealFeatures(CSVFile(test_fname))
    combined_test.append_feature_obj(poly_test_feats)
    test_kernel.append_kernel(PolyKernel(10, 2))
    test_kernel.init(combined_train, combined_test)

    # Swap in the prediction kernel; the result of apply() is not returned.
    machine.set_kernel(test_kernel)
    machine.apply()

    kernel_matrix = test_kernel.get_kernel_matrix()
    return kernel_matrix, test_kernel