def prepare_data(use_toy=True):
    """Load a multiclass train/test data set for the examples.

    When ``use_toy`` is false and the optdigits ``.mat`` file exists, the
    matrix stored under its ``'int0'`` key is loaded as floats: every row
    but the last is used as features, the last row as labels, and the
    columns are split in half into a training and a test part.  Otherwise
    the toy files under ``../data`` are loaded, for which no test labels
    are available.

    Returns:
        list: ``[traindat, label_traindat, testdat, label_testdat]``;
        ``label_testdat`` is ``None`` for the toy data.
    """
    from os.path import exists
    from tools.load import LoadMatrix
    lm = LoadMatrix()

    optdigits_path = '../data/../mldata/uci-20070111-optdigits.mat'
    if not use_toy and exists(optdigits_path):
        from scipy.io import loadmat

        mat = loadmat(optdigits_path)['int0'].astype(float)
        X = mat[:-1, :]
        Y = mat[-1, :]
        # Floor division keeps the split index an integer; true division
        # (Python 3) would yield a float and the slices below would raise.
        isplit = X.shape[1] // 2
        traindat = X[:, :isplit]
        label_traindat = Y[:isplit]
        testdat = X[:, isplit:]
        label_testdat = Y[isplit:]
    else:
        traindat = lm.load_numbers('../data/fm_train_real.dat')
        testdat = lm.load_numbers('../data/fm_test_real.dat')
        label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
        label_testdat = None

    return [traindat, label_traindat, testdat, label_testdat]
# In this example the Histogram algorithm object computes a histogram over all
# 16bit unsigned integers in the features.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()
traindna=lm.load_dna('../data/fm_train_dna.dat')
cubedna=lm.load_cubes('../data/fm_train_cube.dat')
parameter_list=[[traindna,cubedna,3,0,'n'],[traindna,cubedna,4,0,'n']]

def distribution_histogram(fm_train=traindna, fm_cube=cubedna, order=3,
		gap=0, reverse='n'):
	"""Prepare sorted WORD string features from DNA data (static interface).

	Registers the SORTWORDSTRING preprocessor, loads ``fm_train`` as DNA
	string features, converts them from CHAR to WORD using ``order``,
	``gap`` and ``reverse``, and attaches the preprocessor to the TRAIN
	features.  ``fm_cube`` is accepted but not used, and nothing is
	returned.
	"""
	# Creating the histogram distribution itself is disabled:
	#	sg('new_distribution', 'HISTOGRAM')
	sg('add_preproc', 'SORTWORDSTRING')

	sg('set_features', 'TRAIN', fm_train, 'DNA')
	conversion=('TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD',
		order, order-1, gap, reverse)
	sg('convert', *conversion)
	sg('attach_preproc', 'TRAIN')
#	sg('train_distribution')
#	histo=sg('get_histogram')

#	num_examples=11
#	num_param=sg('get_histogram_num_model_parameters')
#	for i in xrange(num_examples):
#		for j in xrange(num_param):
#			sg('get_log_derivative %d %d' % (j, i))

#	sg('get_log_likelihood')
#	return sg('get_log_likelihood_sample')
###########################################################################
# linear kernel on byte features
###########################################################################
from tools.load import LoadMatrix
from numpy import ubyte
lm=LoadMatrix()

traindat = ubyte(lm.load_numbers('../data/fm_train_byte.dat'))
testdat = ubyte(lm.load_numbers('../data/fm_test_byte.dat'))

parameter_list=[[traindat,testdat],[traindat,testdat]]

def kernel_linear_byte_modular(fm_train_byte=traindat,fm_test_byte=testdat):
	"""Build a linear kernel on byte features and evaluate it twice.

	The kernel matrix is computed once on (train, train) and once on
	(train, test); neither matrix is returned.  The kernel object, left
	initialised on (train, test), is the return value.
	"""
	from shogun.Features import ByteFeatures
	from shogun.Kernel import LinearKernel

	train_features=ByteFeatures(fm_train_byte)
	test_features=ByteFeatures(fm_test_byte)

	linear=LinearKernel(train_features, train_features)
	# Train-vs-train matrix: computed, then discarded.
	linear.get_kernel_matrix()

	# Train-vs-test matrix: computed, then discarded.
	linear.init(train_features, test_features)
	linear.get_kernel_matrix()
	return linear

if __name__=='__main__':
	# Script entry point: announce the example and run it with the first
	# parameter set from parameter_list.
	print('LinearByte')
	kernel_linear_byte_modular(*parameter_list[0])
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()

ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))

parameter_list = [[ground_truth,predicted]]

def evaluation_prcevaluation_modular(ground_truth, predicted):
	"""Evaluate ``predicted`` against ``ground_truth`` with PRCEvaluation.

	Both arguments are wrapped in ``Labels`` objects before evaluation.

	Returns a pair: the value of ``get_PRC()`` and the value of
	``get_auPRC()`` of the evaluator after the evaluation.
	"""
	from shogun.Evaluation import PRCEvaluation
	from shogun.Features import Labels

	truth=Labels(ground_truth)
	guess=Labels(predicted)

	prc=PRCEvaluation()
	prc.evaluate(guess, truth)

	curve=prc.get_PRC()
	area=prc.get_auPRC()
	return curve, area


if __name__=='__main__':
	# Script entry point.  print() with a single parenthesised string works
	# on both Python 2 and Python 3; the bare print statement is a syntax
	# error on Python 3.
	print('PRCEvaluation')
	evaluation_prcevaluation_modular(*parameter_list[0])

	reverse=False

	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_dna)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	histo=Histogram(feats)
	histo.train()

	histo.get_histogram()

	num_examples=feats.get_num_vectors()
	num_param=histo.get_num_model_parameters()
	#for i in xrange(num_examples):
	#	for j in xrange(num_param):
	#		histo.get_log_derivative(j, i)

	histo.get_log_likelihood()
	histo.get_log_likelihood_sample()

###########################################################################
# call functions
###########################################################################

if __name__=='__main__':
	# Script entry point: load the DNA training strings into the
	# module-level name fm_dna, then call histogram() without arguments.
	# NOTE(review): histogram() is not defined in the visible part of the
	# file; presumably it reads fm_dna as a global -- confirm.
	from tools.load import LoadMatrix
	lm=LoadMatrix()
	fm_dna=lm.load_dna('../data/fm_train_dna.dat')
	histogram()
from tools.load import LoadMatrix
lm=LoadMatrix()

traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

parameter_list = [[traindna,testdna,testdat,3,0,False],[traindna,testdna,testdat,4,0,False]]

def distance_manhattenword_modular (fm_train_dna=traindna ,fm_test_dna=testdna,fm_test_real=testdat,order=3,gap=0,reverse=False):
	"""Prepare sorted word features from DNA strings for train and test data.

	NOTE(review): the visible body stops after the test features have been
	preprocessed; ManhattanWordDistance is imported but not used here,
	fm_test_real is not used, and nothing is returned.  The remainder of
	the example appears to be missing -- confirm against the original.
	"""

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.Preprocessor import SortWordString
	from shogun.Distance import ManhattanWordDistance

	# Training data: DNA char strings -> word features, then sort each
	# word string with the SortWordString preprocessor.
	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	# Test data: same conversion; the preprocessor initialised on the
	# training features is reused.
	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
	feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()
from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
parameter_list=[[traindat,testdat,1.4,10,True],[traindat,testdat,1.5,11,True]]

def preproc_prunevarsubmean (fm_train_real=traindat,fm_test_real=testdat,
		 width=1.4,size_cache=10,divide_by_std=True):
	"""Compute CHI2 kernel matrices with the PRUNEVARSUBMEAN preprocessor.

	Uses the static ``sg`` interface: the preprocessor is registered, a
	CHI2 kernel on REAL features is set, and for TRAIN and then TEST the
	features are loaded, the preprocessor attached and the kernel matrix
	requested.  Only the TEST kernel matrix is returned.
	"""
	sg('add_preproc', 'PRUNEVARSUBMEAN', divide_by_std)
	sg('set_kernel', 'CHI2', 'REAL', size_cache, width)

	# TRAIN pass: the resulting matrix is not kept.
	sg('set_features', 'TRAIN', fm_train_real)
	sg('attach_preproc', 'TRAIN')
	sg('get_kernel_matrix', 'TRAIN')

	# TEST pass: this matrix is the return value.
	sg('set_features', 'TEST', fm_test_real)
	sg('attach_preproc', 'TEST')
	return sg('get_kernel_matrix', 'TEST')

if __name__=='__main__':
	# Script entry point.  print() with a single parenthesised string works
	# on both Python 2 and Python 3; the bare print statement is a syntax
	# error on Python 3.
	print('PruneVarSubMean')
	preproc_prunevarsubmean(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2 and the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
# 
# For more details on the SVM^light see
#  T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
#  Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
# 
# For more details on the Weighted Degree kernel see
#  G. Raetsch, S.Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
#  spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005. 

from tools.load import LoadMatrix
lm=LoadMatrix()

traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindat,testdat,label_traindat,1.1,1e-5,1],[traindat,testdat,label_traindat,1.2,1e-5,1]]

def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
	"""Import the classes used by the SVMLight example.

	If ``shogun.Classifier.SVMLight`` cannot be imported, a message is
	printed and the function returns ``None``.

	NOTE(review): only this import prologue is visible; the code that
	would use the arguments appears to be missing -- confirm against the
	original example.
	"""
	from shogun.Features import StringCharFeatures, Labels, DNA
	from shogun.Kernel import WeightedDegreeStringKernel
	try:
		from shogun.Classifier import SVMLight
	except ImportError:
		# Parenthesised so the message also parses under Python 3.
		print('No support for SVMLight available.')
		return
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm = LoadMatrix()

random.seed(17)
from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)

parameter_list = [[traindat, label_traindat, testdat, label_testdat]]


def evaluation_multiclassovrevaluation_modular(traindat, label_traindat,
                                               testdat, label_testdat):
    """Train a MulticlassLibLinear machine and evaluate it one-vs-rest with ROC.

    NOTE(review): the visible body ends after computing ``mean_roc`` and
    returns nothing; ``testdat``, ``label_testdat``,
    ``ContingencyTableEvaluation`` and ``ACCURACY`` are not used in the
    visible part.  The rest of the example appears to be missing.
    """
    from shogun.Features import MulticlassLabels
    from shogun.Evaluation import MulticlassOVREvaluation, ROCEvaluation
    from modshogun import MulticlassLibLinear, RealFeatures, ContingencyTableEvaluation, ACCURACY
    from shogun.Mathematics import Math

    # Fixed seed for Shogun's random number generator.
    Math.init_random(1)

    ground_truth_labels = MulticlassLabels(label_traindat)
    svm = MulticlassLibLinear(1.0, RealFeatures(traindat),
                              MulticlassLabels(label_traindat))
    svm.train()
    # apply() is called without arguments -- presumably this predicts on
    # the training features the machine was constructed with; confirm.
    predicted_labels = svm.apply()

    # Wrap the binary ROC evaluator in the multiclass one-vs-rest evaluator.
    binary_evaluator = ROCEvaluation()
    evaluator = MulticlassOVREvaluation(binary_evaluator)
    mean_roc = evaluator.evaluate(predicted_labels, ground_truth_labels)
    #print mean_roc
from tools.load import LoadMatrix

lm = LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')

parameter_list = [[data, 10], [data, 20]]


def converter_localtangentspacealignment_modular(data, k):
    """Run a LocalTangentSpaceAlignment converter on ``data``.

    The converter is configured with a target dimension of 1 and with
    ``k`` passed to ``set_k``.  The value returned by ``apply`` is
    discarded; the ``RealFeatures`` object built from ``data`` is what this
    function returns.
    """
    from shogun.Converter import LocalTangentSpaceAlignment
    from shogun.Features import RealFeatures

    real_feats = RealFeatures(data)

    ltsa = LocalTangentSpaceAlignment()
    ltsa.set_target_dim(1)
    ltsa.set_k(k)
    ltsa.apply(real_feats)

    return real_feats


if __name__ == '__main__':
    # Script entry point.  print() with a single parenthesised string works
    # on both Python 2 and Python 3; the bare print statement is a syntax
    # error on Python 3.
    print('LocalTangentSpaceAlignment')
    converter_localtangentspacealignment_modular(*parameter_list[0])
# Example #11
# 0
from tools.load import LoadMatrix

lm = LoadMatrix()
fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')

parameter_list = [[
    fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1,
    0.001, 1.5
],
                  [
                      fm_train_real, fm_test_real, label_train_multiclass, 5,
                      1.2, 1e-2, 1, 0.001, 2
                  ]]


def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon,
                           mkl_norm):
    """Set up the containers for a multiclass MKL example.

    NOTE(review): only the setup is visible -- an empty combined kernel,
    empty combined train/test feature containers and the first pair of
    real-valued sub-features.  Nothing is trained or returned here, and
    all parameters other than the two feature matrices are unused in the
    visible part; the remainder appears to be missing.
    """

    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    # Empty containers; sub-kernels and sub-features are presumably
    # appended further down in the original example.
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
# Example #12
# 0
from tools.load import LoadMatrix
lm=LoadMatrix()

traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')

parameter_list = [[traindat,testdat,label_traindat,2.1,1,1e-5],[traindat,testdat,label_traindat,2.2,1,1e-5]]

def classifier_libsvm_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
	"""Train a LibSVM with a Gaussian kernel and apply it after re-initialising on test data.

	The predicted label vector is printed as a side effect.

	Returns:
		tuple: ``(predictions, svm, predicted_labels)`` where
		``predictions`` is the object returned by ``svm.apply()`` and
		``predicted_labels`` is its ``get_labels()`` vector.
	"""
	from shogun.Features import RealFeatures, BinaryLabels
	from shogun.Kernel import GaussianKernel
	from shogun.Classifier import LibSVM

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	kernel=GaussianKernel(feats_train, feats_train, width)
	labels=BinaryLabels(label_train_twoclass)

	svm=LibSVM(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.train()

	# Re-initialise the kernel on (train, test) before applying the machine.
	kernel.init(feats_train, feats_test)

	# Queried as part of the example; the values are not part of the result.
	svm.get_support_vectors()
	svm.get_alphas()

	# Apply once and reuse the result instead of classifying twice.
	predictions=svm.apply()
	predicted_labels=predictions.get_labels()
	# print() form so the line also parses under Python 3.
	print(predicted_labels)
	return predictions, svm, predicted_labels
from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
cubedna = lm.load_cubes('../data/fm_train_cube.dat')
parameter_list = [[traindna, cubedna, 3, 0, 'n'],
                  [traindna, cubedna, 4, 0, 'n']]


def distribution_histogram(fm_train=traindna, fm_cube=cubedna, order=3,
                           gap=0, reverse='n'):
    """Prepare sorted WORD string features from DNA data (static interface).

    Registers the SORTWORDSTRING preprocessor, loads ``fm_train`` as DNA
    string features, converts them from CHAR to WORD using ``order``,
    ``gap`` and ``reverse``, and attaches the preprocessor to the TRAIN
    features.  ``fm_cube`` is accepted but not used, and nothing is
    returned.
    """
    # Creating the histogram distribution itself is disabled:
    #	sg('new_distribution', 'HISTOGRAM')
    sg('add_preproc', 'SORTWORDSTRING')

    sg('set_features', 'TRAIN', fm_train, 'DNA')
    conversion = ('TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD',
                  order, order - 1, gap, reverse)
    sg('convert', *conversion)
    sg('attach_preproc', 'TRAIN')


#	sg('train_distribution')
#	histo=sg('get_histogram')

#	num_examples=11
#	num_param=sg('get_histogram_num_model_parameters')
#	for i in xrange(num_examples):
#		for j in xrange(num_param):
# Example #14
# 0
# In this example a hidden markov model with 3 states and 6 transitions is trained
# on a string data set. After calling the constructor of the HMM class specifying
# the number of states and transitions the model is trained. Via the Baum-Welch
# algorithm the optimal transition and emission probabilities are estimated. The
# best path, i.e. the path with highest probability given the model can then be
# calculated using get_best_path_state.

from tools.load import LoadMatrix
lm = LoadMatrix()
data = lm.load_cubes('../data/fm_train_cube.dat')

parameter_list = [[data, 1, 64, 1e-5, 2, 0, False, 5],
                  [data, 3, 6, 1e-1, 1, 0, False, 2]]


def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse,
                             num_examples):
    """Train an HMM on word features derived from CUBE strings.

    NOTE(review): the visible body ends after ``num_param`` is computed and
    returns nothing; the remainder of the example appears to be missing.
    """
    from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
    from shogun.Distribution import HMM, BW_NORMAL

    # CUBE char strings -> word features.
    charfeat = StringCharFeatures(CUBE)
    charfeat.set_features(fm_cube)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    # The num_examples argument is overwritten with the actual number of
    # feature vectors, so the value passed by the caller is ignored.
    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
# Example #15
# 0
from tools.load import LoadMatrix
from numpy import random

lm = LoadMatrix()

random.seed(17)
ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2

parameter_list = [[ground_truth, predicted]]


def evaluation_multiclassaccuracy_modular(ground_truth, predicted):
    """Evaluate ``predicted`` against ``ground_truth`` with MulticlassAccuracy.

    Both arguments are wrapped in ``MulticlassLabels`` objects; the value
    returned by the evaluator's ``evaluate`` call is passed back unchanged.
    """
    from shogun.Evaluation import MulticlassAccuracy
    from shogun.Features import MulticlassLabels

    truth = MulticlassLabels(ground_truth)
    guess = MulticlassLabels(predicted)

    scorer = MulticlassAccuracy()
    return scorer.evaluate(guess, truth)


if __name__ == '__main__':
    # Script entry point: announce the example and run it with the first
    # parameter set from parameter_list.
    print('MulticlassAccuracy')
    evaluation_multiclassaccuracy_modular(*parameter_list[0])
from tools.load import LoadMatrix
from numpy import ushort
from sg import sg
lm=LoadMatrix()

# Training and test word features come from their own files; previously the
# test file was loaded for both, so TRAIN and TEST were identical.
trainword=ushort(lm.load_numbers('../data/fm_train_word.dat'))
testword=ushort(lm.load_numbers('../data/fm_test_word.dat'))
# Each entry: [train features, test features, size_cache, scale].
parameter_list=[[trainword,testword,10,1.4],
	       [trainword,testword,11,1.5]]

def kernel_linearword (fm_train_word=trainword,fm_test_word=testword,
		       size_cache=10, scale=1.4):
	"""Compute LINEAR kernel matrices on WORD features (static interface).

	TRAIN and TEST features are set, a LINEAR/WORD kernel is configured
	with ``size_cache`` and ``scale``, and the TRAIN and then the TEST
	kernel matrix are requested.  Only the TEST matrix is returned.
	"""
	for target, matrix in (('TRAIN', fm_train_word), ('TEST', fm_test_word)):
		sg('set_features', target, matrix)
	sg('set_kernel', 'LINEAR', 'WORD', size_cache, scale)
	# The TRAIN matrix is requested but not kept.
	sg('get_kernel_matrix', 'TRAIN')
	return sg('get_kernel_matrix', 'TEST')

if __name__=='__main__':
	# Script entry point: announce the example and run it with the first
	# parameter set from parameter_list.
	print('LinearWord')
	kernel_linearword(*parameter_list[0])
###########################################################################
# anova kernel
###########################################################################
from tools.load import LoadMatrix
from numpy import double
lm=LoadMatrix()

traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))
parameter_list = [[traindat,testdat,2,10], [traindat,testdat,5,10]]

def kernel_anova_modular (fm_train_real=traindat,fm_test_real=testdat,cardinality=2, size_cache=10):
	"""Compute ANOVA kernel matrices on real-valued train and test features.

	Before the matrices are requested, ``compute_rec1`` and
	``compute_rec2`` are called for every pair of training vectors; their
	results are not used (a comparison of the two was once printed here
	for debugging).

	Returns the tuple ``(km_train, km_test, kernel)``.
	"""
	from shogun.Features import RealFeatures
	from shogun.Kernel import ANOVAKernel

	train_features=RealFeatures(fm_train_real)
	test_features=RealFeatures(fm_test_real)

	anova=ANOVAKernel(train_features, train_features, cardinality, size_cache)

	# Exercise both recursive kernel computations on every training pair.
	n_train=train_features.get_num_vectors()
	for row in range(n_train):
		for col in range(n_train):
			anova.compute_rec1(row,col)
			anova.compute_rec2(row,col)

	train_matrix=anova.get_kernel_matrix()
	anova.init(train_features, test_features)
	test_matrix=anova.get_kernel_matrix()
	return train_matrix, test_matrix, anova
# Example #18
# 0
#!/usr/bin/env python
from tools.load import LoadMatrix
lm = LoadMatrix()

traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindna, testdna, 3, 0, False],
                  [traindna, testdna, 3, 0, False]]


def distance_canberraword(fm_train_dna=traindna,
                          fm_test_dna=testdna,
                          order=3,
                          gap=0,
                          reverse=False):
    """Prepare sorted word features from DNA strings for a Canberra word distance.

    NOTE(review): the visible body stops right after the test char
    features are filled; CanberraWordDistance is imported but not used
    here and nothing is returned.  The remainder appears to be missing.
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import CanberraWordDistance

    # Training data: DNA char strings -> word features, then sort each
    # word string with the SortWordString preprocessor.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    # Test data: only the char features are set up in the visible part.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
from tools.load import LoadMatrix
import numpy

lm = LoadMatrix()
data = lm.load_numbers("../data/fm_train_real.dat")

parameter_list = [[data]]


def converter_multidimensionalscaling_modular(data):
    """Embed ``data`` with MultidimensionalScaling and measure the distortion.

    The data is embedded into two dimensions with landmarks disabled.
    Euclidean distance matrices of the original features and of the
    embedding are compared.

    Returns the norm of the difference of the two distance matrices divided
    by the norm of the original distance matrix.
    """
    from shogun.Converter import MultidimensionalScaling
    from shogun.Distance import EuclidianDistance
    from shogun.Features import RealFeatures

    original = RealFeatures(data)

    dist_original = EuclidianDistance()
    dist_original.init(original, original)

    mds = MultidimensionalScaling()
    mds.set_target_dim(2)
    mds.set_landmark(False)
    embedded = mds.apply(original)

    dist_embedded = EuclidianDistance()
    dist_embedded.init(embedded, embedded)

    dm_embedded = dist_embedded.get_distance_matrix()
    dm_original = dist_original.get_distance_matrix()

    deviation = numpy.linalg.norm(dm_embedded - dm_original)
    return deviation / numpy.linalg.norm(dm_original)
from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[traindna,testdna,trainlabel,10,3,0,'n',False,'FULL'],
		[traindna,testdna,trainlabel,11,4,0,'n',False,'FULL']]

def kernel_salzbergstring (fm_train_dna=traindna,fm_test_dna=testdna,
				   label_train_dna=trainlabel,size_cache=10,
				   order=3,gap=0,reverse='n',use_sign=False,
				   normalization='FULL'):
	"""Compute a SALZBERG kernel matrix on DNA word features (static interface).

	NOTE(review): the visible body ends after the TRAIN kernel matrix is
	requested and returns nothing; use_sign and normalization are not
	used in the visible part.  The remainder appears to be missing.
	"""

	# Load TRAIN and TEST DNA strings and convert both from CHAR to WORD.
	sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
	sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)

	sg('set_features', 'TEST', fm_test_dna, 'DNA')
	sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)

	# Train the plugin estimator on the labelled training data before the
	# kernel is set up.
	pseudo_pos=1e-1
	pseudo_neg=1e-1
	sg('new_plugin_estimator', pseudo_pos, pseudo_neg)
	sg('set_labels', 'TRAIN', label_train_dna)
	sg('train_estimator')

	sg('set_kernel', 'SALZBERG', 'WORD', size_cache)
	#sg('set_prior_probs', 0.4, 0.6)
	sg('set_prior_probs_from_labels', label_train_dna)
	km=sg('get_kernel_matrix', 'TRAIN')
# Example #21
# 0
# In this example ROC (Receiver Operator Characteristic) is being computed
# for the pair of ground truth toy labels and random labels.
# ROC curve (as matrix) and auROC (area under ROC) is returned.

from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()

ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))

parameter_list = [[ground_truth,predicted]]

def evaluation_rocevaluation_modular(ground_truth, predicted):
	"""Evaluate ``predicted`` against ``ground_truth`` with ROCEvaluation.

	Both arguments are wrapped in ``BinaryLabels`` objects before
	evaluation.

	Returns a pair: the value of ``get_ROC()`` and the value of
	``get_auROC()`` of the evaluator after the evaluation.
	"""
	from shogun.Evaluation import ROCEvaluation
	from shogun.Features import BinaryLabels

	truth=BinaryLabels(ground_truth)
	guess=BinaryLabels(predicted)

	roc=ROCEvaluation()
	roc.evaluate(guess, truth)

	curve=roc.get_ROC()
	area=roc.get_auROC()
	return curve, area


if __name__=='__main__':
	# Script entry point: announce the example and run it with the first
	# parameter set from parameter_list.
	print('ROCEvaluation')
	evaluation_rocevaluation_modular(*parameter_list[0])
from tools.load import LoadMatrix
lm = LoadMatrix()

traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

parameter_list = [[traindat, testdat], [traindat, testdat]]


def distance_chebyshew_modular(fm_train_real=traindat, fm_test_real=testdat):
    """Compute ChebyshewMetric distance matrices on real-valued features.

    The distance matrix is computed on (train, train) and, after
    re-initialising the distance object, on (train, test).

    Returns the tuple ``(distance, dm_train, dm_test)``.
    """
    from shogun.Distance import ChebyshewMetric
    from shogun.Features import RealFeatures

    train_features = RealFeatures(fm_train_real)
    test_features = RealFeatures(fm_test_real)

    metric = ChebyshewMetric(train_features, train_features)
    train_matrix = metric.get_distance_matrix()

    metric.init(train_features, test_features)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix


if __name__ == '__main__':
    # Script entry point.  print() with a single parenthesised string works
    # on both Python 2 and Python 3; the bare print statement is a syntax
    # error on Python 3.
    print('ChebyshewMetric')
    distance_chebyshew_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2byte)
# data.

from tools.load import LoadMatrix
from numpy import ushort

lm = LoadMatrix()
traindat = ushort(lm.load_numbers("../data/fm_train_word.dat"))
testdat = ushort(lm.load_numbers("../data/fm_test_word.dat"))

parameter_list = [[traindat, testdat, 1.2], [traindat, testdat, 1.2]]


def kernel_linear_word_modular(fm_train_word=traindat, fm_test_word=testdat, scale=1.2):
    """Build a linear kernel on word features with an AvgDiag normalizer.

    After the normalizer (constructed with ``scale``) is set, the kernel is
    re-initialised on (train, train), its matrix computed, then
    re-initialised on (train, test) and computed again.  Neither matrix is
    returned; the kernel object itself is.
    """
    from shogun.Features import WordFeatures
    from shogun.Kernel import AvgDiagKernelNormalizer, LinearKernel

    train_features = WordFeatures(fm_train_word)
    test_features = WordFeatures(fm_test_word)

    linear = LinearKernel(train_features, train_features)
    linear.set_normalizer(AvgDiagKernelNormalizer(scale))
    # Re-initialise after the normalizer has been changed.
    linear.init(train_features, train_features)

    # Train-vs-train matrix: computed, then discarded.
    linear.get_kernel_matrix()
    # Train-vs-test matrix: computed, then discarded.
    linear.init(train_features, test_features)
    linear.get_kernel_matrix()
    return linear

#!/usr/bin/env python
from tools.load import LoadMatrix

lm = LoadMatrix()
data = lm.load_cubes("../data/fm_train_cube.dat")

parameter_list = [[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]]


def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
    from shogun.Distribution import HMM, BW_NORMAL

    charfeat = StringCharFeatures(CUBE)
    charfeat.set_features(fm_cube)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for i in range(num_examples):
        for j in range(num_param):
            hmm.get_log_derivative(j, i)

    best_path = 0
    best_path_state = 0
    for i in range(num_examples):
from tools.load import LoadMatrix
lm=LoadMatrix()
data=lm.load_cubes('../data/fm_train_cube.dat')

parameter_list=[[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]]

def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
	"""Train an HMM on word features derived from CUBE strings and query it.

	After training, the log-derivative is requested for every
	(parameter, example) pair, and the best-path values and best-path
	states are summed over all examples.  The sums are not returned.

	NOTE(review): the function ends without a return statement in the
	visible source; the tail of the example may be missing -- confirm.
	"""
	from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
	from shogun.Distribution import HMM, BW_NORMAL

	# CUBE char strings -> word features.
	charfeat=StringCharFeatures(CUBE)
	charfeat.set_features(fm_cube)
	feats=StringWordFeatures(charfeat.get_alphabet())
	feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

	hmm=HMM(feats, N, M, pseudo)
	hmm.train()
	hmm.baum_welch_viterbi_train(BW_NORMAL)

	# The num_examples argument is overwritten with the actual number of
	# feature vectors, so the value passed by the caller is ignored.
	num_examples=feats.get_num_vectors()
	num_param=hmm.get_num_model_parameters()
	# range() instead of the Python-2-only xrange(), so the loops run on
	# both Python 2 and Python 3.
	for i in range(num_examples):
		for j in range(num_param):
			hmm.get_log_derivative(j, i)

	best_path=0
	best_path_state=0
	for i in range(num_examples):
		best_path+=hmm.best_path(i)
		for j in range(N):
			best_path_state+=hmm.get_best_path_state(i, j)
    realfeat = RealFeatures(fm_train_real)
    feats_train = SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat = RealFeatures(fm_test_real)
    feats_test = SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C = 0.9
    epsilon = 1e-5
    num_threads = 1
    labels = Labels(label_train_twoclass)

    svm = SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    svm.set_features(feats_test)
    svm.classify().get_labels()


if __name__ == "__main__":
    # Script entry point: load the real-valued train/test matrices and the
    # two-class training labels into module-level names, then call
    # svmocas() without arguments.
    # NOTE(review): svmocas() is not defined in the visible part of the
    # file; presumably it reads these names as globals -- confirm.
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers("../data/fm_train_real.dat")
    fm_test_real = lm.load_numbers("../data/fm_test_real.dat")
    label_train_twoclass = lm.load_labels("../data/label_train_twoclass.dat")
    svmocas()
# This example shows how to compute the Hamming Word Distance for string features.

from tools.load import LoadMatrix
lm=LoadMatrix()

traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
# fm_test_real.dat is a real-valued feature matrix (it is read with
# load_numbers everywhere else), so load it as numbers rather than labels.
testdat = lm.load_numbers('../data/fm_test_real.dat')

# Each entry: [train dna, test dna, test real, order, gap, reverse, use_sign].
parameter_list = [[traindna,testdna,testdat,4,0,False,False],
		[traindna,testdna,testdat,3,0,False,False]]

def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna,
		fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):
	"""Prepare sorted word features from DNA strings for a Hamming word distance.

	NOTE(review): the visible body stops right after the test word
	features object is created; HammingWordDistance is imported but not
	used here, fm_test_real and use_sign are unused, and nothing is
	returned.  The remainder appears to be missing.
	"""

	from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
	from shogun.Preprocessor import SortWordString
	from shogun.Distance import HammingWordDistance

	# Training data: DNA char strings -> word features, then sort each
	# word string with the SortWordString preprocessor.
	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_train_dna)
	feats_train=StringWordFeatures(charfeat.get_alphabet())
	feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
	preproc=SortWordString()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()

	# Test data: only the empty word features object is created in the
	# visible part.
	charfeat=StringCharFeatures(DNA)
	charfeat.set_features(fm_test_dna)
	feats_test=StringWordFeatures(charfeat.get_alphabet())
# In this example the Histogram algorithm object computes a histogram over all
# 16bit unsigned integers in the features.

from tools.load import LoadMatrix
from sg import sg

lm = LoadMatrix()
traindna = lm.load_dna("../data/fm_train_dna.dat")
cubedna = lm.load_cubes("../data/fm_train_cube.dat")
parameter_list = [[traindna, cubedna, 3, 0, "n"], [traindna, cubedna, 4, 0, "n"]]


def distribution_histogram(fm_train=traindna, fm_cube=cubedna, order=3, gap=0, reverse="n"):
    """Prepare sorted WORD string features from DNA data (static interface).

    Registers the SORTWORDSTRING preprocessor, loads ``fm_train`` as DNA
    string features, converts them from CHAR to WORD using ``order``,
    ``gap`` and ``reverse``, and attaches the preprocessor to the TRAIN
    features.  ``fm_cube`` is accepted but not used, and nothing is
    returned.
    """
    # Creating the histogram distribution itself is disabled:
    # 	sg('new_distribution', 'HISTOGRAM')
    sg("add_preproc", "SORTWORDSTRING")

    sg("set_features", "TRAIN", fm_train, "DNA")
    conversion = ("TRAIN", "STRING", "CHAR", "STRING", "WORD",
                  order, order - 1, gap, reverse)
    sg("convert", *conversion)
    sg("attach_preproc", "TRAIN")


# 	sg('train_distribution')
# 	histo=sg('get_histogram')

# 	num_examples=11
# 	num_param=sg('get_histogram_num_model_parameters')
# 	for i in xrange(num_examples):
# 		for j in xrange(num_param):
# 			sg('get_log_derivative %d %d' % (j, i))
	print 'LaRank'

	from shogun.Features import RealFeatures, Labels
	from shogun.Kernel import GaussianKernel
	from shogun.Classifier import LaRank

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)
	width=2.1
	kernel=GaussianKernel(feats_train, feats_train, width)

	C=1
	epsilon=1e-5
	labels=Labels(label_train_multiclass)

	svm=LaRank(C, kernel, labels)
	#svm.set_tau(1e-3)
	#svm.set_batch_mode(False)
	#svm.io.enable_progress()
	svm.set_epsilon(epsilon)
	svm.train()
	out=svm.classify(feats_train).get_labels()

if __name__=='__main__':
	# Script entry point: load the real-valued train/test matrices and the
	# multiclass training labels into module-level names, then call
	# larank() without arguments.
	# NOTE(review): larank() is not defined in the visible part of the
	# file; presumably it reads these names as globals -- confirm.
	from tools.load import LoadMatrix
	lm=LoadMatrix()
	fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
	fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
	label_train_multiclass=lm.load_labels('../data/label_train_multiclass.dat')
	larank()
# In this example the distant segments kernel is being computed for toy data.

from tools.load import LoadMatrix

lm = LoadMatrix()
traindat = lm.load_dna("../data/fm_train_dna.dat")
testdat = lm.load_dna("../data/fm_test_dna.dat")

parameter_list = [[traindat, testdat, 5, 5], [traindat, testdat, 6, 6]]


def kernel_distantsegments_modular(fm_train_dna=traindat, fm_test_dna=testdat, delta=5, theta=5):
    """Compute DistantSegmentsKernel matrices on DNA string features.

    The kernel is constructed on (train, train) with the literal ``10`` as
    its third argument followed by ``delta`` and ``theta``; its matrix is
    computed, then the kernel is re-initialised on (train, test) and the
    matrix computed again.

    Returns the tuple ``(km_train, km_test, kernel)``.
    """
    from shogun.Features import DNA, StringCharFeatures
    from shogun.Kernel import DistantSegmentsKernel

    train_strings = StringCharFeatures(fm_train_dna, DNA)
    test_strings = StringCharFeatures(fm_test_dna, DNA)

    segments = DistantSegmentsKernel(train_strings, train_strings, 10, delta, theta)
    train_matrix = segments.get_kernel_matrix()

    segments.init(train_strings, test_strings)
    test_matrix = segments.get_kernel_matrix()

    return train_matrix, test_matrix, segments


if __name__ == "__main__":
    # Script entry point: announce the example and run it with the first
    # parameter set from parameter_list.
    print("DistantSegments")
    kernel_distantsegments_modular(*parameter_list[0])
# 
# where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in
# \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature
# vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$.
# 
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
# 
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
# 

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
trainlabel=lm.load_labels('../data/label_train_dna.dat')
parameter_list=[[traindna,testdna,trainlabel,10,3,0,'n',False,'FULL'],
		[traindna,testdna,trainlabel,11,4,0,'n',False,'FULL']]

def kernel_weightedcommwordstring (fm_train_dna=traindna,fm_test_dna=testdna,
				   label_train_dna=trainlabel,size_cache=10,
				   order=3,gap=0,reverse='n',use_sign=False,
				   normalization='FULL'):
	"""Prepare TRAIN word features from DNA strings (static interface).

	NOTE(review): only the first three calls are visible -- registering
	the SORTWORDSTRING preprocessor, loading the TRAIN DNA strings and
	converting them from CHAR to WORD.  No kernel is set and nothing is
	returned here; the remainder appears to be missing.
	"""

	sg('add_preproc', 'SORTWORDSTRING')
	sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
	sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
# Example #32
# 0
	epsilon=1e-5
	labels=Labels(label_train_twoclass)

	svm=LibSVM(C, kernel, labels)
	svm.set_epsilon(epsilon)
	svm.train()

	#kernel.init(feats_train, feats_test)
	output = svm.classify(feats_test)#.get_labels()
        #output_vector = output.get_labels()
        out=svm.classify().get_labels()
        testerr=mean(sign(out)!=testlab)
        print testerr


	#sv_idx=svm.get_support_vectors()
	#alphas=svm.get_alphas()
        #pm = PerformanceMeasures(output_vector, output)
        #acc = pm.get_accuracy()
        #roc = pm.get_auROC()
        #fms = pm.get_fmeasure()


if __name__=='__main__':
	# Script entry point: load the toy data into module-level names and
	# call libsvm() without arguments.
	# The data files are addressed relative to the example directory, as
	# in the other examples, instead of through one developer's absolute
	# home-directory path, so the script runs on any checkout.
	# NOTE(review): libsvm() is not defined in the visible part of the
	# file; presumably it reads these names as globals -- confirm.
	from tools.load import LoadMatrix
	lm=LoadMatrix()
	fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
	fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
	label_train_twoclass=lm.load_labels('../data/label_train_twoclass.dat')
	libsvm()
#!/usr/bin/env python
from tools.load import LoadMatrix
lm=LoadMatrix()
# Both the feature matrix and the label file are read with load_numbers.
# NOTE(review): other examples in this file read label files with
# lm.load_labels — confirm load_numbers is intended for the labels here.
data=lm.load_numbers('../data/fm_train_real.dat')
label=lm.load_numbers('../data/label_train_twoclass.dat')

# One positional argument set for features_io_modular:
# [fm_train_real, label_train_twoclass]
parameter_list=[[data,label]]

def features_io_modular (fm_train_real, label_train_twoclass):
	"""Save sparse real features to disk in two formats and load them back.

	fm_train_real is wrapped as SparseRealFeatures and saved through a
	BinaryFile (tmp/fm_train_sparsereal.bin) and a LibSVMFile
	(tmp/fm_train_sparsereal.ascii), both opened with mode "w". Each file is
	then re-opened without a mode argument and loaded into a second, initially
	empty SparseRealFeatures object. Finally the same matrix is wrapped as
	RealFeatures and an empty RealFeatures object is created.

	label_train_twoclass is not used by the statements in this body, and
	nothing is returned.
	"""
	import numpy
	from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
	from modshogun import GaussianKernel
	from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File

	# Source object (feats) and load target (feats2) for the sparse round trip.
	feats=SparseRealFeatures(fm_train_real)
	feats2=SparseRealFeatures()

	# Write phase. NOTE(review): each file is re-opened for reading below
	# without an explicit close; this presumably relies on rebinding `f` to
	# release the previous file object — confirm before restructuring.
	f=BinaryFile("tmp/fm_train_sparsereal.bin","w")
	feats.save(f)

	f=LibSVMFile("tmp/fm_train_sparsereal.ascii","w")
	feats.save(f)

	# Read phase: both loads target the same feats2 object, so the second
	# load follows the first on that object.
	f=BinaryFile("tmp/fm_train_sparsereal.bin")
	feats2.load(f)

	f=LibSVMFile("tmp/fm_train_sparsereal.ascii")
	feats2.load(f)

	# Rebind both names to dense feature objects.
	feats=RealFeatures(fm_train_real)
	feats2=RealFeatures()
#!/usr/bin/env python
from tools.load import LoadMatrix
from numpy import random
lm=LoadMatrix()

# Fixed seed for numpy.random; none of the statements in this example draw
# random numbers themselves.
random.seed(17)
ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
# The "predictions" are the same label file multiplied by 2.
# NOTE(review): assumes load_labels returns a numeric array so that `* 2`
# scales the values — confirm.
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2

# One positional argument set for evaluation_multiclassaccuracy_modular:
# [ground_truth, predicted]
parameter_list = [[ground_truth,predicted]]

def evaluation_multiclassaccuracy_modular (ground_truth, predicted):
    """Return the MulticlassAccuracy score of ``predicted`` vs ``ground_truth``.

    Both arguments are wrapped in MulticlassLabels and passed to
    MulticlassAccuracy.evaluate with the predicted labels first and the
    ground-truth labels second. The evaluator's result is returned unchanged.
    """
    from shogun.Features import MulticlassLabels
    from shogun.Evaluation import MulticlassAccuracy

    truth = MulticlassLabels(ground_truth)
    guess = MulticlassLabels(predicted)

    return MulticlassAccuracy().evaluate(guess, truth)


# Script entry point: print the example's name, then run it on the first
# entry of parameter_list.
if __name__ == "__main__":
    print("MulticlassAccuracy")
    evaluation_multiclassaccuracy_modular(*parameter_list[0])

def bray_curtis_distance ():
    """Compute Bray-Curtis distance matrices through the static ``sg`` interface.

    Reads the module-level names ``fm_train_real`` and ``fm_test_real``
    (they must be bound before this function is called), selects the
    'BRAYCURTIS' distance on 'REAL' features and requests the distance
    matrix for the TRAIN and then the TEST features.

    The matrices are not returned; the function returns None.
    """
    # print() with a single argument gives the same output on Python 2 and 3,
    # whereas the statement form is a syntax error on Python 3.
    print('BrayCurtisDistance')
    from sg import sg
    sg('set_distance', 'BRAYCURTIS', 'REAL')

    sg('set_features', 'TRAIN', fm_train_real)
    dm=sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    dm=sg('get_distance_matrix', 'TEST')

# Script entry point: the loaded matrices are bound to the module-level names
# fm_train_real and fm_test_real, which bray_curtis_distance() reads as
# globals, so they must be assigned before the call.
if __name__=='__main__':
	from tools.load import LoadMatrix
	lm=LoadMatrix()
	fm_train_real=lm.load_numbers('../data/fm_train_real.dat')
	fm_test_real=lm.load_numbers('../data/fm_test_real.dat')
	bray_curtis_distance()
def distance():
    """Build a DISTANCE kernel via the static ``sg`` interface and fetch its matrices.

    Reads the module-level names ``fm_train_real`` and ``fm_test_real``
    (they must be bound before this function is called). The features are
    registered as TRAIN and TEST, the "EUCLIDIAN" distance on "REAL"
    features is selected, and a "DISTANCE" kernel is set up with
    ``size_cache`` and ``width``. The kernel matrix is then requested for
    TRAIN and for TEST.

    The matrices are not returned; the function returns None.
    """
    # print() with a single argument gives the same output on Python 2 and 3,
    # whereas the statement form is a syntax error on Python 3.
    print("Distance")

    width = 1.7
    size_cache = 10

    from sg import sg

    sg("set_features", "TRAIN", fm_train_real)
    sg("set_features", "TEST", fm_test_real)
    # The distance name is passed verbatim to sg; spelling kept as is.
    sg("set_distance", "EUCLIDIAN", "REAL")
    sg("set_kernel", "DISTANCE", size_cache, width)
    km = sg("get_kernel_matrix", "TRAIN")
    km = sg("get_kernel_matrix", "TEST")


# Script entry point: the loaded matrices are bound to the module-level names
# fm_train_real and fm_test_real, which distance() reads as globals, so they
# must be assigned before the call.
if __name__ == "__main__":
    from tools.load import LoadMatrix

    lm = LoadMatrix()
    fm_train_real = lm.load_numbers("../data/fm_train_real.dat")
    fm_test_real = lm.load_numbers("../data/fm_test_real.dat")
    distance()
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
# 
# For more details on the SGD solver see
#  L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
#  Press. 2008.

from tools.load import LoadMatrix
lm=LoadMatrix()

# Toy data loaded at import time; these are also the default arguments of
# classifier_svmsgd_modular.
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')

# Positional argument sets for classifier_svmsgd_modular, in signature order:
# [fm_train_real, fm_test_real, label_train_twoclass, C, num_threads, num_iter]
parameter_list = [[traindat,testdat,label_traindat,0.9,1,6],[traindat,testdat,label_traindat,0.8,1,5]]

def classifier_svmsgd_modular (fm_train_real=traindat,fm_test_real=testdat,label_train_twoclass=label_traindat,C=0.9,num_threads=1,num_iter=5):
    """Convert the dense training and test matrices to sparse real features.

    Each matrix is wrapped in RealFeatures and copied into a fresh
    SparseRealFeatures object via ``obtain_from_simple``; training data is
    converted first, test data second.

    The remaining parameters (label_train_twoclass, C, num_threads, num_iter)
    are not read by the statements in this body, and nothing is returned.
    """
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMSGD

    def to_sparse(matrix):
        # Dense wrapper first, then the sparse container that copies from it.
        dense = RealFeatures(matrix)
        sparse = SparseRealFeatures()
        sparse.obtain_from_simple(dense)
        return sparse

    feats_train = to_sparse(fm_train_real)
    feats_test = to_sparse(fm_test_real)
from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()


# Toy data loaded at import time; these are also the default arguments of
# classifier_gmnpsvm.
traindat=lm.load_numbers('../data/fm_train_real.dat')
testdat=lm.load_numbers('../data/fm_test_real.dat')
train_label=lm.load_labels('../data/label_train_multiclass.dat')
# Positional argument sets for classifier_gmnpsvm, in signature order:
# [fm_train_real, fm_test_real, label_train_multiclass, size_cache, width, C,
# epsilon, use_bias]
parameter_list=[[traindat,testdat, train_label,10,2.1,1.2,1e-5,False],
		[traindat,testdat,train_label,10,2.1,1.3,1e-4,False]]

def classifier_gmnpsvm(fm_train_real=traindat, fm_test_real=testdat,
                       label_train_multiclass=train_label,
                       size_cache=10, width=2.1, C=1.2,
                       epsilon=1e-5, use_bias=False):
    """Train a 'GMNPSVM' classifier through ``sg`` and classify the test data.

    Training features, a 'GAUSSIAN' kernel on 'REAL' data (``size_cache``,
    ``width``) and the training labels are registered, the classifier is
    created and configured with ``epsilon``, ``C`` and ``use_bias``, and
    training is run. The test features are then registered.

    Returns a tuple ``(result, kernel_matrix)``: the output of
    ``sg('classify')`` and of ``sg('get_kernel_matrix', 'TEST')``.
    """
    # Commands are issued strictly in this order.
    training_commands = (
        ('set_features', 'TRAIN', fm_train_real),
        ('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width),
        ('set_labels', 'TRAIN', label_train_multiclass),
        ('new_classifier', 'GMNPSVM'),
        ('svm_epsilon', epsilon),
        ('c', C),
        ('svm_use_bias', use_bias),
        ('train_classifier',),
    )
    for command in training_commands:
        sg(*command)

    sg('set_features', 'TEST', fm_test_real)
    predictions = sg('classify')
    test_kernel = sg('get_kernel_matrix', 'TEST')
    return predictions, test_kernel
from tools.load import LoadMatrix

lm=LoadMatrix()
# Toy matrix loaded at import time and shared by both parameter sets.
data = lm.load_numbers('../data/fm_train_real.dat')

# Positional argument sets for preprocessor_kernelpcacut_modular:
# [data, threshold, width]
parameter_list = [[data, 0.01, 1.0], [data, 0.05, 2.0]]

def preprocessor_kernelpcacut_modular(data, threshold, width):
    """Run KernelPCACut with a Gaussian kernel over ``data``.

    ``data`` is wrapped in RealFeatures, a GaussianKernel of the given
    ``width`` is built on those features against themselves, and a
    KernelPCACut preprocessor created from that kernel and ``threshold`` is
    initialised on the features and applied to their feature matrix.

    Returns the RealFeatures object the preprocessor was applied to.
    """
    from shogun.Features import RealFeatures
    from shogun.Preprocessor import KernelPCACut
    from shogun.Kernel import GaussianKernel

    feats = RealFeatures(data)
    rbf_kernel = GaussianKernel(feats, feats, width)

    pca_cut = KernelPCACut(rbf_kernel, threshold)
    pca_cut.init(feats)
    pca_cut.apply_to_feature_matrix(feats)

    return feats


# Script entry point: print the example's name, then run it on the first
# entry of parameter_list.
if __name__=='__main__':
    # print() with a single argument gives the same output on Python 2 and 3,
    # whereas the statement form is a syntax error on Python 3.
    print('KernelPCACut')
    preprocessor_kernelpcacut_modular(*parameter_list[0])

#!/usr/bin/env python
from tools.load import LoadMatrix

lm = LoadMatrix()

# DNA toy data and labels, loaded at import time and shared by both
# parameter sets.
train_dna = lm.load_dna("../data/fm_train_dna.dat")
test_dna = lm.load_dna("../data/fm_test_dna.dat")
label = lm.load_labels("../data/label_train_dna.dat")

# Positional argument sets for classifier_svmlight_batch_linadd_modular:
# [fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads]
parameter_list = [[train_dna, test_dna, label, 20, 0.9, 1e-3, 1], [train_dna, test_dna, label, 20, 2.3, 1e-5, 4]]


def classifier_svmlight_batch_linadd_modular(
    fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads
):
    """Wrap the training and test DNA strings in StringCharFeatures objects.

    If SVMLight cannot be imported from modshogun, a message is printed and
    the function returns None without touching the data.

    Parameters:
        fm_train_dna: strings set on the training StringCharFeatures(DNA).
        fm_test_dna: strings set on the test StringCharFeatures(DNA).
        label_train_dna, degree, C, epsilon, num_threads: accepted but not
            read by the statements in this body.

    Returns:
        None.
    """
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel, MSG_DEBUG

    try:
        from modshogun import SVMLight
    except ImportError:
        print("No support for SVMLight available.")
        return

    feats_train = StringCharFeatures(DNA)
    # feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    # `degree` is taken from the caller as is; it is not overridden here, so
    # the value passed in (20 in every parameter_list entry) stays in effect.
# This is an example for the initialization of the CommWordString-kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm). This kernel
# sums over k-mer matches (k='order'). For efficient computing a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set to one, each k-mer is counted
# only once.

from tools.load import LoadMatrix
from sg import sg
lm=LoadMatrix()

# DNA toy data loaded at import time; these are also the default arguments of
# kernel_commwordstring.
traindna=lm.load_dna('../data/fm_train_dna.dat')
testdna=lm.load_dna('../data/fm_test_dna.dat')
# Positional argument sets for kernel_commwordstring, in signature order:
# [fm_train_dna, fm_test_dna, size_cache, order, gap, reverse, use_sign,
# normalization]
parameter_list=[[traindna,testdna,10,3,0,'n',False,'FULL'],
		[traindna,testdna,11,4,0,'n',False,'FULL']]

def kernel_commwordstring(fm_train_dna=traindna, fm_test_dna=testdna,
                          size_cache=10,
                          order=3, gap=0, reverse='n',
                          use_sign=False, normalization='FULL'):
    """Set up a 'COMMSTRING' kernel on WORD features through ``sg``.

    The 'SORTWORDSTRING' preprocessor is registered once. The training and
    then the test DNA strings are each loaded, converted from CHAR strings to
    WORD strings (using ``order``, ``order-1``, ``gap``, ``reverse``) and have
    the preprocessor attached. The kernel is then configured with
    ``size_cache``, ``use_sign`` and ``normalization``, and the TRAIN kernel
    matrix is requested.

    Nothing is returned by the statements in this body.
    """
    sg('add_preproc', 'SORTWORDSTRING')

    # TRAIN is fully prepared before TEST, matching the required call order.
    for target, strings in (('TRAIN', fm_train_dna), ('TEST', fm_test_dna)):
        sg('set_features', target, strings, 'DNA')
        sg('convert', target, 'STRING', 'CHAR', 'STRING', 'WORD',
           order, order - 1, gap, reverse)
        sg('attach_preproc', target)

    sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization)
    km = sg('get_kernel_matrix', 'TRAIN')
# Example #42
# 0
# This is an example for the initialization of a linear kernel on raw byte
# data.

###########################################################################
# linear kernel on byte features
###########################################################################
from tools.load import LoadMatrix
from numpy import ubyte
lm = LoadMatrix()

# Matrices are loaded with load_numbers and passed through numpy.ubyte; they
# are also the default arguments of kernel_linear_byte_modular.
traindat = ubyte(lm.load_numbers('../data/fm_train_byte.dat'))
testdat = ubyte(lm.load_numbers('../data/fm_test_byte.dat'))

# Positional argument sets for kernel_linear_byte_modular:
# [fm_train_byte, fm_test_byte] (both entries are identical).
parameter_list = [[traindat, testdat], [traindat, testdat]]


def kernel_linear_byte_modular(fm_train_byte=traindat, fm_test_byte=testdat):
    """Build a LinearKernel on byte features and return the kernel object.

    Both matrices are wrapped in ByteFeatures. The kernel is first created on
    the training features against themselves and its matrix is fetched; it is
    then re-initialised on (train, test) and the matrix is fetched again.
    Neither matrix is returned.

    Returns the LinearKernel, left initialised on (train, test).
    """
    from shogun.Kernel import LinearKernel
    from shogun.Features import ByteFeatures

    train_feats = ByteFeatures(fm_train_byte)
    test_feats = ByteFeatures(fm_test_byte)

    # Train-vs-train matrix.
    linear = LinearKernel(train_feats, train_feats)
    train_matrix = linear.get_kernel_matrix()

    # Train-vs-test matrix on the same kernel object.
    linear.init(train_feats, test_feats)
    test_matrix = linear.get_kernel_matrix()

    return linear