def prepare_data(use_toy=True):
    from os.path import exists
    from tools.load import LoadMatrix
    lm = LoadMatrix()

    if not use_toy and exists('../data/../mldata/uci-20070111-optdigits.mat'):
        from scipy.io import loadmat
        mat = loadmat('../data/../mldata/uci-20070111-optdigits.mat')['int0'].astype(float)
        X = mat[:-1, :]
        Y = mat[-1, :]
        isplit = X.shape[1] // 2  # integer division: a float index would break the slicing below
        traindat = X[:, :isplit]
        label_traindat = Y[:isplit]
        testdat = X[:, isplit:]
        label_testdat = Y[isplit:]
    else:
        traindat = lm.load_numbers('../data/fm_train_real.dat')
        testdat = lm.load_numbers('../data/fm_test_real.dat')
        label_traindat = lm.load_labels('../data/label_train_multiclass.dat')
        label_testdat = None

    return [traindat, label_traindat, testdat, label_testdat]
# In this example the Histogram algorithm object computes a histogram over all
# 16bit unsigned integers in the features.

from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
cubedna = lm.load_cubes('../data/fm_train_cube.dat')

parameter_list = [[traindna, cubedna, 3, 0, 'n'], [traindna, cubedna, 4, 0, 'n']]

def distribution_histogram(fm_train=traindna, fm_cube=cubedna, order=3, gap=0, reverse='n'):
    #sg('new_distribution', 'HISTOGRAM')

    sg('add_preproc', 'SORTWORDSTRING')
    sg('set_features', 'TRAIN', fm_train, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')

    #sg('train_distribution')
    #histo=sg('get_histogram')

    #num_examples=11
    #num_param=sg('get_histogram_num_model_parameters')
    #for i in xrange(num_examples):
    #	for j in xrange(num_param):
    #		sg('get_log_derivative %d %d' % (j, i))

    #sg('get_log_likelihood')
    #return sg('get_log_likelihood_sample')
# This is an example for the initialization of a linear kernel on raw byte
# data.

###########################################################################
# linear kernel on byte features
###########################################################################

from tools.load import LoadMatrix
from numpy import ubyte
lm = LoadMatrix()
traindat = ubyte(lm.load_numbers('../data/fm_train_byte.dat'))
testdat = ubyte(lm.load_numbers('../data/fm_test_byte.dat'))

parameter_list = [[traindat, testdat], [traindat, testdat]]

def kernel_linear_byte_modular(fm_train_byte=traindat, fm_test_byte=testdat):
    from shogun.Kernel import LinearKernel
    from shogun.Features import ByteFeatures

    feats_train = ByteFeatures(fm_train_byte)
    feats_test = ByteFeatures(fm_test_byte)

    kernel = LinearKernel(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return kernel

if __name__ == '__main__':
    print('LinearByte')
    kernel_linear_byte_modular(*parameter_list[0])
from tools.load import LoadMatrix
from numpy import random
lm = LoadMatrix()

ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))

parameter_list = [[ground_truth, predicted]]

def evaluation_prcevaluation_modular(ground_truth, predicted):
    from shogun.Features import Labels
    from shogun.Evaluation import PRCEvaluation

    ground_truth_labels = Labels(ground_truth)
    predicted_labels = Labels(predicted)

    evaluator = PRCEvaluation()
    evaluator.evaluate(predicted_labels, ground_truth_labels)

    return evaluator.get_PRC(), evaluator.get_auPRC()

if __name__ == '__main__':
    print('PRCEvaluation')
    evaluation_prcevaluation_modular(*parameter_list[0])
# Modular histogram example: trains a Histogram distribution on word features
# obtained from DNA strings. The enclosing function header and imports were
# missing from this fragment; the ones below are assumptions chosen so that
# the module-level call 'histogram()' works with the toy defaults.
def histogram(order=3, gap=0):
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Distribution import Histogram

    reverse = False

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

    histo = Histogram(feats)
    histo.train()

    histo.get_histogram()

    num_examples = feats.get_num_vectors()
    num_param = histo.get_num_model_parameters()
    #for i in xrange(num_examples):
    #	for j in xrange(num_param):
    #		histo.get_log_derivative(j, i)

    histo.get_log_likelihood()
    histo.get_log_likelihood_sample()

###########################################################################
# call functions
###########################################################################

if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_dna = lm.load_dna('../data/fm_train_dna.dat')
    histogram()
from tools.load import LoadMatrix
lm = LoadMatrix()

traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

parameter_list = [[traindna, testdna, testdat, 3, 0, False],
                  [traindna, testdna, testdat, 4, 0, False]]

def distance_manhattenword_modular(fm_train_dna=traindna, fm_test_dna=testdna, fm_test_real=testdat,
                                   order=3, gap=0, reverse=False):
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString
    from shogun.Distance import ManhattanWordDistance

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()
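    # The example is truncated here; a minimal sketch of the usual ending,
    # assuming the same pattern as the other word-distance examples in this
    # collection: build the distance on the train features, then re-init it
    # on train/test to obtain both matrices.
    distance = ManhattanWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return dm_train, dm_test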
from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

parameter_list = [[traindat, testdat, 1.4, 10, True], [traindat, testdat, 1.5, 11, True]]

def preproc_prunevarsubmean(fm_train_real=traindat, fm_test_real=testdat,
                            width=1.4, size_cache=10, divide_by_std=True):
    sg('add_preproc', 'PRUNEVARSUBMEAN', divide_by_std)
    sg('set_kernel', 'CHI2', 'REAL', size_cache, width)

    sg('set_features', 'TRAIN', fm_train_real)
    sg('attach_preproc', 'TRAIN')
    km = sg('get_kernel_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    sg('attach_preproc', 'TEST')
    km = sg('get_kernel_matrix', 'TEST')
    return km

if __name__ == '__main__':
    print('PruneVarSubMean')
    preproc_prunevarsubmean(*parameter_list[0])
# In this example a two-class support vector machine classifier is trained on a
# DNA splice-site detection data set and the trained classifier is used to predict
# labels on the test set. As training algorithm SVM^light is used with SVM
# regularization parameter C=1.2, the Weighted Degree kernel of degree 20 and
# the precision parameter epsilon=1e-5.
#
# For more details on SVM^light see
#  T. Joachims. Making large-scale SVM learning practical. In Advances in Kernel
#  Methods -- Support Vector Learning, pages 169-184. MIT Press, Cambridge, MA USA, 1999.
#
# For more details on the Weighted Degree kernel see
#  G. Raetsch, S. Sonnenburg, and B. Schoelkopf. RASE: recognition of alternatively
#  spliced exons in C. elegans. Bioinformatics, 21:369-377, June 2005.

from tools.load import LoadMatrix
lm = LoadMatrix()
traindat = lm.load_dna('../data/fm_train_dna.dat')
testdat = lm.load_dna('../data/fm_test_dna.dat')
label_traindat = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindat, testdat, label_traindat, 1.1, 1e-5, 1],
                  [traindat, testdat, label_traindat, 1.2, 1e-5, 1]]

def classifier_svmlight_modular(fm_train_dna=traindat, fm_test_dna=testdat,
                                label_train_dna=label_traindat, C=1.2, epsilon=1e-5, num_threads=1):
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return
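    # The example breaks off after the import guard; a minimal sketch of the
    # usual body, assuming the setup described in the header comment (Weighted
    # Degree kernel of degree 20, SVM^light with the given C and epsilon).
    degree = 20
    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    return svm.apply().get_labels()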
#!/usr/bin/env python

from tools.load import LoadMatrix
from numpy import random
lm = LoadMatrix()
random.seed(17)

from tools.multiclass_shared import prepare_data
[traindat, label_traindat, testdat, label_testdat] = prepare_data(False)

parameter_list = [[traindat, label_traindat, testdat, label_testdat]]

def evaluation_multiclassovrevaluation_modular(traindat, label_traindat, testdat, label_testdat):
    from shogun.Features import MulticlassLabels
    from shogun.Evaluation import MulticlassOVREvaluation, ROCEvaluation
    from modshogun import MulticlassLibLinear, RealFeatures, ContingencyTableEvaluation, ACCURACY
    from shogun.Mathematics import Math

    Math.init_random(1)
    ground_truth_labels = MulticlassLabels(label_traindat)
    svm = MulticlassLibLinear(1.0, RealFeatures(traindat), MulticlassLabels(label_traindat))
    svm.train()
    predicted_labels = svm.apply()

    binary_evaluator = ROCEvaluation()
    evaluator = MulticlassOVREvaluation(binary_evaluator)
    mean_roc = evaluator.evaluate(predicted_labels, ground_truth_labels)
    #print(mean_roc)
from tools.load import LoadMatrix
lm = LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')

parameter_list = [[data, 10], [data, 20]]

def converter_localtangentspacealignment_modular(data, k):
    from shogun.Features import RealFeatures
    from shogun.Converter import LocalTangentSpaceAlignment

    features = RealFeatures(data)

    converter = LocalTangentSpaceAlignment()
    converter.set_target_dim(1)
    converter.set_k(k)
    converter.apply(features)

    return features

if __name__ == '__main__':
    print('LocalTangentSpaceAlignment')
    converter_localtangentspacealignment_modular(*parameter_list[0])
from tools.load import LoadMatrix
lm = LoadMatrix()

fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')

parameter_list = [
    [fm_train_real, fm_test_real, label_train_multiclass, 1.2, 1.2, 1e-5, 1, 0.001, 1.5],
    [fm_train_real, fm_test_real, label_train_multiclass, 5, 1.2, 1e-2, 1, 0.001, 2]]

def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
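    # The example is truncated here; a minimal sketch of the usual continuation,
    # assuming one Gaussian subkernel on the real features (the imported Linear
    # and Poly kernels would be appended the same way): append features and
    # subkernel, train MKLMultiClass, then classify the test features.
    subkernel = GaussianKernel(10, width)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(label_train_multiclass)

    mkl = MKLMultiClass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)
    mkl.train()

    kernel.init(feats_train, feats_test)
    return mkl.apply().get_labels()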
from tools.load import LoadMatrix
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')

parameter_list = [[traindat, testdat, label_traindat, 2.1, 1, 1e-5],
                  [traindat, testdat, label_traindat, 2.2, 1, 1e-5]]

def classifier_libsvm_modular(fm_train_real=traindat, fm_test_real=testdat,
                              label_train_twoclass=label_traindat, width=2.1, C=1, epsilon=1e-5):
    from shogun.Features import RealFeatures, BinaryLabels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = GaussianKernel(feats_train, feats_train, width)
    labels = BinaryLabels(label_train_twoclass)

    svm = LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    kernel.init(feats_train, feats_test)
    sv_idx = svm.get_support_vectors()
    alphas = svm.get_alphas()

    predictions = svm.apply()
    print(predictions.get_labels())
    return predictions, svm, predictions.get_labels()
# In this example a hidden markov model with 3 states and 6 transitions is trained
# on a string data set. After calling the constructor of the HMM class specifying
# the number of states and transitions, the model is trained. Via the Baum-Welch
# algorithm the optimal transition and emission probabilities are estimated. The
# best path, i.e. the path with highest probability given the model, can then be
# calculated using get_best_path_state.

from tools.load import LoadMatrix
lm = LoadMatrix()
data = lm.load_cubes('../data/fm_train_cube.dat')

parameter_list = [[data, 1, 64, 1e-5, 2, 0, False, 5], [data, 3, 6, 1e-1, 1, 0, False, 2]]

def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
    from shogun.Distribution import HMM, BW_NORMAL

    charfeat = StringCharFeatures(CUBE)
    charfeat.set_features(fm_cube)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order-1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for i in range(num_examples):
        for j in range(num_param):
            hmm.get_log_derivative(j, i)

    best_path = 0
    best_path_state = 0
    for i in range(num_examples):
        best_path += hmm.best_path(i)
        for j in range(N):
            best_path_state += hmm.get_best_path_state(i, j)

    # closing return and call follow the convention of the other examples here
    return hmm, best_path, best_path_state

if __name__ == '__main__':
    print('HMM')
    distribution_hmm_modular(*parameter_list[0])
from tools.load import LoadMatrix
from numpy import random
lm = LoadMatrix()
random.seed(17)

ground_truth = lm.load_labels('../data/label_train_multiclass.dat')
predicted = lm.load_labels('../data/label_train_multiclass.dat') * 2

parameter_list = [[ground_truth, predicted]]

def evaluation_multiclassaccuracy_modular(ground_truth, predicted):
    from shogun.Features import MulticlassLabels
    from shogun.Evaluation import MulticlassAccuracy

    ground_truth_labels = MulticlassLabels(ground_truth)
    predicted_labels = MulticlassLabels(predicted)

    evaluator = MulticlassAccuracy()
    accuracy = evaluator.evaluate(predicted_labels, ground_truth_labels)

    return accuracy

if __name__ == '__main__':
    print('MulticlassAccuracy')
    evaluation_multiclassaccuracy_modular(*parameter_list[0])
from tools.load import LoadMatrix
from numpy import ushort
from sg import sg
lm = LoadMatrix()
# note: the original loaded the test file for the training matrix as well,
# presumably a copy-paste slip; fixed to load the train file
trainword = ushort(lm.load_numbers('../data/fm_train_word.dat'))
testword = ushort(lm.load_numbers('../data/fm_test_word.dat'))

parameter_list = [[trainword, testword, 10, 1.4], [trainword, testword, 11, 1.5]]

def kernel_linearword(fm_train_word=trainword, fm_test_word=testword, size_cache=10, scale=1.4):
    sg('set_features', 'TRAIN', fm_train_word)
    sg('set_features', 'TEST', fm_test_word)
    sg('set_kernel', 'LINEAR', 'WORD', size_cache, scale)

    km = sg('get_kernel_matrix', 'TRAIN')
    km = sg('get_kernel_matrix', 'TEST')
    return km

if __name__ == '__main__':
    print('LinearWord')
    kernel_linearword(*parameter_list[0])
###########################################################################
# anova kernel
###########################################################################

from tools.load import LoadMatrix
from numpy import double
lm = LoadMatrix()
traindat = double(lm.load_numbers('../data/fm_train_real.dat'))
testdat = double(lm.load_numbers('../data/fm_test_real.dat'))

parameter_list = [[traindat, testdat, 2, 10], [traindat, testdat, 5, 10]]

def kernel_anova_modular(fm_train_real=traindat, fm_test_real=testdat, cardinality=2, size_cache=10):
    from shogun.Kernel import ANOVAKernel
    from shogun.Features import RealFeatures

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    kernel = ANOVAKernel(feats_train, feats_train, cardinality, size_cache)

    # compare the two recursive computation variants entry by entry
    for i in range(0, feats_train.get_num_vectors()):
        for j in range(0, feats_train.get_num_vectors()):
            k1 = kernel.compute_rec1(i, j)
            k2 = kernel.compute_rec2(i, j)
            #if abs(k1-k2) > 1e-10:
            #	print("|%s|%s|" % (k1, k2))

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
#!/usr/bin/env python

from tools.load import LoadMatrix
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindna, testdna, 3, 0, False], [traindna, testdna, 3, 0, False]]

def distance_canberraword(fm_train_dna=traindna, fm_test_dna=testdna, order=3, gap=0, reverse=False):
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import CanberraWordDistance

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
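    # The example is truncated here; a minimal sketch of the usual ending,
    # following the same pattern as the other word-distance examples in this
    # collection (sorted word features on the test side, then the distance).
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    distance = CanberraWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return dm_train, dm_test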
from tools.load import LoadMatrix
import numpy
lm = LoadMatrix()
data = lm.load_numbers("../data/fm_train_real.dat")

parameter_list = [[data]]

def converter_multidimensionalscaling_modular(data):
    from shogun.Features import RealFeatures
    from shogun.Converter import MultidimensionalScaling
    from shogun.Distance import EuclidianDistance

    features = RealFeatures(data)

    distance_before = EuclidianDistance()
    distance_before.init(features, features)

    converter = MultidimensionalScaling()
    converter.set_target_dim(2)
    converter.set_landmark(False)
    embedding = converter.apply(features)

    distance_after = EuclidianDistance()
    distance_after.init(embedding, embedding)

    distance_matrix_after = distance_after.get_distance_matrix()
    distance_matrix_before = distance_before.get_distance_matrix()

    # relative change of pairwise distances induced by the embedding
    return numpy.linalg.norm(distance_matrix_after - distance_matrix_before) / numpy.linalg.norm(distance_matrix_before)
from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
trainlabel = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindna, testdna, trainlabel, 10, 3, 0, 'n', False, 'FULL'],
                  [traindna, testdna, trainlabel, 11, 4, 0, 'n', False, 'FULL']]

def kernel_salzbergstring(fm_train_dna=traindna, fm_test_dna=testdna, label_train_dna=trainlabel,
                          size_cache=10, order=3, gap=0, reverse='n', use_sign=False, normalization='FULL'):
    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)

    pseudo_pos = 1e-1
    pseudo_neg = 1e-1
    sg('new_plugin_estimator', pseudo_pos, pseudo_neg)
    sg('set_labels', 'TRAIN', label_train_dna)
    sg('train_estimator')

    sg('set_kernel', 'SALZBERG', 'WORD', size_cache)
    #sg('set_prior_probs', 0.4, 0.6)
    sg('set_prior_probs_from_labels', label_train_dna)

    km = sg('get_kernel_matrix', 'TRAIN')
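    # The example is truncated here; a minimal sketch of the usual ending,
    # mirroring the other static-interface kernel examples in this collection.
    km = sg('get_kernel_matrix', 'TEST')
    return km

if __name__ == '__main__':
    print('SalzbergString')
    kernel_salzbergstring(*parameter_list[0])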
# In this example ROC (Receiver Operator Characteristic) is being computed
# for the pair of ground truth toy labels and random labels.
# The ROC curve (as a matrix) and auROC (area under ROC) are returned.

from tools.load import LoadMatrix
from numpy import random
lm = LoadMatrix()

ground_truth = lm.load_labels('../data/label_train_twoclass.dat')
random.seed(17)
predicted = random.randn(len(ground_truth))

parameter_list = [[ground_truth, predicted]]

def evaluation_rocevaluation_modular(ground_truth, predicted):
    from shogun.Features import BinaryLabels
    from shogun.Evaluation import ROCEvaluation

    ground_truth_labels = BinaryLabels(ground_truth)
    predicted_labels = BinaryLabels(predicted)

    evaluator = ROCEvaluation()
    evaluator.evaluate(predicted_labels, ground_truth_labels)

    return evaluator.get_ROC(), evaluator.get_auROC()

if __name__ == '__main__':
    print('ROCEvaluation')
    evaluation_rocevaluation_modular(*parameter_list[0])
from tools.load import LoadMatrix
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')

parameter_list = [[traindat, testdat], [traindat, testdat]]

def distance_chebyshew_modular(fm_train_real=traindat, fm_test_real=testdat):
    from shogun.Features import RealFeatures
    from shogun.Distance import ChebyshewMetric

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    distance = ChebyshewMetric(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test

if __name__ == '__main__':
    print('ChebyshewMetric')
    distance_chebyshew_modular(*parameter_list[0])
# This is an example for the initialization of a linear kernel on word (2byte)
# data.

from tools.load import LoadMatrix
from numpy import ushort
lm = LoadMatrix()
traindat = ushort(lm.load_numbers("../data/fm_train_word.dat"))
testdat = ushort(lm.load_numbers("../data/fm_test_word.dat"))

parameter_list = [[traindat, testdat, 1.2], [traindat, testdat, 1.2]]

def kernel_linear_word_modular(fm_train_word=traindat, fm_test_word=testdat, scale=1.2):
    from shogun.Kernel import LinearKernel, AvgDiagKernelNormalizer
    from shogun.Features import WordFeatures

    feats_train = WordFeatures(fm_train_word)
    feats_test = WordFeatures(fm_test_word)

    kernel = LinearKernel(feats_train, feats_train)
    kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
    kernel.init(feats_train, feats_train)

    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return kernel
# Two-class SVMOcas classifier on sparse real-valued features. The enclosing
# function header and imports were missing from this fragment; the ones below
# are assumptions following the conventions of the other examples here.
def svmocas():
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMOcas

    realfeat = RealFeatures(fm_train_real)
    feats_train = SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat = RealFeatures(fm_test_real)
    feats_test = SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)

    C = 0.9
    epsilon = 1e-5
    num_threads = 1
    labels = Labels(label_train_twoclass)

    svm = SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    svm.set_features(feats_test)
    svm.classify().get_labels()

if __name__ == "__main__":
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers("../data/fm_train_real.dat")
    fm_test_real = lm.load_numbers("../data/fm_test_real.dat")
    label_train_twoclass = lm.load_labels("../data/label_train_twoclass.dat")
    svmocas()
# This example shows how to compute the Hamming Word Distance for string features.

from tools.load import LoadMatrix
lm = LoadMatrix()

traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
# note: the original called load_labels on this real-valued matrix file,
# presumably a copy-paste slip; fixed to load_numbers
testdat = lm.load_numbers('../data/fm_test_real.dat')

parameter_list = [[traindna, testdna, testdat, 4, 0, False, False],
                  [traindna, testdna, testdat, 3, 0, False, False]]

def distance_hammingword_modular(fm_train_dna=traindna, fm_test_dna=testdna, fm_test_real=testdat,
                                 order=3, gap=0, reverse=False, use_sign=False):
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString
    from shogun.Distance import HammingWordDistance

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
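    # The example is truncated here; a minimal sketch of the usual ending,
    # following the same pattern as the other word-distance examples in this
    # collection, with use_sign passed through to HammingWordDistance.
    feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    distance = HammingWordDistance(feats_train, feats_train, use_sign)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return dm_train, dm_test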
# Multiclass LaRank classifier with a Gaussian kernel. The enclosing function
# header was missing from this fragment; the zero-argument signature below is
# an assumption matching the module-level call 'larank()'.
def larank():
    print('LaRank')
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LaRank

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    width = 2.1
    kernel = GaussianKernel(feats_train, feats_train, width)

    C = 1
    epsilon = 1e-5
    labels = Labels(label_train_multiclass)

    svm = LaRank(C, kernel, labels)
    #svm.set_tau(1e-3)
    #svm.set_batch_mode(False)
    #svm.io.enable_progress()
    svm.set_epsilon(epsilon)
    svm.train()
    out = svm.classify(feats_train).get_labels()

if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    label_train_multiclass = lm.load_labels('../data/label_train_multiclass.dat')
    larank()
# In this example the distant segments kernel is being computed for toy data.

from tools.load import LoadMatrix
lm = LoadMatrix()
traindat = lm.load_dna("../data/fm_train_dna.dat")
testdat = lm.load_dna("../data/fm_test_dna.dat")

parameter_list = [[traindat, testdat, 5, 5], [traindat, testdat, 6, 6]]

def kernel_distantsegments_modular(fm_train_dna=traindat, fm_test_dna=testdat, delta=5, theta=5):
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Kernel import DistantSegmentsKernel

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    kernel = DistantSegmentsKernel(feats_train, feats_train, 10, delta, theta)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel

if __name__ == "__main__":
    print("DistantSegments")
    kernel_distantsegments_modular(*parameter_list[0])
#
# where \f$\Phi_k\f$ maps a sequence \f${\bf x}\f$ that consists of letters in
# \f$\Sigma\f$ to a feature vector of size \f$|\Sigma|^k\f$. In this feature
# vector each entry denotes how often the k-mer appears in that \f${\bf x}\f$.
#
# Note that this representation is especially tuned to small alphabets
# (like the 2-bit alphabet DNA), for which it enables spectrum kernels
# of order 8.
#
# For this kernel the linadd speedups are quite efficiently implemented using
# direct maps.
#

from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')
trainlabel = lm.load_labels('../data/label_train_dna.dat')

parameter_list = [[traindna, testdna, trainlabel, 10, 3, 0, 'n', False, 'FULL'],
                  [traindna, testdna, trainlabel, 11, 4, 0, 'n', False, 'FULL']]

def kernel_weightedcommwordstring(fm_train_dna=traindna, fm_test_dna=testdna, label_train_dna=trainlabel,
                                  size_cache=10, order=3, gap=0, reverse='n', use_sign=False, normalization='FULL'):
    sg('add_preproc', 'SORTWORDSTRING')

    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
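    # The example is truncated here; a minimal sketch of the usual continuation,
    # mirroring the commwordstring example in this collection but with the
    # WEIGHTEDCOMMSTRING kernel instead.
    sg('attach_preproc', 'TRAIN')

    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')

    sg('set_kernel', 'WEIGHTEDCOMMSTRING', 'WORD', size_cache, use_sign, normalization)

    km = sg('get_kernel_matrix', 'TRAIN')
    km = sg('get_kernel_matrix', 'TEST')
    return km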
# Two-class LibSVM classifier with a Gaussian kernel. This fragment was missing
# its function header and setup; the reconstruction below (imports, features,
# kernel, test labels) is an assumption based on the other LibSVM examples in
# this collection.
def libsvm():
    from numpy import mean, sign
    from shogun.Features import RealFeatures, Labels
    from shogun.Kernel import GaussianKernel
    from shogun.Classifier import LibSVM

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    width = 2.1
    kernel = GaussianKernel(feats_train, feats_train, width)
    testlab = label_train_twoclass  # assumption: the toy data reuses the training labels
    C = 1

    epsilon = 1e-5
    labels = Labels(label_train_twoclass)

    svm = LibSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    #kernel.init(feats_train, feats_test)
    output = svm.classify(feats_test)#.get_labels()
    #output_vector = output.get_labels()
    out = svm.classify().get_labels()
    testerr = mean(sign(out) != testlab)
    print(testerr)

    #sv_idx=svm.get_support_vectors()
    #alphas=svm.get_alphas()

    #pm = PerformanceMeasures(output_vector, output)
    #acc = pm.get_accuracy()
    #roc = pm.get_auROC()
    #fms = pm.get_fmeasure()

if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('/home/mati/lib/shogun-0.9.3/examples/documented/data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('/home/mati/lib/shogun-0.9.3/examples/documented/data/fm_test_real.dat')
    label_train_twoclass = lm.load_labels('/home/mati/lib/shogun-0.9.3/examples/documented/data/label_train_twoclass.dat')
    libsvm()
#!/usr/bin/env python

from tools.load import LoadMatrix
lm = LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')
label = lm.load_numbers('../data/label_train_twoclass.dat')

parameter_list = [[data, label]]

def features_io_modular(fm_train_real, label_train_twoclass):
    import numpy
    from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File

    feats = SparseRealFeatures(fm_train_real)
    feats2 = SparseRealFeatures()

    f = BinaryFile("tmp/fm_train_sparsereal.bin", "w")
    feats.save(f)

    f = LibSVMFile("tmp/fm_train_sparsereal.ascii", "w")
    feats.save(f)

    f = BinaryFile("tmp/fm_train_sparsereal.bin")
    feats2.load(f)

    f = LibSVMFile("tmp/fm_train_sparsereal.ascii")
    feats2.load(f)

    feats = RealFeatures(fm_train_real)
    feats2 = RealFeatures()
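    # The example is truncated here; a minimal sketch of how it would continue,
    # assuming the same save/load round trip for the dense features using the
    # already imported BinaryFile and CSVFile classes.
    f = BinaryFile("tmp/fm_train_real.bin", "w")
    feats.save(f)

    f = CSVFile("tmp/fm_train_real.ascii", "w")
    feats.save(f)

    f = BinaryFile("tmp/fm_train_real.bin")
    feats2.load(f)

    f = CSVFile("tmp/fm_train_real.ascii")
    feats2.load(f)

    return feats, feats2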
def bray_curtis_distance():
    print('BrayCurtisDistance')
    from sg import sg
    sg('set_distance', 'BRAYCURTIS', 'REAL')

    sg('set_features', 'TRAIN', fm_train_real)
    dm = sg('get_distance_matrix', 'TRAIN')

    sg('set_features', 'TEST', fm_test_real)
    dm = sg('get_distance_matrix', 'TEST')

if __name__ == '__main__':
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers('../data/fm_train_real.dat')
    fm_test_real = lm.load_numbers('../data/fm_test_real.dat')
    bray_curtis_distance()
def distance():
    print("Distance")
    width = 1.7
    size_cache = 10

    from sg import sg
    sg("set_features", "TRAIN", fm_train_real)
    sg("set_features", "TEST", fm_test_real)

    sg("set_distance", "EUCLIDIAN", "REAL")
    sg("set_kernel", "DISTANCE", size_cache, width)

    km = sg("get_kernel_matrix", "TRAIN")
    km = sg("get_kernel_matrix", "TEST")

if __name__ == "__main__":
    from tools.load import LoadMatrix
    lm = LoadMatrix()
    fm_train_real = lm.load_numbers("../data/fm_train_real.dat")
    fm_test_real = lm.load_numbers("../data/fm_test_real.dat")
    distance()
# In this example a two-class linear support vector machine classifier is trained
# on a toy data set and the trained classifier is used to predict labels of test
# examples. As training algorithm the Stochastic Gradient Descent (SGD) solver is
# used with the SVM regularization parameter C=0.9. The number of iterations, i.e.
# passes through all training examples, is set to num_iter=5.
#
# For more details on the SGD solver see
#  L. Bottou, O. Bousquet. The tradeoff of large scale learning. In NIPS 20. MIT
#  Press. 2008.

from tools.load import LoadMatrix
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
label_traindat = lm.load_labels('../data/label_train_twoclass.dat')

parameter_list = [[traindat, testdat, label_traindat, 0.9, 1, 6],
                  [traindat, testdat, label_traindat, 0.8, 1, 5]]

def classifier_svmsgd_modular(fm_train_real=traindat, fm_test_real=testdat,
                              label_train_twoclass=label_traindat, C=0.9, num_threads=1, num_iter=5):
    from shogun.Features import RealFeatures, SparseRealFeatures, Labels
    from shogun.Classifier import SVMSGD

    realfeat = RealFeatures(fm_train_real)
    feats_train = SparseRealFeatures()
    feats_train.obtain_from_simple(realfeat)
    realfeat = RealFeatures(fm_test_real)
    feats_test = SparseRealFeatures()
    feats_test.obtain_from_simple(realfeat)
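    # The example is truncated here; a minimal sketch of the usual ending,
    # assuming the SVMSGD constructor and set_epochs() setter of this shogun
    # generation: train on the sparse features, then classify the test set.
    labels = Labels(label_train_twoclass)

    svm = SVMSGD(C, feats_train, labels)
    svm.set_epochs(num_iter)
    svm.train()

    svm.set_features(feats_test)
    return svm.apply().get_labels()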
from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindat = lm.load_numbers('../data/fm_train_real.dat')
testdat = lm.load_numbers('../data/fm_test_real.dat')
train_label = lm.load_labels('../data/label_train_multiclass.dat')

parameter_list = [[traindat, testdat, train_label, 10, 2.1, 1.2, 1e-5, False],
                  [traindat, testdat, train_label, 10, 2.1, 1.3, 1e-4, False]]

def classifier_gmnpsvm(fm_train_real=traindat, fm_test_real=testdat, label_train_multiclass=train_label,
                       size_cache=10, width=2.1, C=1.2, epsilon=1e-5, use_bias=False):
    sg('set_features', 'TRAIN', fm_train_real)
    sg('set_kernel', 'GAUSSIAN', 'REAL', size_cache, width)
    sg('set_labels', 'TRAIN', label_train_multiclass)

    sg('new_classifier', 'GMNPSVM')
    sg('svm_epsilon', epsilon)
    sg('c', C)
    sg('svm_use_bias', use_bias)
    sg('train_classifier')

    sg('set_features', 'TEST', fm_test_real)
    result = sg('classify')
    kernel_matrix = sg('get_kernel_matrix', 'TEST')
    return result, kernel_matrix
from tools.load import LoadMatrix
lm = LoadMatrix()
data = lm.load_numbers('../data/fm_train_real.dat')

parameter_list = [[data, 0.01, 1.0], [data, 0.05, 2.0]]

def preprocessor_kernelpcacut_modular(data, threshold, width):
    from shogun.Features import RealFeatures
    from shogun.Preprocessor import KernelPCACut
    from shogun.Kernel import GaussianKernel

    features = RealFeatures(data)
    kernel = GaussianKernel(features, features, width)

    preprocessor = KernelPCACut(kernel, threshold)
    preprocessor.init(features)
    preprocessor.apply_to_feature_matrix(features)

    return features

if __name__ == '__main__':
    print('KernelPCACut')
    preprocessor_kernelpcacut_modular(*parameter_list[0])
#!/usr/bin/env python

from tools.load import LoadMatrix
lm = LoadMatrix()
train_dna = lm.load_dna("../data/fm_train_dna.dat")
test_dna = lm.load_dna("../data/fm_test_dna.dat")
label = lm.load_labels("../data/label_train_dna.dat")

parameter_list = [[train_dna, test_dna, label, 20, 0.9, 1e-3, 1],
                  [train_dna, test_dna, label, 20, 2.3, 1e-5, 4]]

def classifier_svmlight_batch_linadd_modular(fm_train_dna, fm_test_dna, label_train_dna,
                                             degree, C, epsilon, num_threads):
    from modshogun import StringCharFeatures, BinaryLabels, DNA
    from modshogun import WeightedDegreeStringKernel, MSG_DEBUG
    try:
        from modshogun import SVMLight
    except ImportError:
        print("No support for SVMLight available.")
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20
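    # The example is truncated here; a minimal sketch of the usual continuation,
    # assuming the batch/linadd toggles of this shogun generation: train
    # SVM^light with the Weighted Degree kernel, then classify the test strings
    # with batch computation and linadd enabled.
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    svm.set_batch_computation_enabled(True)
    svm.set_linadd_enabled(True)
    return svm.apply().get_labels()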
# This is an example for the initialization of the CommWordString kernel (aka
# Spectrum or n-gram kernel; its name is derived from the unix command comm). This kernel
# sums over k-mer matches (k='order'). For efficient computation a preprocessor is used
# that extracts and sorts all k-mers. If 'use_sign' is set to one, each k-mer is counted
# only once.

from tools.load import LoadMatrix
from sg import sg
lm = LoadMatrix()
traindna = lm.load_dna('../data/fm_train_dna.dat')
testdna = lm.load_dna('../data/fm_test_dna.dat')

parameter_list = [[traindna, testdna, 10, 3, 0, 'n', False, 'FULL'],
                  [traindna, testdna, 11, 4, 0, 'n', False, 'FULL']]

def kernel_commwordstring(fm_train_dna=traindna, fm_test_dna=testdna, size_cache=10,
                          order=3, gap=0, reverse='n', use_sign=False, normalization='FULL'):
    sg('add_preproc', 'SORTWORDSTRING')

    sg('set_features', 'TRAIN', fm_train_dna, 'DNA')
    sg('convert', 'TRAIN', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TRAIN')

    sg('set_features', 'TEST', fm_test_dna, 'DNA')
    sg('convert', 'TEST', 'STRING', 'CHAR', 'STRING', 'WORD', order, order-1, gap, reverse)
    sg('attach_preproc', 'TEST')

    sg('set_kernel', 'COMMSTRING', 'WORD', size_cache, use_sign, normalization)

    km = sg('get_kernel_matrix', 'TRAIN')
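    # The example is truncated here; a minimal sketch of the usual ending,
    # mirroring the other static-interface kernel examples in this collection.
    km = sg('get_kernel_matrix', 'TEST')
    return km

if __name__ == '__main__':
    print('CommWordString')
    kernel_commwordstring(*parameter_list[0])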