def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples): from modshogun import StringWordFeatures, StringCharFeatures, CUBE from modshogun import HMM, BW_NORMAL charfeat=StringCharFeatures(CUBE) charfeat.set_features(fm_cube) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=HMM(feats, N, M, pseudo) hmm.train() hmm.baum_welch_viterbi_train(BW_NORMAL) num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) best_path=0 best_path_state=0 for i in range(num_examples): best_path+=hmm.best_path(i) for j in range(N): best_path_state+=hmm.get_best_path_state(i, j) lik_example = hmm.get_log_likelihood() lik_sample = hmm.get_log_likelihood_sample() return lik_example, lik_sample, hmm
def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import LinearHMM charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) out_likelihood = hmm.get_log_likelihood() out_sample = hmm.get_log_likelihood_sample() return hmm,out_likelihood ,out_sample
def distribution_linearhmm_modular(fm_dna=traindna, order=3, gap=0, reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import LinearHMM charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) hmm = LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples = feats.get_num_vectors() num_param = hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) out_likelihood = hmm.get_log_likelihood() out_sample = hmm.get_log_likelihood_sample() return hmm, out_likelihood, out_sample
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,ppseudo_count=1,npseudo_count=1): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer from modshogun import PluginEstimate#, MSG_DEBUG charfeat=StringCharFeatures(DNA) #charfeat.io.set_loglevel(MSG_DEBUG) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, 0, False) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, 0, False) pie=PluginEstimate(ppseudo_count,npseudo_count) labels=BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=HistogramWordStringKernel(feats_train, feats_train, pie) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_salzberg_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, gap=0, reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import SalzbergWordStringKernel from modshogun import PluginEstimate charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) pie = PluginEstimate() labels = BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_poly_match_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, degree=2, inhomogene=True, order=3, gap=0, reverse=False): from modshogun import PolyMatchWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) kernel = PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_match_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, degree=3, scale=1.4, size_cache=10, order=3, gap=0, reverse=False): from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) kernel = MatchWordStringKernel(size_cache, degree) kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def get_kernel_mat(fm_train_dna, fm_test_dna, N, M, pseudo=1e-1,order=1,gap=0,reverse=False): # train HMM for positive class print "hmm training" charfeat=StringCharFeatures(fm_train_dna, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_train=StringWordFeatures(charfeat.get_alphabet()) hmm_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) neg = HMM(pos) print "Kernel training data" charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) print "Kernel testing data" charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) print "get kernel on training data" pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior print 'getting feature matrix' v0 = feats_train.get_feature_vector(0) v1 = feats_train.get_feature_vector(1) print np.dot(v0, v1) kernel=LinearKernel(feats_train, feats_train) #kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() print km_train.shape, km_train[0, 1] print "get kernel on testing data" pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import HistogramWordStringKernel from modshogun import PluginEstimate#, MSG_DEBUG reverse = reverse charfeat=StringCharFeatures(DNA) #charfeat.io.set_loglevel(MSG_DEBUG) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) pie=PluginEstimate() labels=BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=HistogramWordStringKernel(feats_train, feats_train, pie) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, PROTEIN) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def distribution_histogram_modular(fm_dna=traindna, order=3, gap=0, reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import Histogram charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) histo = Histogram(feats) histo.train() histo.get_histogram() num_examples = feats.get_num_vectors() num_param = histo.get_num_model_parameters() # for i in xrange(num_examples): # for j in xrange(num_param): # histo.get_log_derivative(j, i) out_likelihood = histo.get_log_likelihood() out_sample = histo.get_log_likelihood_sample() return histo, out_sample, out_likelihood
def distribution_histogram_modular (fm_dna=traindna,order=3,gap=0,reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import Histogram charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) histo=Histogram(feats) histo.train() histo.get_histogram() num_examples=feats.get_num_vectors() num_param=histo.get_num_model_parameters() #for i in xrange(num_examples): # for j in xrange(num_param): # histo.get_log_derivative(j, i) out_likelihood = histo.get_log_likelihood() out_sample = histo.get_log_likelihood_sample() return histo,out_sample,out_likelihood
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, DNA) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def features_string_word_modular (strings, start, order, gap, rev): from modshogun import StringCharFeatures, StringWordFeatures, RAWBYTE from numpy import array, uint16 #create string features cf=StringCharFeatures(strings, RAWBYTE) wf=StringWordFeatures(RAWBYTE) wf.obtain_from_char(cf, start, order, gap, rev) #and output several stats #print("max string length", wf.get_max_vector_length()) #print("number of strings", wf.get_num_vectors()) #print("length of first string", wf.get_vector_length(0)) #print("string[2]", wf.get_feature_vector(2)) #print("strings", wf.get_features()) #replace string 0 wf.set_feature_vector(array([1,2,3,4,5], dtype=uint16), 0) #print("strings", wf.get_features()) return wf.get_features(), wf
def kernel_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, degree=3,scale=1.4,size_cache=10,order=3,gap=0,reverse=False): from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) kernel=MatchWordStringKernel(size_cache, degree) kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_poly_match_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, degree=2,inhomogene=True,order=3,gap=0,reverse=False): from modshogun import PolyMatchWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) kernel=PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_top_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1, order=1,gap=0,reverse=False,kargs=[1, False, True]): from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL N=1 # toy HMM with 1 state M=4 # 4 observations -> DNA # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=TOPFeatures(10, pos, neg, False, False) kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False) kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ): from modshogun import WeightedCommWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import SortWordString charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() use_sign=False kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse): import gc from modshogun import Alphabet, StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString, MSG_DEBUG from modshogun import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS = [ num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT' ] NEG = [ num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT' ] for i in range(10): alpha = Alphabet(DNA) traindat = StringCharFeatures(alpha) traindat.set_features(POS + NEG) trainudat = StringWordFeatures(traindat.get_alphabet()) trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preprocessor(pre) trainudat.apply_preprocessor() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K = spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K
def kernel_fisher_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, N=1, M=4, pseudo=1e-1, order=1, gap=0, reverse=False, kargs=[1, False, True]): from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL #, MSG_DEBUG # train HMM for positive class charfeat = StringCharFeatures(fm_hmm_pos, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_pos_train = StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) pos = HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat = StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train = StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) neg = HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat = StringCharFeatures(fm_train_dna, DNA) wordfeats_train = StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) # Kernel testing data charfeat = StringCharFeatures(fm_test_dna, DNA) wordfeats_test = StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train = FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior kernel = PolyKernel(feats_train, feats_train, *kargs) km_train = kernel.get_kernel_matrix() # get kernel on testing data pos_clone = HMM(pos) neg_clone = HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test = FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def runShogunSVMDNACombinedSpectrumKernel(train_xt, train_lt, test_xt): """ run svm with combined spectrum kernel """ ################################################## # set up svm kernel=CombinedKernel() feats_train=CombinedFeatures() feats_test=CombinedFeatures() for K in KList: # Iterate through the K's and make a spectrum kernel for each charfeat_train = StringCharFeatures(train_xt, DNA) current_feats_train = StringWordFeatures(DNA) current_feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(current_feats_train) current_feats_train.add_preprocessor(preproc) current_feats_train.apply_preprocessor() feats_train.append_feature_obj(current_feats_train) charfeat_test = StringCharFeatures(test_xt, DNA) current_feats_test=StringWordFeatures(DNA) current_feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) current_feats_test.add_preprocessor(preproc) current_feats_test.apply_preprocessor() feats_test.append_feature_obj(current_feats_test) current_kernel=CommWordStringKernel(10, False) kernel.append_kernel(current_kernel) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" kernel.init(feats_train, feats_train) svm=LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1=out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2=out2DecisionValues.get_labels() return out1,out2,out1DecisionValues,out2DecisionValues
def distance_manhattenword_modular (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString, ManhattanWordDistance, CSVFile charfeat=StringCharFeatures(CSVFile(train_fname), DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(CSVFile(test_fname), DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=ManhattanWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return dm_train,dm_test
def runShogunSVRSpectrumKernel(train_xt, train_lt, test_xt, svm_c=1): """ serialize svr with spectrum kernels """ ################################################## # set up svr charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K - 1, K, GAP, False) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K - 1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel = CommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = RegressionLabels(train_lt) # two svr models: epsilon and nu print "Ready to train!" svr_epsilon = LibSVR(svm_c, SVRPARAM, kernel, labels, LIBSVR_EPSILON_SVR) svr_epsilon.io.set_loglevel(MSG_DEBUG) svr_epsilon.train() # predictions print "Making predictions!" out1_epsilon = svr_epsilon.apply(feats_train).get_labels() kernel.init(feats_train, feats_test) out2_epsilon = svr_epsilon.apply(feats_test).get_labels() return out1_epsilon, out2_epsilon, kernel
def runShogunSVMMultipleKernels(train_xt, train_lt, test_xt): """ Run SVM with Multiple Kernels """ ################################################## # Take all examples idxs = np.random.randint(1, 14000, 14000) train_xt = np.array(train_xt)[idxs] train_lt = np.array(train_lt)[idxs] # Initialize kernel and features kernel = CombinedKernel() feats_train = CombinedFeatures() feats_test = CombinedFeatures() labels = BinaryLabels(train_lt) ##################### Multiple Spectrum Kernels ######################### for i in range(K1, K2, -1): # append training data to combined feature object charfeat_train = StringCharFeatures(list(train_xt), DNA) feats_train_k1 = StringWordFeatures(DNA) feats_train_k1.obtain_from_char(charfeat_train, i - 1, i, GAP, False) preproc = SortWordString() preproc.init(feats_train_k1) feats_train_k1.add_preprocessor(preproc) feats_train_k1.apply_preprocessor() # append testing data to combined feature object charfeat_test = StringCharFeatures(test_xt, DNA) feats_test_k1 = StringWordFeatures(DNA) feats_test_k1.obtain_from_char(charfeat_test, i - 1, i, GAP, False) feats_test_k1.add_preprocessor(preproc) feats_test_k1.apply_preprocessor() # append features feats_train.append_feature_obj(charfeat_train) feats_test.append_feature_obj(charfeat_test) # append spectrum kernel kernel1 = CommWordStringKernel(10, i) kernel1.io.set_loglevel(MSG_DEBUG) kernel.append_kernel(kernel1) ''' Uncomment this for Multiple Weighted degree kernels and comment the multiple spectrum kernel block above instead ##################### Multiple Weighted Degree Kernel ######################### for i in range(K1,K2,-1): # append training data to combined feature object charfeat_train = StringCharFeatures(list(train_xt), DNA) # append testing data to combined feature object charfeat_test = StringCharFeatures(test_xt, DNA) # append features feats_train.append_feature_obj(charfeat_train); feats_test.append_feature_obj(charfeat_test); # setup weighted degree kernel kernel1=WeightedDegreePositionStringKernel(10,i); kernel1.io.set_loglevel(MSG_DEBUG); kernel1.set_shifts(SHIFT*np.ones(len(train_xt[0]), dtype=np.int32)) kernel1.set_position_weights(np.ones(len(train_xt[0]), dtype=np.float64)); kernel.append_kernel(kernel1); ''' ##################### Training ######################### print "Starting MKL training.." mkl = MKLClassification() mkl.set_mkl_norm(3) #1,2,3 mkl.set_C(SVMC, SVMC) mkl.set_kernel(kernel) mkl.set_labels(labels) mkl.train(feats_train) print "Making predictions!" out1 = mkl.apply(feats_train).get_labels() out2 = mkl.apply(feats_test).get_labels() return out1, out2, train_lt
def get_feature_mat(fm_train_dna, fm_test_dna, N, M, pseudo=1e-1,order=1,gap=0,reverse=False): # train HMM for positive class print "hmm training" charfeat=StringCharFeatures(fm_train_dna, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_train=StringWordFeatures(charfeat.get_alphabet()) hmm_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) neg = HMM(pos) print "Kernel training data" charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) print "Kernel testing data" charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) print "get kernel on training data" pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior print 'getting feature train' train_featmat = [] for i in range(len(fm_train_dna)): train_featmat.append(feats_train.get_computed_dot_feature_vector(i)) train_featmat = np.array(train_featmat) print "get feature on testing" pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data test_featmat = [] for i in range(len(fm_test_dna)): test_featmat.append(feats_test.get_feature_vector(i)) test_featmat = np.array(test_featmat) return train_featmat, test_featmat
def runShogunSVMDNAWeightedCommonWordKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svm charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=WeightedCommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm=LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1=svm.apply(feats_train).get_labels() kernel.init(feats_train, feats_test) out2=svm.apply(feats_test).get_labels() return out1,out2
def runShogunOneClassSVMDNASpectrumKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svr charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm=LibSVMOneClass(SVMC, kernel) svm.set_epsilon(EPSILON) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1=out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2=out2DecisionValues.get_labels() # predictions = svm.apply(feats_test) # return predictions, svm, predictions.get_labels() return out1,out2,out1DecisionValues,out2DecisionValues
def runShogunSVMProteinPolyMatchSpectrumKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svm charfeat_train = StringCharFeatures(train_xt, PROTEIN) feats_train = StringWordFeatures(PROTEIN) feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, PROTEIN) feats_test=StringWordFeatures(PROTEIN) feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=PolyMatchWordStringKernel(feats_train, feats_train, 2, True) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm=LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1=out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2=out2DecisionValues.get_labels() return out1,out2,out1DecisionValues,out2DecisionValues
def tests_check_commwordkernel_memleak_modular (num, order, gap, reverse): import gc from modshogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA from modshogun import SortWordString, MSG_DEBUG from modshogun import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] for i in range(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preprocessor(pre) trainudat.apply_preprocessor() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K
def kernel_fisher_modular (fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False, kargs=[1,False,True]): from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL#, MSG_DEBUG # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def runShogunSVMMultipleKernels(train_xt, train_lt, test_xt): """ Run SVM with Multiple Kernels """ ################################################## # Take all examples idxs = np.random.randint(1,14000,14000); train_xt = np.array(train_xt)[idxs]; train_lt = np.array(train_lt)[idxs]; # Initialize kernel and features kernel=CombinedKernel() feats_train=CombinedFeatures() feats_test=CombinedFeatures() labels = BinaryLabels(train_lt) ##################### Multiple Spectrum Kernels ######################### for i in range(K1,K2,-1): # append training data to combined feature object charfeat_train = StringCharFeatures(list(train_xt), DNA) feats_train_k1 = StringWordFeatures(DNA) feats_train_k1.obtain_from_char(charfeat_train, i-1, i, GAP, False) preproc=SortWordString() preproc.init(feats_train_k1) feats_train_k1.add_preprocessor(preproc) feats_train_k1.apply_preprocessor() # append testing data to combined feature object charfeat_test = StringCharFeatures(test_xt, DNA) feats_test_k1=StringWordFeatures(DNA) feats_test_k1.obtain_from_char(charfeat_test, i-1, i, GAP, False) feats_test_k1.add_preprocessor(preproc) feats_test_k1.apply_preprocessor() # append features feats_train.append_feature_obj(charfeat_train); feats_test.append_feature_obj(charfeat_test); # append spectrum kernel kernel1=CommWordStringKernel(10,i); kernel1.io.set_loglevel(MSG_DEBUG); kernel.append_kernel(kernel1); ''' Uncomment this for Multiple Weighted degree kernels and comment the multiple spectrum kernel block above instead ##################### Multiple Weighted Degree Kernel ######################### for i in range(K1,K2,-1): # append training data to combined feature object charfeat_train = StringCharFeatures(list(train_xt), DNA) # append testing data to combined feature object charfeat_test = StringCharFeatures(test_xt, DNA) # append features feats_train.append_feature_obj(charfeat_train); feats_test.append_feature_obj(charfeat_test); # setup weighted degree kernel kernel1=WeightedDegreePositionStringKernel(10,i); kernel1.io.set_loglevel(MSG_DEBUG); kernel1.set_shifts(SHIFT*np.ones(len(train_xt[0]), dtype=np.int32)) kernel1.set_position_weights(np.ones(len(train_xt[0]), dtype=np.float64)); kernel.append_kernel(kernel1); ''' ##################### Training ######################### print "Starting MKL training.." mkl = MKLClassification(); mkl.set_mkl_norm(3) #1,2,3 mkl.set_C(SVMC, SVMC) mkl.set_kernel(kernel) mkl.set_labels(labels) mkl.train(feats_train) print "Making predictions!" out1 = mkl.apply(feats_train).get_labels(); out2 = mkl.apply(feats_test).get_labels(); return out1,out2,train_lt
def runShogunSVMDNASpectrumKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svr charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K - 1, K, GAP, False) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K - 1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel = CommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm = LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1 = out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2 = out2DecisionValues.get_labels() return out1, out2, out1DecisionValues, out2DecisionValues
def distance_manhattenword_modular(train_fname=traindna, test_fname=testdna, order=3, gap=0, reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString, ManhattanWordDistance, CSVFile charfeat = StringCharFeatures(CSVFile(train_fname), DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(CSVFile(test_fname), DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance = ManhattanWordDistance(feats_train, feats_train) dm_train = distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test = distance.get_distance_matrix() return dm_train, dm_test
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString from modshogun import CanberraWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=CanberraWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString from modshogun import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=HammingWordDistance(feats_train, feats_train, use_sign) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def runShogunSVRSpectrumKernel(train_xt, train_lt, test_xt, svm_c=1): """ serialize svr with spectrum kernels """ ################################################## # set up svr charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = RegressionLabels(train_lt) # two svr models: epsilon and nu print "Ready to train!" svr_epsilon=LibSVR(svm_c, SVRPARAM, kernel, labels, LIBSVR_EPSILON_SVR) svr_epsilon.io.set_loglevel(MSG_DEBUG) svr_epsilon.train() # predictions print "Making predictions!" out1_epsilon=svr_epsilon.apply(feats_train).get_labels() kernel.init(feats_train, feats_test) out2_epsilon=svr_epsilon.apply(feats_test).get_labels() return out1_epsilon,out2_epsilon,kernel
def kernel_top_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, pseudo=1e-1, order=1, gap=0, reverse=False, kargs=[1, False, True]): from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL N = 1 # toy HMM with 1 state M = 4 # 4 observations -> DNA # train HMM for positive class charfeat = StringCharFeatures(fm_hmm_pos, DNA) hmm_pos_train = StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) pos = HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat = StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train = StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) neg = HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat = StringCharFeatures(fm_train_dna, DNA) wordfeats_train = StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) # Kernel testing data charfeat = StringCharFeatures(fm_test_dna, DNA) wordfeats_test = StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train = TOPFeatures(10, pos, neg, False, False) kernel = PolyKernel(feats_train, feats_train, *kargs) km_train = kernel.get_kernel_matrix() # get kernel on testing data pos_clone = HMM(pos) neg_clone = HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False) kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def preprocessor_sortwordstring_modular(fm_train_dna=traindna, fm_test_dna=testdna, order=3, gap=0, reverse=False, use_sign=False): from modshogun import CommWordStringKernel from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel = CommWordStringKernel(feats_train, feats_train, use_sign) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def get_kernel_mat(fm_train_dna, fm_test_dna, N, M, pseudo=1e-1, order=1, gap=0, reverse=False): # train HMM for positive class print "hmm training" charfeat = StringCharFeatures(fm_train_dna, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_train = StringWordFeatures(charfeat.get_alphabet()) hmm_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) pos = HMM(hmm_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) neg = HMM(pos) print "Kernel training data" charfeat = StringCharFeatures(fm_train_dna, DNA) wordfeats_train = StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) print "Kernel testing data" charfeat = StringCharFeatures(fm_test_dna, DNA) wordfeats_test = StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) print "get kernel on training data" pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train = FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior print 'getting feature matrix' v0 = feats_train.get_feature_vector(0) v1 = feats_train.get_feature_vector(1) print np.dot(v0, v1) kernel = LinearKernel(feats_train, feats_train) #kernel=PolyKernel(feats_train, feats_train, *kargs) km_train = kernel.get_kernel_matrix() print km_train.shape, km_train[0, 1] print "get kernel on testing data" pos_clone = HMM(pos) neg_clone = HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test = FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel