def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False):
    """Train a LinearHMM on k-mer word features of DNA strings.

    Returns (hmm, log_likelihood, log_likelihood_sample).
    """
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.Distribution import LinearHMM

    # char strings -> order-k word features
    raw = StringCharFeatures(DNA)
    raw.set_features(fm_dna)
    words = StringWordFeatures(raw.get_alphabet())
    words.obtain_from_char(raw, order-1, order, gap, reverse)

    model = LinearHMM(words)
    model.train()
    model.get_transition_probs()

    # exercise the log-derivative for every (parameter, example) pair
    n_examples = words.get_num_vectors()
    n_params = model.get_num_model_parameters()
    for ex in range(n_examples):
        for p in range(n_params):
            model.get_log_derivative(p, ex)

    out_likelihood = model.get_log_likelihood()
    out_sample = model.get_log_likelihood_sample()
    return model, out_likelihood, out_sample
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ):
    """Compute WeightedCommWordString kernel matrices for train/test DNA.

    Returns (km_train, km_test, kernel).
    """
    from shogun.Kernel import WeightedCommWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.Preprocessor import SortWordString

    # training pipeline: chars -> words -> sorted words
    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    # test pipeline reuses the sorter initialized on the training words
    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order-1, order, gap, reverse)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    use_sign = False
    kernel = WeightedCommWordStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def histogram (): print 'Histogram' from shogun.Features import StringWordFeatures, StringCharFeatures, DNA from shogun.Distribution import Histogram order=3 gap=0 reverse=False charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) histo=Histogram(feats) histo.train() histo.get_histogram() num_examples=feats.get_num_vectors() num_param=histo.get_num_model_parameters() #for i in xrange(num_examples): # for j in xrange(num_param): # histo.get_log_derivative(j, i) histo.get_log_likelihood() histo.get_log_likelihood_sample()
def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat, order=3,gap=0,reverse=False):
    """SalzbergWordString kernel driven by a trained PluginEstimate.

    Returns (km_train, km_test, kernel).
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
    from shogun.Kernel import SalzbergWordStringKernel
    from shogun.Classifier import PluginEstimate

    # word features for train and test sets
    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)

    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order-1, order, gap, reverse)

    # plugin estimate trained on the labelled training words
    pie = PluginEstimate()
    labels = Labels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.classify().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Train an HMM on CUBE word features and probe its outputs.

    Returns (lik_example, lik_sample, hmm).
    NOTE(review): the num_examples argument is immediately overwritten by
    feats.get_num_vectors() below and therefore has no effect — kept for
    interface compatibility.
    """
    from shogun.Features import StringWordFeatures, StringCharFeatures, CUBE
    from shogun.Distribution import HMM, BW_NORMAL

    chars = StringCharFeatures(CUBE)
    chars.set_features(fm_cube)
    feats = StringWordFeatures(chars.get_alphabet())
    feats.obtain_from_char(chars, order-1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    # touch every log-derivative once
    for ex in xrange(num_examples):
        for p in xrange(num_param):
            hmm.get_log_derivative(p, ex)

    # accumulate Viterbi paths and per-state best-path values
    best_path = 0
    best_path_state = 0
    for ex in xrange(num_examples):
        best_path += hmm.best_path(ex)
        for state in xrange(N):
            best_path_state += hmm.get_best_path_state(ex, state)

    lik_example = hmm.get_log_likelihood()
    lik_sample = hmm.get_log_likelihood_sample()
    return lik_example, lik_sample, hmm
def create_promoter_features(data, param):
    """Create combined promoter features.

    @param data: promoter sequences, split into center/left/right windows
    @param param: dict providing "center_offset" and "center_pos" used for the split
    @return: CombinedFeatures with a char feature for the center and
             spectrum features for the left and right flanks
    """
    print "creating promoter features"

    # split each sequence into center window and flanking regions
    (center, left, right) = split_data_promoter(data, param["center_offset"], param["center_pos"])

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
def create_hashed_features_spectrum(param, data):
    """Build ImplicitWeightedSpecFeatures for the spectrum kernel."""
    # fixed pipeline settings; only the k-mer order comes from param
    order = param["degree_spectrum"]
    gap = 0
    reverse = True
    normalize = True

    # chars -> words of length `order`
    chars = StringCharFeatures(data, DNA)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order-1, order, gap, reverse)

    # sort words in place (old-style preproc API)
    sorter = SortWordString()
    sorter.init(words)
    words.add_preproc(sorter)
    words.apply_preproc()

    return ImplicitWeightedSpecFeatures(words, normalize)
def get_kernel_matrix(li):
    """Return the CommUlongString kernel matrix for a list of strings."""
    order = 6
    gap = 2
    reverse = False

    chars = StringCharFeatures(RAWBYTE)
    chars.set_features(li)

    # ulong word features (CommUlongStringKernel needs sorted features)
    feats_train = StringUlongFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    use_sign = False
    # kernel between training features and themselves
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    return kernel.get_kernel_matrix()
def linear_hmm (): print 'LinearHMM' from shogun.Features import StringWordFeatures, StringCharFeatures, DNA from shogun.Distribution import LinearHMM order=3 gap=0 reverse=False charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in xrange(num_examples): for j in xrange(num_param): hmm.get_log_derivative(j, i) hmm.get_log_likelihood() hmm.get_log_likelihood_sample()
def sort_word_string (): print 'CommWordString' from shogun.Kernel import CommWordStringKernel from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString order=3 gap=0 reverse=False charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() use_sign=False kernel=CommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix()
def get_predictions_from_seqdict(self, seqdic, site):
    """Score all candidate positions of every seqdict sequence with the SVM.

    Builds one big test-feature object per sequence containing all its
    candidate locations, so the splice outputs can be computed fast and
    with low memory, then writes the scores back into s.preds[site].
    """
    # window length of each extracted candidate substring
    seqlen=self.window_right+self.window_left+2

    for s in seqdic:
        # collect window start positions for this sequence
        position_list=DynamicIntArray()
        sequence=s.seq
        positions=s.preds[site].positions
        for j in xrange(len(positions)):
            # shift from prediction coordinates into window-start coordinates
            i=positions[j] - self.offset -self.window_left
            position_list.append_element(i)

        # one feature object holding every window of this sequence
        t=StringCharFeatures([sequence], DNA)
        t.obtain_by_position_list(seqlen, position_list)

        # kernel must be (re)initialized with train vs. test features
        # before apply(); cleanup() afterwards releases kernel caches
        self.wd_kernel.init(self.traindat, t)
        self.wd_kernel.io.enable_progress()
        l=self.svm.apply().get_values()
        self.wd_kernel.cleanup()
        sys.stdout.write("\n...done...\n")

        # copy SVM outputs back onto the prediction object
        num=len(s.preds[site].positions)
        scores= num * [0]
        for j in xrange(num):
            scores[j]=l[j]
        s.preds[site].set_scores(scores)
def init_sensor(self, kernel, svs):
    """Set up the kernel and training features for this sensor.

    @param kernel: dict describing the kernel; 'name' is 'spectrum' or
                   'wdshift', plus kernel-specific keys ('order', 'shift')
    @param svs: support-vector sequences (list of strings)
    @return: (kernel, train_features) tuple; also stored on self
    @raise NotImplementedError: for unsupported kernel names
    """
    f = StringCharFeatures(svs, DNA)
    kname = kernel['name']
    if kname == 'spectrum':
        # spectrum kernel: chars -> sorted word features
        wf = StringWordFeatures(f.get_alphabet())
        wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)
        pre = SortWordString()
        pre.init(wf)
        wf.add_preprocessor(pre)
        wf.apply_preprocessor()
        f = wf

        k = CommWordStringKernel(0, False)
        # dict-diagonal optimization only pays off for small orders
        k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
        self.preproc = pre
    elif kname == 'wdshift':
        k = WeightedDegreePositionStringKernel(0, kernel['order'])
        k.set_normalizer(IdentityKernelNormalizer())
        # uniform shifts and position weights over the full sequence length
        k.set_shifts(kernel['shift'] *
                     numpy.ones(f.get_max_vector_length(), dtype=numpy.int32))
        k.set_position_weights(1.0 / f.get_max_vector_length() *
                               numpy.ones(f.get_max_vector_length(), dtype=numpy.float64))
    else:
        # BUG FIX: `raise "..."` (string exception) is a TypeError since
        # Python 2.6 — raise a real exception type with the same message.
        raise NotImplementedError("Currently, only wdshift and spectrum kernels supported")

    self.kernel = k
    self.train_features = f
    return (self.kernel, self.train_features)
def preproc_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
    """CommWordString kernel on SortWordString-preprocessed word features.

    Returns (km_train, km_test, kernel).
    """
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString

    # build and sort the training word features
    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # test features share the sorter fitted on training data
    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order-1, order, gap, reverse)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    kernel = CommWordStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def plugin_estimate_salzberg (): print 'PluginEstimate w/ SalzbergWord' from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels from shogun.Kernel import SalzbergWordStringKernel from shogun.Classifier import PluginEstimate order=3 gap=0 reverse=False charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) pie=PluginEstimate() labels=Labels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=SalzbergWordStringKernel(feats_train, feats_test, pie, labels) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.classify().get_labels() km_test=kernel.get_kernel_matrix()
def get_wd_features(data, feat_type="dna"):
    """Create a StringCharFeatures object for the WD kernel.

    feat_type selects the alphabet: "dna" or "protein".
    """
    # map feature type to the matching shogun alphabet
    alphabets = {"dna": DNA, "protein": PROTEIN}
    if feat_type not in alphabets:
        raise Exception("unknown feature type")

    feat = StringCharFeatures(alphabets[feat_type])
    feat.set_features(data)
    return feat
def kernel_fisher_modular(
    fm_train_dna=traindat,
    fm_test_dna=testdat,
    label_train_dna=label_traindat,
    N=1,
    M=4,
    pseudo=1e-1,
    order=1,
    gap=0,
    reverse=False,
    kargs=[1, False, True],  # NOTE(review): mutable default; never mutated here
):
    """Fisher kernel: PolyKernel on FKFeatures from two class HMMs.

    Trains one HMM per class (positive/negative), wraps them in
    FKFeatures and computes Poly kernel matrices on train and test data.
    Returns (km_train, km_test, kernel). Relies on module-level
    fm_hmm_pos / fm_hmm_neg for the HMM training sequences.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL  # , MSG_DEBUG

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    # charfeat.io.set_loglevel(MSG_DEBUG)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data: both HMMs must observe the training
    # words BEFORE FKFeatures is constructed from them
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1)  # estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data: clone the HMMs so the training-side
    # observations stay intact
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a())  # use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def manhattan_word_distance (): print 'ManhattanWordDistance' from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString from shogun.Distance import ManhattanWordDistance order=3 gap=0 reverse=False charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() distance=ManhattanWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix()
def distribution_ppwm_modular (fm_dna=traindna, order=3):
    """Positional PWM demo: compute w and scoring matrices.

    Returns (w, u).
    """
    from shogun.Features import StringByteFeatures, StringCharFeatures, DNA
    from shogun.Distribution import PositionalPWM
    from numpy import array, e, log, exp

    # byte word features from the DNA strings (unused by the PWM below,
    # kept to mirror the original example flow)
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    feats = StringByteFeatures(chars.get_alphabet())
    feats.obtain_from_char(chars, order-1, order, 0, False)

    L = 20
    k = 3
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)

    # NOTE(review): the first pwm assignment is dead — it is immediately
    # overwritten by the 4x3 matrix that is actually used.
    pwm = array([[0.0, 0.5, 0.1, 1.0],
                 [0.0, 0.5, 0.5, 0.0],
                 [1.0, 0.0, 0.4, 0.0],
                 [0.0, 0.0, 0.0, 0.0]])
    pwm = array([[0.01, 0.09, 0.1],
                 [0.09, 0.01, 0.1],
                 [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])

    ppwm.set_pwm(log(pwm))
    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)
    return w, u
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """Build the sorted word features used by the spectrum kernel."""
    chars = StringCharFeatures(data, DNA)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order-1, order, gap, reverse)

    # the spectrum kernel requires sorted word features
    sorter = SortWordString()
    sorter.init(words)
    words.add_preprocessor(sorter)
    words.apply_preprocessor()
    return words
def kernel_comm_ulong_string_modular (fm_train_dna=traindat,fm_test_dna=testdat, order=3, gap=0, reverse = False):
    """CommUlongString kernel matrices on sorted ulong word features.

    Returns (km_train, km_test, kernel).
    """
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
    from shogun.Preprocessor import SortUlongString

    # training side: chars -> ulong words -> sorted
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_train_dna)
    feats_train = StringUlongFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # test side with the same sorter
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringUlongFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order-1, order, gap, reverse)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    use_sign = False
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,gap=0,reverse=False):
    """HistogramWordString kernel driven by a trained PluginEstimate.

    Returns (km_train, km_test, kernel).
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA, Labels
    from shogun.Kernel import HistogramWordStringKernel
    from shogun.Classifier import PluginEstimate#, MSG_DEBUG

    # training word features
    chars = StringCharFeatures(DNA)
    #chars.io.set_loglevel(MSG_DEBUG)
    chars.set_features(fm_train_dna)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)

    # test word features
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order-1, order, gap, reverse)

    # plugin estimate trained on the labelled training words
    pie = PluginEstimate()
    labels = Labels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False):
    """HammingWordDistance matrices on sorted DNA word features.

    Returns (distance, dm_train, dm_test). fm_test_real is unused but
    kept for interface compatibility.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString
    from shogun.Distance import HammingWordDistance

    # training words, sorted in place
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_train_dna)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order-1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    # test words share the fitted sorter
    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order-1, order, gap, reverse)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    distance = HammingWordDistance(feats_train, feats_train, use_sign)
    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
def perform_clustering(mss_id):
    """Hierarchically cluster the organisms of a MultiSplitSet.

    Uses a HammingWordDistance over sorted protein word features.
    Returns the trained Hierarchical clustering object.
    """
    import numpy
    import expenv

    mss = expenv.MultiSplitSet.get(mss_id)

    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString

    order, gap, reverse = 1, 0, False

    # one sequence per split-set organism
    seq_handler = SequencesHandler()
    data = [seq_handler.get_seq(ss.dataset.organism) for ss in mss.split_sets]

    # protein chars -> sorted word features
    chars = StringCharFeatures(PROTEIN)
    chars.set_features(data)
    feats = StringWordFeatures(chars.get_alphabet())
    feats.obtain_from_char(chars, order-1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(feats)
    feats.add_preproc(sorter)
    feats.apply_preproc()

    use_sign = False
    distance = HammingWordDistance(feats, feats, use_sign)
    #distance = EuclidianDistance()

    merges = 4
    hierarchical = Hierarchical(merges, distance)
    hierarchical.train()
    hierarchical.get_merge_distances()
    hierarchical.get_cluster_pairs()
    return hierarchical
def kernel_top_modular( fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, pseudo=1e-1, order=1, gap=0, reverse=False, kargs=[1, False, True], ):
    """TOP kernel: PolyKernel on TOPFeatures built from two class HMMs.

    Returns (km_train, km_test, kernel). Relies on module-level
    fm_hmm_pos / fm_hmm_neg for the per-class HMM training sequences.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    N = 1  # toy HMM with 1 state
    M = 4  # 4 observations -> DNA

    def _words(strings):
        # chars -> word features of length `order`
        cf = StringCharFeatures(strings, DNA)
        wf = StringWordFeatures(cf.get_alphabet())
        wf.obtain_from_char(cf, order - 1, order, gap, reverse)
        return wf

    # one HMM per class
    pos = HMM(_words(fm_hmm_pos), N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)
    neg = HMM(_words(fm_hmm_neg), N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    wordfeats_train = _words(fm_train_dna)
    wordfeats_test = _words(fm_test_dna)

    # training kernel: HMMs observe the training words first
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # test kernel: clone HMMs so training observations stay intact
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def fisher ():
    """Fisher kernel demo: PolyKernel on FKFeatures from two class HMMs.

    Trains a positive and a negative HMM on module-level fm_hmm_pos /
    fm_hmm_neg, then computes Poly kernel matrices over FKFeatures for
    the module-level fm_train_dna / fm_test_dna data.
    """
    print "Fisher Kernel"
    from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    N=1 # toy HMM with 1 state
    M=4 # 4 observations -> DNA
    pseudo=1e-1
    order=1
    gap=0
    reverse=False
    kargs=[1, False, True]

    # train HMM for positive class
    charfeat=StringCharFeatures(fm_hmm_pos, DNA)
    hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    pos=HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat=StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    neg=HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat=StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # Kernel testing data
    charfeat=StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # get kernel on training data: HMMs must observe the training words
    # before FKFeatures is built from them
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train=FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1) #estimate prior
    kernel=PolyKernel(feats_train, feats_train, *kargs)
    km_train=kernel.get_kernel_matrix()

    # get kernel on testing data: clone the HMMs so training-side
    # observations remain intact
    pos_clone=HMM(pos)
    neg_clone=HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test=FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a()) #use prior from training data
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
def features_string_char_modular(strings):
    """Build StringCharFeatures over RAWBYTE and overwrite string 0.

    Returns (features_as_list, feature_object).
    """
    from shogun.Features import StringCharFeatures, RAWBYTE
    from numpy import array

    # create string features over the raw byte alphabet
    feats = StringCharFeatures(strings, RAWBYTE)

    # replace string 0 with a fixed 4-char string
    feats.set_feature_vector(array(["t", "e", "s", "t"]), 0)

    return feats.get_features(), feats
def distribution_ppwm_modular(fm_dna=traindna, order=3): from shogun.Features import StringByteFeatures, StringCharFeatures, DNA from shogun.Distribution import PositionalPWM from numpy import array, e, log, exp charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringByteFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, 0, False) ppwm = PositionalPWM() ppwm.set_sigma(5.0) ppwm.set_mean(10.0) pwm = array([[0.0, 0.5, 0.1, 1.0], [0.0, 0.5, 0.5, 0.0], [1.0, 0.0, 0.4, 0.0], [0.0, 0.0, 0.0, 0.0]]) ppwm.set_pwm(log(pwm)) print ppwm.get_pwm() ppwm.compute_w(20) w = ppwm.get_w()
def get_predictions(self, sequence, positions):
    """Classify windows around the given positions with the WD-kernel SVM.

    Returns the label values produced by svm.classify().
    """
    # window length (computed in the original; not used further here)
    seqlen = self.window_right + self.window_left + 2

    # cut one window per candidate position
    testdat = []
    for pos in positions:
        start = pos - self.offset
        testdat.append(sequence[start - self.window_left:start + self.window_right + 2])

    feats = StringCharFeatures(DNA)
    feats.set_string_features(testdat)

    # kernel must be initialized with (train, test) before classify()
    self.wd_kernel.init(self.traindat, feats)
    labels = self.svm.classify().get_labels()
    sys.stderr.write("\n...done...\n")
    return labels
def get_predictions(self, sequence, positions):
    """Score candidate positions via a position-list feature object.

    Returns the SVM output values for all candidate windows.
    """
    # window length of each extracted candidate substring
    seqlen = self.window_right + self.window_left + 2

    # collect window start offsets for every candidate position
    position_list = DynamicIntArray()
    for pos in positions:
        position_list.append_element(pos - self.offset - self.window_left)

    # one feature object holding every window of the sequence
    feats = StringCharFeatures([sequence], DNA)
    feats.obtain_by_position_list(seqlen, position_list)

    self.wd_kernel.init(self.traindat, feats)
    del feats  # kernel keeps its own reference
    self.wd_kernel.io.enable_progress()

    values = self.svm.apply().get_values()
    self.wd_kernel.cleanup()
    sys.stdout.write("\n...done...\n")
    return values
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1): from shogun.Features import StringCharFeatures, Labels, DNA from shogun.Kernel import WeightedDegreeStringKernel try: from shogun.Classifier import SVMLight except ImportError: print 'No support for SVMLight available.' return feats_train=StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test=StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) degree=20 kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) labels=Labels(label_train_dna) svm=SVMLight(C, kernel, labels) svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) svm.apply().get_labels() return kernel
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features.

    @param kname: kernel name selecting the feature type
    @param examples: raw examples (arrays or sequence strings)
    @param kparam: kernel parameter dict (uses 'degree' for spectrum kernels)
    @param train_mode: when True, fit a new sort preprocessor; otherwise
                       reuse the one passed in via `preproc`
    @param preproc: previously fitted preprocessor (used when not training)
    @param seq_source: 'dna' or 'protein'
    @param nuc_con: conversion policy for non-ACGT / non-amino-acid chars
    @return: (feats, preproc) tuple
    """
    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        # dense real-valued features
        examples = numpy.array(examples)
        feats = RealFeatures(examples)

    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        # raw char string features, alphabet chosen by seq_source
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        # spectrum kernel: chars -> sorted ulong word features
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        # fit a fresh sorter only in training mode; otherwise reuse `preproc`
        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        #assert(ret)
        feats = wf

    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        # first sequence of each pair
        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        # second sequence of each pair
        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print 'Unknown kernel %s' % kname

    return (feats,preproc)
def kernel_fisher_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, N=1, M=4, pseudo=1e-1, order=1, gap=0, reverse=False, kargs=[1, False, True]):
    """Fisher kernel: PolyKernel over FKFeatures from two class HMMs.

    Trains one HMM per class on module-level fm_hmm_pos / fm_hmm_neg and
    computes Poly kernel matrices for the train/test data.
    Returns (km_train, km_test, kernel).
    NOTE(review): kargs is a mutable default; it is never mutated here.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL #, MSG_DEBUG

    # train HMM for positive class
    charfeat = StringCharFeatures(fm_hmm_pos, DNA)
    #charfeat.io.set_loglevel(MSG_DEBUG)
    hmm_pos_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat = StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train = StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # Kernel testing data
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test = StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # get kernel on training data: both HMMs must observe the training
    # words before FKFeatures is constructed from them
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1) #estimate prior
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # get kernel on testing data: clone HMMs so training observations
    # remain intact
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a()) #use prior from training data
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def features_string_char_compressed_modular(fname):
    """Demonstrate saving/loading string features with various compressors.

    The file `fname` is loaded as raw-byte string features and written back
    to disk once per supported compression scheme (uncompressed, LZO, GZIP,
    BZIP2, LZMA), then re-loaded either decompressed-on-load or lazily via a
    DecompressCharString preprocessor.  The temporary files are removed at
    the end.

    Fixes over the original: the cleanup loop no longer shadows the feature
    object `f`, and the duplicate 'foo_lzo.str' entries in the cleanup list
    are removed (the exists-check made them harmless but they were
    copy-paste noise).
    """
    from shogun.Features import StringCharFeatures, StringFileCharFeatures, RAWBYTE
    from shogun.Library import UNCOMPRESSED, LZO, GZIP, BZIP2, LZMA, MSG_DEBUG
    from shogun.PreProc import DecompressCharString

    f = StringFileCharFeatures(fname, RAWBYTE)
    #print "original strings", f.get_features()

    # uncompressed round-trip (decompress on load)
    f.save_compressed("foo_uncompressed.str", UNCOMPRESSED, 1)
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_uncompressed.str", True)
    #print "uncompressed strings", f2.get_features()

    # lzo
    f.save_compressed("foo_lzo.str", LZO, 9)
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_lzo.str", True)
    #print "lzo strings", f2.get_features()

    # gzip
    f.save_compressed("foo_gzip.str", GZIP, 9)
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_gzip.str", True)
    #print "gzip strings", f2.get_features()

    # bzip2
    f.save_compressed("foo_bzip2.str", BZIP2, 9)
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_bzip2.str", True)
    #print "bzip2 strings", f2.get_features()

    # lzma
    f.save_compressed("foo_lzma.str", LZMA, 9)
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_lzma.str", True)
    #print "lzma strings", f2.get_features()

    # load compressed data and uncompress via preprocessor
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_lzo.str", False)
    f2.add_preproc(DecompressCharString(LZO))
    f2.apply_preproc()
    #print "lzo strings", f2.get_features()

    # load compressed data and uncompress on-the-fly via preprocessor
    f2 = StringCharFeatures(RAWBYTE)
    f2.load_compressed("foo_lzo.str", False)
    #f2.io.set_loglevel(MSG_DEBUG)
    f2.add_preproc(DecompressCharString(LZO))
    f2.enable_on_the_fly_preprocessing()
    #print "lzo strings", f2.get_features()

    # clean up the temporary files
    import os
    for tmpfile in ['foo_uncompressed.str', 'foo_lzo.str', 'foo_gzip.str',
                    'foo_bzip2.str', 'foo_lzma.str']:
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
# Build a small labelled subset of the "thaliana" dataset, wrap it in shogun
# string features, and round-trip both the features and a weighted-degree
# string kernel through helper.save/helper.load (serialization smoke test).
# NOTE(review): `dat`, `helper`, `StringCharFeatures`, `DNA`, `MSG_DEBUG`
# and `WeightedDegreeStringKernel` are defined/imported elsewhere in this file.
d = dat["thaliana"]
subset_size = 20
examples = [i.example for i in d[0:subset_size]]
labels = [i.label for i in d[0:subset_size]]
print "len(examples)", len(examples)
print "string length", len(examples[0])
# flip a handful of labels to the positive class
labels[2] = 1
labels[12] = 1
labels[15] = 1
labels[8] = 1
labels[19] = 1
feat = StringCharFeatures(DNA)
feat.set_features(examples)
# serialize the features and load them back
helper.save("/tmp/feat", feat)
feat2 = helper.load("/tmp/feat")
# weighted-degree string kernel of degree 1 on the feature set
wdk = WeightedDegreeStringKernel(feat, feat, 1)
print "PY: saving kernel"
wdk.io.set_loglevel(MSG_DEBUG)
helper.save("/tmp/awesome", wdk)
#print wdk.toString()
#print "PY: kernel saved, loading kernel"
wdk2 = helper.load("/tmp/awesome")
print "PY: kernel loaded"
def solve(self, C, all_xt, all_lt, task_indicator, M, L):
    """Train a multitask SVM using a task-similarity-normalized kernel.

    Builds a base kernel (weighted-degree string kernel for string inputs,
    linear kernel otherwise), attaches a MultitaskKernelNormalizer seeded
    with the task-similarity matrix M, trains SVMLight, and stores dense
    alphas and the combined weight matrix W on self.  Returns True.

    Parameters
    ----------
    C : regularization constant (applied to both classes)
    all_xt : training examples (strings or numeric vectors)
    all_lt : binary labels
    task_indicator : per-example task ids
    M : task-similarity matrix
    L : matrix with the same shape as M; only its shape (number of tasks)
        is read here — presumably a graph Laplacian, used by the primal
        objective (TODO confirm against compute_primal_objective)
    """
    xt = numpy.array(all_xt)
    lt = numpy.array(all_lt)
    tt = numpy.array(task_indicator, dtype=numpy.int32)
    tsm = numpy.array(M)
    print "task_sim:", tsm
    num_tasks = L.shape[0]
    # sanity checks
    assert len(xt) == len(lt) == len(tt)
    assert M.shape == L.shape
    assert num_tasks == len(set(tt))
    # set up shogun objects: pick the base kernel by input type
    if type(xt[0]) == numpy.string_:
        feat = StringCharFeatures(DNA)
        xt = [str(a) for a in xt]
        feat.set_features(xt)
        base_kernel = WeightedDegreeStringKernel(feat, feat, 8)
    else:
        feat = RealFeatures(xt.T)
        base_kernel = LinearKernel(feat, feat)
    lab = BinaryLabels(lt)
    # set up normalizer: one similarity entry per ordered task pair
    normalizer = MultitaskKernelNormalizer(tt.tolist())
    for i in xrange(num_tasks):
        for j in xrange(num_tasks):
            normalizer.set_task_similarity(i, j, M[i, j])
    print "num of unique tasks: ", normalizer.get_num_unique_tasks(task_indicator)
    # set up kernel
    base_kernel.set_cache_size(4000)
    base_kernel.set_normalizer(normalizer)
    base_kernel.init_normalizer()
    # set up svm
    svm = SVMLight()  #LibSVM()
    svm.set_epsilon(self.eps)
    #SET THREADS TO 1
    #print "reducing num threads to one"
    #segfaults
    #svm.parallel.set_num_threads(1)
    #print "using one thread"
    # how often do we like to compute objective etc
    svm.set_record_interval(self.record_interval)
    svm.set_min_interval(self.min_interval)
    #svm.set_target_objective(target_obj)
    # linadd/batch must be off so the normalized kernel is used directly
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    #svm.set_shrinking_enabled(False)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.set_C(C, C)
    svm.set_bias_enabled(False)
    # prepare for training
    svm.set_labels(lab)
    svm.set_kernel(base_kernel)
    # train svm
    svm.train()
    if self.record_variables:
        print "recording variables"
        self.dual_objectives = [-obj for obj in svm.get_dual_objectives()]
        self.train_times = svm.get_training_times()
    # get model parameters
    sv_idx = svm.get_support_vectors()
    sparse_alphas = svm.get_alphas()
    assert len(sv_idx) == len(sparse_alphas)
    # compute dense alpha (remove label)
    self.alphas = numpy.zeros(len(xt))
    for id_sparse, id_dense in enumerate(sv_idx):
        # SVMLight stores label-signed sparse alphas; multiplying by the
        # label recovers the unsigned dense alpha
        self.alphas[id_dense] = sparse_alphas[id_sparse] * lt[id_dense]
    # print alphas
    W = alphas_to_w(self.alphas, xt, lt, task_indicator, M)
    self.W = W
    #
    self.final_primal_obj = compute_primal_objective(
        W.reshape(W.shape[0] * W.shape[1]), C, all_xt, all_lt, task_indicator, L)
    # NOTE(review): self.dual_objectives is only assigned above when
    # self.record_variables is true — this print would raise otherwise;
    # confirm callers always set record_variables.
    print "MTK duality gap:", self.dual_objectives[-1] - self.final_primal_obj
    return True
def kernel_top_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, pseudo=1e-1, order=1, gap=0, reverse=False, kargs=[1, False, True], N=1, M=4):
    """Compute TOP-kernel Gram matrices for train/test DNA data.

    Trains one HMM per class via Baum-Welch (on the module-level
    fm_hmm_pos / fm_hmm_neg sequence lists), derives TOP feature vectors
    from the HMM pair, and evaluates a polynomial kernel on them.

    The HMM sizes were hard-coded before and are now trailing keyword
    parameters with identical defaults (N=1 state, M=4 observations ->
    DNA), matching kernel_fisher_modular's signature style while keeping
    existing positional callers working.

    Returns (km_train, km_test, kernel).
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun.Kernel import PolyKernel
    from shogun.Distribution import HMM, BW_NORMAL

    def _word_feats(dna):
        # raw DNA strings -> order-`order` word features
        charfeat = StringCharFeatures(dna, DNA)
        wf = StringWordFeatures(charfeat.get_alphabet())
        wf.obtain_from_char(charfeat, order - 1, order, gap, reverse)
        return wf

    def _train_hmm(dna):
        # Baum-Welch-train an HMM on one class of sequences
        model = HMM(_word_feats(dna), N, M, pseudo)
        model.baum_welch_viterbi_train(BW_NORMAL)
        return model

    # class-conditional models (fm_hmm_pos / fm_hmm_neg are module globals)
    pos = _train_hmm(fm_hmm_pos)
    neg = _train_hmm(fm_hmm_neg)

    # kernel training / testing data as word features
    wordfeats_train = _word_feats(fm_train_dna)
    wordfeats_test = _word_feats(fm_test_dna)

    # TOP features + kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # TOP features + kernel on testing data; clone the HMMs so the
    # training-side observations stay attached to pos/neg
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def features_string_sliding_window_modular(strings):
    """Demonstrate sliding-window / position-list views on string features.

    `strings` is a single DNA string.  Windows of various sizes are taken
    over it without copying the underlying data, first by sliding, then at
    an explicit position list.

    Returns the StringCharFeatures object in its final state (windows of
    size 8 at positions 0, 6, 16, 25).

    Bug fix: the original called f.set_features([s]) with an undefined
    name `s` (NameError); it now resets to the input string `strings`.
    """
    from shogun.Features import StringCharFeatures, DNA
    from shogun.Library import DynamicIntArray

    f = StringCharFeatures([strings], DNA)

    # slide a window of length 5 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(5, 1)
    #print(f.get_num_vectors())
    #print(f.get_vector_length(0))
    #print(f.get_vector_length(1))
    #print(f.get_features())

    # slide a window of length 4 over features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(4, 1)
    #print(f.get_num_vectors())
    #print(f.get_vector_length(0))
    #print(f.get_vector_length(1))
    #print(f.get_features())

    # extract string-windows at position 0,6,16,25 of window size 4
    # (memory efficient, does not copy strings)
    f.set_features([strings])
    positions = DynamicIntArray()
    positions.append_element(0)
    positions.append_element(6)
    positions.append_element(16)
    positions.append_element(25)
    f.obtain_by_position_list(4, positions)
    #print(f.get_features())

    # now extract windows of size 8 from the same position list
    f.obtain_by_position_list(8, positions)
    #print(f.get_features())

    return f