def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """Train an HMM on cube-alphabet strings and query its likelihoods.

    NOTE(review): the ``num_examples`` argument is ignored — it is
    immediately overwritten with the actual number of feature vectors.
    Returns (example log-likelihood, sample log-likelihood, hmm).
    """
    from shogun import StringWordFeatures, StringCharFeatures, CUBE
    from shogun import HMM, BW_NORMAL

    raw = StringCharFeatures(CUBE)
    raw.set_features(fm_cube)
    feats = StringWordFeatures(raw.get_alphabet())
    feats.obtain_from_char(raw, order - 1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    # touch every log-derivative once (exercise of the API, results discarded)
    for example in range(num_examples):
        for param in range(num_param):
            hmm.get_log_derivative(param, example)

    best_path = 0
    best_path_state = 0
    for example in range(num_examples):
        best_path += hmm.best_path(example)
        for state in range(N):
            best_path_state += hmm.get_best_path_state(example, state)

    lik_example = hmm.get_log_likelihood()
    lik_sample = hmm.get_log_likelihood_sample()
    return lik_example, lik_sample, hmm
def kernel_histogram_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, ppseudo_count=1, npseudo_count=1):
    """Histogram word-string kernel driven by a trained PluginEstimate."""
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate  #, MSG_DEBUG

    def _to_words(dna_strings):
        # convert char strings to word features of the requested order
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    feats_train = _to_words(fm_train_dna)
    feats_test = _to_words(fm_test_dna)

    pie = PluginEstimate(ppseudo_count, npseudo_count)
    labels = BinaryLabels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Fit a LinearHMM on DNA word features and report its log-likelihoods."""
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import LinearHMM

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    hmm = LinearHMM(words)
    hmm.train()
    hmm.get_transition_probs()

    # evaluate every model-parameter derivative on every example
    num_examples = words.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for example in range(num_examples):
        for param in range(num_param):
            hmm.get_log_derivative(param, example)

    out_likelihood = hmm.get_log_likelihood()
    out_sample = hmm.get_log_likelihood_sample()
    return hmm, out_likelihood, out_sample
def classifier_ssk(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1, maxlen=1, decay=1):
    """Train a LibSVM with the subsequence string kernel and classify test DNA."""
    from shogun import StringCharFeatures, BinaryLabels
    from shogun import LibSVM, SubsequenceStringKernel, DNA
    from shogun import ErrorRateMeasure

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    labels = BinaryLabels(label_train_dna)

    kernel = SubsequenceStringKernel(feats_train, feats_train, maxlen, decay)
    svm = LibSVM(C, kernel, labels)
    svm.train()

    # training error, computed for inspection only
    out = svm.apply(feats_train)
    evaluator = ErrorRateMeasure()
    trainerr = evaluator.evaluate(out, labels)
    # print(trainerr)

    kernel.init(feats_train, feats_test)
    predicted_labels = svm.apply(feats_test).get_labels()
    # print predicted_labels
    return predicted_labels
def get_predictions_from_seqdict(self, seqdic, site):
    """ we need to generate a huge test features object
    containing all locations found in each seqdict-sequence
    and each location (this is necessary to efficiently
    (==fast,low memory) compute the splice outputs
    """
    # total window width around each candidate position
    seqlen=self.window_right+self.window_left+2

    for s in seqdic:
        # collect the window start index for every predicted position
        position_list=DynamicIntArray()
        sequence=s.seq
        positions=s.preds[site].positions
        for j in xrange(len(positions)):
            i=positions[j] - self.offset -self.window_left
            position_list.append_element(i)

        # build test features by referencing windows of the single sequence
        # (obtain_by_position_list avoids copying each window)
        t=StringCharFeatures([sequence], DNA)
        t.obtain_by_position_list(seqlen, position_list)
        self.wd_kernel.init(self.traindat, t)
        self.wd_kernel.io.enable_progress()

        # SVM raw outputs, one per window of this sequence
        l=self.svm.apply().get_values()
        self.wd_kernel.cleanup()
        sys.stdout.write("\n...done...\n")

        # copy the scores back into the prediction object
        num=len(s.preds[site].positions)
        scores= num * [0]
        for j in xrange(num):
            scores[j]=l[j]
        s.preds[site].set_scores(scores)
def kernel_combined(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """Combine a Gaussian kernel on reals with two string kernels on DNA."""
    from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # subkernel 1: Gaussian on the real-valued features
    feats_train.append_feature_obj(RealFeatures(fm_train_real))
    feats_test.append_feature_obj(RealFeatures(fm_test_real))
    kernel.append_kernel(GaussianKernel(10, 1.1))

    # subkernel 2: fixed-degree (degree 3) string kernel on the DNA strings
    feats_train.append_feature_obj(StringCharFeatures(fm_train_dna, DNA))
    feats_test.append_feature_obj(StringCharFeatures(fm_test_dna, DNA))
    kernel.append_kernel(FixedDegreeStringKernel(10, 3))

    # subkernel 3: local-alignment string kernel on the same DNA strings
    feats_train.append_feature_obj(StringCharFeatures(fm_train_dna, DNA))
    feats_test.append_feature_obj(StringCharFeatures(fm_test_dna, DNA))
    kernel.append_kernel(LocalAlignmentStringKernel(10))

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distribution_linearhmm(fm_dna=traindna, order=3, gap=0, reverse=False):
    """Train a linear HMM over order-``order`` DNA words; return model and likelihoods."""
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import LinearHMM

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    hmm = LinearHMM(feats)
    hmm.train()
    hmm.get_transition_probs()

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    # exercise the derivative API for every (parameter, example) pair
    for i in range(num_examples):
        for j in range(num_param):
            hmm.get_log_derivative(j, i)

    return hmm, hmm.get_log_likelihood(), hmm.get_log_likelihood_sample()
def kernel_salzberg_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, gap=0, reverse=False):
    """Salzberg word-string kernel driven by a trained PluginEstimate."""
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import SalzbergWordStringKernel
    from shogun import PluginEstimate

    def _words(strings):
        # char strings -> word features of the requested order
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _words(fm_train_dna)
    feats_test = _words(fm_test_dna)

    pie = PluginEstimate()
    labels = BinaryLabels(label_train_dna)
    pie.set_labels(labels)
    pie.set_features(feats_train)
    pie.train()

    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, labels)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_comm_word_string(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=False, use_sign=False):
    """CommWordString kernel on sorted word features derived from DNA strings."""
    from shogun import CommWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import SortWordString

    def _word_feats(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    # the kernel requires sorted word features; the sorter is initialized
    # on the training data and reused for the test data
    feats_train = _word_feats(fm_train_dna)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    feats_test = _word_feats(fm_test_dna)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    kernel = CommWordStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_weighted_degree_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=20):
    """Weighted-degree string kernel with linearly decreasing, normalized weights."""
    from shogun import StringCharFeatures, DNA
    from shogun import WeightedDegreeStringKernel, MSG_DEBUG
    from numpy import arange, double

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_test = StringCharFeatures(fm_test_dna, DNA)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    # weights degree, degree-1, ..., 1 scaled so they sum to one
    descending = arange(1, degree + 1, dtype=double)[::-1]
    kernel.set_wd_weights(descending / sum(arange(1, degree + 1, dtype=double)))

    #from numpy import ones,float64,int32
    #kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64))
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    # the kernel can be serialized with pickle (protocol=2) if needed
    return km_train, km_test, kernel
def kernel_weighted_comm_word_string(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=True):
    """Weighted comm-word-string kernel on sorted DNA word features."""
    from shogun import WeightedCommWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import SortWordString

    chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    # sorter initialized on train data, shared with the test features
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    kernel = WeightedCommWordStringKernel(feats_train, feats_train, False)  # use_sign=False
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_domainadaptationsvm(fm_train_dna=traindna, fm_test_dna=testdna,
                                   label_train_dna=label_traindna,
                                   label_test_dna=label_testdna, fm_train_dna2=traindna2, fm_test_dna2=testdna2,
                                   label_train_dna2=label_traindna2, label_test_dna2=label_testdna2, C=1, degree=3):
    """Train an SVMLight on the source domain, then a DomainAdaptationSVM
    regularized against it on the target domain.

    Bug fixed: the second (target-domain) feature sets and labels were built
    from the source-domain data (``fm_train_dna``/``fm_test_dna``/
    ``label_train_dna``) and the second kernel was built on the source
    features, so every ``*_dna2`` argument was silently ignored.

    Returns the adapted SVM's binary predictions on the target test set.
    """
    # source-domain model
    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    #####################################
    #print("obtaining DA SVM from previously trained SVM")

    # target-domain data (previously: source data reused by mistake)
    feats_train2 = StringCharFeatures(fm_train_dna2, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna2, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train2, feats_train2, degree)
    labels2 = BinaryLabels(label_train_dna2)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)
    return out  #,dasvm TODO
def init_sensor(self, kernel, svs):
    """Build the training features and kernel described by ``kernel``.

    kernel: dict with key 'name' ('spectrum' or 'wdshift'), 'order', and —
    for wdshift — a 'shift' entry.
    svs: DNA strings used as training sequences.

    Bug fixed: the unsupported-kernel branch used ``raise "<string>"``;
    raising a string is itself a TypeError at runtime, so the intended
    message was never shown. It now raises ValueError.
    """
    f = StringCharFeatures(svs, DNA)
    kname = kernel['name']
    if kname == 'spectrum':
        # spectrum kernel operates on sorted word features
        wf = StringWordFeatures(f.get_alphabet())
        wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)
        pre = SortWordString()
        pre.init(wf)
        wf.add_preprocessor(pre)
        wf.apply_preprocessor()
        f = wf
        k = CommWordStringKernel(0, False)
        # dictionary-diagonal optimization only pays off for small orders
        k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
        self.preproc = pre
    elif kname == 'wdshift':
        k = WeightedDegreePositionStringKernel(0, kernel['order'])
        k.set_normalizer(IdentityKernelNormalizer())
        # constant shift and uniform position weights across the sequence
        k.set_shifts(kernel['shift'] *
                     numpy.ones(f.get_max_vector_length(), dtype=numpy.int32))
        k.set_position_weights(1.0 / f.get_max_vector_length() *
                               numpy.ones(f.get_max_vector_length(), dtype=numpy.float64))
    else:
        raise ValueError("Currently, only wdshift and spectrum kernels supported")

    self.kernel = k
    self.train_features = f
    return (self.kernel, self.train_features)
def distance_hammingword(fm_train_dna=traindna, fm_test_dna=testdna, fm_test_real=testdat, order=3, gap=0, reverse=False, use_sign=False):
    """Hamming word distance between sorted DNA word features."""
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import HammingWordDistance

    def _words(strings):
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _words(fm_train_dna)
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    feats_test = _words(fm_test_dna)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    distance = HammingWordDistance(feats_train, feats_train, use_sign)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
def kernel_match_word_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3, scale=1.4, size_cache=10, order=3, gap=0, reverse=False):
    """Match word-string kernel with average-diagonal normalization."""
    from shogun import MatchWordStringKernel, AvgDiagKernelNormalizer
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    def _words(strings):
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(DNA)
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _words(fm_train_dna)
    feats_test = _words(fm_test_dna)

    kernel = MatchWordStringKernel(size_cache, degree)
    kernel.set_normalizer(AvgDiagKernelNormalizer(scale))
    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_poly_match_word_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=2, inhomogene=True, order=3, gap=0, reverse=False):
    """Polynomial match kernel over word features derived from DNA strings."""
    from shogun import PolyMatchWordStringKernel
    from shogun import StringWordFeatures, StringCharFeatures, DNA

    train_chars = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(train_chars, order - 1, order, gap, reverse)

    test_chars = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(test_chars, order - 1, order, gap, reverse)

    kernel = PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distribution_hmm(fm_cube, N, M, pseudo, order, gap, reverse, num_examples):
    """HMM demo on CUBE strings: train, touch derivatives and best paths,
    then return (example log-likelihood, sample log-likelihood, hmm).

    NOTE(review): the ``num_examples`` parameter is overwritten internally.
    """
    from shogun import StringWordFeatures, StringCharFeatures, CUBE
    from shogun import HMM, BW_NORMAL

    charfeat = StringCharFeatures(CUBE)
    charfeat.set_features(fm_cube)
    feats = StringWordFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    hmm = HMM(feats, N, M, pseudo)
    hmm.train()
    hmm.baum_welch_viterbi_train(BW_NORMAL)

    num_examples = feats.get_num_vectors()
    num_param = hmm.get_num_model_parameters()
    for i in range(num_examples):
        for j in range(num_param):
            hmm.get_log_derivative(j, i)

    best_path = 0
    best_path_state = 0
    for i in range(num_examples):
        best_path += hmm.best_path(i)
        for j in range(N):
            best_path_state += hmm.get_best_path_state(i, j)

    return hmm.get_log_likelihood(), hmm.get_log_likelihood_sample(), hmm
def kernel_linear_string(fm_train_dna=traindat, fm_test_dna=testdat):
    """Plain linear kernel on DNA character strings."""
    from shogun import StringCharFeatures, DNA
    from shogun import LinearStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = LinearStringKernel(train_feats, train_feats)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def predict(self, data):
    """Predict labels for raw string data via the precomputed-kernel classifier."""
    from shogun import SNP, RAWBYTE
    from shogun import StringCharFeatures

    # pick the alphabet that matches the training data
    if self.isSNP:
        feats_test = StringCharFeatures(data, SNP)
    else:
        feats_test = StringCharFeatures(data, RAWBYTE)

    # convert the test strings into kernel-matrix form (intermediate representation)
    self.kernel.init(self.feats_train, feats_test)
    feats_test = self.kernel.get_kernel_matrix()

    # the wrapped classifier expects samples as rows, hence the transpose
    result = self.clf.predict(feats_test.T)
    print ' '.join(map(str, result))
    return result
def get_spectrum_features(data, order=3, gap=0, reverse=True):
    """
    create feature object used by spectrum kernel
    """
    chars = StringCharFeatures(data, DNA)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, order - 1, order, gap, reverse)

    # spectrum kernels require sorted word features
    sorter = SortWordString()
    sorter.init(words)
    words.add_preprocessor(sorter)
    words.apply_preprocessor()
    return words
def classifier_svmlight(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1.2, epsilon=1e-5, num_threads=1):
    """Train SVMLight with a degree-20 weighted-degree kernel on DNA strings.

    Returns the trained kernel (predictions are computed but discarded),
    or None when shogun was built without SVMLight support.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, 20)
    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.apply().get_labels()
    return kernel
def kernel_fixed_degree_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3):
    """Fixed-degree string kernel on DNA train/test strings."""
    from shogun import StringCharFeatures, DNA
    from shogun import FixedDegreeStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = FixedDegreeStringKernel(train_feats, train_feats, degree)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_histogram_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, ppseudo_count=1, npseudo_count=1):
    """Histogram word-string kernel using a PluginEstimate trained on word features."""
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate  #, MSG_DEBUG

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_train_dna)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, 0, False)

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, 0, False)

    pie = PluginEstimate(ppseudo_count, npseudo_count)
    pie.set_labels(BinaryLabels(label_train_dna))
    pie.set_features(feats_train)
    pie.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_svmlight_linear_term(fm_train_dna=traindna, fm_test_dna=testdna,
                                    label_train_dna=label_traindna, degree=3,
                                    C=10, epsilon=1e-5, num_threads=1):
    """SVMLight with a custom linear term and a weighted-degree kernel."""
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_qpsize(3)
    # one linear-term entry per training example
    svm.set_linear_term(-numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    return out, kernel
def distribution_ppwm(fm_dna=traindna, order=3):
    """Positional PWM demo: compute the w matrix and a scoring matrix from a
    fixed 4x3 position weight matrix.

    Fixes: the first ``pwm`` assignment was dead (immediately overwritten by
    the second), the unused locals ``k``/``e``/``exp`` and large blocks of
    commented-out plotting code are removed.
    Returns (w, u): the computed w matrix and the scoring matrix for index 0.
    """
    from shogun import StringByteFeatures, StringCharFeatures, DNA
    from shogun import PositionalPWM
    from numpy import array, log

    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_dna)
    feats = StringByteFeatures(charfeat.get_alphabet())
    feats.obtain_from_char(charfeat, order - 1, order, 0, False)

    L = 20      # window length for compute_w
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)
    # 4 rows (one per DNA symbol) x 3 positions; set_pwm expects log-probabilities
    pwm = array([[0.01, 0.09, 0.1],
                 [0.09, 0.01, 0.1],
                 [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)
    return w, u
def kernel_comm_ulong_string(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=False):
    """CommUlongString kernel over sorted ulong word features."""
    from shogun import CommUlongStringKernel
    from shogun import StringUlongFeatures, StringCharFeatures, DNA
    from shogun import SortUlongString

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_train_dna)
    feats_train = StringUlongFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    # sorter fitted on the training features and reused for the test features
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringUlongFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    kernel = CommUlongStringKernel(feats_train, feats_train, False)  # use_sign=False
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def distance_canberraword(fm_train_dna=traindna, fm_test_dna=testdna, order=3, gap=0, reverse=False):
    """Canberra distance between sorted word features of DNA strings."""
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString
    from shogun import CanberraWordDistance

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_train_dna)
    feats_train = StringWordFeatures(chars.get_alphabet())
    feats_train.obtain_from_char(chars, order - 1, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_test_dna)
    feats_test = StringWordFeatures(chars.get_alphabet())
    feats_test.obtain_from_char(chars, order - 1, order, gap, reverse)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    distance = CanberraWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
def create_distance_matrix(full_essays, ids):
    """Subsequence-string-kernel similarity matrix as an id-labeled DataFrame."""
    string_feats = StringCharFeatures(full_essays, RAWBYTE)
    ssk = SubsequenceStringKernel(string_feats, string_feats, 3, 0.5)

    frame = pd.DataFrame(ssk.get_kernel_matrix())
    frame.columns = ['id_' + str(i) for i in ids]
    return (frame)
def features_hasheddocdot(strings):
    """Hash raw byte strings into a 2**5-dimensional HashedDocDot feature space."""
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import HashedDocDotFeatures
    from shogun import NGramTokenizer
    from numpy import array

    docs = StringCharFeatures(strings, RAWBYTE)

    num_bits = 5                   # target dimensionality: 2^5 = 32
    tokenizer = NGramTokenizer(8)  # parse the strings into 8-grams
    normalize = True

    hddf = HashedDocDotFeatures(num_bits, docs, tokenizer, normalize)
    #print('Feature space dimensionality is', hddf.get_dim_feature_space())
    #print('Self dot product of string 0', hddf.dot(0, hddf, 0))
    return hddf
def distribution_ppwm(fm_dna=traindna, order=3):
    """Positional PWM example returning the w matrix and a scoring matrix."""
    from shogun import StringByteFeatures, StringCharFeatures, DNA
    from shogun import PositionalPWM
    from numpy import array, e, log, exp

    chars = StringCharFeatures(DNA)
    chars.set_features(fm_dna)
    feats = StringByteFeatures(chars.get_alphabet())
    feats.obtain_from_char(chars, order - 1, order, 0, False)

    L = 20
    k = 3
    sigma = 1
    mu = 4

    ppwm = PositionalPWM()
    ppwm.set_sigma(sigma)
    ppwm.set_mean(mu)
    pwm = array([[0.0, 0.5, 0.1, 1.0],
                 [0.0, 0.5, 0.5, 0.0],
                 [1.0, 0.0, 0.4, 0.0],
                 [0.0, 0.0, 0.0, 0.0]])
    # NOTE(review): the 4x4 matrix above is dead — immediately replaced by this 4x3 one
    pwm = array([[0.01, 0.09, 0.1],
                 [0.09, 0.01, 0.1],
                 [0.85, 0.4, 0.1],
                 [0.05, 0.5, 0.7]])
    ppwm.set_pwm(log(pwm))

    ppwm.compute_w(L)
    w = ppwm.get_w()

    ppwm.compute_scoring(1)
    u = ppwm.get_scoring(0)
    return w, u
def score(self, data, label):
    """Compute accuracy and macro F1 of the wrapped classifier on string data."""
    from shogun import SNP, RAWBYTE
    from shogun import StringCharFeatures
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import f1_score

    # pick the alphabet that matches the training data
    if self.isSNP:
        feats_test = StringCharFeatures(data, SNP)
    else:
        feats_test = StringCharFeatures(data, RAWBYTE)

    # convert the test strings into kernel-matrix form (intermediate representation)
    self.kernel.init(self.feats_train, feats_test)
    feats_test = self.kernel.get_kernel_matrix()

    # the wrapped classifier expects samples as rows, hence the transpose
    # (note: the local name "retult" is a typo for "result")
    retult = self.clf.predict(feats_test.T)
    acc = accuracy_score(label, retult)
    f1 = f1_score(label, retult, average='macro')
    # message reads: "accuracy is: <acc> F1 score is: <f1>"
    print '正确率是:' + str(acc), 'F1得分是:' + str(f1)
    return acc, f1
def kernel_poly_match_string(fm_train_dna=traindat, fm_test_dna=testdat, degree=3, inhomogene=False):
    """Polynomial match string kernel on DNA train/test strings.

    Bug fixed: the test features were built from ``fm_train_dna``, so the
    ``fm_test_dna`` argument was silently ignored and km_test compared the
    training set against itself.
    """
    from shogun import PolyMatchStringKernel
    from shogun import StringCharFeatures, DNA

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)  # was fm_train_dna

    kernel = PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_distantsegments(fm_train_dna=traindat, fm_test_dna=testdat, delta=5, theta=5):
    """Distant-segments kernel (cache size 10) on DNA strings."""
    from shogun import StringCharFeatures, DNA
    from shogun import DistantSegmentsKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = DistantSegmentsKernel(train_feats, train_feats, 10, delta, theta)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def features_string_file(directory, fname):
    """Load raw-byte string features from a directory, then reload from a CSV file.

    Returns (feature strings, feature object) after the file load.
    """
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import CSVFile

    feats = StringCharFeatures(RAWBYTE)
    feats.load_from_directory(directory)
    # stats such as get_max_vector_length()/get_num_vectors() could be printed here

    # reload from a file containing one string per line
    feats.load(CSVFile(fname))

    # a fasta file could be loaded via feats.load_fasta('fasta.fa')
    return feats.get_features(), feats
def kernel_simple_locality_improved_string(fm_train_dna=traindat, fm_test_dna=testdat, length=5, inner_degree=5, outer_degree=1):
    """Simple locality-improved string kernel on DNA strings."""
    from shogun import StringCharFeatures, DNA
    from shogun import SimpleLocalityImprovedStringKernel, MSG_DEBUG

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    #train_feats.io.set_loglevel(MSG_DEBUG)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = SimpleLocalityImprovedStringKernel(
        train_feats, train_feats, length, inner_degree, outer_degree)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def kernel_ssk_string(fm_train_dna=traindat, fm_test_dna=testdat, maxlen=1, decay=1):
    """Subsequence string kernel (SSK) matrices for train and test DNA."""
    from shogun import SubsequenceStringKernel
    from shogun import StringCharFeatures, DNA

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = SubsequenceStringKernel(train_feats, train_feats, maxlen, decay)
    km_train = kernel.get_kernel_matrix()
    # print(km_train)

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()
    # print(km_test)
    return km_train, km_test, kernel
def kernel_locality_improved_string(fm_train_dna=traindat, fm_test_dna=testdat, length=5, inner_degree=5, outer_degree=7):
    """Locality-improved string kernel on DNA strings."""
    from shogun import StringCharFeatures, DNA
    from shogun import LocalityImprovedStringKernel

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)

    kernel = LocalityImprovedStringKernel(
        train_feats, train_feats, length, inner_degree, outer_degree)
    km_train = kernel.get_kernel_matrix()

    kernel.init(train_feats, test_feats)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def get_predictions(self, sequence, positions):
    """Classify windows of ``sequence`` centered near each entry of ``positions``."""
    # nominal window width; the slices below encode the same width directly
    seqlen=self.window_right+self.window_left+2
    num=len(positions)

    # extract one window substring per candidate position
    testdat = []
    for j in xrange(num):
        # window anchor, corrected by the global offset
        i=positions[j] - self.offset ;
        s=sequence[i-self.window_left:i+self.window_right+2]
        testdat.append(s)

    t=StringCharFeatures(DNA)
    t.set_string_features(testdat)

    self.wd_kernel.init(self.traindat, t)
    l=self.svm.classify().get_labels()
    sys.stderr.write("\n...done...\n")
    return l
def get_predictions(self, sequence, positions):
    """Classify windows of ``sequence`` around each candidate position."""
    # nominal window width; the slices below encode the same width directly
    seqlen = self.window_right + self.window_left + 2
    num = len(positions)

    # extract one window substring per candidate position
    testdat = []
    for j in xrange(num):
        i = positions[j] - self.offset
        # window spans window_left symbols before i through window_right+1 after
        s = sequence[i - self.window_left:i + self.window_right + 2]
        testdat.append(s)

    t = StringCharFeatures(DNA)
    t.set_string_features(testdat)
    self.wd_kernel.init(self.traindat, t)
    l = self.svm.classify().get_labels()
    sys.stderr.write("\n...done...\n")
    return l
def features_string_char(strings):
    """Wrap strings in raw-byte StringCharFeatures and overwrite string 0.

    Returns (feature strings, feature object).
    """
    from shogun import StringCharFeatures, RAWBYTE
    from numpy import array

    feats = StringCharFeatures(strings, RAWBYTE)
    # stats like get_max_vector_length()/get_num_vectors() could be printed here

    # replace string 0
    feats.set_feature_vector(array(['t', 'e', 's', 't']), 0)
    return feats.get_features(), feats
def kernel_fisher (fm_train_dna=traindat, fm_test_dna=testdat,
		label_train_dna=label_traindat,
		N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False, kargs=[1,False,True]):
    """Fisher-kernel example: train positive/negative HMMs, build FKFeatures
    from them, and compute train/test kernel matrices with a PolyKernel.

    NOTE(review): the HMM training sequences come from the module-level
    globals ``fm_hmm_pos``/``fm_hmm_neg``, not from the arguments — confirm
    this is intentional.  ``kargs`` are extra PolyKernel constructor args.
    """
    from shogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL#, MSG_DEBUG

    # train HMM for positive class
    charfeat=StringCharFeatures(fm_hmm_pos, DNA)
    #charfeat.io.set_loglevel(MSG_DEBUG)
    hmm_pos_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    pos=HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # train HMM for negative class
    charfeat=StringCharFeatures(fm_hmm_neg, DNA)
    hmm_neg_train=StringWordFeatures(charfeat.get_alphabet())
    hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse)
    neg=HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # Kernel training data
    charfeat=StringCharFeatures(fm_train_dna, DNA)
    wordfeats_train=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # Kernel testing data
    charfeat=StringCharFeatures(fm_test_dna, DNA)
    wordfeats_test=StringWordFeatures(charfeat.get_alphabet())
    wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse)

    # get kernel on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train=FKFeatures(10, pos, neg)
    feats_train.set_opt_a(-1) #estimate prior
    kernel=PolyKernel(feats_train, feats_train, *kargs)
    km_train=kernel.get_kernel_matrix()

    # get kernel on testing data; clone the HMMs so the training-side
    # observations stay attached to the originals
    pos_clone=HMM(pos)
    neg_clone=HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test=FKFeatures(10, pos_clone, neg_clone)
    feats_test.set_a(feats_train.get_a()) #use prior from training data
    kernel.init(feats_train, feats_test)
    km_test=kernel.get_kernel_matrix()
    return km_train,km_test,kernel
def get_predictions(self, sequence, positions):
    """Score windows of `sequence` anchored at `positions` with the trained SVM.

    Each position is shifted by offset/window_left to the window start before
    extraction. Returns the SVM output values, one per requested position.
    """
    # total window length covered by the extracted features
    seqlen = self.window_right + self.window_left + 2

    # BUG FIX: the loop previously used xrange(), which no longer exists on
    # Python 3; iterate over the positions directly instead.
    position_list = DynamicIntArray()
    for pos in positions:
        position_list.append_element(pos - self.offset - self.window_left)

    feats = StringCharFeatures([sequence], DNA)
    feats.obtain_by_position_list(seqlen, position_list)
    self.wd_kernel.init(self.traindat, feats)
    del feats

    self.wd_kernel.io.enable_progress()
    values = self.svm.apply().get_values()
    self.wd_kernel.cleanup()

    sys.stdout.write("\n...done...\n")
    return values
def features_string_file (directory, fname):
    """Load raw-byte string features, first from a directory of files, then
    from a CSV file (one string per line); return (data, feature object)."""
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import CSVFile

    feats = StringCharFeatures(RAWBYTE)

    # populate from every file in the directory
    feats.load_from_directory(directory)

    # then reload from a CSV file holding one string per line
    feats.load(CSVFile(fname))

    # a fasta file could be loaded the same way via feats.load_fasta(...)
    return feats.get_features(), feats
def kernel_top (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1, order=1,gap=0,reverse=False,kargs=[1, False, True]):
    """TOP-kernel demo: two class-conditional HMMs feed TOPFeatures into a
    PolyKernel; returns (km_train, km_test, kernel)."""
    from shogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA
    from shogun import PolyKernel
    from shogun import HMM, BW_NORMAL

    N = 1  # toy HMM with a single state
    M = 4  # four observation symbols -> DNA

    def word_features(strings):
        # char strings -> order-word string features
        cf = StringCharFeatures(strings, DNA)
        wf = StringWordFeatures(cf.get_alphabet())
        wf.obtain_from_char(cf, order-1, order, gap, reverse)
        return wf

    # Baum-Welch-trained HMM for the positive class
    hmm_pos_train = word_features(fm_hmm_pos)
    pos = HMM(hmm_pos_train, N, M, pseudo)
    pos.baum_welch_viterbi_train(BW_NORMAL)

    # ... and for the negative class
    hmm_neg_train = word_features(fm_hmm_neg)
    neg = HMM(hmm_neg_train, N, M, pseudo)
    neg.baum_welch_viterbi_train(BW_NORMAL)

    # word features the kernel itself operates on
    wordfeats_train = word_features(fm_train_dna)
    wordfeats_test = word_features(fm_test_dna)

    # kernel matrix on training data
    pos.set_observations(wordfeats_train)
    neg.set_observations(wordfeats_train)
    feats_train = TOPFeatures(10, pos, neg, False, False)
    kernel = PolyKernel(feats_train, feats_train, *kargs)
    km_train = kernel.get_kernel_matrix()

    # kernel matrix on test data, using copies of the trained HMMs
    pos_clone = HMM(pos)
    neg_clone = HMM(neg)
    pos_clone.set_observations(wordfeats_test)
    neg_clone.set_observations(wordfeats_test)
    feats_test = TOPFeatures(10, pos_clone, neg_clone, False, False)
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def classifier_svmlight (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1):
    """Train SVMLight with a degree-20 weighted-degree string kernel on DNA
    data, classify the test set, and return the kernel."""
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        # this shogun build ships without SVMLight
        print('No support for SVMLight available.')
        return

    def dna_features(strings):
        feats = StringCharFeatures(DNA)
        feats.set_features(strings)
        return feats

    feats_train = dna_features(fm_train_dna)
    feats_test = dna_features(fm_test_dna)

    degree = 20  # WD-kernel degree
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    svm = SVMLight(C, kernel, BinaryLabels(label_train_dna))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    # evaluate on the test features
    kernel.init(feats_train, feats_test)
    svm.apply().get_labels()
    return kernel
def distribution_histogram (fm_dna=traindna,order=3,gap=0,reverse=False):
    """Train a Histogram distribution over k-mer word features of DNA strings;
    returns (histogram, log-likelihood sample, log-likelihood)."""
    from shogun import StringWordFeatures, StringCharFeatures, DNA
    from shogun import Histogram

    # convert char strings into order-word features
    raw = StringCharFeatures(DNA)
    raw.set_features(fm_dna)
    feats = StringWordFeatures(raw.get_alphabet())
    feats.obtain_from_char(raw, order-1, order, gap, reverse)

    histo = Histogram(feats)
    histo.train()
    histo.get_histogram()

    # queried for parity with the (disabled) derivative check below
    num_examples = feats.get_num_vectors()
    num_param = histo.get_num_model_parameters()
    #for i in range(num_examples):
    #    for j in range(num_param):
    #        histo.get_log_derivative(j, i)

    out_likelihood = histo.get_log_likelihood()
    out_sample = histo.get_log_likelihood_sample()
    return histo, out_sample, out_likelihood
def distance_manhattenword (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False):
    """Manhattan word distance between sorted k-mer features of two DNA files;
    returns (train distance matrix, train/test distance matrix)."""
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString, ManhattanWordDistance, CSVFile

    def load_words(fname):
        # read char strings from CSV and convert to order-word features
        cf = StringCharFeatures(CSVFile(fname), DNA)
        wf = StringWordFeatures(cf.get_alphabet())
        wf.obtain_from_char(cf, order-1, order, gap, reverse)
        return wf

    # the sorter is fitted on the training words and reused on the test words
    feats_train = load_words(train_fname)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    feats_test = load_words(test_fname)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    distance = ManhattanWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return dm_train, dm_test
def get_test_features(self, seq, window):
    """Trim `seq` to the region implied by `window` and build sliding-window
    string features over it (word features when self.preproc is set)."""
    # boundaries of the subsequence covered by the requested window
    # NOTE(review): assumes self.window is (left, ?, right)-style indexable — confirm
    start = self.window[0] - window[0]
    end = len(seq) - window[1] + self.window[2]
    size = self.window[2] - self.window[0] + 1

    trimmed = seq[start:end]
    # map ambiguity codes to 'A' so the DNA alphabet accepts the string
    trimmed = trimmed.replace("N", "A").replace("R", "A").replace("M", "A")

    feats = StringCharFeatures([trimmed], DNA)
    if not self.preproc:
        feats.obtain_by_sliding_window(size, 1)
        return feats

    # word-feature path: convert chars to words of the training order first
    word_order = self.train_features.get_order()
    word_feats = StringWordFeatures(feats.get_alphabet())
    word_feats.obtain_from_char(feats, 0, word_order, 0, False)
    word_feats.obtain_by_sliding_window(size, 1, word_order - 1)
    return word_feats
def classifier_svmlight_batch_linadd (fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads):
    """Train SVMLight with a weighted-degree string kernel and classify the
    test data twice: once with batch computation and linadd disabled, then
    with batch computation re-enabled.

    Returns (output labels of the batch run, trained SVM), or None when this
    shogun build has no SVMLight support.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel, MSG_DEBUG
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    # BUG FIX: the previous code reassigned degree=20 here, silently ignoring
    # the caller-supplied `degree` parameter; honor the parameter instead.
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    # first pass: plain (non-batch, non-linadd) evaluation
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    # second pass: batch computation re-enabled
    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()
    return labels, svm
def features_string_char_compressed (fname):
    """Exercise compressed save/load of string features with several codecs,
    then clean up the temporary files."""
    from shogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE
    from shogun import UNCOMPRESSED,SNAPPY,LZO,GZIP,BZIP2,LZMA, MSG_DEBUG
    from shogun import DecompressCharString

    source = StringFileCharFeatures(fname, RAWBYTE)

    # uncompressed round trip (level argument is 1 here)
    source.save_compressed("tmp/foo_uncompressed.str", UNCOMPRESSED, 1)
    loaded = StringCharFeatures(RAWBYTE)
    loaded.load_compressed("tmp/foo_uncompressed.str", True)

    # snappy is skipped - not stable yet

    # round trip through each remaining codec, decompressing on load
    for path, codec in (("tmp/foo_lzo.str", LZO),
                        ("tmp/foo_gzip.str", GZIP),
                        ("tmp/foo_bzip2.str", BZIP2),
                        ("tmp/foo_lzma.str", LZMA)):
        source.save_compressed(path, codec, 9)
        loaded = StringCharFeatures(RAWBYTE)
        loaded.load_compressed(path, True)

    # load compressed data and decompress via an explicit preprocessor pass
    loaded = StringCharFeatures(RAWBYTE)
    loaded.load_compressed("tmp/foo_lzo.str", False)
    loaded.add_preprocessor(DecompressCharString(LZO))
    loaded.apply_preprocessor()

    # load compressed data and decompress on-the-fly via the preprocessor
    loaded = StringCharFeatures(RAWBYTE)
    loaded.load_compressed("tmp/foo_lzo.str", False)
    #loaded.io.set_loglevel(MSG_DEBUG)
    loaded.add_preprocessor(DecompressCharString(LZO))
    loaded.enable_on_the_fly_preprocessing()

    # remove the temporary files created above
    import os
    for path in ['tmp/foo_uncompressed.str', 'tmp/foo_snappy.str',
                 'tmp/foo_lzo.str', 'tmp/foo_gzip.str',
                 'tmp/foo_bzip2.str', 'tmp/foo_lzma.str']:
        if os.path.exists(path):
            os.unlink(path)
def features_string_sliding_window (strings):
    """Demonstrate sliding-window and position-list views on a DNA string.

    strings -- a single DNA string (wrapped into a one-element feature set)
    Returns the StringCharFeatures object after the final extraction.
    """
    from shogun import StringCharFeatures, DNA
    from shogun import DynamicIntArray

    f = StringCharFeatures([strings], DNA)

    # slide a window of length 5 over the features
    # (memory efficient, does not copy strings)
    f.obtain_by_sliding_window(5, 1)

    # slide a window of length 4 over the features
    f.obtain_by_sliding_window(4, 1)

    # extract string windows at positions 0, 6, 16, 25 of window size 4.
    # BUG FIX: this previously passed the undefined name `s` (NameError);
    # reset to the original input string instead.
    f.set_features([strings])
    positions = DynamicIntArray()
    for p in (0, 6, 16, 25):
        positions.append_element(p)
    f.obtain_by_position_list(4, positions)

    # now extract windows of size 8 from the same position list
    f.obtain_by_position_list(8, positions)

    return f
def tests_check_commwordkernel_memleak (num, order, gap, reverse):
    """Repeatedly build word features and a CommWordStringKernel to check for
    memory leaks; returns the kernel matrix of the last iteration.

    num     -- repeat count for the 'ACGT'/'TTGT' template strings
    order   -- word order used when converting chars to words
    gap     -- gap parameter for obtain_from_char
    reverse -- reverse flag for obtain_from_char
    """
    import gc
    from shogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA
    from shogun import SortWordString, MSG_DEBUG
    from shogun import CommWordStringKernel, IdentityKernelNormalizer
    from numpy import mat

    # The original code spelled both lists out element by element; each class
    # uses the identical 141-string pattern: 60x ACGT, 21x TTGT, 60x ACGT.
    POS = [num*'ACGT']*60 + [num*'TTGT']*21 + [num*'ACGT']*60
    NEG = [num*'ACGT']*60 + [num*'TTGT']*21 + [num*'ACGT']*60

    # rebuild everything ten times; a leak would show up as growing memory
    for i in range(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        traindat.set_features(POS+NEG)
        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
        #trainudat.io.set_loglevel(MSG_DEBUG)
        pre = SortWordString()
        #pre.io.set_loglevel(MSG_DEBUG)
        pre.init(trainudat)
        trainudat.add_preprocessor(pre)
        trainudat.apply_preprocessor()
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    del POS
    del NEG
    del order
    del gap
    del reverse
    return K
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features.

    kname      -- kernel name selecting the feature type
    examples   -- raw examples (numeric arrays or sequence strings)
    kparam     -- kernel parameter dict (uses kparam['degree'] for spectrum kernels)
    train_mode -- when True, fit a new sort preprocessor; otherwise reuse `preproc`
    preproc    -- preprocessor fitted during training (ignored/refit in train mode)
    seq_source -- 'dna' or 'protein'
    nuc_con    -- ambiguity-conversion policy passed to the converters

    Returns (feats, preproc).
    """
    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)

    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats
        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        #assert(ret)
        feats = wf

    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        # BUG FIX: was named `reversed`, shadowing the builtin
        use_reverse = kname == 'cumspec2'

        (ex0, ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, use_reverse)
        del f0
        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, use_reverse)
        del f1
        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        # BUG FIX: was a Python-2-only print statement; this form works on 2 and 3
        print('Unknown kernel %s' % kname)

    return (feats, preproc)