def distribution_linearhmm_modular(fm_dna=traindna, order=3, gap=0, reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import LinearHMM charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) hmm = LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples = feats.get_num_vectors() num_param = hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) out_likelihood = hmm.get_log_likelihood() out_sample = hmm.get_log_likelihood_sample() return hmm, out_likelihood, out_sample
def runShogunSVMDNASubsequenceStringKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svm feats_train = StringCharFeatures(train_xt, DNA) feats_test = StringCharFeatures(test_xt, DNA) kernel = SubsequenceStringKernel(feats_train, feats_train, MAXLEN, DECAY) kernel.io.set_loglevel(MSG_DEBUG) kernel.init(feats_train, feats_train) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm = LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1 = out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2 = out2DecisionValues.get_labels() return out1, out2, out1DecisionValues, out2DecisionValues
def runShogunSVMDNALinearStringKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svm feats_train = StringCharFeatures(train_xt, DNA) feats_test = StringCharFeatures(test_xt, DNA) kernel = LinearStringKernel(feats_train, feats_train) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm = LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1 = svm.apply(feats_train).get_labels() kernel.init(feats_train, feats_test) out2 = svm.apply(feats_test).get_labels() return out1, out2
def kernel_histogram_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,order=3,ppseudo_count=1,npseudo_count=1): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import HistogramWordStringKernel, AvgDiagKernelNormalizer from modshogun import PluginEstimate#, MSG_DEBUG charfeat=StringCharFeatures(DNA) #charfeat.io.set_loglevel(MSG_DEBUG) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, 0, False) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, 0, False) pie=PluginEstimate(ppseudo_count,npseudo_count) labels=BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=HistogramWordStringKernel(feats_train, feats_train, pie) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_weighted_comm_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,order=3,gap=0,reverse=True ): from modshogun import WeightedCommWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import SortWordString charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() use_sign=False kernel=WeightedCommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_poly_match_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, degree=2, inhomogene=True, order=3, gap=0, reverse=False): from modshogun import PolyMatchWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) kernel = PolyMatchWordStringKernel(feats_train, feats_train, degree, inhomogene) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def cross_validation(X, Y, d, c, K): N = len(Y) n = N / K accuracy_list = [] for k in range(0, K): print 'degree = %s\tC = %s\tcross_validation_iter = %s/%s' % (d, c, k + 1, K) sys.stdout.flush() X_test = list(X[k:k + n]) Y_test = list(Y[k:k + n]) X_train = [] X_train.extend(X[:k]) X_train.extend(X[k + n:]) Y_train = [] Y_train.extend(Y[:k]) Y_train.extend(Y[k + n:]) X_train = StringCharFeatures(X_train, DNA) X_test = StringCharFeatures(X_test, DNA) Y_train = BinaryLabels(np.array(Y_train, dtype=np.float64)) Y_test = np.array(Y_test) args_tuple = (X_train, Y_train, X_test, Y_test, d, c) accuracy, Y_test_proba = svm_process(args_tuple) accuracy_list.append(accuracy) return np.array(accuracy_list).mean()
def runShogunSVMDNAWDNoPositionKernel(train_xt, train_lt, test_xt): """ run svm with non-position WD kernel """ ################################################## # set up svm feats_train = StringCharFeatures(train_xt, DNA) feats_test = StringCharFeatures(test_xt, DNA) kernel = WeightedDegreeStringKernel(feats_train, feats_train, DEGREE) kernel.io.set_loglevel(MSG_DEBUG) weights=arange(1,DEGREE+1,dtype=double)[::-1]/ \ sum(arange(1,DEGREE+1,dtype=double)) kernel.set_wd_weights(weights) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm = LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1 = svm.apply(feats_train).get_labels() kernel.init(feats_train, feats_test) out2 = svm.apply(feats_test).get_labels() return out1, out2
def kernel_weighted_degree_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, degree=20): from modshogun import StringCharFeatures, DNA from modshogun import WeightedDegreeStringKernel, MSG_DEBUG feats_train = StringCharFeatures(fm_train_dna, DNA) #feats_train.io.set_loglevel(MSG_DEBUG) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) from numpy import arange, double weights=arange(1,degree+1,dtype=double)[::-1]/ \ sum(arange(1,degree+1,dtype=double)) kernel.set_wd_weights(weights) #from numpy import ones,float64,int32 #kernel.set_position_weights(ones(len(fm_train_dna[0]), dtype=float64)) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() #this is how to serializate the kernel #import pickle #pickle.dump(kernel, file('kernel_obj.dump','w'), protocol=2) #k=pickle.load(file('kernel_obj.dump','r')) return km_train, km_test, kernel
def runShogunSVMDNAWDKernel(train_xt, train_lt, test_xt): """ run svm with string kernels """ ################################################## # set up svm feats_train = StringCharFeatures(train_xt, DNA) feats_test = StringCharFeatures(test_xt, DNA) kernel = WeightedDegreePositionStringKernel(feats_train, feats_train, DEGREE) kernel.io.set_loglevel(MSG_DEBUG) kernel.set_shifts(NUMSHIFTS * ones(len(train_xt[0]), dtype=int32)) kernel.set_position_weights(ones(len(train_xt[0]), dtype=float64)) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm = LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1 = out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2 = out2DecisionValues.get_labels() return out1, out2, out1DecisionValues, out2DecisionValues
def kernel_combined_modular (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ): from modshogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel from modshogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA kernel=CombinedKernel() feats_train=CombinedFeatures() feats_test=CombinedFeatures() subkfeats_train=RealFeatures(fm_train_real) subkfeats_test=RealFeatures(fm_test_real) subkernel=GaussianKernel(10, 1.1) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train=StringCharFeatures(fm_train_dna, DNA) subkfeats_test=StringCharFeatures(fm_test_dna, DNA) degree=3 subkernel=FixedDegreeStringKernel(10, degree) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train=StringCharFeatures(fm_train_dna, DNA) subkfeats_test=StringCharFeatures(fm_test_dna, DNA) subkernel=LocalAlignmentStringKernel(10) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_match_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, degree=3, scale=1.4, size_cache=10, order=3, gap=0, reverse=False): from modshogun import MatchWordStringKernel, AvgDiagKernelNormalizer from modshogun import StringWordFeatures, StringCharFeatures, DNA charfeat = StringCharFeatures(fm_train_dna, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) charfeat = StringCharFeatures(fm_test_dna, DNA) feats_test = StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) kernel = MatchWordStringKernel(size_cache, degree) kernel.set_normalizer(AvgDiagKernelNormalizer(scale)) kernel.init(feats_train, feats_train) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def kernel_salzberg_word_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat, order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels from modshogun import SalzbergWordStringKernel from modshogun import PluginEstimate charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) pie=PluginEstimate() labels=BinaryLabels(label_train_dna) pie.set_labels(labels) pie.set_features(feats_train) pie.train() kernel=SalzbergWordStringKernel(feats_train, feats_train, pie, labels) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) pie.set_features(feats_test) pie.apply().get_labels() km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def distribution_hmm_modular(fm_cube, N, M, pseudo, order, gap, reverse, num_examples): from modshogun import StringWordFeatures, StringCharFeatures, CUBE from modshogun import HMM, BW_NORMAL charfeat=StringCharFeatures(CUBE) charfeat.set_features(fm_cube) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=HMM(feats, N, M, pseudo) hmm.train() hmm.baum_welch_viterbi_train(BW_NORMAL) num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) best_path=0 best_path_state=0 for i in range(num_examples): best_path+=hmm.best_path(i) for j in range(N): best_path_state+=hmm.get_best_path_state(i, j) lik_example = hmm.get_log_likelihood() lik_sample = hmm.get_log_likelihood_sample() return lik_example, lik_sample, hmm
def classifier_domainadaptationsvm_modular (fm_train_dna=traindna,fm_test_dna=testdna, \ label_train_dna=label_traindna, \ label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \ label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3): feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels = BinaryLabels(label_train_dna) svm = SVMLight(C, kernel, labels) svm.train() #svm.io.set_loglevel(MSG_DEBUG) ##################################### #print("obtaining DA SVM from previously trained SVM") feats_train2 = StringCharFeatures(fm_train_dna, DNA) feats_test2 = StringCharFeatures(fm_test_dna, DNA) kernel2 = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels2 = BinaryLabels(label_train_dna) # we regularize against the previously obtained solution dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0) dasvm.train() out = dasvm.apply_binary(feats_test2) return out #,dasvm TODO
def classifier_ssk_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1, maxlen=1, decay=1): from modshogun import StringCharFeatures, BinaryLabels from modshogun import LibSVM, StringSubsequenceKernel, DNA from modshogun import ErrorRateMeasure feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) labels = BinaryLabels(label_train_dna) kernel = StringSubsequenceKernel(feats_train, feats_train, maxlen, decay) svm = LibSVM(C, kernel, labels) svm.train() out = svm.apply(feats_train) evaluator = ErrorRateMeasure() trainerr = evaluator.evaluate(out, labels) # print(trainerr) kernel.init(feats_train, feats_test) predicted_labels = svm.apply(feats_test).get_labels() # print predicted_labels return predicted_labels
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString from modshogun import CanberraWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=CanberraWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def distribution_linearhmm_modular (fm_dna=traindna,order=3,gap=0,reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import LinearHMM charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) hmm=LinearHMM(feats) hmm.train() hmm.get_transition_probs() num_examples=feats.get_num_vectors() num_param=hmm.get_num_model_parameters() for i in range(num_examples): for j in range(num_param): hmm.get_log_derivative(j, i) out_likelihood = hmm.get_log_likelihood() out_sample = hmm.get_log_likelihood_sample() return hmm,out_likelihood ,out_sample
def kernel_comm_word_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, order=3, gap=0, reverse=False, use_sign=False): from modshogun import CommWordStringKernel from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import SortWordString charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel = CommWordStringKernel(feats_train, feats_train, use_sign) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def runShogunSVMDNACombinedSpectrumKernel(train_xt, train_lt, test_xt): """ run svm with combined spectrum kernel """ ################################################## # set up svm kernel=CombinedKernel() feats_train=CombinedFeatures() feats_test=CombinedFeatures() for K in KList: # Iterate through the K's and make a spectrum kernel for each charfeat_train = StringCharFeatures(train_xt, DNA) current_feats_train = StringWordFeatures(DNA) current_feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(current_feats_train) current_feats_train.add_preprocessor(preproc) current_feats_train.apply_preprocessor() feats_train.append_feature_obj(current_feats_train) charfeat_test = StringCharFeatures(test_xt, DNA) current_feats_test=StringWordFeatures(DNA) current_feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) current_feats_test.add_preprocessor(preproc) current_feats_test.apply_preprocessor() feats_test.append_feature_obj(current_feats_test) current_kernel=CommWordStringKernel(10, False) kernel.append_kernel(current_kernel) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" kernel.init(feats_train, feats_train) svm=LibSVM(SVMC, kernel, labels) svm.io.set_loglevel(MSG_DEBUG) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1=out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2=out2DecisionValues.get_labels() return out1,out2,out1DecisionValues,out2DecisionValues
def get_wd_features(data, feat_type="dna"): """ create feature object for wdk """ if feat_type == "dna": feat = StringCharFeatures(DNA) elif feat_type == "protein": feat = StringCharFeatures(PROTEIN) else: raise Exception("unknown feature type") feat.set_features(data) return feat
def kernel_linear_string_modular (fm_train_dna=traindat,fm_test_dna=testdat): from modshogun import StringCharFeatures, DNA from modshogun import LinearStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LinearStringKernel(feats_train, feats_train) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def kernel_poly_match_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,degree=3,inhomogene=False): from modshogun import PolyMatchStringKernel from modshogun import StringCharFeatures, DNA feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_train_dna, DNA) kernel=PolyMatchStringKernel(feats_train, feats_train, degree, inhomogene) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, PROTEIN) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, DNA) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def classifier_svmlight_linear_term_modular (fm_train_dna=traindna,fm_test_dna=testdna, \ label_train_dna=label_traindna,degree=3, \ C=10,epsilon=1e-5,num_threads=1): from modshogun import StringCharFeatures, BinaryLabels, DNA from modshogun import WeightedDegreeStringKernel try: from modshogun import SVMLight except ImportError: print("SVMLight is not available") exit(0) feats_train = StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test = StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels = BinaryLabels(label_train_dna) svm = SVMLight(C, kernel, labels) svm.set_qpsize(3) svm.set_linear_term( -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double)) svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) out = svm.apply().get_labels() return out, kernel
def preprocessor_sortulongstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False): from modshogun import CommUlongStringKernel from modshogun import StringCharFeatures, StringUlongFeatures, DNA from modshogun import SortUlongString charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringUlongFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringUlongFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortUlongString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommUlongStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def distance_hammingword_modular (fm_train_dna=traindna,fm_test_dna=testdna, fm_test_real=testdat,order=3,gap=0,reverse=False,use_sign=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString from modshogun import HammingWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=HammingWordDistance(feats_train, feats_train, use_sign) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def classifier_svmlight_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1.2, epsilon=1e-5, num_threads=1): from modshogun import StringCharFeatures, BinaryLabels, DNA from modshogun import WeightedDegreeStringKernel try: from modshogun import SVMLight except ImportError: print('No support for SVMLight available.') return feats_train = StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test = StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) degree = 20 kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels = BinaryLabels(label_train_dna) svm = SVMLight(C, kernel, labels) svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) svm.apply().get_labels() return kernel
def get_kernel_mat(fm_train_dna, fm_test_dna, N, M, pseudo=1e-1, order=1, gap=0, reverse=False): # train HMM for positive class print "hmm training" charfeat = StringCharFeatures(fm_train_dna, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_train = StringWordFeatures(charfeat.get_alphabet()) hmm_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) pos = HMM(hmm_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) neg = HMM(pos) print "Kernel training data" charfeat = StringCharFeatures(fm_train_dna, DNA) wordfeats_train = StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) print "Kernel testing data" charfeat = StringCharFeatures(fm_test_dna, DNA) wordfeats_test = StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) print "get kernel on training data" pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train = FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior print 'getting feature matrix' v0 = feats_train.get_feature_vector(0) v1 = feats_train.get_feature_vector(1) print np.dot(v0, v1) kernel = LinearKernel(feats_train, feats_train) #kernel=PolyKernel(feats_train, feats_train, *kargs) km_train = kernel.get_kernel_matrix() print km_train.shape, km_train[0, 1] print "get kernel on testing data" pos_clone = HMM(pos) neg_clone = HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test = FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def make_string_feature (astringv, start=1, order=8, gap=0, reverse=False): from modshogun import StringUlongFeatures, StringCharFeatures, RAWBYTE from modshogun import SortUlongString charfeat=StringCharFeatures(astringv, RAWBYTE) feats_train=StringUlongFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, start, order, gap, reverse) preproc=SortUlongString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() return feats_train
def kernel_locality_improved_string_modular (fm_train_dna=traindat,fm_test_dna=testdat,length=5,inner_degree=5,outer_degree=7): from modshogun import StringCharFeatures, DNA from modshogun import LocalityImprovedStringKernel feats_train=StringCharFeatures(fm_train_dna, DNA) feats_test=StringCharFeatures(fm_test_dna, DNA) kernel=LocalityImprovedStringKernel( feats_train, feats_train, length, inner_degree, outer_degree) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def get_k_nearest_docs(self, docs_list, doc_index, k, keywords_list): cleaned_doc_list = [] for doc in docs_list: cleaned_doc_list.append(self.get_clean_text(doc)) features = StringCharFeatures(cleaned_doc_list, RAWBYTE) # print(dir(features)) n = 7 lambda_sym = 0.2 sk = SubsequenceStringKernel(features, features, n, lambda_sym) sim_mat = sk.get_kernel_matrix() # print("sim_mat.shape : ", sim_mat.shape) similarity_scores_with_indices = [ (sim_score, i) for i, sim_score in list(enumerate(sim_mat[doc_index])) ] k_nearest_docs_with_indices = sorted(similarity_scores_with_indices, reverse=True)[:k] # print("k_nearest_docs_with_indices : ", k_nearest_docs_with_indices) k_nearest_docs = [ docs_list[i] for score, i in k_nearest_docs_with_indices ] k_nearest_doc_scores = np.array([ score for score, i in k_nearest_docs_with_indices ]).reshape(k, 1, 1) return k_nearest_docs, k_nearest_doc_scores
def kernel_fixed_degree_string_modular(fm_train_dna=traindat, fm_test_dna=testdat, degree=3): from modshogun import StringCharFeatures, DNA from modshogun import FixedDegreeStringKernel feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = FixedDegreeStringKernel(feats_train, feats_train, degree) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def features_hasheddocdot_modular(strings): from modshogun import StringCharFeatures, RAWBYTE from modshogun import HashedDocDotFeatures from modshogun import NGramTokenizer from numpy import array #create string features f = StringCharFeatures(strings, RAWBYTE) #set the number of bits of the target dimension #means a dim of size 2^5=32 num_bits = 5 #create the ngram tokenizer of size 8 to parse the strings tokenizer = NGramTokenizer(8) #normalize results normalize = True #create HashedDocDot features hddf = HashedDocDotFeatures(num_bits, f, tokenizer, normalize) #should expect 32 #print('Feature space dimensionality is', hddf.get_dim_feature_space()) #print('Self dot product of string 0', hddf.dot(0, hddf, 0)) return hddf
def runShogunOneClassSVMDNASpectrumKernel(train_xt, train_lt, test_xt): """ run svm with spectrum kernel """ ################################################## # set up svr charfeat_train = StringCharFeatures(train_xt, DNA) feats_train = StringWordFeatures(DNA) feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat_test = StringCharFeatures(test_xt, DNA) feats_test=StringWordFeatures(DNA) feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() kernel=CommWordStringKernel(feats_train, feats_train, False) kernel.io.set_loglevel(MSG_DEBUG) # init kernel labels = BinaryLabels(train_lt) # run svm model print "Ready to train!" svm=LibSVMOneClass(SVMC, kernel) svm.set_epsilon(EPSILON) svm.train() # predictions print "Making predictions!" out1DecisionValues = svm.apply(feats_train) out1=out1DecisionValues.get_labels() kernel.init(feats_train, feats_test) out2DecisionValues = svm.apply(feats_test) out2=out2DecisionValues.get_labels() # predictions = svm.apply(feats_test) # return predictions, svm, predictions.get_labels() return out1,out2,out1DecisionValues,out2DecisionValues
def distribution_ppwm_modular(fm_dna=traindna, order=3): from modshogun import StringByteFeatures, StringCharFeatures, DNA from modshogun import PositionalPWM from numpy import array, e, log, exp charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringByteFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, 0, False) L = 20 k = 3 sigma = 1 mu = 4 ppwm = PositionalPWM() ppwm.set_sigma(sigma) ppwm.set_mean(mu) pwm = array([[0.0, 0.5, 0.1, 1.0], [0.0, 0.5, 0.5, 0.0], [1.0, 0.0, 0.4, 0.0], [0.0, 0.0, 0.0, 0.0]]) pwm = array([[0.01, 0.09, 0.1], [0.09, 0.01, 0.1], [0.85, 0.4, 0.1], [0.05, 0.5, 0.7]]) ppwm.set_pwm(log(pwm)) #print(ppwm.get_pwm()) ppwm.compute_w(L) w = ppwm.get_w() #print(w) #from pylab import * #figure(1) #pcolor(exp(w)) #pcolor(w) #colorbar() #figure(2) ppwm.compute_scoring(1) u = ppwm.get_scoring(0) #pcolor(exp(u)) #show() #ppwm=PositionalPWM(feats) #ppwm.train() #out_likelihood = histo.get_log_likelihood() #out_sample = histo.get_log_likelihood_sample() return w, u
def kernel_distantsegments_modular(fm_train_dna=traindat, fm_test_dna=testdat, delta=5, theta=5): from modshogun import StringCharFeatures, DNA from modshogun import DistantSegmentsKernel feats_train = StringCharFeatures(fm_train_dna, DNA) feats_test = StringCharFeatures(fm_test_dna, DNA) kernel = DistantSegmentsKernel(feats_train, feats_train, 10, delta, theta) km_train = kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test = kernel.get_kernel_matrix() return km_train, km_test, kernel
def distribution_ppwm_modular(fm_dna=traindna, order=3): from modshogun import StringByteFeatures, StringCharFeatures, DNA from modshogun import PositionalPWM from numpy import array, e, log, exp charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringByteFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, 0, False) L = 20 k = 3 sigma = 1 mu = 4 ppwm = PositionalPWM() ppwm.set_sigma(sigma) ppwm.set_mean(mu) pwm = array([[0.0, 0.5, 0.1, 1.0], [0.0, 0.5, 0.5, 0.0], [1.0, 0.0, 0.4, 0.0], [0.0, 0.0, 0.0, 0.0]]) pwm = array([[0.01, 0.09, 0.1], [0.09, 0.01, 0.1], [0.85, 0.4, 0.1], [0.05, 0.5, 0.7]]) ppwm.set_pwm(log(pwm)) # print(ppwm.get_pwm()) ppwm.compute_w(L) w = ppwm.get_w() # print(w) # from pylab import * # figure(1) # pcolor(exp(w)) # pcolor(w) # colorbar() # figure(2) ppwm.compute_scoring(1) u = ppwm.get_scoring(0) # pcolor(exp(u)) # show() # ppwm=PositionalPWM(feats) # ppwm.train() # out_likelihood = histo.get_log_likelihood() # out_sample = histo.get_log_likelihood_sample() return w, u
def features_string_char_modular (strings): from modshogun import StringCharFeatures, RAWBYTE from numpy import array #create string features f=StringCharFeatures(strings, RAWBYTE) #and output several stats #print("max string length", f.get_max_vector_length()) #print("number of strings", f.get_num_vectors()) #print("length of first string", f.get_vector_length(0)) #print("string[5]", ''.join(f.get_feature_vector(5))) #print("strings", f.get_features()) #replace string 0 f.set_feature_vector(array(['t','e','s','t']), 0) #print("strings", f.get_features()) return f.get_features(), f
def get_predictions(self, sequence, positions): seqlen=self.window_right+self.window_left+2 num=len(positions) testdat = [] for j in xrange(num): i=positions[j] - self.offset ; s=sequence[i-self.window_left:i+self.window_right+2] testdat.append(s) t=StringCharFeatures(DNA) t.set_string_features(testdat) self.wd_kernel.init(self.traindat, t) l=self.svm.classify().get_labels() sys.stderr.write("\n...done...\n") return l
def kernel_fisher_modular (fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, N=1,M=4,pseudo=1e-1,order=1,gap=0,reverse=False, kargs=[1,False,True]): from modshogun import StringCharFeatures, StringWordFeatures, FKFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL#, MSG_DEBUG # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def features_string_file_modular (directory, fname): from modshogun import StringCharFeatures, RAWBYTE from modshogun import CSVFile # load features from directory f=StringCharFeatures(RAWBYTE) f.load_from_directory(directory) #and output several stats #print("max string length", f.get_max_vector_length()) #print("number of strings", f.get_num_vectors()) #print("length of first string", f.get_vector_length(0)) #print("str[0,0:3]", f.get_feature(0,0), f.get_feature(0,1), f.get_feature(0,2)) #print("len(str[0])", f.get_vector_length(0)) #print("str[0]", f.get_feature_vector(0)) #or load features from file (one string per line) fil=CSVFile(fname) f.load(fil) #print(f.get_features()) #or load fasta file #f.load_fasta('fasta.fa') #print(f.get_features()) return f.get_features(), f
def classifier_svmlight_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,C=1.2,epsilon=1e-5,num_threads=1): from modshogun import StringCharFeatures, BinaryLabels, DNA from modshogun import WeightedDegreeStringKernel try: from modshogun import SVMLight except ImportError: print('No support for SVMLight available.') return feats_train=StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test=StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) degree=20 kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) labels=BinaryLabels(label_train_dna) svm=SVMLight(C, kernel, labels) svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) svm.apply().get_labels() return kernel
def kernel_top_modular (fm_train_dna=traindat,fm_test_dna=testdat,label_train_dna=label_traindat,pseudo=1e-1, order=1,gap=0,reverse=False,kargs=[1, False, True]): from modshogun import StringCharFeatures, StringWordFeatures, TOPFeatures, DNA from modshogun import PolyKernel from modshogun import HMM, BW_NORMAL N=1 # toy HMM with 1 state M=4 # 4 observations -> DNA # train HMM for positive class charfeat=StringCharFeatures(fm_hmm_pos, DNA) hmm_pos_train=StringWordFeatures(charfeat.get_alphabet()) hmm_pos_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_pos_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) # train HMM for negative class charfeat=StringCharFeatures(fm_hmm_neg, DNA) hmm_neg_train=StringWordFeatures(charfeat.get_alphabet()) hmm_neg_train.obtain_from_char(charfeat, order-1, order, gap, reverse) neg=HMM(hmm_neg_train, N, M, pseudo) neg.baum_welch_viterbi_train(BW_NORMAL) # Kernel training data charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) # Kernel testing data charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) # get kernel on training data pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=TOPFeatures(10, pos, neg, False, False) kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() # get kernel on testing data pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=TOPFeatures(10, pos_clone, neg_clone, False, False) kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def classifier_svmlight_linear_term_modular (fm_train_dna=traindna,fm_test_dna=testdna, \ label_train_dna=label_traindna,degree=3, \ C=10,epsilon=1e-5,num_threads=1): from modshogun import StringCharFeatures, BinaryLabels, DNA from modshogun import WeightedDegreeStringKernel try: from modshogun import SVMLight except ImportError: print("SVMLight is not available") exit(0) feats_train=StringCharFeatures(DNA) feats_train.set_features(fm_train_dna) feats_test=StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) kernel=WeightedDegreeStringKernel(feats_train, feats_train, degree) labels=BinaryLabels(label_train_dna) svm=SVMLight(C, kernel, labels) svm.set_qpsize(3) svm.set_linear_term(-numpy.array([1,2,3,4,5,6,7,8,7,6], dtype=numpy.double)); svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) out = svm.apply().get_labels() return out,kernel
def get_predictions_from_seqdict(self, seqdic, site): """ we need to generate a huge test features object containing all locations found in each seqdict-sequence and each location (this is necessary to efficiently (==fast,low memory) compute the splice outputs """ seqlen=self.window_right+self.window_left+2 num=0 for s in seqdic: num+= len(s.preds[site].positions) testdat = [] for s in seqdic: sequence=s.seq positions=s.preds[site].positions for j in xrange(len(positions)): i=positions[j] - self.offset s=sequence[i-self.window_left:i+self.window_right+2] testdat.append(s) t=StringCharFeatures(DNA) t.set_string_features(testdat) self.wd_kernel.init(self.traindat, t) l=self.svm.classify().get_labels() sys.stderr.write("\n...done...\n") k=0 for s in seqdic: num=len(s.preds[site].positions) scores= num * [0] for j in xrange(num): scores[j]=l[k] k+=1 s.preds[site].set_scores(scores)
def distribution_histogram_modular(fm_dna=traindna, order=3, gap=0, reverse=False): from modshogun import StringWordFeatures, StringCharFeatures, DNA from modshogun import Histogram charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_dna) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) histo = Histogram(feats) histo.train() histo.get_histogram() num_examples = feats.get_num_vectors() num_param = histo.get_num_model_parameters() # for i in xrange(num_examples): # for j in xrange(num_param): # histo.get_log_derivative(j, i) out_likelihood = histo.get_log_likelihood() out_sample = histo.get_log_likelihood_sample() return histo, out_sample, out_likelihood
def distance_manhattenword_modular (train_fname=traindna,test_fname=testdna,order=3,gap=0,reverse=False): from modshogun import StringCharFeatures, StringWordFeatures, DNA from modshogun import SortWordString, ManhattanWordDistance, CSVFile charfeat=StringCharFeatures(CSVFile(train_fname), DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preprocessor(preproc) feats_train.apply_preprocessor() charfeat=StringCharFeatures(CSVFile(test_fname), DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preprocessor(preproc) feats_test.apply_preprocessor() distance=ManhattanWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return dm_train,dm_test
def get_kernel_mat(fm_train_dna, fm_test_dna, N, M, pseudo=1e-1,order=1,gap=0,reverse=False): # train HMM for positive class print "hmm training" charfeat=StringCharFeatures(fm_train_dna, DNA) #charfeat.io.set_loglevel(MSG_DEBUG) hmm_train=StringWordFeatures(charfeat.get_alphabet()) hmm_train.obtain_from_char(charfeat, order-1, order, gap, reverse) pos=HMM(hmm_train, N, M, pseudo) pos.baum_welch_viterbi_train(BW_NORMAL) neg = HMM(pos) print "Kernel training data" charfeat=StringCharFeatures(fm_train_dna, DNA) wordfeats_train=StringWordFeatures(charfeat.get_alphabet()) wordfeats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) print "Kernel testing data" charfeat=StringCharFeatures(fm_test_dna, DNA) wordfeats_test=StringWordFeatures(charfeat.get_alphabet()) wordfeats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) print "get kernel on training data" pos.set_observations(wordfeats_train) neg.set_observations(wordfeats_train) feats_train=FKFeatures(10, pos, neg) feats_train.set_opt_a(-1) #estimate prior print 'getting feature matrix' v0 = feats_train.get_feature_vector(0) v1 = feats_train.get_feature_vector(1) print np.dot(v0, v1) kernel=LinearKernel(feats_train, feats_train) #kernel=PolyKernel(feats_train, feats_train, *kargs) km_train=kernel.get_kernel_matrix() print km_train.shape, km_train[0, 1] print "get kernel on testing data" pos_clone=HMM(pos) neg_clone=HMM(neg) pos_clone.set_observations(wordfeats_test) neg_clone.set_observations(wordfeats_test) feats_test=FKFeatures(10, pos_clone, neg_clone) feats_test.set_a(feats_train.get_a()) #use prior from training data kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def classifier_svmlight_batch_linadd_modular( fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads ): from modshogun import StringCharFeatures, BinaryLabels, DNA from modshogun import WeightedDegreeStringKernel, MSG_DEBUG try: from modshogun import SVMLight except ImportError: print("No support for SVMLight available.") return feats_train = StringCharFeatures(DNA) # feats_train.io.set_loglevel(MSG_DEBUG) feats_train.set_features(fm_train_dna) feats_test = StringCharFeatures(DNA) feats_test.set_features(fm_test_dna) degree = 20 kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree) labels = BinaryLabels(label_train_dna) svm = SVMLight(C, kernel, labels) svm.set_epsilon(epsilon) svm.parallel.set_num_threads(num_threads) svm.train() kernel.init(feats_train, feats_test) # print('SVMLight Objective: %f num_sv: %d' % \) # (svm.get_objective(), svm.get_num_support_vectors()) svm.set_batch_computation_enabled(False) svm.set_linadd_enabled(False) svm.apply().get_labels() svm.set_batch_computation_enabled(True) labels = svm.apply().get_labels() return labels, svm
def features_string_char_compressed_modular(fname): from modshogun import StringCharFeatures, StringFileCharFeatures, RAWBYTE from modshogun import UNCOMPRESSED, SNAPPY, LZO, GZIP, BZIP2, LZMA, MSG_DEBUG from modshogun import DecompressCharString f = StringFileCharFeatures(fname, RAWBYTE) # print("original strings", f.get_features()) # uncompressed f.save_compressed("tmp/foo_uncompressed.str", UNCOMPRESSED, 1) f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_uncompressed.str", True) # print("uncompressed strings", f2.get_features()) # print # load compressed data and uncompress on load # snappy - not stable yet?! # f.save_compressed("tmp/foo_snappy.str", SNAPPY, 9) # f2=StringCharFeatures(RAWBYTE); # f2.load_compressed("tmp/foo_snappy.str", True) # print("snappy strings", f2.get_features()) # print # lzo f.save_compressed("tmp/foo_lzo.str", LZO, 9) f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_lzo.str", True) # print("lzo strings", f2.get_features()) # print ##gzip f.save_compressed("tmp/foo_gzip.str", GZIP, 9) f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_gzip.str", True) # print("gzip strings", f2.get_features()) # print # bzip2 f.save_compressed("tmp/foo_bzip2.str", BZIP2, 9) f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_bzip2.str", True) # print("bzip2 strings", f2.get_features()) # print # lzma f.save_compressed("tmp/foo_lzma.str", LZMA, 9) f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_lzma.str", True) # print("lzma strings", f2.get_features()) # print # load compressed data and uncompress via preprocessor f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_lzo.str", False) f2.add_preprocessor(DecompressCharString(LZO)) f2.apply_preprocessor() # print("lzo strings", f2.get_features()) # print # load compressed data and uncompress on-the-fly via preprocessor f2 = StringCharFeatures(RAWBYTE) f2.load_compressed("tmp/foo_lzo.str", False) # f2.io.set_loglevel(MSG_DEBUG) f2.add_preprocessor(DecompressCharString(LZO)) f2.enable_on_the_fly_preprocessing() # print("lzo strings", f2.get_features()) # print # clean up import os for f in [ "tmp/foo_uncompressed.str", "tmp/foo_snappy.str", "tmp/foo_lzo.str", "tmp/foo_gzip.str", "tmp/foo_bzip2.str", "tmp/foo_lzma.str", "tmp/foo_lzo.str", "tmp/foo_lzo.str", ]: if os.path.exists(f): os.unlink(f)
def features_string_sliding_window_modular (strings): from modshogun import StringCharFeatures, DNA from modshogun import DynamicIntArray f=StringCharFeatures([strings], DNA) # slide a window of length 5 over features # (memory efficient, does not copy strings) f.obtain_by_sliding_window(5,1) #print(f.get_num_vectors()) #print(f.get_vector_length(0)) #print(f.get_vector_length(1)) #print(f.get_features()) # slide a window of length 4 over features # (memory efficient, does not copy strings) f.obtain_by_sliding_window(4,1) #print(f.get_num_vectors()) #print(f.get_vector_length(0)) #print(f.get_vector_length(1)) #print(f.get_features()) # extract string-windows at position 0,6,16,25 of window size 4 # (memory efficient, does not copy strings) f.set_features([s]) positions=DynamicIntArray() positions.append_element(0) positions.append_element(6) positions.append_element(16) positions.append_element(25) f.obtain_by_position_list(4,positions) #print(f.get_features()) # now extract windows of size 8 from same positon list f.obtain_by_position_list(8,positions) #print(f.get_features()) return f
def tests_check_commwordkernel_memleak_modular (num, order, gap, reverse): import gc from modshogun import Alphabet,StringCharFeatures,StringWordFeatures,DNA from modshogun import SortWordString, MSG_DEBUG from modshogun import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] for i in range(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preprocessor(pre) trainudat.apply_preprocessor() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K