def create_hashed_features_spectrum(param, data): """ creates hashed dot features for the spectrum kernel """ # extract parameters order = param["degree_spectrum"] # fixed parameters gap = 0 reverse = True normalize = True # create features feats_char = StringCharFeatures(data, DNA) feats_word = StringWordFeatures(feats_char.get_alphabet()) feats_word.obtain_from_char(feats_char, order-1, order, gap, reverse) # create preproc preproc = SortWordString() preproc.init(feats_word) feats_word.add_preproc(preproc) feats_word.apply_preproc() # finish feats = ImplicitWeightedSpecFeatures(feats_word, normalize) return feats
def manhattan_word_distance (): print 'ManhattanWordDistance' from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString from shogun.Distance import ManhattanWordDistance order=3 gap=0 reverse=False charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() distance=ManhattanWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix()
def init_sensor(self, kernel, svs): f = StringCharFeatures(svs, DNA) kname = kernel['name'] if kname == 'spectrum': wf = StringWordFeatures(f.get_alphabet()) wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False) pre = SortWordString() pre.init(wf) wf.add_preproc(pre) wf.apply_preproc() f = wf k = CommWordStringKernel(0, False) k.set_use_dict_diagonal_optimization(kernel['order'] < 8) self.preproc = pre elif kname == 'wdshift': k = WeightedDegreePositionStringKernel(0, kernel['order']) k.set_normalizer(IdentityKernelNormalizer()) k.set_shifts( kernel['shift'] * numpy.ones(f.get_max_vector_length(), dtype=numpy.int32)) k.set_position_weights( 1.0 / f.get_max_vector_length() * numpy.ones(f.get_max_vector_length(), dtype=numpy.float64)) else: raise "Currently, only wdshift and spectrum kernels supported" self.kernel = k self.train_features = f return (self.kernel, self.train_features)
def non_redundant_word_features(feats, kmerlen): """convert the features from Shogun toolbox to non-redundant word features (handle reverse complements) Arguments: feats -- StringWordFeatures kmerlen -- integer, length of k-mer Return: StringWordFeatures after converting reverse complement k-mer ids """ rcmap = g_rcmap for i in xrange(feats.get_num_vectors()): nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)] feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i) preproc = SortWordString() preproc.init(feats) try: feats.add_preproc(preproc) feats.apply_preproc() except AttributeError: feats.add_preprocessor(preproc) feats.apply_preprocessor() return feats
def sort_word_string (): print 'CommWordString' from shogun.Kernel import CommWordStringKernel from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString order=3 gap=0 reverse=False charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() use_sign=False kernel=CommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix()
def preproc_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False): from shogun.Kernel import CommWordStringKernel from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString charfeat=StringCharFeatures(fm_train_dna, DNA) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(fm_test_dna, DNA) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() kernel=CommWordStringKernel(feats_train, feats_train, use_sign) km_train=kernel.get_kernel_matrix() kernel.init(feats_train, feats_test) km_test=kernel.get_kernel_matrix() return km_train,km_test,kernel
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False): from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString from shogun.Distance import CanberraWordDistance charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() distance=CanberraWordDistance(feats_train, feats_train) dm_train=distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test=distance.get_distance_matrix() return distance,dm_train,dm_test
def init_sensor(self, kernel, svs): f = StringCharFeatures(svs, DNA) kname = kernel['name'] if kname == 'spectrum': wf = StringWordFeatures(f.get_alphabet()) wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False) pre = SortWordString() pre.init(wf) wf.add_preproc(pre) wf.apply_preproc() f = wf k = CommWordStringKernel(0, False) k.set_use_dict_diagonal_optimization(kernel['order'] < 8) self.preproc = pre elif kname == 'wdshift': k = WeightedDegreePositionStringKernel(0, kernel['order']) k.set_normalizer(IdentityKernelNormalizer()) k.set_shifts(kernel['shift'] * numpy.ones(f.get_max_vector_length(), dtype=numpy.int32)) k.set_position_weights(1.0 / f.get_max_vector_length() * numpy.ones(f.get_max_vector_length(), dtype=numpy.float64)) else: raise "Currently, only wdshift and spectrum kernels supported" self.kernel = k self.train_features = f return (self.kernel, self.train_features)
def create_hashed_features_spectrum(param, data): """ creates hashed dot features for the spectrum kernel """ # extract parameters order = param["degree_spectrum"] # fixed parameters gap = 0 reverse = True normalize = True # create features feats_char = StringCharFeatures(data, DNA) feats_word = StringWordFeatures(feats_char.get_alphabet()) feats_word.obtain_from_char(feats_char, order - 1, order, gap, reverse) # create preproc preproc = SortWordString() preproc.init(feats_word) feats_word.add_preproc(preproc) feats_word.apply_preproc() # finish feats = ImplicitWeightedSpecFeatures(feats_word, normalize) return feats
def distance_hammingword_modular(fm_train_dna=traindna, fm_test_dna=testdna, fm_test_real=testdat, order=3, gap=0, reverse=False, use_sign=False): from shogun.Features import StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString from shogun.Distance import HammingWordDistance charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_train_dna) feats_train = StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat = StringCharFeatures(DNA) charfeat.set_features(fm_test_dna) feats_test = StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() distance = HammingWordDistance(feats_train, feats_train, use_sign) dm_train = distance.get_distance_matrix() distance.init(feats_train, feats_test) dm_test = distance.get_distance_matrix() return distance, dm_train, dm_test
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, DNA) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def K(self,logtheta,*args): """new version using shogun""" x1 = args[0][:,self.index].copy() if(len(args)==1): x2 = x1.copy() else: x2 = args[1][:,self.index].copy() V0 = exp(2*logtheta[0]) #try to get K from cache if (self.x1_.shape[0]==x1.shape[0]) and (self.x2_.shape[0]==x2.shape[0]): if ((self.x1_==x1).all() and (self.x2_==x2).all()): K = self.K_ K = K*V0 return K DNA = 0 order = 3 gap =0 reverse = False charfeat=StringCharFeatures(DNA) charfeat.set_string_features(x1.tolist()) feats_train=StringWordFeatures(charfeat.get_alphabet()) feats_train.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() charfeat=StringCharFeatures(DNA) charfeat.set_string_features(x2.tolist()) feats_test=StringWordFeatures(charfeat.get_alphabet()) feats_test.obtain_from_char(charfeat, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() use_sign=False kernel=CommWordStringKernel(feats_train, feats_train, use_sign) kernel.init(feats_train,feats_test) K=kernel.get_kernel_matrix() self.K_ = K self.x1_ = x1 self.x2_ = x2 #scale factor K=K*V0 return K
def get_spectrum_features(data, order=3, gap=0, reverse=True): """ create feature object used by spectrum kernel """ charfeat = StringCharFeatures(data, DNA) feat = StringWordFeatures(charfeat.get_alphabet()) feat.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc = SortWordString() preproc.init(feat) feat.add_preprocessor(preproc) feat.apply_preprocessor() return feat
def perform_clustering(mss_id): import numpy import expenv mss = expenv.MultiSplitSet.get(mss_id) from method_mhc_mkl import SequencesHandler from shogun.Distance import EuclidianDistance, HammingWordDistance from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN from shogun.Clustering import Hierarchical from shogun.PreProc import SortWordString order = 1 gap = 0 reverse = False seq_handler = SequencesHandler() data = [seq_handler.get_seq(ss.dataset.organism) for ss in mss.split_sets] charfeat=StringCharFeatures(PROTEIN) charfeat.set_features(data) feats=StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats) feats.add_preproc(preproc) feats.apply_preproc() use_sign = False distance = HammingWordDistance(feats, feats, use_sign) #distance = EuclidianDistance() merges=4 hierarchical=Hierarchical(merges, distance) hierarchical.train() hierarchical.get_merge_distances() hierarchical.get_cluster_pairs() return hierarchical
def perform_clustering(mss_id): import numpy import expenv mss = expenv.MultiSplitSet.get(mss_id) from method_mhc_mkl import SequencesHandler from shogun.Distance import EuclidianDistance, HammingWordDistance from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN from shogun.Clustering import Hierarchical from shogun.PreProc import SortWordString order = 1 gap = 0 reverse = False seq_handler = SequencesHandler() data = [seq_handler.get_seq(ss.dataset.organism) for ss in mss.split_sets] charfeat = StringCharFeatures(PROTEIN) charfeat.set_features(data) feats = StringWordFeatures(charfeat.get_alphabet()) feats.obtain_from_char(charfeat, order - 1, order, gap, reverse) preproc = SortWordString() preproc.init(feats) feats.add_preproc(preproc) feats.apply_preproc() use_sign = False distance = HammingWordDistance(feats, feats, use_sign) #distance = EuclidianDistance() merges = 4 hierarchical = Hierarchical(merges, distance) hierarchical.train() hierarchical.get_merge_distances() hierarchical.get_cluster_pairs() return hierarchical
def seqToWordFeatures(train,test,order=7,gap=10): charfeat_train, charfeat_test = seqToStringFeatures(train,test) reverse=False feats_train=StringWordFeatures(charfeat_train.get_alphabet()) feats_train.obtain_from_char(charfeat_train, order-1, order, gap, reverse) preproc=SortWordString() preproc.init(feats_train) feats_train.add_preproc(preproc) feats_train.apply_preproc() reverse = False feats_test=StringWordFeatures(charfeat_test.get_alphabet()) feats_test.obtain_from_char(charfeat_test, order-1, order, gap, reverse) feats_test.add_preproc(preproc) feats_test.apply_preproc() #pdb.set_trace() return feats_train, feats_test
def non_redundant_word_features(feats, kmerlen): """convert the features from Shogun toolbox to non-redundant word features (handle reverse complements) Arguments: feats -- StringWordFeatures kmerlen -- integer, length of k-mer Return: StringWordFeatures after converting reverse complement k-mer ids """ rcmap = g_rcmap # the number of sequences, 1923 for CT18 window size 5000 print 'The number of feature vectors: %s' % (feats.get_num_vectors()) # one vector for each sequence for i in xrange(feats.get_num_vectors()): # a list of kmer id for each feature vector # some k-mer ids may be redundant nf = [rcmap[int(kmerid)] for kmerid in feats.get_feature_vector(i)] ''' set_feature_vector (SGVector< ST > vector, int32_t num) ''' feats.set_feature_vector(numpy.array(nf, numpy.dtype('u2')), i) # sort features by kmer id preproc = SortWordString() preproc.init(feats) try: feats.add_preproc(preproc) feats.apply_preproc() except AttributeError: feats.add_preprocessor(preproc) feats.apply_preprocessor() return feats
def normal_word_feature(feats): preproc = SortWordString() preproc.init(feats) feats.add_preprocessor(preproc) feats.apply_preprocessor() return feats
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse): import gc from shogun.Features import Alphabet, StringCharFeatures, StringWordFeatures, DNA from shogun.PreProc import SortWordString, MSG_DEBUG from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS = [ num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT' ] NEG = [ num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'TTGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT', num * 'ACGT' ] for i in xrange(10): alpha = Alphabet(DNA) traindat = StringCharFeatures(alpha) traindat.set_features(POS + NEG) trainudat = StringWordFeatures(traindat.get_alphabet()) trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preproc(pre) trainudat.apply_preproc() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K = spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT'] order=7 gap=0 reverse=False for i in xrange(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preproc(pre) trainudat.apply_preproc() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=mat(spec.get_kernel_matrix()) del POS del NEG del order del gap del reverse
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con): """Converts numpy arrays or sequences into shogun features""" if kname == 'gauss' or kname == 'linear' or kname == 'poly': examples = numpy.array(examples) feats = RealFeatures(examples) elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove': if seq_source == 'dna': examples = non_atcg_convert(examples, nuc_con) feats = StringCharFeatures(examples, DNA) elif seq_source == 'protein': examples = non_aminoacid_converter(examples, nuc_con) feats = StringCharFeatures(examples, PROTEIN) else: sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n") sys.exit(-1) elif kname == 'spec' or kname == 'cumspec': if seq_source == 'dna': examples = non_atcg_convert(examples, nuc_con) feats = StringCharFeatures(examples, DNA) elif seq_source == 'protein': examples = non_aminoacid_converter(examples, nuc_con) feats = StringCharFeatures(examples, PROTEIN) else: sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n") sys.exit(-1) wf = StringUlongFeatures( feats.get_alphabet() ) wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec') del feats if train_mode: preproc = SortUlongString() preproc.init(wf) wf.add_preproc(preproc) ret = wf.apply_preproc() #assert(ret) feats = wf elif kname == 'spec2' or kname == 'cumspec2': # spectrum kernel on two sequences feats = {} feats['combined'] = CombinedFeatures() reversed = kname=='cumspec2' (ex0,ex1) = zip(*examples) f0 = StringCharFeatures(list(ex0), DNA) wf = StringWordFeatures(f0.get_alphabet()) wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed) del f0 if train_mode: preproc = SortWordString() preproc.init(wf) wf.add_preprocessor(preproc) ret = wf.apply_preprocessors() assert(ret) feats['combined'].append_feature_obj(wf) feats['f0'] = wf f1 = StringCharFeatures(list(ex1), DNA) wf = StringWordFeatures( f1.get_alphabet() ) wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed) del f1 if train_mode: preproc = SortWordString() preproc.init(wf) wf.add_preproc(preproc) ret = wf.apply_preproc() assert(ret) feats['combined'].append_feature_obj(wf) feats['f1'] = wf else: print 'Unknown kernel %s' % kname return (feats,preproc)
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse): import gc from shogun.Features import Alphabet,StringCharFeatures,StringWordFeatures,DNA from shogun.PreProc import SortWordString, MSG_DEBUG from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer from numpy import mat POS=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] NEG=[num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'TTGT', num*'TTGT', num*'TTGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT',num*'ACGT', num*'ACGT', num*'ACGT'] for i in xrange(10): alpha=Alphabet(DNA) traindat=StringCharFeatures(alpha) traindat.set_features(POS+NEG) trainudat=StringWordFeatures(traindat.get_alphabet()); trainudat.obtain_from_char(traindat, order-1, order, gap, reverse) #trainudat.io.set_loglevel(MSG_DEBUG) pre = SortWordString() #pre.io.set_loglevel(MSG_DEBUG) pre.init(trainudat) trainudat.add_preproc(pre) trainudat.apply_preproc() spec = CommWordStringKernel(10, False) spec.set_normalizer(IdentityKernelNormalizer()) spec.init(trainudat, trainudat) K=spec.get_kernel_matrix() del POS del NEG del order del gap del reverse return K