def manhattan_word_distance():
    """Evaluate ManhattanWordDistance on DNA word features.

    The input strings are read from the names ``fm_train_dna`` and
    ``fm_test_dna``; they are not defined here and must exist in the
    enclosing module scope. Both sets are converted to word features with
    ``obtain_from_char`` (order 3, gap 0, no reversal) and passed through one
    shared ``SortWordString`` preprocessor that is initialised on the
    training features only.

    Returns:
        tuple: ``(distance, dm_train, dm_test)`` -- the distance object, the
        train-vs-train distance matrix and the train-vs-test distance matrix.
    """
    # A single-argument print(...) behaves the same on Python 2 and Python 3.
    print('ManhattanWordDistance')

    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import ManhattanWordDistance

    order = 3
    gap = 0
    reverse = False

    # Training side: char features -> word features -> sorted words.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_train_dna)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    # Test side reuses the preprocessor initialised on the training data.
    charfeat = StringCharFeatures(DNA)
    charfeat.set_features(fm_test_dna)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    distance = ManhattanWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    # Re-initialise the same distance object for train-vs-test.
    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    # Previously the matrices were computed and thrown away; hand them back
    # in the same order the sibling distance examples use.
    return distance, dm_train, dm_test
def kernel_comm_word_string_modular(fm_train_dna=traindat,
                                    fm_test_dna=testdat,
                                    order=3,
                                    gap=0,
                                    reverse=False,
                                    use_sign=False):
    """Compute CommWordStringKernel matrices for train and test DNA strings.

    Both inputs are turned into word features via ``obtain_from_char`` and
    run through a single ``SortWordString`` preprocessor that is initialised
    on the training features.

    Returns:
        tuple: ``(km_train, km_test, kernel)`` -- the train-vs-train kernel
        matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringWordFeatures, StringCharFeatures, DNA
    from shogun.Preprocessor import SortWordString

    def _word_features(strings):
        # Wrap raw strings as DNA char features and derive word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The test features share the sorter initialised on the training data.
    test_words = _word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    kernel = CommWordStringKernel(train_words, train_words, use_sign)
    train_matrix = kernel.get_kernel_matrix()

    kernel.init(train_words, test_words)
    test_matrix = kernel.get_kernel_matrix()

    return train_matrix, test_matrix, kernel
def sort_word_string():
    """Run the SortWordString preprocessor example with CommWordStringKernel.

    The input strings are read from the names ``fm_train_dna`` and
    ``fm_test_dna``; they are not defined here and must exist in the
    enclosing module scope. Word features are built with order 3, gap 0 and
    no reversal, sorted by one ``SortWordString`` preprocessor initialised on
    the training features, and fed to a ``CommWordStringKernel`` with
    ``use_sign=False``.

    Returns:
        tuple: ``(km_train, km_test, kernel)`` -- the train-vs-train kernel
        matrix, the train-vs-test kernel matrix and the kernel object.
    """
    # A single-argument print(...) behaves the same on Python 2 and Python 3.
    print('CommWordString')

    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString

    order = 3
    gap = 0
    reverse = False
    use_sign = False

    # Training side: char features -> word features -> sorted words.
    charfeat = StringCharFeatures(fm_train_dna, DNA)
    feats_train = StringWordFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    # Test side reuses the preprocessor initialised on the training data.
    charfeat = StringCharFeatures(fm_test_dna, DNA)
    feats_test = StringWordFeatures(charfeat.get_alphabet())
    feats_test.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats_test.add_preproc(preproc)
    feats_test.apply_preproc()

    kernel = CommWordStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    # Re-initialise the same kernel object for train-vs-test.
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    # Previously the matrices were computed and thrown away; hand them back
    # in the same order the sibling kernel examples use.
    return km_train, km_test, kernel
def preproc_sortwordstring_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False,use_sign=False):
    """Apply SortWordString to DNA word features and evaluate a kernel.

    Train and test strings are converted to word features, both are run
    through one ``SortWordString`` preprocessor (initialised on the training
    features), and a ``CommWordStringKernel`` is evaluated on them.

    Returns:
        tuple: ``(km_train, km_test, kernel)`` -- the train-vs-train kernel
        matrix, the train-vs-test kernel matrix and the kernel object.
    """
    from shogun.Kernel import CommWordStringKernel
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString

    def _word_features(strings):
        # Wrap raw strings as DNA char features and derive word features.
        chars = StringCharFeatures(strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The test features share the sorter initialised on the training data.
    test_words = _word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    kernel = CommWordStringKernel(train_words, train_words, use_sign)
    train_matrix = kernel.get_kernel_matrix()

    kernel.init(train_words, test_words)
    test_matrix = kernel.get_kernel_matrix()

    return train_matrix, test_matrix, kernel
def distance_canberraword_modular (fm_train_dna=traindna,fm_test_dna=testdna,order=3,gap=0,reverse=False):
    """Compute CanberraWordDistance matrices for train and test DNA strings.

    Both inputs are turned into word features via ``obtain_from_char`` and
    run through a single ``SortWordString`` preprocessor that is initialised
    on the training features.

    Returns:
        tuple: ``(distance, dm_train, dm_test)`` -- the distance object, the
        train-vs-train matrix and the train-vs-test matrix.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.Preprocessor import SortWordString
    from shogun.Distance import CanberraWordDistance

    def _word_features(strings):
        # Wrap raw strings as DNA char features and derive word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The test features share the sorter initialised on the training data.
    test_words = _word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    distance = CanberraWordDistance(train_words, train_words)
    train_matrix = distance.get_distance_matrix()

    distance.init(train_words, test_words)
    test_matrix = distance.get_distance_matrix()

    return distance, train_matrix, test_matrix
def create_hashed_features_spectrum(param, data):
    """
    Build ImplicitWeightedSpecFeatures for the spectrum kernel.

    ``param["degree_spectrum"]`` supplies the word order; ``data`` is handed
    to ``StringCharFeatures`` with the DNA alphabet. Gap (0), reverse (True)
    and normalisation (True) are fixed. Returns the resulting feature object.
    """

    # Looked up first so a missing key fails before any feature is built.
    degree = param["degree_spectrum"]

    # Raw strings -> char features -> word features (gap=0, reverse=True).
    char_feats = StringCharFeatures(data, DNA)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, degree - 1, degree, 0, True)

    # Sort the words of every string in place.
    sorter = SortWordString()
    sorter.init(word_feats)
    word_feats.add_preproc(sorter)
    word_feats.apply_preproc()

    # Second argument is the fixed normalize=True flag.
    return ImplicitWeightedSpecFeatures(word_feats, True)
Exemple #7
0
    def init_sensor(self, kernel, svs):
        """Create the kernel and training features for this sensor.

        Args:
            kernel: mapping read via ``kernel['name']`` (``'spectrum'`` or
                ``'wdshift'``), ``kernel['order']`` and, for ``'wdshift'``,
                ``kernel['shift']``.
            svs: string data passed to ``StringCharFeatures(svs, DNA)``.

        Returns:
            tuple: ``(self.kernel, self.train_features)``.

        Raises:
            ValueError: if ``kernel['name']`` is neither ``'spectrum'`` nor
                ``'wdshift'``.

        Side effects:
            Sets ``self.kernel`` and ``self.train_features``; the spectrum
            branch additionally sets ``self.preproc``.
        """
        f = StringCharFeatures(svs, DNA)

        kname = kernel['name']
        if kname == 'spectrum':
            # The spectrum kernel works on sorted word features.
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0, False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preproc(pre)
            wf.apply_preproc()
            f = wf

            # NOTE(review): arguments are presumably (cache size, use_sign)
            # -- confirm against the Shogun API.
            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            # Keep the preprocessor on the instance for later use.
            self.preproc = pre

        elif kname == 'wdshift':
            k = WeightedDegreePositionStringKernel(0, kernel['order'])
            k.set_normalizer(IdentityKernelNormalizer())
            # One shift value and one uniform weight per sequence position.
            max_len = f.get_max_vector_length()
            k.set_shifts(kernel['shift'] *
                         numpy.ones(max_len, dtype=numpy.int32))
            k.set_position_weights(1.0 / max_len *
                                   numpy.ones(max_len, dtype=numpy.float64))
        else:
            # String exceptions are not valid; raising one yields a
            # TypeError that hides this message.
            raise ValueError(
                "Currently, only wdshift and spectrum kernels supported")

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
def distance_hammingword_modular(fm_train_dna=traindna,
                                 fm_test_dna=testdna,
                                 fm_test_real=testdat,
                                 order=3,
                                 gap=0,
                                 reverse=False,
                                 use_sign=False):
    """Compute HammingWordDistance matrices for train and test DNA strings.

    Both inputs are turned into word features via ``obtain_from_char`` and
    run through a single ``SortWordString`` preprocessor that is initialised
    on the training features. ``fm_test_real`` is accepted but not used.

    Returns:
        tuple: ``(distance, dm_train, dm_test)`` -- the distance object, the
        train-vs-train matrix and the train-vs-test matrix.
    """
    from shogun.Features import StringCharFeatures, StringWordFeatures, DNA
    from shogun.PreProc import SortWordString
    from shogun.Distance import HammingWordDistance

    def _word_features(strings):
        # Wrap raw strings as DNA char features and derive word features.
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    train_words = _word_features(fm_train_dna)
    sorter = SortWordString()
    sorter.init(train_words)
    train_words.add_preproc(sorter)
    train_words.apply_preproc()

    # The test features share the sorter initialised on the training data.
    test_words = _word_features(fm_test_dna)
    test_words.add_preproc(sorter)
    test_words.apply_preproc()

    distance = HammingWordDistance(train_words, train_words, use_sign)
    train_matrix = distance.get_distance_matrix()

    distance.init(train_words, test_words)
    test_matrix = distance.get_distance_matrix()

    return distance, train_matrix, test_matrix
def create_hashed_features_spectrum(param, data):
    """
    Build ImplicitWeightedSpecFeatures for the spectrum kernel.

    ``param["degree_spectrum"]`` supplies the word order; ``data`` is handed
    to ``StringCharFeatures`` with the DNA alphabet. Gap (0), reverse (True)
    and normalisation (True) are fixed. Returns the resulting feature object.
    """

    # Looked up first so a missing key fails before any feature is built.
    degree = param["degree_spectrum"]

    # Raw strings -> char features -> word features (gap=0, reverse=True).
    char_feats = StringCharFeatures(data, DNA)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, degree - 1, degree, 0, True)

    # Sort the words of every string in place.
    sorter = SortWordString()
    sorter.init(word_feats)
    word_feats.add_preproc(sorter)
    word_feats.apply_preproc()

    # Second argument is the fixed normalize=True flag.
    return ImplicitWeightedSpecFeatures(word_feats, True)
Exemple #10
0
    def init_sensor(self, kernel, svs):
        """Create the kernel and training features for this sensor.

        Args:
            kernel: mapping read via ``kernel['name']`` (``'spectrum'`` or
                ``'wdshift'``), ``kernel['order']`` and, for ``'wdshift'``,
                ``kernel['shift']``.
            svs: string data passed to ``StringCharFeatures(svs, DNA)``.

        Returns:
            tuple: ``(self.kernel, self.train_features)``.

        Raises:
            ValueError: if ``kernel['name']`` is neither ``'spectrum'`` nor
                ``'wdshift'``.

        Side effects:
            Sets ``self.kernel`` and ``self.train_features``; the spectrum
            branch additionally sets ``self.preproc``.
        """
        f = StringCharFeatures(svs, DNA)

        kname = kernel['name']
        if kname == 'spectrum':
            # The spectrum kernel works on sorted word features.
            wf = StringWordFeatures(f.get_alphabet())
            wf.obtain_from_char(f, kernel['order'] - 1, kernel['order'], 0,
                                False)

            pre = SortWordString()
            pre.init(wf)
            wf.add_preproc(pre)
            wf.apply_preproc()
            f = wf

            # NOTE(review): arguments are presumably (cache size, use_sign)
            # -- confirm against the Shogun API.
            k = CommWordStringKernel(0, False)
            k.set_use_dict_diagonal_optimization(kernel['order'] < 8)
            # Keep the preprocessor on the instance for later use.
            self.preproc = pre

        elif kname == 'wdshift':
            k = WeightedDegreePositionStringKernel(0, kernel['order'])
            k.set_normalizer(IdentityKernelNormalizer())
            # One shift value and one uniform weight per sequence position.
            max_len = f.get_max_vector_length()
            k.set_shifts(
                kernel['shift'] * numpy.ones(max_len, dtype=numpy.int32))
            k.set_position_weights(
                1.0 / max_len * numpy.ones(max_len, dtype=numpy.float64))
        else:
            # String exceptions are not valid; raising one yields a
            # TypeError that hides this message.
            raise ValueError(
                "Currently, only wdshift and spectrum kernels supported")

        self.kernel = k
        self.train_features = f

        return (self.kernel, self.train_features)
Exemple #11
0
def perform_clustering(mss_id):
    """Hierarchically cluster the sequences of a multi-split set.

    Looks up the ``MultiSplitSet`` for ``mss_id``, fetches one sequence per
    split set (keyed by ``ss.dataset.organism``), builds order-1 PROTEIN word
    features sorted with ``SortWordString`` and trains a ``Hierarchical``
    clustering with 4 merges on a ``HammingWordDistance``.

    Returns:
        The trained ``Hierarchical`` object.
    """
    import numpy
    import expenv

    split_collection = expenv.MultiSplitSet.get(mss_id)

    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString

    # Fixed word-feature settings.
    order, gap, reverse = 1, 0, False

    handler = SequencesHandler()
    sequences = []
    for ss in split_collection.split_sets:
        sequences.append(handler.get_seq(ss.dataset.organism))

    # Raw strings -> char features -> sorted word features.
    char_feats = StringCharFeatures(PROTEIN)
    char_feats.set_features(sequences)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(word_feats)
    word_feats.add_preproc(sorter)
    word_feats.apply_preproc()

    # Third argument is use_sign=False.
    distance = HammingWordDistance(word_feats, word_feats, False)
    #distance = EuclidianDistance()

    clustering = Hierarchical(4, distance)
    clustering.train()

    # Results are not used here; the calls are kept as in the original.
    clustering.get_merge_distances()
    clustering.get_cluster_pairs()

    return clustering
Exemple #12
0
def perform_clustering(mss_id):
    """Hierarchically cluster the sequences of a multi-split set.

    Looks up the ``MultiSplitSet`` for ``mss_id``, fetches one sequence per
    split set (keyed by ``ss.dataset.organism``), builds order-1 PROTEIN word
    features sorted with ``SortWordString`` and trains a ``Hierarchical``
    clustering with 4 merges on a ``HammingWordDistance``.

    Returns:
        The trained ``Hierarchical`` object.
    """
    import numpy
    import expenv

    split_collection = expenv.MultiSplitSet.get(mss_id)

    from method_mhc_mkl import SequencesHandler
    from shogun.Distance import EuclidianDistance, HammingWordDistance
    from shogun.Features import StringCharFeatures, StringWordFeatures, PROTEIN
    from shogun.Clustering import Hierarchical
    from shogun.PreProc import SortWordString

    # Fixed word-feature settings.
    order, gap, reverse = 1, 0, False

    handler = SequencesHandler()
    sequences = []
    for ss in split_collection.split_sets:
        sequences.append(handler.get_seq(ss.dataset.organism))

    # Raw strings -> char features -> sorted word features.
    char_feats = StringCharFeatures(PROTEIN)
    char_feats.set_features(sequences)
    word_feats = StringWordFeatures(char_feats.get_alphabet())
    word_feats.obtain_from_char(char_feats, order - 1, order, gap, reverse)
    sorter = SortWordString()
    sorter.init(word_feats)
    word_feats.add_preproc(sorter)
    word_feats.apply_preproc()

    # Third argument is use_sign=False.
    distance = HammingWordDistance(word_feats, word_feats, False)
    #distance = EuclidianDistance()

    clustering = Hierarchical(4, distance)
    clustering.train()

    # Results are not used here; the calls are kept as in the original.
    clustering.get_merge_distances()
    clustering.get_cluster_pairs()

    return clustering
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Repeatedly build a CommWordStringKernel to exercise memory handling.

    Ten times in a row the function creates DNA char features from a fixed
    set of 282 strings, derives sorted word features, initialises a
    ``CommWordStringKernel`` with an identity normalizer and computes its
    kernel matrix.

    Args:
        num: repetition count for the 4-letter motifs; each input string is
            ``num * 'ACGT'`` or ``num * 'TTGT'``.
        order: word order passed to ``obtain_from_char`` (start is
            ``order - 1``).
        gap: gap argument passed to ``obtain_from_char``.
        reverse: reverse flag passed to ``obtain_from_char``.

    Returns:
        The kernel matrix computed in the last iteration.
    """
    from shogun.Features import Alphabet,StringCharFeatures,StringWordFeatures,DNA
    # MSG_DEBUG is only needed for the optional debug logging hinted at below.
    from shogun.Preprocessor import SortWordString, MSG_DEBUG
    from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer

    acgt = num * 'ACGT'
    ttgt = num * 'TTGT'
    # Same layout as the former hand-written literals: each list holds
    # 60 x ACGT, 21 x TTGT, 60 x ACGT (141 strings), and POS equals NEG.
    POS = [acgt] * 60 + [ttgt] * 21 + [acgt] * 60
    NEG = list(POS)
    strings = POS + NEG

    # range() works on both Python 2 and 3; ten rounds is small enough that
    # the list built on Python 2 does not matter.
    for _ in range(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        traindat.set_features(strings)
        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)
        #trainudat.io.set_loglevel(MSG_DEBUG)
        pre = SortWordString()
        #pre.io.set_loglevel(MSG_DEBUG)
        pre.init(trainudat)
        trainudat.add_preproc(pre)
        trainudat.apply_preproc()
        # NOTE(review): arguments are presumably (cache size, use_sign)
        # -- confirm against the Shogun API.
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    return K
Exemple #14
0
def tests_check_commwordkernel_memleak_modular(num, order, gap, reverse):
    """Repeatedly build a CommWordStringKernel to exercise memory handling.

    Ten times in a row the function creates DNA char features from a fixed
    set of 282 strings, derives sorted word features, initialises a
    ``CommWordStringKernel`` with an identity normalizer and computes its
    kernel matrix.

    Args:
        num: repetition count for the 4-letter motifs; each input string is
            ``num * 'ACGT'`` or ``num * 'TTGT'``.
        order: word order passed to ``obtain_from_char`` (start is
            ``order - 1``).
        gap: gap argument passed to ``obtain_from_char``.
        reverse: reverse flag passed to ``obtain_from_char``.

    Returns:
        The kernel matrix computed in the last iteration.
    """
    from shogun.Features import Alphabet, StringCharFeatures, StringWordFeatures, DNA
    # MSG_DEBUG is only needed for the optional debug logging hinted at below.
    from shogun.PreProc import SortWordString, MSG_DEBUG
    from shogun.Kernel import CommWordStringKernel, IdentityKernelNormalizer

    acgt = num * 'ACGT'
    ttgt = num * 'TTGT'
    # Same layout as the former hand-written literals: each list holds
    # 60 x ACGT, 21 x TTGT, 60 x ACGT (141 strings), and POS equals NEG.
    POS = [acgt] * 60 + [ttgt] * 21 + [acgt] * 60
    NEG = list(POS)
    strings = POS + NEG

    # range() works on both Python 2 and 3; ten rounds is small enough that
    # the list built on Python 2 does not matter.
    for _ in range(10):
        alpha = Alphabet(DNA)
        traindat = StringCharFeatures(alpha)
        traindat.set_features(strings)
        trainudat = StringWordFeatures(traindat.get_alphabet())
        trainudat.obtain_from_char(traindat, order - 1, order, gap, reverse)
        #trainudat.io.set_loglevel(MSG_DEBUG)
        pre = SortWordString()
        #pre.io.set_loglevel(MSG_DEBUG)
        pre.init(trainudat)
        trainudat.add_preproc(pre)
        trainudat.apply_preproc()
        # NOTE(review): arguments are presumably (cache size, use_sign)
        # -- confirm against the Shogun API.
        spec = CommWordStringKernel(10, False)
        spec.set_normalizer(IdentityKernelNormalizer())
        spec.init(trainudat, trainudat)
        K = spec.get_kernel_matrix()

    return K
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT',100*'ACGT', 100*'ACGT', 
100*'ACGT',100*'ACGT', 100*'ACGT', 100*'ACGT']
# Word-feature settings used for every iteration of the leak check below.
order=7
gap=0
reverse=False

# Rebuild features, preprocessor and kernel ten times in a row; POS and NEG
# are the string lists defined earlier in this script.
for i in xrange(10):
    alpha=Alphabet(DNA)
    traindat=StringCharFeatures(alpha)
    traindat.set_features(POS+NEG)
    # Derive word features from the char features.
    trainudat=StringWordFeatures(traindat.get_alphabet());
    trainudat.obtain_from_char(traindat, order-1, order, gap, reverse)
    #trainudat.io.set_loglevel(MSG_DEBUG)
    # Sort the words of each string via the SortWordString preprocessor.
    pre = SortWordString()
    #pre.io.set_loglevel(MSG_DEBUG)
    pre.init(trainudat)
    trainudat.add_preproc(pre)
    trainudat.apply_preproc()
    # NOTE(review): arguments are presumably (cache size, use_sign) -- confirm.
    spec = CommWordStringKernel(10, False)
    spec.set_normalizer(IdentityKernelNormalizer())
    spec.init(trainudat, trainudat)
    # Only the matrix from the final iteration remains bound to K.
    K=mat(spec.get_kernel_matrix())

# Drop the module-level inputs and settings once the loop has finished.
del POS
del NEG
del order
del gap
del reverse