Esempio n. 1
0
def non_redundant_ulong_features(feats, kmerlen):
    """Remap every k-mer id in ``feats`` through ``get_rcmap`` and sort.

    Arguments:
    feats -- StringUlongFeatures; its feature vectors are overwritten in place
    kmerlen -- integer, length of k-mer (forwarded to ``get_rcmap``)

    Return:
    the same ``feats`` object, with each id replaced by
    ``get_rcmap(int(id), kmerlen)`` and a SortUlongString preprocessor
    attached and applied
    """
    for vec_idx in xrange(feats.get_num_vectors()):
        remapped_ids = []
        for kmerid in feats.get_feature_vector(vec_idx):
            remapped_ids.append(get_rcmap(int(kmerid), kmerlen))

        # Store the remapped ids back as unsigned 64-bit integers.
        feats.set_feature_vector(
            numpy.array(remapped_ids, numpy.dtype('u8')), vec_idx)

    sorter = SortUlongString()
    sorter.init(feats)
    # Two method spellings are supported: the *_preprocessor pair is only
    # tried when the *_preproc pair raises AttributeError.
    try:
        feats.add_preproc(sorter)
        feats.apply_preproc()
    except AttributeError:
        feats.add_preprocessor(sorter)
        feats.apply_preprocessor()

    return feats
Esempio n. 2
0
def kernel_comm_ulong_string_modular(fm_train_dna=traindat,
                                     fm_test_dna=testdat,
                                     order=3,
                                     gap=0,
                                     reverse=False):
    """Compute CommUlongStringKernel matrices for DNA training/test strings.

    Arguments:
    fm_train_dna -- training strings handed to StringCharFeatures(DNA)
    fm_test_dna -- test strings handed to StringCharFeatures(DNA)
    order, gap, reverse -- forwarded to ``obtain_from_char`` as
        ``(charfeat, order - 1, order, gap, reverse)``

    Return:
    tuple ``(km_train, km_test, kernel)``: the train-vs-train matrix, the
    train-vs-test matrix, and the kernel object (left initialised on
    train/test)
    """
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortUlongString

    def ulong_features_from(dna_strings):
        # Char features are only an intermediate step towards ulong features.
        chars = StringCharFeatures(DNA)
        chars.set_features(dna_strings)
        ulongs = StringUlongFeatures(chars.get_alphabet())
        ulongs.obtain_from_char(chars, order - 1, order, gap, reverse)
        return ulongs

    feats_train = ulong_features_from(fm_train_dna)
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    # The test features reuse the preprocessor initialised on the train set.
    feats_test = ulong_features_from(fm_test_dna)
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    kernel = CommUlongStringKernel(feats_train, feats_train, False)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def comm_ulong_string():
    """Run the CommUlongString kernel example on the module-level DNA data.

    Reads the globals ``fm_train_dna`` and ``fm_test_dna``, prints the
    example name, and computes the train-vs-train and train-vs-test kernel
    matrices. Nothing is returned; the matrices are discarded.
    """
    # Parenthesised single-argument print gives the same output whether
    # print is a statement or a function.
    print('CommUlongString')
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringUlongFeatures, StringCharFeatures, DNA
    from shogun.PreProc import SortUlongString

    order, gap, reverse = 3, 0, False

    train_chars = StringCharFeatures(DNA)
    train_chars.set_features(fm_train_dna)
    feats_train = StringUlongFeatures(train_chars.get_alphabet())
    feats_train.obtain_from_char(train_chars, order - 1, order, gap, reverse)

    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()

    test_chars = StringCharFeatures(DNA)
    test_chars.set_features(fm_test_dna)
    feats_test = StringUlongFeatures(test_chars.get_alphabet())
    feats_test.obtain_from_char(test_chars, order - 1, order, gap, reverse)
    # Same preprocessor instance as the train features (initialised on train).
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    use_sign = False
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
def non_redundant_ulong_features(feats, kmerlen):
    """Convert Shogun ulong features to their ``get_rcmap``-mapped form.

    Arguments:
    feats -- StringUlongFeatures (updated in place)
    kmerlen -- integer, length of k-mer

    Return:
    ``feats`` itself, after each k-mer id has been passed through
    ``get_rcmap`` and a SortUlongString preprocessor has been applied
    """
    num_vectors = feats.get_num_vectors()
    for row in xrange(num_vectors):
        original_ids = feats.get_feature_vector(row)
        remapped = [get_rcmap(int(kmerid), kmerlen) for kmerid in original_ids]
        as_ulong = numpy.array(remapped, numpy.dtype('u8'))
        feats.set_feature_vector(as_ulong, row)

    ulong_sorter = SortUlongString()
    ulong_sorter.init(feats)
    try:
        # First spelling of the preprocessor API.
        feats.add_preproc(ulong_sorter)
        feats.apply_preproc()
    except AttributeError:
        # Second spelling, tried only when the first raises AttributeError.
        feats.add_preprocessor(ulong_sorter)
        feats.apply_preprocessor()

    return feats
def get_kernel_matrix(li, order=6, gap=2, reverse=False, use_sign=False):
    """
    Get kernel matrix from a list of strings.

    Arguments:
    li -- list of strings, loaded into StringCharFeatures(RAWBYTE)
    order -- forwarded to ``obtain_from_char`` as ``(order - 1, order)``
             (default 6, the previously hard-coded value)
    gap -- forwarded to ``obtain_from_char`` (default 2, as before)
    reverse -- forwarded to ``obtain_from_char`` (default False, as before)
    use_sign -- third argument of CommUlongStringKernel (default False,
                as before)

    Return:
    the kernel matrix of the features against themselves, as returned by
    ``CommUlongStringKernel.get_kernel_matrix()``

    Calling ``get_kernel_matrix(li)`` behaves exactly as it did when these
    values were constants inside the function.
    """
    charfeat = StringCharFeatures(RAWBYTE)
    charfeat.set_features(li)
    # The ulong features share the alphabet of the char features.
    feats_train = StringUlongFeatures(charfeat.get_alphabet())
    feats_train.obtain_from_char(charfeat, order - 1, order, gap, reverse)

    # CommUlongStringKernel needs sorted features.
    preproc = SortUlongString()
    preproc.init(feats_train)
    feats_train.add_preproc(preproc)
    feats_train.apply_preproc()

    # Compute kernel matrix between train features.
    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    return kernel.get_kernel_matrix()
def preproc_sortulongstring_modular(fm_train_dna=traindna,
                                    fm_test_dna=testdna,
                                    order=3,
                                    gap=0,
                                    reverse=False,
                                    use_sign=False):
    """Demonstrate the SortUlongString preprocessor on DNA train/test data.

    Arguments:
    fm_train_dna -- training strings handed to StringCharFeatures(DNA)
    fm_test_dna -- test strings handed to StringCharFeatures(DNA)
    order, gap, reverse -- forwarded to ``obtain_from_char`` as
        ``(charfeat, order - 1, order, gap, reverse)``
    use_sign -- third argument of CommUlongStringKernel

    Return:
    tuple ``(km_train, km_test, kernel)``
    """
    from shogun.Kernel import CommUlongStringKernel
    from shogun.Features import StringCharFeatures, StringUlongFeatures, DNA
    from shogun.PreProc import SortUlongString

    # Build both ulong feature sets first; preprocessing happens afterwards.
    train_chars = StringCharFeatures(DNA)
    train_chars.set_features(fm_train_dna)
    feats_train = StringUlongFeatures(train_chars.get_alphabet())
    feats_train.obtain_from_char(train_chars, order - 1, order, gap, reverse)

    test_chars = StringCharFeatures(DNA)
    test_chars.set_features(fm_test_dna)
    feats_test = StringUlongFeatures(test_chars.get_alphabet())
    feats_test.obtain_from_char(test_chars, order - 1, order, gap, reverse)

    # One preprocessor, initialised on the train features, serves both sets.
    sorter = SortUlongString()
    sorter.init(feats_train)
    feats_train.add_preproc(sorter)
    feats_train.apply_preproc()
    feats_test.add_preproc(sorter)
    feats_test.apply_preproc()

    kernel = CommUlongStringKernel(feats_train, feats_train, use_sign)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
Esempio n. 7
0
def get_kernel_matrix(li):
    """
    Build the CommUlongStringKernel matrix of a list of strings against
    itself and return it.
    """
    order, gap, reverse = 6, 2, False
    use_sign = False

    raw_chars = StringCharFeatures(RAWBYTE)
    raw_chars.set_features(li)

    # The ulong features are created over the char features' alphabet.
    alphabet = raw_chars.get_alphabet()
    ulong_feats = StringUlongFeatures(alphabet)
    ulong_feats.obtain_from_char(raw_chars, order - 1, order, gap, reverse)

    # CommUlongStringKernel needs sorted features.
    sorter = SortUlongString()
    sorter.init(ulong_feats)
    ulong_feats.add_preproc(sorter)
    ulong_feats.apply_preproc()

    # Kernel of the features against themselves.
    kernel = CommUlongStringKernel(ulong_feats, ulong_feats, use_sign)
    return kernel.get_kernel_matrix()
Esempio n. 8
0
def _attach_and_apply_preproc(feats, preproc):
    """Attach ``preproc`` to ``feats``, apply it, and return the apply result.

    Uses ``add_preproc``/``apply_preproc`` when ``feats`` provides them and
    ``add_preprocessor``/``apply_preprocessor`` otherwise, so every caller
    goes through the same pair of method names.
    """
    if hasattr(feats, 'add_preproc'):
        feats.add_preproc(preproc)
        return feats.apply_preproc()
    feats.add_preprocessor(preproc)
    return feats.apply_preprocessor()


def _string_char_features(examples, seq_source, nuc_con):
    """Build StringCharFeatures for 'dna' or 'protein' sequences.

    The examples are first passed through ``non_atcg_convert`` (dna) or
    ``non_aminoacid_converter`` (protein). Any other ``seq_source`` writes a
    message to stderr and exits the process with status -1.
    """
    if seq_source == 'dna':
        return StringCharFeatures(non_atcg_convert(examples, nuc_con), DNA)
    if seq_source == 'protein':
        return StringCharFeatures(non_aminoacid_converter(examples, nuc_con), PROTEIN)

    sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
    sys.exit(-1)


def _sorted_word_features(sequences, degree, use_reversed, train_mode, preproc):
    """Build sorted StringWordFeatures from DNA sequences.

    In train mode a new SortWordString is created and initialised on the new
    features; otherwise the given ``preproc`` is reused. Returns the pair
    ``(word_features, preproc)``.
    """
    chars = StringCharFeatures(list(sequences), DNA)
    words = StringWordFeatures(chars.get_alphabet())
    words.obtain_from_char(chars, degree - 1, degree, 0, use_reversed)
    del chars

    if train_mode:
        preproc = SortWordString()
        preproc.init(words)
    # Keep the call outside the assert so it still runs under ``python -O``.
    ret = _attach_and_apply_preproc(words, preproc)
    assert(ret)
    return words, preproc


def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features

    Arguments:
    kname -- kernel name; selects the kind of features built:
        'gauss', 'linear', 'poly'          -> RealFeatures
        'wd', 'localalign', 'localimprove' -> StringCharFeatures
        'spec', 'cumspec'                  -> sorted StringUlongFeatures
        'spec2', 'cumspec2'                -> dict with keys 'combined'
            (CombinedFeatures), 'f0' and 'f1' (sorted StringWordFeatures)
    examples -- input data; for 'spec2'/'cumspec2' an iterable of pairs
    kparam -- dict; ``kparam['degree']`` is read by the spectrum branches
    train_mode -- if true, the spectrum branches create and initialise a new
        sorting preprocessor; otherwise the ``preproc`` argument is reused
    preproc -- preprocessor to reuse when ``train_mode`` is false
    seq_source -- 'dna' or 'protein' (read by the wd-type and spec branches)
    nuc_con -- forwarded to the sequence converter functions

    Return:
    tuple ``(feats, preproc)``. ``preproc`` is the argument unchanged unless a
    new one was created in train mode; for 'spec2'/'cumspec2' in train mode
    it is the one created for the second sequence set.

    Raises:
    ValueError -- if ``kname`` is not one of the kernels listed above
        (previously the message was printed and the function then failed
        with UnboundLocalError at the return statement)
    SystemExit -- status -1, if ``seq_source`` is neither 'dna' nor 'protein'
        in a branch that reads it
    """
    if kname in ('gauss', 'linear', 'poly'):
        feats = RealFeatures(numpy.array(examples))

    elif kname in ('wd', 'localalign', 'localimprove'):
        feats = _string_char_features(examples, seq_source, nuc_con)

    elif kname in ('spec', 'cumspec'):
        chars = _string_char_features(examples, seq_source, nuc_con)
        degree = kparam['degree']

        feats = StringUlongFeatures(chars.get_alphabet())
        feats.obtain_from_char(chars, degree - 1, degree, 0, kname == 'cumspec')
        del chars

        if train_mode:
            preproc = SortUlongString()
            preproc.init(feats)
        # The apply result is deliberately not checked in this branch.
        _attach_and_apply_preproc(feats, preproc)

    elif kname in ('spec2', 'cumspec2'):
        # spectrum kernel on two sequences
        combined = CombinedFeatures()
        feats = {'combined': combined}

        use_reversed = kname == 'cumspec2'
        degree = kparam['degree']
        (ex0, ex1) = zip(*examples)

        for key, sequences in (('f0', ex0), ('f1', ex1)):
            words, preproc = _sorted_word_features(
                sequences, degree, use_reversed, train_mode, preproc)
            combined.append_feature_obj(words)
            feats[key] = words

    else:
        raise ValueError('Unknown kernel %s' % kname)

    return (feats, preproc)
Esempio n. 9
0
def normal_ulong_feature(feats):
    """Attach and apply a SortUlongString preprocessor to ``feats``.

    The preprocessor is initialised on ``feats`` itself; the same ``feats``
    object is returned.
    """
    sorter = SortUlongString()
    sorter.init(feats)

    feats.add_preprocessor(sorter)
    feats.apply_preprocessor()

    return feats