Ejemplo n.º 1
0
def calc_merit( pos_strings, neg_strings, mapping ):
    """
    Score `mapping` by cross validating a model on the translated strings.

    Each string in `pos_strings` and `neg_strings` is passed through
    `mapping.translate`. A `cv.CV` engine is then run over the two
    translated sets, with models built by `model.train` using the radix
    reported by `mapping.get_out_size()` and the order returned by
    `max_order( radix )`.

    `fold` and `passes` are not parameters; they are read from the
    enclosing scope (presumably module level settings -- confirm against
    the rest of the file).

    Returns `cls1.pos + cls2.neg` from the finished engine, i.e. the
    number of true positives plus the number of true negatives.
    """
    # Translate both training sets through the candidate mapping
    translated_pos = [ mapping.translate( s ) for s in pos_strings ]
    translated_neg = [ mapping.translate( s ) for s in neg_strings ]

    # Model parameters follow from the size of the mapping's output alphabet
    radix = mapping.get_out_size()
    order = max_order( radix )

    def build_model( d0, d1 ):
        return model.train( order, radix, d0, d1 )

    # Cross validate and combine the two correct-classification counts
    engine = cv.CV( build_model, translated_pos, translated_neg, fold=fold, passes=passes )
    engine.run()
    return engine.cls1.pos + engine.cls2.neg
Ejemplo n.º 2
0
 def calc_merit(self, mapping):
     """
     Score `mapping` by cross validating a periodic model.

     The strings in `self.pos_strings` and `self.neg_strings` are passed
     through `mapping.translate`, paired (via `zip`) with
     `self.pos_genome_seqs` / `self.neg_genome_seqs`, and handed to an
     `rp.cv.CV` engine whose models come from
     `rp.models.complex_periodic.train(5, 1, 4, radix, d0, d1)`.

     `fold` and `passes` are read from the enclosing scope rather than
     from `self`.

     Returns the average of two ratios: `cls1.pos` over the number of
     positive classifications attempted (`len(pos_strings) * passes`)
     and `cls2.neg` over the number of negative ones.

     NOTE(review): the ratios use `/`; on Python 2 this is only a true
     division if the file has `from __future__ import division` or the
     counts are floats -- confirm against the file header.
     """
     # Translate both string sets through the candidate mapping
     translated_pos = [mapping.translate(s) for s in self.pos_strings]
     translated_neg = [mapping.translate(s) for s in self.neg_strings]

     radix = mapping.get_out_size()

     def build_model(d0, d1):
         return rp.models.complex_periodic.train(5, 1, 4, radix, d0, d1)

     # Each training example is a (genome sequence, translated string) pair
     engine = rp.cv.CV(
         build_model,
         zip(self.pos_genome_seqs, translated_pos),
         zip(self.neg_genome_seqs, translated_neg),
         fold=fold,
         passes=passes,
     )
     engine.run()

     # Per-class fraction classified correctly, averaged over both classes
     pos_rate = engine.cls1.pos / (len(translated_pos) * passes)
     neg_rate = engine.cls2.neg / (len(translated_neg) * passes)
     return (pos_rate + neg_rate) / 2
Ejemplo n.º 3
0
def run( pos_file, neg_file, format, mapping, order, modname ):
    """
    Train a model on two MAF files and print a score for every word.

    All blocks are read from `pos_file` and `neg_file` with
    `bx.align.maf.Reader`. The component texts of each block are turned
    into integers with `rp.mapping.DNA.translate_list` and then passed
    through `mapping.translate`. A model is trained with
    `rp.models.train( modname, order, radix, ... )`, where `radix` is one
    more than the largest symbol seen in either training set.

    For each `( word, count )` pair produced by `words_from_blocks` over
    all blocks, one line is printed: the word joined with "|", the count,
    and the last entry of the score array filled in by
    `model.score_positions`, separated by tabs.

    `format` is accepted but not used by this function.
    """
    def to_ints( block ):
        # Alignment columns -> DNA integers -> reduced alphabet
        columns = rp.mapping.DNA.translate_list( [ c.text for c in block.components ] )
        return mapping.translate( columns )

    # Keep the blocks around; they are needed again for the word scan below
    pos_blocks = list( bx.align.maf.Reader( pos_file ) )
    neg_blocks = list( bx.align.maf.Reader( neg_file ) )

    pos_strings = [ to_ints( block ) for block in pos_blocks ]
    neg_strings = [ to_ints( block ) for block in neg_blocks ]

    # Radix is the largest symbol in either set, plus one
    radix = max( [ max( s ) for s in pos_strings ] + [ max( s ) for s in neg_strings ] ) + 1

    trained_model = rp.models.train( modname, order, radix, pos_strings, neg_strings )

    # Score every word found in the training data
    for word, count in words_from_blocks( itertools.chain( pos_blocks, neg_blocks ), order ):
        ints = mapping.translate( rp.mapping.DNA.translate_list( word ) )
        assert len( ints ) == order + 1
        # Start from NaN so positions the model does not fill stay recognisable
        scores = array( [ float("nan") ] * len( ints ), typecode="f" )
        trained_model.score_positions( ints, scores )
        print( "%s\t%d\t%0.6f" % ( "|".join( word ), count, scores[-1] ) )
Ejemplo n.º 4
0
def run( data_file, modname, model_file, out_file, mapping, window, shift, low, high, reorder ):
    """
    Score every alignment block of a MAF file with a stored model.

    The model class is looked up with `rp.models.get( modname )` and
    loaded from `model_file`. Each block read from `data_file` is turned
    into integers with `rp.mapping.DNA.translate_list`, optionally passed
    through `mapping.translate`, and handed to `score_windows` together
    with `out_file`, `window`, `shift`, `low` and `high`.

    `reorder`, when truthy, is a sequence of component indexes; the
    block's components are picked in that order before translation.
    When falsy, the components are used as they appear in the block.
    `mapping`, when falsy, is skipped.
    """
    # Read model
    model = rp.models.get( modname ).from_file( model_file )

    # Score each alignment
    for maf in bx.align.maf.Reader( data_file ):
        if reorder:
            # A dedicated name for the component index: on Python 2 a list
            # comprehension variable leaks into the enclosing scope, so
            # reusing the loop's own variable name would overwrite it.
            components = [ maf.components[ index ] for index in reorder ]
        else:
            components = maf.components
        ints = rp.mapping.DNA.translate_list( [ c.text for c in components ] )
        if mapping:
            ints = mapping.translate( ints )
        score_windows( maf, ints, model, out_file, window, shift, low, high )
Ejemplo n.º 5
0
def calc_merit( training_sets, mapping, modname, modorder ):
    """
    Score `mapping` by cross validating a model over the training sets.

    Every string of every set in `training_sets` is first passed through
    `mapping.translate`; the radix is `mapping.get_out_size()`.

    With exactly two sets, an `rp.cv.CV` engine is run using models from
    `rp.models.train`, and the result is the average of `cls1.pos` and
    `cls2.neg`, each divided by the number of classifications attempted
    for that set (`len( set ) * passes`).

    With more than two sets, an `rp.cv.MultiCV` engine is run using
    models from `rp.models.prob_train`, and its `get_success_rate()` is
    returned.

    `fold` and `passes` are read from the enclosing scope.

    Raises `Exception` when fewer than two training sets are given.

    NOTE(review): the two-set ratios use `/`; on Python 2 this is only a
    true division if the file has `from __future__ import division` or
    the counts are floats -- confirm against the file header.
    """
    # Translate every string of every training set through the mapping
    translated = [ [ mapping.translate( s ) for s in strings ] for strings in training_sets ]
    radix = mapping.get_out_size()
    set_count = len( translated )

    if set_count < 2:
        raise Exception( "No support for '%d' training sets" % set_count )

    if set_count == 2:
        pos_strings, neg_strings = translated

        def build_model( d0, d1 ):
            return rp.models.train( modname, modorder, radix, d0, d1 )

        engine = rp.cv.CV( build_model, pos_strings, neg_strings, fold=fold, passes=passes )
        engine.run()
        # Per-class fraction classified correctly, averaged over both classes
        pos_rate = engine.cls1.pos / ( len( pos_strings ) * passes )
        neg_rate = engine.cls2.neg / ( len( neg_strings ) * passes )
        return ( pos_rate + neg_rate ) / 2

    # More than two classes: a single model per class, multi-way validation
    def build_prob_model( d ):
        return rp.models.prob_train( modname, modorder, radix, d )

    engine = rp.cv.MultiCV( build_prob_model, translated, fold=fold, passes=passes )
    engine.run()
    return engine.get_success_rate()
Ejemplo n.º 6
0
#!/usr/bin/env python

import bx.align.maf
import rp.mapping
import sys

# Load the alignment mapping from the file named by the first command line
# argument; the handle is closed as soon as the mapping has been read.
with open( sys.argv[1] ) as mapping_file:
    mapping = rp.mapping.alignment_mapping_from_file( mapping_file )

# Translate every MAF block read from stdin and print it as one line of
# space separated integers. The reader must be referenced through `bx`:
# only `bx.align.maf` is imported, so a bare `align` is undefined.
for maf in bx.align.maf.Reader( sys.stdin ):
    ints = rp.mapping.DNA.translate_list( [ c.text for c in maf.components ] )
    ints = mapping.translate( ints )
    print( ' '.join( map( str, ints ) ) )