def calc_merit( pos_strings, neg_strings, mapping ):
    """
    Return the cross-validation merit of `mapping`.

    Both string collections are translated through `mapping`, a model
    order is derived from the mapping's output size via `max_order`, and
    a `cv.CV` engine is run over the translated data.  The names `fold`
    and `passes` are not parameters; they are read from the enclosing
    scope.

    The merit is the sum of `cls1.pos` and `cls2.neg` reported by the
    engine (TP + TN).
    """
    # Translate every training string into the mapped alphabet
    translated_pos = [ mapping.translate( s ) for s in pos_strings ]
    translated_neg = [ mapping.translate( s ) for s in neg_strings ]
    # Model parameters follow from the size of the mapped alphabet
    radix = mapping.get_out_size()
    order = max_order( radix )

    def model_factory( d0, d1 ):
        return model.train( order, radix, d0, d1 )

    # Cross validate on the translated strings
    cv_engine = cv.CV( model_factory, translated_pos, translated_neg, fold=fold, passes=passes )
    cv_engine.run()
    true_positives = cv_engine.cls1.pos
    true_negatives = cv_engine.cls2.neg
    return true_positives + true_negatives
def calc_merit(self, mapping):
    """
    Return the cross-validation merit of `mapping` for this object's data.

    Every string in `self.pos_strings` and `self.neg_strings` is
    translated through `mapping` and paired with the entry at the same
    position in `self.pos_genome_seqs` / `self.neg_genome_seqs`.  The
    pairs are cross-validated with `rp.cv.CV`, using models built by
    `rp.models.complex_periodic.train`.  The names `fold` and `passes`
    are not attributes or parameters; they are read from the enclosing
    scope.

    The result is the mean of two ratios: `cls1.pos` over
    `len(pos_strings) * passes` and `cls2.neg` over
    `len(neg_strings) * passes`.
    """
    # Apply mapping to strings
    pos_strings = [mapping.translate(s) for s in self.pos_strings]
    neg_strings = [mapping.translate(s) for s in self.neg_strings]
    radix = mapping.get_out_size()

    def model_factory(d0, d1):
        # NOTE(review): the meaning of the constants 5, 1, 4 is defined by
        # complex_periodic.train and is not visible here.
        return rp.models.complex_periodic.train(5, 1, 4, radix, d0, d1)

    # Materialise the pairs.  On Python 3 a bare zip() is a one-shot
    # iterator with no len(), which would be exhausted after a single
    # pass over the data; on Python 2 this is just a list copy.
    pos_data = list(zip(self.pos_genome_seqs, pos_strings))
    neg_data = list(zip(self.neg_genome_seqs, neg_strings))

    # Cross validate using those pairs
    cv_engine = rp.cv.CV(
        model_factory,
        pos_data,
        neg_data,
        fold=fold,
        passes=passes,
    )
    cv_engine.run()

    # NOTE(review): with integer counts these ratios need true division
    # (Python 3, or `from __future__ import division`) -- confirm for this
    # file.
    pos_rate = cv_engine.cls1.pos / (len(pos_strings) * passes)
    neg_rate = cv_engine.cls2.neg / (len(neg_strings) * passes)
    return (pos_rate + neg_rate) / 2
def run( pos_file, neg_file, format, mapping, order, modname ):
    """
    Train a model on two MAF files and print a score for each word.

    The alignment blocks of `pos_file` and `neg_file` are read with
    `bx.align.maf.Reader`, translated to integer sequences (DNA
    translation followed by `mapping`), and used to train the model
    named `modname` with the given `order`.  The radix is the largest
    symbol seen in the translated data plus one.

    For every `( word, count )` pair produced by `words_from_blocks`, one
    tab-separated line is printed: the word joined with "|", its count,
    and the last position score assigned by the model.

    `format` is accepted but not used by this function.
    """
    def translate_blocks( blocks ):
        # One mapped integer sequence per alignment block
        translated = []
        for block in blocks:
            texts = [ c.text for c in block.components ]
            translated.append( mapping.translate( rp.mapping.DNA.translate_list( texts ) ) )
        return translated

    pos_blocks = list( bx.align.maf.Reader( pos_file ) )
    neg_blocks = list( bx.align.maf.Reader( neg_file ) )
    # Read integer sequences
    pos_strings = translate_blocks( pos_blocks )
    neg_strings = translate_blocks( neg_blocks )
    # Determine radix
    largest_symbols = [ max( s ) for s in pos_strings ] + [ max( s ) for s in neg_strings ]
    radix = max( largest_symbols ) + 1
    # Build model
    trained = rp.models.train( modname, order, radix, pos_strings, neg_strings )
    # Score every word found in the training data
    all_blocks = itertools.chain( pos_blocks, neg_blocks )
    for word, count in words_from_blocks( all_blocks, order ):
        ints = mapping.translate( rp.mapping.DNA.translate_list( word ) )
        assert len( ints ) == order + 1
        # Start from NaN in every position; the model fills in the scores
        scores = array( [ float("nan") ] * len( ints ), typecode="f" )
        trained.score_positions( ints, scores )
        print( "%s\t%d\t%0.6f" % ( "|".join( word ), count, scores[-1] ) )
def run( data_file, modname, model_file, out_file, mapping, window, shift, low, high, reorder ):
    """
    Score every alignment block of a MAF file with a stored model.

    The model class is looked up by `modname` through `rp.models.get` and
    loaded from `model_file`.  Each block read from `data_file` is
    translated to integers with the DNA mapping and, when `mapping` is
    truthy, translated again through `mapping`.  The block, its integer
    sequence and the model are then handed to `score_windows` together
    with `out_file`, `window`, `shift`, `low` and `high`, which are passed
    through unchanged.

    `reorder`, when truthy, is a sequence of indexes into
    `maf.components` that selects and orders the components to use;
    otherwise the components are used as they appear in the block.
    """
    # Read model
    model = rp.models.get( modname ).from_file( model_file )
    # Score each alignment
    for maf in bx.align.maf.Reader( data_file ):
        if reorder:
            # A dedicated index name: under Python 2 a list-comprehension
            # variable leaks into the enclosing scope and would clobber
            # any outer loop variable with the same name.
            components = [ maf.components[ index ] for index in reorder ]
        else:
            components = maf.components
        ints = rp.mapping.DNA.translate_list( [ c.text for c in components ] )
        if mapping:
            ints = mapping.translate( ints )
        score_windows( maf, ints, model, out_file, window, shift, low, high )
def calc_merit( training_sets, mapping, modname, modorder ):
    """
    Return the cross-validation merit of `mapping` over `training_sets`.

    Every string of every training set is first translated through
    `mapping`.  What happens next depends on the number of sets:

    - exactly two: the sets are treated as positive and negative data,
      cross-validated with `rp.cv.CV` using `rp.models.train`, and the
      mean of `cls1.pos / ( len( pos ) * passes )` and
      `cls2.neg / ( len( neg ) * passes )` is returned;
    - more than two: the sets are cross-validated with `rp.cv.MultiCV`
      using `rp.models.prob_train`, and the engine's success rate is
      returned;
    - fewer than two: an `Exception` is raised.

    The names `fold` and `passes` are not parameters; they are read from
    the enclosing scope.
    """
    # Apply mapping to strings
    translated_sets = []
    for strings in training_sets:
        translated_sets.append( [ mapping.translate( s ) for s in strings ] )
    radix = mapping.get_out_size()
    set_count = len( translated_sets )

    if set_count < 2:
        raise Exception( "No support for '%d' training sets" % set_count )

    if set_count == 2:
        pos_strings, neg_strings = translated_sets

        def two_class_factory( d0, d1 ):
            return rp.models.train( modname, modorder, radix, d0, d1 )

        cv_engine = rp.cv.CV( two_class_factory, pos_strings, neg_strings, fold=fold, passes=passes )
        cv_engine.run()
        # Merit is the average of the per-class success ratios
        pos_rate = cv_engine.cls1.pos / ( len( pos_strings ) * passes )
        neg_rate = cv_engine.cls2.neg / ( len( neg_strings ) * passes )
        return ( pos_rate + neg_rate ) / 2

    # More than two training sets
    def multi_class_factory( d ):
        return rp.models.prob_train( modname, modorder, radix, d )

    cv_engine = rp.cv.MultiCV( multi_class_factory, translated_sets, fold=fold, passes=passes )
    cv_engine.run()
    return cv_engine.get_success_rate()
#!/usr/bin/env python
"""
Translate MAF alignment blocks from stdin into mapped integer sequences.

The first command line argument names the file from which the alignment
mapping is loaded.  For every block read from standard input, the text of
its components is translated with the DNA mapping and then with the
alignment mapping, and the resulting integers are printed on one line,
separated by single spaces.
"""

import sys

import bx.align.maf
import rp.mapping

mapping = rp.mapping.alignment_mapping_from_file( open( sys.argv[1] ) )

# The reader must be reached through the imported `bx` package; the bare
# name `align` is never bound by `import bx.align.maf`.
for maf in bx.align.maf.Reader( sys.stdin ):
    ints = rp.mapping.DNA.translate_list( [ c.text for c in maf.components ] )
    ints = mapping.translate( ints )
    print( ' '.join( map( str, ints ) ) )