def run( pos_file, neg_file, format, mapping, radix, orders, modname, fold, loo ):
    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, mapping ) )
    neg_strings = list( rp.io.get_reader( neg_file, format, mapping ) )
    # Determine radix
    if not radix:
        if mapping:
            radix = mapping.get_out_size()
        else:
            radix = max( map( max, pos_strings ) + map( max, neg_strings ) ) + 1
    print "Order TP ~TP ~FN FN FP ~FP ~TN TN % time"
    # Cross validate for various orders
    for order in orders:
        model_factory = lambda d0, d1: rp.models.train( modname, order, radix, d0, d1 )
        if loo:
            passes = 1
        else:
            passes = 5
        cv_engine = rp.cv.CV( model_factory, pos_strings, neg_strings, fold, passes, loo )
        start_time = time.time()
        cv_engine.run()
        seconds = time.time() - start_time
        print "%5d " % order,
        print cv_engine.cls1, cv_engine.cls2,
        print " %2.2f %2.2f" % ( cv_engine.get_success_rate() * 100, seconds )
def run( ts_fnames, format, mapping, radix, orders, modname, fold, loo, print_results ):
    # Read integer sequences; note that any passed-in radix is recomputed from
    # the data here, and overridden below when a mapping is supplied
    radix = 0
    training_sets = []
    for f in ts_fnames:
        strings = []
        for s in rp.io.get_reader( open( f ), format, mapping ):
            # Require at least 50 valid (non -1) columns
            if sum( s != -1 ) >= 50:
                strings.append( s )
                radix = max( radix, max( s ) + 1 )
        training_sets.append( strings )
    # Determine radix
    if mapping:
        radix = mapping.get_out_size()
    # Cross validate for various orders
    for order in orders:
        model_factory = lambda d0: rp.models.prob_train( modname, order, radix, d0 )
        if loo:
            passes = 1
        else:
            passes = 5
        cv_engine = rp.cv.MultiCV( model_factory, training_sets, fold, passes, loo,
                                   keep_results=print_results )
        start_time = time.time()
        cv_engine.run()
        seconds = time.time() - start_time
        if print_results:
            for r, c in zip( cv_engine.scores, cv_engine.classes ):
                print "\t".join( map( str, list( r ) + list( c ) ) )
        else:
            print cv_engine.get_success_rate() * 100, cv_engine.get_summary(), seconds
def calc_merit( pos_strings, neg_strings, mapping ):
    # Apply mapping to strings
    pos_strings = [ mapping.translate( s ) for s in pos_strings ]
    neg_strings = [ mapping.translate( s ) for s in neg_strings ]
    # Cross validate using those strings
    radix = mapping.get_out_size()
    order = max_order( radix )
    model_factory = lambda d0, d1: model.train( order, radix, d0, d1 )
    cv_engine = cv.CV( model_factory, pos_strings, neg_strings, fold=fold, passes=passes )
    cv_engine.run()
    # Merit is TP + TN
    return cv_engine.cls1.pos + cv_engine.cls2.neg
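# The snippet above relies on a module-level `max_order` helper that is not
# shown in this section. A minimal sketch, assuming it picks the largest
# Markov-chain order whose transition table ( radix ** ( order + 1 ) entries )
# fits a fixed parameter budget; the budget value is purely illustrative.
def max_order( radix, param_budget=4096 ):
    order = 1
    # Grow the order while the next larger table would still fit the budget
    while radix ** ( order + 2 ) <= param_budget:
        order += 1
    return order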
def run( pos_file, neg_file, out_dir, format, align_count, mapping ):
    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )
    # Read integer sequences
    pos_strings = list( io.get_reader( pos_file, format, None ) )
    neg_strings = list( io.get_reader( neg_file, format, None ) )
    symbol_count = mapping.get_out_size()
    # Collapse
    while symbol_count > stop_size:
        print >>sys.stderr, "Collapsing from:", symbol_count
        best_mapping = None
        best_merit = 0
        pb = ProgressBar( 0, symbol_count * ( symbol_count - 1 ) / 2, 78 )
        count = 0
        pairs = all_pairs( symbol_count )
        for i, j in pairs:
            collapsed = mapping.collapse( i, j )
            merit = calc_merit( pos_strings, neg_strings, collapsed )
            if merit > best_merit:
                best_merit = merit
                best_mapping = collapsed
            count += 1
            pb.update_and_print( count, sys.stderr )
        mapping = best_mapping
        symbol_count -= 1
        # Append merit to merit output
        print >>merit_out, symbol_count, best_merit
        print >>sys.stderr, "\nBest Merit %d." % best_merit,
        # Write best mapping to a file
        mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' )
        for i, symbol in enumerate( mapping.get_table() ):
            print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol
        mapping_out.close()
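# `all_pairs` is called throughout these scripts but never defined in this
# section. Judging from the ProgressBar bound above, symbol_count *
# ( symbol_count - 1 ) / 2, it evidently enumerates every unordered pair of
# symbols; a minimal sketch under that assumption:
def all_pairs( n ):
    # Every ( i, j ) with 0 <= i < j < n
    rval = []
    for i in range( n ):
        for j in range( i + 1, n ):
            rval.append( ( i, j ) )
    return rval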
def run( pos_file, neg_file, out_file, format, mapping, radix, order, modname ):
    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, mapping ) )
    neg_strings = list( rp.io.get_reader( neg_file, format, mapping ) )
    # Determine radix
    if not radix:
        if mapping:
            radix = mapping.get_out_size()
        else:
            radix = max( map( max, pos_strings ) + map( max, neg_strings ) ) + 1
    # Build model
    model = rp.models.train( modname, order, radix, pos_strings, neg_strings )
    # Write to out file
    model.to_file( out_file )
def run( pos_file, out_file, format, mapping, radix, order, modname ):
    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, mapping ) )
    # Determine radix
    if not radix:
        if mapping:
            radix = mapping.get_out_size()
        else:
            radix = max( map( max, pos_strings ) ) + 1
    # Build model
    print "about to train"
    model = rp.models.prob_train( modname, order, radix, pos_strings )
    print "trained"
    # Write to out file
    print "about to write"
    model.to_file( out_file )
    print "written"
def calc_merit(self, mapping):
    # Apply mapping to strings
    pos_strings = [mapping.translate(s) for s in self.pos_strings]
    neg_strings = [mapping.translate(s) for s in self.neg_strings]
    # Cross validate using those strings
    radix = mapping.get_out_size()
    ## model_factory = lambda d0, d1: ProductModel( radix, d0, d1 )
    model_factory = lambda d0, d1: rp.models.complex_periodic.train(5, 1, 4, radix, d0, d1)
    cv_engine = rp.cv.CV(
        model_factory,
        zip(self.pos_genome_seqs, pos_strings),
        zip(self.neg_genome_seqs, neg_strings),
        fold=fold,
        passes=passes,
    )
    cv_engine.run()
    # Merit is the average of the true-positive and true-negative rates
    # (float() guards against Python 2 integer division)
    ## print "Pos:", cv_engine.cls1
    ## print "Neg:", cv_engine.cls2
    return (
        float(cv_engine.cls1.pos) / (len(pos_strings) * passes)
        + float(cv_engine.cls2.neg) / (len(neg_strings) * passes)
    ) / 2
def calc_merit( training_sets, mapping, modname, modorder ):
    # Apply mapping to strings
    training_sets = [ [ mapping.translate( s ) for s in strings ] for strings in training_sets ]
    # Cross validate using those strings
    radix = mapping.get_out_size()
    if len( training_sets ) == 2:
        pos_strings, neg_strings = training_sets
        model_factory = lambda d0, d1: rp.models.train( modname, modorder, radix, d0, d1 )
        cv_engine = rp.cv.CV( model_factory, pos_strings, neg_strings, fold=fold, passes=passes )
        cv_engine.run()
        # Merit is the average of the true-positive and true-negative rates
        # (float() guards against Python 2 integer division)
        return ( float( cv_engine.cls1.pos ) / ( len( pos_strings ) * passes )
                 + float( cv_engine.cls2.neg ) / ( len( neg_strings ) * passes ) ) / 2
    elif len( training_sets ) > 2:
        model_factory = lambda d: rp.models.prob_train( modname, modorder, radix, d )
        cv_engine = rp.cv.MultiCV( model_factory, training_sets, fold=fold, passes=passes )
        cv_engine.run()
        ## print >>sys.stderr, cv_engine.get_summary()
        ## print >>sys.stderr, cv_engine.get_success_rate()
        return cv_engine.get_success_rate()
    else:
        raise Exception( "No support for '%d' training sets" % len( training_sets ) )
def run(self):
    mapping = self.starting_mapping
    # Open merit output
    merit_out = open(os.path.join(self.out_dir, "merits.txt"), "w")
    # Read training data and build separate sets for sequence (translated
    # to ints in range(0,4)) and alignment (translated with self.atom_mapping)
    print >>sys.stderr, "Loading training data"
    self.pos_genome_seqs, self.pos_strings = self.read_maf(self.pos_fname)
    self.neg_genome_seqs, self.neg_strings = self.read_maf(self.neg_fname)
    # Count how many times each atom appears in the training data
    atom_counts = zeros(self.atom_mapping.get_out_size())
    for string in chain(self.pos_strings, self.neg_strings):
        for val in string:
            atom_counts[val] += 1
    # Valid candidates for expansion must occur more than 10 times in the training data
    can_expand = compress(atom_counts > 10, arange(len(atom_counts)))
    # Handling bad columns in the training data is not obvious, so don't do it for now
    # for string in chain( pos_strings, neg_strings ):
    #     assert -1 not in string, "Cannot have invalid columns (map to -1) in training data"
    best_merit_overall = 0
    best_mapping_overall = None
    best_merit_overall_index = 0
    out_counter = 0
    step_counter = 0
    restart_counter = 0
    last_force_counter = 0
    print >>sys.stderr, "Searching"
    # Hill climb: each pass tries some collapses and expansions and keeps
    # whichever candidate mapping scored the best merit
    while 1:
        symbol_count = mapping.get_out_size()
        best_merit = 0
        best_mapping = None
        clock = time.clock()
        cv_runs = 0
        # First try a bunch of collapses
        if symbol_count > stop_size:
            pairs = all_pairs(symbol_count)
            if len(pairs) > samp_size_collapse:
                pairs = random.sample(pairs, samp_size_collapse)
            for i, j in pairs:
                new_mapping = mapping.collapse(i, j)
                merit = self.calc_merit(new_mapping)
                cv_runs += 1
                if merit > best_merit:
                    best_merit = merit
                    best_mapping = new_mapping
        # Also try a bunch of expansions
        elements = random.sample(can_expand, samp_size_expand)
        for i in elements:
            new_mapping = mapping.expand(i)
            if new_mapping.get_out_size() == symbol_count:
                continue
            merit = self.calc_merit(new_mapping)
            cv_runs += 1
            if merit > best_merit:
                best_merit = merit
                best_mapping = new_mapping
        clock = time.clock() - clock
        mapping = best_mapping
        # Append merit to merit output
        print >>merit_out, step_counter, symbol_count, best_merit
        merit_out.flush()
        if best_merit >= best_merit_overall:
            best_merit_overall = best_merit
            best_mapping_overall = best_mapping
            # So we know what step the best mapping was encountered at
            best_merit_overall_index = step_counter
            restart_counter = step_counter
            # Reset the counter we use to force expansions
            last_force_counter = step_counter
            # Write best mapping to a file
            mapping_out = open(os.path.join(self.out_dir, "%03d.mapping" % out_counter), "w")
            for i, symbol in enumerate(self.atom_mapping.get_table()):
                # Apply the 'second' mapping to the atom symbol
                if symbol >= 0:
                    symbol = mapping[symbol]
                print >>mapping_out, str.join("", TS_DNA.reverse_map(i, self.align_count)), symbol
            mapping_out.close()
            out_counter += 1
        print >>sys.stderr, "%06d, New best merit: %2.2f%%, size: %d, overall best: %2.2f%% at %06d, cvs per sec: %f" % (
            step_counter,
            best_merit * 100,
            mapping.get_out_size(),
            best_merit_overall * 100,
            best_merit_overall_index,
            cv_runs / clock,
        )
        # If we have gone 50 steps without improving over the best, restart from best
        if step_counter > restart_counter + 50:
            print >>sys.stderr, "Restarting from best mapping"
            print >>merit_out, step_counter, "RESTART"
            mapping = best_mapping_overall
            restart_counter = step_counter
            # Immediately force expansions after restart
            last_force_counter = 0
        # Every 20 steps without a new best, force a series of expansions
        if step_counter > last_force_counter + 20:
            last_force_counter = step_counter
            print >>sys.stderr, "Forcing expansions"
            print >>merit_out, step_counter, "FORCED EXPANSIONS"
            for _ in range(5):
                symbol_count = mapping.get_out_size()
                best_merit = 0
                best_mapping = None
                for i in random.sample(can_expand, samp_size_expand):
                    new_mapping = mapping.expand(i)
                    if new_mapping.get_out_size() == symbol_count:
                        continue
                    merit = self.calc_merit(new_mapping)
                    if merit > best_merit:
                        best_merit = merit
                        best_mapping = new_mapping
                if best_mapping:
                    mapping = best_mapping
        step_counter += 1
def run( ts_fnames, out_dir, format, align_count, atom_mapping, mapping, modname, modorder ):
    samp_size_collapse = 30
    samp_size_expand = 10
    if mpi:
        global pypar, node_id, nodes
        # Start up pypar and find out which node we are
        pypar = __import__( 'pypar' )
        nodes = pypar.size()
        node_id = pypar.rank()
        print "I am node %d of %d" % ( node_id, nodes )
        # Modify these, they get split over nodes
        samp_size_collapse = samp_size_collapse // nodes
        samp_size_expand = samp_size_expand // nodes
    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )
    # Read integer sequences
    message( "Loading training data" )
    training_sets = []
    for fname in ts_fnames:
        strings = []
        skipped = 0
        for s in rp.io.get_reader( open( fname ), format, None ):
            # Apply initial mapping
            s = atom_mapping.translate( s )
            # Ensure required columns
            if sum( s != -1 ) < min_cols:
                skipped += 1
                continue
            # Add to set
            strings.append( s )
        # Informational
        message( "Loaded training data from '%s', found %d usable strings and %d skipped"
                 % ( fname, len( strings ), skipped ) )
        training_sets.append( strings )
    # Count how many times each atom appears in the training data; valid
    # candidates for expansion must occur more than 10 times in the training
    # data.
    message( "Finding expandable atoms" )
    atom_counts = zeros( atom_mapping.get_out_size() )
    for string in chain( *training_sets ):
        for val in string:
            atom_counts[ int( val ) ] += 1
    can_expand = compress( atom_counts > 10, arange( len( atom_counts ) ) )
    best_merit_overall = 0
    best_mapping_overall = None
    best_merit_overall_index = 0
    out_counter = 0
    step_counter = 0
    restart_counter = 0
    last_force_counter = 0
    message( "Searching" )
    # Hill climb over mappings
    while 1:
        clock = time.clock()
        cv_runs = 0
        if mpi:
            # Sync up nodes at start of each pass
            pypar.barrier()
        symbol_count = mapping.get_out_size()
        best_i = None
        best_j = None
        best_merit = 0
        best_mapping = None
        # First try a bunch of collapses
        if symbol_count > stop_size:
            # Select some random pairs from the region owned by this node
            pairs = all_pairs( symbol_count )
            if mpi:
                lo, hi = pypar.balance( len( pairs ), nodes, node_id )
                pairs = pairs[lo:hi]
            if len( pairs ) > samp_size_collapse:
                pairs = random.sample( pairs, samp_size_collapse )
            # Try collapsing each pair
            for i, j in pairs:
                new_mapping = mapping.collapse( i, j )
                merit = calc_merit( training_sets, new_mapping, modname, modorder )
                cv_runs += 1
                if merit > best_merit:
                    best_i, best_j = i, j
                    best_merit = merit
                    best_mapping = new_mapping
        # Also try a bunch of expansions
        if mpi:
            lo, hi = pypar.balance( len( can_expand ), nodes, node_id )
            elements = random.sample( can_expand[lo:hi], samp_size_expand )
        else:
            elements = random.sample( can_expand, samp_size_expand )
        for i in elements:
            new_mapping = mapping.expand( i )
            if new_mapping.get_out_size() == symbol_count:
                continue
            merit = calc_merit( training_sets, new_mapping, modname, modorder )
            cv_runs += 1
            if merit > best_merit:
                best_i, best_j = i, None
                best_merit = merit
                best_mapping = new_mapping
        clock = time.clock() - clock
        if mpi:
            best_i, best_j, best_merit, cv_runs = sync_nodes( best_i, best_j, best_merit, cv_runs )
            # Collapse or expand (if j is None) to get the overall best mapping
            if best_j is None:
                best_mapping = mapping.expand( best_i )
            else:
                best_mapping = mapping.collapse( best_i, best_j )
        mapping = best_mapping
        # Append merit to merit output
        if not mpi or node_id == 0:
            print >>merit_out, step_counter, symbol_count, best_merit
            merit_out.flush()
        if best_merit >= best_merit_overall:
            best_merit_overall = best_merit
            best_mapping_overall = best_mapping
            # So we know what step the best mapping was encountered at
            best_merit_overall_index = step_counter
            restart_counter = step_counter
            # Reset the counter we use to force expansions
            last_force_counter = step_counter
            # Write best mapping to a file
            if not mpi or node_id == 0:
                write_mapping( mapping, os.path.join( out_dir, "%03d.mapping" % out_counter ) )
            out_counter += 1
        message( "%06d, New best merit: %2.2f%%, size: %d, overall best: %2.2f%% at %06d, cvs/sec: %f"
                 % ( step_counter, best_merit * 100, mapping.get_out_size(),
                     best_merit_overall * 100, best_merit_overall_index, cv_runs / clock ) )
        # If we have gone 50 steps without improving over the best, restart from best
        if step_counter > restart_counter + 50:
            message( "Restarting from best mapping" )
            if not mpi or node_id == 0:
                print >>merit_out, step_counter, "RESTART"
            mapping = best_mapping_overall
            restart_counter = step_counter
            # Immediately force expansions after restart
            last_force_counter = 0
        # Every 20 steps without a new best, force a series of expansions
        if step_counter > last_force_counter + 20:
            last_force_counter = step_counter
            message( "Forcing expansions" )
            if not mpi or node_id == 0:
                print >>merit_out, step_counter, "FORCED EXPANSIONS"
            if mpi:
                lo, hi = pypar.balance( len( can_expand ), nodes, node_id )
                my_can_expand = can_expand[lo:hi]
            else:
                my_can_expand = can_expand
            for _ in range( 5 ):
                symbol_count = mapping.get_out_size()
                best_merit = 0
                best_i = None
                best_mapping = None
                for i in random.sample( my_can_expand, samp_size_expand ):
                    new_mapping = mapping.expand( i )
                    if new_mapping.get_out_size() == symbol_count:
                        continue
                    merit = calc_merit( training_sets, new_mapping, modname, modorder )
                    if merit > best_merit:
                        best_i = i
                        best_merit = merit
                        best_mapping = new_mapping
                if mpi:
                    best_i, best_j, best_merit, cv_runs = sync_nodes( best_i, None, best_merit, 0 )
                    assert best_j is None
                    if best_i is not None:
                        best_mapping = mapping.expand( best_i )
                if best_mapping:
                    mapping = best_mapping
        step_counter += 1
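# The MPI variant above calls `message` and `sync_nodes`, neither of which is
# shown in this section. A minimal sketch, assuming only the pypar
# point-to-point calls already used in these scripts (send/receive): each
# worker ships its local best move to node 0, node 0 picks the winner and
# totals the cross-validation runs, then sends the agreed result back out.
def message( text ):
    # Only report progress from the master node (or a non-MPI run)
    if not mpi or node_id == 0:
        print >>sys.stderr, text

def sync_nodes( best_i, best_j, best_merit, cv_runs ):
    if node_id == 0:
        # Master: gather candidates from every worker, keep the best
        total_runs = cv_runs
        for other_node_id in range( 1, nodes ):
            i, j, merit, runs = pypar.receive( other_node_id )
            total_runs += runs
            if merit > best_merit:
                best_i, best_j, best_merit = i, j, merit
        # Ship the winning move back so all nodes stay in sync
        for other_node_id in range( 1, nodes ):
            pypar.send( ( best_i, best_j, best_merit, total_runs ), other_node_id )
        return best_i, best_j, best_merit, total_runs
    else:
        # Worker: send local best, wait for the global winner
        pypar.send( ( best_i, best_j, best_merit, cv_runs ), 0 )
        return pypar.receive( 0 )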
def run( pos_file, neg_file, out_dir, format, align_count, mapping ):
    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )
    # Read integer sequences
    pos_strings = list( rp.io.get_reader( pos_file, format, None ) )
    neg_strings = list( rp.io.get_reader( neg_file, format, None ) )
    symbol_count = mapping.get_out_size()
    # Collapse
    while symbol_count > stop_size:
        # Sync nodes on each pass, may not be required
        pypar.barrier()
        if node_id == 0:
            print "Collapsing from:", symbol_count
        pairs = all_pairs( symbol_count )
        # Decide which subrange of all pairs this node will handle
        lo, hi = pypar.balance( len( pairs ), nodes, node_id )
        # Find best collapsed mapping in interval
        best_i, best_j, best_merit = None, None, 0
        for i, j in pairs[lo:hi]:
            merit = calc_merit( pos_strings, neg_strings, mapping.collapse( i, j ) )
            if merit > best_merit:
                best_i, best_j, best_merit = i, j, merit
        # Aggregate results
        if node_id != 0:
            # Send best i, j, merit to the master, then wait for the global
            # winner so every node applies the same collapse
            pypar.send( ( best_i, best_j, best_merit ), 0 )
            best_i, best_j, best_merit = pypar.receive( 0 )
        else:
            # I am the master, get results from all other nodes and determine
            # which had the best merit
            for other_node_id in range( 1, nodes ):
                i, j, merit = pypar.receive( other_node_id )
                if merit > best_merit:
                    best_i, best_j, best_merit = i, j, merit
            # Send the global winner back to the workers
            for other_node_id in range( 1, nodes ):
                pypar.send( ( best_i, best_j, best_merit ), other_node_id )
        # Collapse the two symbols that resulted in the best merit
        mapping = mapping.collapse( best_i, best_j )
        symbol_count -= 1
        # Ensure only the master writes files
        if node_id == 0:
            # Append merit to merit output
            print >>merit_out, symbol_count, best_merit
            print "\nBest Merit %d." % best_merit,
            # Write best mapping to a file
            mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' )
            for i, symbol in enumerate( mapping.get_table() ):
                print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol
            mapping_out.close()