def run( pos_file, neg_file, out_dir, format, align_count, mapping ):
    """
    Serial greedy agglomerative search: repeatedly collapse the pair of
    symbols in `mapping` whose merged mapping yields the highest merit on
    the training data, until only `stop_size` (module global) symbols remain.

    pos_file / neg_file -- open files of positive / negative training strings
    out_dir     -- directory receiving 'merits.txt' plus one NNN.mapping file
                   per collapse step
    format      -- format name understood by io.get_reader
    align_count -- number of aligned species/rows per column (for reverse_map)
    mapping     -- initial symbol mapping; must support get_out_size(),
                   collapse(i, j) and get_table()
    """
    # Open merit output
    merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' )
    # Read integer sequences
    pos_strings = list( io.get_reader( pos_file, format, None ) )
    neg_strings = list( io.get_reader( neg_file, format, None ) )
    symbol_count = mapping.get_out_size()
    # Collapse one symbol per pass until the target alphabet size is reached
    while symbol_count > stop_size:
        print >>sys.stderr, "Collapsing from:", symbol_count
        best_mapping = None
        best_merit = 0
        # One progress tick per candidate pair: C(symbol_count, 2) total
        pb = ProgressBar( 0, symbol_count * ( symbol_count - 1 ) / 2, 78 )
        count = 0
        pairs = all_pairs( symbol_count )
        for i, j in pairs:
            collapsed = mapping.collapse( i, j )
            merit = calc_merit( pos_strings, neg_strings, collapsed )
            if merit > best_merit:
                best_merit = merit
                best_mapping = collapsed
            count += 1
            pb.update_and_print( count, sys.stderr )
        # BUG FIX: if no collapse produced merit > 0, best_mapping is None
        # and the next pass would crash calling methods on None -- stop the
        # search instead of crashing.
        if best_mapping is None:
            print >>sys.stderr, "\nNo collapse improved merit, stopping early."
            break
        mapping = best_mapping
        symbol_count -= 1
        # Append merit to merit output
        print >>merit_out, symbol_count, best_merit
        print >>sys.stderr, "\nBest Merit %d." % best_merit,
        # Write best mapping to a file, one table row per alignment column
        mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' )
        for i, symbol in enumerate( mapping.get_table() ):
            print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol
        mapping_out.close()
    # BUG FIX: close the merit file (was leaked)
    merit_out.close()
def run(self):
    """
    Stochastic local search over symbol mappings: each step samples a batch
    of collapses (and expansions of frequently-occurring atoms) of the
    current mapping, adopts the candidate with the best cross-validation
    merit, and periodically restarts from the overall best / forces
    expansions to escape plateaus.  Writes per-step merits to
    <out_dir>/merits.txt and each new overall-best mapping to
    <out_dir>/NNN.mapping.  Runs forever; terminate externally.
    """
    mapping = self.starting_mapping
    # Open merit output
    merit_out = open(os.path.join(self.out_dir, "merits.txt"), "w")
    # Read training data and build seperate sets for sequence (translated
    # to ints in range(0,4)) and alignment (translated with self.atom_mapping)
    print >>sys.stderr, "Loading training data"
    self.pos_genome_seqs, self.pos_strings = self.read_maf(self.pos_fname)
    self.neg_genome_seqs, self.neg_strings = self.read_maf(self.neg_fname)
    # Count how many times each atom appears in the training data
    atom_counts = zeros(self.atom_mapping.get_out_size())
    for string in chain(self.pos_strings, self.neg_strings):
        for val in string:
            atom_counts[val] += 1
    # Valid candidates for expansion must occur more than 10 times in the
    # training data
    can_expand = compress(atom_counts > 10, arange(len(atom_counts)))
    best_merit_overall = 0
    best_mapping_overall = None
    # BUG FIX: the original initialized the never-read name
    # 'index_best_merit_overall' and relied on the first iteration always
    # entering the improvement branch to define the three counters below
    # before their first read.  Initialize the names actually used.
    best_merit_overall_index = 0
    restart_counter = 0
    last_force_counter = 0
    out_counter = 0
    step_counter = 0
    print >>sys.stderr, "Searching"
    # Search loop: one adopted candidate per pass
    while 1:
        symbol_count = mapping.get_out_size()
        best_merit = 0
        best_mapping = None
        clock = time.clock()
        cv_runs = 0
        # First try a bunch of collapses
        if symbol_count > stop_size:
            pairs = all_pairs(symbol_count)
            if len(pairs) > samp_size_collapse:
                pairs = random.sample(pairs, samp_size_collapse)
            for i, j in pairs:
                new_mapping = mapping.collapse(i, j)
                merit = self.calc_merit(new_mapping)
                cv_runs += 1
                if merit > best_merit:
                    best_merit = merit
                    best_mapping = new_mapping
        # Also try a bunch of expansions
        elements = random.sample(can_expand, samp_size_expand)
        for i in elements:
            new_mapping = mapping.expand(i)
            # Expansion that did not change alphabet size is a no-op; skip
            if new_mapping.get_out_size() == symbol_count:
                continue
            merit = self.calc_merit(new_mapping)
            cv_runs += 1
            if merit > best_merit:
                best_merit = merit
                best_mapping = new_mapping
        clock = time.clock() - clock
        # BUG FIX: keep the current mapping when no candidate scored above
        # zero (best_mapping is None) -- the MPI sibling guards this the same
        # way ('if best_mapping:'); the original crashed on the next pass.
        if best_mapping is not None:
            mapping = best_mapping
        # Append merit to merit output
        print >>merit_out, step_counter, symbol_count, best_merit
        merit_out.flush()
        if best_mapping is not None and best_merit >= best_merit_overall:
            best_merit_overall = best_merit
            best_mapping_overall = best_mapping
            # So we know what step the best mapping was encountered at
            best_merit_overall_index = step_counter
            restart_counter = step_counter
            # Reset the counter we use to force expansions
            last_force_counter = step_counter
            # Write best mapping to a file.  NOTE(review): indentation was
            # reconstructed from a flattened source; out_counter being
            # distinct from step_counter implies writes happen only on
            # improvement steps -- confirm against history.
            mapping_out = open(os.path.join(self.out_dir, "%03d.mapping" % out_counter), "w")
            for i, symbol in enumerate(self.atom_mapping.get_table()):
                # Apply the 'second' mapping to the atom symbol
                if symbol >= 0:
                    symbol = mapping[symbol]
                print >>mapping_out, str.join("", TS_DNA.reverse_map(i, self.align_count)), symbol
            mapping_out.close()
            out_counter += 1
        # BUG FIX: avoid ZeroDivisionError when the pass was faster than the
        # clock resolution
        print >>sys.stderr, "%06d, New best merit: %2.2f%%, size: %d, overall best: %2.2f%% at %06d, cvs per sec: %f" % (
            step_counter,
            best_merit * 100,
            mapping.get_out_size(),
            best_merit_overall * 100,
            best_merit_overall_index,
            cv_runs / clock if clock > 0 else 0.0,
        )
        # If we have gone 50 steps without improving over the best, restart from best
        if step_counter > restart_counter + 50:
            print >>sys.stderr, "Restarting from best mapping"
            print >>merit_out, step_counter, "RESTART"
            if best_mapping_overall is not None:
                mapping = best_mapping_overall
            restart_counter = step_counter
            # Immediately force expansions after restart
            last_force_counter = 0
        if step_counter > last_force_counter + 20:
            last_force_counter = step_counter
            print >>sys.stderr, "Forcing expansions"
            print >>merit_out, step_counter, "FORCED EXPANSIONS"
            # BUG FIX: pass counter no longer shadows the sampled atom 'i'
            for _attempt in range(5):
                symbol_count = mapping.get_out_size()
                best_merit = 0
                best_mapping = None
                for i in random.sample(can_expand, samp_size_expand):
                    new_mapping = mapping.expand(i)
                    if new_mapping.get_out_size() == symbol_count:
                        continue
                    merit = self.calc_merit(new_mapping)
                    if merit > best_merit:
                        best_merit = merit
                        best_mapping = new_mapping
                # BUG FIX: guard against None when no expansion improved
                # (matches the MPI sibling's 'if best_mapping:' guard)
                if best_mapping is not None:
                    mapping = best_mapping
        step_counter += 1
def run( ts_fnames, out_dir, format, align_count, atom_mapping, mapping, modname, modorder ): samp_size_collapse = 30 samp_size_expand = 10 if mpi: global pypar, node_id, nodes # Startup pypar and get some info about what node we are pypar = __import__( 'pypar' ) nodes = pypar.size() node_id = pypar.rank() print "I am node %d of %d" % ( node_id, nodes ) # Modify these, they get split over nodes samp_size_collapse = samp_size_collapse // nodes samp_size_expand = samp_size_expand // nodes # Open merit output merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' ) # Read integer sequences message( "Loading training data" ) training_sets = [] for fname in ts_fnames: strings = [] skipped = 0 for s in rp.io.get_reader( open( fname ), format, None ): # Apply initial mapping s = atom_mapping.translate( s ) # Ensure required columns if sum( s != -1 ) < min_cols: skipped += 1 continue # Add to set strings.append( s ) # Informational message( "Loaded training data from '%s', found %d usable strings and %d skipped" \ % ( fname, len( strings ), skipped ) ) training_sets.append( strings ) # Count how many times each atom appears in the training data, valid # candiates for expansion must occur more than 10 times in the training # data. 
message( "Finding expandable atoms" ) atom_counts = zeros( atom_mapping.get_out_size() ) for string in chain( * training_sets ): for val in string: atom_counts[ int( val ) ] += 1 can_expand = compress( atom_counts > 10, arange( len( atom_counts ) ) ) # Open merit output merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' ) best_merit_overall = 0 best_mapping_overall = None index_best_merit_overall = 0 out_counter = 0 step_counter = 0 last_force_counter = 0 message( "Searching" ) # Collapse while 1: clock = time.clock() cv_runs = 0 if mpi: # Sync up nodes at start of each pass pypar.barrier() symbol_count = mapping.get_out_size() best_i = None best_j = None best_merit = 0 best_mapping = None # First try a bunch of collapses if symbol_count > stop_size: # Select some random pairs from the region owned by this node pairs = all_pairs( symbol_count ) if mpi: lo, hi = pypar.balance( len( pairs ), nodes, node_id ) pairs = pairs[lo:hi] if len( pairs ) > samp_size_collapse: pairs = random.sample( pairs, samp_size_collapse ) # Try collapsing each pair for i, j in pairs: new_mapping = mapping.collapse( i, j ) merit = calc_merit( training_sets, new_mapping, modname, modorder ) cv_runs += 1 if merit > best_merit: best_i, best_j = i, j best_merit = merit best_mapping = new_mapping # Also try a bunch of expansions if mpi: lo, hi = pypar.balance( len( can_expand ), nodes, node_id ) elements = random.sample( can_expand[lo:hi], samp_size_expand ) else: elements = random.sample( can_expand, samp_size_expand ) for i in elements: new_mapping = mapping.expand( i ) if new_mapping.get_out_size() == symbol_count: continue merit = calc_merit( training_sets, new_mapping, modname, modorder ) cv_runs += 1 if merit > best_merit: best_i, best_j = i, None best_merit = merit best_mapping = new_mapping clock = time.clock() - clock if mpi: best_i, best_j, best_merit, cv_runs = sync_nodes( best_i, best_j, best_merit, cv_runs ) # Collapse or expand (if j is None) to get the overall best 
mapping if best_j is None: best_mapping = mapping.expand( best_i ) else: best_mapping = mapping.collapse( best_i, best_j ) mapping = best_mapping # Append merit to merit output if not mpi or node_id == 0: print >>merit_out, step_counter, symbol_count, best_merit merit_out.flush() if best_merit >= best_merit_overall: best_merit_overall = best_merit best_mapping_overall = best_mapping # So we know what step the best mapping was encountered at best_merit_overall_index = step_counter restart_counter = step_counter # Reset the counter we use to force expansions last_force_counter = step_counter # Write best mapping to a file if not mpi or node_id == 0: write_mapping( mapping, os.path.join( out_dir, "%03d.mapping" % out_counter ) ) out_counter += 1 message( "%06d, New best merit: %2.2f%%, size: %d, overall best: %2.2f%% at %06d, cvs/sec: %f" \ % ( step_counter, best_merit * 100, mapping.get_out_size(), best_merit_overall * 100, best_merit_overall_index, cv_runs/clock ) ) # If we have gone 50 steps without improving over the best, restart from best if step_counter > restart_counter + 50: message( "Restarting from best mapping" ) if not mpi or node_id == 0: print >>merit_out, step_counter, "RESTART" mapping = best_mapping_overall restart_counter = step_counter # Immediately force expansions after restart last_force_counter = 0 if step_counter > last_force_counter + 20: last_force_counter = step_counter message( "Forcing expansions" ) if not mpi or node_id == 0: print >>merit_out, step_counter, "FORCED EXPANSIONS" if mpi: lo, hi = pypar.balance( len( can_expand ), nodes, node_id ) my_can_expand = can_expand[lo:hi] else: my_can_expand = can_expand for i in range( 5 ): symbol_count = mapping.get_out_size() best_merit = 0 best_i = None best_mapping = None for i in random.sample( my_can_expand, samp_size_expand ): new_mapping = mapping.expand( i ) if new_mapping.get_out_size() == symbol_count: continue merit = calc_merit( training_sets, new_mapping, modname, modorder ) if merit 
> best_merit: best_i = i best_merit = merit best_mapping = new_mapping if mpi: best_i, best_j, best_merit, cv_runs = sync_nodes( best_i, None, best_merit, 0 ) assert best_j == None best_mapping = mapping.expand( best_i ) if best_mapping: mapping = best_mapping step_counter += 1
def run( pos_file, neg_file, out_dir, format, align_count, mapping ): # Open merit output merit_out = open( os.path.join( out_dir, 'merits.txt' ), 'w' ) # Read integer sequences pos_strings = list( rp.io.get_reader( pos_file, format, None ) ) neg_strings = list( rp.io.get_reader( neg_file, format, None ) ) symbol_count = mapping.get_out_size() # Collapse while symbol_count > stop_size: # Sync nodes on each pass, may not be required pypar.barrier() if node_id == 0: print "Collapsing from:", symbol_count pairs = all_pairs( symbol_count ) # Decide which subrange of all pairs this node will handle lo, hi = pypar.balance( len( pairs ), nodes, node_id ) # Find best collapsed mapping in interval best_i, best_j, best_merit = None, None, 0 for i, j in pairs[lo:hi]: merit = calc_merit( pos_strings, neg_strings, mapping.collapse( i, j ) ) if merit > best_merit: best_i, best_j, best_merit = i, j, merit # Aggregate results if node_id != 0: # Send best i, j, merit to the master pypar.send( ( best_i, best_j, merit ), 0 ) else: # I am the master, get results from all other nodes and determine # which had the best merit for other_node_id in range( 1, nodes ): i, j, merit = pypar.receive( other_node_id ) if merit > best_merit: best_i, best_j, best_merit = i, j, merit # Collapse the two symbols that resulted in the best merit mapping = mapping.collapse( best_i, best_j ) symbol_count -= 1 # Ensure only the master writes files if node_id == 0: # Append merit to merit output print >>merit_out, symbol_count, best_merit print "\nBest Merit %d." % best_merit, # Write best mapping to a file mapping_out = open( os.path.join( out_dir, "%03d.mapping" % symbol_count ), 'w' ) for i, symbol in enumerate( mapping.get_table() ): print >>mapping_out, str.join( '', rp.mapping.DNA.reverse_map( i, align_count ) ), symbol mapping_out.close()