def main(argv):
    """Entry point for ``myr shred``: print simulated reads to stdout.

    ``argv`` must hold exactly two items: the number of reads to generate
    and the path of a sequence file.  With any other argument count the
    usage text is printed and 1 is returned.

    Only the first record yielded by ``sequence.sequence_file_iterator`` is
    used as the source sequence.  Each read is a 33-base window at a random
    offset, reverse-complemented half of the time, with per-position
    substitution errors drawn from a fixed error-probability table.

    NOTE(review): ``random`` is called with numpy-style signatures
    (``randint(n)``, ``random(size)``, ``randint(lo, hi, size=...)``), so it
    is presumably ``numpy.random`` -- confirm against the file's imports.
    """
    if len(argv) != 2:
        usage_lines = (
            '',
            'myr shred',
            '',
            'Generate fake Illumina reads.',
            'Not guaranteed to be sanely calibrated, for testing only.',
            '',
            'Usage:',
            '',
            ' myr shred <number of reads> <sequence.fna>',
            '',
        )
        for text in usage_lines:
            print(text)
        return 1

    read_count = int(argv[0])
    source = sequence.sequence_file_iterator(argv[1]).next()[1]

    READ_SIZE = 33

    # Substitution probability for each of the READ_SIZE read positions;
    # the values grow towards the end of the read.
    error_p = numpy.array([
        0.00912327, 0.00930828, 0.00929492, 0.00928049, 0.0093261,
        0.00928905, 0.00938066, 0.00936397, 0.00939301, 0.00947136,
        0.00952966, 0.00956763, 0.01073044, 0.01091972, 0.01121085,
        0.01159389, 0.01200634, 0.01233303, 0.01271543, 0.01334389,
        0.01349712, 0.01412138, 0.01462227, 0.01720922, 0.01617627,
        0.01671721, 0.01795653, 0.01904574, 0.02032015, 0.0220367,
        0.02354595, 0.02560759, 0.03480737
    ])

    for read_no in xrange(read_count):
        print('>read%d' % read_no)

        # Pick a window start so the whole read fits inside the source.
        start = random.randint(len(source) - READ_SIZE + 1)
        fragment = source[start:start + READ_SIZE]

        # Coin flip: emit the opposite strand half of the time.
        if random.randint(2):
            fragment = sequence.reverse_complement(fragment)

        # Work on a private copy so the source sequence is never mutated.
        fragment = fragment.copy()

        corrupted = random.random(READ_SIZE) < error_p
        # Adding 1..3 modulo 4 always changes the base code
        # (assumes bases are encoded as 0..3 -- TODO confirm).
        shifts = random.randint(
            1, 4, size=numpy.sum(corrupted)).astype('uint8')
        fragment[corrupted] = (fragment[corrupted] + shifts) % 4

        print(sequence.string_from_sequence(fragment))
def callback(working_dir):
    """Write a uniform random sample of the input reads to ``sample.fna``.

    Reads are streamed from ``sequence.sequence_files_iterator(read_files)``
    and at most ``n_samples`` of them are kept using reservoir sampling, so
    the whole input never has to be held in memory.  ``read_files`` and
    ``n_samples`` are not parameters; they are taken from the enclosing
    scope.

    The sample is written to ``<working_dir>/sample.fna`` with one
    ``>name`` header line followed by one sequence line per read.

    working_dir -- directory in which ``sample.fna`` is created.
    """
    sys.stderr.write('Sampling\n')

    samples = []
    n = 0
    for item in sequence.sequence_files_iterator(read_files):
        n += 1
        if len(samples) < n_samples:
            # Reservoir not yet full: keep everything.
            samples.append(item)
        elif random.random() * n < n_samples:
            # Keep the n-th item with probability n_samples / n, evicting a
            # uniformly chosen resident.  (The previous test,
            # ``random() * n_samples < n``, was always true here, so every
            # late read displaced an earlier one and the sample was biased
            # towards the end of the input.)
            samples[random.randrange(n_samples)] = item

    # The context manager guarantees the sample is flushed and closed before
    # anything downstream tries to read it.
    with open(os.path.join(working_dir, 'sample.fna'), 'wb') as outfile:
        for item in samples:
            outfile.write('>%s\n' % item[0])
            outfile.write('%s\n' % sequence.string_from_sequence(item[1]))
def show(self, cursor, distance_cutoff):
    """Redraw the screen with the neighbourhood of ``cursor``.

    Starting from ``cursor``, locations are explored outwards (nearest
    first) up to ``distance_cutoff`` steps.  Explored locations are grouped
    into contiguous runs (one screen row each) and ordered into columns by
    a topological sort, then drawn on ``self.screen`` so that the cursor
    sits at the centre of the window.

    cursor          -- location the exploration starts from.
    distance_cutoff -- maximum number of single-base moves from ``cursor``
                       that will be explored.

    Returns ``(cursor_column, cursor_y)``: the table column (a list with
    one cell per contig row) that contains the cursor, and the index of
    the cursor's row.

    NOTE(review): the cell containing the cursor is found by searching for
    ``self.cursor``, not the ``cursor`` argument.  If that value is not
    among the explored locations, ``cursor_y`` / ``cursor_x`` /
    ``cursor_column`` are never assigned and the code below raises
    UnboundLocalError -- confirm callers always pass ``self.cursor``.
    """
    # location -> signed horizontal position relative to the start (0).
    positions = {}

    # Heap of (distance, position, location); popping yields the location
    # closest to the cursor first.
    todo = []
    heapq.heappush(todo, (0, 0, cursor))

    def add_todo(location, distance, position):
        # Queue a location for exploration.  Raises Out_of_bounds when it
        # is beyond the cutoff, which callers use to skip the dag link too.
        if distance > distance_cutoff:
            raise Out_of_bounds()
        if location in positions:
            return
        assert self.valid_location(location)
        heapq.heappush(todo, (distance, position, location))

    # Adjacency lists: dag[a] holds every b that must come at or after a.
    dag = {}

    def dag_link(a, b):
        # Record the edge a -> b, creating both nodes if needed.
        if a not in dag:
            dag[a] = []
        if b not in dag:
            dag[b] = []
        dag[a].append(b)

    # Groups locations that are adjacent along the same sequence
    # (presumably a disjoint-set structure -- see Union).
    contigua = Union()

    while todo:
        distance, position, location = heapq.heappop(todo)
        if location in positions:
            # Already reached by a path that was at least as short.
            continue
        positions[location] = position
        if location not in dag:
            dag[location] = []
        contigua.create(location)

        # Step one base forwards.  Note that merge_if_created runs before
        # add_todo, so it is attempted even when the neighbour turns out to
        # be past the cutoff.  Out_of_bounds is presumably also raised by
        # location_move at a sequence end -- confirm.
        try:
            linked_location = self.location_move(location, 1)
            contigua.merge_if_created(location, linked_location)
            add_todo(linked_location, distance + 1, position + 1)
            dag_link(location, linked_location)
        except Out_of_bounds:
            pass

        # Step one base backwards; the dag edge points from the neighbour
        # to this location so edges always run left to right.
        try:
            linked_location = self.location_move(location, -1)
            contigua.merge_if_created(location, linked_location)
            add_todo(linked_location, distance + 1, position - 1)
            dag_link(linked_location, location)
        except Out_of_bounds:
            pass

        def merge(linked_location):
            # A base link costs no distance and keeps the same position.
            # Linking in both directions creates a cycle in the dag
            # (presumably so the sort places both in one column).
            try:
                add_todo(linked_location, distance, position)
                dag_link(location, linked_location)
                dag_link(linked_location, location)
            except Out_of_bounds:
                pass

        # Follow base links stored in either direction ...
        for i in self.base_links.find_all('location1', location):
            merge(self.base_links.location2[i])
        for i in self.base_links.find_all('location2', location):
            merge(self.base_links.location1[i])
        # ... and links recorded against the same location with its
        # FORWARD_MASK bit flipped, flipping the partner's bit as well.
        for i in self.base_links.find_all('location1',
                                          location ^ FORWARD_MASK):
            merge(self.base_links.location2[i] ^ FORWARD_MASK)
        for i in self.base_links.find_all('location2',
                                          location ^ FORWARD_MASK):
            merge(self.base_links.location1[i] ^ FORWARD_MASK)

    class Contig:
        pass

    # One Contig record per contiguous group; each becomes a screen row.
    contigs = []
    for item in contigua.sets():
        # Any member identifies the group's sequence and orientation.
        sample = iter(item).next()
        seq = location_sequence(sample)
        forward = (sample & FORWARD_MASK) != 0
        contig = Contig()
        contigs.append(contig)
        contig.seq = seq
        contig.name = self.sequences.name[seq]
        contig.forward = forward
        # Sort by name; within a name, forward (False) precedes reverse.
        contig.sort_key = (contig.name, not forward)
        contig.locations = item
    contigs.sort(lambda a, b: cmp(a.sort_key, b.sort_key))

    def priority(component):
        # Mean horizontal position of a component's locations.
        return float(sum([positions[item]
                          for item in component])) / len(component)

    # Sequence of groups of locations; each group becomes one column.
    order = sort.compact_robust_topological_sort(dag, priority)

    # table[x][y] is the list of locations shown in column x, row y.
    table = []
    # column_width[x] is the widest cell of column x.
    column_width = []
    for x, locations in enumerate(order):
        column = []
        table.append(column)
        for y, contig in enumerate(contigs):
            relevant = [
                location for location in locations
                if location in contig.locations
            ]
            relevant.sort()
            # Locations without the FORWARD_MASK bit are shown in
            # descending order.
            if relevant and not (relevant[0] & FORWARD_MASK):
                relevant = relevant[::-1]
            if self.cursor in relevant:
                cursor_y = y
                # Width of all earlier columns plus the offset in this cell.
                cursor_x = numpy.sum(column_width) + relevant.index(
                    self.cursor)
                cursor_column = column
            column.append(relevant)
        column_width.append(max([len(item) for item in column]))

    self.screen.clear()
    maxy, maxx = self.screen.getmaxyx()
    # Translate table coordinates so the cursor lands mid-screen.
    offset_y = int(maxy // 2 - cursor_y)
    offset_x = int(maxx // 2 - cursor_x)

    def addstr(y, x, string):
        # Draw string at (y, x), clipped to the visible window: rows off
        # screen are skipped, characters left of column 0 are dropped and
        # the tail is cut at maxx.
        if y < 0 or y >= maxy:
            return
        while string and x < 0:
            string = string[1:]
            x += 1
        if x + len(string) > maxx:
            string = string[:max(0, maxx - x)]
        if not string:
            return
        self.screen.addstr(y, x, string)

    for y in xrange(len(contigs)):
        # Running x offset (in table coordinates) of the current column.
        scr_x = 0
        for x in xrange(len(order)):
            item = table[x][y]
            string = sequence.string_from_sequence(
                [self.location_get(location) for location in item])
            addstr(y + offset_y, scr_x + offset_x, string)
            scr_x += column_width[x]

        # Row label: name, optional comment and a direction marker, drawn
        # to the left of the first column (but never left of column 0).
        info = contigs[y].name
        if self.sequences.comment[contigs[y].seq]:
            info += ' ' + self.sequences.comment[contigs[y].seq]
        if contigs[y].forward:
            info += ' >>> '
        else:
            info += ' <<< '
        addstr(y + offset_y, max(0, -len(info) - 1 + offset_x), info)

    # Status line: which sequence and position the cursor is on.
    cursor_seq, cursor_fwd, cursor_pos = location_parts(cursor)
    addstr(1, 1, '%s @ %d' % (self.sequences.name[cursor_seq], cursor_pos))

    self.screen.move(cursor_y + offset_y, cursor_x + offset_x)
    self.screen.refresh()

    return cursor_column, cursor_y
def align(seq1, seq2, n_errors, indel_cost):
    """ Produce an alignment (for once we have found a hit).

        Start point is zero in both seqs.
        End point may be anywhere in seq2, must be end of seq1.

        Banded dynamic programming: only cells within
        ``radius = n_errors // indel_cost`` of the main diagonal are
        computed.  Cells just outside the band hold the sentinel
        ``n_errors + 1``.

        seq1, seq2 -- sequences accepted by sequence.string_from_sequence
                      whose elements can index sequence.NOTEQUAL.
        n_errors   -- error budget; determines the band radius.
        indel_cost -- cost of one inserted/deleted base (must be non-zero).
                      The substitution cost comes from sequence.NOTEQUAL
                      (presumably 0 for equal bases -- confirm).

        Returns (aligned seq1, aligned seq2, end position in seq2, score),
        where the aligned strings use '-' for gaps.

        If seq2 is empty, or so short that no band cell reaches the last
        row, no alignment exists and numpy raises (IndexError or
        ValueError).
    """
    radius = n_errors // indel_cost

    len1 = len(seq1)
    len2 = len(seq2)

    #TODO: no need to allocate entire array
    scores = numpy.empty((len1 + 1, len2 + 1), 'int')

    # First row / column: cost of opening with a run of indels.  The border
    # is clipped to the matrix size so that sequences shorter than
    # radius + 1 no longer fail with a shape mismatch.
    border = numpy.arange(radius + 2) * indel_cost
    scores[0, :radius + 2] = border[:len2 + 1]
    scores[:radius + 2, 0] = border[:len1 + 1]

    for i in xrange(1, len1 + 1):
        left = max(1, i - radius)
        right = min(len2, i + radius)

        # Sentinel just left of the band in this row.
        if left > 1:
            scores[i, left - 1] = n_errors + 1

        # Sentinel above the right-most cell, but only when that cell lies
        # outside the previous row's band.  When the band is clipped by
        # len2 the cell above holds a real score which must be kept
        # (overwriting it lost the "seq1 runs past the end of seq2" path).
        if i > 1 and i + radius <= len2:
            scores[i - 1, right] = n_errors + 1

        for j in xrange(left, right + 1):
            scores[i, j] = min(
                scores[i - 1, j - 1] +
                sequence.NOTEQUAL[seq1[i - 1], seq2[j - 1]],
                scores[i - 1, j] + indel_cost,
                scores[i, j - 1] + indel_cost)

    # Best end point in seq2: cheapest in-band cell of the final row.
    left = max(1, len1 - radius)
    right = min(len2, len1 + radius)
    end2 = numpy.argmin(scores[len1, left:right + 1]) + left

    str_seq1 = sequence.string_from_sequence(seq1)
    str_seq2 = sequence.string_from_sequence(seq2)

    # Trace back from (len1, end2) to the origin.  Pieces are collected in
    # reverse order and flipped when joined.
    pos1 = len1
    pos2 = end2
    ali1 = []
    ali2 = []
    while True:
        if pos1 and pos2:
            step = scores[pos1 - 1, pos2 - 1]
            del1 = scores[pos1 - 1, pos2]
            del2 = scores[pos1, pos2 - 1]
            # Move to the cheapest predecessor, preferring the diagonal.
            if step <= del1 and step <= del2:
                ali1.append(str_seq1[pos1 - 1])
                ali2.append(str_seq2[pos2 - 1])
                pos1 -= 1
                pos2 -= 1
            elif del1 <= del2:
                ali1.append(str_seq1[pos1 - 1])
                ali2.append('-')
                pos1 -= 1
            else:
                ali1.append('-')
                ali2.append(str_seq2[pos2 - 1])
                pos2 -= 1
        elif pos1:
            # seq2 exhausted: the rest of seq1 is aligned against gaps.
            ali1.append(str_seq1[:pos1])
            ali2.append('-' * pos1)
            break
        else:
            # seq1 exhausted: the rest of seq2 is aligned against gaps.
            ali1.append('-' * pos2)
            ali2.append(str_seq2[:pos2])
            break

    return (''.join(ali1[::-1]), ''.join(ali2[::-1]), end2,
            scores[len1, end2])
todo[hits.start[i]].append(i) lanes = [] def find_lane(): for start in (0, 4, 2, 6, 1, 5, 3, 7): for i in xrange(start, len(lanes), 8): if lanes[i] is None: return i lanes.extend([None] * 8) return find_lane() pad = ' ' * 5 total_with_a_hit = 0 for pos, ref_nuc in enumerate(sequence.string_from_sequence(reference)): while lanes and lanes[-1] is None: del lanes[-1] if pos in todo: for i in todo[pos]: lane_no = find_lane() lanes[lane_no] = [ hits.ref_ali[i] + pad, hits.read_ali[i] + pad ] to_show = [ref_nuc] for i in xrange(len(lanes)): if lanes[i] is None: to_show.append('')