def main(argv):
    if len(argv) != 2:
        print
        print 'myr shred'
        print
        print 'Generate fake Illumina reads.'
        print 'Not guaranteed to be sanely calibrated, for testing only.'
        print
        print 'Usage:'
        print
        print '    myr shred <number of reads> <sequence.fna>'
        print
        return 1

    how_many = int(argv[0])
    seq = sequence.sequence_file_iterator(argv[1]).next()[1]

    READ_SIZE = 33
    # Per-cycle substitution error probabilities, one per read position.
    error_p = numpy.array([
        0.00912327, 0.00930828, 0.00929492, 0.00928049, 0.0093261,
        0.00928905, 0.00938066, 0.00936397, 0.00939301, 0.00947136,
        0.00952966, 0.00956763, 0.01073044, 0.01091972, 0.01121085,
        0.01159389, 0.01200634, 0.01233303, 0.01271543, 0.01334389,
        0.01349712, 0.01412138, 0.01462227, 0.01720922, 0.01617627,
        0.01671721, 0.01795653, 0.01904574, 0.02032015, 0.0220367,
        0.02354595, 0.02560759, 0.03480737
    ])

    for i in xrange(how_many):
        print '>read%d' % i

        # Pick a random start position and strand.
        pos = random.randint(len(seq) - READ_SIZE + 1)
        read = seq[pos:pos + READ_SIZE]
        if random.randint(2):
            read = sequence.reverse_complement(read)
        read = read.copy()

        # Flip each position to a different base with its per-cycle error probability.
        mutations = random.random(READ_SIZE) < error_p
        read[mutations] = (read[mutations] + random.randint(
            1, 4, size=numpy.sum(mutations)).astype('uint8')) % 4

        print sequence.string_from_sequence(read)
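# A minimal sketch of the mutation step above (assumptions: `random` here is
# numpy.random, and reads are numpy uint8 arrays with bases encoded as 0..3,
# which is what the modulo-4 arithmetic implies).  Adding a random offset of
# 1..3 and reducing mod 4 guarantees every chosen position changes to a
# *different* base:
#
#     import numpy
#     read = numpy.array([0, 1, 2, 3, 0], dtype='uint8')          # five encoded bases
#     mutations = numpy.array([True, False, True, False, False])  # positions to mutate
#     offsets = numpy.random.randint(1, 4, size=mutations.sum()).astype('uint8')
#     read[mutations] = (read[mutations] + offsets) % 4           # never the original base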
def main(argv):
    if len(argv) < 4:
        print >> sys.stderr, ''
        print >> sys.stderr, 'myr assess <sample size> <max errors> <contigs file> <reads> [<reads> ...]'
        print >> sys.stderr, ''
        return 1

    sample_size = int(argv[0])
    max_errors = int(argv[1])

    # Draw a random sample of the reads, then align it against the contigs.
    sample_file = sample(argv[3:], sample_size)
    hit_file = invoke_align(argv[2], sample_file, max_errors)

    hits = {}
    seqs = {}
    max_length = 0
    for item in sequence.sequence_file_iterator(sample_file):
        seqs[item[0]] = item[1]
        hits[item[0]] = []
        max_length = max(len(item[1]), max_length)

    for line in open(hit_file, 'rb'):
        line = line.strip()
        if line.startswith('#'):
            continue
        name, direction, n_errors, span, read_ali, ref_ali = line.rstrip().split()
        hits[name].append((int(n_errors), direction == 'fwd', read_ali, ref_ali))

    n_ambiguous = 0
    n_unhit = 0
    error_count = [0] * (max_errors + 1)
    error_pos_count = [0] * max_length
    indel_pos_count = [0] * max_length

    for name in hits:
        hits[name].sort()

        if not hits[name]:
            n_unhit += 1
            continue

        # Call the read ambiguous unless its best hit beats the runner-up
        # by at least two errors.
        if len(hits[name]) > 1 and \
           hits[name][0][0] + 2 > hits[name][1][0]:
            n_ambiguous += 1
            continue

        error_count[hits[name][0][0]] += 1

        forward, read_ali, ref_ali = hits[name][0][1:]
        if not forward:
            read_ali = read_ali[::-1]
            ref_ali = ref_ali[::-1]
            # Don't worry about complementing...

        # Tally substitutions and indels by position within the read.
        read_pos = 0
        for i in xrange(len(read_ali)):
            if read_ali[i] == '-' or ref_ali[i] == '-':
                indel_pos_count[read_pos] += 1
            elif read_ali[i] != ref_ali[i]:
                error_pos_count[read_pos] += 1
            if read_ali[i] != '-':
                read_pos += 1

    print 'Error profile'
    for i in xrange(max_length):
        print 'pos=%5d snps=%5d indels=%5d' % (i + 1, error_pos_count[i], indel_pos_count[i])
    print
    print 'Sampled', len(hits), 'reads'
    print n_ambiguous, 'had no clear best hit'
    print n_unhit, 'hit nothing'
    for i in xrange(max_errors + 1):
        print '%3d errors: %d' % (i, error_count[i])
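# How the ambiguity test above behaves (illustration only, not from the source):
# hits for a read are sorted by error count, and the read only contributes to
# the error profile if its best hit beats the runner-up by at least two errors.
#
#     best=1, second=2  ->  1 + 2 > 2  ->  True,  counted as ambiguous
#     best=1, second=3  ->  1 + 2 > 3  ->  False, counted as a 1-error read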
def load_sequences(self, filename):
    for name, seq in sequence.sequence_file_iterator(filename):
        self.add_sequence(name, seq)
def main(argv):
    if len(argv) < 4:
        print >> sys.stderr, ''
        print >> sys.stderr, 'myr align <max error> <indel cost> <reference.fna> <reads.fna> [<reads.fna>...]'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Align short reads to a reference genome.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Files can be in FASTA or ELAND format.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'Each substitution counts as one error. The cost of an indel can be specified,'
        print >> sys.stderr, 'but must be an integer. The whole read (not just part of it) must align to'
        print >> sys.stderr, 'the reference with fewer than the specified maximum number of errors in'
        print >> sys.stderr, 'order to produce a hit.'
        print >> sys.stderr, ''
        print >> sys.stderr, 'For Illumina reads, we suggest (on the basis of very little experience):'
        print >> sys.stderr, ''
        print >> sys.stderr, '    myr align 6 2 reference.fna reads.fna'
        print >> sys.stderr, ''
        return 1

    if CELL_PROCESSOR:
        print >> sys.stderr, 'Cell processor detected'
    else:
        print >> sys.stderr, 'Cell processor not detected'
    print >> sys.stderr, 'Using', PROCESSES, 'processes'

    maxerror = int(argv[0])
    assert maxerror >= 0
    indel_cost = int(argv[1])
    assert indel_cost >= 1

    waiting = [children.Self_child() for i in xrange(PROCESSES)]
    running = []

    t1 = time.time()
    total_alignments = [0]

    def handle_events():
        for child in children.wait(running):
            message, value = child.receive()
            if message == 'done':
                running.remove(child)
                waiting.append(child)
                dt = time.time() - t1
                total_alignments[0] += value // 2  # Forwards + backwards == 1 alignment
                util.show_status(
                    '%d alignments in %.2f seconds, %.4f per alignment'
                    % (total_alignments[0], dt, dt / total_alignments[0]))
            else:
                print value

    print '#Max errors:', maxerror
    print '#Indel cost:', indel_cost

    # Send every reference sequence to every worker.
    for ref_name, ref_seq in sequence.sequence_file_iterator(argv[2]):
        print '#Reference:', ref_name
        for child in waiting:
            child.send(('ref', ref_seq))

    # Collect reads of the same length, and do them in batches
    buckets = {}  # length -> [ [name], [seq] ]

    def do_bucket(length, only_if_full):
        if CELL_PROCESSOR:
            # Hmmm
            chunk = 1800000 // (length * ((maxerror + 1) * 2 + 5))
            chunk -= chunk & 127
            chunk = max(chunk, 128)
        else:
            chunk = 8192

        if only_if_full and len(buckets[length][0]) < chunk:
            return

        read_names = buckets[length][0][:chunk]
        del buckets[length][0][:chunk]
        read_seqs = buckets[length][1][:chunk]
        del buckets[length][1][:chunk]
        if not buckets[length][0]:
            del buckets[length]

        # Wait for a free worker, then hand it the batch.
        while not waiting:
            handle_events()

        #print >> sys.stderr, 'Starting batch alignment of', len(read_seqs), '%d-mers' % length
        child = waiting.pop()
        child.send(('align', (read_seqs, read_names, maxerror, indel_cost)))
        running.append(child)

    for read_name, read_seq in sequence.sequence_files_iterator(argv[3:]):
        length = len(read_seq)
        if length not in buckets:
            buckets[length] = ([], [])
        # Queue both strands of the read.
        buckets[length][0].append(read_name + ' fwd')
        buckets[length][1].append(read_seq)
        buckets[length][0].append(read_name + ' rev')
        buckets[length][1].append(sequence.reverse_complement(read_seq))
        do_bucket(length, True)

    # Flush any partially filled buckets.
    while buckets:
        for length in list(buckets):
            do_bucket(length, False)

    while running:
        handle_events()

    for child in waiting:
        child.close()

    util.show_status('')
    return 0
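# Worked example of the Cell batch-size heuristic in do_bucket above (a sketch;
# the 1800000 budget and the per-read cost expression are taken from the code,
# the concrete numbers are ours): for 33-base reads with maxerror = 6,
#
#     chunk = 1800000 // (33 * ((6 + 1) * 2 + 5))   # = 1800000 // 627 = 2870
#     chunk -= chunk & 127                          # round down to a multiple of 128 -> 2816
#     chunk = max(chunk, 128)                       # never below one 128-read batch
#
# so each worker receives batches of 2816 read sequences at a time.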
def read_files(argv):
    clip_start, argv = get_option_value(argv, '-s', int, 0)
    clip_end, argv = get_option_value(argv, '-e', int, 0)

    if len(argv) < 2:
        raise Bad_option(
            'Expected at least two filenames, a reference genome and an alignment file')

    reference = sequence.sequence_file_iterator(argv[0]).next()[1]

    hits = Hits()
    for filename in argv[1:]:
        for ref_name, name, forward, start, end, read_ali, ref_ali \
                in iter_hit_file(filename):
            i = hits.length
            hits.resize(i + 1)

            hits.name[i] = name
            hits.forward[i] = forward
            hits.start[i] = start
            hits.end[i] = end
            hits.read_ali[i] = read_ali
            hits.ref_ali[i] = ref_ali

            if clip_start or clip_end:
                if hits.forward[i]:
                    hits.read_ali[i], hits.ref_ali[i], clipped_start, clipped_end = \
                        clip_alignment(hits.read_ali[i], hits.ref_ali[i],
                                       clip_start, clip_end)
                else:
                    hits.read_ali[i], hits.ref_ali[i], clipped_start, clipped_end = \
                        clip_alignment(hits.read_ali[i], hits.ref_ali[i],
                                       clip_end, clip_start)
                hits.start[i] += clipped_start
                hits.end[i] -= clipped_end

    hits.you_are_dirty()

    return reference, hits
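# Sketch of the clipping bookkeeping above (assumption: clip_alignment, defined
# elsewhere in this package, trims the requested number of read bases from each
# end of the alignment and returns the trimmed strings plus the number of
# *reference* columns removed at each end).  For a forward hit spanning
# reference positions 100..133, "-s 2 -e 1" with no indels near the ends gives
#
#     clipped_start, clipped_end = 2, 1
#     hits.start[i] = 100 + 2      # 102
#     hits.end[i]   = 133 - 1      # 132
#
# Reverse hits are stored in reference orientation, so the read's start lies at
# the right-hand end of the alignment and the two clip amounts are swapped.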