def run(self):
    headers = bam_headers(self.input)
    writer = Bam_writer(self.prefix+'.bam', headers)
    chrom = None
    endpoints = [ ]
    total = 0
    discarded = 0
    for record in Bam_reader(self.input):
        if record.flag & FLAG_UNMAPPED:
            continue
        total += 1
        if chrom != record.rname:
            chrom = record.rname
            endpoints = [ ]
        while endpoints and endpoints[0] <= record.pos:
            heapq.heappop(endpoints)
        if len(endpoints) >= self.depth:
            discarded += 1
            continue
        heapq.heappush(endpoints, record.pos+record.length)
        record.flag &= ~(FLAG_PAIRED|FLAG_PROPER|FLAG_MATE_UNMAPPED|FLAG_MATE_REVERSE|FLAG_FIRST|FLAG_SECOND)
        record.mrnm = '*'
        record.mpos = 0
        writer.write(record)
    writer.close()
    self.log.log('Discarded %s alignments out of %s.\n' % (grace.pretty_number(discarded),grace.pretty_number(total)))
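# The depth-limiting run() above keeps a min-heap of the end positions of the
# alignments it has retained: endpoints at or before the next alignment's start
# are popped, and an alignment is only kept while fewer than self.depth retained
# alignments still overlap its start.  Below is a minimal, self-contained sketch
# of that idea on plain (start, end) intervals, independent of the
# Bam_reader/Bam_writer machinery (it assumes intervals sorted by start within
# one chromosome, as coordinate-sorted BAM records are).

import heapq

def cap_depth(intervals, max_depth):
    """Keep intervals so that no position is covered by more than max_depth
    of the kept intervals.  Intervals must be sorted by start position."""
    kept = []
    endpoints = []                    # min-heap of end positions of kept intervals
    for start, end in intervals:
        while endpoints and endpoints[0] <= start:
            heapq.heappop(endpoints)  # these no longer overlap the current start
        if len(endpoints) >= max_depth:
            continue                  # discarding this interval keeps depth bounded
        heapq.heappush(endpoints, end)
        kept.append((start, end))
    return kept

# For example, cap_depth([(0, 10), (1, 11), (2, 12)], 2) keeps the first two
# intervals and discards (2, 12), just as the BAM version discards the third
# overlapping read.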
def reader(working_dirs, references, use_reference, annotations={}):
    for name, sequence in references:
        features = annotations.get(sequence, [])
        if use_reference:
            readers = [ reference_reader(sequence) ]
        else:
            readers = [ ]
        readers.extend(
            evidence_reader(working_dir, name)
            for working_dir in working_dirs
        )
        active_features = [ ]
        feature_pos = 0
        for i in xrange(len(sequence)):
            if i % 10000 == 0:
                grace.status('%s %s' % (name, grace.pretty_number(i)))
            active_features = [ item for item in active_features
                                if item.location.nofuzzy_end > i ]
            while feature_pos < len(features) and \
                  features[feature_pos].location.nofuzzy_start <= i:
                active_features.append(features[feature_pos])
                feature_pos += 1
            for is_insertion in (True, False):
                yield Calls(name, i, is_insertion,
                            [ item.next() for item in readers ],
                            active_features)
        for reader in readers:
            for item in reader:
                raise grace.Error('Unexpected extra data in evidence file')
    grace.status('')
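# reader() above sweeps along each reference sequence while maintaining the set
# of annotation features overlapping the current base: features whose end has
# been passed are dropped, and features whose start has been reached are
# admitted.  A minimal, self-contained sketch of that sweep, using plain
# (start, end) pairs (0-based, end-exclusive, sorted by start) in place of the
# annotation objects with .location.nofuzzy_start/.nofuzzy_end used above:

def active_at_each_position(length, features):
    """Yield (position, list of features overlapping that position)."""
    active = []
    pos = 0
    for i in range(length):
        active = [f for f in active if f[1] > i]          # drop ended features
        while pos < len(features) and features[pos][0] <= i:
            active.append(features[pos])                  # admit newly started features
            pos += 1
        yield i, list(active)

# For example, list(active_at_each_position(4, [(0, 2), (1, 4)])) gives
# [(0, [(0, 2)]), (1, [(0, 2), (1, 4)]), (2, [(1, 4)]), (3, [(1, 4)])].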
def eat(f):
    for line in f:
        if line.startswith('@'):
            if sam_header_sent[0]: continue
        else:
            n_seen[0] += 1
            if n_seen[0] % 100000 == 0:
                grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
        sam_eater.write_raw(line)
    sam_header_sent[0] = True
def normalize_files(dirnames, prefix, min_depth):
    contents = [ read_userplot(os.path.join(item,prefix+'-depth.userplot')) for item in dirnames ]
    data = [ item[2] for item in contents ]
    if contents[0][1]:
        j_range = xrange(1,len(contents[2][0]))
    else:
        j_range = [0]
    totals = [ 0.0 ] * len(data)
    n = 0
    for i in xrange(len(contents[0][2])):
        for j in j_range:
            depths = [ item[i][j] for item in data ]
            good = True
            for item in depths:
                if item < min_depth:
                    good = False
                    break
            if not good:
                continue
            for k, item in enumerate(depths):
                totals[k] += math.log(item)
            n += 1
    print prefix, 'sites used:', grace.pretty_number(n)
    if n == 0:
        print 'Can\'t normalize, skipping.'
        print
        return
    avg_total = sum( totals ) / len(data)
    norm_divisors = [ math.exp( (item - avg_total)/n ) for item in totals ]
    print 'Relative abundances:'
    for i in xrange(len(dirnames)):
        print ' %.3f %s' % (norm_divisors[i], dirnames[i])
    print
    #for i, item in enumerate(contents):
    #    write_normalized_userplot(item, 1.0/norm_divisors[i], os.path.join(dirnames[i],prefix+'-norm.userplot'))
    for i, dirname in enumerate(dirnames):
        for filename in os.listdir(dirname):
            if not filename.startswith(prefix): continue
            if not filename.endswith('.userplot'): continue
            if filename.endswith('-norm.userplot'): continue
            fullname = os.path.join(dirname, filename)
            full_outname = fullname[:-9] + '-norm.userplot'
            write_normalized_userplot(
                read_userplot(fullname),
                1.0/norm_divisors[i],
                full_outname)
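# normalize_files() above derives one scaling divisor per sample from the
# geometric mean of its depths over "good" sites (sites where every sample
# reaches min_depth): totals[k] accumulates log depths, and the divisor is the
# exponential of the sample's mean log depth relative to the average sample.
# A minimal, self-contained sketch of that calculation, taking plain depth
# lists instead of userplot files:

import math

def normalization_divisors(depths_per_sample, min_depth):
    """depths_per_sample: equal-length lists of depths, one list per sample.
    Returns one divisor per sample, or None if no site is usable."""
    n_samples = len(depths_per_sample)
    totals = [0.0] * n_samples
    n_sites = 0
    for site in zip(*depths_per_sample):
        if min(site) < min_depth:
            continue                        # only sites covered well in every sample
        for k, depth in enumerate(site):
            totals[k] += math.log(depth)
        n_sites += 1
    if not n_sites:
        return None
    avg_total = sum(totals) / n_samples
    return [math.exp((t - avg_total) / n_sites) for t in totals]

# For example, normalization_divisors([[10, 20, 30], [20, 40, 60]], 5) gives
# roughly [0.707, 1.414]: the second sample is twice as deep, so its depths get
# divided by a divisor twice as large.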
def eat(process):
    for line in process.stdout:
        if line.startswith('@'):
            if sam_header_sent[0]: continue
        else:
            n_seen[0] += 1
            if n_seen[0] % 100000 == 0:
                grace.status('%s alignments produced' % grace.pretty_number(n_seen[0]))
        sam_eater.write_raw(line)
    assert process.wait() == 0, 'shrimp failed'
    sam_header_sent[0] = True
def run(self):
    seqs = [ ]
    seen = 0
    for filename in self.filenames:
        for seq in io.read_sequences(filename, qualities=True):
            seen += 1
            if seen % 100000 == 0:
                grace.status('Scanned '+grace.pretty_number(seen))
            if len(seqs) < self.n:
                seqs.append(seq)
            elif random.random() * seen < self.n:
                # Reservoir sampling: a later sequence displaces a random
                # existing one with probability n/seen.
                seqs[ random.randrange(self.n) ] = seq
    grace.status('')
    print >> sys.stderr, 'Sampled', grace.pretty_number(len(seqs)), 'of', grace.pretty_number(seen), 'sequences'
    if not seqs:
        return
    # Sequences read from FASTQ carry a quality string as a third element.
    qualities = len(seqs[0]) == 3
    if qualities:
        for name, seq, qual in seqs:
            io.write_fastq(sys.stdout, name, seq, qual)
    else:
        for name, seq in seqs:
            io.write_fastq(sys.stdout, name, seq)
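# The sampling run() above is reservoir sampling: the first n sequences fill
# the reservoir, and each later sequence displaces a random existing entry with
# probability n/seen, giving every sequence an equal chance of ending up in the
# sample without knowing the total count in advance.  A minimal, self-contained
# sketch of just that step:

import random

def reservoir_sample(items, n):
    """Uniformly sample up to n items from an iterable of unknown length,
    holding only n items in memory."""
    sample = []
    for seen, item in enumerate(items, 1):
        if len(sample) < n:
            sample.append(item)
        elif random.random() * seen < n:
            sample[random.randrange(n)] = item
    return sample

# For example, reservoir_sample(range(1000000), 10) returns 10 values, with
# every input position equally likely to be represented.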
def run(self): """ <sequence> <poly-A> <adaptor> <anything> """ clip_quality = chr(33+self.clip_quality) ignore_quality = chr(33+self.ignore_quality) with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \ io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file: print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched' n = 0 n_discarded = 0 n_clipped = 0 total_before = 0 total_clipped = 0 for filename in self.filenames: for name, seq, qual in io.read_sequences(filename, qualities='required'): # "Good quality" sequence ends at the first low quality base #good_quality_end = 0 #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality: # good_quality_end += 1 goodness_score = 0 best_goodness_score = 0 good_quality_end = 0 i = 0 while True: if goodness_score > best_goodness_score: best_goodness_score = goodness_score good_quality_end = i if i >= len(seq): break if qual[i] >= clip_quality: goodness_score += 1 else: goodness_score -= 9 i += 1 best_score = 0 best_a_start = good_quality_end best_a_end = good_quality_end best_adaptor_bases = 0 best_aonly_score = 0 best_aonly_start = good_quality_end best_aonly_end = good_quality_end # Consider each possible start position for the poly(A) for a_start in xrange(good_quality_end): if a_start and seq[a_start-1] == 'A': continue # Consider each possible end position for the poly(A) a_end = a_start aonly_score = 0 while True: if aonly_score > best_aonly_score: best_aonly_score = aonly_score best_aonly_start = a_start best_aonly_end = a_end # The poly(A) should be followed by adaptor, # at least until the end of good quality sequence. # However if there is evidence of the adaptor beyond # the end of good quality, we still want to know that, # and count it towards the number of adaptor bases present. score = aonly_score adaptor_bases = 0 i = a_end while True: if (score > best_score and (i >= good_quality_end or i >= a_end+len(self.adaptor))): best_score = score best_a_start = a_start best_a_end = a_end best_adaptor_bases = adaptor_bases if i >= a_end+len(self.adaptor) or i >= len(seq): break if qual[i] >= ignore_quality: if seq[i] == self.adaptor[i-a_end]: score += 1 adaptor_bases += 1 else: score -= 4 i += 1 #if a_end >= len(seq): break # poly(A) tail only within good quality region. if a_end >= good_quality_end: break if qual[a_end] >= ignore_quality: if seq[a_end] == 'A': aonly_score += 1 else: aonly_score -= 4 if aonly_score <= 0: break a_end += 1 a_start = best_a_start a_end = best_a_end adaptor_bases = best_adaptor_bases aonly_start = best_aonly_start aonly_end = best_aonly_end if self.debug: # and a_end == a_start and a_end < len(seq)-10: print name print ''.join( 'I' if item<ignore_quality else ('C' if item<clip_quality else ' ') for item in qual ) print '-' * good_quality_end print seq print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score) #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "." 
print sys.stdout.flush() n += 1 total_before += len(seq) # 0 - sequence name # 1 - sequence length # 2 - poly(A) start # 3 - poly(A) end # (4 - best run of As start, for debugging the need to detect adaptor seq) # (5 - best run of As end) # 6 - number of adaptor bases matched print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases) if a_start > self.length: if a_start < len(seq): n_clipped += 1 total_clipped += a_start print >> out_file, '@'+name print >> out_file, seq[:a_start] print >> out_file, '+' print >> out_file, qual[:a_start] else: n_discarded += 1 if n%10000 == 0: grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)') grace.status('') self.log.datum(self.sample,'reads',n) if n: self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n) self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded) self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped) if n_clipped: self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
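# The clip-runs run() above scores every way of splitting a read's tail into
# <poly-A> followed by <adaptor>, awarding +1 per matching base and -4 per
# mismatch, and clips at the best-scoring poly(A) start.  A minimal,
# self-contained sketch of that scoring with the quality handling left out
# (the real code additionally ignores low-quality bases and tracks the best
# adaptor-free A run separately):

def find_polya_clip(seq, adaptor, match=1, mismatch=-4):
    """Return (a_start, a_end): the best-scoring split into read, poly(A)
    from a_start to a_end, then adaptor.  (len(seq), len(seq)) means no clip."""
    best_score = 0
    best_a_start = best_a_end = len(seq)
    for a_start in range(len(seq) + 1):
        if a_start and seq[a_start - 1] == 'A':
            continue                           # only start at the first A of a run
        a_score = 0                            # score of the poly(A) part so far
        for a_end in range(a_start, len(seq) + 1):
            score = a_score
            for i in range(a_end, min(len(seq), a_end + len(adaptor))):
                score += match if seq[i] == adaptor[i - a_end] else mismatch
            if score > best_score:
                best_score = score
                best_a_start, best_a_end = a_start, a_end
            if a_end < len(seq):
                a_score += match if seq[a_end] == 'A' else mismatch
    return best_a_start, best_a_end

# For example, with the common adaptor 'AGATCGGAAGAGC':
# find_polya_clip('ACGTACGTAAAAAAAGATCGGAAG', 'AGATCGGAAGAGC') -> (8, 14)
# so the read would be clipped at base 8, where the poly(A) run begins (the
# adaptor's leading A absorbs the last A of the run, which is harmless since
# only a_start is used as the clip point).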
def run(self): log = self.log #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10) #qoffset, args = grace.get_option_value(args, '--qoffset', int, None) #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True) #length_cutoff, args = grace.get_option_value(args, '--length', int, 24) #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10) #max_error, args = grace.get_option_value(args, '--max-errors', int, 1) #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna') #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False) #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False) #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0) #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0) #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False) #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True) #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False) #grace.expect_no_further_options(args) prefix = self.prefix log_name = os.path.split(prefix)[1] quality_cutoff = self.quality qoffset = self.qoffset clip_ambiguous = self.clip_ambiguous length_cutoff = self.length adaptor_cutoff = self.match max_error = self.max_errors adaptor_set = self.adaptors disallow_homopolymers = self.homopolymers reverse_complement = self.revcom trim_start = self.trim_start trim_end = self.trim_end output_fasta = self.fasta use_gzip = self.gzip output_rejects = self.rejects iterators = [] filenames = [] any_paired = False for filename in self.reads: filenames.append(filename) iterators.append( itertools.izip(io.read_sequences(filename, qualities=True))) for pair_filenames in self.pairs: assert len(pair_filenames ) == 2, 'Expected a pair of files for "pairs" section.' filenames.extend(pair_filenames) any_paired = True iterators.append( itertools.izip( io.read_sequences(pair_filenames[0], qualities=True), io.read_sequences(pair_filenames[1], qualities=True))) for filename in self.interleaved: filenames.extend(filename) any_paired = True iterators.append( deinterleave(io.read_sequences(filename, qualities=True))) fragment_reads = (2 if any_paired else 1) read_in_fragment_names = ['read-1', 'read-2' ] if any_paired else ['read'] assert iterators, 'Nothing to clip' if qoffset is None: guesses = [ io.guess_quality_offset(filename) for filename in filenames ] assert len( set(guesses) ) == 1, 'Conflicting quality offset guesses, please specify manually.' 
qoffset = guesses[0] log.log('FASTQ offset seems to be %d\n' % qoffset) quality_cutoff_char = chr(qoffset + quality_cutoff) #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char)) #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous))) #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error)) #log.log('Minimum length: %d bases\n' % length_cutoff) adaptor_seqs = [] adaptor_names = [] if adaptor_set and adaptor_set.lower() != 'none': for item in adaptor_set.split(','): item = item.strip().lower() + ' ' any = False for line in ADAPTORS.strip().split('\n'): if line.startswith('#'): continue if not line.lower().startswith(item): continue any = True name, seq = line.rsplit(None, 1) seq = seq.replace('U', 'T') #if seq in adaptor_seqs: print 'Dup', name adaptor_seqs.append(seq) adaptor_names.append(name) adaptor_seqs.append(bio.reverse_complement(seq)) adaptor_names.append(name) if not any: raise grace.Error('Unknown adaptor set: ' + item) matcher = Matcher(adaptor_seqs, adaptor_names, max_error) start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ] end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ] if output_fasta: write_sequence = io.write_fasta_single_line else: write_sequence = io.write_fastq f_single = io.open_possibly_compressed_writer( self.reads_output_filenames()[0]) if fragment_reads == 2: f_paired = io.open_possibly_compressed_writer( self.interleaved_output_filenames()[0]) if output_rejects: f_reject = io.open_possibly_compressed_writer( self.rejects_output_filenames()[0]) n_single = 0 n_paired = 0 n_in_single = 0 n_in_paired = 0 total_in_length = [0] * fragment_reads n_out = [0] * fragment_reads n_q_clipped = [0] * fragment_reads n_a_clipped = [0] * fragment_reads n_homopolymers = [0] * fragment_reads total_out_length = [0] * fragment_reads #log.attach(open(prefix + '_log.txt', 'wb')) for iterator in iterators: for fragment in iterator: if (n_in_single + n_in_paired) % 10000 == 0: grace.status( 'Clipping fragment %s' % grace.pretty_number(n_in_single + n_in_paired)) if len(fragment) == 1: n_in_single += 1 else: n_in_paired += 1 graduates = [] rejects = [] for i, (name, seq, qual) in enumerate(fragment): name = name.split()[0] seq = seq.upper() total_in_length[i] += len(seq) start = trim_start best_start = 0 best_len = 0 for j in xrange(len(seq) - trim_end): if qual[j] < quality_cutoff_char or \ (clip_ambiguous and seq[j] not in 'ACGT'): if best_len < j - start: best_start = start best_len = j - start start = j + 1 j = len(seq) - trim_end if best_len < j - start: best_start = start best_len = j - start clipped_seq = seq[best_start:best_start + best_len] clipped_qual = qual[best_start:best_start + best_len] if len(clipped_seq) < length_cutoff: n_q_clipped[i] += 1 rejects.append((name, seq, qual, 'quality')) continue match = matcher.match(clipped_seq) if match and match[0] >= adaptor_cutoff: clipped_seq = clipped_seq[match[0]:] clipped_qual = clipped_qual[match[0]:] start_clips[i][match[0]].append(match[1][0]) if len(clipped_seq) < length_cutoff: n_a_clipped[i] += 1 rejects.append((name, seq, qual, 'adaptor')) continue match = matcher.match(bio.reverse_complement(clipped_seq)) if match and match[0] >= adaptor_cutoff: clipped_seq = clipped_seq[:len(clipped_seq) - match[0]] clipped_qual = clipped_qual[:len(clipped_qual) - match[0]] end_clips[i][match[0]].append(match[1][0]) if len(clipped_seq) < length_cutoff: n_a_clipped[i] += 1 rejects.append((name, seq, qual, 
'adaptor')) continue if disallow_homopolymers and len(set(clipped_seq)) <= 1: n_homopolymers[i] += 1 rejects.append((name, seq, qual, 'homopolymer')) continue graduates.append((name, clipped_seq, clipped_qual)) n_out[i] += 1 total_out_length[i] += len(clipped_seq) if output_rejects: for name, seq, qual, reason in rejects: write_sequence(f_reject, name + ' ' + reason, seq, qual) if graduates: if reverse_complement: graduates = [(name, bio.reverse_complement(seq), qual[::-1]) for name, seq, qual in graduates] if len(graduates) == 1: this_f = f_single n_single += 1 else: assert len(graduates) == 2 this_f = f_paired n_paired += 1 for name, seq, qual in graduates: write_sequence(this_f, name, seq, qual) grace.status('') if output_rejects: f_reject.close() if fragment_reads == 2: f_paired.close() f_single.close() def summarize_clips(name, location, clips): total = 0 for i in clips: total += len(clips[i]) log.datum(log_name, name + ' adaptors clipped at ' + location, total) if not clips: return for i in xrange(min(clips), max(clips) + 1): item = clips[i] log.quietly_log('%3d bases: %10d ' % (i, len(item))) if item: avg_errors = float(sum(item2[0] for item2 in item)) / len(item) log.quietly_log(' avg errors: %5.2f ' % avg_errors) counts = collections.defaultdict(int) for item2 in item: counts[item2[1]] += 1 #print counts for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]: log.quietly_log('%dx%s ' % (counts[no], matcher.names[no])) if len(counts) > 2: log.quietly_log('...') log.quietly_log('\n') log.quietly_log('\n') if n_in_paired: log.datum(log_name, 'read-pairs', n_in_paired) if n_in_single: log.datum(log_name, 'single reads', n_in_single) for i in xrange(fragment_reads): if start_clips: summarize_clips(read_in_fragment_names[i], 'start', start_clips[i]) if end_clips: summarize_clips(read_in_fragment_names[i], 'end', end_clips[i]) prefix = read_in_fragment_names[i] log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i]) log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i]) if disallow_homopolymers: log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i]) if fragment_reads > 1: log.datum(log_name, prefix + ' kept', n_out[i]) log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single + n_in_paired)) if n_out[i]: log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i]) if fragment_reads == 2: log.datum(log_name, 'pairs kept after clipping', n_paired) log.datum(log_name, 'reads kept after clipping', n_single)
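# The quality-clipping step in run() above keeps the longest stretch of the
# read containing no base below the quality cutoff (and, optionally, no
# ambiguous base), before any adaptor matching.  A minimal, self-contained
# sketch of that windowing rule; the --trim-start/--trim-end handling is
# omitted for clarity:

def quality_clip(seq, qual, cutoff_char, clip_ambiguous=True):
    """Return the longest (seq, qual) stretch with every base at or above
    cutoff_char and, if clip_ambiguous, only A/C/G/T bases."""
    start = 0
    best_start = 0
    best_len = 0
    for j in range(len(seq)):
        if qual[j] < cutoff_char or (clip_ambiguous and seq[j] not in 'ACGT'):
            if j - start > best_len:
                best_start, best_len = start, j - start
            start = j + 1
    if len(seq) - start > best_len:
        best_start, best_len = start, len(seq) - start
    return (seq[best_start:best_start + best_len],
            qual[best_start:best_start + best_len])

# For example, with a phred+33 cutoff of Q20 (chr(33 + 20)):
# quality_clip('ACGTNACGTACGT', 'I' * 13, chr(33 + 20)) -> ('ACGTACGT', 'IIIIIIII')
# since the N splits the read and the right-hand stretch is longer.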
def make_ambiguity_bigwig_by_readname(prefix, bam_filenames, stop_after=None, subsample=1): #import pysam #alf = pysam.AlignmentFile(bam_filenames[0]) #header = alf.header header = sam.parsed_bam_headers(bam_filenames[0]) with open(prefix+"-chrom.sizes","wb") as f: for entry in header["SQ"]: f.write("{}\t{}\n".format(entry["SN"],entry["LN"])) chrom_names = [ entry["SN"] for entry in header["SQ"] ] chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ] #alf.close() unambiguous = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ]) total = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ]) old = grace.status("Ambiguity bigwig") for filename in bam_filenames: #alf = pysam.AlignmentFile(filename) alf = sam.Bam_reader(filename) n = 0 sub = subsample-1 for (key,items) in itertools.groupby(alf, lambda item: item.query_name): sub = (sub + 1) % subsample if sub: continue items = [ item for item in items if not item.is_unmapped and not item.is_supplementary ] if not items: continue # Only use top scoring alignments AS = [ item.get_AS() for item in items ] best_AS = max(AS) items = [ item for item, this_AS in zip(items,AS) if this_AS >= best_AS ] for item in items: #spanner = fragment_split_coverage([item]) spanner = fragment_coverage([item]) #TODO fixme when blocks available spanner = scale_spanner(1.0/len(items), spanner) total[item.reference_name].add(spanner) if len(items) == 1: unambiguous[item.reference_name].add(spanner) n += 1 if stop_after is not None and n > stop_after: break if n % 1000000 == 0: grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n)) alf.close() ambiguities = [ ] for i in xrange(len(total)): u = unambiguous[chrom_names[i]].get() t = map_spanner(lambda x: x*1j, total[chrom_names[i]].get()) c = pile([u,t],initial=0.0) c = map_spanner(lambda x: max(0.0,x.imag-x.real)/max(x.imag,1.0), c) ambiguities.append(c) bedgraph(prefix+".bedgraph", zip(chrom_names, [ item for item in ambiguities ])) subprocess.check_call([ "wigToBigWig",prefix+".bedgraph",prefix+"-chrom.sizes",prefix+".bw"]) os.unlink(prefix+".bedgraph") os.unlink(prefix+"-chrom.sizes") grace.status(old)
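# make_ambiguity_bigwig_by_readname() above groups alignments by read name,
# keeps only those with the top alignment score, and spreads a weight of 1/n
# over the n retained hits before piling coverage.  A minimal, self-contained
# sketch of that selection step, with hits represented as
# (reference_name, alignment_score) pairs:

def best_hits_weighted(hits):
    """Keep only the top-scoring hits for one read and weight each 1/n."""
    if not hits:
        return []
    best = max(score for _, score in hits)
    top = [ref for ref, score in hits if score >= best]
    weight = 1.0 / len(top)
    return [(ref, weight) for ref in top]

# For example, best_hits_weighted([('chr1', 60), ('chr2', 60), ('chr3', 10)])
# gives [('chr1', 0.5), ('chr2', 0.5)]: the low-scoring hit is dropped and the
# remaining coverage is split evenly.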
def make_ambiguity_bigwig(prefix, bam_filenames, stop_after=None, subsample=1): #import pysam #alf = pysam.AlignmentFile(bam_filenames[0]) #header = alf.header header = sam.parsed_bam_headers(bam_filenames[0]) with open(prefix+"-chrom.sizes","wb") as f: for entry in header["SQ"]: f.write("{}\t{}\n".format(entry["SN"],entry["LN"])) chrom_names = [ entry["SN"] for entry in header["SQ"] ] chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ] #alf.close() unambiguous = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ]) total = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ]) for filename in bam_filenames: #alf = pysam.AlignmentFile(filename) alf = sam.Bam_reader(filename) n = 0 sub = subsample-1 for item in alf: if item.is_unmapped or item.is_supplementary: continue sub = (sub + 1) % subsample if sub: continue #spanner = fragment_split_coverage([item]) spanner = fragment_coverage([item]) #TODO fixme when blocks available total[item.reference_name].add(spanner) NH = 1 for item2 in item.extra: if item2.startswith("NH:i:"): NH = int(item2[5:]) if NH == 1: unambiguous[item.reference_name].add(spanner) n += 1 if stop_after is not None and n > stop_after: break if n % 1000000 == 0: print prefix, filename, grace.pretty_number(n) alf.close() ambiguities = [ ] for i in xrange(len(total)): u = unambiguous[chrom_names[i]].get() t = map_spanner(lambda x: x*1j, total[chrom_names[i]].get()) c = pile([u,t],initial=0.0) c = map_spanner(lambda x: max(0.0,x.imag-x.real)/max(x.imag,1.0), c) ambiguities.append(c) bedgraph(prefix+".bedgraph", zip(chrom_names, [ item for item in ambiguities ])) subprocess.check_call([ "wigToBigWig",prefix+".bedgraph",prefix+"-chrom.sizes",prefix+".bw"]) os.unlink(prefix+".bedgraph") os.unlink(prefix+"-chrom.sizes")
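# Both ambiguity functions above pile unambiguous coverage into the real part
# and total coverage (scaled by 1j) into the imaginary part of a single complex
# track, then report (total - unambiguous) / total per base.  A minimal,
# self-contained sketch of the same quantity using two plain arrays instead of
# the Piler/spanner machinery, with alignments given as
# (start, end, number_of_hits_for_that_read) triples:

def ambiguity_track(length, alignments):
    """Per-base fraction of coverage contributed by multi-mapping reads."""
    total = [0.0] * length
    unambiguous = [0.0] * length
    for start, end, n_hits in alignments:
        for i in range(start, end):
            total[i] += 1.0
            if n_hits == 1:
                unambiguous[i] += 1.0
    return [max(0.0, t - u) / max(t, 1.0) for t, u in zip(total, unambiguous)]

# For example, ambiguity_track(4, [(0, 3, 1), (1, 4, 2)]) gives
# [0.0, 0.5, 0.5, 1.0].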
def make_bigwig(prefix, bam_filenames, make_spanner, fragments=False, stop_after=None, scale=1.0, polya=False): have_pysam = False try: import pysam have_pysam = True except ImportError: pass #alf = pysam.AlignmentFile(bam_filenames[0]) #header = alf.header header = sam.parsed_bam_headers(bam_filenames[0]) with open(prefix+"-chrom.sizes","wb") as f: for entry in header["SQ"]: f.write("{}\t{}\n".format(entry["SN"],entry["LN"])) chrom_names = [ entry["SN"] for entry in header["SQ"] ] chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ] #alf.close() forward = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ]) reverse = dict([ (i,Piler(j)) for i,j in zip(chrom_names,chrom_sizes) ]) old = grace.status("Bigwig") for filename in bam_filenames: if have_pysam: alf = pysam.AlignmentFile(filename) else: alf = sam.Bam_reader(filename) n = 0 if not fragments: for item in alf: if item.is_unmapped or item.is_secondary or item.is_supplementary: continue if polya and not alignment_is_polya(item): continue # Assume --> <-- oriented read pairs which = forward if bool(item.is_reverse) == bool(item.is_read2) else reverse which[item.reference_name].add( make_spanner(item) ) n += 1 if stop_after is not None and n > stop_after: break if n % 1000000 == 0: grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n)) else: for item in iter_fragments(alf): if polya and not any(alignment_is_polya(al) for al in item): continue # Assume --> <-- oriented read pairs which = forward if bool(item[0].is_reverse) == bool(item[0].is_read2) else reverse which[item[0].reference_name].add( make_spanner(item) ) n += 1 if stop_after is not None and n > stop_after: break if n % 1000000 == 0: grace.status(os.path.basename(prefix)+" "+filename+" "+grace.pretty_number(n)) if have_pysam: alf.close() bedgraph(prefix+"-fwd.bedgraph", zip(chrom_names, [ scale_spanner(scale, forward[item].get()) for item in chrom_names ])) subprocess.check_call([ "wigToBigWig",prefix+"-fwd.bedgraph",prefix+"-chrom.sizes",prefix+"-fwd.bw"]) os.unlink(prefix+"-fwd.bedgraph") bedgraph(prefix+"-rev.bedgraph", zip(chrom_names, [ scale_spanner(scale, reverse[item].get()) for item in chrom_names ])) subprocess.check_call([ "wigToBigWig",prefix+"-rev.bedgraph",prefix+"-chrom.sizes",prefix+"-rev.bw"]) os.unlink(prefix+"-rev.bedgraph") os.unlink(prefix+"-chrom.sizes") grace.status(old)
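# make_bigwig() above assigns each fragment to the forward or reverse pile with
# the rule "forward if is_reverse == is_read2", which is the fragment strand
# for a --> <-- oriented read pair: read 1 mapping forward, or read 2 mapping
# reverse, both mean the fragment came from the forward strand.  A one-line
# sketch of that rule:

def fragment_strand(is_reverse, is_read2):
    """Strand of the sequenced fragment for a --> <-- oriented pair."""
    return '+' if bool(is_reverse) == bool(is_read2) else '-'

# fragment_strand(False, False) == fragment_strand(True, True) == '+'
# fragment_strand(True, False) == fragment_strand(False, True) == '-'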
def run(self): #mincov, args = grace.get_option_value(args, '--mincov', int, 1) #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) #minsize, args = grace.get_option_value(args, '--minsize', int, 200) #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core') #is_core = (what == 'core') # #grace.expect_no_further_options(args) # #if len(args) < 2: # print >> sys.stderr, HELP # raise grace.Help_shown() # #output_dir, working_dirs = args[0], args[1:] # ##assert not path.exists(path.join(output_dir, 'reference.fa')), \ #assert not path.exists(path.join(output_dir, 'parameters')), \ # 'Output directory not given' # #if not path.exists(output_dir): # os.mkdir(output_dir) assert self.what in ('core','unique'), 'Expected --what to be either "core" or "unique".' is_core = (self.what == 'core') workspace = self.get_workspace() for name, seq in io.read_sequences(working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()): self.log.log(name + '\n') friendly_name = grace.filesystem_friendly_name(name) good = [ True ] * len(seq) for working_dir in self.working_dirs: if is_core: suffix = '-depth.userplot' else: suffix = '-ambiguous-depth.userplot' data = trivia.read_unstranded_userplot( os.path.join(working_dir, friendly_name+suffix) ) assert len(seq) == len(data) for i in xrange(len(seq)): if good[i]: if is_core: good[i] = data[i] >= self.mincov else: good[i] = data[i] < self.mincov #Close holes start = -self.maxdiff-1 n_holes = 0 for i in xrange(len(seq)): if good[i]: if 0 < i-start <= self.maxdiff: for j in xrange(start,i): good[j] = True n_holes += 1 start = i+1 self.log.log('Closed '+grace.pretty_number(n_holes)+' holes\n') f = open( workspace/('%s-%s.fa' % (friendly_name,self.what)), 'wb') io.write_fasta(f, name, ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ]) ) f.close() f = open( workspace/('%s-%s_masked.fa' % (friendly_name,self.what)), 'wb') io.write_fasta(f, name, ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ]) ) f.close() f_good = open( workspace/('%s-%s_parts.fa' % (friendly_name,self.what)), 'wb') f_nongood = open( workspace/('%s-non%s_parts.fa' % (friendly_name,self.what)), 'wb') start = 0 n_good = [0] n_good_bases = [0] def emit(i): if i-start < self.minsize: return if good[start]: n_good[0] += 1 n_good_bases[0] += i-start io.write_fasta( f_good if good[start] else f_nongood, '%s:%d..%d' % (name, start+1,i), seq[start:i] ) for i in xrange(1,len(seq)): if good[i] != good[start]: emit(i) start = i emit(len(seq)) f_nongood.close() f_good.close() self.log.log(grace.pretty_number(sum(good))+' bases are '+self.what+', of '+grace.pretty_number(len(seq))+' in reference sequence\n') self.log.log(grace.pretty_number(n_good[0])+' parts at least '+grace.pretty_number(self.minsize)+' bases long with '+grace.pretty_number(n_good_bases[0])+' total bases\n') self.log.log('\n')
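# The core/unique run() above builds a per-base "good" mask and then closes
# holes: any run of bad bases no longer than maxdiff that is bracketed by good
# bases is flipped to good.  A minimal, self-contained sketch of that
# hole-closing pass on a plain boolean list:

def close_holes(good, maxdiff):
    """Flip bracketed runs of False no longer than maxdiff to True, in place.
    Returns the number of holes closed."""
    start = -maxdiff - 1            # pretend the previous True was far to the left
    n_holes = 0
    for i in range(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in range(start, i):
                    good[j] = True
                n_holes += 1
            start = i + 1
    return n_holes

# For example:
# good = [True, False, False, True, False, False, False, True]
# close_holes(good, 2) returns 1 and leaves good as
# [True, True, True, True, False, False, False, True]
# (the two-base hole is closed, the three-base hole is not).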
def bam_iter_fragments(filename, status_text='Processing'):
    reader = Bam_reader(filename)
    old_status = grace.status(status_text)
    n = 0
    n_ambiguous = 0
    for read_name, alignment_iter in itertools.groupby(reader, lambda read: read.qname):
        if n % 100000 == 0:
            grace.status(status_text + ' fragment %s' % grace.pretty_number(n))
        n += 1
        unpaired = [ ]
        first = [ ]
        second = [ ]
        unmapped = [ ]
        for al in alignment_iter:
            if al.flag&FLAG_UNMAPPED:
                unmapped.append(al)
            elif not al.flag&FLAG_PAIRED or al.flag&FLAG_MATE_UNMAPPED:
                unpaired.append((al,))
            elif al.flag&FLAG_FIRST:
                first.append(al)
            elif al.flag&FLAG_SECOND:
                second.append(al)
            else:
                assert False, 'Read in pair that is neither first nor second'
        pairs = [ ]
        unused = set(first + second)
        second_index = { }
        for al in second:
            key = (al.rname, al.pos)
            if key not in second_index:
                second_index[key] = [ ]
            second_index[key].append(al)
        for al1 in first:
            key = (al1.get_mrnm(), al1.mpos)
            for al2 in second_index.get(key, ()):
                if al2.get_mrnm() != al1.rname or \
                   al2.mpos != al1.pos:
                    continue
                if al1 not in unused or al2 not in unused:
                    # pfh says: I am displeased that the pairing is sometimes ambiguous
                    n_ambiguous += 1
                    continue
                pairs.append( (al1, al2) )
                unused.remove(al1)
                unused.remove(al2)
        if unused:
            print unused
        assert not unused, 'Alignment pairing not even pretending to make sense. Is the BAM file sorted by read name?'
        yield read_name, pairs + unpaired, unmapped
    grace.status(old_status)
    if n_ambiguous:
        print >> sys.stderr
        print >> sys.stderr, 'The alignment pairing was unclear %s times, and alignments were paired arbitrarily.' % grace.pretty_number(n_ambiguous)
        print >> sys.stderr, 'Blame the SAM format.'
        print >> sys.stderr
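# bam_iter_fragments() above pairs "first in pair" alignments with "second in
# pair" alignments by requiring that each one's mate coordinates point at the
# other's own coordinates, counting a pairing as ambiguous when an alignment
# could be claimed twice.  A minimal, self-contained sketch of that matching,
# with alignments represented as plain dicts carrying rname, pos, mrnm (with
# '=' already resolved to the mate's reference name) and mpos -- the dict form
# is only for illustration and mirrors the record attributes used above:

def pair_alignments(first, second):
    """Return (pairs, leftovers) where pairs are mutually consistent
    (first, second) alignments and leftovers could not be paired."""
    index = {}
    for al in second:
        index.setdefault((al['rname'], al['pos']), []).append(al)
    pairs = []
    used = set()                                  # ids of alignments already paired
    for al1 in first:
        for al2 in index.get((al1['mrnm'], al1['mpos']), ()):
            if al2['mrnm'] != al1['rname'] or al2['mpos'] != al1['pos']:
                continue                          # mate pointers must agree both ways
            if id(al1) in used or id(al2) in used:
                continue                          # would be an ambiguous re-pairing
            pairs.append((al1, al2))
            used.add(id(al1))
            used.add(id(al2))
    leftovers = [al for al in first + second if id(al) not in used]
    return pairs, leftovers

# For example:
# a = dict(rname='chr1', pos=100, mrnm='chr1', mpos=300)
# b = dict(rname='chr1', pos=300, mrnm='chr1', mpos=100)
# pair_alignments([a], [b]) -> ([(a, b)], [])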
def main(args): grace.require_shrimp_1() n_cpus = grace.how_many_cpus() solid, args = grace.get_flag(args, '--solid') verbose, args = grace.get_flag(args, '--verbose') threshold, args = grace.get_option_value(args, '--threshold', str, '68%') stride, args = grace.get_option_value(args, '--stride', int, 1) max_shrimps, args = grace.get_option_value(args, '--cpus', int, n_cpus) batch_size, args = grace.get_option_value(args, '--batch-size', int, 5000000) input_reference_filenames = [ ] reads_filenames = [ ] shrimp_options = [ '-h', threshold ] if threshold.endswith('%'): threshold = -float(threshold[:-1])/100.0 else: threshold = int(threshold) output_dir = [ ] #As list so can write to from function. Gah. def front_command(args): grace.expect_no_further_options(args) if len(args) < 1: return output_dir.append(args[0]) input_reference_filenames.extend( [ os.path.abspath(filename) for filename in args[1:] ]) def reads_command(args): grace.expect_no_further_options(args) reads_filenames.extend([ [ os.path.abspath(filename) ] for filename in args]) def pairs_command(args): grace.expect_no_further_options(args) assert len(args) == 2, 'Expected exactly two files in "pairs"' reads_filenames.append([ os.path.abspath(filename) for filename in args ]) def shrimp_options_command(args): shrimp_options.extend(args) grace.execute(args, { 'reads': reads_command, '--reads': reads_command, 'pairs': pairs_command, 'shrimp-options': shrimp_options_command, '--shrimp-options': shrimp_options_command, }, front_command) if not output_dir: print >> sys.stderr, USAGE % n_cpus return 1 output_dir = output_dir[0] assert input_reference_filenames, 'No reference files given' assert reads_filenames, 'No read files given' for filename in itertools.chain(input_reference_filenames, *reads_filenames): assert os.path.exists(filename), '%s does not exist' % filename if not os.path.isdir(output_dir): os.mkdir(output_dir) if solid: shrimp = 'rmapper-cs' else: shrimp = 'rmapper-ls' reference_filename = os.path.join(output_dir,'reference.fa') reference_file = open(reference_filename,'wb') total_reference_sequences = 0 total_reference_bases = 0 for input_reference_filename in input_reference_filenames: for name, sequence in io.read_sequences(input_reference_filename): #Don't retain any comment name = name.split()[0] io.write_fasta(reference_file, name, sequence) total_reference_sequences += 1 total_reference_bases += len(sequence) reference_file.close() print '%s base%s in %s reference sequence%s' % ( grace.pretty_number(total_reference_bases), 's' if total_reference_bases != 1 else '', grace.pretty_number(total_reference_sequences), 's' if total_reference_sequences != 1 else '') assert total_reference_bases, 'Reference sequence file is empty' config = { 'references' : input_reference_filenames, 'reads' : reads_filenames, 'stride' : stride, 'solid': solid, 'threshold': threshold, } config_file = open(os.path.join(output_dir, 'config.txt'), 'wb') pprint.pprint(config, config_file) config_file.close() output_filename = os.path.join(output_dir, 'shrimp_hits.txt.gz') output_file = gzip.open(output_filename, 'wb') unmapped_filename = os.path.join(output_dir, 'unmapped.fa.gz') unmapped_file = gzip.open(unmapped_filename, 'wb') dirty_filenames = set() dirty_filenames.add(output_filename) dirty_filenames.add(unmapped_filename) #warn_low_threshold = True try: #Cleanup temporary files N = [0] def do_shrimp(read_set): my_number = N[0] N[0] += 1 tempname = os.path.join(output_dir,'temp%d-%d.fa' % (os.getpid(),my_number)) tempname_out = 
os.path.join(output_dir,'temp%d-%d.txt' % (os.getpid(),my_number)) dirty_filenames.add(tempname) dirty_filenames.add(tempname_out) f = open(tempname,'wb') for read_name, read_seq in read_set: print >> f, '>' + read_name print >> f, read_seq f.close() command = shrimp + ' ' + ' '.join(shrimp_options) + ' ' + \ tempname + ' ' + reference_filename + ' >' + tempname_out if not verbose: command += ' 2>/dev/null' #f = os.popen(command, 'r') child_pid = os.spawnl(os.P_NOWAIT,'/bin/sh','/bin/sh','-c',command) #print 'SHRiMP %d running' % my_number def finalize(): exit_status = os.waitpid(child_pid, 0)[1] assert exit_status == 0, 'Shrimp indicated an error' hits = { } # read_name -> [ hit line ] f = open(tempname_out,'rb') for line in f: if line.startswith('>'): read_name = line.split(None,1)[0][1:] if read_name not in hits: hits[read_name] = [ ] hits[read_name].append(line) f.close() for read_name, read_seq in read_set: if read_name in hits: for hit in hits[read_name]: output_file.write(hit) else: print >> unmapped_file, '>' + read_name print >> unmapped_file, read_seq output_file.flush() unmapped_file.flush() os.unlink(tempname) dirty_filenames.remove(tempname) os.unlink(tempname_out) dirty_filenames.remove(tempname_out) #print 'SHRiMP %d finished' % my_number return finalize shrimps = [ ] reader = iter_reads(config) read_count = 0 while True: read_set = [ ] read_set_bases = 0 #Read name should not include comment cruft # - SHRIMP passes this through # - might stuff up identification of pairs for read_name, read_seq in reader: read_name = read_name.split()[0] read_set.append((read_name, read_seq)) read_set_bases += len(read_seq) #if warn_low_threshold and len(read_seq)*7 < threshold: #Require 70% exact match # sys.stderr.write('\n*** WARNING: Short reads, consider reducing --threshold ***\n\n') # warn_low_threshold = False read_count += 1 if read_set_bases >= batch_size: break if not read_set: break if len(shrimps) >= max_shrimps: shrimps.pop(0)() shrimps.append( do_shrimp(read_set) ) grace.status('SHRiMPing %s' % grace.pretty_number(read_count)) while shrimps: grace.status('Waiting for SHRiMPs to finish %d ' % len(shrimps) ) shrimps.pop(0)() grace.status('') output_file.close() dirty_filenames.remove(output_filename) unmapped_file.close() dirty_filenames.remove(unmapped_filename) return 0 finally: for filename in dirty_filenames: if os.path.exists(filename): os.unlink(filename)
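# The driver above runs SHRiMP over batches of reads with a bounded number of
# child processes: do_shrimp() launches a batch and returns a finalizer, and
# once max_shrimps batches are in flight the oldest finalizer is called (which
# waits for that batch) before the next is launched.  A minimal, self-contained
# sketch of that scheduling pattern, with start_batch standing in for do_shrimp:

def run_batches(batches, start_batch, max_live):
    """Run batches in order with at most max_live in flight.  start_batch()
    must launch the work and return a callable that waits for completion."""
    live = []
    for batch in batches:
        if len(live) >= max_live:
            live.pop(0)()           # wait for the oldest batch to finish
        live.append(start_batch(batch))
    while live:
        live.pop(0)()               # drain whatever is still running

# For example, with threads standing in for the SHRiMP subprocesses:
# import threading
# def start_batch(reads):
#     t = threading.Thread(target=sum, args=(reads,))
#     t.start()
#     return t.join
# run_batches([range(10)] * 5, start_batch, max_live=2)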
def main(args): mincov, args = grace.get_option_value(args, '--mincov', int, 1) maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16) minsize, args = grace.get_option_value(args, '--minsize', int, 200) what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core') is_core = (what == 'core') grace.expect_no_further_options(args) if len(args) < 2: print >> sys.stderr, HELP raise grace.Help_shown() output_dir, working_dirs = args[0], args[1:] assert not path.exists(path.join(output_dir, 'reference.fa')), \ 'Output directory not given' if not path.exists(output_dir): os.mkdir(output_dir) for name, seq in io.read_sequences( path.join(working_dirs[0], 'reference.fa')): print name friendly_name = grace.filesystem_friendly_name(name) good = [True] * len(seq) for working_dir in working_dirs: if is_core: suffix = '-depth.userplot' else: suffix = '-ambiguous-depth.userplot' data = trivia.read_unstranded_userplot( os.path.join(working_dir, friendly_name + suffix)) assert len(seq) == len(data) for i in xrange(len(seq)): if good[i]: if is_core: good[i] = data[i] >= mincov else: good[i] = data[i] < mincov #Close holes start = -maxdiff - 1 n_holes = 0 for i in xrange(len(seq)): if good[i]: if 0 < i - start <= maxdiff: for j in xrange(start, i): good[j] = True n_holes += 1 start = i + 1 print 'Closed', grace.pretty_number(n_holes), 'holes' f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name, what)), 'wb') io.write_fasta( f, name, ''.join([(seq[i] if good[i] else 'N') for i in xrange(len(seq))])) f.close() f = open( path.join(output_dir, '%s-%s_masked.fa' % (friendly_name, what)), 'wb') io.write_fasta( f, name, ''.join([(seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq))])) f.close() f_good = open( path.join(output_dir, '%s-%s_parts.fa' % (friendly_name, what)), 'wb') f_nongood = open( path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name, what)), 'wb') start = 0 n_good = [0] n_good_bases = [0] def emit(i): if i - start < minsize: return if good[start]: n_good[0] += 1 n_good_bases[0] += i - start io.write_fasta(f_good if good[start] else f_nongood, '%s:%d..%d' % (name, start + 1, i), seq[start:i]) for i in xrange(1, len(seq)): if good[i] != good[start]: emit(i) start = i emit(len(seq)) f_nongood.close() f_good.close() print grace.pretty_number( sum(good)), 'bases are ' + what + ', of', grace.pretty_number( len(seq)), 'in reference sequence' print grace.pretty_number( n_good[0]), 'parts at least', grace.pretty_number( minsize), 'bases long with', grace.pretty_number( n_good_bases[0]), 'total bases' print
def recombination(args): grace.expect_no_further_options(args) if len(args) != 2: print >> sys.stderr, USAGE raise grace.Help_shown() working_dir, seq_name = args references = dict(io.read_sequences(os.path.join(working_dir, 'reference.fa'))) depth = { } prefixes = { } suffixes = { } for name in references: depth[name] = numpy.zeros(len(references[name]), 'int64') prefixes[name] = [ [] for base in references[name] ] suffixes[name] = [ [] for base in references[name] ] def register_divergence(hit): if not hit.query_forward: hit = hit.reversed() margin = 20 if hit.target_end - hit.target_start < 20: return False depth[hit.target_name][hit.target_start : hit.target_end] += 1 any = False if hit.query_end <= len(hit.query_seq)-margin: # and hit.target_end < len(hit.target_seq): suffixes[hit.target_name][hit.target_end-1].append( hit.query_seq[hit.query_end:] ) any = True if hit.query_start >= margin: # and hit.target_start > 0: prefixes[hit.target_name][hit.target_start].append( hit.query_seq[:hit.query_start] ) any = True return any n = 0 for (read_name, read_seq), hits in shrimp.iter_read_hits(working_dir): # Skip reads containing Ns if 'N' in read_seq: continue for line in hits: register_divergence(alignment_from_shrimp(line, references, read_name, read_seq)) n += 1 #if n > 100000: # break if n%10000 == 0: grace.status('Processing read %s' % grace.pretty_number(n)) grace.status('') def show_items(items): original_length = len(items) cut = 0 while len(items) > 80: cut += 1 items = [ item for item in items if item[0] >= cut ] for item in items: print item[1] if len(items) < original_length: print '(and %d more occurring %d times or less)' % (original_length-len(items), cut-1) def score(items): if not items: return 1.0 return float(sum( item[0] * item[0] for item in items )) / (sum( item[0] for item in items )**2) def summarize_prefixes(seqs, pad): seqs = sorted(seqs, key=lambda seq: seq[::-1]) cut = 100 while True: items = [ ] for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[-cut:]): ss = list(iterator) anylong = any( item != seq for item in ss ) n = len(ss) items.append( (n, ('%'+str(pad)+'s')%(('...' if anylong else '') + seq) + ' x %d' % n) ) if score(items) >= 1.0/20: break cut -= 1 show_items(items) def summarize_suffixes(seqs, pad): seqs = sorted(seqs) cut = 100 while True: items = [ ] for (seq, iterator) in itertools.groupby(seqs, key = lambda x: x[:cut]): ss = list(iterator) anylong = any( item != seq for item in ss ) n = len(ss) items.append( (n, ('%'+str(pad)+'s')%('%d x '%n) + seq + ('...' if anylong else '')) ) if score(items) >= 1.0/20: break cut -= 1 show_items(items) print 'Position Depth Changed prefixes Changed suffixes' print ' Count % of depth Count % of depth' for i in xrange(len(references[seq_name])): print '%8d %10d %9d %11s %9d %11s' % ( i+1, depth[seq_name][i], len(prefixes[seq_name][i]), '%.3f%%' % (len(prefixes[seq_name][i])*100.0/depth[seq_name][i]) if prefixes[seq_name][i] else '', len(suffixes[seq_name][i]), '%.3f%%' % (len(suffixes[seq_name][i])*100.0/depth[seq_name][i]) if suffixes[seq_name][i] else '') #summarize_suffixes(suffixes[name][i], references[name][i+1:], references[name], suffix_depth[name][i]) print print 'Details' print for i in xrange(len(references[seq_name])): print '%-80s*' % ('Base %d' % (i+1)) print pad_slice(references[seq_name], i-80,i+1+80) summarize_prefixes(prefixes[seq_name][i], 80) summarize_suffixes(suffixes[seq_name][i], 81) print
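# The recombination report above summarizes the changed prefixes/suffixes at
# each position by grouping them on their first (or last) `cut` characters and
# shortening `cut` until score() -- sum(n_i^2) / (sum n_i)^2 -- reaches 1/20,
# i.e. until there are at most about 20 "effective" groups.  A minimal,
# self-contained sketch of that effective-group count:

def effective_groups(counts):
    """sum(n)^2 / sum(n^2): the effective number of groups given group sizes."""
    if not counts:
        return 0.0
    total = float(sum(counts))
    return total * total / sum(n * n for n in counts)

# For example, effective_groups([10, 10]) == 2.0 while
# effective_groups([19, 1]) is about 1.1: one dominant group counts as barely
# more than one group.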
def run(self): """ <sequence> <poly-A> <adaptor> <anything> """ min_quality = chr(33+self.quality) with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \ io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file: print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched' n = 0 n_discarded = 0 n_clipped = 0 total_before = 0 total_clipped = 0 for filename in self.filenames: for name, seq, qual in io.read_sequences(filename, qualities='required'): best_score = 0 best_a_start = len(seq) best_a_end = len(seq) best_adaptor_bases = 0 best_aonly_score = 0 best_aonly_start = len(seq) best_aonly_end = len(seq) for a_start in xrange(len(seq)): if a_start and seq[a_start-1] == 'A': continue a_end = a_start aonly_score = 0 while True: if aonly_score > best_aonly_score: best_aonly_score = aonly_score best_aonly_start = a_start best_aonly_end = a_end score = aonly_score adaptor_bases = 0 for i in xrange(a_end,min(a_end+len(self.adaptor),len(seq))): if qual[i] >= min_quality: if seq[i] == self.adaptor[i-a_end]: score += 1 adaptor_bases += 1 else: score -= 4 if score > best_score: best_score = score best_a_start = a_start best_a_end = a_end best_adaptor_bases = adaptor_bases if a_end >= len(seq): break if qual[a_end] >= min_quality: if seq[a_end] == 'A': aonly_score += 1 else: aonly_score -= 4 if aonly_score <= 0: break a_end += 1 a_start = best_a_start a_end = best_a_end adaptor_bases = best_adaptor_bases aonly_start = best_aonly_start aonly_end = best_aonly_end if self.debug: # and a_end == a_start and a_end < len(seq)-10: print name print ''.join( 'X' if item<min_quality else ' ' for item in qual ) print seq print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor print ' '*aonly_start + 'A'*(aonly_end-aonly_start) print sys.stdout.flush() n += 1 total_before += len(seq) # 0 - sequence name # 1 - sequence length # 2 - poly(A) start # 3 - poly(A) end # (4 - best run of As start, for debugging the need to detect adaptor seq) # (5 - best run of As end) # 6 - number of adaptor bases matched print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq) , a_start, a_end, aonly_start, aonly_end, adaptor_bases) if a_start > self.length: if a_start < len(seq): n_clipped += 1 total_clipped += a_start print >> out_file, '@'+name print >> out_file, seq[:a_start] print >> out_file, '+' print >> out_file, qual[:a_start] else: n_discarded += 1 if n%10000 == 0: grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n)) # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)') grace.status('') self.log.datum(self.sample,'reads',n) if n: self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n) self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded) self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped) if n_clipped: self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
def run(self):
    log = self.log

    #quality_cutoff, args = grace.get_option_value(args, '--quality', int, 10)
    #qoffset, args = grace.get_option_value(args, '--qoffset', int, None)
    #clip_ambiguous, args = grace.get_option_value(args, '--clip-ambiguous', grace.as_bool, True)
    #length_cutoff, args = grace.get_option_value(args, '--length', int, 24)
    #adaptor_cutoff, args = grace.get_option_value(args, '--match', int, 10)
    #max_error, args = grace.get_option_value(args, '--max-errors', int, 1)
    #adaptor_set, args = grace.get_option_value(args, '--adaptors', str, 'truseq-adapter,truseq-srna,genomic,multiplexing,pe,srna')
    #disallow_homopolymers, args = grace.get_option_value(args, '--homopolymers', grace.as_bool, False)
    #reverse_complement, args = grace.get_option_value(args, '--revcom', grace.as_bool, False)
    #trim_start, args = grace.get_option_value(args, '--trim-start', int, 0)
    #trim_end, args = grace.get_option_value(args, '--trim-end', int, 0)
    #output_fasta, args = grace.get_option_value(args, '--fasta', grace.as_bool, False)
    #use_gzip, args = grace.get_option_value(args, '--gzip', grace.as_bool, True)
    #output_rejects, args = grace.get_option_value(args, '--rejects', grace.as_bool, False)
    #grace.expect_no_further_options(args)

    prefix = self.prefix
    log_name = os.path.split(prefix)[1]
    quality_cutoff = self.quality
    qoffset = self.qoffset
    clip_ambiguous = self.clip_ambiguous
    length_cutoff = self.length
    adaptor_cutoff = self.match
    max_error = self.max_errors
    disallow_homopolymers = self.homopolymers
    reverse_complement = self.revcom
    trim_start = self.trim_start
    trim_end = self.trim_end
    output_fasta = self.fasta
    use_gzip = self.gzip
    output_rejects = self.rejects

    iterators = [ ]
    filenames = [ ]
    any_paired = False

    for filename in self.reads:
        filenames.append(filename)
        iterators.append(itertools.izip(
            io.read_sequences(filename, qualities=True)
        ))

    for pair_filenames in self.pairs:
        assert len(pair_filenames) == 2, 'Expected a pair of files for "pairs" section.'
        filenames.extend(pair_filenames)
        any_paired = True
        iterators.append(itertools.izip(
            io.read_sequences(pair_filenames[0], qualities=True),
            io.read_sequences(pair_filenames[1], qualities=True)
        ))

    for filename in self.interleaved:
        filenames.append(filename)
        any_paired = True
        iterators.append(deinterleave(
            io.read_sequences(filename, qualities=True)
        ))

    fragment_reads = (2 if any_paired else 1)
    read_in_fragment_names = [ 'read-1', 'read-2' ] if any_paired else [ 'read' ]

    assert iterators, 'Nothing to clip'

    io.check_name_uniqueness(self.reads, self.pairs, self.interleaved)

    if qoffset is None:
        guesses = [ io.guess_quality_offset(filename) for filename in filenames ]
        assert len(set(guesses)) == 1, 'Conflicting quality offset guesses, please specify manually.'
        qoffset = guesses[0]
        log.log('FASTQ offset seems to be %d\n' % qoffset)

    quality_cutoff_char = chr(qoffset + quality_cutoff)

    #log.log('Minimum quality: %d (%s)\n' % (quality_cutoff, quality_cutoff_char))
    #log.log('Clip ambiguous bases: %s\n' % (grace.describe_bool(clip_ambiguous)))
    #log.log('Minimum adaptor match: %d bases, %d errors\n' % (adaptor_cutoff, max_error))
    #log.log('Minimum length: %d bases\n' % length_cutoff)

    adaptor_seqs = [ ]
    adaptor_names = [ ]
    if self.adaptor_clip:
        if self.adaptor_file:
            adaptor_iter = io.read_sequences(self.adaptor_file)
        else:
            adaptor_iter = ADAPTORS
        for name, seq in adaptor_iter:
            seq = seq.upper().replace('U','T')
            adaptor_seqs.append(seq)
            adaptor_names.append(name)
            adaptor_seqs.append(bio.reverse_complement(seq))
            adaptor_names.append(name)

    matcher = Matcher(adaptor_seqs, adaptor_names, max_error)

    start_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]
    end_clips = [ collections.defaultdict(list) for i in xrange(fragment_reads) ]

    if output_fasta:
        write_sequence = io.write_fasta_single_line
    else:
        write_sequence = io.write_fastq

    f_single = io.open_possibly_compressed_writer(self.reads_output_filenames()[0])
    if fragment_reads == 2:
        names = self.pairs_output_filenames()[0] if self.out_separate else self.interleaved_output_filenames()
        f_paired = map(io.open_possibly_compressed_writer, names)
    if output_rejects:
        f_reject = io.open_possibly_compressed_writer(self.rejects_output_filenames()[0])

    n_single = 0
    n_paired = 0

    n_in_single = 0
    n_in_paired = 0
    total_in_length = [ 0 ] * fragment_reads

    n_out = [ 0 ] * fragment_reads
    n_q_clipped = [ 0 ] * fragment_reads
    n_a_clipped = [ 0 ] * fragment_reads
    n_homopolymers = [ 0 ] * fragment_reads
    total_out_length = [ 0 ] * fragment_reads

    #log.attach(open(prefix + '_log.txt', 'wb'))

    for iterator in iterators:
        for fragment in iterator:
            if (n_in_single+n_in_paired) % 10000 == 0:
                grace.status('Clipping fragment %s' % grace.pretty_number(n_in_single+n_in_paired))

            if len(fragment) == 1:
                n_in_single += 1
            else:
                n_in_paired += 1

            graduates = [ ]
            rejects = [ ]
            for i, (name, seq, qual) in enumerate(fragment):
                seq = seq.upper()
                total_in_length[i] += len(seq)

                if self.trim_to:
                    seq = seq[:self.trim_to]
                    qual = qual[:self.trim_to]

                start = trim_start
                best_start = 0
                best_len = 0
                for j in xrange(len(seq)-trim_end):
                    if qual[j] < quality_cutoff_char or \
                       (clip_ambiguous and seq[j] not in 'ACGT'):
                        if best_len < j-start:
                            best_start = start
                            best_len = j-start
                        start = j + 1
                j = len(seq)-trim_end
                if best_len < j-start:
                    best_start = start
                    best_len = j-start

                clipped_seq = seq[best_start:best_start+best_len]
                clipped_qual = qual[best_start:best_start+best_len]

                if len(clipped_seq) < length_cutoff:
                    n_q_clipped[i] += 1
                    rejects.append( (name,seq,qual,'quality') )
                    continue

                match = matcher.match(clipped_seq)
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[match[0]:]
                    clipped_qual = clipped_qual[match[0]:]
                    start_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append( (name,seq,qual,'adaptor') )
                        continue

                match = matcher.match(bio.reverse_complement(clipped_seq))
                if match and match[0] >= adaptor_cutoff:
                    clipped_seq = clipped_seq[: len(clipped_seq)-match[0] ]
                    clipped_qual = clipped_qual[: len(clipped_qual)-match[0] ]
                    end_clips[i][match[0]].append( match[1][0] )
                    if len(clipped_seq) < length_cutoff:
                        n_a_clipped[i] += 1
                        rejects.append( (name,seq,qual,'adaptor') )
                        continue

                if disallow_homopolymers and len(set(clipped_seq)) <= 1:
                    n_homopolymers[i] += 1
                    rejects.append( (name,seq,qual,'homopolymer') )
                    continue

                graduates.append( (name, clipped_seq, clipped_qual) )
                n_out[i] += 1
                total_out_length[i] += len(clipped_seq)

            if output_rejects:
                for name,seq,qual,reason in rejects:
                    write_sequence(f_reject, name + ' ' + reason, seq, qual)

            if graduates:
                if reverse_complement:
                    graduates = [
                        (name, bio.reverse_complement(seq), qual[::-1])
                        for name, seq, qual in graduates
                    ]

                if len(graduates) == 1:
                    n_single += 1
                    (name, seq, qual) = graduates[0]
                    write_sequence(f_single, name, seq, qual)
                else:
                    assert len(graduates) == 2
                    n_paired += 1
                    # Write the pair to an interleaved file or separate l/r files
                    for (lr, (name, seq, qual)) in enumerate(graduates):
                        write_sequence(f_paired[lr%len(f_paired)], name, seq, qual)

    grace.status('')

    if output_rejects:
        f_reject.close()
    if fragment_reads == 2:
        map(lambda f: f.close(), f_paired)
    f_single.close()

    def summarize_clips(name, location, clips):
        total = 0
        for i in clips:
            total += len(clips[i])
        log.datum(log_name, name + ' adaptors clipped at ' + location, total)

        if not clips:
            return

        for i in xrange(min(clips), max(clips)+1):
            item = clips[i]
            log.quietly_log('%3d bases: %10d ' % (i, len(item)))
            if item:
                avg_errors = float(sum( item2[0] for item2 in item )) / len(item)
                log.quietly_log(' avg errors: %5.2f ' % avg_errors)

                counts = collections.defaultdict(int)
                for item2 in item:
                    counts[item2[1]] += 1
                #print counts
                for no in sorted(counts, key=lambda item2: counts[item2], reverse=True)[:2]:
                    log.quietly_log('%dx%s ' % (counts[no], matcher.names[no]))
                if len(counts) > 2:
                    log.quietly_log('...')
            log.quietly_log('\n')
        log.quietly_log('\n')

    if n_in_paired:
        log.datum(log_name,'read-pairs', n_in_paired)
    if n_in_single:
        log.datum(log_name,'single reads', n_in_single)

    for i in xrange(fragment_reads):
        if start_clips:
            summarize_clips(read_in_fragment_names[i], 'start', start_clips[i])

        if end_clips:
            summarize_clips(read_in_fragment_names[i], 'end', end_clips[i])

        prefix = read_in_fragment_names[i]

        log.datum(log_name, prefix + ' too short after quality clip', n_q_clipped[i])
        log.datum(log_name, prefix + ' too short after adaptor clip', n_a_clipped[i])
        if disallow_homopolymers:
            log.datum(log_name, prefix + ' homopolymers', n_homopolymers[i])
        if fragment_reads > 1:
            log.datum(log_name, prefix + ' kept', n_out[i])
        log.datum(log_name, prefix + ' average input length', float(total_in_length[i]) / (n_in_single+n_in_paired))
        if n_out[i]:
            log.datum(log_name, prefix + ' average output length', float(total_out_length[i]) / n_out[i])

    if fragment_reads == 2:
        log.datum(log_name,'pairs kept after clipping', n_paired)
    log.datum(log_name, 'reads kept after clipping', n_single)
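# Illustrative sketch (not part of the original module): the quality-clip pass above keeps
# the longest run of bases at or above quality_cutoff_char that (optionally) are unambiguous
# ACGT. A simplified standalone version of that scan, for a single read:
def _longest_good_run(seq, qual, cutoff_char, clip_ambiguous=True, trim_start=0, trim_end=0):
    start = trim_start
    best_start = 0
    best_len = 0
    for j in xrange(len(seq) - trim_end):
        bad = qual[j] < cutoff_char or (clip_ambiguous and seq[j] not in 'ACGT')
        if bad:
            if best_len < j - start:
                best_start, best_len = start, j - start
            start = j + 1
    j = len(seq) - trim_end
    if best_len < j - start:
        best_start, best_len = start, j - start
    return best_start, best_len

# e.g. with offset-33 qualities, 'I' (Q40) passes a Q10 cutoff and '#' (Q2) does not:
assert _longest_good_run('ACGTNACGTACGT', 'IIII#IIIIIIII', chr(33+10)) == (5, 8)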
def run(self):
    #mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    #maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    #minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    #what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    #is_core = (what == 'core')
    #
    #grace.expect_no_further_options(args)
    #
    #if len(args) < 2:
    #    print >> sys.stderr, HELP
    #    raise grace.Help_shown()
    #
    #output_dir, working_dirs = args[0], args[1:]
    #
    ##assert not path.exists(path.join(output_dir, 'reference.fa')), \
    #assert not path.exists(path.join(output_dir, 'parameters')), \
    #    'Output directory not given'
    #
    #if not path.exists(output_dir):
    #    os.mkdir(output_dir)

    assert self.what in ('core', 'unique'), 'Expected --what to be either "core" or "unique".'
    is_core = (self.what == 'core')

    workspace = self.get_workspace()

    for name, seq in io.read_sequences(
            working_directory.Working(self.working_dirs[0]).get_reference().reference_fasta_filename()):
        self.log.log(name + '\n')
        friendly_name = grace.filesystem_friendly_name(name)

        good = [ True ] * len(seq)

        for working_dir in self.working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name + suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= self.mincov
                    else:
                        good[i] = data[i] < self.mincov

        #Close holes
        start = -self.maxdiff - 1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i - start <= self.maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i + 1
        self.log.log('Closed ' + grace.pretty_number(n_holes) + ' holes\n')

        f = open(workspace / ('%s-%s.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(workspace / ('%s-%s_masked.fa' % (friendly_name, self.what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(workspace / ('%s-%s_parts.fa' % (friendly_name, self.what)), 'wb')
        f_nongood = open(workspace / ('%s-non%s_parts.fa' % (friendly_name, self.what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]
        def emit(i):
            if i - start < self.minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i - start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start + 1, i),
                seq[start:i]
            )
        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        self.log.log(
            grace.pretty_number(sum(good)) + ' bases are ' + self.what + ', of ' +
            grace.pretty_number(len(seq)) + ' in reference sequence\n')
        self.log.log(
            grace.pretty_number(n_good[0]) + ' parts at least ' +
            grace.pretty_number(self.minsize) + ' bases long with ' +
            grace.pretty_number(n_good_bases[0]) + ' total bases\n')
        self.log.log('\n')
def bam_iter_fragments(filename, status_text='Processing'):
    reader = Bam_reader(filename)

    n = 0
    n_ambiguous = 0
    for read_name, alignment_iter in itertools.groupby(reader, lambda read: read.qname):
        if n % 100000 == 0:
            grace.status(status_text + ' fragment %s' % grace.pretty_number(n))
        n += 1

        unpaired = [ ]
        first = [ ]
        second = [ ]
        unmapped = [ ]
        for al in alignment_iter:
            if al.flag&FLAG_UNMAPPED:
                unmapped.append(al)
            elif not al.flag&FLAG_PAIRED or al.flag&FLAG_MATE_UNMAPPED:
                unpaired.append((al,))
            elif al.flag&FLAG_FIRST:
                first.append(al)
            elif al.flag&FLAG_SECOND:
                second.append(al)
            else:
                assert False, 'Read in pair that is neither first nor second'

        pairs = [ ]
        unused = set(first + second)

        second_index = { }
        for al in second:
            key = (al.rname, al.pos)
            if key not in second_index:
                second_index[key] = [ ]
            second_index[key].append(al)

        for al1 in first:
            key = (al1.get_mrnm(), al1.mpos)
            for al2 in second_index.get(key, ()):
                if al2.get_mrnm() != al1.rname or \
                   al2.mpos != al1.pos:
                    continue

                if al1 not in unused or al2 not in unused:
                    # pfh says: I am displeased that the pairing is sometimes ambiguous
                    n_ambiguous += 1
                    continue

                pairs.append( (al1, al2) )
                unused.remove(al1)
                unused.remove(al2)

        if unused:
            print unused
        assert not unused, 'Alignment pairing not even pretending to make sense. Is the BAM file sorted by read name?'

        yield read_name, pairs + unpaired, unmapped

    grace.status('')

    if n_ambiguous:
        print >> sys.stderr
        print >> sys.stderr, 'The alignment pairing was unclear %s times, and alignments were paired arbitrarily.' % grace.pretty_number(n_ambiguous)
        print >> sys.stderr, 'Blame the SAM format.'
        print >> sys.stderr
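# Minimal usage sketch (not part of the original module). Assumes a BAM file sorted by read
# name, as the assertion above requires; the helper name is hypothetical.
def _count_fragments(bam_filename):
    n_fragments = 0
    n_paired = 0
    for read_name, alignments, unmapped in bam_iter_fragments(bam_filename, 'Counting'):
        n_fragments += 1
        # Each element of `alignments` is either (al1, al2) for a paired alignment
        # or (al,) for an alignment treated as unpaired.
        n_paired += sum(1 for item in alignments if len(item) == 2)
    return n_fragments, n_paired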
def run(self):
    """
    <sequence> <poly-A> <adaptor> <anything>
    """
    clip_quality = chr(33+self.clip_quality)
    #ignore_quality = chr(33+self.ignore_quality)

    with io.open_possibly_compressed_writer(self.prefix+'.fastq.gz') as out_file, \
         io.open_possibly_compressed_writer(self.prefix+'.clips.gz') as out_clips_file:
        print >> out_clips_file, '#Read\tread length\tpoly-A start\tpoly-A end\tpoly-A start, ignoring adaptor\tpoly-A end, ignoring adaptor\tadaptor bases matched'

        n = 0
        n_discarded = 0
        n_clipped = 0
        total_before = 0
        total_clipped = 0

        for filename in self.filenames:
            for name, seq, qual in io.read_sequences(filename, qualities='required'):
                # "Good quality" sequence ends at the first low quality base
                #good_quality_end = 0
                #while good_quality_end < len(seq) and qual[good_quality_end] >= clip_quality:
                #    good_quality_end += 1

                goodness_score = 0
                best_goodness_score = 0
                good_quality_end = 0
                i = 0
                while True:
                    if goodness_score > best_goodness_score:
                        best_goodness_score = goodness_score
                        good_quality_end = i

                    if i >= len(seq):
                        break

                    if qual[i] >= clip_quality:
                        goodness_score += 1
                    else:
                        goodness_score -= 9
                    i += 1

                best_score = self.min_score-1
                best_a_start = good_quality_end
                best_a_end = good_quality_end
                best_adaptor_bases = 0
                best_aonly_score = 0
                best_aonly_start = good_quality_end
                best_aonly_end = good_quality_end

                # Consider each possible start position for the poly(A)
                for a_start in xrange(len(seq)):
                    if a_start and seq[a_start-1] == 'A': continue

                    # Consider each possible end position for the poly(A)
                    a_end = a_start
                    aonly_score = 0
                    while True:
                        if aonly_score > best_aonly_score:
                            best_aonly_score = aonly_score
                            best_aonly_start = a_start
                            best_aonly_end = a_end

                        # The poly(A) should be followed by adaptor,
                        ## at least until the end of good quality sequence.
                        # However if there is evidence of the adaptor beyond
                        # the end of good quality, we still want to know that,
                        # and count it towards the number of adaptor bases present.
                        score = aonly_score
                        adaptor_bases = 0
                        i = a_end
                        abort_score = best_score-len(self.adaptor)
                        abort_i = min(len(seq), a_end+len(self.adaptor))
                        while score >= abort_score:
                            #if (score > best_score and
                            #    (i >= good_quality_end or i >= a_end+len(self.adaptor))):
                            if score > best_score:
                                best_score = score
                                best_a_start = a_start
                                best_a_end = a_end
                                best_adaptor_bases = adaptor_bases

                            if i >= abort_i:
                                break

                            if seq[i] == self.adaptor[i-a_end]:
                                score += 1
                                adaptor_bases += 1
                            else:
                                score -= 4
                            i += 1

                        #if a_end >= len(seq): break

                        # Modified 2018-03-21
                        # poly(A) tail only within good quality region.
                        #if a_end >= good_quality_end: break
                        #if qual[a_end] >= ignore_quality:
                        #    if seq[a_end] == 'A':
                        #        aonly_score += 1
                        #    else:
                        #        aonly_score -= 4
                        #        if aonly_score <= 0: break

                        if a_end >= len(seq): break

                        if seq[a_end] == 'A':
                            aonly_score += 1
                        else:
                            #if qual[a_end] >= ignore_quality:
                            aonly_score -= 4
                            #else:
                            #    aonly_score -= 1

                        a_end += 1

                # 2018-03-21
                # Look for tail starting after good quality,
                # however don't call a tail if starts after good quality
                if best_a_start > good_quality_end:
                    best_a_start = good_quality_end
                    best_a_end = good_quality_end
                    best_adaptor_bases = 0
                    best_score = 0

                a_start = best_a_start
                a_end = best_a_end
                adaptor_bases = best_adaptor_bases
                aonly_start = best_aonly_start
                aonly_end = best_aonly_end

                if self.debug: # and a_end == a_start and a_end < len(seq)-10:
                    print name
                    print ''.join( ('C' if item<clip_quality else ' ') for item in qual )
                    print '-' * good_quality_end
                    print seq
                    print ' '*a_start + 'A'*(a_end-a_start) + self.adaptor + ".%d %d"%(adaptor_bases,best_score)
                    #print ' '*aonly_start + 'A'*(aonly_end-aonly_start) + "."
                    print
                    sys.stdout.flush()

                n += 1
                total_before += len(seq)

                # 0 - sequence name
                # 1 - sequence length
                # 2 - poly(A) start
                # 3 - poly(A) end
                # (4 - best run of As start, for debugging the need to detect adaptor seq)
                # (5 - best run of As end)
                # 6 - number of adaptor bases matched
                print >> out_clips_file, '%s\t%d\t%d\t%d\t%d\t%d\t%d' % (name, len(seq), a_start, a_end, aonly_start, aonly_end, adaptor_bases)

                if a_start >= self.length:
                    if a_start < len(seq):
                        n_clipped += 1
                        total_clipped += a_start

                    print >> out_file, '@'+name
                    print >> out_file, seq[:a_start]
                    print >> out_file, '+'
                    print >> out_file, qual[:a_start]
                else:
                    n_discarded += 1

                if n%10000 == 0:
                    grace.status('Clip-runs ' + self.sample + ' ' + grace.pretty_number(n))
                    # + ' (' + grace.pretty_number(len(dstates)) + ' dstates)')

                # Option to do a quick subsample
                if self.only and self.only <= n:
                    break

        grace.status('')

        self.log.datum(self.sample,'reads',n)
        if n:
            self.log.datum(self.sample,'mean length before poly-A/adaptor clipping',float(total_before)/n)
        self.log.datum(self.sample,'reads discarded as too short after poly-A/adaptor clipping',n_discarded)
        self.log.datum(self.sample,'reads poly-A/adaptor clipped and kept',n_clipped)
        if n_clipped:
            self.log.datum(self.sample,'mean length clipped',float(total_clipped)/n_clipped)
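# Illustrative sketch (not part of the original module): the good_quality_end scan above
# picks the prefix length maximizing (#bases at/above clip_quality) - 9*(#bases below), so
# the "good" region only extends past a low-quality base if enough high-quality bases
# follow to pay for it. A simplified standalone version:
def _good_quality_end(qual, clip_quality_char):
    goodness = 0
    best = 0
    end = 0
    for i, q in enumerate(qual):
        if q >= clip_quality_char:
            goodness += 1
        else:
            goodness -= 9
        if goodness > best:
            best = goodness
            end = i + 1
    return end

# With offset-33 qualities and a Q20 cutoff, one bad base ('#') is crossed only if
# at least ten good bases follow it:
assert _good_quality_end('IIII#IIIIIIIIII', chr(33+20)) == 15
assert _good_quality_end('III####', chr(33+20)) == 3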
def main(args):
    mincov, args = grace.get_option_value(args, '--mincov', int, 1)
    maxdiff, args = grace.get_option_value(args, '--maxdiff', int, 16)
    minsize, args = grace.get_option_value(args, '--minsize', int, 200)
    what, args = grace.get_option_value(args, '--what', as_core_or_unique, 'core')
    is_core = (what == 'core')

    grace.expect_no_further_options(args)

    if len(args) < 2:
        print >> sys.stderr, HELP
        raise grace.Help_shown()

    output_dir, working_dirs = args[0], args[1:]

    assert not path.exists(path.join(output_dir, 'reference.fa')), \
        'Output directory not given'

    if not path.exists(output_dir):
        os.mkdir(output_dir)

    for name, seq in io.read_sequences(path.join(working_dirs[0], 'reference.fa')):
        print name
        friendly_name = grace.filesystem_friendly_name(name)

        good = [ True ] * len(seq)

        for working_dir in working_dirs:
            if is_core:
                suffix = '-depth.userplot'
            else:
                suffix = '-ambiguous-depth.userplot'
            data = trivia.read_unstranded_userplot(
                os.path.join(working_dir, friendly_name+suffix)
            )
            assert len(seq) == len(data)
            for i in xrange(len(seq)):
                if good[i]:
                    if is_core:
                        good[i] = data[i] >= mincov
                    else:
                        good[i] = data[i] < mincov

        #Close holes
        start = -maxdiff-1
        n_holes = 0
        for i in xrange(len(seq)):
            if good[i]:
                if 0 < i-start <= maxdiff:
                    for j in xrange(start, i):
                        good[j] = True
                    n_holes += 1
                start = i+1
        print 'Closed', grace.pretty_number(n_holes), 'holes'

        f = open(path.join(output_dir, '%s-%s.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else 'N') for i in xrange(len(seq)) ])
        )
        f.close()

        f = open(path.join(output_dir, '%s-%s_masked.fa' % (friendly_name,what)), 'wb')
        io.write_fasta(
            f, name,
            ''.join([ (seq[i] if good[i] else seq[i].lower()) for i in xrange(len(seq)) ])
        )
        f.close()

        f_good = open(path.join(output_dir, '%s-%s_parts.fa' % (friendly_name,what)), 'wb')
        f_nongood = open(path.join(output_dir, '%s-non%s_parts.fa' % (friendly_name,what)), 'wb')
        start = 0
        n_good = [0]
        n_good_bases = [0]
        def emit(i):
            if i-start < minsize: return
            if good[start]:
                n_good[0] += 1
                n_good_bases[0] += i-start
            io.write_fasta(
                f_good if good[start] else f_nongood,
                '%s:%d..%d' % (name, start+1, i),
                seq[start:i]
            )
        for i in xrange(1, len(seq)):
            if good[i] != good[start]:
                emit(i)
                start = i
        emit(len(seq))
        f_nongood.close()
        f_good.close()

        print grace.pretty_number(sum(good)), 'bases are '+what+', of', grace.pretty_number(len(seq)), 'in reference sequence'
        print grace.pretty_number(n_good[0]), 'parts at least', grace.pretty_number(minsize), 'bases long with', grace.pretty_number(n_good_bases[0]), 'total bases'
        print
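# Illustrative sketch (not part of the original module): the "close holes" pass above fills
# any gap of at most maxdiff consecutive non-good bases that is flanked by good bases on
# both sides. The same logic on a plain list of booleans:
def _close_holes(good, maxdiff):
    good = list(good)
    start = -maxdiff - 1          # position just past the previous good base
    n_holes = 0
    for i in xrange(len(good)):
        if good[i]:
            if 0 < i - start <= maxdiff:
                for j in xrange(start, i):
                    good[j] = True
                n_holes += 1
            start = i + 1
    return good, n_holes

assert _close_holes([True, False, False, True], 2) == ([True, True, True, True], 1)
assert _close_holes([True, False, False, True], 1) == ([True, False, False, True], 0)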
def make_bigwig(prefix, bam_filenames, make_spanner, fragments=False, stop_after=None, scale=1.0, polya=False):
    have_pysam = False
    try:
        import pysam
        have_pysam = True
    except ImportError:
        pass

    #alf = pysam.AlignmentFile(bam_filenames[0])
    #header = alf.header
    header = sam.parsed_bam_headers(bam_filenames[0])

    with open(prefix + "-chrom.sizes", "wb") as f:
        for entry in header["SQ"]:
            f.write("{}\t{}\n".format(entry["SN"], entry["LN"]))

    chrom_names = [ entry["SN"] for entry in header["SQ"] ]
    chrom_sizes = [ int(entry["LN"]) for entry in header["SQ"] ]

    #alf.close()

    forward = dict([ (i, Piler(j)) for i, j in zip(chrom_names, chrom_sizes) ])
    reverse = dict([ (i, Piler(j)) for i, j in zip(chrom_names, chrom_sizes) ])

    old = grace.status("Bigwig")

    for filename in bam_filenames:
        if have_pysam:
            alf = pysam.AlignmentFile(filename)
        else:
            alf = sam.Bam_reader(filename)

        n = 0

        if not fragments:
            for item in alf:
                if item.is_unmapped or item.is_secondary or item.is_supplementary:
                    continue

                if polya and not alignment_is_polya(item):
                    continue

                # Assume --> <-- oriented read pairs
                which = forward if bool(item.is_reverse) == bool(item.is_read2) else reverse
                which[item.reference_name].add(make_spanner(item))

                n += 1
                if stop_after is not None and n > stop_after:
                    break
                if n % 1000000 == 0:
                    grace.status(os.path.basename(prefix) + " " + filename + " " + grace.pretty_number(n))
        else:
            for item in iter_fragments(alf):
                if polya and not any(alignment_is_polya(al) for al in item):
                    continue

                # Assume --> <-- oriented read pairs
                which = forward if bool(item[0].is_reverse) == bool(item[0].is_read2) else reverse
                which[item[0].reference_name].add(make_spanner(item))

                n += 1
                if stop_after is not None and n > stop_after:
                    break
                if n % 1000000 == 0:
                    grace.status(os.path.basename(prefix) + " " + filename + " " + grace.pretty_number(n))

        if have_pysam:
            alf.close()

    bedgraph(
        prefix + "-fwd.bedgraph",
        zip(chrom_names, [ scale_spanner(scale, forward[item].get()) for item in chrom_names ]))
    subprocess.check_call([
        "wigToBigWig",
        prefix + "-fwd.bedgraph",
        prefix + "-chrom.sizes",
        prefix + "-fwd.bw"])
    os.unlink(prefix + "-fwd.bedgraph")

    bedgraph(
        prefix + "-rev.bedgraph",
        zip(chrom_names, [ scale_spanner(scale, reverse[item].get()) for item in chrom_names ]))
    subprocess.check_call([
        "wigToBigWig",
        prefix + "-rev.bedgraph",
        prefix + "-chrom.sizes",
        prefix + "-rev.bw"])
    os.unlink(prefix + "-rev.bedgraph")

    os.unlink(prefix + "-chrom.sizes")

    grace.status(old)
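# Usage note (not part of the original module): the conversion above shells out to the UCSC
# "wigToBigWig" binary, which must be on the PATH; each strand is written as a temporary
# bedGraph plus a chrom.sizes file and then converted. A minimal standalone equivalent of
# that single step (subprocess is already used above; the helper name is hypothetical):
def _bedgraph_to_bigwig(bedgraph_filename, chrom_sizes_filename, bigwig_filename):
    # wigToBigWig accepts bedGraph input: wigToBigWig in.bedGraph chrom.sizes out.bw
    subprocess.check_call(['wigToBigWig', bedgraph_filename, chrom_sizes_filename, bigwig_filename])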