def write_read_introns_from_sam_stream(sam_stream, output_stream,
                                       retrieved_intron_counts,
                                       instance=False):
    """ Writes output that maps QNAMES to exon-exon junctions overlapped.

        sam_stream: where to find retrieved alignments in SAM form
        output_stream: where to write output. Each line takes the form:
            <read name><TAB>RNAME<TAB><sorted list of intron starts and ends>
            <TAB>['r' for 'retrieved']
        retrieved_intron_counts: defaultdict(int) that counts number of
            retrieved alignments overlapping exon-exon junction
        instance: if True, write one line per read listing all overlapped
            junctions; otherwise, write one line per overlapped junction

        No return value.
    """
    for line in sam_stream:
        if line[0] == '@':
            # Skip SAM header lines
            continue
        try:
            tokens = line.strip().split('\t')
            flag = int(tokens[1])
            if flag & 4:
                # Unmapped alignment; nothing retrieved
                continue
            name = tokens[0]
            rname = tokens[2]
            cigar = tokens[5]
            pos = int(tokens[3])
            seq = tokens[9]
            # (fixed: flag was redundantly re-parsed from tokens[1] here)
            if 'N' not in cigar or flag & 256:
                # No junctions in CIGAR, or secondary alignment
                continue
            _, _, introns, _, _ = indels_junctions_exons_mismatches(
                    cigar, dummy_md_index(cigar), pos, seq
                )
            # Keep only (start, end) of each junction
            introns = [intron[:2] for intron in introns]
            # Encode each junction as RNAME;start;end
            introns = [rname
                        + ';'.join([''] + [str(bound) for bound in intron])
                       for intron in sorted(list(introns))]
            if instance:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                print >>output_stream, '%s\t%s\tr' % (
                        name, '\t'.join(introns)
                    )
            else:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                    print >>output_stream, '%s;%s\t%s\tr' % (
                            name, intron, intron
                        )
        except IndexError:
            print >>sys.stderr, ('Error found on line: ' + line)
            raise
def write_read_introns_from_sam_stream(sam_stream, output_stream,
                                       retrieved_intron_counts,
                                       instance=False):
    """ Writes output mapping QNAMEs to the exon-exon junctions overlapped.

        sam_stream: source of retrieved alignments in SAM form
        output_stream: destination for output; each line takes the form:
            <read name><TAB>RNAME<TAB><sorted list of intron starts and ends>
            <TAB>['r' for 'retrieved']
        retrieved_intron_counts: defaultdict(int) counting the number of
            retrieved alignments overlapping each exon-exon junction

        No return value.
    """
    for sam_line in sam_stream:
        if sam_line[0] == "@":
            # Header line; nothing to do
            continue
        try:
            fields = sam_line.strip().split("\t")
            flag = int(fields[1])
            if flag & 4:
                # Unmapped alignment
                continue
            qname, rname = fields[0], fields[2]
            pos, cigar, seq = int(fields[3]), fields[5], fields[9]
            flag = int(fields[1])
            if flag & 256 or "N" not in cigar:
                # Secondary alignment, or no junctions in CIGAR
                continue
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                    cigar, dummy_md_index(cigar), pos, seq
                )
            # Reduce each junction to its (start, end) pair, then label it
            # as RNAME;start;end in sorted order
            junction_pairs = sorted([junction[:2] for junction in junctions])
            junction_labels = [
                    rname + ";".join([""] + [str(bound) for bound in pair])
                    for pair in junction_pairs
                ]
            if instance:
                for label in junction_labels:
                    retrieved_intron_counts[label] += 1
                print >>output_stream, "%s\t%s\tr" % (
                        qname, "\t".join(junction_labels)
                    )
            else:
                for label in junction_labels:
                    retrieved_intron_counts[label] += 1
                    print >>output_stream, "%s;%s\t%s\tr" % (
                            qname, label, label
                        )
        except IndexError:
            print >>sys.stderr, ("Error found on line: " + sam_line)
            raise
def go(true_bed_stream, sam_stream=sys.stdin, generous=False,
       base_threshold=0.5, clip_threshold=1.0, dump_incorrect=False,
       temp_dir=None, ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        temp_dir: directory in which to create the temporary working
            directory, or None for the system default
        ignore_spliced_reads: ignores all spliced reads

        Return value: (basewise_retrieved, basewise_relevant,
            basewise_intersection, read_retrieved, read_relevant,
            read_intersection)
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        except OSError:
            # Requested temp_dir unusable; fall back to system default
            # (was a bare except)
            temp_dir_path = tempfile.mkdtemp()
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort it on read name
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    with open(combined_file, 'w') as temp_stream:
        if ignore_spliced_reads:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]:
                        continue # skip intron line
                    # tokens[3][:-2] strips the /1 or /2 mate suffix
                    print >>temp_stream, '\t'.join(
                            [tokens[3][:-2], '0'] + tokens[:3] + tokens[4:]
                        )
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]:
                        continue # skip intron line
                    print >>temp_stream, '\t'.join(
                            [tokens[3], '0'] + tokens[:3] + tokens[4:]
                        )
            for line in sam_stream:
                if line[0] == '@' or not line.strip():
                    continue
                tokens = line.strip().split('\t')
                if 'N' in tokens[5]:
                    continue # skip intron line
                print >>temp_stream, '\t'.join(
                        [tokens[0], '1'] + tokens[1:]
                    )
        else:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join(
                            [tokens[3][:-2], '0'] + tokens[:3] + tokens[4:]
                        )
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join(
                            [tokens[3], '0'] + tokens[:3] + tokens[4:]
                        )
            for line in sam_stream:
                if line[0] == '@' or not line.strip():
                    continue
                tokens = line.strip().split('\t')
                print >>temp_stream, '\t'.join(
                        [tokens[0], '1'] + tokens[1:]
                    )
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
    # Sort on read name, then on the 0/1 discriminator so true (BED) lines
    # precede aligner (SAM) lines within each partition. Paths come from
    # tempfile/os.path, so shell interpolation here is on trusted strings.
    subprocess.check_call(' '.join(
            ['sort -T %s -k1,1 -k2,2n' % temp_dir_path, combined_file,
             '>', sorted_combined_file]
        ), bufsize=-1, shell=True)
    basewise_relevant, read_relevant = 0, 0
    # Initialize counters for computing accuracy metrics
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        for (name,), xpartition in xstream(sorted_combined_stream, 1):
            # true_maps: list of true alignments, each a list of exon tuples
            # (chrom, 1-based start, 1-based end)
            true_maps = []
            saved = []
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    # True (BED-derived) line
                    if len(tokens) < 12:
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    # (fixed: unused chrom_end local removed)
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
                    exons = [(chrom,
                                chrom_start + int(block_starts[i]),
                                chrom_start + int(block_starts[i])
                                    + int(block_sizes[i]))
                             for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum(
                            [int(block_size) for block_size in block_sizes]
                        )
                    read_relevant += 1
                elif tokens[0] == '1':
                    # Aligner (SAM-derived) line
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        # Secondary alignment or unmapped and thus not
                        # retrieved; ignore
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
                    (dummy_md, mapped, unmapped, clip_count, read_length) \
                        = dummy_md_and_mapped_offsets(
                                cigar, clip_threshold=clip_threshold
                            )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            # chr is wrong, but this is still counted as a
                            # retrieval above
                            continue
                        base_counter, base_truths = 0, set()
                        # Each tuple in base_truths is
                        # (index of base in read, mapped location)
                        for block in true_map:
                            base_truths.update(
                                    [(base_counter + i, j + 1)
                                     for i, j in enumerate(
                                            xrange(block[1], block[2])
                                        )]
                                )
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        # (fixed: dead duplicate "if unmapped: continue"
                        # removed -- unmapped was already checked above and
                        # cannot change within this loop)
                        _, _, _, exons, _ = indels_junctions_exons_mismatches(
                                cigar, dummy_md, pos, seq,
                                drop_deletions=True
                            )
                        mapped_index = 0
                        for exon in exons:
                            base_predictions.update(
                                    [(mapped[mapped_index + i], j)
                                     for i, j in enumerate(
                                            xrange(exon[0], exon[1])
                                        )]
                                )
                            mapped_index += exon[1] - exon[0]
                        intersected_base_count = max(
                                intersected_base_count,
                                len(base_predictions.intersection(
                                        base_truths
                                    ))
                            )
                    basewise_intersection += intersected_base_count
                    if intersected_base_count \
                        >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        print >>sys.stderr, '\t'.join(
                                ['.'.join(line) for line in saved]
                            )
                else:
                    raise RuntimeError('Invalid intermediate line.')
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)
def go(input_stream=sys.stdin, output_stream=sys.stdout, fudge=5,
       stranded=False, verbose=False, max_refs=300, report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the
        transcript fragment index are used to infer which cojunctions could
        possibly be overlapped by reads. Then maximal cliques of the graph
        described in the maximal_cliques() function are enumerated to
        obtain which junction combinations could possibly be overlapped by
        reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically,
            this is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes to
            accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-'
            indicating the sense strand. Further, if stranded is True, an
            alignment is returned only if its strand agrees with the
            junction's strand.
        verbose: True if alignments should occasionally be written to
            stderr.
        max_refs: maximum number of reference sequences to enumerate per
            read; if more are present, prioritize those sequences that
            overlap the fewest junctions
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.

        No return value.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname,), xpartition in xstream(input_stream, 1):
        # While labeled multireadlet, this list may end up simply a
        # unireadlet
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int((next_report_line + 1)
                                        * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname,) + tokens)
        if flag & 4:
            # Read unmapped; skip partition
            continue
        cojunctions, all_junctions = defaultdict(set), {}
        for alignment in multiread_with_junctions(multiread, stranded):
            cigar = alignment[5]
            md = [field for field in alignment
                    if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            # Canonicalize the read sequence: emit the lexicographic
            # minimum of seq and its reversed complement
            reversed_complement_seq = seq[::-1].translate(
                    _reversed_complement_translation_table
                )
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                        if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_junctions:
                all_junctions[(rname, sense)] = defaultdict(list)
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                    cigar, md, pos, seq, junctions_only=True
                )
            cojunctions[(rname, sense)].add(
                    tuple([(junction[0], junction[1])
                            for junction in junctions])
                )
            # Record per-junction maximal left/right extend sizes
            for junction in junctions:
                if (junction[0], junction[1]) \
                    not in all_junctions[(rname, sense)]:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ] = [junction[2], junction[3]]
                else:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][0] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][0], junction[2])
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][1] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][1], junction[3])
        for rname, sense in all_junctions:
            to_write = set()
            # NOTE(review): seq, seq_size, and seq_to_print here retain the
            # values from the last alignment processed above -- confirm
            # this is intended when a read has multiple alignments
            for cojunction in selected_cojunctions(
                    paths_from_cojunctions(
                            list(cojunctions[(rname, sense)]),
                            span=(seq_size + fudge)
                        ),
                    max_refs=max_refs, seq=seq, rname=rname, sense=sense
                ):
                left_extend_size = all_junctions[(rname, sense)][
                        cojunction[0]
                    ][0]
                right_extend_size = all_junctions[(rname, sense)][
                        cojunction[-1]
                    ][1]
                to_write.add(('{rname}{sense}\t{starts}'
                              '\t{ends}\t{left_size}'
                              '\t{right_size}\t{seq}').format(
                                rname=rname, sense=sense,
                                starts=','.join(
                                        [str(junction[0])
                                         for junction in cojunction]
                                    ),
                                ends=','.join(
                                        [str(junction[1])
                                         for junction in cojunction]
                                    ),
                                left_size=(left_extend_size + fudge),
                                right_size=(right_extend_size + fudge),
                                seq=seq_to_print
                            ))
            counter.add('paths_out', len(to_write))
            for line_to_write in to_write:
                # Fixed: write to output_stream as documented instead of
                # printing unconditionally to stdout
                print >>output_stream, line_to_write
                output_line_count += 1
    output_stream.flush()
    print >>sys.stderr, (
            'cojunction_enum_delegate.py reports %d output lines.'
            % output_line_count
        )
def go(true_bed_stream, sam_stream=sys.stdin, generous=False,
       base_threshold=0.5, clip_threshold=1.0, dump_incorrect=False,
       temp_dir=None, ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        temp_dir: directory in which to create the temporary working
            directory, or None for the system default
        ignore_spliced_reads: ignores all spliced reads

        Return value: (basewise_retrieved, basewise_relevant,
            basewise_intersection, read_retrieved, read_relevant,
            read_intersection)
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        except:
            # NOTE(review): bare except -- any failure creating the
            # directory in temp_dir falls back to the system default
            temp_dir_path = tempfile.mkdtemp()
    #print >>sys.stderr, temp_dir_path
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort it on read name
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    with open(combined_file, 'w') as temp_stream:
        if ignore_spliced_reads:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip intron line
                    # tokens[3][:-2] strips the /1 or /2 mate suffix
                    print >> temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3]
                                                    + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip intron line
                    print >> temp_stream, '\t'.join([tokens[3], '0']
                                                    + tokens[:3]
                                                    + tokens[4:])
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                if 'N' in tokens[5]: continue # skip intron line
                print >> temp_stream, '\t'.join([tokens[0], '1']
                                                + tokens[1:])
        else:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >> temp_stream, '\t'.join([tokens[3][:-2],
                                                     '0'] + tokens[:3]
                                                    + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >> temp_stream, '\t'.join([tokens[3], '0']
                                                    + tokens[:3]
                                                    + tokens[4:])
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                print >> temp_stream, '\t'.join([tokens[0], '1']
                                                + tokens[1:])
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path,
                                        'combined.sorted.temp')
    # Sort on read name, then on the 0/1 discriminator so true (BED)
    # lines precede aligner (SAM) lines within each partition
    subprocess.check_call(' '.join([
        'sort -T %s -k1,1 -k2,2n' % temp_dir_path, combined_file, '>',
        sorted_combined_file
    ]), bufsize=-1, shell=True)
    basewise_relevant, read_relevant = 0, 0
    # Initialize counters for computing accuracy metrics
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        for (name, ), xpartition in xstream(sorted_combined_stream, 1):
            '''Dict mapping read names to alignments
            (chrom, 1-based start, 1-based end)'''
            true_maps = []
            saved = []
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    # True (BED-derived) line for this read
                    if len(tokens) < 12:
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    # NOTE(review): chrom_end is computed but never used
                    chrom_end = int(tokens[3])
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
                    exons = [(chrom, chrom_start + int(block_starts[i]),
                              chrom_start + int(block_starts[i])
                              + int(block_sizes[i]))
                             for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum(
                        [int(block_size) for block_size in block_sizes])
                    read_relevant += 1
                elif tokens[0] == '1':
                    # Aligner (SAM-derived) line for this read
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        '''Secondary alignment or unmapped and thus not
                        retrieved; ignore'''
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
                    (dummy_md, mapped, unmapped, clip_count,
                        read_length) \
                        = dummy_md_and_mapped_offsets(
                            cigar, clip_threshold=clip_threshold
                        )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            '''chr is wrong, but this is still counted as a
                            retrieval above'''
                            continue
                        base_counter, base_truths = 0, set()
                        '''Each tuple in base_truths is
                        (index of base in read, mapped location)'''
                        for block in true_map:
                            base_truths.update([(base_counter + i, j + 1)
                                                for i, j in enumerate(
                                                    xrange(block[1],
                                                           block[2]))
                                                ])
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        # NOTE(review): unmapped was already checked above
                        # and cannot change here; this branch is dead code
                        if unmapped:
                            # Too much clipping
                            continue
                        _, _, _, exons, _ = indels_junctions_exons_mismatches(
                            cigar, dummy_md, pos, seq, drop_deletions=True)
                        mapped_index = 0
                        for exon in exons:
                            base_predictions.update([
                                (mapped[mapped_index + i], j)
                                for i, j in enumerate(xrange(exon[0],
                                                             exon[1]))
                            ])
                            mapped_index += exon[1] - exon[0]
                        intersected_base_count = max(
                            intersected_base_count,
                            len(base_predictions.intersection(base_truths)))
                    basewise_intersection += intersected_base_count
                    if intersected_base_count >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        print >> sys.stderr, '\t'.join(
                            ['.'.join(line) for line in saved])
                else:
                    raise RuntimeError('Invalid intermediate line.')
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)
def go(input_stream=sys.stdin, output_stream=sys.stdout, fudge=5,
       stranded=False, verbose=False, max_refs=300, report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the
        transcript fragment index are used to infer which cojunctions could
        possibly be overlapped by reads. Then maximal cliques of the graph
        described in the maximal_cliques() function are enumerated to
        obtain which junction combinations could possibly be overlapped by
        reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically,
            this is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes to
            accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-'
            indicating the sense strand. Further, if stranded is True, an
            alignment is returned only if its strand agrees with the
            junction's strand.
        verbose: True if alignments should occasionally be written to
            stderr.
        max_refs: maximum number of reference sequences to enumerate per
            read; if more are present, prioritize those sequences that
            overlap the fewest junctions
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.

        No return value.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname,), xpartition in xstream(input_stream, 1):
        # While labeled multireadlet, this list may end up simply a
        # unireadlet
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int((next_report_line + 1)
                                        * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname,) + tokens)
        if flag & 4:
            # Read unmapped; skip partition
            continue
        corrected_multiread = multiread_with_junctions(multiread, stranded)
        cojunctions, all_junctions = defaultdict(set), {}
        # Fixed: iterate over corrected_multiread, which was previously
        # assigned but unused while multiread_with_junctions() was
        # redundantly called a second time here
        for alignment in corrected_multiread:
            cigar = alignment[5]
            md = [field for field in alignment
                    if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            # Canonicalize the read sequence: emit the lexicographic
            # minimum of seq and its reversed complement
            reversed_complement_seq = seq[::-1].translate(
                    _reversed_complement_translation_table
                )
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                        if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_junctions:
                all_junctions[(rname, sense)] = defaultdict(list)
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                    cigar, md, pos, seq, junctions_only=True
                )
            cojunctions[(rname, sense)].add(
                    tuple([(junction[0], junction[1])
                            for junction in junctions])
                )
            # Record per-junction maximal left/right extend sizes
            for junction in junctions:
                if (junction[0], junction[1]) \
                    not in all_junctions[(rname, sense)]:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ] = [junction[2], junction[3]]
                else:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][0] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][0], junction[2])
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][1] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][1], junction[3])
        for rname, sense in all_junctions:
            to_write = set()
            # NOTE(review): seq, seq_size, and seq_to_print here retain the
            # values from the last alignment processed above -- confirm
            # this is intended when a read has multiple alignments
            for cojunction in selected_cojunctions(
                    paths_from_cojunctions(
                            list(cojunctions[(rname, sense)]),
                            span=(seq_size + fudge)
                        ),
                    max_refs=max_refs, seq=seq, rname=rname, sense=sense
                ):
                left_extend_size = all_junctions[(rname, sense)][
                        cojunction[0]
                    ][0]
                right_extend_size = all_junctions[(rname, sense)][
                        cojunction[-1]
                    ][1]
                to_write.add(('{rname}{sense}\t{starts}'
                              '\t{ends}\t{left_size}'
                              '\t{right_size}\t{seq}').format(
                                rname=rname, sense=sense,
                                starts=','.join(
                                        [str(junction[0])
                                         for junction in cojunction]
                                    ),
                                ends=','.join(
                                        [str(junction[1])
                                         for junction in cojunction]
                                    ),
                                left_size=(left_extend_size + fudge),
                                right_size=(right_extend_size + fudge),
                                seq=seq_to_print
                            ))
            for line_to_write in to_write:
                # Fixed: write to output_stream as documented instead of
                # printing unconditionally to stdout
                print >>output_stream, line_to_write
                output_line_count += 1
    output_stream.flush()
    print >>sys.stderr, (
            'cojunction_enum_delegate.py reports %d output lines.'
            % output_line_count
        )