input_line_count += len(initial_multiread) multiread = [ alignment for alignment in initial_multiread if not (int(alignment[1]) & 4) ] flag = int(initial_multiread[0][1]) if not multiread: counter.add('unmapped') # Write only the SAM output if the read was unmapped output_line_count += alignment_printer.print_unmapped_read( qname, initial_multiread[0][9], initial_multiread[0][10]) else: '''Correct positions to match original reference's, correct CIGARs, eliminate duplicates, and decide primary alignment.''' try: corrected_multiread = multiread_with_junctions( multiread, stranded=args.stranded) except: print >> sys.stderr, ('Error encountered interpreting ' 'multiread %s' % (multiread, )) raise if not corrected_multiread: '''This is effectively an unmapped read; write corresponding SAM output.''' if flag & 16: seq_to_write = initial_multiread[0][9][::-1].translate( reversed_complement_translation_table) qual_to_write = initial_multiread[0][10][::-1] else: seq_to_write = initial_multiread[0][9] qual_to_write = initial_multiread[0][10] output_line_count \
def go(input_stream=sys.stdin, output_stream=sys.stdout, fudge=5,
        stranded=False, verbose=False, max_refs=300, report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used to infer which cojunctions could possibly be
        overlapped by reads. Then maximal cliques of the graph described in
        the maximal_cliques() function are enumerated to obtain which junction
        combinations could possibly be overlapped by reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically,
            this is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes to
            accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the junction's strand.
        verbose: True if alignments should occasionally be written to stderr.
        max_refs: maximum number of reference sequences to enumerate per read;
            if more are present, prioritize those sequences that overlap the
            fewest junctions
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.

        No return value.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname,), xpartition in xstream(input_stream, 1):
        '''While labeled multireadlet, this list may end up simply a
        unireadlet.'''
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int((next_report_line + 1)
                                        * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname,) + tokens)
        # flag here is from the last alignment of the partition; presumably
        # every alignment of an unmapped read has bit 4 set, so checking the
        # last one suffices -- TODO confirm against upstream Bowtie 2 output
        if flag & 4: continue
        '''Fix: the original assigned multiread_with_junctions(...) to an
        unused variable and then recomputed it as the loop iterable; compute
        the junction-corrected alignments exactly once.'''
        corrected_multiread = multiread_with_junctions(multiread, stranded)
        cojunctions, all_junctions = defaultdict(set), {}
        for alignment in corrected_multiread:
            cigar = alignment[5]
            # MD:Z and XS:A optional fields are assumed present on every
            # corrected alignment; [0] raises IndexError otherwise
            md = [field for field in alignment
                    if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                        _reversed_complement_translation_table
                    )
            # Canonicalize the printed sequence as the lexicographically
            # smaller of seq and its reverse complement
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                        if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_junctions:
                all_junctions[(rname, sense)] = defaultdict(list)
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                                    cigar, md, pos, seq, junctions_only=True
                                )
            cojunctions[(rname, sense)].add(
                    tuple([(junction[0], junction[1])
                            for junction in junctions])
                )
            # Track the maximum left/right extend sizes seen for each
            # distinct junction on a given (rname, sense)
            for junction in junctions:
                if (junction[0], junction[1]) \
                    not in all_junctions[(rname, sense)]:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ] = [junction[2], junction[3]]
                else:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][0] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][0], junction[2])
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][1] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][1], junction[3])
        # NOTE(review): seq, seq_size, and seq_to_print below retain values
        # from the last alignment processed above; seq_to_print is
        # canonicalized, so it presumably agrees across a read's alignments
        # -- TODO confirm
        for rname, sense in all_junctions:
            to_write = set()
            for cojunction in selected_cojunctions(paths_from_cojunctions(
                    list(cojunctions[(rname, sense)]), span=(seq_size + fudge)
                ), max_refs=max_refs, seq=seq, rname=rname, sense=sense):
                left_extend_size = all_junctions[(rname, sense)][
                                            cojunction[0]
                                        ][0]
                right_extend_size = all_junctions[(rname, sense)][
                                            cojunction[-1]
                                        ][1]
                to_write.add(('{rname}{sense}\t{starts}'
                              '\t{ends}\t{left_size}'
                              '\t{right_size}\t{seq}').format(
                                    rname=rname, sense=sense,
                                    starts=','.join(
                                        [str(junction[0])
                                            for junction in cojunction]
                                    ),
                                    ends=','.join(
                                        [str(junction[1])
                                            for junction in cojunction]
                                    ),
                                    left_size=(left_extend_size + fudge),
                                    right_size=(right_extend_size + fudge),
                                    seq=seq_to_print))
            for line_to_write in to_write:
                # Fix: write data lines to output_stream as documented;
                # the original bare print always targeted sys.stdout even
                # though output_stream is the stream that gets flushed
                print >>output_stream, line_to_write
                output_line_count += 1
    output_stream.flush()
    print >>sys.stderr, ('cojunction_enum_delegate.py reports %d output '
                         'lines.' % output_line_count)
# NOTE(review): this redefinition shadows an earlier go() in this file --
# looks like a concatenation/merge artifact; confirm which version is the
# intended entry point.
def go(input_stream=sys.stdin, output_stream=sys.stdout, fudge=5,
        stranded=False, verbose=False, max_refs=300, report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used to infer which cojunctions could possibly be
        overlapped by reads. Then maximal cliques of the graph described in
        the maximal_cliques() function are enumerated to obtain which junction
        combinations could possibly be overlapped by reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically,
            this is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes to
            accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the junction's strand.
        verbose: True if alignments should occasionally be written to stderr.
        max_refs: maximum number of reference sequences to enumerate per read;
            if more are present, prioritize those sequences that overlap the
            fewest junctions
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.

        No return value.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname, ), xpartition in xstream(input_stream, 1):
        '''While labeled multireadlet, this list may end up simply a
        unireadlet.'''
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int(
                        (next_report_line + 1) * report_multiplier + 1
                    ) - 1
            i += 1
            multiread.append((qname, ) + tokens)
        # flag here is from the last alignment of the partition; presumably
        # every alignment of an unmapped read has bit 4 set, so checking the
        # last one suffices -- TODO confirm against upstream Bowtie 2 output
        if flag & 4: continue
        cojunctions, all_junctions = defaultdict(set), {}
        for alignment in multiread_with_junctions(multiread, stranded):
            cigar = alignment[5]
            # MD:Z and XS:A optional fields are assumed present on every
            # corrected alignment; [0] raises IndexError otherwise
            md = [field for field in alignment
                    if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                    _reversed_complement_translation_table)
            # Canonicalize the printed sequence as the lexicographically
            # smaller of seq and its reverse complement
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                        if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_junctions:
                all_junctions[(rname, sense)] = defaultdict(list)
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                                    cigar, md, pos, seq, junctions_only=True)
            cojunctions[(rname, sense)].add(
                    tuple([(junction[0], junction[1])
                            for junction in junctions]))
            # Track the maximum left/right extend sizes seen for each
            # distinct junction on a given (rname, sense)
            for junction in junctions:
                if (junction[0], junction[1]) \
                    not in all_junctions[(rname, sense)]:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ] = [junction[2], junction[3]]
                else:
                    all_junctions[(rname, sense)][(
                            junction[0], junction[1])][0] = max(
                                all_junctions[(rname, sense)][(junction[0],
                                    junction[1])][0], junction[2])
                    all_junctions[(rname, sense)][(
                            junction[0], junction[1])][1] = max(
                                all_junctions[(rname, sense)][(junction[0],
                                    junction[1])][1], junction[3])
        # NOTE(review): seq, seq_size, and seq_to_print below retain values
        # from the last alignment processed above; seq_to_print is
        # canonicalized, so it presumably agrees across a read's alignments
        # -- TODO confirm
        for rname, sense in all_junctions:
            to_write = set()
            for cojunction in selected_cojunctions(
                    paths_from_cojunctions(list(cojunctions[(rname, sense)]),
                                           span=(seq_size + fudge)),
                    max_refs=max_refs, seq=seq, rname=rname, sense=sense):
                left_extend_size = all_junctions[(rname, sense)][
                                        cojunction[0]][0]
                right_extend_size = all_junctions[(rname, sense)][
                                        cojunction[-1]][1]
                to_write.add(
                    ('{rname}{sense}\t{starts}'
                     '\t{ends}\t{left_size}'
                     '\t{right_size}\t{seq}').format(
                            rname=rname, sense=sense,
                            starts=','.join(
                                [str(junction[0])
                                    for junction in cojunction]),
                            ends=','.join(
                                [str(junction[1])
                                    for junction in cojunction]),
                            left_size=(left_extend_size + fudge),
                            right_size=(right_extend_size + fudge),
                            seq=seq_to_print))
            counter.add('paths_out', len(to_write))
            for line_to_write in to_write:
                # Fix: write data lines to output_stream as documented;
                # the original bare print always targeted sys.stdout even
                # though output_stream is the stream that gets flushed
                print >>output_stream, line_to_write
                output_line_count += 1
    output_stream.flush()
    print >>sys.stderr, (
        'cojunction_enum_delegate.py reports %d output lines.'
        % output_line_count)
if not (int(alignment[1]) & 4)] flag = int(initial_multiread[0][1]) if not multiread: counter.add('unmapped') # Write only the SAM output if the read was unmapped output_line_count += alignment_printer.print_unmapped_read( qname, initial_multiread[0][9], initial_multiread[0][10] ) else: '''Correct positions to match original reference's, correct CIGARs, eliminate duplicates, and decide primary alignment.''' try: corrected_multiread = multiread_with_junctions( multiread, stranded=args.stranded ) except: print >>sys.stderr, ('Error encountered interpreting ' 'multiread %s' % (multiread,)) raise if not corrected_multiread: '''This is effectively an unmapped read; write corresponding SAM output.''' if flag & 16: seq_to_write = initial_multiread[0][9][::-1].translate( reversed_complement_translation_table ) qual_to_write = initial_multiread[0][10][::-1] else: seq_to_write = initial_multiread[0][9]