Beispiel #1
0
 input_line_count += len(initial_multiread)
 multiread = [
     alignment for alignment in initial_multiread
     if not (int(alignment[1]) & 4)
 ]
 flag = int(initial_multiread[0][1])
 if not multiread:
     counter.add('unmapped')
     # Write only the SAM output if the read was unmapped
     output_line_count += alignment_printer.print_unmapped_read(
         qname, initial_multiread[0][9], initial_multiread[0][10])
 else:
     '''Correct positions to match original reference's, correct
     CIGARs, eliminate duplicates, and decide primary alignment.'''
     try:
         corrected_multiread = multiread_with_junctions(
             multiread, stranded=args.stranded)
     except:
         print >> sys.stderr, ('Error encountered interpreting '
                               'multiread %s' % (multiread, ))
         raise
     if not corrected_multiread:
         '''This is effectively an unmapped read; write
         corresponding SAM output.'''
         if flag & 16:
             seq_to_write = initial_multiread[0][9][::-1].translate(
                 reversed_complement_translation_table)
             qual_to_write = initial_multiread[0][10][::-1]
         else:
             seq_to_write = initial_multiread[0][9]
             qual_to_write = initial_multiread[0][10]
         output_line_count \
Beispiel #2
0
def go(input_stream=sys.stdin, output_stream=sys.stdout, fudge=5,
        stranded=False, verbose=False, max_refs=300, report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used to infer which cojunctions could possibly be
        overlapped by reads. The junction tuples collected per
        (reference name, sense strand) pair are handed to
        paths_from_cojunctions(), and the result is filtered by
        selected_cojunctions() (both defined elsewhere) to obtain the
        junction combinations that are written out.

        Each output line has the tab-separated form
            <rname><sense>, comma-joined junction starts, comma-joined
            junction ends, left extend size + fudge,
            right extend size + fudge, read sequence
        where the read sequence is whichever of the sequence and its
        reversed complement compares smaller. Identical lines produced for
        one (rname, sense) pair are written only once.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically, this
            is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes
            to accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the junction's strand.
            (Forwarded unchanged to multiread_with_junctions().)
        verbose: True if alignments should occasionally be written to stderr.
        max_refs: maximum number of reference sequences to enumerate per read;
            if more are present, prioritize those sequences that overlap
            the fewest junctions. (Forwarded to selected_cojunctions().)
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.

        No return value.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname,), xpartition in xstream(input_stream, 1):
        # While labeled multiread, this list may end up holding a single
        # alignment.
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                sys.stderr.write(
                    'SAM output record %d: rdname="%s", flag=%d\n'
                    % (i, qname, flag)
                )
                next_report_line = int((next_report_line + 1)
                                        * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname,) + tokens)
        # flag is the value from the last record of this partition; bit 4
        # set means the read is skipped entirely.
        if flag & 4: continue
        # cojunctions: (rname, sense) -> set of tuples of (start, end) pairs
        # all_junctions: (rname, sense) -> {(start, end): [left, right]}
        cojunctions, all_junctions = defaultdict(set), {}
        for alignment in multiread_with_junctions(multiread, stranded):
            cigar = alignment[5]
            md = [field for field in alignment
                    if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                    _reversed_complement_translation_table
                )
            # Print the same string whichever strand the read aligned to
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                        if field[:5] == 'XS:A:'][0][5:]
            reference = (rname, sense)
            if reference not in all_junctions:
                all_junctions[reference] = defaultdict(list)
            extend_sizes = all_junctions[reference]
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                                                cigar, md, pos, seq,
                                                junctions_only=True
                                            )
            cojunctions[reference].add(
                    tuple([(junction[0], junction[1])
                                for junction in junctions])
                )
            for junction in junctions:
                span = (junction[0], junction[1])
                if span not in extend_sizes:
                    extend_sizes[span] = [junction[2], junction[3]]
                else:
                    # Keep the largest extend sizes seen for this junction
                    sizes = extend_sizes[span]
                    sizes[0] = max(sizes[0], junction[2])
                    sizes[1] = max(sizes[1], junction[3])
        # seq, seq_size and seq_to_print below carry the values assigned for
        # the last alignment iterated above.
        for rname, sense in all_junctions:
            extend_sizes = all_junctions[(rname, sense)]
            to_write = set()
            for cojunction in selected_cojunctions(paths_from_cojunctions(
                    list(cojunctions[(rname, sense)]), span=(seq_size + fudge)
                ), max_refs=max_refs, seq=seq, rname=rname, sense=sense):
                left_extend_size = extend_sizes[cojunction[0]][0]
                right_extend_size = extend_sizes[cojunction[-1]][1]
                to_write.add(('{rname}{sense}\t{starts}'
                       '\t{ends}\t{left_size}'
                       '\t{right_size}\t{seq}').format(
                            rname=rname,
                            sense=sense,
                            starts=','.join(
                                    [str(junction[0])
                                        for junction in cojunction]
                                ),
                            ends=','.join(
                                    [str(junction[1])
                                        for junction in cojunction]
                                ),
                            left_size=(left_extend_size
                                        + fudge),
                            right_size=(right_extend_size
                                        + fudge),
                            seq=seq_to_print
                       ))
            for line_to_write in to_write:
                # Honor output_stream rather than always using stdout
                output_stream.write(line_to_write + '\n')
                output_line_count += 1
    output_stream.flush()
    sys.stderr.write('cojunction_enum_delegate.py reports %d output lines.\n'
                        % output_line_count)
Beispiel #3
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       fudge=5,
       stranded=False,
       verbose=False,
       max_refs=300,
       report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used to infer which cojunctions could possibly be
        overlapped by reads. The junction tuples collected per
        (reference name, sense strand) pair are handed to
        paths_from_cojunctions(), and the result is filtered by
        selected_cojunctions() (both defined elsewhere) to obtain the
        junction combinations that are written out.

        Each output line has the tab-separated form
            <rname><sense>, comma-joined junction starts, comma-joined
            junction ends, left extend size + fudge,
            right extend size + fudge, read sequence
        where the read sequence is whichever of the sequence and its
        reversed complement compares smaller. Identical lines produced for
        one (rname, sense) pair are written only once, and the number of
        distinct lines is reported via counter.add('paths_out', ...).

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically, this
            is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes
            to accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the junction's strand.
            (Forwarded unchanged to multiread_with_junctions().)
        verbose: True if alignments should occasionally be written to stderr.
        max_refs: maximum number of reference sequences to enumerate per read;
            if more are present, prioritize those sequences that overlap
            the fewest junctions. (Forwarded to selected_cojunctions().)
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.

        No return value.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname, ), xpartition in xstream(input_stream, 1):
        # While labeled multiread, this list may end up holding a single
        # alignment.
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                sys.stderr.write(
                    'SAM output record %d: rdname="%s", flag=%d\n' %
                    (i, qname, flag))
                next_report_line = int(
                    (next_report_line + 1) * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname, ) + tokens)
        # flag is the value from the last record of this partition; bit 4
        # set means the read is skipped entirely.
        if flag & 4: continue
        # cojunctions: (rname, sense) -> set of tuples of (start, end) pairs
        # all_junctions: (rname, sense) -> {(start, end): [left, right]}
        cojunctions, all_junctions = defaultdict(set), {}
        for alignment in multiread_with_junctions(multiread, stranded):
            cigar = alignment[5]
            md = [field for field in alignment if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                _reversed_complement_translation_table)
            # Print the same string whichever strand the read aligned to
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                     if field[:5] == 'XS:A:'][0][5:]
            reference = (rname, sense)
            if reference not in all_junctions:
                all_junctions[reference] = defaultdict(list)
            extend_sizes = all_junctions[reference]
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                cigar, md, pos, seq, junctions_only=True)
            cojunctions[reference].add(
                tuple([(junction[0], junction[1]) for junction in junctions]))
            for junction in junctions:
                span = (junction[0], junction[1])
                if span not in extend_sizes:
                    extend_sizes[span] = [junction[2], junction[3]]
                else:
                    # Keep the largest extend sizes seen for this junction
                    sizes = extend_sizes[span]
                    sizes[0] = max(sizes[0], junction[2])
                    sizes[1] = max(sizes[1], junction[3])
        # seq, seq_size and seq_to_print below carry the values assigned for
        # the last alignment iterated above.
        for rname, sense in all_junctions:
            extend_sizes = all_junctions[(rname, sense)]
            to_write = set()
            for cojunction in selected_cojunctions(
                    paths_from_cojunctions(list(cojunctions[(rname, sense)]),
                                           span=(seq_size + fudge)),
                    max_refs=max_refs,
                    seq=seq,
                    rname=rname,
                    sense=sense):
                left_extend_size = extend_sizes[cojunction[0]][0]
                right_extend_size = extend_sizes[cojunction[-1]][1]
                to_write.add(
                    ('{rname}{sense}\t{starts}'
                     '\t{ends}\t{left_size}'
                     '\t{right_size}\t{seq}').format(
                         rname=rname,
                         sense=sense,
                         starts=','.join(
                             [str(junction[0]) for junction in cojunction]),
                         ends=','.join(
                             [str(junction[1]) for junction in cojunction]),
                         left_size=(left_extend_size + fudge),
                         right_size=(right_extend_size + fudge),
                         seq=seq_to_print))
            counter.add('paths_out', len(to_write))
            for line_to_write in to_write:
                # Honor output_stream rather than always using stdout
                output_stream.write(line_to_write + '\n')
                output_line_count += 1
    output_stream.flush()
    sys.stderr.write(
        'cojunction_enum_delegate.py reports %d output lines.\n' %
        output_line_count)
Beispiel #4
0
                 if not (int(alignment[1]) & 4)]
 flag = int(initial_multiread[0][1])
 if not multiread:
     counter.add('unmapped')
     # Write only the SAM output if the read was unmapped
     output_line_count += alignment_printer.print_unmapped_read(
                                             qname,
                                             initial_multiread[0][9],
                                             initial_multiread[0][10]
                                         )
 else:
     '''Correct positions to match original reference's, correct
     CIGARs, eliminate duplicates, and decide primary alignment.'''
     try:
         corrected_multiread = multiread_with_junctions(
                                     multiread,
                                     stranded=args.stranded
                                 )
     except:
         print >>sys.stderr, ('Error encountered interpreting '
                              'multiread %s' % (multiread,))
         raise
     if not corrected_multiread:
         '''This is effectively an unmapped read; write
         corresponding SAM output.'''
         if flag & 16:
             seq_to_write = initial_multiread[0][9][::-1].translate(
                             reversed_complement_translation_table
                         )
             qual_to_write = initial_multiread[0][10][::-1]
         else:
             seq_to_write = initial_multiread[0][9]