Example #1
0
def go(output_stream=sys.stdout, input_stream=sys.stdin,
        verbose=False, report_multiplier=1.2,
        alignment_count_to_report=1, tie_margin=0):
    """ Processes Bowtie 2 alignments, emitting filtered SAM output.

        Only max(# tied alignments, alignment_count_to_report) alignments
        are printed. This way, the compare_alignments step always has enough
        information to fill the XS field.

        output_stream: where to emit exon and junction tuples; typically, this
            is sys.stdout.
        input_stream: where to find input to process
        verbose: True if alignments should occasionally be written to stderr.
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
        alignment_count_to_report: argument of Bowtie 2's -k field
        tie_margin: allowed score difference per 100 bases among ties in 
             max alignment score.

        No return value.
    """
    output_line_count, next_report_line = 0, 0
    # Keep at least 2 alignments per read so downstream code always has a
    # second-best score available for the XS field, even under -k 1
    threshold_alignment_count = max(2, alignment_count_to_report)
    for (qname,), xpartition in xstream(input_stream, 1):
        # max_score and current_tie_margin start as None; the comparisons
        # below rely on Python 2 ordering, where None < any number, so the
        # first scored alignment always establishes max_score
        max_score, alignments_output, current_tie_margin = None, 0, None
        for rest_of_line in xpartition:
            # Note Bowtie 2 outputs alignments in order of descending score
            try:
                # AS:i: is Bowtie 2's alignment-score optional field
                score = int([field[5:] for field in rest_of_line
                                if field[:5] == 'AS:i:'][0])
            except IndexError:
                # Unmapped read; flag should be 4. Print only essentials.
                # \x1c placeholders stand in for omitted SAM fields;
                # rest_of_line[8] and [9] are SEQ and QUAL of the
                # QNAME-stripped SAM record
                assert int(rest_of_line[0]) == 4
                print >>output_stream, ('%s\t4\t\x1c\t\x1c\t\x1c\t\x1c'
                                         '\t\x1c\t\x1c\t\x1c\t%s\t%s') % (
                                                qname,
                                                rest_of_line[8],
                                                rest_of_line[9]
                                            )
                output_line_count += 1
            else:
                if current_tie_margin is None:
                    # tie_margin is per 100 bases; scale it to this read's
                    # length once, using the first alignment's SEQ field
                    current_tie_margin = round(
                            tie_margin * float(len(rest_of_line[8])) / 100
                        )
                if score + current_tie_margin >= max_score:
                    max_score = max(max_score, score)
                elif alignments_output >= threshold_alignment_count:
                    # Score fell outside the tie margin and enough alignments
                    # were already printed; all remaining ones score lower
                    break
                print >>output_stream, '\t'.join((qname,) + rest_of_line)
                alignments_output += 1
                output_line_count += 1
                if verbose and next_report_line == output_line_count:
                    print >>sys.stderr, \
                        'SAM output record %d: rdname="%s", flag=%d' \
                        % (output_line_count, qname, int(rest_of_line[0]))
                    # Space progress reports out exponentially
                    next_report_line = max(int(next_report_line
                        * report_multiplier), next_report_line + 1)
    output_stream.flush()
    print >>sys.stderr, ('realign_reads_delegate.py reports %d output lines.'
                            % output_line_count)
def go(output_stream=sys.stdout, input_stream=sys.stdin,
        verbose=False, report_multiplier=1.2,
        alignment_count_to_report=1, tie_margin=0):
    """ Filters Bowtie 2 alignments, writing a pruned SAM stream.

        At least max(# tied alignments, alignment_count_to_report)
        alignments per read are emitted, which guarantees the
        compare_alignments step has enough information to fill the XS field.

        output_stream: destination for SAM records; typically sys.stdout.
        input_stream: source of Bowtie 2 output to process.
        verbose: True iff alignments should occasionally be echoed to stderr.
        report_multiplier: if verbose is True, the line number of an
            alignment echoed to stderr grows exponentially with base
            report_multiplier.
        alignment_count_to_report: argument of Bowtie 2's -k field.
        tie_margin: allowed score difference per 100 bases among ties in
            max alignment score.
    """
    emitted_line_count = 0
    report_line = 0
    # Never keep fewer than 2 alignments per read (needed for XS downstream)
    min_to_keep = max(2, alignment_count_to_report)
    for (qname,), partition in xstream(input_stream, 1):
        # None compares less than any number in Python 2, so the first
        # scored alignment always establishes top_score below
        top_score = None
        kept_for_read = 0
        margin_for_read = None
        for fields in partition:
            # Bowtie 2 emits alignments in order of descending score
            score_tokens = [token[5:] for token in fields
                                if token[:5] == 'AS:i:']
            if not score_tokens:
                # No AS:i: field -- unmapped read (flag must be 4); emit
                # only the essential columns, placeholding with \x1c
                assert int(fields[0]) == 4
                print >>output_stream, ('%s\t4\t\x1c\t\x1c\t\x1c\t\x1c'
                                         '\t\x1c\t\x1c\t\x1c\t%s\t%s') % (
                                                qname, fields[8], fields[9]
                                            )
                emitted_line_count += 1
                continue
            alignment_score = int(score_tokens[0])
            if margin_for_read is None:
                # Scale the per-100-bases tie margin to this read's length
                margin_for_read = round(
                        tie_margin * float(len(fields[8])) / 100
                    )
            if alignment_score + margin_for_read >= top_score:
                top_score = max(top_score, alignment_score)
            elif kept_for_read >= min_to_keep:
                # Out of the tie margin with enough alignments printed;
                # every remaining alignment scores lower still
                break
            print >>output_stream, '\t'.join((qname,) + fields)
            kept_for_read += 1
            emitted_line_count += 1
            if verbose and report_line == emitted_line_count:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' \
                    % (emitted_line_count, qname, int(fields[0]))
                # Next report comes exponentially later
                report_line = max(int(report_line * report_multiplier),
                                    report_line + 1)
    output_stream.flush()
    print >>sys.stderr, ('realign_reads_delegate.py reports %d output lines.'
                            % emitted_line_count)
Example #3
0
def go(input_stream=sys.stdin, output_stream=sys.stdout, bowtie2_exe='bowtie2',
    bowtie_index_base='genome', bowtie2_index_base='genome2', 
    manifest_file='manifest', bowtie2_args=None, bin_size=10000, verbose=False,
    exon_differentials=True, exon_intervals=False, report_multiplier=1.2,
    min_exon_size=8, search_filter=1, min_readlet_size=15, max_readlet_size=25,
    readlet_interval=12, capping_multiplier=1.5, drop_deletions=False,
    gzip_level=3, scratch=None, index_count=1, output_bam_by_chr=False,
    tie_margin=0, no_realign=False, no_polyA=False):
    """ Runs Rail-RNA-align_reads.

        A single pass of Bowtie is run to find end-to-end alignments. Unmapped
        reads are saved for readletizing to determine junctions in sucessive
        reduce steps as well as for realignment in a later map step.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns in a mix of any of the following
        three formats:
        Format 1 (single-end, 3-column):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 3 columns each)
        (so this is the same as single-end)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Input is partitioned and sorted by field 1, the read sequence.

        Hadoop output (written to stdout)
        ----------------------------
        A given RNAME sequence is partitioned into intervals ("bins") of some 
        user-specified length (see partition.py).

        Exonic chunks (aka ECs; three formats, any or all of which may be
        emitted):

        Format 1 (exon_ival); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. Sample index
        3. EC start (inclusive) on forward strand
        4. EC end (exclusive) on forward strand

        Format 2 (exon_diff); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. max(EC start, bin start) (inclusive) on forward strand IFF diff is
            positive and EC end (exclusive) on forward strand IFF diff is
            negative
        3. Sample index
        4. '1' if alignment from which diff originates is "unique" according to
            --tie-margin criterion; else '0'
        5. +1 or -1 * count, the number of instances of a read sequence for
            which to print exonic chunks

        Note that only unique alignments are currently output as ivals and/or
        diffs.

        Format 3 (sam); tab-delimited output tuple columns:
        Standard SAM output except fields are in different order, and the first
        field corresponds to sample label. (Fields are reordered to facilitate
        partitioning by sample name/RNAME and sorting by POS.) Each line
        corresponds to a spliced alignment. The order of the fields is as
        follows.
        1. Sample index if outputting BAMs by sample OR
                sample-rname index if outputting BAMs by chr
        2. (Number string representing RNAME; see BowtieIndexReference
            class in bowtie_index for conversion information) OR
            '0' if outputting BAMs by chr
        3. POS
        4. QNAME
        5. FLAG
        6. MAPQ
        7. CIGAR
        8. RNEXT
        9. PNEXT
        10. TLEN
        11. SEQ
        12. QUAL
        ... + optional fields

        Insertions/deletions (indel_bed)

        tab-delimited output tuple columns:
        1. 'I' or 'D' insertion or deletion line
        2. Number string representing RNAME
        3. Start position (Last base before insertion or 
            first base of deletion)
        4. End position (Last base before insertion or last base of deletion 
                            (exclusive))
        5. Inserted sequence for insertions or deleted sequence for deletions
        6. Sample index
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. '\x1c'
        8. '\x1c'
        --------------------------------------------------------------------
        9. Number of instances of insertion or deletion in sample; this is
            always +1 * count before bed_pre combiner/reducer

        Read whose primary alignment is not end-to-end

        Tab-delimited output tuple columns (unmapped):
        1. Transcriptome Bowtie 2 index group number
        2. SEQ
        3. 1 if SEQ is reverse-complemented, else 0
        4. QNAME
        5. QUAL

        Tab-delimited output tuple columns (readletized):
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Tab-delimited tuple columns (postponed_sam):
        Standard 11+ -column raw SAM output

        Single column (unique):
        1. A unique read sequence

        Two columns, exactly one line (dummy); ensures creation of junction
            index:
        1. character "-"
        2. the word "dummy"

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and junctions.
        bowtie2_exe: filename of Bowtie2 executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie1 index files associated
            with the reference.
        bowtie2_index_base: the basename of the Bowtie2 index files associated
            with the reference.
        manifest_file: filename of manifest
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie2.
        bin_size: genome is partitioned in units of bin_size for later load
            balancing.
        verbose: True iff more informative messages should be written to
            stderr.
        exon_differentials: True iff EC differentials are to be emitted.
        exon_intervals: True iff EC intervals are to be emitted.
        report_multiplier: if verbose is True, the line number of an alignment
            or read written to stderr increases exponentially with base
            report_multiplier.
        min_exon_size: minimum exon size searched for in junction_search.py
            later in pipeline; used to determine how large a soft clip on one
            side of a read is necessary to pass it on to junction search
            pipeline
        search_filter: how large a soft clip on one side of a read is necessary
            to pass it on to junction search pipeline
        min_readlet_size: "capping" readlets (that is, readlets that terminate
            at a given end of the read) are never smaller than this value
        max_readlet_size: size of every noncapping readlet
        readlet_interval: number of bases separating successive readlets along
            the read
        capping_multiplier: successive capping readlets on a given end of a
            read are increased in size exponentially with base
            capping_multiplier
        drop_deletions: True iff deletions should be dropped from coverage
            vector
        gzip_level: compression level to use for temporary files
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory
        index_count: number of transcriptome Bowtie 2 indexes to which to
            assign unmapped reads for later realignment
        output_bam_by_chr: True iff final output BAMs will be by chromosome
        tie_margin: allowed score difference per 100 bases among ties in
            max score. For example, 150 and 144 are tied alignment scores
            for a 100-bp read when --tie-margin is 6.
        no_realign: True iff job flow does not need more than readlets: this
            usually means only a transcript index is being constructed
        no_polyA: kill noncapping readlets that are all As and write as
            unmapped all reads with polyA prefixes whose suffixes are <
            min_exon_size

        No return value.
    """
    global _input_line_count
    reference_index = bowtie_index.BowtieIndexReference(bowtie_index_base)
    manifest_object = manifest.LabelsAndIndices(manifest_file)
    alignment_printer = AlignmentPrinter(
            manifest_object,
            reference_index,
            bin_size=bin_size,
            output_stream=output_stream,
            exon_ivals=exon_intervals,
            exon_diffs=exon_differentials,
            drop_deletions=drop_deletions,
            output_bam_by_chr=output_bam_by_chr,
            tie_margin=tie_margin
        )
    # Get task partition to pass to align_reads_delegate.py
    try:
        task_partition = os.environ['mapred_task_partition']
    except KeyError:
        # Hadoop 2.x?
        try:
            task_partition = os.environ['mapreduce_task_partition']
        except KeyError:
            # A unit test is probably being run
            task_partition = '0'
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    align_file = os.path.join(temp_dir, 'first_pass_reads.temp.gz')
    other_reads_file = os.path.join(temp_dir, 'other_reads.temp.gz')
    second_pass_file = os.path.join(temp_dir, 'second_pass_reads.temp.gz')
    k_value, _, _ = bowtie.parsed_bowtie_args(bowtie2_args)
    nothing_doing = True
    # Required length of prefix after poly(A) is trimmed
    remaining_seq_size = max(min_exon_size - 1, 1)
    with xopen(True, align_file, 'w', gzip_level) as align_stream, \
        xopen(True, other_reads_file, 'w', gzip_level) as other_stream:
        for seq_number, ((seq,), xpartition) in enumerate(
                                                        xstream(sys.stdin, 1)
                                                    ):
            seq_length = len(seq)
            if no_polyA and (
                    all(seq[i] == 'A' 
                         for i in xrange(seq_length - remaining_seq_size))
                    or all(seq[i] == 'T' 
                         for i in xrange(remaining_seq_size, seq_length))
                    or all(seq[i] == 'A' 
                         for i in xrange(remaining_seq_size, seq_length))
                    or all(seq[i] == 'T' 
                         for i in xrange(seq_length - remaining_seq_size))
                ):
                if not no_realign:
                    '''If a sequence is too short without its poly(A) tail,
                    make all reads with that sequence unmapped. Technically,
                    this also kills poly(A)s at 5' ends, but we probably
                    couldn't align those sequences anyway.'''
                    reversed_complement_seq = seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                    for is_reversed, name, qual in xpartition:
                        if is_reversed == '0':
                            alignment_printer.print_unmapped_read(
                                                    name,
                                                    seq,
                                                    qual
                                                )
                        else:
                            alignment_printer.print_unmapped_read(
                                                    name,
                                                    reversed_complement_seq,
                                                    qual[::-1]
                                                )
                continue
            nothing_doing = False
            '''Select highest-quality read with alphabetically last qname
            for first-pass alignment.'''
            best_name, best_mean_qual, best_qual_index, i = None, None, 0, 0
            others_to_print = dlist()
            for is_reversed, name, qual in xpartition:
                _input_line_count += 1
                others_to_print.append(
                        '\t'.join([
                            str(seq_number), is_reversed, name, qual
                        ])
                    )
                mean_qual = (
                        float(sum([ord(score) for score in qual])) / len(qual)
                    )
                if (mean_qual > best_mean_qual
                        or mean_qual == best_mean_qual and name > best_name):
                    best_qual_index = i
                    best_mean_qual = mean_qual
                    best_name = name
                    to_align = '\t'.join([
                                        '%s\x1d%s' % (is_reversed, name),
                                        seq, qual
                                    ])
                i += 1
            assert i >= 1
            if i == 1:
                print >>other_stream, str(seq_number)
            else:
                for j, other_to_print in enumerate(others_to_print):
                    if j != best_qual_index:
                        print >>other_stream, other_to_print
            print >>align_stream, to_align
    # Print dummy line
    print 'dummy\t-\tdummy'
    sys.stdout.flush() # this is REALLY important b/c called script will stdout
    if nothing_doing:
        # No input
        sys.exit(0)
    input_command = 'gzip -cd %s' % align_file
    bowtie_command = ' '.join([bowtie2_exe,
        bowtie2_args if bowtie2_args is not None else '',
        ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
        bowtie2_index_base, '--12 -'])
    delegate_command = ''.join(
                [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                    ('_delegate.py --task-partition {task_partition} '
                     '--other-reads {other_reads} --second-pass-reads '
                     '{second_pass_reads} --min-readlet-size '
                     '{min_readlet_size} {drop_deletions} '
                     '--max-readlet-size {max_readlet_size} '
                     '--readlet-interval {readlet_interval} '
                     '--capping-multiplier {capping_multiplier:1.12f} '
                     '{verbose} --report-multiplier {report_multiplier:1.12f} '
                     '--k-value {k_value} '
                     '--bowtie-idx {bowtie_index_base} '
                     '--partition-length {bin_size} '
                     '--manifest {manifest_file} '
                     '{exon_differentials} {exon_intervals} '
                     '--gzip-level {gzip_level} '
                     '--search-filter {search_filter} '
                     '--index-count {index_count} '
                     '--tie-margin {tie_margin} '
                     '{no_realign} '
                     '{no_polyA} '
                     '{output_bam_by_chr}').format(
                        task_partition=task_partition,
                        other_reads=other_reads_file,
                        second_pass_reads=second_pass_file,
                        min_readlet_size=min_readlet_size,
                        drop_deletions=('--drop-deletions' if drop_deletions
                                            else ''),
                        max_readlet_size=max_readlet_size,
                        readlet_interval=readlet_interval,
                        capping_multiplier=capping_multiplier,
                        verbose=('--verbose' if verbose else ''),
                        report_multiplier=report_multiplier,
                        k_value=k_value,
                        bowtie_index_base=bowtie_index_base,
                        bin_size=bin_size,
                        manifest_file=manifest_file,
                        exon_differentials=('--exon-differentials'
                                            if exon_differentials else ''),
                        exon_intervals=('--exon-intervals'
                                        if exon_intervals else ''),
                        gzip_level=gzip_level,
                        search_filter=search_filter,
                        index_count=index_count,
                        tie_margin=tie_margin,
                        no_realign=('--no-realign' if no_realign else ''),
                        no_polyA=('--no-polyA' if no_polyA else ''),
                        output_bam_by_chr=('--output-bam-by-chr'
                                            if output_bam_by_chr
                                            else '')
                     )]
            )
    full_command = ' | '.join([input_command, 
                                bowtie_command, delegate_command])
    print >>sys.stderr, \
        'Starting first-pass Bowtie 2 with command: ' + full_command
    bowtie_process = subprocess.Popen(' '.join(
                    ['set -exo pipefail;', full_command]
                ),
            bufsize=-1, stdout=sys.stdout, stderr=sys.stderr, shell=True,
            executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading first-pass Bowtie 2 '
                           'output; exitlevel was %d.' % return_code)
    os.remove(align_file)
    os.remove(other_reads_file)
    if not no_realign:
        input_command = 'gzip -cd %s' % second_pass_file
        bowtie_command = ' '.join([bowtie2_exe,
            bowtie2_args if bowtie2_args is not None else '',
            ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
            bowtie2_index_base, '--12 -'])
        delegate_command = ''.join(
                    [sys.executable, ' ', os.path.realpath(__file__)[:-3],
                        ('_delegate.py --task-partition {task_partition} '
                         '--min-readlet-size {min_readlet_size} '
                         '{drop_deletions} '
                         '--max-readlet-size {max_readlet_size} '
                         '--readlet-interval {readlet_interval} '
                         '--capping-multiplier {capping_multiplier:012f} '
                         '{verbose} '
                         '--report-multiplier {report_multiplier:012f} '
                         '--k-value {k_value} '
                         '--bowtie-idx {bowtie_index_base} '
                         '--partition-length {bin_size} '
                         '--manifest {manifest_file} '
                         '{exon_differentials} {exon_intervals} '
                         '--gzip-level {gzip_level} '
                         '--search-filter {search_filter} ' 
                         '--index-count {index_count} '
                         '--tie-margin {tie_margin} '
                         '{output_bam_by_chr}').format(
                            task_partition=task_partition,
                            min_readlet_size=min_readlet_size,
                            drop_deletions=('--drop-deletions'
                                                if drop_deletions else ''),
                            readlet_interval=readlet_interval,
                            max_readlet_size=max_readlet_size,
                            capping_multiplier=capping_multiplier,
                            verbose=('--verbose' if verbose else ''),
                            report_multiplier=report_multiplier,
                            k_value=k_value,
                            bowtie_index_base=bowtie_index_base,
                            bin_size=bin_size,
                            manifest_file=manifest_file,
                            exon_differentials=('--exon-differentials'
                                                if exon_differentials else ''),
                            exon_intervals=('--exon-intervals'
                                            if exon_intervals else ''),
                            gzip_level=gzip_level,
                            search_filter=search_filter,
                            index_count=index_count,
                            tie_margin=tie_margin,
                            output_bam_by_chr=('--output-bam-by-chr'
                                                if output_bam_by_chr
                                                else '')
                         )]
                )
        full_command = ' | '.join([input_command, 
                                    bowtie_command, delegate_command])
        print >>sys.stderr, \
            'Starting second-pass Bowtie 2 with command: ' + full_command
        bowtie_process = subprocess.Popen(' '.join(
                        ['set -exo pipefail;', full_command]
                    ),
                bufsize=-1, stdout=sys.stdout, stderr=sys.stderr, shell=True,
                executable='/bin/bash')
        return_code = bowtie_process.wait()
        if return_code:
            raise RuntimeError('Error occurred while reading second-pass '
                               'Bowtie 2 output; exitlevel was %d.'
                                % return_code)
    sys.stdout.flush()
Example #4
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       fudge=5,
       stranded=False,
       verbose=False,
       report_multiplier=1.2):
    """ Emits intron combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used infer which cointrons could possibly be
        overlapped by reads. Then maximal cliques of the graph described in
        the maximal_cliques() function are enumerated to obtain which
        intron combinations could possibly be overlapped by reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and intron tuples; typically, this is
            sys.stdout.
        verbose: True if alignments should occasionally be written to stderr.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: by how many bases to extend left and right extend sizes
            to accommodate potential indels
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname, ), xpartition in xstream(input_stream, 1):
        '''While labeled multireadlet, this list may end up simply a
        unireadlet.'''
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int(
                    (next_report_line + 1) * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname, ) + tokens)
        # 'flag' holds the FLAG of the partition's last record; an unmapped
        # read is expected to appear as a lone record with bit 4 set, so this
        # skips the whole partition for unmapped reads.
        # NOTE(review): assumes at least one record per partition; confirm
        # xstream never yields an empty partition.
        if flag & 4: continue
        # Fix: the original computed multiread_with_introns() here, discarded
        # the result, and recomputed it in the loop below; reuse it instead.
        corrected_multiread = multiread_with_introns(multiread, stranded)
        all_introns = {}
        for alignment in corrected_multiread:
            cigar = alignment[5]
            md = [field for field in alignment if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                _reversed_complement_translation_table)
            # Canonicalize: emit whichever of seq/reverse complement sorts
            # first, so both orientations map to the same output key
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            # NOTE(review): seq_size and seq_to_print retain the values from
            # the LAST alignment when used after this loop; all alignments of
            # a read share one sequence length, so this appears intentional
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                     if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_introns:
                all_introns[(rname, sense)] = defaultdict(list)
            _, _, introns, _ = indels_introns_and_exons(cigar, md, pos, seq)
            for intron in introns:
                # Record [left extend size, right extend size] per intron,
                # keeping the max of each across alignments
                if (intron[0], intron[1]) \
                    not in all_introns[(rname, sense)]:
                    all_introns[(rname, sense)][(intron[0], intron[1])] \
                        = [intron[2], intron[3]]
                else:
                    all_introns[(rname,
                                 sense)][(intron[0], intron[1])][0] = max(
                                     all_introns[(rname,
                                                  sense)][(intron[0],
                                                           intron[1])][0],
                                     intron[2])
                    all_introns[(rname,
                                 sense)][(intron[0], intron[1])][1] = max(
                                     all_introns[(rname,
                                                  sense)][(intron[0],
                                                           intron[1])][1],
                                     intron[3])
        for rname, sense in all_introns:
            to_write = set()
            # Grab maximal cliques
            for clique in \
                maximal_cliques(all_introns[(rname, sense)].keys()):
                for cointrons in separated_introns(clique,
                                                   separation=(seq_size +
                                                               fudge)):
                    cointrons.sort()
                    left_extend_size = all_introns[(rname, sense)][(
                        cointrons[0][0], cointrons[0][1])][0]
                    right_extend_size = all_introns[(rname, sense)][(
                        cointrons[-1][0], cointrons[-1][1])][1]
                    to_write.add(
                        ('{rname}{sense}\t{starts}'
                         '\t{ends}\t{left_size}'
                         '\t{right_size}\t{seq}').format(
                             rname=rname,
                             sense=sense,
                             starts=','.join(
                                 [str(intron[0]) for intron in cointrons]),
                             ends=','.join(
                                 [str(intron[1]) for intron in cointrons]),
                             left_size=(left_extend_size + fudge),
                             right_size=(right_extend_size + fudge),
                             seq=seq_to_print))
            for line_to_write in to_write:
                # Fix: write to output_stream (the stream this function
                # documents and flushes) rather than implicitly to stdout;
                # behavior is unchanged for the default output_stream
                print >>output_stream, line_to_write
                output_line_count += 1
    output_stream.flush()
    print >> sys.stderr, (
        'cointron_enum_delegate.py reports %d output lines.' %
        output_line_count)
Example #5
0
# Destination is --out if given, else the current working directory
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
input_line_count = 0
if output_url.is_local:
    # Set up destination directory; it may already exist, in which case
    # makedirs raises OSError. Fix: catch OSError specifically -- a bare
    # except also swallows KeyboardInterrupt/SystemExit.
    try:
        os.makedirs(output_url.to_url())
    except OSError:
        pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
for (line_type, sample_label), xpartition in xstream(sys.stdin, 2):
    assert line_type in 'NID'
    sample_label = manifest_object.index_to_label[sample_label]
    type_string = ('insertions' if line_type == 'I' else
                   ('deletions' if line_type == 'D' else 'junctions'))
    output_filename = (
        (args.bed_basename + '.' if args.bed_basename != '' else '') +
        type_string + '.' + sample_label + '.bed')
    if output_url.is_local:
        output_path = os.path.join(args.out, output_filename)
    else:
        output_path = os.path.join(temp_dir_path, output_filename)
    with open(output_path, 'w') as output_stream:
        print >>output_stream, 'track name="%s_%s" description="' \
                                   'Rail-RNA v%s %s for sample %s"' \
                                                      % (sample_label,
                instance=args.read_instance,
            )
        write_read_introns_from_sam_stream(
            sys.stdin, combined_stream, retrieved_intron_counts, instance=args.read_instance
        )
    import subprocess

    sorted_combined_file = os.path.join(temp_dir_path, "combined.sorted.temp")
    subprocess.check_call(
        " ".join(["sort -T %s -k1,1" % temp_dir_path, combined_file, ">", sorted_combined_file]), bufsize=-1, shell=True
    )
    relevant = 0
    retrieved = 0
    relevant_and_retrieved = 0
    with open(sorted_combined_file) as sorted_combined_stream:
        for (name,), xpartition in xstream(sorted_combined_stream, 1):
            relevant_and_retrieved_instances = list(xpartition)
            ts = [
                instance[:-1]
                for instance in relevant_and_retrieved_instances
                if instance[-1] == "t"
                and (
                    args.coverage_threshold is None
                    or any([intron_counts[intron] <= args.coverage_threshold for intron in instance[:-1]])
                )
            ]
            rs = [
                instance[:-1]
                for instance in relevant_and_retrieved_instances
                if instance[-1] == "r"
                and (
Example #7
0
# Destination is --out if given, else the current working directory
output_url = Url(args.out) if args.out is not None \
    else Url(os.getcwd())
input_line_count = 0
if output_url.is_local:
    # Set up destination directory; it may already exist, in which case
    # makedirs raises OSError. Fix: catch OSError specifically -- a bare
    # except also swallows KeyboardInterrupt/SystemExit.
    try:
        os.makedirs(output_url.to_url())
    except OSError:
        pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])

input_line_count = 0
for (line_type,), xpartition in xstream(sys.stdin, 1):
    type_string = ('insertions' if line_type == '0' else
                    ('deletions' if line_type == '1' else
                      ('junctions' if line_type == '2' else
                        'normalization')))
    output_filename = ((args.tsv_basename + '.' 
                          if args.tsv_basename != '' else '')
                          + type_string + '.tsv.gz')
    if output_url.is_local:
        output_path = os.path.join(args.out, output_filename)
    else:
        output_path = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_path, 'w', args.gzip_level) as output_stream:
        if line_type != '3':
            '''Print all labels in the order in which they appear in the
            manifest file.'''
Example #8
0
def go(
    input_stream=sys.stdin,
    output_stream=sys.stdout,
    bowtie_exe="bowtie",
    bowtie_index_base="genome",
    bowtie_args="",
    gzip_level=3,
    verbose=False,
    report_multiplier=1.2,
    scratch=None,
):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line per
        readlet belonging to a distinct read sequence.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Input is partitioned by field 1, the readlet sequence or its reversed
        complement.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited output tuple columns, where each line corresponds to a
        readlet from a distinct read rather than a unique readlet sequence:
        1. Read sequence ID
        2. Displacement of readlet's 5' end from read's 5' end + '\x1e' +
            displacement of readlet's 3' end from read's 3' end (+, for EXACTLY
            one readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            number of instances of read sequence + '\x1e' + number of instances
            of read sequence's reversed complement + '\x1e' (+, for EXACTLY one
            readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence))] + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective
            sample in list A) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence's reversed complement for
            each respective sample in list B)
        3. '\x1f'-separated list of alignment RNAMEs or '\x1c' if no alignments
        4. '\x1f'-separated list of alignment FLAGs or '\x1c' if no alignments
        5. '\x1f-separated list of alignment POSes or '\x1c' if no alignments

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie_exe: filename of Bowtie executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie, e.g., "--tryhard --best"; or None.
        gzip_level: level of gzip compression to use for qname file
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory

        No return value.
    """
    global _input_line_count
    # For storing long qnames
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    qnames_file = os.path.join(temp_dir, "qnames.temp.gz")
    readlet_file = os.path.join(temp_dir, "readlets.temp.gz")
    with xopen(True, qnames_file, "w", gzip_level) as qname_stream:
        with xopen(True, readlet_file, "w", gzip_level) as readlet_stream:
            for (seq_count, ((seq,), xpartition)) in enumerate(xstream(input_stream, 1)):
                # One Bowtie --12 input line per distinct readlet sequence:
                # index, sequence, and dummy "I" qualities of matching length
                print >> readlet_stream, "\t".join([str(seq_count), seq, "I" * len(seq)])
                # Write the first qname before the loop, then the rest; note
                # that only the remaining qnames bump _input_line_count
                print >> qname_stream, next(iter(xpartition))[0]
                for (qname,) in xpartition:
                    _input_line_count += 1
                    print >> qname_stream, qname
                # Separate qnames with single + character
                print >> qname_stream, "+"
    input_command = "gzip -cd %s" % readlet_file
    bowtie_command = " ".join([bowtie_exe, bowtie_args, "-S -t --sam-nohead --mm", bowtie_index_base, "--12 -"])
    # Delegate script lives next to this one: strip ".py" from this file's
    # path and append "_delegate.py"
    delegate_command = "".join(
        [
            sys.executable,
            " ",
            os.path.realpath(__file__)[:-3],
            "_delegate.py --report-multiplier %08f --qnames-file %s %s"
            % (report_multiplier, qnames_file, "--verbose" if verbose else ""),
        ]
    )
    full_command = " | ".join([input_command, bowtie_command, delegate_command])
    print >>sys.stderr, "Starting Bowtie with command: " + full_command
    # pipefail makes the whole pipeline's exit status reflect any stage's
    # failure; -x/-e echo commands and abort on error
    bowtie_process = subprocess.Popen(
        " ".join(["set -exo pipefail;", full_command]),
        bufsize=-1,
        stdout=sys.stdout,
        stderr=sys.stderr,
        shell=True,
        executable="/bin/bash",
    )
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError("Error occurred while reading Bowtie output; " "exitlevel was %d." % return_code)
Example #9
0
            unique_mapped_read_counts[sample_index]) = [
                                        int(token) for token
                                        in tokens[-2].split(',')
                                    ]
# Weight shared equally among samples with a nonzero mapped-read count;
# when no sample has reads, the weight degenerates to zero.
_samples_with_reads = len(
    [count for count in mapped_read_counts.values() if count])
mean_weight = 1. / _samples_with_reads if _samples_with_reads else 0.0
_unique_samples_with_reads = len(
    [count for count in unique_mapped_read_counts.values() if count])
unique_mean_weight = (1. / _unique_samples_with_reads
                      if _unique_samples_with_reads else 0.0)

for (partition_id,), xpartition in xstream(sys.stdin, 1):
    bin_count += 1
    bin_start_time, bin_diff_count = time.time(), 0
    rname = partition_id.rpartition(';')[0]
    rname_index = reference_index.l_rname_to_string[rname]
    coverages, unique_coverages = defaultdict(int), defaultdict(int)
    for (pos, sample_indexes_and_diffs) in itertools.groupby(
                                            xpartition, lambda val: val[0]
                                        ):
        input_line_count += 1
        pos = int(pos)
        for sample_index, diffs in itertools.groupby(
                                sample_indexes_and_diffs, lambda val: val[1]
                            ):
            for _, _, uniqueness, diff in diffs:
                coverages[sample_index] += int(diff)
Example #10
0
import bowtie_index
from dooplicity.tools import xstream

# Print file's docstring if -h is invoked
parser = argparse.ArgumentParser(
    description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
bowtie.add_args(parser)
args = parser.parse_args()
input_line_count, output_line_count = 0, 0

start_time = time.time()

reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx))
# Partition on the first six fields; within a partition, intron lines (3
# values) precede alignment lines, so 'coverage' is complete before any
# alignment is printed -- presumably guaranteed by upstream sort order
# (TODO confirm)
for (_, rname_string, intron_pos, intron_end_pos, sense,
     sample_index), xpartition in xstream(sys.stdin, 6):
    coverage = 0
    for value in xpartition:
        input_line_count += 1
        try:
            # Assume intron line
            _, _, instance_count = value
        except ValueError:
            # Alignment line
            # Rewrite the alignment with the decoded RNAME and append the
            # accumulated intron coverage as an XC:i tag
            print('\t'.join((value[2], value[3],
                             reference_index.string_to_rname[rname_string],
                             str(int(value[1]))) + value[4:]) +
                  ('\tXC:i:%d' % coverage))
            output_line_count += 1
        else:
            coverage += int(instance_count)
Example #11
0
from dooplicity.tools import xstream

# Surface this file's docstring as the -h text
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument(
    '--verbose', action='store_const', const=True, default=False,
    help='Print out extra debugging statements')
bowtie.add_args(parser)
args = parser.parse_args()

start_time = time.time()
input_line_count = 0
# Load the Bowtie index metadata for RNAME lookups
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx))
for key, xpartition in xstream(sys.stdin, 3, skip_duplicates=True):
    '''For computing maximum left and right extend sizes for every key --
    that is, every intron combo (fields 1-3 of input).'''
    left_extend_size, right_extend_size = None, None
    left_size, right_size = None, None
    for value in xpartition:
        assert len(value) == 4
        input_line_count += 1
        left_extend_size = max(left_extend_size, int(value[-4]))
        right_extend_size = max(right_extend_size, int(value[-3]))
        try:
            left_size = max(left_size, int(value[-2]))
        except ValueError:
            left_size = 'NA'
        try:
            right_size = max(right_size, int(value[-1]))
Example #12
0
        sample_index = manifest_object.label_to_index[tokens[0]]
        (mapped_read_counts[sample_index],
         unique_mapped_read_counts[sample_index]) = [
             int(token) for token in tokens[-2].split(',')
         ]
# Each sample with a nonzero mapped-read count gets an equal share of the
# weight; a sample set with no reads at all yields a zero weight.
nonzero_counts = len([c for c in mapped_read_counts.values() if c])
mean_weight = 1. / nonzero_counts if nonzero_counts else 0.0
nonzero_unique_counts = len(
    [c for c in unique_mapped_read_counts.values() if c])
unique_mean_weight = (1. / nonzero_unique_counts
                      if nonzero_unique_counts else 0.0)

# Accumulate per-sample coverage diffs for each genome partition
for (partition_id, ), xpartition in xstream(sys.stdin, 1):
    bin_count += 1
    bin_start_time, bin_diff_count = time.time(), 0
    # partition_id looks like '<rname>;<bin>'; strip the trailing bin index
    rname = partition_id.rpartition(';')[0]
    rname_index = reference_index.l_rname_to_string[rname]
    coverages, unique_coverages = defaultdict(int), defaultdict(int)
    # groupby assumes the partition is sorted by pos, then sample index --
    # presumably guaranteed by the upstream Hadoop sort (TODO confirm)
    for (pos, sample_indexes_and_diffs) in itertools.groupby(
            xpartition, lambda val: val[0]):
        input_line_count += 1
        pos = int(pos)
        for sample_index, diffs in itertools.groupby(sample_indexes_and_diffs,
                                                     lambda val: val[1]):
            for _, _, uniqueness, diff in diffs:
                coverages[sample_index] += int(diff)
                # uniqueness flag '1' marks diffs from uniquely mapped reads
                if uniqueness == '1':
                    unique_coverages[sample_index] += int(diff)
Example #13
0
parser.add_argument(\
    '--verbose', action='store_const', const=True, default=False,
    help='Print out extra debugging statements')
bowtie.add_args(parser)
group_reads.add_args(parser)
args = parser.parse_args()

start_time = time.time()
input_line_count = 0
counter = Counter('cojunction_fasta')
register_cleanup(counter.flush)
reference_index = bowtie_index.BowtieIndexReference(
    os.path.expandvars(args.bowtie_idx))
group_reads_object = group_reads.IndexGroup(args.index_count)
for (rname, poses, end_poses), xpartition in xstream(sys.stdin,
                                                     3,
                                                     skip_duplicates=True):
    counter.add('partitions')
    reverse_strand_string = rname[-1]
    rname = rname[:-1]
    read_seqs = dlist()
    poses = [int(pos) for pos in poses.split(',')]
    end_poses = [int(end_pos) for end_pos in end_poses.split(',')]
    max_left_extend_size, max_right_extend_size = None, None
    for left_extend_size, right_extend_size, read_seq in xpartition:
        counter.add('inputs')
        input_line_count += 1
        max_left_extend_size = max(max_left_extend_size, int(left_extend_size))
        max_right_extend_size \
            = max(max_right_extend_size, int(right_extend_size))
        read_seqs.append(read_seq)
Example #14
0
# To convert sample-rname index to sample index-rname index tuple
sample_and_rname_indexes = SampleAndRnameIndexes(
                                                    manifest_object,
                                                    args.output_by_chromosome
                                                )

import time
start_time = time.time()
from alignment_handlers import AlignmentPrinter
alignment_printer = AlignmentPrinter(manifest_object, reference_index,
                                        tie_margin=args.tie_margin)
input_line_count = 0
if args.suppress_bam:
    # Just grab stats
    if args.output_by_chromosome:
        for (index, _), xpartition in xstream(sys.stdin, 2):
            sample_index, rname_index = (
                    sample_and_rname_indexes.sample_and_rname_indexes(index)
                )
            unique_count, total_count = 0, 0
            for record in xpartition:
                # record[2] is presumably the SAM FLAG here; bit 0x100
                # marks secondary alignments, which are excluded from the
                # totals -- TODO confirm field layout against upstream step
                if not (int(record[2]) & 256):
                    total_count += 1
                    try:
                        # seq is at position 8
                        if alignment_printer.unique(record, seq_index=8):
                            unique_count += 1
                    except IndexError:
                        # Unmapped read; it's unique
                        unique_count += 1
                input_line_count += 1
Example #15
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie_exe='bowtie',
       bowtie_index_base='genome',
       bowtie_args='',
       gzip_level=3,
       verbose=False,
       report_multiplier=1.2,
       scratch=None):
    """ Runs Rail-RNA-align_readlets.

        Aligns input readlet sequences and writes a single output line per
        readlet belonging to a distinct read sequence.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Input is partitioned by field 1, the readlet sequence or its reversed
        complement.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited output tuple columns, where each line corresponds to a
        readlet from a distinct read rather than a unique readlet sequence:
        1. Read sequence ID
        2. Displacement of readlet's 5' end from read's 5' end + '\x1e' +
            displacement of readlet's 3' end from read's 3' end (+, for EXACTLY
            one readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            number of instances of read sequence + '\x1e' + number of instances
            of read sequence's reversed complement + '\x1e' (+, for EXACTLY one
            readlet of a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence))] + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective
            sample in list A) + '\x1e' + (an '\x1f'-separated list of the
            number of instances of the read sequence's reversed complement for
            each respective sample in list B)
        3. '\x1f'-separated list of alignment RNAMEs or '\x1c' if no alignments
        4. '\x1f'-separated list of alignment FLAGs or '\x1c' if no alignments
        5. '\x1f-separated list of alignment POSes or '\x1c' if no alignments

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie_exe: filename of Bowtie executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie index files associated
            with the reference.
        bowtie_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie, e.g., "--tryhard --best"; or None.
        gzip_level: level of gzip compression to use for qname file
        verbose: True iff more informative messages should be written to
            stderr.
        report_multiplier: if verbose is True, the line number of an alignment
            written to stderr increases exponentially with base
            report_multiplier.
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory

        No return value.
    """
    global _input_line_count
    # For storing long qnames
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    qnames_file = os.path.join(temp_dir, 'qnames.temp.gz')
    readlet_file = os.path.join(temp_dir, 'readlets.temp.gz')
    with xopen(True, qnames_file, 'w', gzip_level) as qname_stream:
        with xopen(True, readlet_file, 'w', gzip_level) as readlet_stream:
            for (seq_count, ((seq,), xpartition)) \
                in enumerate(xstream(input_stream, 1)):
                # One Bowtie --12 input line per distinct readlet sequence:
                # index, sequence, and dummy 'I' qualities of matching length
                print >>readlet_stream, \
                    '\t'.join([str(seq_count), seq, 'I'*len(seq)])
                # First qname goes out ahead of the loop; only the remaining
                # qnames contribute to _input_line_count
                print >> qname_stream, next(iter(xpartition))[0]
                for (qname, ) in xpartition:
                    _input_line_count += 1
                    print >> qname_stream, qname
                # Separate qnames with single + character
                print >> qname_stream, '+'
    input_command = 'gzip -cd %s' % readlet_file
    bowtie_command = ' '.join([
        bowtie_exe, bowtie_args, '-S -t --sam-nohead --mm', bowtie_index_base,
        '--12 -'
    ])
    # Delegate script lives next to this one: strip '.py' from this file's
    # path and append '_delegate.py'
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        '_delegate.py --report-multiplier %08f --qnames-file %s %s' %
        (report_multiplier, qnames_file, '--verbose' if verbose else '')
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >> sys.stderr, 'Starting Bowtie with command: ' + full_command
    # pipefail makes the pipeline's exit status reflect any stage's failure;
    # -x/-e echo commands and abort on error
    bowtie_process = subprocess.Popen(' '.join(
        ['set -exo pipefail;', full_command]),
                                      bufsize=-1,
                                      stdout=sys.stdout,
                                      stderr=sys.stderr,
                                      shell=True,
                                      executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading Bowtie output; '
                           'exitlevel was %d.' % return_code)
Example #16
0
def go(manifest_object, input_stream=sys.stdin, output_stream=sys.stdout,
        sample_fraction=0.05, coverage_threshold=5, verbose=False):
    """ Runs Rail-RNA-bed_pre

        Writes indels and junctions for outputting BEDs by sample and
        TSVs across samples.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. 'I', 'D', or 'N' for insertion, deletion, or junction line
        2. Number string representing RNAME
        3. Start position (Last base before insertion, first base of deletion,
                            or first base of intron)
        4. End position (Last base before insertion, last base of deletion
                            (exclusive), or last base of intron (exclusive))
        5. '+' or '-' indicating which strand is the sense strand for
            junctions, inserted sequence for insertions, or deleted sequence
            for deletions
        6. Sample index
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. Number of nucleotides between 5' end of intron and 5' end of read
            from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND. That is, if the sense strand is the reverse strand,
            this is the distance between the 3' end of the read and the 3' end
            of the intron.
        8. Number of nucleotides between 3' end of intron and 3' end of read
            from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND.
        --------------------------------------------------------------------
        9. Number of instances of junction, insertion, or deletion in sample;
            this is always +1 before bed_pre combiner/reducer

        Input is partitioned by fields 1-5 and sorted by field 6.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited output tuple columns (bed):
        1. 'I', 'D', or 'N' for insertion, deletion, or junction line
        2. Sample index
        3. Number string representing RNAME (+ '+ or -' if junction; same as
            field 6)
        4. Start position (Last base before insertion, first base of deletion,
                            or first base of intron)
        5. End position (Last base before insertion, last base of deletion
                            (exclusive), or last base of intron (exclusive))
        6. '+' or '-' indicating which strand is the sense strand for
            junctions, inserted sequence for insertions, or deleted sequence
            for deletions
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. MAX number of nucleotides between 5' end of intron and 5' end of
            read from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND. That is, if the sense strand is the reverse strand,
            this is the distance between the 3' end of the read and the 3' end
            of the intron.
        8. MAX number of nucleotides between 3' end of intron and 3' end of
            read from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND.
        9. MAXIMIN (number of nucleotides between 5' end of intron and 5' end
                    of read, number of nucleotides between 3' end of intron and
                    3' end of read);
           min is between the args above; max is across reads.

        Tab-delimited output tuple columns (collect)
        1. '0' if insertion, '1' if deletion, or '2' if junction line
        2. Number string representing RNAME (+ '+ or -' if junction; same as
                                                field 6)
        3. Start position (Last base before insertion, first base of deletion,
                            or first base of intron)
        4. End position (Last base before insertion, last base of deletion
                            (exclusive), or last base of intron (exclusive))
        5. '+' or '-' indicating which strand is the sense strand for
            junctions, inserted sequence for insertions, or deleted sequence
            for deletions
        6. Coverage of feature for sample with index N
        ...
        N + 6. Coverage of feature in sample with index N
        --------------------------------------------------------------------
        10. SUMMED number of instances of junction, insertion, or deletion in
            sample

        OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input indels/junctions
        output_stream: where to write output
        manifest_object: object of class LabelsAndIndices that maps indices
            to labels and back; used to count number of samples.
        sample_fraction: fraction of samples in which an indel must appear
            to pass filter if coverage_threshold criterion is not satisfied
        coverage_threshold: number of reads that must overlap indel in at
            least one sample to pass filter of sample_fraction criterion is not
            satisfied
        verbose: output extra debugging statements

        Return value: tuple (input line count, output line count)
    """
    input_line_count, output_line_count = 0, 0

    '''Compute minimum number of samples in which indel should appear to be
    output if coverage threshold not met.'''
    total_sample_count = len(manifest_object.label_to_index)
    min_sample_count = int(round(
                total_sample_count * sample_fraction
            ))

    # Each partition is a single feature (fields 1-5); within a partition,
    # lines arrive sorted by sample index (field 6).
    for (line_type, rname, pos, end_pos, strand_or_seq), xpartition in xstream(
                input_stream, 5
            ):
        # For junctions, collect output reports the last base of the intron
        # inclusively, hence the end_pos - 1 conversion.
        collect_specs = [rname, pos, end_pos if line_type != 'N'
                                             else str(int(end_pos) - 1),
                                     strand_or_seq]
        coverages = []
        # i tracks the next expected sample index so that samples reporting
        # no instances of the feature get explicit 0 coverage entries.
        i = 0
        if line_type == 'N':
            # Junction: aggregate displacements and coverage per sample.
            # groupby works because input is sorted by sample index.
            for sample_index, data in itertools.groupby(
                                                    xpartition, 
                                                    key=lambda val: val[0]
                                                ):
                sample_index = int(sample_index)
                while i != sample_index:
                    # Write 0 coverage for sample indexes reporting 0 junctions
                    coverages.append(0)
                    i += 1
                coverage_sum = 0
                max_left_displacement, max_right_displacement = None, None
                maximin_displacement = None
                for _, left_displacement, right_displacement, coverage in data:
                    input_line_count += 1
                    left_displacement = int(left_displacement)
                    right_displacement = int(right_displacement)
                    # In Python 2, max(x, None) == x for any int x, so None
                    # seeds the running maxima below.
                    max_left_displacement = max(left_displacement,
                                                max_left_displacement)
                    max_right_displacement = max(right_displacement,
                                                 max_right_displacement)
                    maximin_displacement = max(
                            min(left_displacement, right_displacement),
                            maximin_displacement
                        )
                    coverage_sum += int(coverage)
                assert max_left_displacement is not None
                assert max_right_displacement is not None
                assert maximin_displacement is not None
                print >>output_stream, \
                    'bed\tN\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d' % (
                        sample_index, rname, pos, end_pos, strand_or_seq,
                        max_left_displacement, max_right_displacement,
                        maximin_displacement, coverage_sum
                    )
                coverages.append(coverage_sum)
                i += 1
                output_line_count += 1
            # Junction collect lines are written unconditionally; only
            # indels (below) are subject to the coverage/sample filter.
            output_stream.write('collect\t2\t')
            print >>output_stream, '\t'.join(
                        collect_specs
                        + [str(coverage_value) for coverage_value in coverages]
                        + ['0']*(total_sample_count - len(coverages))
                    )
            output_line_count += 1
        else:
            assert line_type in 'ID'
            sample_count = 0
            for sample_index, data in itertools.groupby(
                                                    xpartition, 
                                                    key=lambda val: val[0]
                                                ):
                sample_index = int(sample_index)
                while i != sample_index:
                    # Write 0 coverage for sample indexes reporting 0 indels
                    coverages.append(0)
                    i += 1
                coverage_sum = 0
                # Displacement fields are '\x1c' placeholders for indels;
                # only the coverage field matters here.
                for _, _, _, coverage in data:
                    input_line_count += 1
                    coverage_sum += int(coverage)
                print >>output_stream, \
                    'bed\t%s\t%s\t%s\t%s\t%s\t%s\t\x1c\t\x1c\t\x1c\t%d' % (
                        line_type, sample_index, rname, pos, end_pos,
                        strand_or_seq, coverage_sum
                    )
                coverages.append(coverage_sum)
                sample_count += 1
                i += 1
                output_line_count += 1
            max_coverage = max(coverages)
            # Pass the filter if the indel appears in enough samples OR is
            # covered deeply enough in at least one sample; a
            # coverage_threshold of -1 disables the coverage criterion.
            if (sample_count >= min_sample_count
                or (max_coverage >= coverage_threshold
                    and coverage_threshold != -1)):
                if line_type == 'I':
                    output_stream.write('collect\t0\t')
                else:
                    output_stream.write('collect\t1\t')
                print >>output_stream, \
                    '\t'.join(
                        collect_specs 
                        + [str(coverage_value) for coverage_value in coverages]
                        + ['0']*(total_sample_count - len(coverages))
                    )
                output_line_count += 1
            elif verbose:
                print >>sys.stderr, (
                        'Indel (%s, %s, %s, %s) filtered out; it appeared in '
                        '%d sample(s), and its coverage in any one sample did '
                        'not exceed %d.'
                    ) % (rname, strand_or_seq, pos, end_pos, sample_count,
                            max_coverage)
    return input_line_count, output_line_count
Example #17
0
        print >> sizes_stream, '%s %d' % (rname,
                                          reference_index.rname_lengths[rname])

input_line_count, output_line_count = 0, 0
output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory; it may already exist, which is fine.
    # Catch only filesystem errors rather than hiding arbitrary failures
    # behind a bare except.
    try:
        os.makedirs(output_url.to_url())
    except OSError:
        pass
mover = filemover.FileMover(args=args)
# Template for bedGraph track definition lines emitted per sample
track_line = ('track type=bedGraph name="{name}" '
              'description="{description}" visibility=full '
              'color=227,29,118 altColor=0,179,220 priority=400')
for (sample_index, ), xpartition in xstream(sys.stdin, 1):
    try:
        sample_label = manifest_object.index_to_label[sample_index]
    except KeyError:
        # It's a mean or median
        if 'mean' in sample_index or 'median' in sample_index:
            sample_label = sample_index
        else:
            raise RuntimeError('Sample label index "%s" was not recorded.' %
                               sample_label)
    '''Dictionary for which each key is a coverage (i.e., number of ECs
    covering a given base). Its corresponding value is the number of bases with
    that coverage.'''
    coverage_histogram, unique_coverage_histogram = (defaultdict(int),
                                                     defaultdict(int))
    with open(bed_filename, 'w') as bed_stream, \
def go(qname_stream, output_stream=sys.stdout, input_stream=sys.stdin,
        verbose=False, report_multiplier=1.2):
    """ Emits readlet alignments.

        qname_stream contains long QNAMEs in the order in which readlets passed
        to Bowtie appeared. These names would have been truncated. Each QNAME
        takes the form

            '\x1d'-separated list of [read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B)]

        A line is written per readlet per associated read sequence. So if a
        given readlet can be found on 3 reads, 3 lines are written, each
        containing the readlet's alignments.

        qname_stream: where to retrieve extended qnames
        input_stream: where to retrieve Bowtie output
        output_stream: where to emit exon and intron tuples; typically, this is
            sys.stdout.
        verbose: True if alignments should occasionally be written to stderr.
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname,), xpartition in xstream(input_stream, 1):
        '''While labeled multireadlet, this list may end up simply a
        unireadlet.'''
        multireadlet = []
        for tokens in xpartition:
            (flag, rname, pos, mapq, cigar,
                rnext, pnext, tlen, seq, qual) = tokens[:10]
            flag = int(flag)
            multireadlet.append((rname, flag, pos))
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int((next_report_line + 1)
                                        * report_multiplier + 1) - 1
            i += 1
        '''If the next qname doesn't match the last qname or there are no
        more lines, all of a multireadlet's alignments have been
        collected.'''
        if not flag & 4:
            '''Last readlet has at least one alignment; print all
            alignments for each read from which readlet sequence is
            derived.'''
            rnames, flags, poses = zip(*multireadlet)
            reverse_flags = [a_flag ^ 16 for a_flag in flags]
            flags = '\x1f'.join([str(a_flag) for a_flag in flags])
            reverse_flags = '\x1f'.join(
                                    [str(a_flag) for a_flag
                                        in reverse_flags]
                                )
            rnames = '\x1f'.join(rnames)
            poses = '\x1f'.join(poses)
            read = qname_stream.readline().strip()
            while read != '+':
                read_id, _, read_rest = read.partition('\x1e')
                # A trailing '-' on the read ID means the readlet sequence
                # was reverse-complemented, so report strand-flipped flags.
                if read_id[-1] == '-':
                    current_flags = reverse_flags
                else:
                    current_flags = flags
                print >>output_stream, '%s\t%s\t%s\t%s\t%s' % \
                    (read_id[:-1], read_rest, rnames,
                        current_flags, poses)
                output_line_count += 1
                read = qname_stream.readline().strip()
        else:
            '''Readlet had no reported alignments; print ONLY when readlet
            contains general info about read.'''
            read = qname_stream.readline().strip()
            while read != '+':
                read_id, _, read_rest = read.partition('\x1e')
                if len(read_rest.split('\x1e')) > 2:
                    print >>output_stream, \
                        '%s\t%s\t\x1c\t\x1c\t\x1c' % (read_id[:-1],
                                                        read_rest)
                    # Count only lines actually written; the increment
                    # previously sat outside this if, overstating the
                    # output-line total reported below.
                    output_line_count += 1
                read = qname_stream.readline().strip()
    output_stream.flush()
    print >>sys.stderr, ('align_readlets_delegate.py reports %d output lines.'
                            % output_line_count)
Example #19
0
def go(input_stream=sys.stdin, output_stream=sys.stdout, fudge=5,
        stranded=False, verbose=False, max_refs=300, report_multiplier=1.2):
    """ Emits junction combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used infer which cojunctions could possibly be
        overlapped by reads. Then maximal cliques of the graph described in
        the maximal_cliques() function are enumerated to obtain which
        junction combinations could possibly be overlapped by reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and junction tuples; typically, this
            is sys.stdout.
        fudge: by how many bases to extend left and right extend sizes
            to accommodate potential indels
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the junction's strand.
        verbose: True if alignments should occasionally be written to stderr.
        max_refs: maximum number of reference sequences to enumerate per read;
            if more are present, prioritize those sequences that overlap
            the fewest junctions
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname,), xpartition in xstream(input_stream, 1):
        '''While labeled multireadlet, this list may end up simply a
        unireadlet.'''
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int((next_report_line + 1)
                                        * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname,) + tokens)
        if flag & 4: continue
        corrected_multiread = multiread_with_junctions(multiread,
                                                        stranded)
        cojunctions, all_junctions = defaultdict(set), {}
        for alignment in multiread_with_junctions(multiread, stranded):
            cigar = alignment[5]
            md = [field for field in alignment
                    if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                    _reversed_complement_translation_table
                )
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                        if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_junctions:
                all_junctions[(rname, sense)] = defaultdict(list)
            _, _, junctions, _, _ = indels_junctions_exons_mismatches(
                                                cigar, md, pos, seq,
                                                junctions_only=True
                                            )
            cojunctions[(rname, sense)].add(
                    tuple([(junction[0], junction[1])
                                for junction in junctions])
                )
            for junction in junctions:
                if (junction[0], junction[1]) \
                    not in all_junctions[(rname, sense)]:
                    all_junctions[(rname, sense)][(junction[0], junction[1])] \
                        = [junction[2], junction[3]]
                else:
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][0] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][0], junction[2])
                    all_junctions[(rname, sense)][
                            (junction[0], junction[1])
                        ][1] = max(all_junctions[(rname, sense)][
                                (junction[0], junction[1])
                            ][1], junction[3])
        for rname, sense in all_junctions:
            to_write = set()
            for cojunction in selected_cojunctions(paths_from_cojunctions(
                    list(cojunctions[(rname, sense)]), span=(seq_size + fudge)
                ), max_refs=max_refs, seq=seq, rname=rname, sense=sense):
                left_extend_size = all_junctions[(rname, sense)][
                                        cojunction[0]
                                    ][0]
                right_extend_size = all_junctions[(rname, sense)][
                                        cojunction[-1]
                                    ][1]
                to_write.add(('{rname}{sense}\t{starts}'
                       '\t{ends}\t{left_size}'
                       '\t{right_size}\t{seq}').format(
                            rname=rname,
                            sense=sense,
                            starts=','.join(
                                    [str(junction[0])
                                        for junction in cojunction]
                                ),
                            ends=','.join(
                                    [str(junction[1])
                                        for junction in cojunction]
                                ),
                            left_size=(left_extend_size
                                        + fudge),
                            right_size=(right_extend_size
                                        + fudge),
                            seq=seq_to_print
                       ))
            for line_to_write in to_write:
                print line_to_write
                output_line_count += 1
    output_stream.flush()
    print >>sys.stderr, ('cojunction_enum_delegate.py reports %d output lines.'
                            % output_line_count)
Example #20
0
site.addsitedir(base_path)

import bowtie
import bowtie_index
from dooplicity.tools import xstream

# Print file's docstring if -h is invoked
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
bowtie.add_args(parser)
args = parser.parse_args()
input_line_count, output_line_count = 0, 0

start_time = time.time()

reference_index = bowtie_index.BowtieIndexReference(os.path.expandvars(args.bowtie_idx))
# Partitions are keyed on 6 fields: line type, RNAME string, intron start/end,
# sense strand, and sample index; intron and alignment lines share partitions.
for (_, rname_string, intron_pos, intron_end_pos, sense, sample_index), xpartition in xstream(sys.stdin, 6):
    coverage = 0
    for value in xpartition:
        input_line_count += 1
        try:
            # Assume intron line
            _, _, instance_count = value
            # NOTE(review): instance_count is unpacked but never folded into
            # coverage, so XC:i below is always 0 -- looks like a missing
            # "coverage += int(instance_count)"; confirm against the full
            # pipeline before relying on the XC field.
        except ValueError:
            # Alignment line
            print(
                "\t".join(
                    (value[2], value[3], reference_index.string_to_rname[rname_string], str(int(value[1]))) + value[4:]
                )
                + ("\tXC:i:%d" % coverage)
            )
            output_line_count += 1
Example #21
0
else:
    output_path = os.path.join(temp_dir_path, output_filename)

input_line_count = 0
# Collect RNAMEs by walking the zero-padded numeric keys of the Bowtie
# index in order (descending reference length).
sorted_rnames = []
for rname_number in xrange(len(reference_index.string_to_rname)):
    sorted_rnames.append(
            reference_index.string_to_rname['%012d' % rname_number]
        )
sample_indexes_seen = set()
with xopen(True, output_path, 'w', args.gzip_level) as output_stream:
    print >>output_stream, '\t'.join(
                    [''] + sorted_rnames
                    + ['total mapped reads', 'total reads']
                )
    for (_, sample_index), xpartition in xstream(sys.stdin, 2):
        counter.add('partitions')
        sample_label = manifest_object.index_to_label[sample_index]
        total_counts, unique_counts = defaultdict(int), defaultdict(int)
        for rname_index, total_count, unique_count in xpartition:
            counter.add('inputs')
            rname = reference_index.string_to_rname[rname_index]
            total_counts[rname] = int(total_count)
            unique_counts[rname] = int(unique_count)
        total_reads = sum(total_counts.values())
        total_mapped_reads = total_reads - total_counts['*']
        total_uniques = sum(unique_counts.values())
        total_mapped_uniques = total_uniques - unique_counts['*']
        counter.add('total_reads', total_reads)
        counter.add('total_mapped_reads', total_mapped_reads)
        counter.add('total_uniques', total_uniques)
Example #22
0
def input_files_from_input_stream(input_stream,
                                  output_stream,
                                  temp_dir_path=None,
                                  verbose=False,
                                  gzip_level=3):
    """ Generates FASTA reference to index and file with reads.

        Each line of the read file is in the following format:

        read number <TAB> SEQ <TAB> QUAL

        input_stream: where to find Hadoop input
        output_stream: where to write unmapped reads
        temp_dir_path: where to store files
        verbose: output extra debugging messages
        gzip_level: gzip compression level (0-9)

        Yield value: tuple (path to FASTA reference file, path to read file)
    """
    global _input_line_count
    if temp_dir_path is None: temp_dir_path = tempfile.mkdtemp()
    prefasta_filename = os.path.join(temp_dir_path, 'temp.prefa')
    deduped_fasta_filename = os.path.join(temp_dir_path, 'temp.deduped.prefa')
    final_fasta_filename = os.path.join(temp_dir_path, 'temp.fa')
    reads_filename = os.path.join(temp_dir_path, 'reads.temp.gz')
    for (group_counter, ((index_group, ),
                         xpartition)) in enumerate(xstream(input_stream, 1)):
        counter.add('partitions')
        if verbose:
            print >> sys.stderr, (
                'Group %d: Writing prefasta and input reads...' %
                group_counter)
        with open(prefasta_filename, 'w') as fasta_stream:
            # Pass gzip_level through; previously the parameter was accepted
            # but ignored, so the default gzip level was always used here.
            with xopen(True, reads_filename, 'w', gzip_level) as read_stream:
                for read_seq, values in itertools.groupby(
                        xpartition, key=lambda val: val[0]):
                    fasta_printed = False
                    counter.add('inputs')
                    for value in values:
                        _input_line_count += 1
                        if value[1][0] == '0':
                            # Print FASTA line
                            print >> fasta_stream, '\t'.join(
                                [value[1][1:-2], value[2]])
                            fasta_printed = True
                        elif fasta_printed:
                            '''Add to temporary seq stream only if an
                            associated FASTA line was found.'''
                            if value[1] == '1':
                                print >> read_stream, '\t'.join(
                                    [value[2], read_seq, value[3]])
                            else:
                                print >> read_stream, '\t'.join([
                                    value[2], read_seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    ), value[3][::-1]
                                ])
                        else:
                            # Print unmapped read
                            if value[1] == '1':
                                seq_to_write = read_seq
                                qual_to_write = value[3]
                            else:
                                # Reverse-complement sequence; reverse quals
                                seq_to_write = read_seq[::-1].translate(
                                    _reversed_complement_translation_table)
                                qual_to_write = value[3][::-1]
                            '''Write only essentials; handle "formal" writing
                            in next step.'''
                            output_stream.write(
                                '%s\t4\t\x1c\t\x1c\t\x1c\t\x1c'
                                '\t\x1c\t\x1c\t\x1c\t%s\t%s\n' %
                                (value[2], seq_to_write, qual_to_write))
        if verbose:
            print >> sys.stderr, (
                'Group %d: Done! Sorting and deduplicating prefasta...' %
                group_counter)
        # Sort prefasta and eliminate duplicate lines
        dedup_process_return = subprocess.call(
            r'''sort %s | uniq >%s''' %
            (prefasta_filename, deduped_fasta_filename),
            shell=True,
            executable='/bin/bash')
        if dedup_process_return != 0:
            raise RuntimeError(
                'Problem encountered deduplicating FASTA reference')
        if verbose:
            print >> sys.stderr, ('Group %d Done! Writing final FASTA.' %
                                  group_counter)
        with open(final_fasta_filename, 'w') as final_fasta_stream:
            with open(deduped_fasta_filename) as fasta_stream:
                for line in fasta_stream:
                    rname, seq = line.strip().split('\t')
                    print >> final_fasta_stream, rname
                    # Wrap sequence at 80 characters per FASTA convention
                    final_fasta_stream.write('\n'.join(
                        [seq[i:i + 80] for i in xrange(0, len(seq), 80)]))
                    final_fasta_stream.write('\n')
        os.remove(deduped_fasta_filename)
        os.remove(prefasta_filename)
        output_stream.flush()
        yield final_fasta_filename, reads_filename
Example #23
0
if output_url.is_local:
    # Set up destination directory; it may already exist. Catch only
    # filesystem errors rather than every exception.
    try: os.makedirs(output_url.to_url())
    except OSError: pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])

input_line_count = 0
counter = Counter('tsv')
register_cleanup(counter.flush)

for (line_type,), xpartition in xstream(sys.stdin, 1):
    type_string = ('insertions' if line_type == '0' else
                    ('deletions' if line_type == '1' else
                      ('junctions' if line_type == '2' else
                         ('coverages' + line_type[1:]
                            if line_type.startswith('3') else
                                'normalization'))))
    counter.add(type_string + '_partitions')
    output_filename = ((args.tsv_basename + '.'
                          if args.tsv_basename != '' else '')
                          + type_string + '.tsv.gz')
    if output_url.is_local:
        output_path = os.path.join(args.out, output_filename)
    else:
        output_path = os.path.join(temp_dir_path, output_filename)
    with xopen(True, output_path, 'w', args.gzip_level) as output_stream:
Example #24
0
# Write chromosome sizes file used by bedGraph tooling
with open(sizes_filename, 'w') as sizes_stream:
    for rname in reference_index.rname_lengths:
        print >>sizes_stream, '%s %d' % (rname,
            reference_index.rname_lengths[rname])

input_line_count, output_line_count = 0, 0
output_url = Url(args.out)
if output_url.is_local:
    # Set up destination directory; it may already exist. Catch only
    # filesystem errors rather than every exception.
    try: os.makedirs(output_url.to_url())
    except OSError: pass
mover = filemover.FileMover(args=args)
# Template for bedGraph track definition lines emitted per sample
track_line = ('track type=bedGraph name="{name}" '
         'description="{description}" visibility=full '
         'color=227,29,118 altColor=0,179,220 priority=400')
for (sample_index,), xpartition in xstream(sys.stdin, 1):
    counter.add('partitions')
    real_sample = True
    try:
        sample_label = manifest_object.index_to_label[sample_index]
    except KeyError:
        # It's a nonref track, a mean, or a median
        real_sample = False
        if search('\.[ATCGN]', sample_index):
            try:
                sample_label = (
                        manifest_object.index_to_label[sample_index[:-2]]
                        + sample_index[-2:]
                    )
            except KeyError:
                raise RuntimeError('Sample label index "%s" was not recorded.'
Example #25
0
def go(manifest_object, input_stream=sys.stdin, output_stream=sys.stdout,
        sample_fraction=0.05, coverage_threshold=5, collect_junctions=False,
        verbose=False):
    """ Runs Rail-RNA-junction_filter.

        A junction from input_stream survives the filter iff either:
          (1) it appears in at least round(sample_fraction * total number of
            samples) samples, OR
          (2) it is covered by at least coverage_threshold reads in some
            single sample (and coverage_threshold is not -1).

        Input (read from stdin)
        ----------------------------
        Tab-delimited columns:
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Intron start position (inclusive)
        3. Intron end position (exclusive)
        4. '\x1f'-separated list of sample indexes in which junction was found
        5. '\x1f'-separated list of numbers of reads in which junction was
            found in respective sample specified by field 4

        Input is partitioned by fields 1-3.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns (filter):
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Sample index
        3. Intron start position (inclusive)
        4. Intron end position (exclusive)

        If --collect-junctions is True, additional (collect) tuples mirror
        the input lines: RNAME + strand, intron start (inclusive), intron end
        (exclusive), comma-separated sample indexes, and comma-separated
        read counts per sample. These are emitted for every junction,
        regardless of the filter.

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        manifest_object: LabelsAndIndices instance mapping sample labels to
            indices and back; its size gives the total number of samples.
        input_stream: where to find input junctions.
        output_stream: where to write output.
        sample_fraction: fraction of all samples a junction must occur in to
            pass when the coverage criterion is not satisfied.
        coverage_threshold: read count a junction must reach in at least one
            sample to pass when the sample-fraction criterion is not
            satisfied; -1 disables this criterion.
        collect_junctions: collects and outputs junctions across samples.
        verbose: output extra debugging statements.

        Return value: tuple (input line count, output line count)
    """
    input_line_count, output_line_count = 0, 0
    # Minimum number of distinct samples needed for criterion (1)
    min_sample_count = int(round(
            sample_fraction * len(manifest_object.label_to_index)
        ))
    for (rname_and_strand, pos, end_pos), xpartition in xstream(
                input_stream, 3
            ):
        counter.add('partitions')
        # Total read coverage of this junction, keyed by sample index
        coverage_by_sample = defaultdict(int)
        for index_field, count_field in xpartition:
            input_line_count += 1
            counter.add('inputs')
            counts = count_field.split('\x1f')
            for i, sample in enumerate(index_field.split('\x1f')):
                coverage_by_sample[sample] += int(counts[i])
        pos, end_pos = int(pos), int(end_pos)
        if collect_junctions:
            in_sample_order = sorted(coverage_by_sample.items(),
                                     key=lambda item: int(item[0]))
            counter.add('collect_junction_lines')
            print >>output_stream, 'collect\t%s\t%012d\t%012d\t%s\t%s' % (
                    rname_and_strand, pos, end_pos,
                    ','.join(item[0] for item in in_sample_order),
                    ','.join(str(item[1]) for item in in_sample_order)
                )
            output_line_count += 1
        sample_count = len(coverage_by_sample)
        max_coverage = max(coverage_by_sample.values())
        passes_filter = end_pos > pos and (
                sample_count >= min_sample_count
                or (coverage_threshold != -1
                    and max_coverage >= coverage_threshold)
            )
        if passes_filter:
            for sample in coverage_by_sample:
                counter.add('junctions_passing_filter')
                print >>output_stream, 'filter\t%s\t%s\t%012d\t%012d' % (
                        rname_and_strand, sample,
                        pos, end_pos
                    )
                output_line_count += 1
        else:
            counter.add('junctions_failing_filter')
            if verbose:
                print >>sys.stderr, (
                        'Junction (%s, %d, %d) filtered out; it appeared in %d '
                        'sample(s), and its coverage in any one sample did '
                        'not exceed %d.'
                    ) % (rname_and_strand, pos, end_pos,
                            sample_count, max_coverage)
    return input_line_count, output_line_count
Example #26
0
# Recover the Bowtie 2 options needed downstream: -k (alignment count to
# report), the seed, and whether nondeterministic seeding was requested
alignment_count_to_report, seed, non_deterministic \
    = bowtie.parsed_bowtie_args(bowtie_args)

# Handles binning and printing of alignment records for downstream steps
alignment_printer = AlignmentPrinter(manifest_object,
                                     reference_index,
                                     output_stream=sys.stdout,
                                     bin_size=args.partition_length,
                                     exon_ivals=args.exon_intervals,
                                     exon_diffs=args.exon_differentials,
                                     drop_deletions=args.drop_deletions,
                                     output_bam_by_chr=args.output_bam_by_chr,
                                     tie_margin=args.tie_margin)
input_line_count, output_line_count = 0, 0
start_time = time.time()  # for reporting elapsed wall-clock time later

for (qname, ), xpartition in xstream(sys.stdin, 1):
    alignments = [(qname, ) + alignment for alignment in xpartition]
    input_line_count += len(alignments)
    intron_counts = [alignment[5].count('N') for alignment in alignments]
    min_intron_count = min(intron_counts)
    if not min_intron_count:
        '''There is at least one alignment that overlaps no introns; report 
        an alignment with the highest score at random. Separate into alignments
        that overlap the fewest introns and alignments that don't.'''
        clipped_alignments = [
            alignments[i] for i in xrange(len(intron_counts))
            if intron_counts[i] == 0
        ]
        alignments_and_scores = [(alignment, [
            int(tokens[5:]) for tokens in alignment if tokens[:5] == 'AS:i:'
        ][0]) for alignment in clipped_alignments]
Example #27
0
def edges_from_input_stream(input_stream,
                            readlet_size=20,
                            min_overlap_exon_size=1):
    """ Generates edges of directed acyclic graph (DAG) of introns.

        A DAG is constructed for each strand. Each node of the DAG represents
        a unique intron and is labeled by the tuple (intron_start, intron_end),
        where intron_start is the (1-based) coordinate of the first base of the
        intron, and intron_end is the coordinate of the first base after the
        intron. An edge occurs between two introns A and B iff they do not
        overlap, and no intron C occurs between A and B such that A, B, and C 
        do not overlap. The intron with larger coordinates is the child of
        the intron with smaller coordinates. Weight each edge by the number of
        exonic bases between the introns it connects.

        The DAG has sources and sinks. Pad the DAG with new sources and sinks
        as follows. Extend an edge to each original source
        (source_start, source_end) from a new source labeled by the tuple
        (None, max(1, source_start - readlet_size + 1)). Extend an edge
        from each original sink (sink_start, sink_end) to a new sink labeled by
        the tuple (sink_end + readlet_size - 1, None).
        So each original source is assigned exactly one new source, and each
        original sink is assigned exactly one new sink.

        The paths through this DAG span all possible combinations of
        nonoverlapping introns on the strand. Finding all subpaths (sequences
        of introns), each of whose weights is <= readlet_size, redundantly
        enumerates all possible combinations of introns a readlet
        can overlap. Unfortunately, obtaining such combinations in
        "hot spots," where there are many alternative splicings and short
        exons, can become computationally intractable for large readlet sizes.
        To (optionally) control these combinatorial blow-ups, impose an
        effective minimum exon size min_overlap_exon_size by redefining overlap
        between two introns: two introns overlap if there are fewer than
        min_overlap_exon_size exonic bases between them.

        The algorithm for generating the edges of the DAG operates on an input
        stream whose lines are composed of the following tab-separated fields:

        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Sample index
        3. Intron start position (inclusive)
        4. Intron end position (exclusive)

        The input is partitioned by strand/sample index (fields 1-2) and sorted
        by the remaining fields. INPUT COORDINATES ARE ASSUMED TO BE 1-INDEXED.

        Introns are sorted by start position. To begin, the first set of
        mutually overlapping introns are connected to their new sources. Two
        data structures encode the graph as it is constructed [1]: the set
        unlinked_nodes, containing introns that do not yet have any children,
        and linked_nodes, a dictionary that, where possible, maps each intron A
        to its corresponding successive nonoverlapping intron B with the
        smallest end position read so far. With each new intron N read, the
        nodes in unlinked_nodes and linked_nodes are checked for whether N is
        their child, and these edges are yielded. (Note that by construction,
        nodes are streamed in topological order.) Nodes from unlinked_nodes may
        thus be promoted to linked_nodes. Each value node in linked_nodes is
        also replaced with N if N has a smaller end position. Then every node A
        in linked_nodes is checked for a path A -> B -> N. If such a path
        exists, an edge can NEVER connect A with any successive introns, and A
        is removed from linked_nodes. The algorithm continues until the end of
        the strand is reached, when the edges connecting remaining
        unlinked_nodes and their new sinks are yielded.

        [1] Technically, there are three data structures. unlinked_nodes and
        linked_nodes contain only indices of introns, and "introns" is
        a dictionary that maps indices to tuples (intron_start, intron_end).

        input_stream: where to find sorted introns of the form specified above.
        readlet_size: maximum readlet length; determines how far new sources
            and sinks are extended beyond original sources and sinks.
        min_overlap_exon_size: minimum number of exonic bases between two
            introns for them to be considered nonoverlapping.

        Yield value: An edge tuple (strand,
                                    sample_index,
                                    (intron A start, intron A end),
                                    (intron B start, intron B end)) or None
                     at the beginning of a new partition.
    """
    global _input_line_count
    for key, xpartition in xstream(input_stream, 2, skip_duplicates=True):
        unlinked_nodes = set()
        for q, value in enumerate(xpartition):
            assert len(value) == 2
            _input_line_count += 1
            if not q:
                # Denote start of new partition
                yield None
                # Handle first intron from partition
                intron_start, intron_end = int(value[0]), int(value[1])
                # Create fake source before first intron
                fake_source = (None, max(intron_start - (readlet_size - 1), 1))
                introns = {0: fake_source, 1: (intron_start, intron_end)}
                linked_nodes = {0: 1}
                unlinked_nodes = set([1])
                index = 2
                # Yield first edge for strand (before first intron)
                yield key + (fake_source, (intron_start, intron_end))
            else:
                # Handle next introns from partition
                intron_start, intron_end = int(value[0]), int(value[1])
                introns[index] = (intron_start, intron_end)
                # Promote childless nodes whose introns end far enough
                # upstream of the new intron: the new intron is a child
                nodes_to_trash = []
                for node in unlinked_nodes:
                    if intron_start >= introns[node][1] + \
                        min_overlap_exon_size:
                        nodes_to_trash.append(node)
                for node in nodes_to_trash:
                    linked_nodes[node] = index
                    unlinked_nodes.remove(node)
                unlinked_nodes.add(index)
                nodes_to_trash = []
                for node in linked_nodes:
                    intermediate_node = linked_nodes[node]
                    if intermediate_node in linked_nodes:
                        # A path node -> intermediate -> successor exists, so
                        # node can never gain another child; retire it
                        nodes_to_trash.append(node)
                    else:
                        yield key + (introns[node], (intron_start, intron_end))
                        if introns[intermediate_node][1] > intron_end:
                            # New intron has the smaller end position; it
                            # becomes node's linked successor
                            linked_nodes[node] = index
                for node in nodes_to_trash:
                    del linked_nodes[node]
                    del introns[node]
                index += 1
        # Yield final edges for strand, connecting remaining childless
        # nodes to their new sinks
        for node in unlinked_nodes:
            current_intron = introns[node]
            yield key + (current_intron,
                         (current_intron[1] + readlet_size - 1, None))
Example #28
0
def input_files_from_input_stream(input_stream,
                                    output_stream,
                                    temp_dir_path=None,
                                    verbose=False,
                                    gzip_level=3):
    """ Generates FASTA reference to index and file with reads.

        For each group in the input stream, assembles (1) a FASTA reference
        from the group's reference lines, deduplicated and wrapped at 80
        characters, and (2) a gzipped read file, then yields the pair of
        paths. Reads with no associated FASTA line are written to
        output_stream as unmapped SAM essentials instead.

        Each line of the read file is in the following format:

        read number <TAB> SEQ <TAB> QUAL

        input_stream: where to find Hadoop input
        output_stream: where to write unmapped reads
        temp_dir_path: where to store files; None creates a fresh
            temporary directory
        verbose: output extra debugging messages
        gzip_level: gzip compression level (0-9)

        Yield value: tuple (path to FASTA reference file, path to read file)
    """
    global _input_line_count
    if temp_dir_path is None: temp_dir_path = tempfile.mkdtemp()
    # Intermediate file paths; overwritten for every input group
    prefasta_filename = os.path.join(temp_dir_path, 'temp.prefa')
    deduped_fasta_filename = os.path.join(temp_dir_path, 'temp.deduped.prefa')
    final_fasta_filename = os.path.join(temp_dir_path, 'temp.fa')
    reads_filename = os.path.join(temp_dir_path, 'reads.temp.gz')
    # NOTE(review): the loop variable 'counter' (group index) shadows any
    # module-level counter object inside this function
    for (counter, ((index_group,), xpartition)) in enumerate(
                                                    xstream(input_stream, 1)
                                                ):
        if verbose:
            print >>sys.stderr, (
                        'Group %d: Writing prefasta and input reads...'
                        % counter
                    )
        with open(prefasta_filename, 'w') as fasta_stream:
            with xopen(True, reads_filename, 'w') as read_stream:
                # Lines within a group are clustered by read sequence
                # (field 0); groupby assumes that clustering
                for read_seq, values in itertools.groupby(xpartition, 
                                                    key=lambda val: val[0]):
                    fasta_printed = False
                    for value in values:
                        _input_line_count += 1
                        if value[1][0] == '0':
                            # Field 1 starting with '0' marks a reference
                            # line; value[1][1:-2] drops that marker and the
                            # last two characters — presumably a name with
                            # trailing decoration (TODO confirm)
                            print >>fasta_stream, '\t'.join([value[1][1:-2],
                                                                value[2]])
                            fasta_printed = True
                        elif fasta_printed:
                            '''Add to temporary seq stream only if an
                            associated FASTA line was found.'''
                            # value[1] == '1' appears to mark forward
                            # orientation; otherwise SEQ is
                            # reverse-complemented and QUAL reversed —
                            # TODO confirm against upstream writer
                            if value[1] == '1':
                                print >>read_stream, '\t'.join([value[2],
                                                                    read_seq,
                                                                    value[3]])
                            else:
                                print >>read_stream, '\t'.join([
                                            value[2],
                                            read_seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    ),
                                            value[3][::-1]])
                        else:
                            # Print unmapped read
                            if value[1] == '1':
                                seq_to_write = read_seq
                                qual_to_write = value[3]
                            else:
                                seq_to_write = read_seq[::-1].translate(
                                        _reversed_complement_translation_table
                                    )
                                qual_to_write = value[3][::-1]
                            '''Write only essentials; handle "formal" writing
                            in next step.'''
                            output_stream.write(
                                        '%s\t4\t\x1c\t\x1c\t\x1c\t\x1c'
                                        '\t\x1c\t\x1c\t\x1c\t%s\t%s\n' % (
                                                                value[2],
                                                                seq_to_write,
                                                                qual_to_write
                                                            )
                                    )
        if verbose:
            print >>sys.stderr, (
                        'Group %d: Done! Sorting and deduplicating prefasta...'
                        % counter
                    )
        # Sort prefasta and eliminate duplicate lines; the shell pipeline
        # only interpolates temp paths generated above
        dedup_process_return = subprocess.call(
                r'''sort %s | uniq >%s'''
                % (prefasta_filename, deduped_fasta_filename), shell=True,
                executable='/bin/bash'
            )
        if dedup_process_return != 0:
            raise RuntimeError(
                    'Problem encountered deduplicating FASTA reference'
                )
        if verbose:
            print >>sys.stderr, (
                    'Group %d Done! Writing final FASTA.' % counter
                )
        with open(final_fasta_filename, 'w') as final_fasta_stream:
            with open(deduped_fasta_filename) as fasta_stream:
                for line in fasta_stream:
                    rname, seq = line.strip().split('\t')
                    print >>final_fasta_stream, rname
                    # Wrap sequence at 80 characters per FASTA convention
                    final_fasta_stream.write(
                        '\n'.join([seq[i:i+80] for i 
                                    in xrange(0, len(seq), 80)])
                    )
                    final_fasta_stream.write('\n')
        os.remove(deduped_fasta_filename)
        os.remove(prefasta_filename)
        yield final_fasta_filename, reads_filename
Example #29
0
# Handles binning and printing of alignment records for downstream steps
alignment_printer = AlignmentPrinter(
                                manifest_object,
                                reference_index,
                                output_stream=sys.stdout,
                                bin_size=args.partition_length,
                                exon_ivals=args.exon_intervals,
                                exon_diffs=args.exon_differentials,
                                drop_deletions=args.drop_deletions,
                                output_bam_by_chr=args.output_bam_by_chr,
                                tie_margin=args.tie_margin
                            )
input_line_count, output_line_count = 0, 0
start_time = time.time()  # for reporting elapsed wall-clock time later

for (qname,), xpartition in xstream(sys.stdin, 1):
    alignments = [(qname,) + alignment for alignment in xpartition]
    input_line_count += len(alignments)
    junction_counts = [alignment[5].count('N') for alignment in alignments]
    min_junction_count = min(junction_counts)
    if not min_junction_count:
        '''There is at least one alignment that overlaps no junctions; report 
        an alignment with the highest score at random. Separate into alignments
        that overlap the fewest junctions and alignments that don't.'''
        clipped_alignments = [alignments[i] for i in xrange(
                                                        len(junction_counts)
                                            ) if junction_counts[i] == 0]
        alignments_and_scores = [(alignment, [int(tokens[5:])
                                                for tokens in alignment
                                                if tokens[:5] == 'AS:i:'][0])
                                    for alignment in clipped_alignments]
Example #30
0
# To convert sample-rname index to sample index-rname index tuple
sample_and_rname_indexes = SampleAndRnameIndexes(
                                                    manifest_object,
                                                    args.output_by_chromosome
                                                )

import time
start_time = time.time()  # for reporting elapsed wall-clock time later
from alignment_handlers import AlignmentPrinter
# Used below only for its unique() test on alignment records
alignment_printer = AlignmentPrinter(manifest_object, reference_index,
                                        tie_margin=args.tie_margin)
input_line_count = 0
if args.suppress_bam:
    # Just grab stats
    if args.output_by_chromosome:
        for (index, _), xpartition in xstream(sys.stdin, 2):
            sample_index, rname_index = (
                    sample_and_rname_indexes.sample_and_rname_indexes(index)
                )
            unique_count, total_count = 0, 0
            for record in xpartition:
                if not (int(record[2]) & 256):
                    total_count += 1
                    try:
                        # seq is at position 8
                        if alignment_printer.unique(record, seq_index=8):
                            unique_count += 1
                    except IndexError:
                        # Unmapped read; it's unique
                        unique_count += 1
                input_line_count += 1
Example #31
0
def go(true_bed_stream,
       sam_stream=sys.stdin,
       generous=False,
       base_threshold=0.5,
       clip_threshold=1.0,
       dump_incorrect=False,
       temp_dir=None,
       ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        temp_dir: directory in which to create the temporary working
            directory; None (or an unusable directory) falls back to the
            system default
        ignore_spliced_reads: ignores all spliced reads

        Return value: tuple (basewise_retrieved, basewise_relevant,
            basewise_intersection, read_retrieved, read_relevant,
            read_intersection)
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        except OSError:
            # Requested directory unusable; fall back to system default
            # (was a bare except)
            temp_dir_path = tempfile.mkdtemp()
    #print >>sys.stderr, temp_dir_path
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort it on read name; true (BED)
    # records are tagged '0' and aligner (SAM) records '1' so truth sorts
    # before retrievals within a read name
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    with open(combined_file, 'w') as temp_stream:
        if ignore_spliced_reads:
            if generous:
                # Strip /1 or /2 suffix from read names
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue  # skip intron line
                    print >> temp_stream, '\t'.join([tokens[3][:-2], '0'] +
                                                    tokens[:3] + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue  # skip intron line
                    print >> temp_stream, '\t'.join([tokens[3], '0'] +
                                                    tokens[:3] + tokens[4:])
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                if 'N' in tokens[5]: continue  # skip intron line
                print >> temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
        else:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >> temp_stream, '\t'.join([tokens[3][:-2], '0'] +
                                                    tokens[:3] + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >> temp_stream, '\t'.join([tokens[3], '0'] +
                                                    tokens[:3] + tokens[4:])
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                print >> temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
    subprocess.check_call(' '.join([
        'sort -T %s -k1,1 -k2,2n' % temp_dir_path, combined_file, '>',
        sorted_combined_file
    ]),
                          bufsize=-1,
                          shell=True)
    basewise_relevant, read_relevant = 0, 0
    # Initialize counters for computing accuracy metrics
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        for (name, ), xpartition in xstream(sorted_combined_stream, 1):
            '''List of true alignments for this read name, each a list of
            exon tuples (chrom, 1-based start, 1-based end)'''
            true_maps = []
            saved = []
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    # True (BED) record
                    if len(tokens) < 12:
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
                    exons = [(chrom, chrom_start + int(block_starts[i]),
                              chrom_start + int(block_starts[i]) +
                              int(block_sizes[i]))
                             for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum(
                        [int(block_size) for block_size in block_sizes])
                    read_relevant += 1
                elif tokens[0] == '1':
                    # Retrieved (SAM) record
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        '''Secondary alignment or unmapped and thus not
                        retrieved; ignore'''
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
                    (dummy_md, mapped,
                        unmapped, clip_count, read_length) \
                        = dummy_md_and_mapped_offsets(
                                            cigar,
                                            clip_threshold=clip_threshold
                                        )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            '''chr is wrong, but this is still counted as a
                            retrieval above'''
                            continue
                        base_counter, base_truths = 0, set()
                        '''Each tuple in base_truths is
                        (index of base in read, mapped location)'''
                        for block in true_map:
                            base_truths.update([(base_counter + i, j + 1)
                                                for i, j in enumerate(
                                                    xrange(block[1], block[2]))
                                                ])
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        if unmapped:
                            # Too much clipping
                            continue
                        _, _, _, exons, _ = indels_junctions_exons_mismatches(
                            cigar, dummy_md, pos, seq, drop_deletions=True)
                        mapped_index = 0
                        for exon in exons:
                            base_predictions.update([
                                (mapped[mapped_index + i], j)
                                for i, j in enumerate(xrange(exon[0], exon[1]))
                            ])
                            mapped_index += exon[1] - exon[0]
                        intersected_base_count = max(
                            intersected_base_count,
                            len(base_predictions.intersection(base_truths)))
                    basewise_intersection += intersected_base_count
                    if intersected_base_count >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        # NOTE(review): fields within each saved record are
                        # joined with '.', records with tabs — confirm this
                        # compact dump format is intended
                        print >> sys.stderr, '\t'.join(
                            ['.'.join(line) for line in saved])
                else:
                    raise RuntimeError('Invalid intermediate line.')
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)
Example #32
0
    else Url(os.getcwd())
input_line_count = 0
counter = Counter('bed')  # step-level event counter
register_cleanup(counter.flush)  # flush counts on exit

if output_url.is_local:
    # Set up destination directory; bare except tolerates its already
    # existing (NOTE(review): it also hides other makedirs failures)
    try: os.makedirs(output_url.to_url())
    except: pass
else:
    mover = filemover.FileMover(args=args)
    # Set up temporary destination; it is uploaded later and removed
    # when the process exits
    import tempfile
    temp_dir_path = make_temp_dir(tempdel.silentexpandvars(args.scratch))
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir_path])
for (line_type, sample_label), xpartition in xstream(sys.stdin, 2):
    assert line_type in 'NID'
    sample_label = manifest_object.index_to_label[sample_label]
    type_string = ('insertions' if line_type == 'I' else
                    ('deletions' if line_type == 'D' else 'junctions'))
    output_filename = ((args.bed_basename + '.' 
                          if args.bed_basename != '' else '')
                          + type_string + '.' + sample_label + '.bed')
    if output_url.is_local:
        output_path = os.path.join(args.out, output_filename)
    else:
        output_path = os.path.join(temp_dir_path, output_filename)
    with open(output_path, 'w') as output_stream:
        print >>output_stream, 'track name="%s_%s" description="' \
                                   'Rail-RNA v%s %s for sample %s"' \
                                                      % (sample_label,
Example #33
0
# Write directly to the output directory if local; otherwise stage the
# file in the temporary directory for later upload
if output_url.is_local:
    output_path = os.path.join(args.out, output_filename)
else:
    output_path = os.path.join(temp_dir_path, output_filename)

input_line_count = 0
# Get RNAMEs in order of descending length
# (string_to_rname maps zero-padded index strings to reference names)
sorted_rnames = [
    reference_index.string_to_rname['%012d' % i]
    for i in xrange(len(reference_index.string_to_rname))
]
sample_indexes_seen = set()
with xopen(True, output_path, 'w', args.gzip_level) as output_stream:
    print >> output_stream, '\t'.join([''] + sorted_rnames +
                                      ['total mapped reads', 'total reads'])
    for (_, sample_index), xpartition in xstream(sys.stdin, 2):
        sample_label = manifest_object.index_to_label[sample_index]
        total_counts, unique_counts = defaultdict(int), defaultdict(int)
        for rname_index, total_count, unique_count in xpartition:
            rname = reference_index.string_to_rname[rname_index]
            total_counts[rname] = int(total_count)
            unique_counts[rname] = int(unique_count)
        total_reads = sum(total_counts.values())
        total_mapped_reads = total_reads - total_counts['*']
        total_uniques = sum(unique_counts.values())
        total_mapped_uniques = total_uniques - unique_counts['*']
        print >> output_stream, '\t'.join([sample_label] + [
            '%d,%d' % (total_counts[rname], unique_counts[rname])
            for rname in sorted_rnames
        ] + [
            '%d,%d' % (total_mapped_reads, total_mapped_uniques),
Example #34
0
def go(true_bed_stream, sam_stream=sys.stdin, generous=False,
        base_threshold=0.5, clip_threshold=1.0, dump_incorrect=False,
        temp_dir=None, ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        ignore_spliced_reads: ignores all spliced reads

        Return value: tuple (basewise_retrieved, basewise_relevant,
            basewise_intersection, read_retrieved, read_relevant,
            read_intersection), where "relevant" counts come from the true
            BED records, "retrieved" counts from the aligner's primary
            alignments, and "intersection" counts from their agreement.
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        except:
            # NOTE(review): bare except — if the requested temp_dir is
            # unusable for any reason, fall back to the system default
            # temp location rather than failing.
            temp_dir_path = tempfile.mkdtemp()
    #print >>sys.stderr, temp_dir_path
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort it on read name
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    # Each combined line is: read name, then '0' (true BED record) or '1'
    # (aligner SAM record), then the record's remaining fields.
    with open(combined_file, 'w') as temp_stream:
        if ignore_spliced_reads:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip intron line
                    # Generous mode: strip trailing /1 or /2 so names
                    # match the aligner's
                    print >>temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3] + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip intron line
                    print >>temp_stream, '\t'.join(
                                    [tokens[3], '0'] + tokens[:3] + tokens[4:]
                                )
            for line in sam_stream:
                # Skip headers and blank lines
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                # An 'N' in CIGAR marks a spliced alignment
                if 'N' in tokens[5]: continue # skip intron line
                print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
        else:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3] + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join(
                                    [tokens[3], '0'] + tokens[:3] + tokens[4:]
                                )
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
    # Sort on read name, then record type; '0' < '1', so each read's true
    # records precede its aligner records within a partition below.
    # NOTE(review): command is built by string join and run with
    # shell=True — a temp dir path containing spaces or shell
    # metacharacters would break this.
    subprocess.check_call(' '.join(['sort -T %s -k1,1 -k2,2n'
                                        % temp_dir_path, combined_file, 
                                        '>', sorted_combined_file]),
                            bufsize=-1, shell=True)
    basewise_relevant, read_relevant = 0, 0
    # Initialize counters for computing accuracy metrics
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        # One partition per read name
        for (name,), xpartition in xstream(sorted_combined_stream, 1):
            '''Dict mapping read names to alignments
            (chrom, 1-based start, 1-based end)'''
            true_maps = []
            saved = []
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    # True (simulated) BED record
                    if len(tokens) < 12:
                        # No block fields present; nothing to record
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    chrom_end = int(tokens[3])
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
                    # Exons as (chrom, start, exclusive end), offsets
                    # relative to chrom_start per the BED block fields
                    exons = [(chrom,
                                chrom_start + int(block_starts[i]),
                                chrom_start + int(block_starts[i])
                                + int(block_sizes[i]))
                                for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum([int(block_size) for block_size
                                                in block_sizes])
                    read_relevant += 1
                elif tokens[0] == '1':
                    # Aligner SAM record
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        '''Secondary alignment or unmapped and thus not
                        retrieved; ignore'''
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
                    (dummy_md, mapped,
                        unmapped, clip_count, read_length) \
                        = dummy_md_and_mapped_offsets(
                                            cigar,
                                            clip_threshold=clip_threshold
                                        )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            '''chr is wrong, but this is still counted as a
                            retrieval above'''
                            continue
                        base_counter, base_truths = 0, set()
                        '''Each tuple in base_truths is
                        (index of base in read, mapped location)'''
                        for block in true_map:
                            base_truths.update([(base_counter + i, j + 1)
                                                    for i, j in enumerate(
                                                        xrange(
                                                            block[1], block[2]
                                                        ))])
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        if unmapped:
                            # Too much clipping
                            continue
                        _, _, _, exons = indels_introns_and_exons(
                                                        cigar,
                                                        dummy_md, pos, seq,
                                                        drop_deletions=True
                                                    )
                        mapped_index = 0
                        # Predicted (read index, mapped location) pairs
                        # from the alignment's exonic chunks
                        for exon in exons:
                            base_predictions.update(
                                        [(mapped[mapped_index + i], j)
                                                  for i, j in enumerate(
                                                    xrange(
                                                        exon[0], exon[1]
                                                    ))])
                            mapped_index += exon[1] - exon[0]
                        intersected_base_count = max(intersected_base_count,
                                len(
                                    base_predictions.intersection(base_truths)
                                ))
                    basewise_intersection += intersected_base_count
                    if intersected_base_count >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        print >>sys.stderr, '\t'.join(
                                ['.'.join(line) for line in saved]
                            )
                else:
                    raise RuntimeError(
                                'Invalid intermediate line.'
                            )
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)
Example #35
0
def go(manifest_object,
       input_stream=sys.stdin,
       output_stream=sys.stdout,
       sample_fraction=0.05,
       coverage_threshold=5,
       verbose=False):
    """ Runs Rail-RNA-bed_pre

        Writes indels and junctions for outputting BEDs by sample and
        TSVs across samples.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns:
        1. 'I', 'D', or 'N' for insertion, deletion, or junction line
        2. Number string representing RNAME
        3. Start position (Last base before insertion, first base of deletion,
                            or first base of intron)
        4. End position (Last base before insertion, last base of deletion
                            (exclusive), or last base of intron (exclusive))
        5. '+' or '-' indicating which strand is the sense strand for
            junctions, inserted sequence for insertions, or deleted sequence
            for deletions
        6. Sample index
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. Number of nucleotides between 5' end of intron and 5' end of read
            from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND. That is, if the sense strand is the reverse strand,
            this is the distance between the 3' end of the read and the 3' end
            of the intron.
        8. Number of nucleotides between 3' end of intron and 3' end of read
            from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND.
        --------------------------------------------------------------------
        9. Number of instances of junction, insertion, or deletion in sample;
            this is always +1 before bed_pre combiner/reducer

        Input is partitioned by fields 1-5 and sorted by field 6.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited output tuple columns (bed):
        1. 'I', 'D', or 'N' for insertion, deletion, or junction line
        2. Sample index
        3. Number string representing RNAME (+ '+ or -' if junction; same as
            field 6)
        4. Start position (Last base before insertion, first base of deletion,
                            or first base of intron)
        5. End position (Last base before insertion, last base of deletion
                            (exclusive), or last base of intron (exclusive))
        6. '+' or '-' indicating which strand is the sense strand for
            junctions, inserted sequence for insertions, or deleted sequence
            for deletions
        ----Next fields are for junctions only; they are '\x1c' for indels----
        7. MAX number of nucleotides between 5' end of intron and 5' end of
            read from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND. That is, if the sense strand is the reverse strand,
            this is the distance between the 3' end of the read and the 3' end
            of the intron.
        8. MAX number of nucleotides between 3' end of intron and 3' end of
            read from which it was inferred, ASSUMING THE SENSE STRAND IS THE
            FORWARD STRAND.
        9. MAXIMIN (number of nucleotides between 5' end of intron and 5' end
                    of read, number of nucleotides between 3' end of intron and
                    3' end of read);
           min is between the args above; max is across reads.

        Tab-delimited output tuple columns (collect)
        1. '0' if insertion, '1' if deletion, or '2' if junction line
        2. Number string representing RNAME (+ '+ or -' if junction; same as
                                                field 6)
        3. Start position (Last base before insertion, first base of deletion,
                            or first base of intron)
        4. End position (Last base before insertion, last base of deletion
                            (exclusive), or last base of intron (exclusive))
        5. '+' or '-' indicating which strand is the sense strand for
            junctions, inserted sequence for insertions, or deleted sequence
            for deletions
        6. Coverage of feature for sample with index N
        ...
        N + 6. Coverage of feature in sample with index N
        --------------------------------------------------------------------
        10. SUMMED number of instances of junction, insertion, or deletion in
            sample

        OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input indels/junctions
        output_stream: where to write output
        manifest_object: object of class LabelsAndIndices that maps indices
            to labels and back; used to count number of samples.
        sample_fraction: fraction of samples in which an indel must appear
            to pass filter if coverage_threshold criterion is not satisfied
        coverage_threshold: number of reads that must overlap indel in at
            least one sample to pass filter if sample_fraction criterion is not
            satisfied
        verbose: output extra debugging statements

        Return value: tuple (input line count, output line count)
    """
    input_line_count, output_line_count = 0, 0
    '''Compute minimum number of samples in which indel should appear to be
    output if coverage threshold not met.'''
    total_sample_count = len(manifest_object.label_to_index)
    min_sample_count = int(round(total_sample_count * sample_fraction))

    # Input is partitioned on the first five fields and sorted by sample
    # index within a partition, so groupby on the sample index below sees
    # each sample's lines contiguously.
    for (line_type, rname, pos, end_pos,
         strand_or_seq), xpartition in xstream(input_stream, 5):
        # Junction end positions are exclusive in the input; report the
        # last base of the intron (end_pos - 1) in collect output.
        collect_specs = [
            rname, pos, end_pos if line_type != 'N' else str(int(end_pos) - 1),
            strand_or_seq
        ]
        sample_indexes, coverages = [], []
        if line_type == 'N':
            # Junction line
            counter.add('junction_line')
            for sample_index, data in itertools.groupby(
                    xpartition, key=lambda val: val[0]):
                coverage_sum = 0
                max_left_displacement, max_right_displacement = None, None
                maximin_displacement = None
                for _, left_displacement, right_displacement, coverage in data:
                    input_line_count += 1
                    left_displacement = int(left_displacement)
                    right_displacement = int(right_displacement)
                    # Python 2: any int compares greater than None, so
                    # None serves as a -infinity seed for these maxima
                    max_left_displacement = max(left_displacement,
                                                max_left_displacement)
                    max_right_displacement = max(right_displacement,
                                                 max_right_displacement)
                    maximin_displacement = max(
                        min(left_displacement, right_displacement),
                        maximin_displacement)
                    coverage_sum += int(coverage)
                assert max_left_displacement is not None
                assert max_right_displacement is not None
                assert maximin_displacement is not None
                counter.add('bed_line')
                print >>output_stream, \
                    'bed\tN\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d' % (
                        sample_index, rname, pos, end_pos, strand_or_seq,
                        max_left_displacement, max_right_displacement,
                        maximin_displacement, coverage_sum
                    )
                sample_indexes.append(sample_index)
                coverages.append(coverage_sum)
                output_line_count += 1
            # Junctions always pass filter: emit one collect line listing
            # per-sample coverages
            counter.add('collect_line')
            output_stream.write('collect\t2\t')
            print >> output_stream, '\t'.join(
                collect_specs +
                [','.join(sample_indexes), ','.join(map(str, coverages))])
            output_line_count += 1
        else:
            counter.add('insertion_line' if line_type ==
                        'I' else 'deletion_line')
            assert line_type in 'ID'
            sample_count = 0
            for sample_index, data in itertools.groupby(
                    xpartition, key=lambda val: val[0]):
                coverage_sum = 0
                for _, _, _, coverage in data:
                    input_line_count += 1
                    coverage_sum += int(coverage)
                counter.add('bed_line')
                print >>output_stream, \
                    'bed\t%s\t%s\t%s\t%s\t%s\t%s\t\x1c\t\x1c\t\x1c\t%d' % (
                        line_type, sample_index, rname, pos, end_pos,
                        strand_or_seq, coverage_sum
                    )
                sample_indexes.append(sample_index)
                coverages.append(coverage_sum)
                sample_count += 1
                output_line_count += 1
            max_coverage = max(coverages)
            # Indels pass filter when they appear in enough samples OR are
            # covered deeply enough in some sample; a coverage_threshold
            # of -1 disables the coverage criterion
            if (sample_count >= min_sample_count
                    or (max_coverage >= coverage_threshold
                        and coverage_threshold != -1)):
                counter.add('collect_line')
                if line_type == 'I':
                    output_stream.write('collect\t0\t')
                else:
                    output_stream.write('collect\t1\t')
                print >>output_stream, \
                    '\t'.join(
                        collect_specs
                        + [','.join(sample_indexes),
                           ','.join(map(str, coverages))]
                    )
                output_line_count += 1
            else:
                counter.add('indel_filtered_out')
                if verbose:
                    print >> sys.stderr, (
                        'Indel (%s, %s, %s, %s) filtered out; it appeared in '
                        '%d sample(s), and its coverage in any one sample did '
                        'not exceed %d.') % (rname, strand_or_seq, pos,
                                             end_pos, sample_count,
                                             max_coverage)
    counter.flush()
    return input_line_count, output_line_count
Example #36
0
from dooplicity.tools import xstream

# Display this file's docstring when -h/--help is requested
parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
bowtie.add_args(parser)
args = parser.parse_args()
input_line_count, output_line_count = 0, 0

start_time = time.time()

reference_index = bowtie_index.BowtieIndexReference(
                        os.path.expandvars(args.bowtie_idx)
                    )
# Stream stdin in partitions keyed on the first six fields; accumulate
# junction instance counts per partition and tag each alignment line
# with the running coverage in an XC:i field.
for (_, rname_string, intron_pos, intron_end_pos,
        sense, sample_index), xpartition in xstream(sys.stdin, 6):
    coverage = 0
    for tokens in xpartition:
        input_line_count += 1
        try:
            # Junction lines carry exactly three fields
            _, _, instance_count = tokens
        except ValueError:
            # Alignment line: restore RNAME and POS, append coverage tag
            fields = (tokens[2], tokens[3],
                      reference_index.string_to_rname[rname_string],
                      str(int(tokens[1]))) + tokens[4:]
            print ('\t'.join(fields) + ('\tXC:i:%d' % coverage))
            output_line_count += 1
        else:
            coverage += int(instance_count)
Example #37
0
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       bowtie2_exe='bowtie2',
       bowtie_index_base='genome',
       bowtie2_index_base='genome2',
       manifest_file='manifest',
       bowtie2_args=None,
       bin_size=10000,
       verbose=False,
       exon_differentials=True,
       exon_intervals=False,
       report_multiplier=1.2,
       min_exon_size=8,
       search_filter=1,
       min_readlet_size=15,
       max_readlet_size=25,
       readlet_interval=12,
       capping_multiplier=1.5,
       drop_deletions=False,
       gzip_level=3,
       scratch=None,
       index_count=1,
       output_bam_by_chr=False,
       tie_margin=0,
       no_realign=False,
       no_polyA=False):
    """ Runs Rail-RNA-align_reads.

        A single pass of Bowtie is run to find end-to-end alignments. Unmapped
        reads are saved for readletizing to determine introns in sucessive
        reduce steps as well as for realignment in a later map step.

        Input (read from stdin)
        ----------------------------
        Tab-delimited input tuple columns in a mix of any of the following
        three formats:
        Format 1 (single-end, 3-column):
          1. Nucleotide sequence or its reversed complement, whichever is first
            in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name
          4. Quality sequence or its reverse, whichever corresponds to field 1

        Format 2 (paired, 2 lines, 3 columns each)
        (so this is the same as single-end)
          1. Nucleotide sequence for mate 1 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse-complemented else 0
          3. Name for mate 1
          4. Quality sequence for mate 1 or its reverse, whichever corresponds
            to field 1
            
            (new line)

          1. Nucleotide sequence for mate 2 or its reversed complement,
            whichever is first in alphabetical order
          2. 1 if sequence was reverse complemented else 0
          3. Name for mate 2
          4. Quality sequence for mate 2 or its reverse, whichever corresponds
            to field 1

        Input is partitioned and sorted by field 1, the read sequence.

        Hadoop output (written to stdout)
        ----------------------------
        A given RNAME sequence is partitioned into intervals ("bins") of some 
        user-specified length (see partition.py).

        Exonic chunks (aka ECs; three formats, any or all of which may be
        emitted):

        Format 1 (exon_ival); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. Sample index
        3. EC start (inclusive) on forward strand
        4. EC end (exclusive) on forward strand

        Format 2 (exon_diff); tab-delimited output tuple columns:
        1. Reference name (RNAME in SAM format) + ';' + bin number
        2. max(EC start, bin start) (inclusive) on forward strand IFF diff is
            positive and EC end (exclusive) on forward strand IFF diff is
            negative
        3. Sample index
        4. '1' if alignment from which diff originates is "unique" according to
            --tie-margin criterion; else '0'
        5. +1 or -1 * count, the number of instances of a read sequence for
            which to print exonic chunks

        Note that only unique alignments are currently output as ivals and/or
        diffs.

        Format 3 (sam); tab-delimited output tuple columns:
        Standard SAM output except fields are in different order, and the first
        field corresponds to sample label. (Fields are reordered to facilitate
        partitioning by sample name/RNAME and sorting by POS.) Each line
        corresponds to a spliced alignment. The order of the fields is as
        follows.
        1. Sample index if outputting BAMs by sample OR
                sample-rname index if outputting BAMs by chr
        2. (Number string representing RNAME; see BowtieIndexReference
            class in bowtie_index for conversion information) OR
            '0' if outputting BAMs by chr
        3. POS
        4. QNAME
        5. FLAG
        6. MAPQ
        7. CIGAR
        8. RNEXT
        9. PNEXT
        10. TLEN
        11. SEQ
        12. QUAL
        ... + optional fields

        Insertions/deletions (indel_bed)

        tab-delimited output tuple columns:
        1. 'I' or 'D' insertion or deletion line
        2. Number string representing RNAME
        3. Start position (Last base before insertion or 
            first base of deletion)
        4. End position (Last base before insertion or last base of deletion 
                            (exclusive))
        5. Inserted sequence for insertions or deleted sequence for deletions
        6. Sample index
        ----Next fields are for introns only; they are '\x1c' for indels----
        7. '\x1c'
        8. '\x1c'
        --------------------------------------------------------------------
        9. Number of instances of insertion or deletion in sample; this is
            always +1 * count before bed_pre combiner/reducer

        Read whose primary alignment is not end-to-end

        Tab-delimited output tuple columns (unmapped):
        1. Transcriptome Bowtie 2 index group number
        2. SEQ
        3. 1 if SEQ is reverse-complemented, else 0
        4. QNAME
        5. QUAL

        Tab-delimited output tuple columns (readletized):
        1. Readlet sequence or its reversed complement, whichever is first in
            alphabetical order
        2. read sequence ID + ('-' if readlet
            sequence is reverse-complemented; else '+') + '\x1e' + displacement
            of readlet's 5' end from read's 5' end + '\x1e' + displacement of
            readlet's 3' end from read's 3' end (+, for EXACTLY one readlet of
            a read sequence, '\x1e' + read sequence + '\x1e' +
            (an '\x1f'-separated list A of unique sample labels with read
            sequences that match the original read sequence) + '\x1e' +
            (an '\x1f'-separated list  of unique sample labels B with read
            sequences that match the reversed complement of the original read
            sequence)) + '\x1e' + (an '\x1f'-separated list of the number of
            instances of the read sequence for each respective sample in list
            A) + '\x1e' + (an '\x1f'-separated list of the number of instances
            of the read sequence's reversed complement for each respective
            sample in list B). Here, a read sequence ID takes the form X:Y,
            where X is the "mapred_task_partition" environment variable -- a
            unique index for a task within a job -- and Y is the index of the
            read sequence relative to the beginning of the input stream.

        Tab-delimited tuple columns (postponed_sam):
        Standard 11+ -column raw SAM output

        Single column (unique):
        1. A unique read sequence

        Two columns, exactly one line (dummy); ensures creation of intron
            index:
        1. character "-"
        2. the word "dummy"

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input reads.
        output_stream: where to emit exonic chunks and introns.
        bowtie2_exe: filename of Bowtie2 executable; include path if not in
            $PATH.
        bowtie_index_base: the basename of the Bowtie1 index files associated
            with the reference.
        bowtie2_index_base: the basename of the Bowtie2 index files associated
            with the reference.
        manifest_file: filename of manifest
        bowtie2_args: string containing precisely extra command-line arguments
            to pass to first-pass Bowtie2.
        bin_size: genome is partitioned in units of bin_size for later load
            balancing.
        verbose: True iff more informative messages should be written to
            stderr.
        exon_differentials: True iff EC differentials are to be emitted.
        exon_intervals: True iff EC intervals are to be emitted.
        report_multiplier: if verbose is True, the line number of an alignment
            or read written to stderr increases exponentially with base
            report_multiplier.
        min_exon_size: minimum exon size searched for in intron_search.py later
            in pipeline; used to determine how large a soft clip on one side of
            a read is necessary to pass it on to intron search pipeline
        search_filter: how large a soft clip on one side of a read is necessary
            to pass it on to intron search pipeline
        min_readlet_size: "capping" readlets (that is, readlets that terminate
            at a given end of the read) are never smaller than this value
        max_readlet_size: size of every noncapping readlet
        readlet_interval: number of bases separating successive readlets along
            the read
        capping_multiplier: successive capping readlets on a given end of a
            read are increased in size exponentially with base
            capping_multiplier
        drop_deletions: True iff deletions should be dropped from coverage
            vector
        gzip_level: compression level to use for temporary files
        scratch: scratch directory for storing temporary files or None if 
            securely created temporary directory
        index_count: number of transcriptome Bowtie 2 indexes to which to
            assign unmapped reads for later realignment
        output_bam_by_chr: True iff final output BAMs will be by chromosome
        tie_margin: allowed score difference per 100 bases among ties in
            max score. For example, 150 and 144 are tied alignment scores
            for a 100-bp read when --tie-margin is 6.
        no_realign: True iff job flow does not need more than readlets: this
            usually means only a transcript index is being constructed
        no_polyA: kill noncapping readlets that are all As and write as
            unmapped all reads with polyA prefixes whose suffixes are <
            min_exon_size

        No return value.
    """
    global _input_line_count
    # Required length of prefix after poly(A) is trimmed
    remaining_seq_size = max(min_exon_size - 1, 1)
    polyA_set = frozenset(['A' * i
                           for i in xrange(1, remaining_seq_size + 1)] +
                          ['T' * i
                           for i in xrange(1, remaining_seq_size + 1)] + [''])
    reference_index = bowtie_index.BowtieIndexReference(bowtie_index_base)
    manifest_object = manifest.LabelsAndIndices(manifest_file)
    alignment_printer = AlignmentPrinter(manifest_object,
                                         reference_index,
                                         bin_size=bin_size,
                                         output_stream=output_stream,
                                         exon_ivals=exon_intervals,
                                         exon_diffs=exon_differentials,
                                         drop_deletions=drop_deletions,
                                         output_bam_by_chr=output_bam_by_chr,
                                         tie_margin=tie_margin)
    # Get task partition to pass to align_reads_delegate.py
    try:
        task_partition = os.environ['mapred_task_partition']
    except KeyError:
        # Hadoop 2.x?
        try:
            task_partition = os.environ['mapreduce_task_partition']
        except KeyError:
            # A unit test is probably being run
            task_partition = '0'
    temp_dir = make_temp_dir(scratch)
    register_cleanup(tempdel.remove_temporary_directories, [temp_dir])
    align_file = os.path.join(temp_dir, 'first_pass_reads.temp.gz')
    other_reads_file = os.path.join(temp_dir, 'other_reads.temp.gz')
    second_pass_file = os.path.join(temp_dir, 'second_pass_reads.temp.gz')
    k_value, _, _ = bowtie.parsed_bowtie_args(bowtie2_args)
    nothing_doing = True
    with xopen(True, align_file, 'w', gzip_level) as align_stream, \
        xopen(True, other_reads_file, 'w', gzip_level) as other_stream:
        for seq_number, ((seq, ),
                         xpartition) in enumerate(xstream(sys.stdin, 1)):
            if no_polyA and (seq[:-remaining_seq_size] in polyA_set
                             or seq[remaining_seq_size:] in polyA_set):
                if not no_realign:
                    '''If a sequence is too short without its poly(A) tail,
                    make all reads with that sequence unmapped. Technically,
                    this also kills poly(A)s at 5' ends, but we probably
                    couldn't align those sequences anyway.'''
                    reversed_complement_seq = seq[::-1].translate(
                        _reversed_complement_translation_table)
                    for is_reversed, name, qual in xpartition:
                        if is_reversed == '0':
                            alignment_printer.print_unmapped_read(
                                name, seq, qual)
                        else:
                            alignment_printer.print_unmapped_read(
                                name, reversed_complement_seq, qual[::-1])
                continue
            nothing_doing = False
            '''Select highest-quality read with alphabetically last qname
            for first-pass alignment.'''
            best_name, best_mean_qual, best_qual_index, i = None, None, 0, 0
            others_to_print = dlist()
            for is_reversed, name, qual in xpartition:
                _input_line_count += 1
                others_to_print.append('\t'.join(
                    [str(seq_number), is_reversed, name, qual]))
                mean_qual = (float(sum([ord(score)
                                        for score in qual])) / len(qual))
                if (mean_qual > best_mean_qual
                        or mean_qual == best_mean_qual and name > best_name):
                    best_qual_index = i
                    best_mean_qual = mean_qual
                    best_name = name
                    to_align = '\t'.join(
                        ['%s\x1d%s' % (is_reversed, name), seq, qual])
                i += 1
            assert i >= 1
            if i == 1:
                print >> other_stream, str(seq_number)
            else:
                for j, other_to_print in enumerate(others_to_print):
                    if j != best_qual_index:
                        print >> other_stream, other_to_print
            print >> align_stream, to_align
    # Print dummy line
    print 'dummy\t-\tdummy'
    sys.stdout.flush(
    )  # this is REALLY important b/c called script will stdout
    if nothing_doing:
        # No input
        sys.exit(0)
    input_command = 'gzip -cd %s' % align_file
    bowtie_command = ' '.join([
        bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
        ' --sam-no-qname-trunc --local -t --no-hd --mm -x', bowtie2_index_base,
        '--12 -'
    ])
    delegate_command = ''.join([
        sys.executable, ' ',
        os.path.realpath(__file__)[:-3],
        ('_delegate.py --task-partition {task_partition} '
         '--other-reads {other_reads} --second-pass-reads '
         '{second_pass_reads} --min-readlet-size '
         '{min_readlet_size} {drop_deletions} '
         '--max-readlet-size {max_readlet_size} '
         '--readlet-interval {readlet_interval} '
         '--capping-multiplier {capping_multiplier:1.12f} '
         '{verbose} --report-multiplier {report_multiplier:1.12f} '
         '--k-value {k_value} '
         '--bowtie-idx {bowtie_index_base} '
         '--partition-length {bin_size} '
         '--manifest {manifest_file} '
         '{exon_differentials} {exon_intervals} '
         '--gzip-level {gzip_level} '
         '--search-filter {search_filter} '
         '--index-count {index_count} '
         '--tie-margin {tie_margin} '
         '{no_realign} '
         '{no_polyA} '
         '{output_bam_by_chr}').format(
             task_partition=task_partition,
             other_reads=other_reads_file,
             second_pass_reads=second_pass_file,
             min_readlet_size=min_readlet_size,
             drop_deletions=('--drop-deletions' if drop_deletions else ''),
             max_readlet_size=max_readlet_size,
             readlet_interval=readlet_interval,
             capping_multiplier=capping_multiplier,
             verbose=('--verbose' if verbose else ''),
             report_multiplier=report_multiplier,
             k_value=k_value,
             bowtie_index_base=bowtie_index_base,
             bin_size=bin_size,
             manifest_file=manifest_file,
             exon_differentials=('--exon-differentials'
                                 if exon_differentials else ''),
             exon_intervals=('--exon-intervals' if exon_intervals else ''),
             gzip_level=gzip_level,
             search_filter=search_filter,
             index_count=index_count,
             tie_margin=tie_margin,
             no_realign=('--no-realign' if no_realign else ''),
             no_polyA=('--no-polyA' if no_polyA else ''),
             output_bam_by_chr=('--output-bam-by-chr'
                                if output_bam_by_chr else ''))
    ])
    full_command = ' | '.join(
        [input_command, bowtie_command, delegate_command])
    print >>sys.stderr, \
        'Starting first-pass Bowtie 2 with command: ' + full_command
    bowtie_process = subprocess.Popen(' '.join(
        ['set -exo pipefail;', full_command]),
                                      bufsize=-1,
                                      stdout=sys.stdout,
                                      stderr=sys.stderr,
                                      shell=True,
                                      executable='/bin/bash')
    return_code = bowtie_process.wait()
    if return_code:
        raise RuntimeError('Error occurred while reading first-pass Bowtie 2 '
                           'output; exitlevel was %d.' % return_code)
    os.remove(align_file)
    os.remove(other_reads_file)
    if not no_realign:
        input_command = 'gzip -cd %s' % second_pass_file
        bowtie_command = ' '.join([
            bowtie2_exe, bowtie2_args if bowtie2_args is not None else '',
            ' --sam-no-qname-trunc --local -t --no-hd --mm -x',
            bowtie2_index_base, '--12 -'
        ])
        delegate_command = ''.join([
            sys.executable, ' ',
            os.path.realpath(__file__)[:-3],
            ('_delegate.py --task-partition {task_partition} '
             '--min-readlet-size {min_readlet_size} '
             '{drop_deletions} '
             '--max-readlet-size {max_readlet_size} '
             '--readlet-interval {readlet_interval} '
             '--capping-multiplier {capping_multiplier:012f} '
             '{verbose} '
             '--report-multiplier {report_multiplier:012f} '
             '--k-value {k_value} '
             '--bowtie-idx {bowtie_index_base} '
             '--partition-length {bin_size} '
             '--manifest {manifest_file} '
             '{exon_differentials} {exon_intervals} '
             '--gzip-level {gzip_level} '
             '--search-filter {search_filter} '
             '--index-count {index_count} '
             '--tie-margin {tie_margin} '
             '{output_bam_by_chr}').format(
                 task_partition=task_partition,
                 min_readlet_size=min_readlet_size,
                 drop_deletions=('--drop-deletions' if drop_deletions else ''),
                 readlet_interval=readlet_interval,
                 max_readlet_size=max_readlet_size,
                 capping_multiplier=capping_multiplier,
                 verbose=('--verbose' if verbose else ''),
                 report_multiplier=report_multiplier,
                 k_value=k_value,
                 bowtie_index_base=bowtie_index_base,
                 bin_size=bin_size,
                 manifest_file=manifest_file,
                 exon_differentials=('--exon-differentials'
                                     if exon_differentials else ''),
                 exon_intervals=('--exon-intervals' if exon_intervals else ''),
                 gzip_level=gzip_level,
                 search_filter=search_filter,
                 index_count=index_count,
                 tie_margin=tie_margin,
                 output_bam_by_chr=('--output-bam-by-chr'
                                    if output_bam_by_chr else ''))
        ])
        full_command = ' | '.join(
            [input_command, bowtie_command, delegate_command])
        print >>sys.stderr, \
            'Starting second-pass Bowtie 2 with command: ' + full_command
        bowtie_process = subprocess.Popen(' '.join(
            ['set -exo pipefail;', full_command]),
                                          bufsize=-1,
                                          stdout=sys.stdout,
                                          stderr=sys.stderr,
                                          shell=True,
                                          executable='/bin/bash')
        return_code = bowtie_process.wait()
        if return_code:
            raise RuntimeError('Error occurred while reading second-pass '
                               'Bowtie 2 output; exitlevel was %d.' %
                               return_code)
    sys.stdout.flush()
Example #38
0
def edges_from_input_stream(input_stream, readlet_size=20,
    min_overlap_exon_size=1):
    """ Generates edges of directed acyclic graph (DAG) of introns.

        A DAG is constructed for each strand. Each node of the DAG represents
        a unique intron and is labeled by the tuple (intron_start, intron_end),
        where intron_start is the (1-based) coordinate of the first base of the
        intron, and intron_end is the coordinate of the first base after the
        intron. An edge occurs between two introns A and B iff they do not
        overlap, and no intron C occurs between A and B such that A, B, and C 
        do not overlap. The intron with larger coordinates is the child of
        the intron with smaller coordinates. Weight each edge by the number of
        exonic bases between the introns it connects.

        The DAG has sources and sinks. Pad the DAG with new sources and sinks
        as follows. Extend an edge to each original source
        (source_start, source_end) from a new source labeled by the tuple
        (None, max(1, source_start - readlet_size + 1)). Extend an edge
        from each original sink (sink_start, sink_end) to a new sink labeled by
        the tuple (sink_end + readlet_size - 1, None).
        So each original source is assigned exactly one new source, and each
        original sink is assigned exactly one new sink.

        The paths through this DAG span all possible combinations of
        nonoverlapping introns on the strand. Finding all subpaths (sequences
        of introns), each of whose weights is <= readlet_size, redundantly
        enumerates all possible combinations of introns a readlet
        can overlap. Unfortunately, obtaining such combinations in
        "hot spots," where there are many alternative splicings and short
        exons, can become computationally intractable for large readlet sizes.
        To (optionally) control these combinatorial blow-ups, impose an
        effective minimum exon size min_overlap_exon_size by redefining overlap
        between two introns: two introns overlap if there are fewer than
        min_overlap_exon_size exonic bases between them.

        The algorithm for generating the edges of the DAG operates on an input
        stream whose lines are composed of the following tab-separated fields:

        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Sample index
        3. Intron start position (inclusive)
        4. Intron end position (exclusive)

        The input is partitioned by strand/sample index (fields 1-2) and sorted
        by the remaining fields. INPUT COORDINATES ARE ASSUMED TO BE 1-INDEXED.

        Introns are sorted by start position. To begin, the first set of
        mutually overlapping introns are connected to their new sources. Two
        data structures encode the graph as it is constructed [1]: the set
        unlinked_nodes, containing introns that do not yet have any children,
        and linked_nodes, a dictionary that, where possible, maps each intron A
        to its corresponding successive nonoverlapping intron B with the
        smallest end position read so far. With each new intron N read, the
        nodes in unlinked_nodes and linked_nodes are checked for whether N is
        their child, and these edges are yielded. (Note that by construction,
        nodes are streamed in topological order.) Nodes from unlinked_nodes may
        thus be promoted to linked_nodes. Each value node in linked_nodes is
        also replaced with N if N has a smaller end position. Then every node A
        in linked_nodes is checked for a path A -> B -> N. If such a path
        exists, an edge can NEVER connect A with any successive introns, and A
        is removed from linked_nodes. The algorithm continues until the end of
        the strand is reached, when the edges connecting remaining
        unlinked_nodes and their new sinks are yielded.

        [1] Technically, there are three data structures. unlinked_nodes and
        linked_nodes contain only indices of introns, and "introns" is
        a dictionary that maps indices to tuples (intron_start, intron_end).

        input_stream: where to find sorted introns of the form specified above.
        readlet_size: maximum readlet size; governs how far new sources and
            sinks are padded away from original sources and sinks.
        min_overlap_exon_size: minimum number of exonic bases between two
            introns for them to be considered nonoverlapping.

        Yield value: An edge tuple (strand,
                                    sample_index,
                                    (intron A start, intron A end),
                                    (intron B start, intron B end)) or None
                     at the beginning of a new partition.
    """
    global _input_line_count
    for key, xpartition in xstream(input_stream, 2, skip_duplicates=True):
        unlinked_nodes = set()
        for q, value in enumerate(xpartition):
            assert len(value) == 2
            _input_line_count += 1
            if not q:
                # Denote start of new partition
                yield None
                # Handle first intron from partition
                intron_start, intron_end = int(value[0]), int(value[1])
                # Create fake source before first intron
                fake_source = (
                        None,
                        max(intron_start - (readlet_size - 1), 1)
                    )
                introns = {
                        0 : fake_source,
                        1 : (intron_start, intron_end)
                    }
                linked_nodes = { 0 : 1 }
                unlinked_nodes = set([1])
                index = 2
                # Yield first edge for strand (before first intron)
                yield key + (fake_source,
                                (intron_start, intron_end))
            else:
                # Handle next introns from partition
                intron_start, intron_end = int(value[0]), int(value[1])
                introns[index] = (intron_start, intron_end)
                nodes_to_trash = []
                # Promote unlinked nodes whose child the new intron is
                for node in unlinked_nodes:
                    if intron_start >= introns[node][1] + \
                        min_overlap_exon_size:
                        nodes_to_trash.append(node)
                for node in nodes_to_trash:
                    linked_nodes[node] = index
                    unlinked_nodes.remove(node)
                unlinked_nodes.add(index)
                nodes_to_trash = []
                for node in linked_nodes:
                    intermediate_node = linked_nodes[node]
                    if intermediate_node in linked_nodes:
                        # Path node -> intermediate -> N exists; node is done
                        nodes_to_trash.append(node)
                    else:
                        yield key + (introns[node],
                                            (intron_start, intron_end))
                        # Keep the child with the smallest end position
                        if introns[intermediate_node][1] > intron_end:
                            linked_nodes[node] = index
                for node in nodes_to_trash:
                    del linked_nodes[node]
                    del introns[node]
                index += 1
        # Yield final edges for strand (connect remaining nodes to new sinks)
        for node in unlinked_nodes:
            current_intron = introns[node]
            yield key + (current_intron, (current_intron[1]
                                            + readlet_size - 1, None))
Example #39
0
# Command-line interface: module docstring doubles as the help description
parser = argparse.ArgumentParser(description=__doc__, 
            formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument(\
    '--verbose', action='store_const', const=True, default=False,
    help='Print out extra debugging statements')
# Add Bowtie- and read-grouping-specific options defined by sibling modules
bowtie.add_args(parser)
group_reads.add_args(parser)
args = parser.parse_args()

start_time = time.time()
input_line_count = 0
# Bowtie index provides reference sequence lookups by RNAME
reference_index = bowtie_index.BowtieIndexReference(
                            os.path.expandvars(args.bowtie_idx)
                        )
# Assigns reads to one of args.index_count transcriptome index groups
group_reads_object = group_reads.IndexGroup(args.index_count)
# Input is partitioned by (rname + strand char, intron start positions,
# intron end positions); each partition holds reads sharing an intron combo
for (rname, poses, end_poses), xpartition in xstream(sys.stdin, 3,
                                                        skip_duplicates=True):
    # Last character of rname encodes the sense strand ('+' or '-')
    reverse_strand_string = rname[-1]
    rname = rname[:-1]
    read_seqs = dlist()
    # Comma-separated coordinate lists -> int lists
    poses = [int(pos) for pos in poses.split(',')]
    end_poses = [int(end_pos) for end_pos in end_poses.split(',')]
    max_left_extend_size, max_right_extend_size = None, None
    # Track largest left/right extensions across all reads in the partition;
    # max(None, int) picks the int on the first iteration (Python 2 ordering)
    for left_extend_size, right_extend_size, read_seq in xpartition:
        input_line_count += 1
        max_left_extend_size = max(max_left_extend_size, int(left_extend_size))
        max_right_extend_size \
            = max(max_right_extend_size, int(right_extend_size))
        read_seqs.append(read_seq)
    # Pair up intron starts with their corresponding ends
    intron_combo = zip(poses, end_poses)
    assert max_left_extend_size is not None
    assert max_right_extend_size is not None
Example #40
0
def go(
    manifest_object,
    input_stream=sys.stdin,
    output_stream=sys.stdout,
    sample_fraction=0.05,
    coverage_threshold=5,
    collect_junctions=False,
    verbose=False,
):
    """ Runs Rail-RNA-junction_filter.

        Filters out every junction from input_stream that is not either:
          (1) in round(sample_fraction * (total number of samples)) samples OR
          (2) found in at least coverage_threshold reads in at least one
            sample.

        Input (read from stdin)
        ----------------------------
        Tab-delimited columns:
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Intron start position (inclusive)
        3. Intron end position (exclusive)
        4. '\x1f'-separated list of sample indexes in which junction was found
        5. '\x1f'-separated list of numbers of reads in which junction was
            found in respective sample specified by field 4

        Input is partitioned by fields 1-3.

        Hadoop output (written to stdout)
        ----------------------------
        Tab-delimited tuple columns (filter):
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Sample index
        3. Intron start position (inclusive)
        4. Intron end position (exclusive)

        If --collect-junctions is True:
        Tab-delimited tuple columns (collect):
        1. Reference name (RNAME in SAM format) +
            '+' or '-' indicating which strand is the sense strand
        2. Intron start position (inclusive)
        3. Intron end position (exclusive)
        4. '\x1f'-separated list of sample indexes in which junction was found
        5. '\x1f'-separated list of numbers of reads in which junction was
            found in respective sample specified by field 4

        ALL OUTPUT COORDINATES ARE 1-INDEXED.

        input_stream: where to find input junctions
        output_stream: where to write output
        manifest_object: object of class LabelsAndIndices that maps indices
            to labels and back; used to count number of samples.
        sample_fraction: fraction of samples in which a junction must appear
            to pass filter if coverage_threshold criterion is not satisfied
        coverage_threshold: number of reads that must overlap junction in at
            least one sample to pass filter if sample_fraction criterion is not
            satisfied; -1 disables this criterion
        collect_junctions: collects and outputs junctions across samples;
            ignores sample_fraction and coverage_threshold
        verbose: output extra debugging statements

        Return value: tuple (input line count, output line count)
    """
    input_line_count, output_line_count = 0, 0
    # Minimum number of samples a junction must appear in to pass criterion (1)
    min_sample_count = int(round(len(manifest_object.label_to_index) * sample_fraction))
    for (rname_and_strand, pos, end_pos), xpartition in xstream(input_stream, 3):
        # Total read count supporting the junction, per sample index
        sample_indexes = defaultdict(int)
        for current_sample_indexes, current_sample_counts in xpartition:
            input_line_count += 1
            current_sample_counts = current_sample_counts.split("\x1f")
            for i, sample_index in enumerate(current_sample_indexes.split("\x1f")):
                sample_indexes[sample_index] += int(current_sample_counts[i])
        pos, end_pos = int(pos), int(end_pos)
        if collect_junctions:
            # Emit per-junction sample/coverage summary, sorted by sample index
            samples_to_dump = sorted(sample_indexes.items(), key=lambda sample: int(sample[0]))
            print >> output_stream, "collect\t%s\t%012d\t%012d\t%s\t%s" % (
                rname_and_strand,
                pos,
                end_pos,
                ",".join([sample[0] for sample in samples_to_dump]),
                ",".join([str(sample[1]) for sample in samples_to_dump]),
            )
            output_line_count += 1
        sample_count = len(sample_indexes)
        max_coverage = max(sample_indexes.values())
        # Pass filter if criterion (1) or criterion (2) holds; end_pos > pos
        # guards against degenerate junctions
        if end_pos > pos and (
            sample_count >= min_sample_count or (max_coverage >= coverage_threshold and coverage_threshold != -1)
        ):
            for sample_index in sample_indexes:
                print >> output_stream, "filter\t%s\t%s\t%012d\t%012d" % (rname_and_strand, sample_index, pos, end_pos)
                output_line_count += 1
        elif verbose:
            # Fixed message: the old text claimed coverage "did not exceed"
            # max_coverage itself, which was tautological; report it directly
            print >>sys.stderr, (
                "Junction (%s, %d, %d) filtered out; it appeared in %d "
                "sample(s), and its maximum coverage in any one sample "
                "was %d."
            ) % (rname_and_strand, pos, end_pos, sample_count, max_coverage)
    return input_line_count, output_line_count
Example #41
0
                                        combined_stream,
                                        retrieved_intron_counts,
                                        instance=args.read_instance)
 import subprocess
 sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
 subprocess.check_call(' '.join([
     'sort -T %s -k1,1' % temp_dir_path, combined_file, '>',
     sorted_combined_file
 ]),
                       bufsize=-1,
                       shell=True)
 relevant = 0
 retrieved = 0
 relevant_and_retrieved = 0
 with open(sorted_combined_file) as sorted_combined_stream:
     for (name, ), xpartition in xstream(sorted_combined_stream, 1):
         relevant_and_retrieved_instances = list(xpartition)
         ts = [
             instance[:-1] for instance in relevant_and_retrieved_instances
             if instance[-1] == 't' and (
                 args.coverage_threshold is None or any([
                     intron_counts[intron] <= args.coverage_threshold
                     for intron in instance[:-1]
                 ]))
         ]
         rs = [
             instance[:-1] for instance in relevant_and_retrieved_instances
             if instance[-1] == 'r' and (
                 args.coverage_threshold is None or any([
                     intron_counts[intron] <= args.coverage_threshold
                     for intron in instance[:-1]