Example #1
def write_read_introns_from_sam_stream(sam_stream,
                                       output_stream,
                                       retrieved_intron_counts,
                                       instance=False):
    """ Writes output that maps QNAMES to exon-exon junctions overlapped.

        sam_stream: where to find retrieved alignments in SAM form
        output_stream: where to write output. Each line takes the form:
            <read name><TAB>RNAME<TAB><sorted list of intron starts and ends>
            <TAB>['r' for 'retrieved']
        retrieved_intron_counts: defaultdict(int) that counts number of
            retrieved alignments overlapping each exon-exon junction
        instance: True iff a single output line is written per read rather
            than one line per overlapped intron

        No return value.
    """
    for line in sam_stream:
        if line[0] == '@': continue
        try:
            tokens = line.strip().split('\t')
            flag = int(tokens[1])
            if flag & 4:
                continue
            name = tokens[0]
            rname = tokens[2]
            cigar = tokens[5]
            pos = int(tokens[3])
            seq = tokens[9]
            if 'N' not in cigar or flag & 256:
                continue
            #md = [token[5:] for token in tokens if token[:5] == 'MD:Z:'][0]
            _, _, introns, _ = indels_introns_and_exons(
                cigar, dummy_md_index(cigar), pos, seq)
            introns = [intron[:2] for intron in introns]
            introns = [
                rname + ';'.join([''] + [str(bound) for bound in intron])
                for intron in sorted(introns)
            ]
            if instance:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                print >> output_stream, '%s\t%s\tr' % (name,
                                                       '\t'.join(introns))
            else:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                    print >> output_stream, '%s;%s\t%s\tr' % (name, intron,
                                                              intron)
        except IndexError:
            print >> sys.stderr, ('Error found on line: ' + line)
            raise
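
A minimal driver for the function above might look like the following
sketch; the SAM path is hypothetical, and the helpers the function calls
(indels_introns_and_exons, dummy_md_index) are assumed importable from the
surrounding module:

import sys
from collections import defaultdict

retrieved_intron_counts = defaultdict(int)
with open('retrieved_alignments.sam') as sam_stream:  # hypothetical path
    write_read_introns_from_sam_stream(sam_stream,
                                       sys.stdout,
                                       retrieved_intron_counts)
# retrieved_intron_counts now maps keys of the form 'RNAME;start;end' to
# the number of primary spliced alignments overlapping each junction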
Example #2
def go(true_bed_stream, sam_stream=sys.stdin, generous=False,
        base_threshold=0.5, clip_threshold=1.0, dump_incorrect=False,
        temp_dir=None, ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        temp_dir: where to write temporary files (None invokes the system
            default)
        ignore_spliced_reads: True iff all spliced reads should be ignored

        Return value: 6-tuple (basewise_retrieved, basewise_relevant,
            basewise_intersection, read_retrieved, read_relevant,
            read_intersection)
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        except OSError:
            # Fall back to the system default if temp_dir is unusable
            temp_dir_path = tempfile.mkdtemp()
    #print >>sys.stderr, temp_dir_path
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort on read name; the second
    # field ('0' for true BED lines, '1' for SAM alignments) makes true
    # lines sort before alignments within each read's partition
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    with open(combined_file, 'w') as temp_stream:
        if ignore_spliced_reads:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip spliced read
                    print >>temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3] + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip spliced read
                    print >>temp_stream, '\t'.join(
                                    [tokens[3], '0'] + tokens[:3] + tokens[4:]
                                )
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                if 'N' in tokens[5]: continue # skip spliced alignment
                print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
        else:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3] + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join(
                                    [tokens[3], '0'] + tokens[:3] + tokens[4:]
                                )
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
    subprocess.check_call(' '.join(['sort -T %s -k1,1 -k2,2n'
                                        % temp_dir_path, combined_file, 
                                        '>', sorted_combined_file]),
                            bufsize=-1, shell=True)
    basewise_relevant, read_relevant = 0, 0
    # Initialize counters for computing accuracy metrics
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        for (name,), xpartition in xstream(sorted_combined_stream, 1):
            '''true_maps lists this read's true alignments, each a list of
            exon blocks (chrom, 1-based start, 1-based end)'''
            true_maps = []
            saved = []
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    if len(tokens) < 12:
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    chrom_end = int(tokens[3])
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
                    exons = [(chrom,
                                chrom_start + int(block_starts[i]),
                                chrom_start + int(block_starts[i])
                                + int(block_sizes[i]))
                                for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum([int(block_size) for block_size
                                                in block_sizes])
                    read_relevant += 1
                elif tokens[0] == '1':
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        '''Secondary alignment or unmapped and thus not
                        retrieved; ignore'''
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
                    (dummy_md, mapped,
                        unmapped, clip_count, read_length) \
                        = dummy_md_and_mapped_offsets(
                                            cigar,
                                            clip_threshold=clip_threshold
                                        )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            '''chr is wrong, but this is still counted as a
                            retrieval above'''
                            continue
                        base_counter, base_truths = 0, set()
                        '''Each tuple in base_truths is
                        (index of base in read, mapped location)'''
                        for block in true_map:
                            base_truths.update([(base_counter + i, j + 1)
                                                    for i, j in enumerate(
                                                        xrange(
                                                            block[1], block[2]
                                                        ))])
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        _, _, _, exons = indels_introns_and_exons(
                                                        cigar,
                                                        dummy_md, pos, seq,
                                                        drop_deletions=True
                                                    )
                        mapped_index = 0
                        for exon in exons:
                            base_predictions.update(
                                        [(mapped[mapped_index + i], j)
                                                  for i, j in enumerate(
                                                    xrange(
                                                        exon[0], exon[1]
                                                    ))])
                            mapped_index += exon[1] - exon[0]
                        intersected_base_count = max(intersected_base_count,
                                len(
                                    base_predictions.intersection(base_truths)
                                ))
                    basewise_intersection += intersected_base_count
                    if intersected_base_count >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        print >>sys.stderr, '\t'.join(
                                ['.'.join(line) for line in saved]
                            )
                else:
                    raise RuntimeError(
                                'Invalid intermediate line.'
                            )
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)
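
The six counts returned above are the raw ingredients of standard precision
and recall: precision is intersection over retrieved, and recall is
intersection over relevant. A minimal sketch of how a caller might reduce
them, assuming hypothetical input filenames:

with open('sim_reads.bed') as bed_stream, \
        open('alignments.sam') as sam_stream:  # hypothetical paths
    (basewise_retrieved, basewise_relevant, basewise_intersection,
     read_retrieved, read_relevant, read_intersection) = go(
            bed_stream, sam_stream=sam_stream)
basewise_precision = float(basewise_intersection) / basewise_retrieved
basewise_recall = float(basewise_intersection) / basewise_relevant
read_precision = float(read_intersection) / read_retrieved
read_recall = float(read_intersection) / read_relevant
print 'basewise precision=%.4f recall=%.4f' % (basewise_precision,
                                               basewise_recall)
print 'read precision=%.4f recall=%.4f' % (read_precision, read_recall)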
Example #3
def go(input_stream=sys.stdin,
       output_stream=sys.stdout,
       fudge=5,
       stranded=False,
       verbose=False,
       report_multiplier=1.2):
    """ Emits intron combinations associated with reads.

        Soft-clipped Bowtie 2 alignments of read sequences to the transcript
        fragment index are used to infer which cointrons could possibly be
        overlapped by reads. Then maximal cliques of the graph described in
        the maximal_cliques() function are enumerated to obtain which
        intron combinations could possibly be overlapped by reads.

        input_stream: where to retrieve Bowtie 2 output
        output_stream: where to emit exon and intron tuples; typically, this is
            sys.stdout.
        verbose: True if alignments should occasionally be written to stderr.
        stranded: True iff input reads are strand-specific; this affects
            whether an output partition has a terminal '+' or '-' indicating
            the sense strand. Further, if stranded is True, an alignment is
            returned only if its strand agrees with the intron's strand.
        fudge: number of bases by which to extend left and right extend
            sizes to accommodate potential indels
        report_multiplier: if verbose is True, the line number of an
            alignment written to stderr increases exponentially with base
            report_multiplier.
    """
    output_line_count, next_report_line, i = 0, 0, 0
    for (qname, ), xpartition in xstream(input_stream, 1):
        '''While labeled multireadlet, this list may end up simply a
        unireadlet.'''
        multiread = []
        for tokens in xpartition:
            flag = int(tokens[0])
            if verbose and next_report_line == i:
                print >>sys.stderr, \
                    'SAM output record %d: rdname="%s", flag=%d' % (i,
                                                                    qname,
                                                                    flag)
                next_report_line = int(
                    (next_report_line + 1) * report_multiplier + 1) - 1
            i += 1
            multiread.append((qname, ) + tokens)
        # If the read is unmapped, flag & 4 holds for every record in the
        # partition, so checking the last record's flag suffices
        if flag & 4: continue
        corrected_multiread = multiread_with_introns(multiread, stranded)
        all_introns = {}
        for alignment in corrected_multiread:
            cigar = alignment[5]
            md = [field for field in alignment if field[:5] == 'MD:Z:'][0][5:]
            pos = int(alignment[3])
            seq = alignment[9]
            reversed_complement_seq = seq[::-1].translate(
                _reversed_complement_translation_table)
            if seq < reversed_complement_seq:
                seq_to_print = seq
            else:
                seq_to_print = reversed_complement_seq
            seq_size = len(seq)
            rname = alignment[2]
            sense = [field for field in alignment
                     if field[:5] == 'XS:A:'][0][5:]
            if (rname, sense) not in all_introns:
                all_introns[(rname, sense)] = defaultdict(list)
            _, _, introns, _ = indels_introns_and_exons(cigar, md, pos, seq)
            for intron in introns:
                intron_key = (intron[0], intron[1])
                extend_sizes = all_introns[(rname, sense)]
                if intron_key not in extend_sizes:
                    extend_sizes[intron_key] = [intron[2], intron[3]]
                else:
                    # Keep the maximal left and right extend sizes seen
                    # across alignments for this intron
                    extend_sizes[intron_key][0] = max(
                            extend_sizes[intron_key][0], intron[2])
                    extend_sizes[intron_key][1] = max(
                            extend_sizes[intron_key][1], intron[3])
        for rname, sense in all_introns:
            to_write = set()
            # Grab maximal cliques
            for clique in \
                maximal_cliques(all_introns[(rname, sense)].keys()):
                for cointrons in separated_introns(clique,
                                                   separation=(seq_size +
                                                               fudge)):
                    cointrons.sort()
                    left_extend_size = all_introns[(rname, sense)][(
                        cointrons[0][0], cointrons[0][1])][0]
                    right_extend_size = all_introns[(rname, sense)][(
                        cointrons[-1][0], cointrons[-1][1])][1]
                    to_write.add(
                        ('{rname}{sense}\t{starts}'
                         '\t{ends}\t{left_size}'
                         '\t{right_size}\t{seq}').format(
                             rname=rname,
                             sense=sense,
                             starts=','.join(
                                 [str(intron[0]) for intron in cointrons]),
                             ends=','.join(
                                 [str(intron[1]) for intron in cointrons]),
                             left_size=(left_extend_size + fudge),
                             right_size=(right_extend_size + fudge),
                             seq=seq_to_print))
            for line_to_write in to_write:
                print >>output_stream, line_to_write
                output_line_count += 1
    output_stream.flush()
    print >> sys.stderr, (
        'cointron_enum_delegate.py reports %d output lines.' %
        output_line_count)
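
The table _reversed_complement_translation_table referenced above is
defined elsewhere in the module. A minimal sketch of one plausible
definition, under that assumption: it complements each base so that
seq[::-1].translate(...) yields the reverse complement, letting the
function print the lexicographically smaller of the two strands.

import string

# Assumed definition: complement each base; reversing first and then
# translating gives the reverse complement
_reversed_complement_translation_table = string.maketrans('ACGTacgt',
                                                          'TGCAtgca')

seq = 'GAATTC'  # EcoRI site, which is its own reverse complement
assert seq[::-1].translate(_reversed_complement_translation_table) == seq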