import os
import sys

def write_read_introns_from_sam_stream(sam_stream, output_stream,
                                        retrieved_intron_counts,
                                        instance=False):
    """ Writes output that maps QNAMES to exon-exon junctions overlapped.

        sam_stream: where to find retrieved alignments in SAM form
        output_stream: where to write output. Each line takes the form:
            <read name><TAB>RNAME<TAB><sorted list of intron starts and ends>
            <TAB>['r' for 'retrieved']
        retrieved_intron_counts: defaultdict(int) that counts number of
            retrieved alignments overlapping exon-exon junction

        No return value.
    """
    for line in sam_stream:
        if line[0] == '@': continue
        try:
            tokens = line.strip().split('\t')
            flag = int(tokens[1])
            # Skip unmapped (flag & 4) and secondary (flag & 256)
            # alignments
            if flag & 4 or flag & 256:
                continue
            name = tokens[0]
            rname = tokens[2]
            cigar = tokens[5]
            pos = int(tokens[3])
            seq = tokens[9]
            if 'N' not in cigar:
                # No junctions overlapped
                continue
            # The MD string is not needed here; dummy_md_index() is
            # assumed to fabricate one consistent with the CIGAR
            _, _, introns, _ = indels_junctions_and_exons(cigar,
                                        dummy_md_index(cigar), pos, seq)
            # Keep only (start, end) of each junction and render each
            # as "RNAME;start;end"
            introns = [intron[:2] for intron in introns]
            introns = [rname
                          + ';'.join([''] + [str(bound) for bound in intron])
                          for intron in sorted(introns)]
            if instance:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                print >>output_stream, '%s\t%s\tr' % (name, '\t'.join(introns))
            else:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                    print >>output_stream, '%s;%s\t%s\tr' % (name,
                                                             intron,
                                                             intron)
        except IndexError:
            print >>sys.stderr, ('Error found on line: ' + line)
            raise
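
# A minimal usage sketch for the function above. The SAM path and the
# function name below are hypothetical, not part of this module; counts
# land in a defaultdict keyed by "RNAME;start;end" junction strings.
def example_junction_counts(sam_path='retrieved.sam'):
    from collections import defaultdict
    junction_counts = defaultdict(int)
    with open(sam_path) as sam_stream:
        write_read_introns_from_sam_stream(sam_stream, sys.stdout,
                                            junction_counts)
    return junction_counts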
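# go() below leans on xstream(), a helper defined elsewhere in this
# codebase. From its use here, xstream(stream, k) is assumed to yield,
# for each run of consecutive lines sharing their first k tab-separated
# fields, that key as a tuple paired with an iterator over the remaining
# fields of each line. A minimal sketch under that assumption:
def xstream_sketch(input_stream, key_field_count):
    from itertools import groupby
    split_lines = (line.strip().split('\t') for line in input_stream
                    if line.strip())
    for key, group in groupby(split_lines,
                                key=lambda tokens: tokens[:key_field_count]):
        yield tuple(key), (tokens[key_field_count:] for tokens in group)
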
def go(true_bed_stream, sam_stream=sys.stdin, generous=False,
        base_threshold=0.5, clip_threshold=1.0, dump_incorrect=False,
        temp_dir=None, ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        temp_dir: directory in which to store temporary files, or None
            to use the system default
        ignore_spliced_reads: ignores all spliced reads

        Return value: tuple (basewise retrieved count, basewise relevant
            count, basewise intersection count, read retrieved count,
            read relevant count, read intersection count)
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        except OSError:
            # Fall back on the system default location if temp_dir
            # is unusable
            temp_dir_path = tempfile.mkdtemp()
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort it on read name
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    with open(combined_file, 'w') as temp_stream:
        for line in true_bed_stream:
            tokens = line.strip().split('\t')
            if ignore_spliced_reads and ',' in tokens[-1]:
                continue # skip spliced (multi-block) line
            # In generous mode, the aligner strips /1 or /2 from
            # read names
            name = tokens[3][:-2] if generous else tokens[3]
            print >>temp_stream, '\t'.join([name, '0']
                                            + tokens[:3] + tokens[4:])
        for line in sam_stream:
            if line[0] == '@' or not line.strip(): continue
            tokens = line.strip().split('\t')
            if ignore_spliced_reads and 'N' in tokens[5]:
                continue # skip spliced alignment
            print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
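    # Sorting on read name (-k1,1) and then the truth/alignment marker
    # (-k2,2n) groups each read's true BED lines directly before its
    # SAM alignment lines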
    subprocess.check_call(' '.join(['sort -T %s -k1,1 -k2,2n'
                                        % temp_dir_path, combined_file, 
                                        '>', sorted_combined_file]),
                            bufsize=-1, shell=True)
    # Initialize counters for computing accuracy metrics
    basewise_relevant, read_relevant = 0, 0
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        for (name,), xpartition in xstream(sorted_combined_stream, 1):
            '''true_maps: list of this read's true alignments, each a
            list of exon tuples (chrom, 0-based start, 0-based
            exclusive end)'''
            true_maps = []
            saved = []
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    if len(tokens) < 12:
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    chrom_end = int(tokens[3])
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
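                    # Per the BED spec, blockStarts are relative to
                    # chromStart, and chromStart is 0-based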
                    exons = [(chrom,
                                chrom_start + int(block_starts[i]),
                                chrom_start + int(block_starts[i])
                                + int(block_sizes[i]))
                                for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum([int(block_size) for block_size
                                                in block_sizes])
                    read_relevant += 1
                elif tokens[0] == '1':
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        '''Secondary alignment or unmapped and thus not
                        retrieved; ignore'''
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
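                    # dummy_md_and_mapped_offsets() is assumed to return
                    # a fabricated MD string consistent with the CIGAR,
                    # the read offsets aligning to the reference, whether
                    # clipping exceeds clip_threshold, the clipped base
                    # count, and the read length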
                    (dummy_md, mapped,
                        unmapped, clip_count, read_length) \
                        = dummy_md_and_mapped_offsets(
                                            cigar,
                                            clip_threshold=clip_threshold
                                        )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            '''chr is wrong, but this is still counted as a
                            retrieval above'''
                            continue
                        base_counter, base_truths = 0, set()
                        '''Each tuple in base_truths is (index of base
                        in read, 1-based genomic position)'''
                        for block in true_map:
                            base_truths.update([(base_counter + i, j + 1)
                                                    for i, j in enumerate(
                                                        xrange(
                                                            block[1], block[2]
                                                        ))])
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        _, _, _, exons = indels_junctions_and_exons(
                                                        cigar,
                                                        dummy_md, pos, seq,
                                                        drop_deletions=True
                                                    )
                        mapped_index = 0
                        for exon in exons:
                            base_predictions.update(
                                        [(mapped[mapped_index + i], j)
                                                  for i, j in enumerate(
                                                    xrange(
                                                        exon[0], exon[1]
                                                    ))])
                            mapped_index += exon[1] - exon[0]
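                        # A predicted base is correct iff it lands at
                        # its true genomic coordinate; keep the best
                        # overlap across this read's true maps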
                        intersected_base_count = max(intersected_base_count,
                                len(
                                    base_predictions.intersection(base_truths)
                                ))
                    basewise_intersection += intersected_base_count
                    if intersected_base_count >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        print >>sys.stderr, '\t'.join(
                                ['.'.join(line) for line in saved]
                            )
                else:
                    raise RuntimeError(
                                'Invalid intermediate line.'
                            )
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)
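
# The six counts go() returns are enough to compute precision and
# recall, basewise and per read; a sketch (the function name below is
# illustrative, not part of this module):
def precision_and_recall(counts):
    (basewise_retrieved, basewise_relevant, basewise_intersection,
        read_retrieved, read_relevant, read_intersection) = counts
    return (float(basewise_intersection) / basewise_retrieved,  # precision
            float(basewise_intersection) / basewise_relevant,   # recall
            float(read_intersection) / read_retrieved,          # precision
            float(read_intersection) / read_relevant)           # recall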