def write_read_introns_from_sam_stream(sam_stream, output_stream,
                                        retrieved_intron_counts,
                                        instance=False):
    """ Writes output that maps QNAMES to exon-exon junctions overlapped.

        sam_stream: where to find retrieved alignments in SAM form
        output_stream: where to write output. Each line takes the form:
            <read name><TAB>RNAME<TAB><sorted list of intron starts and ends>
            <TAB>['r' for 'retrieved']
        retrieved_intron_counts: defaultdict(int) that counts number of
            retrieved alignments overlapping exon-exon junction
        instance: if True, write a single line per alignment listing all of
            its junctions; if False (default), write one line per junction

        No return value.
    """
    for line in sam_stream:
        if line[0] == '@':
            # SAM header line; skip
            continue
        try:
            tokens = line.strip().split('\t')
            flag = int(tokens[1])
            if flag & 4:
                # Flag bit 4: read is unmapped; skip
                continue
            name = tokens[0]
            rname = tokens[2]
            cigar = tokens[5]
            pos = int(tokens[3])
            seq = tokens[9]
            if 'N' not in cigar or flag & 256:
                # No 'N' in CIGAR means no junctions overlapped; flag bit
                # 256 marks a secondary alignment. Skip either way.
                continue
            _, _, introns, _ = indels_junctions_and_exons(
                    cigar, dummy_md_index(cigar), pos, seq
                )
            # Retain only (intron start, intron end) from each tuple
            introns = [intron[:2] for intron in introns]
            # Encode each junction as "<RNAME>;<start>;<end>", sorted by
            # genomic coordinate
            introns = [rname + ';'.join(
                            [''] + [str(bound) for bound in intron]
                        ) for intron in sorted(introns)]
            if instance:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                print >>output_stream, '%s\t%s\tr' % (
                        name, '\t'.join(introns)
                    )
            else:
                for intron in introns:
                    retrieved_intron_counts[intron] += 1
                    print >>output_stream, '%s;%s\t%s\tr' % (
                            name, intron, intron
                        )
        except IndexError:
            print >>sys.stderr, ('Error found on line: ' + line)
            raise
def go(true_bed_stream, sam_stream=sys.stdin, generous=False,
        base_threshold=0.5, clip_threshold=1.0, dump_incorrect=False,
        temp_dir=None, ignore_spliced_reads=False):
    """ Finds relevant and retrieved instance counts.

        true_bed_stream: file handle for BED output of Flux simulation
        sam_stream: where to read in aligner's mappings
        generous: True iff aligner cuts off /1 or /2 of a given read
        base_threshold: proportion of a read's bases that must align
            correctly for a read to be considered a correct mapping
        clip_threshold: proportion of a read's bases that must be clipped
            for a read to be considered unmapped
        dump_incorrect: write incorrect (read) alignments to stderr
        ignore_spliced_reads: ignores all spliced reads

        Return value: 6-tuple (basewise_retrieved, basewise_relevant,
            basewise_intersection, read_retrieved, read_relevant,
            read_intersection) for computing precision/recall at both the
            base and the read level.
    """
    from tempdel import remove_temporary_directories
    import tempfile
    import atexit
    if temp_dir is None:
        temp_dir_path = tempfile.mkdtemp()
    else:
        try:
            temp_dir_path = tempfile.mkdtemp(dir=temp_dir)
        # NOTE(review): bare except silently falls back to the default temp
        # location on ANY failure (even KeyboardInterrupt); consider
        # narrowing to OSError
        except:
            temp_dir_path = tempfile.mkdtemp()
    #print >>sys.stderr, temp_dir_path
    # Clean up the scratch directory no matter how the process exits
    atexit.register(remove_temporary_directories, [temp_dir_path])
    # Store everything in one file, then sort it on read name.
    # Each intermediate line is tagged in field 2: '0' = truth (BED),
    # '1' = aligner output (SAM), so that after sorting, truth records
    # precede SAM records within each read-name partition.
    combined_file = os.path.join(temp_dir_path, 'combined.temp')
    with open(combined_file, 'w') as temp_stream:
        if ignore_spliced_reads:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    # A comma in the last BED field (blockStarts) means the
                    # read spans >1 block, i.e. is spliced; skip it
                    if ',' in tokens[-1]: continue # skip intron line
                    # tokens[3][:-2] strips the /1 or /2 mate suffix the
                    # "generous" aligner removed from read names
                    print >>temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3]
                                                    + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    if ',' in tokens[-1]: continue # skip intron line
                    print >>temp_stream, '\t'.join(
                            [tokens[3], '0'] + tokens[:3] + tokens[4:]
                        )
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                # 'N' in the CIGAR (field 6) marks a spliced alignment
                if 'N' in tokens[5]: continue # skip intron line
                print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
        else:
            if generous:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join([tokens[3][:-2], '0']
                                                    + tokens[:3]
                                                    + tokens[4:])
            else:
                for line in true_bed_stream:
                    tokens = line.strip().split('\t')
                    print >>temp_stream, '\t'.join(
                            [tokens[3], '0'] + tokens[:3] + tokens[4:]
                        )
            for line in sam_stream:
                if line[0] == '@' or not line.strip(): continue
                tokens = line.strip().split('\t')
                print >>temp_stream, '\t'.join([tokens[0], '1'] + tokens[1:])
    import subprocess
    sorted_combined_file = os.path.join(temp_dir_path, 'combined.sorted.temp')
    # External sort keeps memory bounded for large inputs.
    # NOTE(review): shell=True with a string-built command; temp_dir_path
    # comes from tempfile so this is likely safe, but a list argv with
    # stdout redirection would be more robust
    subprocess.check_call(' '.join(['sort -T %s -k1,1 -k2,2n'
                                        % temp_dir_path,
                                    combined_file, '>',
                                    sorted_combined_file]),
                            bufsize=-1, shell=True)
    basewise_relevant, read_relevant = 0, 0
    # Initialize counters for computing accuracy metrics
    basewise_retrieved, basewise_intersection = 0, 0
    read_retrieved, read_intersection = 0, 0
    with open(sorted_combined_file) as sorted_combined_stream:
        # xstream yields one partition of lines per read name; within a
        # partition, '0'-tagged truth lines sort before '1'-tagged SAM lines
        for (name,), xpartition in xstream(sorted_combined_stream, 1):
            '''Dict mapping read names to alignments
            (chrom, 1-based start, 1-based end)'''
            true_maps = []
            saved = []  # all tokenized lines of the partition, for dumping
            for tokens in xpartition:
                saved.append(tokens)
                if tokens[0] == '0':
                    # Truth (BED) record: decode exon blocks
                    if len(tokens) < 12:
                        # Malformed/truncated BED line; skip
                        continue
                    chrom = tokens[1]
                    chrom_start = int(tokens[2])
                    chrom_end = int(tokens[3])
                    block_sizes = tokens[10].split(',')
                    block_starts = tokens[11].split(',')
                    # Handle trailing commas
                    try:
                        int(block_sizes[-1])
                    except ValueError:
                        block_sizes = block_sizes[:-1]
                    try:
                        int(block_starts[-1])
                    except ValueError:
                        block_starts = block_starts[:-1]
                    block_count = len(block_sizes)
                    assert block_count == len(block_starts)
                    # Each exon is (chrom, start, end); starts are offsets
                    # from chrom_start per the BED12 spec
                    exons = [(chrom,
                                chrom_start + int(block_starts[i]),
                                chrom_start + int(block_starts[i])
                                + int(block_sizes[i]))
                                for i in xrange(block_count)]
                    true_maps.append(exons)
                    basewise_relevant += sum([int(block_size)
                                                for block_size
                                                in block_sizes])
                    read_relevant += 1
                elif tokens[0] == '1':
                    # Aligner (SAM) record
                    flag = int(tokens[1])
                    if flag & 256 or flag & 4:
                        '''Secondary alignment or unmapped and thus not
                        retrieved; ignore'''
                        continue
                    cigar, pos, seq = tokens[5], int(tokens[3]), tokens[9]
                    (dummy_md, mapped, unmapped, clip_count, read_length) \
                        = dummy_md_and_mapped_offsets(
                                            cigar,
                                            clip_threshold=clip_threshold
                                        )
                    if unmapped:
                        # Too much clipping
                        continue
                    basewise_retrieved += read_length - clip_count
                    read_retrieved += 1
                    if not true_maps:
                        # Only possible when spliced truth lines were
                        # filtered out above
                        assert ignore_spliced_reads
                        continue
                    # Try both /1 and /2; choose the best basewise result
                    intersected_base_count = 0
                    for true_map in true_maps:
                        if tokens[2] != true_map[0][0]:
                            '''chr is wrong, but this is still counted as a
                            retrieval above'''
                            continue
                        base_counter, base_truths = 0, set()
                        '''Each tuple in base_truths is
                        (index of base in read, mapped location)'''
                        for block in true_map:
                            # j + 1 converts 0-based BED coords to the
                            # 1-based coords used on the prediction side
                            base_truths.update([(base_counter + i, j + 1)
                                                    for i, j in enumerate(
                                                        xrange(
                                                            block[1],
                                                            block[2]
                                                        ))])
                            base_counter += block[2] - block[1]
                        base_predictions = set()
                        if unmapped:
                            # Too much clipping
                            continue
                        _, _, _, exons = indels_junctions_and_exons(
                                                    cigar,
                                                    dummy_md, pos, seq,
                                                    drop_deletions=True
                                                )
                        mapped_index = 0
                        for exon in exons:
                            # mapped[] translates exon offsets back to
                            # positions of bases within the read
                            base_predictions.update(
                                        [(mapped[mapped_index + i], j)
                                            for i, j in enumerate(
                                                xrange(
                                                    exon[0], exon[1]
                                                ))])
                            mapped_index += exon[1] - exon[0]
                        intersected_base_count = max(
                                intersected_base_count,
                                len(base_predictions.intersection(
                                        base_truths
                                    )))
                    basewise_intersection += intersected_base_count
                    if intersected_base_count \
                        >= read_length * base_threshold:
                        read_intersection += 1
                    elif dump_incorrect:
                        # Incorrect alignment; write to stderr
                        print >>sys.stderr, '\t'.join(
                                ['.'.join(line) for line in saved]
                            )
                else:
                    # Tag field must be '0' (truth) or '1' (SAM)
                    raise RuntimeError(
                            'Invalid intermediate line.'
                        )
    return (basewise_retrieved, basewise_relevant, basewise_intersection,
            read_retrieved, read_relevant, read_intersection)