def read_sam(sam_path, mode='', min_quality=0):
    """Stream alignments from ``samtools view`` as lists of SAM columns.

    ``mode`` is a string of one-character switches that translate into the
    ``-f`` (required) and ``-F`` (excluded) SAM flag masks:

        a   aligned reads
        A   reads whose mate is aligned too
        C   concordant read pairs
        u   unaligned reads
        1   first mates
        2   second mates
        +   plus strand only
        -   minus strand only
        D   keep flagged duplicates (they are excluded otherwise)

    Secondary and supplementary alignments are always excluded. When
    ``min_quality`` is positive it is passed to samtools as ``-q``.

    This is a generator; each item is one output line split on tabs.
    """
    # (switch, bits added to the required mask, bits added to the excluded mask)
    switches = (
        ('a', 0x0, 0x4),
        ('A', 0x1, 0xc),
        ('C', 0x3, 0xc),
        ('u', 0x4, 0x0),
        ('1', 0x40, 0x0),
        ('2', 0x80, 0x0),
        ('+', 0x0, 0x10),
        ('-', 0x10, 0x0),
    )

    required = 0x0
    excluded = 0x900  # Ignore secondary and supplementary alignments
    for switch, on_bits, off_bits in switches:
        if switch in mode:
            required |= on_bits
            excluded |= off_bits
    if 'D' not in mode:
        excluded |= 0x400  # Flagged duplicates

    option_parts = ['-f 0x%x -F 0x%x ' % (required, excluded)]
    if min_quality > 0:
        option_parts.append('-q%d ' % min_quality)

    command = 'samtools view %s %s' % (''.join(option_parts), sam_path)
    for record in shell_stdout(command):
        yield record.split('\t')
def read_sam(sam_path, mode="", min_quality=0):
    """Yield the tab-separated columns of each ``samtools view`` record.

    Every character of ``mode`` contributes bits to the flag masks handed
    to samtools (``-f`` = bits that must be set, ``-F`` = bits that must be
    clear):

        a   aligned reads
        A   reads whose mate is aligned too
        C   concordant read pairs
        u   unaligned reads
        1   first mates
        2   second mates
        +   plus strand only
        -   minus strand only
        D   keep flagged duplicates (they are excluded otherwise)

    Secondary and supplementary alignments are always excluded, and a
    positive ``min_quality`` is forwarded as ``-q``. Lines coming back from
    the subprocess are decoded as UTF-8 before being split.
    """
    # switch -> (bits for the required mask, bits for the excluded mask)
    mask_bits = {
        "a": (0x0, 0x4),
        "A": (0x1, 0xC),
        "C": (0x3, 0xC),
        "u": (0x4, 0x0),
        "1": (0x40, 0x0),
        "2": (0x80, 0x0),
        "+": (0x0, 0x10),
        "-": (0x10, 0x0),
    }

    must_have = 0x0
    must_lack = 0x900  # Ignore secondary and supplementary alignments
    for switch, (on_bits, off_bits) in mask_bits.items():
        if switch in mode:
            must_have |= on_bits
            must_lack |= off_bits
    if "D" not in mode:
        must_lack |= 0x400  # Flagged duplicates

    view_options = "-f 0x%x -F 0x%x " % (must_have, must_lack)
    if min_quality > 0:
        view_options = view_options + "-q%d " % min_quality

    records = shell_stdout("samtools view %s %s" % (view_options, sam_path))
    for raw in records:
        yield raw.decode("utf8").split("\t")
def coverage_cds(bam_path, gtf_path):
    """Print a histogram of per-base coverage restricted to CDS positions.

    A per-chromosome mask of coding bases is built from the ``CDS`` features
    of the GTF file at ``gtf_path``; chromosome sizes come from the header
    of ``bam_path``. ``bedtools genomecov -d -split`` is then streamed and,
    for every base inside a CDS, the matching coverage bin is incremented.
    The histogram has 200 bins; the last bin collects every depth of 199 or
    more.
    """
    chr_sizes = ref_sequence_sizes(bam_path)

    info('Constructing a map of coding regions...')
    # One byte per reference base (non-zero = inside a CDS). A bytearray is
    # far smaller than a list of Python bools for genome-sized references.
    coding = {}
    for chrom, size in chr_sizes.items():
        coding[chrom] = bytearray(size)

    for line in zopen(gtf_path):
        if line.startswith('#'):
            continue
        cols = line.split('\t')
        if len(cols) < 5:
            continue  # Not a complete feature line
        if cols[2] != 'CDS':
            continue
        if len(cols[0]) > 5:
            continue  # Ignore chromosomes other than chrXX
        if cols[0] not in coding:
            continue
        mask = coding[cols[0]]
        # GTF coordinates are 1-based and inclusive; clamp them to the
        # reference so the slice assignment can never grow the mask.
        start = max(int(cols[3]) - 1, 0)
        end = min(int(cols[4]), len(mask))
        if end > start:
            # The original assigned the scalar True to the slice, which
            # raises TypeError; a slice needs an iterable of equal length.
            mask[start:end] = b'\x01' * (end - start)

    info('Calculating a coverage histogram...')
    coverage_hist = [0] * 200
    top_bin = len(coverage_hist) - 1
    chrom = ''
    cds = None
    pos = 0
    for line in shell_stdout('bedtools genomecov -d -split -ibam %s' % bam_path):
        cols = line.split('\t')
        if cols[0] != chrom:
            chrom = cols[0]
            cds = coding[chrom]
            # Start one before the first reported (1-based) position so the
            # increment below lands on its 0-based index.
            pos = int(cols[1]) - 2
            info('%s...' % chrom)
        pos += 1
        if cds[pos]:
            coverage_hist[min(int(cols[2]), top_bin)] += 1

    print('Coverage histogram:')
    print('===================')
    for cov, count in enumerate(coverage_hist):
        print('%d: %d' % (cov, count))
def ref_sequence_sizes(sam_path):
    """Return ``{reference name: length}`` parsed from a SAM/BAM header.

    Runs ``samtools view -H`` on ``sam_path`` and reads every ``@SQ`` line
    whose ``SN`` tag is immediately followed by its ``LN`` tag. Header lines
    are decoded as UTF-8 before matching.
    """
    # The name is matched as "anything up to the next tab" rather than \w+,
    # so contigs such as "GL000207.1" or "HLA-A*01:01" are not silently
    # dropped. Raw string: \t, \r, \n and \d are interpreted by the regex.
    sq_line = re.compile(r"@SQ\tSN:([^\t\r\n]+)\tLN:(\d+)")

    chr_sizes = {}
    for line in shell_stdout("samtools view -H %s" % sam_path):
        m = sq_line.match(line.decode("utf8"))
        if m:
            chr_sizes[m.group(1)] = int(m.group(2))
    return chr_sizes
def ref_sequence_sizes(sam_path):
    """Return ``{reference name: length}`` parsed from a SAM/BAM header.

    Runs ``samtools view -H`` on ``sam_path`` and reads every ``@SQ`` line
    whose ``SN`` tag is immediately followed by its ``LN`` tag.
    """
    # The name is matched as "anything up to the next tab" rather than \w+,
    # so contigs such as 'GL000207.1' or 'HLA-A*01:01' are not silently
    # dropped. Raw string: \t, \r, \n and \d are interpreted by the regex.
    sq_line = re.compile(r'@SQ\tSN:([^\t\r\n]+)\tLN:(\d+)')

    chr_sizes = {}
    for line in shell_stdout('samtools view -H %s' % sam_path):
        m = sq_line.match(line)
        if m:
            chr_sizes[m.group(1)] = int(m.group(2))
    return chr_sizes
def sam_reads_raw(bam_path, out_prefix):
    """Dump raw read sequences from a BAM file into three gzipped files.

    The output of ``samtools bam2fq`` is consumed four lines at a time
    (header, sequence, separator, qualities); only the sequence is kept.

    * Reads whose name ends in ``/1`` or ``/2`` are paired up by name and
      written to ``<out_prefix>_1.reads.gz`` / ``<out_prefix>_2.reads.gz``
      on matching lines.
    * Reads without such a suffix, and mates whose partner never showed up
      (orphans), go to ``<out_prefix>.reads.gz``.
    """
    out_1 = zopen('%s_1.reads.gz' % out_prefix, 'w')
    out_2 = zopen('%s_2.reads.gz' % out_prefix, 'w')
    out = zopen('%s.reads.gz' % out_prefix, 'w')
    try:
        # Sequences still waiting for their mate, keyed by read name
        # without the /1 or /2 suffix.
        reads_1 = {}
        reads_2 = {}

        # The "samtools bam2fq" command does not output supplementary or
        # secondary alignments. Each read only has one primary alignment.
        options = '-n' if has_mate_suffixes(bam_path) else ''
        bam2fq = shell_stdout('samtools bam2fq %s %s' % (options, bam_path))
        for line in bam2fq:
            # startswith() also copes with an empty line, where line[0]
            # would raise IndexError.
            if not line.startswith('@'):
                error('Invalid bam2fq output.')
            line = line[:-1]
            if line.endswith('/1'):
                segname = line[1:-2]
                mate = reads_2.pop(segname, None)
                # Compare against None: a stored empty sequence is falsy
                # but still a mate that must be paired.
                if mate is not None:
                    out_1.write(next(bam2fq))
                    out_2.write('%s\n' % mate)
                else:
                    reads_1[segname] = next(bam2fq)[:-1]
            elif line.endswith('/2'):
                segname = line[1:-2]
                mate = reads_1.pop(segname, None)
                if mate is not None:
                    out_1.write('%s\n' % mate)
                    out_2.write(next(bam2fq))
                else:
                    reads_2[segname] = next(bam2fq)[:-1]
            else:
                out.write('%s\n' % next(bam2fq)[:-1])

            # Skip per-base qualities. They can start with '@'.
            next(bam2fq)
            next(bam2fq)

        # list(d)[:5] / d.values() work on both Python 2 and Python 3,
        # unlike d.keys()[:5] / d.itervalues().
        info('Found %d orphan first mates.' % len(reads_1))
        for read_id in list(reads_1)[:5]:
            info('- Example: %s' % read_id)
        info('Found %d orphan second mates.' % len(reads_2))
        for read_id in list(reads_2)[:5]:
            info('- Example: %s' % read_id)

        for read in reads_1.values():
            out.write('%s\n' % read)
        for read in reads_2.values():
            out.write('%s\n' % read)
    finally:
        # Close the outputs even when parsing fails part-way through.
        out_1.close()
        out_2.close()
        out.close()
def sam_statistics(bam_paths):
    """Print a tab-separated table of alignment statistics, one BAM per row.

    Counts are taken from ``samtools flagstat`` (QC-passed plus QC-failed
    reads are added together); the mitochondrial count comes from
    ``samtools view -c <bam> chrM``. A count that could not be found in the
    tool output is reported as -1. Percentages are relative to the total
    read count and are shown as 0.0 when that total is zero.
    """
    # Raw strings: \d and \+ are regex escapes, not string escapes.
    flagstat_patterns = (
        ('total', re.compile(r'(\d+) \+ (\d+) in total')),
        ('duplicates', re.compile(r'(\d+) \+ (\d+) duplicates')),
        ('aligned', re.compile(r'(\d+) \+ (\d+) mapped')),
        ('concordant', re.compile(r'(\d+) \+ (\d+) properly paired')),
        ('aligned_mate',
         re.compile(r'(\d+) \+ (\d+) with itself and mate mapped')),
    )
    bam_suffix = re.compile(r'\.bam$')

    print(
        'SAMPLE\tTOTAL READS\tALIGNED READS\tALIGNED READS WITH ALIGNED MATE\tALIGNED READS WITH CONCORDANT MATE\tMITOCHONDRIAL\tDUPLICATES'
    )

    for bam_path in bam_paths:
        counts = dict((key, -1) for key, _ in flagstat_patterns)
        for line in shell_stdout('samtools flagstat %s' % bam_path):
            for key, pattern in flagstat_patterns:
                m = pattern.search(line)
                if m:
                    counts[key] = int(m.group(1)) + int(m.group(2))

        # Count the number of reads aligned to mitochondrial DNA
        mitochondrial = -1
        for line in shell_stdout('samtools view -c %s chrM' % bam_path):
            mitochondrial = int(line)
            break

        total = counts['total']

        def percent(count):
            # An empty BAM has zero reads in total; avoid dividing by it.
            return float(count) / total * 100 if total else 0.0

        print(
            '%s\t%d\t%d (%.1f%%)\t%d (%.1f%%)\t%d (%.1f%%)\t%d (%.1f%%)\t%d'
            % (bam_suffix.sub('', bam_path), total,
               counts['aligned'], percent(counts['aligned']),
               counts['aligned_mate'], percent(counts['aligned_mate']),
               counts['concordant'], percent(counts['concordant']),
               mitochondrial, percent(mitochondrial),
               counts['duplicates']))
def sam_merge_counts(bed_path, count_paths):
    """Print a BED file side by side with per-sample count columns.

    A header line is written first: ``CHROMOSOME``, ``START``, ``END``,
    ``FEATURE`` when the BED file has at least four columns, blank names
    for any further BED columns, then one column per count file (its path
    with ``.tsv`` removed). The body is the output of ``paste`` applied to
    the BED file and all count files.
    """
    samples = [p.replace('.tsv', '') for p in count_paths]

    # Only the first line is needed to learn the column count; the context
    # manager makes sure the handle is closed again (it used to leak).
    with open(bed_path) as bed_file:
        cols = next(bed_file).rstrip('\n').split('\t')

    header = ['CHROMOSOME', 'START', 'END']
    if len(cols) >= 4:
        header.append('FEATURE')
    # Pad with unnamed columns up to the BED width (no-op if already wider).
    header.extend([''] * (len(cols) - len(header)))
    header += samples
    print('\t'.join(header))

    for line in shell_stdout('paste %s %s' % (bed_path, ' '.join(count_paths))):
        sys.stdout.write(line)
def simple_pileup(bam_paths, genome_path, kgenomes_path, min_mapq=10, min_alt_alleles=3, region=None):
    """Run ``samtools mpileup`` piped through the bundled ``spileup`` helper.

    ``kgenomes_path`` is handed to mpileup's ``-l`` option and
    ``genome_path`` to ``-f``. An optional ``region`` is added as ``-l``
    when it ends in ``.bed`` and as ``-r`` otherwise. ``min_alt_alleles``
    and ``min_mapq`` are passed on to ``spileup``, which is looked up in
    the ``compiled`` directory next to this file. Returns whatever
    ``shell_stdout`` returns for the pipeline.
    """
    region_option = ''
    if region:
        selector = '-l' if region.endswith('.bed') else '-r'
        region_option = '%s %s' % (selector, region)

    helper_dir = os.path.dirname(os.path.realpath(__file__)) + '/compiled'

    # samtools mpileup will automatically ignore alignments flagged as
    # duplicates
    template = ('samtools mpileup -d 100000 -A -x -R -sB %s -q0 -l %s -f %s %s'
                ' | %s/spileup %d %d')
    pipeline = template % (
        region_option,
        kgenomes_path,
        genome_path,
        ' '.join(bam_paths),
        helper_dir,
        min_alt_alleles,
        min_mapq,
    )
    return shell_stdout(pipeline)
def _strip_remote_prefix(dst, userpass, host):
    """Remove a leading ``user:password@`` and host name from ``dst``."""
    if dst.startswith(userpass):
        dst = dst[len(userpass):]
    if dst.startswith(host):
        dst = dst[len(host):]
    return dst


def lftp_mirror(rule, dry_run=False):
    """Mirror ``rule.src_dir`` to ``rule.dst_dir`` on an SFTP host via lftp.

    A temporary ``.lftp_script`` is written to the current directory and
    run with ``lftp -f``. With ``dry_run`` set, lftp is asked for a dry run
    and its planned actions are printed as ``ADD``/``UPDATE``/``DELETE``
    lines with credentials and host stripped from the paths; ``chmod`` and
    ``mkdir`` lines are dropped and anything unrecognised is echoed as-is.

    ``rule`` must provide ``username``, ``password``, ``dst_host``,
    ``src_dir`` and ``dst_dir``.
    """
    script_path = '.lftp_script'
    try:
        with open(script_path, 'w') as cmds:
            cmds.write('open -u %s,%s sftp://%s\n' % (
                rule.username, rule.password, rule.dst_host))
            cmds.write('mirror -P3 -Rae %s %s %s\n' % (
                '--dry-run' if dry_run else '-v', rule.src_dir, rule.dst_dir))

        if not dry_run:
            shell('lftp -f .lftp_script')
            return

        userpass = rule.username + ':' + rule.password + '@'
        host = rule.dst_host
        for line in shell_stdout('lftp -f .lftp_script'):
            if line.startswith('chmod'):
                continue
            if line.startswith('mkdir'):
                continue

            m = re.match('get -O sftp://(.+) file:/.+/(.+)', line)
            if m:
                dst = _strip_remote_prefix(m.group(1), userpass, host)
                print('ADD %s/%s' % (dst, m.group(2)))
                continue

            m = re.match('get -e -O sftp://(.+) file:/.+/(.+)', line)
            if m:
                dst = _strip_remote_prefix(m.group(1), userpass, host)
                print('UPDATE %s/%s' % (dst, m.group(2)))
                continue

            m = re.match('rm .*sftp://(.+)', line)
            if m:
                dst = _strip_remote_prefix(m.group(1), userpass, host)
                print('DELETE %s' % dst)
                continue

            sys.stdout.write(line)
    finally:
        # The script holds the password in plain text, so it must not be
        # left behind when lftp (or the parsing above) fails.
        if os.path.exists(script_path):
            os.remove(script_path)
def simple_pileup(bam_paths, genome_path, min_mapq=10, min_alt_alleles=3, region=None):
    """Run ``samtools mpileup`` piped through the bundled ``spileup`` helper.

    ``genome_path`` is handed to mpileup's ``-f`` option. An optional
    ``region`` is added as ``-l`` when it ends in ``.bed`` and as ``-r``
    otherwise. ``min_alt_alleles`` and ``min_mapq`` are passed on to
    ``spileup``, which is looked up in the ``compiled`` directory next to
    this file. Returns whatever ``shell_stdout`` returns for the pipeline.
    """
    region_option = ''
    if region:
        selector = '-l' if region.endswith('.bed') else '-r'
        region_option = '%s %s' % (selector, region)

    helper_dir = os.path.dirname(os.path.realpath(__file__)) + '/compiled'

    # samtools mpileup will automatically ignore alignments flagged as
    # duplicates
    template = ('samtools mpileup -d 1000000 -A -x -R -sB %s -q0 -f %s %s'
                ' | %s/spileup %d %d')
    pipeline = template % (
        region_option,
        genome_path,
        ' '.join(bam_paths),
        helper_dir,
        min_alt_alleles,
        min_mapq,
    )
    return shell_stdout(pipeline)
def sam_unaligned_reads(bam_path):
    """Write the unaligned reads of a BAM file to standard output.

    Reads flagged as unaligned (and not secondary/supplementary) are piped
    through ``samtools bam2fq``. If the BAM file has base qualities the
    FASTQ output is passed through untouched; otherwise each four-line
    FASTQ record is reduced to a FASTA record (``>name`` plus sequence).
    """
    # The "samtools bam2fq" command does not output supplementary or
    # secondary alignments. Each read has max 1 primary alignment.
    options = '-n' if has_mate_suffixes(bam_path) else ''
    command = 'samtools view -u -f 0x4 -F 0x900 %s | samtools bam2fq %s -' % (
        bam_path, options)

    if has_base_qualities(bam_path):
        shell(command)
        return

    fastq = shell_stdout(command)
    for header in fastq:
        if header[0] != '@':
            error('Invalid bam2fq output.')
        sequence = next(fastq)
        sys.stdout.write('>')
        sys.stdout.write(header[1:])
        sys.stdout.write(sequence)

        # Skip per-base qualities. They can start with '@'.
        next(fastq)
        next(fastq)
def detect_discordant_pairs(sam_path, out_prefix, max_frag_len, min_mapq, orientation):
    """Write discordant mate pairs to ``<out_prefix>.discordant_pairs.tsv.gz``.

    Alignments come from ``sam discordant pairs`` sorted by read name, so
    the two mates of a pair arrive on consecutive lines. Spliced or clipped
    alignments and pairs touching ``chrM`` are discarded. Every remaining
    pair is reoriented so that the upstream mate comes first and both mates
    are expressed in forward-forward orientation, according to
    ``orientation`` (``'fr'``, ``'rf'`` or ``'ff'``).

    Each output line is
    ``chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, -`` (tab-separated);
    positions are 1-based read starts and the trailing ``-`` marks the
    record as a mate pair.
    """
    out = zopen(out_prefix + '.discordant_pairs.tsv.gz', 'w')
    N = 0

    sort_tmp_dir = os.path.dirname(out_prefix)
    if not sort_tmp_dir:
        sort_tmp_dir = './'

    opposite = {'+': '-', '-': '+'}

    try:
        # Go through all the first mates and look for discordant pairs.
        info('Searching for discordant read pairs...')
        prev = ['']
        for line in shell_stdout(
                'sam discordant pairs --min-mapq=%d %s %d | sort -k1,1 -T %s' %
                (min_mapq, sam_path, max_frag_len, sort_tmp_dir)):
            al = line.split('\t')
            # The read sequence (tenth column) is used below, so at least
            # ten columns are required; the old "< 9" guard let a
            # nine-column line through to an IndexError.
            if len(al) < 10:
                continue

            # Discard spliced and clipped reads.
            # FIXME: Add support for spliced RNA-seq reads.
            if 'N' in al[5] or 'S' in al[5]:
                continue

            if al[0].endswith('/1') or al[0].endswith('/2'):
                al[0] = al[0][:-2]  # Remove /1 or /2 suffix

            # First alignment seen for this read name: remember it and wait
            # for its mate on a following line.
            if al[0] != prev[0]:
                prev = al
                continue

            flags = int(al[1])
            chrom = al[2]
            mchrom = prev[2]
            strand = '-' if flags & 0x10 else '+'
            mstrand = '-' if flags & 0x20 else '+'
            pos = int(al[3])
            mpos = int(prev[3])
            rlen = len(al[9])
            mrlen = len(prev[9])

            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom
            if not mchrom.startswith('chr'):
                mchrom = 'chr' + mchrom

            if chrom == 'chrM' or mchrom == 'chrM':
                continue  # Discard mitochondrial

            if orientation in ('fr', 'rf', 'ff'):
                # Reorient pairs so that the first mate is always upstream.
                if chrom > mchrom or (chrom == mchrom and pos > mpos):
                    chrom, mchrom = mchrom, chrom
                    pos, mpos = mpos, pos
                    rlen, mrlen = mrlen, rlen
                    if orientation == 'ff':
                        # If mates are swapped, both mates must be reversed.
                        strand, mstrand = opposite[mstrand], opposite[strand]
                    else:
                        strand, mstrand = mstrand, strand

                if orientation == 'fr':
                    # Convert to forward-forward orientation (flip second
                    # mate).
                    mstrand = opposite[mstrand]
                elif orientation == 'rf':
                    # Convert to forward-forward orientation (flip first
                    # mate).
                    strand = opposite[strand]
            else:
                error('Unsupported read orientation detected.')

            # Make positions represent read starts.
            if strand == '-':
                pos += rlen - 1
            if mstrand == '-':
                mpos += mrlen - 1

            out.write('%s\t%s\t%d\t%s\t%s\t%d\t-\n' %
                      (chrom, strand, pos, mchrom, mstrand, mpos))
            N += 1
    finally:
        # Make sure the gzip stream is finalised even if parsing fails.
        out.close()

    info('Found %d discordant mate pairs.' % N)
def detect_specific(bam_path, donors_path, acceptors_path, genome_path, out_prefix, all_reads):
    """Count reads supporting junctions between given donor and acceptor exons.

    For every donor/acceptor combination a junction sequence is built from
    ``read length - 10`` reference bases on each side. The junctions are
    indexed with ``bowtie-build`` and reads (all reads when ``all_reads`` is
    true, otherwise only the unaligned ones) are aligned against them with
    bowtie. Junctions with at least one aligned read are written to
    ``<out_prefix>.tsv``. The temporary index and FASTA files are removed
    afterwards.
    """
    read_len = sam.read_length(bam_path)
    info('Using read length %d bp...' % read_len)

    flank_len = read_len - 10

    chromosomes = read_fasta(genome_path)

    # Donors contribute the sequence just upstream of the breakpoint.
    donor_exons = regions_from_bed(donors_path)
    donors = []
    for ex in donor_exons:
        chrom = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chrom]
        if ex[1] == '+':
            donors.append(
                (chrom, '+', ex[3], chr_seq[ex[3] - flank_len:ex[3]]))
        elif ex[1] == '-':
            donors.append(
                (chrom, '-', ex[2],
                 revcomplement(chr_seq[ex[2] - 1:ex[2] - 1 + flank_len])))

    # Acceptors contribute the sequence just downstream of the breakpoint.
    acceptor_exons = regions_from_bed(acceptors_path)
    acceptors = []
    for ex in acceptor_exons:
        chrom = ex[0] if ex[0].startswith('chr') else 'chr' + ex[0]
        chr_seq = chromosomes[chrom]
        if ex[1] == '+':
            acceptors.append(
                (chrom, '+', ex[2],
                 chr_seq[ex[2] - 1:ex[2] - 1 + flank_len]))
        elif ex[1] == '-':
            acceptors.append(
                (chrom, '-', ex[3],
                 revcomplement(chr_seq[ex[3] - flank_len:ex[3]])))

    del chromosomes  # Release 3 GB of memory
    gc.collect()

    # Remove duplicate acceptors and donors.
    acceptors = list(set(acceptors))
    donors = list(set(donors))

    # Calculate junction sequences. The two breakpoint labels are kept next
    # to the combined name because splitting the name on '_' afterwards
    # goes wrong for contig names that contain underscores themselves.
    junctions = {}
    labels = {}
    for left in donors:
        left_label = '%s:%s:%d' % left[:3]
        for right in acceptors:
            right_label = '%s:%s:%d' % right[:3]
            name = left_label + '_' + right_label
            junctions[name] = Object(sequence=left[3] + right[3], reads=[])
            labels[name] = (left_label, right_label)
    info('Generated %d junctions.' % len(junctions))

    # Build Bowtie index
    info('Constructing junction FASTA file...')
    index_fasta_path = out_prefix + '_ref.fa'
    with open(index_fasta_path, 'w') as index:
        for name, junction in junctions.items():
            index.write('>%s\n%s\n' % (name, junction.sequence))
    info('Constructing Bowtie index...')
    shell('bowtie-build -q %s %s_index' % (index_fasta_path, out_prefix))

    # Align reads against junctions and tally junction read counts.
    if all_reads:
        info('Aligning all reads against index...')
        reads_command = 'sam reads %s' % bam_path
    else:
        info('Aligning unaligned reads against index...')
        reads_command = 'sam unaligned reads %s' % bam_path

    # NOTE(review): "<(...)" is process substitution, so this presumably
    # relies on shell_stdout running the command through bash - confirm.
    for line in shell_stdout('bowtie -f -v1 -B1 %s_index <(%s)' %
                             (out_prefix, reads_command)):
        cols = line.rstrip().split('\t')
        junctions[cols[2]].reads.append(cols[4])

    shell('rm %s_index.* %s_ref.fa' % (out_prefix, out_prefix))

    with open(out_prefix + '.tsv', 'w') as out_file:
        out_file.write(
            '5\' breakpoint\t3\' breakpoint\tNum reads\tSequences\n')
        for name, j in junctions.items():
            if not j.reads:
                continue
            left_label, right_label = labels[name]
            # The "Sequences" column is intentionally left empty, as in
            # the original output.
            out_file.write(
                '%s\t%s\t%d\t' % (left_label, right_label, len(j.reads)))
            out_file.write('\n')
def detect_discordant_reads(sam_path, genome_path, out_prefix, anchor_len):
    """Find reads whose two end anchors align discordantly.

    Unaligned reads are split into ``anchor_len`` bp anchors (by the
    external ``fasta split interleaved`` tool) and aligned with bowtie, so
    the two anchors of a read arrive on consecutive lines. For every anchor
    pair that is far enough apart, the reference flanks are extracted, the
    breakpoint position inside the read that minimises mismatches is
    located, and a record is written to
    ``<out_prefix>.discordant_reads.tsv.gz``:

    ``chr_1, strand_1, pos_1, chr_2, strand_2, pos_2, sequence``

    Positions are 1-based read starts. In ``sequence`` mismatching bases
    are lower-cased and ``|`` marks the breakpoint.
    """
    out = zopen(out_prefix + '.discordant_reads.tsv.gz', 'w')
    N = 0

    # Anchor pairs that match the opposite flank this well are dropped.
    max_homology = 0.7

    try:
        info('Splitting unaligned reads into %d bp anchors and aligning against '
             'the genome...' % anchor_len)

        # IMPORTANT: Only one thread can be used, otherwise alignment order
        # is not guaranteed and the loop below will fail.
        anchor_alignments = shell_stdout(
            'samtools fasta -f 0x4 %s | fasta split interleaved - %d | '
            'bowtie -f -p1 -v0 -m1 -B1 --suppress 5,6,7,8 %s -' %
            (sam_path, anchor_len, genome_path))

        chromosomes = read_flat_seq(genome_path)
        for chr_name in list(chromosomes.keys()):
            if not chr_name.startswith('chr'):
                chromosomes['chr' + chr_name] = chromosomes.pop(chr_name)

        prev = ['']
        for line in anchor_alignments:
            al = line.split('\t')
            if len(al) < 4:
                continue  # Not a complete alignment line
            # Slicing instead of indexing: a one-character name no longer
            # raises IndexError. Strips suffixes such as "/1" and "/2".
            if al[0][-2:-1] == '/':
                al[0] = al[0][:-2]

            # First anchor of a read: remember it and wait for the second.
            if al[0] != prev[0]:
                prev = al
                continue

            chrom = prev[2]
            mchrom = al[2]
            strand = prev[1]
            mstrand = al[1]
            pos = int(prev[3])
            mpos = int(al[3])
            # The read name carries the full read sequence after the first
            # underscore.
            seq = prev[0][prev[0].find('_') + 1:]
            full_len = len(seq)

            if not chrom.startswith('chr'):
                chrom = 'chr' + chrom
            if not mchrom.startswith('chr'):
                mchrom = 'chr' + mchrom

            # Ignore anchor pairs where the anchors are too close.
            if (chrom == mchrom and
                    abs(pos - mpos) < full_len - anchor_len + 10):
                continue

            # Ignore rearrangements involving mitochondrial DNA.
            if 'M' in chrom or 'M' in mchrom:
                continue

            # Reorient the pairs so the first anchor is always upstream.
            # If mates are swapped, both mates must be reverse-complemented.
            if chrom > mchrom or (chrom == mchrom and pos > mpos):
                chrom, mchrom = mchrom, chrom
                pos, mpos = mpos, pos
                strand, mstrand = '+' if mstrand == '-' else '-', \
                    '+' if strand == '-' else '-'
                seq = revcomplement(seq)

            # Extract the flanking sequences from the chromosome sequences.
            # The range calculations are a bit complex. It's easier to
            # understand them if you first add one to all indices to convert
            # to 1-based genomic coordinates ("pos" and "mpos" are 1-based).
            if strand == '+':
                left_grch = chromosomes[chrom][pos - 1:pos + full_len - 1]
            else:
                left_grch = revcomplement(
                    chromosomes[chrom][pos + anchor_len - full_len - 1:
                                       pos + anchor_len - 1])

            if mstrand == '+':
                right_grch = chromosomes[mchrom][
                    mpos + anchor_len - full_len - 1:mpos + anchor_len - 1]
            else:
                right_grch = revcomplement(
                    chromosomes[mchrom][mpos - 1:mpos + full_len - 1])

            # If the read is at the very edge of a chromosome, ignore it.
            if len(left_grch) < full_len or len(right_grch) < full_len:
                continue

            # Make sure that reference sequences are in uppercase
            left_grch = left_grch.upper()
            right_grch = right_grch.upper()

            # Check that the read sequence is not too homologous on either
            # side of the breakpoint.
            left_match = float(
                sum(seq[i] == left_grch[i]
                    for i in range(full_len - anchor_len, full_len))
            ) / anchor_len
            right_match = float(
                sum(seq[i] == right_grch[i] for i in range(anchor_len))
            ) / anchor_len
            if left_match >= max_homology or right_match >= max_homology:
                continue

            # Identify the breakpoint location that minimizes the number of
            # nucleotide mismatches between the read and the breakpoint
            # flanks.
            potential_breakpoints = range(anchor_len,
                                          full_len - anchor_len + 1)
            mismatches = [0] * len(potential_breakpoints)
            for k, br in enumerate(potential_breakpoints):
                grch_chimera = left_grch[:br] + right_grch[br:]
                mismatches[k] = sum(
                    seq[i] != grch_chimera[i] for i in range(full_len))

            least_mismatches = min(mismatches)

            # "br" represent the number of nucleotides in the read
            # before the breakpoint, counting from the 5' end of the read.
            # If there is microhomology, we pick the first breakpoint.
            br = potential_breakpoints[mismatches.index(least_mismatches)]

            # Now that we know the exact fusion breakpoint, we mark
            # mismatches with a lower case nucleotide and augment the read
            # sequence with a | symbol to denote the junction.
            grch_chimera = left_grch[:br] + right_grch[br:]
            seq = ''.join([
                nuc if grch_chimera[k] == nuc else nuc.lower()
                for k, nuc in enumerate(seq)
            ])
            seq = seq[:br] + '|' + seq[br:]

            # Make positions represent read starts.
            if strand == '-':
                pos += anchor_len - 1
            if mstrand == '-':
                mpos += anchor_len - 1

            out.write('%s\t%s\t%d\t%s\t%s\t%d\t%s\n' %
                      (chrom, strand, pos, mchrom, mstrand, mpos, seq))
            N += 1
    finally:
        # Make sure the gzip stream is finalised even if parsing fails.
        out.close()

    info('Found %d discordant anchor pairs.' % N)
info('Downloading %s...' % filename) shell('gtdownload -v -d %s -c ~/tools/genetorrent*/cghub_2016.key' % sample.analysis_data_uri) if __name__ == '__main__': args = docopt.docopt(__doc__) predicates = [ 'disease_abbr=' + args['<cancer>'].upper(), 'library_strategy=' + args['<library_type>'] ] if args['--genome']: predicates.append('refassem_short_name=' + args['--genome']) output = shell_stdout('cgquery "%s"' % '&'.join(predicates)) samples = cghub_parse(output) # Filter the samples if the user has provided a whitelist if args['--filename']: rx = args['--filename'] samples = [s for s in samples if re.search(rx, s.files[0])] # Filter the samples if the user has provided a filename whitelist if args['--filename-in']: whitelist = [line.strip() for line in open(args['--filename-in'])] samples = [s for s in samples if s.files[0] in whitelist] # Filter the samples if the user has provided a filename blacklist if args['--filename-not-in']: blacklist = [line.strip() for line in open(args['--filename-not-in'])]