def get_median_read_coverage(output_path, num_threads, overwrite_files):
    """
    Given the read alignments, use samtools stats to return an approximate
    median coverage value.

    The stats text for ``output_path + "c_reads_against_query.s.bam"`` is
    written to a sibling ``.stats`` file (or an existing one is reused), and
    the median is then estimated from the "COV" histogram rows of that file.

    :param output_path: prefix prepended verbatim (no path separator is
        added) to the BAM and stats file names.
    :param num_threads: thread count handed to samtools stats through "-@".
    :param overwrite_files: when falsy, a pre-existing stats file is kept
        and parsed as-is instead of being regenerated.
    :return: zero-based position, among the parsed COV rows, of the first row
        at which the cumulative count reaches half of the total count.
    :raises ValueError: if no row reaches that midpoint, e.g. when the stats
        file contains no COV rows.
    """
    log("Calculating global read coverage")
    bam_path = output_path + "c_reads_against_query.s.bam"
    stats_path = bam_path + ".stats"

    # Decide once whether the stats file has to be (re)written, so the
    # generation code is not duplicated across branches.
    regenerate = True
    if os.path.isfile(stats_path):
        if overwrite_files:
            log("overwriting pre-existing file: " + stats_path)
        else:
            log("retaining pre-existing file: " + stats_path)
            regenerate = False

    if regenerate:
        st = pysam.stats("-@", str(num_threads), bam_path)
        with open(stats_path, "w") as f:
            f.write(st)

    # Get the coverage histogram: the 4th tab-separated field of each COV row.
    # Counts stay as arbitrary-precision Python ints; a fixed-width int32
    # array could overflow when a bin (or the running total) exceeds 2**31-1.
    with open(stats_path) as f:
        covs = [int(line.split("\t")[3]) for line in f if line.startswith("COV")]

    # Remove the last value, which is expected to be the catch-all bin for
    # coverages > 1k.
    covs = covs[:-1]

    # Walk the histogram until the cumulative count reaches the midpoint.
    # NOTE(review): the value returned is the row index, not the depth stored
    # in the row itself - confirm this matches the stats layout in use.
    mid = sum(covs) // 2
    cs = 0
    for i, count in enumerate(covs):
        cs += count
        if cs >= mid:
            return i

    raise ValueError("Unable to calculate read coverage. Check SAM/BAM files and stats file.")
def insert_size(this, bamFile, threads=1):
    """
    Return the insert size average and standard deviation of a BAM file.

    Runs samtools stats (through pysam) on ``bamFile`` and pulls the two
    values out of the "SN" summary lines of its text output.

    :param this: not used by this function.
    :param bamFile: BAM file handed to samtools stats.
    :param threads: thread count passed to samtools stats through "-@".
    :return: tuple ``(average, standard_deviation)`` as floats.
    :raises AttributeError: if either summary line is absent from the output
        (``re.search`` returns None in that case).
    :raises ValueError: if a matched value cannot be converted to float.
    """
    result_str = pysam.stats("-@", str(threads), bamFile)

    # Raw strings keep "\s" / "\S" as regex escapes; in a normal string
    # literal they are invalid escape sequences that newer Pythons warn about.
    mean_size = float(
        re.search(r'SN\s+insert size average:\s+(\S+)', result_str).group(1))
    insert_std = float(
        re.search(r'SN\s+insert size standard deviation:\s+(\S+)', result_str).group(1))

    return (mean_size, insert_std)
def samtools_stats(x, threads=8):
    """Collect the "SN" (Summary Numbers) section of samtools stats as a dict.

    Rough shell equivalent for a single field:
        samtools stats {in.bam} | grep ^SN | cut -f 2- | grep 'insert size average'

    Parameters
    ----------
    x
        Input passed to samtools stats (through pysam).
    threads
        Thread count passed to samtools stats through "-@". Defaults to 8,
        the value that was previously hard-coded.

    Returns
    -------
    dict
        Maps each summary label, with surrounding ":" characters stripped
        (e.g. "insert size average"), to its value. Values are kept as the
        strings found in the output; no numeric conversion is done.

    Raises
    ------
    ValueError
        If a line starting with "SN" has fewer than three tab-separated
        fields.
    """
    stat = pysam.stats('-@', str(threads), x)
    d = {}
    for line in stat.split('\n'):
        # Only the Summary Numbers rows are of interest.
        if not line.startswith('SN'):
            continue
        # Fields: "SN" tag, label, value; anything after the third field
        # is ignored.
        _, group, num = line.strip().split('\t')[0:3]
        d[group.strip(':')] = num
    return d