def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath,
                          chr_len_fpath):
    """Compute the raw physical coverage file for a reference from a BAM file.

    The pipeline is: filter read pairs with sambamba, sort them by read name,
    convert to BEDPE with bamToBed, collapse each pair into one BED interval,
    sort the intervals and run ``bedtools genomecov -bga`` on them.
    Nothing is recomputed if the raw coverage file already exists and is
    non-empty.

    Args:
        output_dirpath: Directory that receives the intermediate files
            (``<ref_name>.filtered.bam``, ``.filtered.sorted.bam``, ``.bedpe``,
            ``.bed``, ``.sorted.bed``).
        ref_fpath: Not used by this function; kept for interface compatibility.
        ref_name: Prefix for the intermediate file names.
        bam_fpath: Input BAM file.
        log_path: File to which stdout of ``sambamba sort`` is appended.
        err_path: File to which stderr of every external command is appended.
        cov_fpath: Base path of the coverage file; the raw coverage is written
            to ``cov_fpath + '_raw'``.
        chr_len_fpath: File passed to ``bedtools genomecov`` via ``-g``.

    Returns:
        Path of the raw coverage file, or ``None`` if the bamToBed executable
        does not exist.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None

    def run_redirected(cmd, out_fpath, out_mode):
        # Open the redirect targets only for the duration of one command so the
        # handles are closed right away instead of leaking until garbage collection.
        with open(out_fpath, out_mode) as out_file:
            with open(err_path, 'a') as err_file:
                return qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file, logger=logger)

    raw_cov_fpath = cov_fpath + '_raw'
    if not is_non_empty_file(raw_cov_fpath):
        logger.info(' Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
        run_redirected([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                        '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                       bam_filtered_fpath, 'w')

        ## sort by read names
        bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
        run_redirected([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                        '-o', bam_filtered_sorted_fpath, '-n', bam_filtered_fpath],
                       log_path, 'a')

        bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
        run_redirected([bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                       bedpe_fpath, 'w')

        ## collapse every BEDPE record into a single interval built from its
        ## 1st, 2nd and 6th fields (chrom1, start1, end2 in BEDPE column order)
        raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
        with open(bedpe_fpath, 'r') as bedpe:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe:
                    fs = line.split()
                    bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))

        sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
        run_redirected([bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                       sorted_bed_fpath, 'w')
        run_redirected([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath,
                        '-g', chr_len_fpath],
                       raw_cov_fpath, 'w')
    return raw_cov_fpath
def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath):
    """Compute the physical coverage file for a reference from a BAM file.

    The pipeline is: filter read pairs with sambamba, sort them by read name,
    convert to BEDPE with bamToBed, collapse each pair into one BED interval,
    sort the intervals, run ``bedtools genomecov -bga`` on them and hand the
    raw result to ``proceed_cov_file``. Nothing is recomputed if ``cov_fpath``
    already exists and is non-empty.

    Args:
        output_dirpath: Directory that receives the intermediate files
            (``<ref_name>.filtered.bam``, ``.filtered.sorted.bam``, ``.bedpe``,
            ``.bed``, ``.sorted.bed``).
        ref_fpath: Reference file; passed to ``get_chr_len_fpath`` to obtain
            the file given to ``bedtools genomecov`` via ``-g``.
        ref_name: Prefix for the intermediate file names.
        bam_fpath: Input BAM file.
        log_path: File to which stdout of ``sambamba sort`` is appended.
        err_path: File to which stderr of every external command is appended.
        cov_fpath: Path of the final coverage file; the raw coverage is
            written to ``cov_fpath + '_raw'`` first.

    Returns:
        ``cov_fpath``, or ``None`` if the bamToBed executable does not exist.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None

    def run_redirected(cmd, out_fpath, out_mode):
        # Open the redirect targets only for the duration of one command so the
        # handles are closed right away instead of leaking until garbage collection.
        with open(out_fpath, out_mode) as out_file:
            with open(err_path, 'a') as err_file:
                return qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file)

    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
        run_redirected([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                        '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                       bam_filtered_fpath, 'w')

        ## sort by read names
        bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
        run_redirected([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-n',
                        '-o', bam_filtered_sorted_fpath, bam_filtered_fpath],
                       log_path, 'a')

        bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
        run_redirected([bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                       bedpe_fpath, 'w')

        ## collapse every BEDPE record into a single interval built from its
        ## 1st, 2nd and 6th fields (chrom1, start1, end2 in BEDPE column order)
        raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
        with open(bedpe_fpath, 'r') as bedpe:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe:
                    fs = line.split()
                    bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))

        sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
        run_redirected([bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                       sorted_bed_fpath, 'w')

        chr_len_fpath = get_chr_len_fpath(ref_fpath)
        raw_cov_fpath = cov_fpath + '_raw'
        run_redirected([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath,
                        '-g', chr_len_fpath],
                       raw_cov_fpath, 'w')
        proceed_cov_file(raw_cov_fpath, cov_fpath)
    return cov_fpath
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path, cov_fpath,
                 physical_cov_fpath):
    """Produce the read coverage and physical coverage files for a reference.

    Each stage is skipped when its output already exists and is non-empty:
    the sorted BAM, the raw ``bedtools genomecov`` output
    (``cov_fpath + '_raw'``), the final coverage file and the physical
    coverage file.

    Args:
        output_dirpath: Directory forwarded to ``get_physical_coverage``.
        ref_fpath: Reference file; passed to ``get_chr_len_fpath`` to obtain
            the file given to ``bedtools genomecov`` via ``-g``.
        ref_name: Reference name forwarded to ``get_physical_coverage``.
        bam_fpath: Input BAM file.
        bam_sorted_fpath: Sorted BAM file; created with ``sambamba sort`` if
            missing or empty.
        log_path: File to which stdout of ``sambamba sort`` is appended.
        err_path: File to which stderr of the external commands is appended.
        cov_fpath: Path of the final read coverage file.
        physical_cov_fpath: Path of the physical coverage file.

    Returns:
        Tuple ``(cov_fpath, physical_cov_fpath)``. When the physical coverage
        has to be computed, the second element is whatever
        ``get_physical_coverage`` returns.
    """
    def run_redirected(cmd, out_fpath, out_mode):
        # Open the redirect targets only for the duration of one command so the
        # handles are closed right away instead of leaking until garbage collection.
        with open(out_fpath, out_mode) as out_file:
            with open(err_path, 'a') as err_file:
                return qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file)

    raw_cov_fpath = cov_fpath + '_raw'
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                run_redirected([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                                '-o', bam_sorted_fpath, bam_fpath],
                               log_path, 'a')
            chr_len_fpath = get_chr_len_fpath(ref_fpath)
            run_redirected([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath,
                            '-g', chr_len_fpath],
                           raw_cov_fpath, 'w')
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath)
    if not is_non_empty_file(physical_cov_fpath):
        physical_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath,
                                                   log_path, err_path, physical_cov_fpath)
    return cov_fpath, physical_cov_fpath
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path, cov_fpath,
                 physical_cov_fpath, correct_chr_names):
    """Produce the read coverage and physical coverage files for a reference.

    Each stage is skipped when its output already exists and is non-empty:
    the sorted BAM, the raw ``bedtools genomecov`` output
    (``cov_fpath + '_raw'``), the final coverage file and the physical
    coverage file.

    Args:
        output_dirpath: Directory forwarded to ``get_physical_coverage``.
        ref_fpath: Reference file; passed to ``get_chr_len_fpath``.
        ref_name: Reference name forwarded to ``get_physical_coverage``.
        bam_fpath: Input BAM file.
        bam_sorted_fpath: Sorted BAM file; created with ``sambamba sort`` if
            missing or empty.
        log_path: File to which stdout of ``sambamba sort`` is appended.
        err_path: File to which stderr of the external commands is appended.
        cov_fpath: Path of the final read coverage file.
        physical_cov_fpath: Path of the final physical coverage file.
        correct_chr_names: Forwarded unchanged to ``get_chr_len_fpath`` and
            ``proceed_cov_file``.

    Returns:
        Tuple ``(cov_fpath, physical_cov_fpath)``. The physical coverage file
        is not created when ``get_physical_coverage`` returns ``None``.
    """
    def run_redirected(cmd, out_fpath, out_mode):
        # Open the redirect targets only for the duration of one command so the
        # handles are closed right away instead of leaking until garbage collection.
        with open(out_fpath, out_mode) as out_file:
            with open(err_path, 'a') as err_file:
                return qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file, logger=logger)

    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                run_redirected([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                                '-o', bam_sorted_fpath, bam_fpath],
                               log_path, 'a')
            run_redirected([bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath,
                            '-g', chr_len_fpath],
                           raw_cov_fpath, 'w')
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)
    if not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath,
                                                       log_path, err_path, physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when bamToBed is unavailable; there is
        # no raw file to post-process then, so do not hand None on as an input path.
        if raw_physical_cov_fpath is not None:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath