def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath, chr_len_fpath):
    """Build the raw physical-coverage file for ``bam_fpath``.

    Pipeline (each step writes an intermediate file into ``output_dirpath``):
      1. ``sambamba view`` keeps reads matching
         'proper_pair and not supplementary and not duplicate'.
      2. ``sambamba sort -n`` sorts the filtered BAM by read name.
      3. ``bamToBed -bedpe`` converts the sorted BAM to BEDPE.
      4. Each BEDPE record is reduced to a 3-column BED line made of its
         1st, 2nd and 6th fields (chrom1, start1, end2 in the BEDPE layout).
      5. ``bedtools sort`` followed by ``bedtools genomecov -bga``.

    Nothing is recomputed when ``cov_fpath + '_raw'`` is already a non-empty
    file.

    Args:
        output_dirpath: directory receiving all intermediate files.
        ref_fpath: not used by this function; kept so existing callers work.
        ref_name: prefix of the intermediate file names.
        bam_fpath: input BAM file.
        log_path: file to which stdout of ``sambamba sort`` is appended.
        err_path: file to which stderr of every tool is appended.
        cov_fpath: target coverage path; the raw output is ``cov_fpath + '_raw'``.
        chr_len_fpath: genome file given to ``bedtools genomecov -g``.

    Returns:
        Path of the raw coverage file, or None when the path returned by
        ``bedtools_fpath('bamToBed')`` does not exist.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None

    raw_cov_fpath = cov_fpath + '_raw'
    if is_non_empty_file(raw_cov_fpath):
        return raw_cov_fpath

    logger.info(' Calculating physical coverage...')
    threads = str(qconfig.max_threads)

    # Every output/err handle is opened in a ``with`` block so it is closed as
    # soon as the tool finishes instead of being leaked until garbage collection.

    ## keep properly mapped, unique, and non-duplicate read pairs only
    bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
    with open(bam_filtered_fpath, 'w') as out_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-t', threads, '-h', '-f', 'bam',
                 '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                stdout=out_f, stderr=err_f, logger=logger)

    ## sort by read names
    bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
    with open(log_path, 'a') as log_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'sort', '-t', threads,
                 '-o', bam_filtered_sorted_fpath, '-n', bam_filtered_fpath],
                stdout=log_f, stderr=err_f, logger=logger)

    bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
    with open(bedpe_fpath, 'w') as out_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(
                [bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                stdout=out_f, stderr=err_f, logger=logger)

    # Collapse each read pair into one interval: fields 1, 2 and 6 of the BEDPE line.
    raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
    with open(bedpe_fpath, 'r') as bedpe:
        with open(raw_bed_fpath, 'w') as bed_file:
            for line in bedpe:
                fs = line.split()
                if not fs:
                    # a blank line carries no record; indexing it would raise IndexError
                    continue
                bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))

    sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
    with open(sorted_bed_fpath, 'w') as out_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                stdout=out_f, stderr=err_f, logger=logger)

    with open(raw_cov_fpath, 'w') as out_f:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath,
                 '-g', chr_len_fpath],
                stdout=out_f, stderr=err_f, logger=logger)
    return raw_cov_fpath
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path, cov_fpath, physical_cov_fpath, correct_chr_names):
    """Produce the read-coverage and physical-coverage files for a BAM.

    Read coverage: unless ``cov_fpath`` is already a non-empty file, the raw
    coverage (``cov_fpath + '_raw'``) is generated with
    ``bedtools genomecov -bga -ibam`` on the coordinate-sorted BAM (which is
    itself created with ``sambamba sort`` if missing) and then handed to
    ``proceed_cov_file``.

    Physical coverage: unless ``physical_cov_fpath`` is already a non-empty
    file, ``get_physical_coverage`` is called and its result is handed to
    ``proceed_cov_file``.

    Args:
        output_dirpath: directory for intermediate files of the physical step.
        ref_fpath: reference path, used to obtain the chromosome-length file.
        ref_name: prefix for intermediate file names of the physical step.
        bam_fpath: input BAM file.
        bam_sorted_fpath: path of the sorted BAM (created when missing/empty).
        log_path: file to which stdout of ``sambamba sort`` is appended.
        err_path: file to which stderr of the tools is appended.
        cov_fpath: output path of the processed read coverage.
        physical_cov_fpath: output path of the processed physical coverage.
        correct_chr_names: forwarded to ``get_chr_len_fpath`` and
            ``proceed_cov_file``.

    Returns:
        The tuple ``(cov_fpath, physical_cov_fpath)``.
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)

    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            # File handles are opened in ``with`` blocks so they are closed
            # once the tool exits rather than leaked.
            if not is_non_empty_file(bam_sorted_fpath):
                with open(log_path, 'a') as log_f:
                    with open(err_path, 'a') as err_f:
                        qutils.call_subprocess(
                            [sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                             '-o', bam_sorted_fpath, bam_fpath],
                            stdout=log_f, stderr=err_f, logger=logger)
            with open(raw_cov_fpath, 'w') as out_f:
                with open(err_path, 'a') as err_f:
                    qutils.call_subprocess(
                        [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath,
                         '-g', chr_len_fpath],
                        stdout=out_f, stderr=err_f, logger=logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)

    if not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(
            output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path,
            physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when it cannot run; there is then
        # no raw file to post-process, so skip instead of passing None on.
        if raw_physical_cov_fpath is not None:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath, correct_chr_names)

    return cov_fpath, physical_cov_fpath
def merge_bed(repeats_fpath, uncovered_fpath, insert_size, output_dirpath, err_path):
    """Combine repeat and uncovered regions into one sorted, merged BED file.

    Writes ``skipped_regions.bed`` into ``output_dirpath`` containing
      * every line of ``repeats_fpath`` whose interval length
        (3rd field minus 2nd field) is at least ``insert_size``, and
      * every line of ``uncovered_fpath`` unchanged.
    Either input is ignored when it does not exist. The combined file is then
    sorted with ``sort -k1,1 -k2,2n`` and merged with ``bedtools merge``.

    Args:
        repeats_fpath: tab-separated file of repeat intervals.
        uncovered_fpath: file of uncovered intervals, copied verbatim.
        insert_size: minimum length for a repeat to be kept.
        output_dirpath: directory for the combined/sorted/merged files.
        err_path: file to which stderr of the external tools is appended.

    Returns:
        Path of the merged BED file.
    """
    combined_bed_fpath = join(output_dirpath, 'skipped_regions.bed')
    with open(combined_bed_fpath, 'w') as out:
        if exists(repeats_fpath):
            with open(repeats_fpath) as in_f:
                for line in in_f:
                    if not line.strip():
                        # blank lines have no coordinates to parse
                        continue
                    fields = line.split('\t')
                    repeat_len = int(fields[2]) - int(fields[1])
                    if repeat_len >= insert_size:
                        out.write(line)
        if exists(uncovered_fpath):
            with open(uncovered_fpath) as in_f:
                for line in in_f:
                    out.write(line)

    # Output/err handles are closed deterministically via ``with`` instead of
    # being left open after the subprocess returns.
    sorted_bed_fpath = add_suffix(combined_bed_fpath, 'sorted')
    with open(sorted_bed_fpath, 'w') as sorted_out:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(['sort', '-k1,1', '-k2,2n', combined_bed_fpath],
                                   stdout=sorted_out, stderr=err_f, logger=logger)

    merged_bed_fpath = add_suffix(combined_bed_fpath, 'merged')
    with open(merged_bed_fpath, 'w') as merged_out:
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess([bedtools_fpath('bedtools'), 'merge', '-i', sorted_bed_fpath],
                                   stdout=merged_out, stderr=err_f, logger=logger)
    return merged_bed_fpath
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Create the raw physical-coverage file and return its path.

    The BAM is filtered with the rule
    'proper_pair and not supplementary and not duplicate', sorted with the
    '-n' sort rule, converted to BED in BEDPE mode, and passed to
    ``calculate_genome_cov``. All of that is skipped when the raw coverage
    file (``add_suffix(cov_fpath, 'raw')``) is already non-empty.

    ``log_path`` is accepted but not used by this function.

    Returns:
        The raw coverage path, or None when ``bedtools_fpath('bamToBed')`` is
        not an existing file.
    """
    bam_to_bed_tool = bedtools_fpath('bamToBed')
    if not isfile(bam_to_bed_tool):
        logger.info(' Failed calculating physical coverage...')
        return None

    raw_physical_cov = add_suffix(cov_fpath, 'raw')
    if is_non_empty_file(raw_physical_cov):
        # a previous run already produced the result
        return raw_physical_cov

    logger.info(' Calculating physical coverage...')

    ## keep properly mapped, unique, and non-duplicate read pairs only
    proper_pairs_bam = join(output_dirpath, ref_name + '.filtered.bam')
    sambamba_view(
        bam_fpath,
        proper_pairs_bam,
        qconfig.max_threads,
        err_fpath,
        logger,
        filter_rule='proper_pair and not supplementary and not duplicate',
    )

    ## sort by read names
    name_sorted_bam = join(output_dirpath, ref_name + '.filtered.sorted.bam')
    sort_bam(proper_pairs_bam, name_sorted_bam, err_fpath, logger, sort_rule='-n')

    pairs_bed = bam_to_bed(output_dirpath, ref_name, name_sorted_bam, err_fpath, logger, bedpe=True)
    calculate_genome_cov(pairs_bed, raw_physical_cov, chr_len_fpath, err_fpath, logger)
    return raw_physical_cov
def get_unique_covered_regions(ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath, use_long_reads=False):
    """Detect repeats in the reference and derive the unique covered regions.

    Steps:
      1. Recreate ``<tmp_dir>/tmp_red`` and symlink the reference into it
         with a ``.fa`` extension.
      2. Run ``binary_fpath`` (the repeat masking tool) to produce
         ``<tmp_dir>/<ref_name>.rpt``, unless that file is already non-empty.
      3. Keep repeats whose length (3rd field minus 2nd field) is at least
         ``insert_size`` in ``<ref_name>.long.rpt``.
      4. Unless ``<ref_name>.rpt.coords.txt`` is already non-empty, extract
         the repeat sequences with ``bedtools getfasta`` and align them back
         to the reference with minimap.
      5. Pass the results to ``check_repeats_instances`` and
         ``remove_repeat_regions``.

    Args:
        ref_fpath: reference FASTA path.
        tmp_dir: working directory for all intermediate files.
        log_fpath: log file for the external tools.
        binary_fpath: path of the repeat masking executable.
        insert_size: minimum repeat length to keep.
        uncovered_fpath: forwarded to ``remove_repeat_regions``.
        use_long_reads: forwarded to ``check_repeats_instances``.

    Returns:
        ``(unique_covered_regions, repeats_regions)``, or ``(None, None)``
        when the repeat tool returned non-zero or produced no repeats file.
    """
    red_genome_dir = os.path.join(tmp_dir, 'tmp_red')
    if isdir(red_genome_dir):
        shutil.rmtree(red_genome_dir)
    os.makedirs(red_genome_dir)

    ref_name = qutils.name_from_fpath(ref_fpath)
    ref_symlink = os.path.join(red_genome_dir, ref_name + '.fa')  ## Red recognizes only *.fa files
    if os.path.islink(ref_symlink):
        os.remove(ref_symlink)
    os.symlink(ref_fpath, ref_symlink)

    logger.info(' ' + 'Running repeat masking tool...')
    repeats_fpath = os.path.join(tmp_dir, ref_name + '.rpt')
    if is_non_empty_file(repeats_fpath):
        return_code = 0
        logger.info(' ' + 'Using existing file ' + repeats_fpath + '...')
    else:
        # One shared handle for stdout and stderr: two independent 'w' handles
        # on the same file each start at offset 0 and overwrite one another.
        with open(log_fpath, 'w') as log_f:
            return_code = qutils.call_subprocess(
                [binary_fpath, '-gnm', red_genome_dir, '-rpt', tmp_dir, '-frm', '2', '-min', '5'],
                stdout=log_f, stderr=log_f, indent=' ')

    if return_code != 0 or not exists(repeats_fpath):
        return None, None

    long_repeats_fpath = os.path.join(tmp_dir, ref_name + '.long.rpt')
    with open(long_repeats_fpath, 'w') as out:
        with open(repeats_fpath) as in_f:
            for line in in_f:
                if not line.strip():
                    # blank lines have no coordinates to parse
                    continue
                fields = line.split('\t')
                repeat_len = int(fields[2]) - int(fields[1])
                if repeat_len >= insert_size:
                    # The first character of the line is dropped on output --
                    # presumably a leading marker emitted by the repeat tool;
                    # verify against its output format.
                    out.write(line[1:])

    repeats_fasta_fpath = os.path.join(tmp_dir, ref_name + '.fasta')
    coords_fpath = os.path.join(tmp_dir, ref_name + '.rpt.coords.txt')
    if not is_non_empty_file(coords_fpath):
        # Remove any existing FASTA index of the reference before getfasta runs.
        fasta_index_fpath = ref_fpath + '.fai'
        if exists(fasta_index_fpath):
            os.remove(fasta_index_fpath)
        # Append to the log: reopening it with 'w' here would erase the
        # repeat tool's output written just above.
        with open(log_fpath, 'a') as log_f:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'getfasta', '-fi', ref_fpath,
                 '-bed', long_repeats_fpath, '-fo', repeats_fasta_fpath],
                stderr=log_f, indent=' ')
        cmdline = [minimap_fpath(), '-c', '-x', 'asm10', '-N', '50', '--mask-level', '1',
                   '--no-long-join', '-r', '100', '-t', str(qconfig.max_threads), '-z', '200',
                   ref_fpath, repeats_fasta_fpath]
        with open(coords_fpath, 'w') as coords_out:
            with open(log_fpath, 'a') as log_f:
                qutils.call_subprocess(cmdline, stdout=coords_out, stderr=log_f)

    filtered_repeats_fpath, repeats_regions = check_repeats_instances(
        coords_fpath, long_repeats_fpath, use_long_reads)
    unique_covered_regions = remove_repeat_regions(ref_fpath, filtered_repeats_fpath, uncovered_fpath)
    return unique_covered_regions, repeats_regions