def align_reference(ref_fpath, output_dir, using_reads='all', calculate_coverage=False):
    """Align reads to the reference and optionally detect uncovered regions.

    Also sets ``qconfig.optimal_assembly_insert_size`` when it is unset or
    ``'auto'`` and an insert size can be obtained.

    :param ref_fpath: path to the reference file
    :param output_dir: output directory; a ``temp_output`` subdirectory is
        created inside it when missing
    :param using_reads: ``'all'`` or a reads-library label (e.g. ``'pe'``) that
        is appended as a suffix to the coverage file names
    :param calculate_coverage: when true, filter/sort the BAM file and build the
        uncovered-regions file if it does not exist yet
    :return: ``(sam_fpath, bam_fpath, uncovered_fpath)``; ``(None, None, None)``
        when some output was still required but no SAM file was produced
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')

    # Outputs that are still missing; an empty list means nothing has to be recomputed.
    required_files = []
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and using_reads in ('all', 'pe'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    correct_chr_names, sam_fpath, bam_fpath = align_single_file(
        ref_fpath, output_dir, temp_output_dir, log_path, err_fpath, qconfig.max_threads,
        sam_fpath=qconfig.reference_sam, bam_fpath=qconfig.reference_bam,
        required_files=required_files, is_reference=True, alignment_only=True,
        using_reads=using_reads)

    if not qconfig.optimal_assembly_insert_size or qconfig.optimal_assembly_insert_size == 'auto':
        if using_reads == 'pe' and sam_fpath:
            insert_size, std_dev = calculate_insert_size(sam_fpath, output_dir, ref_name)
            if not insert_size:
                logger.info(' Failed calculating insert size.')
            else:
                qconfig.optimal_assembly_insert_size = insert_size
        elif using_reads == 'all' and is_non_empty_file(insert_size_fpath):
            # Best effort: an unreadable or malformed file simply leaves the setting untouched.
            try:
                with open(insert_size_fpath) as insert_size_file:
                    insert_size = int(insert_size_file.readline())
            except (IOError, OSError, ValueError):
                insert_size = None
            if insert_size:
                qconfig.optimal_assembly_insert_size = insert_size

    if not required_files:
        return sam_fpath, bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        # Same arity as the successful returns so callers can always unpack three values.
        return None, None, None

    if calculate_coverage:
        bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
        bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

        if is_non_empty_file(bam_sorted_fpath):
            logger.info(' Using existing sorted BAM-file: ' + bam_sorted_fpath)
        else:
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                          filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        if not is_non_empty_file(uncovered_fpath):
            get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                         log_path, err_fpath, correct_chr_names, cov_fpath,
                         uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return sam_fpath, bam_fpath, uncovered_fpath
def align_reference(ref_fpath, output_dir, using_reads='all'):
    """Align reads to the reference and build the uncovered-regions file.

    Stores the resulting SAM/BAM paths in ``qconfig.reference_sam`` /
    ``qconfig.reference_bam`` and, for ``'paired_end'`` reads, may set
    ``qconfig.ideal_assembly_insert_size`` when it is unset or ``'auto'``.

    :param ref_fpath: path to the reference file
    :param output_dir: output directory; ``temp_output`` is created inside it
    :param using_reads: ``'all'`` or a reads-library label used as file suffix
    :return: ``(bam_fpath, uncovered_fpath)``, or ``(None, None)`` when outputs
        were still required but no SAM file was produced
    """
    ref_name = qutils.name_from_fpath(ref_fpath)
    cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    if using_reads != 'all':
        cov_fpath = add_suffix(cov_fpath, using_reads)
        uncovered_fpath = add_suffix(uncovered_fpath, using_reads)
    insert_size_fpath = join(output_dir, ref_name + '.is.txt')

    # Collect the outputs that still have to be produced.
    required_files = []
    if not is_non_empty_file(uncovered_fpath):
        required_files.append(uncovered_fpath)
    if not is_non_empty_file(insert_size_fpath) and using_reads in ('all', 'paired_end'):
        required_files.append(insert_size_fpath)

    temp_output_dir = join(output_dir, 'temp_output')
    if not isdir(temp_output_dir):
        os.makedirs(temp_output_dir)

    log_path = join(output_dir, 'reads_stats.log')
    err_fpath = join(output_dir, 'reads_stats.err')
    alignment = align_single_file(
        ref_fpath, output_dir, temp_output_dir, log_path, err_fpath, qconfig.max_threads,
        sam_fpath=qconfig.reference_sam, bam_fpath=qconfig.reference_bam,
        required_files=required_files, is_reference=True, alignment_only=True,
        using_reads=using_reads)
    correct_chr_names, sam_fpath, bam_fpath = alignment
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath

    insert_size_is_open = (not qconfig.ideal_assembly_insert_size
                           or qconfig.ideal_assembly_insert_size == 'auto')
    if insert_size_is_open and using_reads == 'paired_end' and sam_fpath:
        insert_size = calculate_insert_size(sam_fpath, output_dir, ref_name)
        if insert_size:
            qconfig.ideal_assembly_insert_size = insert_size
        else:
            logger.info(' Failed calculating insert size.')

    if not required_files:
        return bam_fpath, uncovered_fpath
    if not sam_fpath:
        logger.info(' Failed detecting uncovered regions.')
        return None, None

    mapped_bam = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    sorted_bam = get_safe_fpath(temp_output_dir, add_suffix(mapped_bam, 'sorted'))
    if not is_non_empty_file(sorted_bam):
        sambamba_view(bam_fpath, mapped_bam, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped')
        sort_bam(mapped_bam, sorted_bam, err_fpath, logger)
    else:
        logger.info(' Using existing sorted BAM-file: ' + sorted_bam)

    if not is_non_empty_file(uncovered_fpath):
        get_coverage(temp_output_dir, ref_fpath, ref_name, bam_fpath, sorted_bam,
                     log_path, err_fpath, correct_chr_names, cov_fpath,
                     uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return bam_fpath, uncovered_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, max_threads, err_fpath):
    """Merge the sorted BAM counterparts of several SAM files into one BAM/SAM pair.

    For every non-empty SAM path the matching ``.bam`` file (expected to exist
    already) is sorted unless a sorted copy is present; the sorted files are
    merged into ``bam_fpath`` with ``sambamba merge`` and the result is passed
    through ``sambamba_view`` to produce ``sam_fpath``.

    :param tmp_sam_fpaths: paths of the partial SAM files
    :param sam_fpath: path of the merged SAM file to create
    :param bam_fpath: path of the merged BAM file to create
    :param max_threads: thread count passed to sambamba
    :param err_fpath: file that collects stderr of the external tools
    :return: ``sam_fpath``
    """
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(tmp_sam_fpath):
            continue
        tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
        tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
        if not is_non_empty_file(tmp_bam_sorted_fpath):
            sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
        tmp_bam_fpaths.append(tmp_bam_sorted_fpath)

    merge_cmd = [sambamba_fpath('sambamba'), 'merge', '-t', str(max_threads), bam_fpath] + tmp_bam_fpaths
    # Close the stderr log deterministically instead of relying on garbage collection.
    with open(err_fpath, 'a') as err_file:
        qutils.call_subprocess(merge_cmd, stderr=err_file, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return sam_fpath
def connect_with_matepairs(bam_fpath, output_dirpath, err_fpath):
    """Collect per-sequence intervals spanned by properly paired reads.

    The BAM file is filtered with sambamba (proper pairs, no supplementary or
    duplicate records), sorted by read name, converted to BED via
    ``bam_to_bed`` and parsed.

    :param bam_fpath: input BAM file
    :param output_dirpath: directory passed to ``bam_to_bed`` for its output
    :param err_fpath: file that collects stderr of the external tools
    :return: ``defaultdict(list)`` mapping the first BED column to a list of
        ``(start, end)`` integer tuples taken from columns two and three
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    filter_cmd = [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                  '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath]
    # Both handles are closed before the filtered file is read by the next step.
    with open(bam_filtered_fpath, 'w') as filtered_file, open(err_fpath, 'a') as err_file:
        qutils.call_subprocess(filter_cmd, stdout=filtered_file, stderr=err_file, logger=logger)

    # sort by read names
    bam_filtered_sorted_fpath = add_suffix(bam_filtered_fpath, 'sorted')
    sort_bam(bam_filtered_fpath, bam_filtered_sorted_fpath, err_fpath, logger, sort_rule='-n')
    bed_fpath = bam_to_bed(output_dirpath, 'matepairs', bam_filtered_sorted_fpath, err_fpath, logger,
                           bedpe=True, only_intervals=True)

    matepair_regions = defaultdict(list)
    with open(bed_fpath) as bed:
        for line in bed:
            fs = line.split()
            if not fs:
                # tolerate blank lines (e.g. a trailing newline) instead of raising IndexError
                continue
            matepair_regions[fs[0]].append((int(fs[1]), int(fs[2])))
    return matepair_regions
def get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path, err_fpath, cov_fpath, chr_len_fpath):
    """Produce the raw physical-coverage file for a reference.

    :param output_dirpath: directory for the intermediate BAM/BED files
    :param ref_name: reference name used as prefix of the intermediate files
    :param bam_fpath: input BAM file
    :param log_path: unused in this function
    :param err_fpath: file that collects stderr of the external tools
    :param cov_fpath: coverage file path; the raw file is its 'raw'-suffixed variant
    :param chr_len_fpath: chromosome lengths file passed to ``calculate_genome_cov``
    :return: path of the raw coverage file, or ``None`` when the ``bamToBed``
        executable is not available
    """
    bam_to_bed_tool = bedtools_fpath('bamToBed')
    if not isfile(bam_to_bed_tool):
        logger.info(' Failed calculating physical coverage...')
        return None

    raw_cov_fpath = add_suffix(cov_fpath, 'raw')
    if is_non_empty_file(raw_cov_fpath):
        # already computed on a previous run
        return raw_cov_fpath

    logger.info(' Calculating physical coverage...')

    # keep properly mapped, unique, and non-duplicate read pairs only
    pair_filter = 'proper_pair and not supplementary and not duplicate'
    filtered_bam = join(output_dirpath, ref_name + '.filtered.bam')
    sambamba_view(bam_fpath, filtered_bam, qconfig.max_threads, err_fpath, logger,
                  filter_rule=pair_filter)

    # sort by read names
    name_sorted_bam = join(output_dirpath, ref_name + '.filtered.sorted.bam')
    sort_bam(filtered_bam, name_sorted_bam, err_fpath, logger, sort_rule='-n')

    bedpe_fpath = bam_to_bed(output_dirpath, ref_name, name_sorted_bam, err_fpath, logger, bedpe=True)
    calculate_genome_cov(bedpe_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
    return raw_cov_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variations for one reference with GRIDSS and write them as BED.

    Only mapped reads in proper pairs are kept when the sorted BAM file has to
    be built. An existing non-empty BED file is reused as is.

    :param cur_ref_fpath: path to the reference file
    :param output_dirpath: directory for intermediate and result files
    :param err_fpath: file that collects stderr of the external tools
    :param max_threads: thread count for sorting and for GRIDSS workers
    :param bam_fpath: optional BAM path; SAM and sorted BAM paths are derived
        from it, otherwise all three are derived from the reference name
    :param bed_fpath: optional result path (default: ``<ref_name>.bed`` in
        ``output_dirpath``)
    :return: ``bed_fpath`` (returned even if GRIDSS produced no VCF file)
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped and proper_pair')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err_file, logger=logger)
    create_fai_file(cur_ref_fpath)

    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start GRIDSS from a clean working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        if not isdir(vcf_output_dirpath):
            # rmtree ignores errors, so the directory may still be there
            os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        # .get() keeps this working when PATH is not set at all
        env['PATH'] = env.get('PATH', '') + os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        gridss_cmd = ['java', '-ea', '-Xmx' + str(max_mem) + 'g',
                      '-Dsamjdk.create_index=true',
                      '-Dsamjdk.use_async_io_read_samtools=true',
                      '-Dsamjdk.use_async_io_write_samtools=true',
                      '-Dsamjdk.use_async_io_write_tribble=true',
                      '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                      'I=' + bam_sorted_fpath,
                      'O=' + vcf_fpath,
                      'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                      'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                      'WORKER_THREADS=' + str(max_threads),
                      'WORKING_DIR=' + vcf_output_dirpath]
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(gridss_cmd, stderr=err_file, logger=logger, env=env)

    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        to_bedpe_cmd = ['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                        'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                        'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE']
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(to_bedpe_cmd, stderr=err_file, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_fpath, max_threads, bam_fpath=None, bed_fpath=None):
    """Call structural variations for one reference with GRIDSS and write them as BED.

    All mapped reads are kept when the sorted BAM file has to be built. An
    existing non-empty BED file is reused as is.

    :param cur_ref_fpath: path to the reference file
    :param output_dirpath: directory for intermediate and result files
    :param err_fpath: file that collects stderr of the external tools
    :param max_threads: thread count for sorting and for GRIDSS workers
    :param bam_fpath: optional BAM path; SAM and sorted BAM paths are derived
        from it, otherwise all three are derived from the reference name
    :param bed_fpath: optional result path (default: ``<ref_name>.bed`` in
        ``output_dirpath``)
    :return: ``bed_fpath`` (returned even if GRIDSS produced no VCF file)
    """
    ref_name = qutils.name_from_fpath(cur_ref_fpath)
    if not bam_fpath:
        sam_fpath = join(output_dirpath, ref_name + '.sam')
        bam_fpath = join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = join(output_dirpath, ref_name + '.sorted.bam')
    else:
        sam_fpath = bam_fpath.replace('.bam', '.sam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    bed_fpath = bed_fpath or join(output_dirpath, ref_name + '.bed')
    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    if not isfile(bam_sorted_fpath):
        sambamba_view(sam_fpath, bam_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule='not unmapped')
        sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger, threads=max_threads)
    if not is_non_empty_file(bam_sorted_fpath + '.bai'):
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'index', bam_sorted_fpath],
                                   stderr=err_file, logger=logger)
    create_fai_file(cur_ref_fpath)

    vcf_output_dirpath = join(output_dirpath, ref_name + '_gridss')
    vcf_fpath = join(vcf_output_dirpath, ref_name + '.vcf')
    if not is_non_empty_file(vcf_fpath):
        # Start GRIDSS from a clean working directory.
        if isdir(vcf_output_dirpath):
            shutil.rmtree(vcf_output_dirpath, ignore_errors=True)
        if not isdir(vcf_output_dirpath):
            # rmtree ignores errors, so the directory may still be there
            os.makedirs(vcf_output_dirpath)
        max_mem = get_gridss_memory()
        env = os.environ.copy()
        # .get() keeps this working when PATH is not set at all
        env['PATH'] = env.get('PATH', '') + os.pathsep + bwa_dirpath
        bwa_index(cur_ref_fpath, err_fpath, logger)
        gridss_cmd = ['java', '-ea', '-Xmx' + str(max_mem) + 'g',
                      '-Dsamjdk.create_index=true',
                      '-Dsamjdk.use_async_io_read_samtools=true',
                      '-Dsamjdk.use_async_io_write_samtools=true',
                      '-Dsamjdk.use_async_io_write_tribble=true',
                      '-cp', get_gridss_fpath(), 'gridss.CallVariants',
                      'I=' + bam_sorted_fpath,
                      'O=' + vcf_fpath,
                      'ASSEMBLY=' + join(vcf_output_dirpath, ref_name + '.gridss.bam'),
                      'REFERENCE_SEQUENCE=' + cur_ref_fpath,
                      'WORKER_THREADS=' + str(max_threads),
                      'WORKING_DIR=' + vcf_output_dirpath]
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(gridss_cmd, stderr=err_file, logger=logger, env=env)

    if is_non_empty_file(vcf_fpath):
        raw_bed_fpath = add_suffix(bed_fpath, 'raw')
        filtered_bed_fpath = add_suffix(bed_fpath, 'filtered')
        to_bedpe_cmd = ['java', '-cp', get_gridss_fpath(), 'au.edu.wehi.idsv.VcfBreakendToBedpe',
                        'I=' + vcf_fpath, 'O=' + raw_bed_fpath, 'OF=' + filtered_bed_fpath,
                        'R=' + cur_ref_fpath, 'INCLUDE_HEADER=TRUE']
        with open(err_fpath, 'a') as err_file:
            qutils.call_subprocess(to_bedpe_cmd, stderr=err_file, logger=logger)
        reformat_bedpe(raw_bed_fpath, bed_fpath)
    return bed_fpath
def merge_sam_files(tmp_sam_fpaths, sam_fpath, bam_fpath, output_dir, max_threads, err_fpath):
    """Convert, sort and merge partial SAM files, then remove duplicates.

    Every non-empty SAM file is converted to BAM and sorted (unless a sorted
    copy already exists). The sorted files are merged into the 'merged'-suffixed
    variant of ``bam_fpath``; ``sambamba markdup -r`` then writes ``bam_fpath``,
    which is finally passed through ``sambamba_view`` to produce ``sam_fpath``.

    :param tmp_sam_fpaths: paths of the partial SAM files
    :param sam_fpath: path of the final SAM file to create
    :param bam_fpath: path of the final (duplicates removed) BAM file to create
    :param output_dir: directory used as ``--tmpdir`` for ``sambamba markdup``
    :param max_threads: thread count passed to sambamba
    :param err_fpath: file that collects stderr of the external tools
    :return: path of the merged BAM file (the one written before ``markdup``)
    """
    merged_bam_fpath = add_suffix(bam_fpath, 'merged')
    tmp_bam_fpaths = []
    for tmp_sam_fpath in tmp_sam_fpaths:
        if not is_non_empty_file(tmp_sam_fpath):
            continue
        tmp_bam_fpath = tmp_sam_fpath.replace('.sam', '.bam')
        tmp_bam_sorted_fpath = add_suffix(tmp_bam_fpath, 'sorted')
        if not is_non_empty_file(tmp_bam_sorted_fpath):
            sambamba_view(tmp_sam_fpath, tmp_bam_fpath, max_threads, err_fpath, logger, filter_rule=None)
            sort_bam(tmp_bam_fpath, tmp_bam_sorted_fpath, err_fpath, logger)
        tmp_bam_fpaths.append(tmp_bam_sorted_fpath)

    sambamba = sambamba_fpath('sambamba')
    merge_cmd = [sambamba, 'merge', '-t', str(max_threads), merged_bam_fpath] + tmp_bam_fpaths
    markdup_cmd = [sambamba_fpath('sambamba'), 'markdup', '-r', '-t', str(max_threads),
                   '--tmpdir', output_dir, merged_bam_fpath, bam_fpath]
    # Open the stderr log per call and close it deterministically instead of leaking the handle.
    with open(err_fpath, 'a') as err_file:
        qutils.call_subprocess(merge_cmd, stderr=err_file, logger=logger)
    with open(err_fpath, 'a') as err_file:
        qutils.call_subprocess(markdup_cmd, stderr=err_file, logger=logger)
    sambamba_view(bam_fpath, sam_fpath, max_threads, err_fpath, logger)
    return merged_bam_fpath
def get_joiners(ref_name, sam_fpath, bam_fpath, output_dirpath, err_fpath, using_reads):
    """Collect per-sequence read intervals from a BAM file.

    The BAM file is filtered (mapped, non-supplementary, primary alignments),
    sorted by read name and converted to BED. For mate-pair reads
    (``using_reads == 'mp'``) only intervals whose length lies within one
    standard deviation of the estimated insert size are kept; if the insert
    size cannot be estimated, all intervals are kept.

    :param ref_name: reference name passed to ``calculate_insert_size``
    :param sam_fpath: SAM file used for insert-size estimation
    :param bam_fpath: input BAM file
    :param output_dirpath: directory for the BED file and insert-size output
    :param err_fpath: file that collects stderr of the external tools
    :param using_reads: reads-library label; ``'mp'`` enables BEDPE output and
        insert-size filtering
    :return: ``defaultdict(list)`` mapping the first BED column to a list of
        ``(start, end)`` integer tuples
    """
    bam_filtered_fpath = add_suffix(bam_fpath, 'filtered')
    if not is_non_empty_file(bam_filtered_fpath):
        filter_rule = 'not unmapped and not supplementary and not secondary_alignment'
        sambamba_view(bam_fpath, bam_filtered_fpath, qconfig.max_threads, err_fpath, logger,
                      filter_rule=filter_rule)
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    if not is_non_empty_file(bam_sorted_fpath):
        sort_bam(bam_filtered_fpath, bam_sorted_fpath, err_fpath, logger, sort_rule='-n')

    is_mate_pairs = using_reads == 'mp'
    bed_fpath = bam_to_bed(output_dirpath, using_reads, bam_sorted_fpath, err_fpath, logger,
                           bedpe=is_mate_pairs)

    insert_size = None
    min_is = max_is = None
    if is_mate_pairs:
        insert_size, std_dev = calculate_insert_size(sam_fpath, output_dirpath, ref_name, reads_suffix='mp')
        # Compute the bounds only for a usable insert size: a failed estimation
        # must fall back to "keep everything" instead of raising on the arithmetic.
        if insert_size:
            min_is = insert_size - std_dev
            max_is = insert_size + std_dev
    filter_by_insert_size = bool(is_mate_pairs and insert_size)

    intervals = defaultdict(list)
    with open(bed_fpath) as bed:
        for line in bed:
            fs = line.split()
            if not fs:
                # tolerate blank lines instead of raising IndexError
                continue
            start, end = int(fs[1]), int(fs[2])
            if filter_by_insert_size and not (min_is <= abs(end - start) <= max_is):
                continue
            intervals[fs[0]].append((start, end))
    return intervals
def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_fpath,
                 correct_chr_names, cov_fpath, physical_cov_fpath=None, uncovered_fpath=None,
                 create_cov_files=True):
    """Compute reads coverage and, optionally, physical coverage and uncovered regions.

    :param output_dirpath: directory for intermediate files of the physical coverage step
    :param ref_fpath: path to the reference file
    :param ref_name: reference name
    :param bam_fpath: unsorted BAM file
    :param bam_sorted_fpath: sorted BAM file; created from ``bam_fpath`` when missing
    :param log_path: log file path (only forwarded to ``get_physical_coverage``)
    :param err_fpath: file that collects stderr of the external tools
    :param correct_chr_names: chromosome-name mapping forwarded to the helpers
    :param cov_fpath: reads coverage file; the raw coverage is written to
        ``cov_fpath + '_raw'``
    :param physical_cov_fpath: physical coverage file, or ``None`` to skip it
    :param uncovered_fpath: when given, uncovered regions are written there
    :param create_cov_files: when false, only raw coverage / uncovered regions
        are produced
    :return: ``(cov_fpath, physical_cov_fpath)`` exactly as passed in
    """
    raw_cov_fpath = cov_fpath + '_raw'
    chr_len_fpath = get_chr_len_fpath(ref_fpath, correct_chr_names)
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                # Same argument layout as every other sort_bam call in this module
                # (input, output, err_fpath, logger); log_path is not one of its arguments.
                sort_bam(bam_fpath, bam_sorted_fpath, err_fpath, logger)
            calculate_genome_cov(bam_sorted_fpath, raw_cov_fpath, chr_len_fpath, err_fpath, logger)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        if uncovered_fpath:
            print_uncovered_regions(raw_cov_fpath, uncovered_fpath, correct_chr_names)
        if create_cov_files:
            proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names)

    # Physical coverage is only meaningful when a target path was supplied.
    if create_cov_files and physical_cov_fpath and not is_non_empty_file(physical_cov_fpath):
        raw_physical_cov_fpath = get_physical_coverage(output_dirpath, ref_name, bam_fpath, log_path,
                                                       err_fpath, physical_cov_fpath, chr_len_fpath)
        # get_physical_coverage returns None when bamToBed is unavailable.
        if raw_physical_cov_fpath:
            proceed_cov_file(raw_physical_cov_fpath, physical_cov_fpath, correct_chr_names)
    return cov_fpath, physical_cov_fpath
def align_ideal_assembly(ref_fpath, assembly_fpath, output_dir, log_fpath, err_fpath):
    """Align an assembly to the reference with BWA and report uncovered regions.

    Existing non-empty BAM / sorted BAM files are reused.

    :param ref_fpath: path to the reference file
    :param assembly_fpath: path to the assembly that is aligned with ``bwa mem``
    :param output_dir: directory for the SAM/BAM/coverage files
    :param log_fpath: log file path forwarded to ``get_coverage``
    :param err_fpath: file that collects stderr of the external tools
    :return: path of the uncovered-regions file (the 'uncovered'-suffixed
        variant of ``<assembly basename>.cov`` in ``output_dir``)
    """
    sam_fpath = join(output_dir, basename(assembly_fpath) + '.sam')
    bam_fpath = sam_fpath.replace('.sam', '.bam')
    bam_mapped_fpath = add_suffix(bam_fpath, 'mapped')
    bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
    threads = str(qconfig.max_threads)

    if not is_non_empty_file(bam_fpath):
        bwa_index(ref_fpath, err_fpath, logger)
        # Each output file is closed before the next tool reads it.
        with open(sam_fpath, 'w') as sam_file, open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([bwa_fpath('bwa'), 'mem', '-t', threads, ref_fpath, assembly_fpath],
                                   stdout=sam_file, stderr=err_file, logger=logger)
        with open(bam_fpath, 'w') as bam_file, open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', threads, '-h', '-f', 'bam',
                                    '-S', sam_fpath],
                                   stdout=bam_file, stderr=err_file, logger=logger)
    if not is_non_empty_file(bam_sorted_fpath):
        with open(bam_mapped_fpath, 'w') as mapped_file, open(err_fpath, 'a') as err_file:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', threads, '-h', '-f', 'bam',
                                    '-F', 'not unmapped', bam_fpath],
                                   stdout=mapped_file, stderr=err_file, logger=logger)
        sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)

    cov_fpath = join(output_dir, basename(assembly_fpath) + '.cov')
    uncovered_fpath = add_suffix(cov_fpath, 'uncovered')
    ref_name = qutils.name_from_fpath(ref_fpath)
    correct_chr_names = get_correct_names_for_chroms(output_dir, ref_fpath, sam_fpath, err_fpath,
                                                     assembly_fpath, logger)
    get_coverage(output_dir, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_fpath, err_fpath,
                 correct_chr_names, cov_fpath, uncovered_fpath=uncovered_fpath, create_cov_files=False)
    return uncovered_fpath
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir,
                         output_dir, log_path, err_fpath):
    """Align reads to assemblies and the reference, then derive SV and coverage files.

    Updates ``qconfig.sam_fpaths`` / ``qconfig.bam_fpaths`` and, when a
    reference is given, ``qconfig.reference_sam`` / ``qconfig.reference_bam``.
    May set ``qconfig.no_sv`` when no paired reads or reference alignments are
    configured.

    :param contigs_fpaths: assembly files to align reads to
    :param main_ref_fpath: reference file, or a false value to skip the
        reference-based steps
    :param meta_ref_fpaths: per-reference files; when given, the reference SAM
        file is split by reference
    :param ref_labels: mapping from sequence name to reference name
    :param temp_output_dir: directory for intermediate files
    :param output_dir: directory for result files
    :param log_path: log file path forwarded to the helpers
    :param err_fpath: file that collects stderr of the external tools
    :return: ``(bed_fpath, cov_fpath, physical_cov_fpath)``; entries are
        ``None`` when the corresponding result is disabled or unavailable
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)

        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info(' Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info(' Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info(' Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # Nothing to compute for the reference when every requested result already exists.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or
                 (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # One alignment job per assembly plus one for the reference.
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam,
                                    None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    # The reference job was appended last, so its results are the last entries.
    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info(' Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))

    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                          filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath,
                                                     bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
                                                     cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Read the SAM header: remember all header lines and the length of every @SQ sequence.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None
                else:
                    # The handle stays open on purpose: it is handed to
                    # search_trivial_deletions through ref_files.
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True

        trivial_deletions_fpath = search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files,
                                                           ref_labels, seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            # GRIDSS is best effort: on failure the trivial deletions alone are used below.
            # Catch Exception only, so Ctrl-C and SystemExit still propagate.
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths,
                                                        temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception as e:
                logger.info(' Failed searching structural variations with GRIDSS: ' + str(e))
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            # An existing but (near-)empty file means the search ran and found nothing.
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath