def search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path):
    """Detect structural variations with Manta and return the merged BED path.

    An existing result file short-circuits the run. With meta references,
    each reference is processed separately -- in parallel via joblib unless
    memory-efficient mode is enabled; otherwise the main reference is
    processed directly into the final BED file.
    """
    logger.info(' Searching structural variations with Manta...')
    final_bed_fpath = os.path.join(output_dirpath, qconfig.manta_sv_fname)

    # Reuse a previous result instead of re-running the (expensive) search.
    if os.path.exists(final_bed_fpath):
        logger.info(' Using existing file: ' + final_bed_fpath)
        return final_bed_fpath

    if not meta_ref_fpaths:
        # Single reference: write straight into the final BED file.
        process_one_ref(main_ref_fpath, output_dirpath, err_path, bed_fpath=final_bed_fpath)
    else:
        # The project ships two joblib backends, one per Python major version.
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        job_count = min(len(meta_ref_fpaths), qconfig.max_threads)
        if qconfig.memory_efficient:
            # Memory-efficient mode: process references strictly one at a time.
            per_ref_beds = [process_one_ref(ref_fpath, output_dirpath, err_path)
                            for ref_fpath in meta_ref_fpaths]
        else:
            per_ref_beds = Parallel(n_jobs=job_count)(
                delayed(process_one_ref)(ref_fpath, output_dirpath, err_path)
                for ref_fpath in meta_ref_fpaths)
        # Drop references whose processing produced no BED file.
        per_ref_beds = [bed for bed in per_ref_beds if bed is not None]
        if per_ref_beds:
            qutils.cat_files(per_ref_beds, final_bed_fpath)
    logger.info(' Saving to: ' + final_bed_fpath)
    return final_bed_fpath
def search_sv_with_gridss(main_ref_fpath, bam_fpath, meta_ref_fpaths, output_dirpath, err_fpath):
    """Detect structural variations with GRIDSS and return the merged BED path.

    Returns the existing result file if one is present, or None when the
    external prerequisites (Java >= 1.8, Rscript) are missing. With meta
    references the per-reference jobs run through run_parallel; otherwise
    the main reference is processed directly into the final BED file.
    """
    logger.info(' Searching structural variations with GRIDSS...')
    final_bed_fpath = join(output_dirpath, qutils.name_from_fpath(main_ref_fpath) + '_' + qconfig.sv_bed_fname)
    if isfile(final_bed_fpath):
        logger.info(' Using existing file: ' + final_bed_fpath)
        return final_bed_fpath

    # GRIDSS needs both a recent JVM and an R interpreter on the PATH.
    java_ok = get_path_to_program('java') and check_java_version(1.8)
    if not java_ok:
        logger.warning('Java 1.8 or later is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if not get_path_to_program('Rscript'):
        logger.warning('R is required to run GRIDSS. Please install it and rerun QUAST.')
        return None

    if meta_ref_fpaths:
        # Split the global thread budget evenly across the per-reference jobs.
        job_count = min(len(meta_ref_fpaths), qconfig.max_threads)
        threads_per_job = max(1, qconfig.max_threads // job_count)
        job_args = [(ref_fpath, output_dirpath, err_fpath, threads_per_job)
                    for ref_fpath in meta_ref_fpaths]
        per_ref_beds = run_parallel(process_one_ref, job_args, job_count, filter_results=True)
        if per_ref_beds:
            qutils.cat_files(per_ref_beds, final_bed_fpath)
    else:
        process_one_ref(main_ref_fpath, output_dirpath, err_fpath, qconfig.max_threads,
                        bam_fpath=bam_fpath, bed_fpath=final_bed_fpath)
    logger.info(' Saving to: ' + final_bed_fpath)
    return final_bed_fpath
def search_sv_with_gridss(main_ref_fpath, bam_fpath, meta_ref_fpaths, output_dirpath, err_fpath):
    """Search for structural variations with GRIDSS; return the merged BED path.

    Short-circuits to an existing result file when present. Returns None when
    the external prerequisites (Java >= 1.8, Rscript) are not available.
    """
    logger.info(' Searching structural variations with GRIDSS...')
    final_bed_fpath = join(output_dirpath, qutils.name_from_fpath(main_ref_fpath) + '_' + qconfig.sv_bed_fname)
    # Reuse a previous result instead of re-running GRIDSS.
    if isfile(final_bed_fpath):
        logger.info(' Using existing file: ' + final_bed_fpath)
        return final_bed_fpath
    # GRIDSS requires both a recent JVM and an R interpreter on the PATH.
    if not get_path_to_program('java') or not check_java_version(1.8):
        logger.warning('Java 1.8 or later is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if not get_path_to_program('Rscript'):
        logger.warning('R is required to run GRIDSS. Please install it and rerun QUAST.')
        return None
    if meta_ref_fpaths:
        # One job per reference; split the thread budget evenly across jobs.
        n_jobs = min(len(meta_ref_fpaths), qconfig.max_threads)
        threads_per_job = max(1, qconfig.max_threads // n_jobs)
        parallel_args = [(cur_ref_fpath, output_dirpath, err_fpath, threads_per_job)
                         for cur_ref_fpath in meta_ref_fpaths]
        # filter_results=True: presumably drops per-reference jobs that returned
        # nothing -- confirm against run_parallel's implementation.
        bed_fpaths = run_parallel(process_one_ref, parallel_args, n_jobs, filter_results=True)
        if bed_fpaths:
            qutils.cat_files(bed_fpaths, final_bed_fpath)
    else:
        # Single reference: write straight into the final BED file.
        process_one_ref(main_ref_fpath, output_dirpath, err_fpath, qconfig.max_threads,
                        bam_fpath=bam_fpath, bed_fpath=final_bed_fpath)
    logger.info(' Saving to: ' + final_bed_fpath)
    return final_bed_fpath
def search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path):
    """Detect structural variations with Manta; return the merged BED path.

    Reuses an existing result when available. Meta references are processed
    per-reference in parallel via joblib and concatenated; a single main
    reference is processed directly into the final BED file.
    """
    logger.info(' Searching structural variations with Manta...')
    final_bed_fpath = os.path.join(output_dirpath, qconfig.manta_sv_fname)
    if os.path.exists(final_bed_fpath):
        logger.info(' Using existing file: ' + final_bed_fpath)
        return final_bed_fpath
    if meta_ref_fpaths:
        # The project ships two joblib backends, one per Python major version.
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        workers = min(len(meta_ref_fpaths), qconfig.max_threads)
        jobs = (delayed(process_one_ref)(ref_fpath, output_dirpath, err_path)
                for ref_fpath in meta_ref_fpaths)
        results = Parallel(n_jobs=workers)(jobs)
        # Keep only the references that actually produced a BED file.
        collected = [bed for bed in results if bed is not None]
        if collected:
            qutils.cat_files(collected, final_bed_fpath)
    else:
        process_one_ref(main_ref_fpath, output_dirpath, err_path, bed_fpath=final_bed_fpath)
    logger.info(' Saving to: ' + final_bed_fpath)
    return final_bed_fpath
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to the assemblies (and reference), then derive SVs and coverage.

    Runs one alignment job per assembly plus, when a reference is given, one
    for the reference itself; then sorts/filters the reference alignment,
    optionally computes (physical) coverage for Icarus, scans for trivial
    deletions, and tries GRIDSS for additional SV calls.

    Returns (bed_fpath, cov_fpath, physical_cov_fpath); (None, None, None)
    when there is no reference or the reference alignment failed.

    Fix vs. original: the GRIDSS fallback used a bare ``except:``, which also
    swallows KeyboardInterrupt/SystemExit; narrowed to ``except Exception``
    while keeping the deliberate best-effort behavior.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)
        # User-supplied paths win; otherwise derive defaults from the ref name.
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        if qconfig.no_sv:
            logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info(' Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            # SV search needs paired-end reads unless a reference SAM/BAM is given.
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info(' Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True

        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info(' Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None

        # All requested outputs already exist (or were disabled) -> nothing to
        # (re)compute for the reference beyond the plain alignments.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or
                 (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # One alignment job per assembly, plus one for the reference (hence +1).
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                            max_threads_per_job, sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        # Reference job is appended last; its results are read back via [-1].
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam,
                                    None, required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    # Reference alignment results (appended last above).
    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info(' Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # Drop unmapped reads, sort the BAM, then dump it back to SAM.
        if not is_non_empty_file(bam_sorted_fpath):
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                          filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)

    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath,
                                                     bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
                                                     cov_fpath, physical_cov_fpath)

    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Collect SAM header lines and per-sequence lengths (@SQ SN:/LN: tags).
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            # Prepare one per-reference SAM (header only here; the record
            # splitting itself happens inside search_trivial_deletions).
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None  # None marks "already done"
                else:
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    # Copy only the @SQ lines that belong to this reference.
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True
        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels,
                                     seq_lengths, need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths,
                                                        temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception:
                # Best-effort: GRIDSS failures fall back to trivial deletions
                # below. (Was a bare except; narrowed so Ctrl-C still works.)
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path,
                         log_path, err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    """Align reads to the reference and derive SV calls and coverage files.

    Pipeline: reuse or build a SAM/BAM (BWA + sambamba), optionally compute
    (physical) coverage for Icarus, scan the sorted SAM for long zero-covered
    fragments ("trivial deletions"), optionally run Manta, and merge results
    into a BED file.

    Returns (bed_fpath, cov_fpath, physical_cov_fpath); (None, None, None)
    on any fatal failure (bad read input, BWA failure, name mismatch).
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    # Derive any path not supplied by the caller from its counterpart or from
    # the reference name ([:-4] strips the '.sam'/'.bam' extension).
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))
    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    if qconfig.no_sv:
        logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info(' Using existing reads coverage file: ' + cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
    else:
        logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
        cov_fpath = None
        physical_cov_fpath = None
    # Everything requested already exists (or is disabled): nothing to do.
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (not qconfig.create_icarus_html or
             (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info(' ' + 'Pre-processing reads...')
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info(' Using existing BAM-file: ' + bam_fpath)
        # Convert the BAM back to SAM so the rest of the pipeline can read it.
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)

    # No reusable alignment with matching chromosome names -> align from scratch.
    if not correct_chr_names and reads_fpaths:
        logger.info(' Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))
        if len(abs_reads_fpaths) != 2:
            logger.error(' You should specify files with forward and reverse reads.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error(' Read names are discordant, skipping reads analysis!')
                logger.info(' Failed searching structural variations.')
                return None, None, None
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(main_ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        cmd = bwa_fpath('bwa') + ' mem -t ' + str(qconfig.max_threads) + ' ' + ref_name + ' ' \
              + abs_reads_fpaths[0] + ' ' + abs_reads_fpaths[1]
        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        # No existing alignment and no reads to build one from.
        logger.info(' Failed searching structural variations.')
        return None, None, None

    logger.info(' Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)) and is_non_empty_file(bam_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # SAM -> (clean names) -> mapped-only BAM -> sorted BAM -> sorted SAM.
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', '-S', correct_sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o',
                                bam_sorted_fpath, bam_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(output_dirpath, main_ref_fpath, ref_name, bam_fpath,
                                                     bam_sorted_fpath, log_path, err_path, cov_fpath,
                                                     physical_cov_fpath, correct_chr_names)

    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Collect SAM header lines and per-sequence lengths (@SQ SN:/LN: tags).
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            # One SAM per reference: write its header now, records during the scan.
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None  # None marks "already split"
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True

        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info(' Using existing file: ' + trivial_deletions_fpath)

        # Single pass over the sorted SAM: maintain a candidate deletion per
        # reference sequence and (optionally) split records by reference.
        if need_trivial_deletions or need_ref_splitting:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            # keep records whose mate maps to the same reference
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
            # Close the last open candidate at the end of its sequence.
            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                if cur_deletion.is_valid():
                    deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
            if need_trivial_deletions:
                logger.info(' Trivial deletions: %d found' % len(deletions))
                logger.info(' Saving to: ' + trivial_deletions_fpath)
                with open(trivial_deletions_fpath, 'w') as f:
                    for deletion in deletions:
                        f.write(str(deletion) + '\n')

        if get_manta_fpath() and isfile(get_manta_fpath()):
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                # NOTE(review): bare except -- deliberate best-effort (falls
                # back to trivial deletions below), but it also swallows
                # KeyboardInterrupt; consider narrowing to Exception.
                pass
        if os.path.exists(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path,
                         log_path, err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    """Align reads to the reference and derive SV calls and coverage files.

    Pipeline: reuse or build a SAM/BAM (BWA + sambamba), optionally compute
    (physical) coverage for Icarus, scan the sorted SAM for long zero-covered
    fragments ("trivial deletions"), optionally run Manta, and merge results
    into a BED file.

    Returns (bed_fpath, cov_fpath, physical_cov_fpath); (None, None, None)
    on any fatal failure (bad read input, BWA failure, name mismatch).
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    # Derive any path not supplied by the caller from its counterpart or from
    # the reference name ([:-4] strips the '.sam'/'.bam' extension).
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))
    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    if qconfig.no_sv:
        logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
    if qconfig.create_icarus_html:
        if is_non_empty_file(cov_fpath):
            is_correct_file = check_cov_file(cov_fpath)
            if is_correct_file:
                logger.info(' Using existing reads coverage file: ' + cov_fpath)
        if is_non_empty_file(physical_cov_fpath):
            logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
    else:
        logger.info(' Will not calculate coverage (--no-icarus or --space-efficient is specified)')
        cov_fpath = None
        physical_cov_fpath = None
    # Everything requested already exists (or is disabled): nothing to do.
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
            (qconfig.space_efficient or
             (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info(' ' + 'Pre-processing reads...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    correct_chr_names = None
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)
    elif is_non_empty_file(bam_fpath):
        logger.info(' Using existing BAM-file: ' + bam_fpath)
        # Convert the BAM back to SAM so the rest of the pipeline can read it.
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        correct_chr_names = get_correct_names_for_chroms(output_dirpath, main_ref_fpath, sam_fpath, err_path, reads_fpaths)

    # No reusable alignment with matching chromosome names -> align from scratch.
    if not correct_chr_names and reads_fpaths:
        logger.info(' Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))
        if len(abs_reads_fpaths) != 2:
            logger.error(' You should specify files with forward and reverse reads.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.error(' Read names are discordant, skipping reads analysis!')
                logger.info(' Failed searching structural variations.')
                return None, None, None
        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(main_ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        cmd = bwa_fpath('bwa') + ' mem -t ' + str(qconfig.max_threads) + ' ' + ref_name + ' ' \
              + abs_reads_fpaths[0] + ' ' + abs_reads_fpaths[1]
        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None, None, None
    elif not correct_chr_names:
        # No existing alignment and no reads to build one from.
        logger.info(' Failed searching structural variations.')
        return None, None, None

    logger.info(' Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)) and is_non_empty_file(bam_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # SAM -> (clean names) -> mapped-only BAM -> sorted BAM -> sorted SAM.
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                                '-F', 'not unmapped', '-S', correct_sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o',
                                bam_sorted_fpath, bam_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(output_dirpath, main_ref_fpath, ref_name, bam_fpath,
                                                     bam_sorted_fpath, log_path, err_path, cov_fpath,
                                                     physical_cov_fpath, correct_chr_names)

    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Collect SAM header lines and per-sequence lengths (@SQ SN:/LN: tags).
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            # One SAM per reference: write its header now, records during the scan.
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None  # None marks "already split"
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True

        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info(' Using existing file: ' + trivial_deletions_fpath)

        # Single pass over the sorted SAM: maintain a candidate deletion per
        # reference sequence and (optionally) split records by reference.
        if need_trivial_deletions or need_ref_splitting:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        if mapping.ref == '*':
                            continue
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                        if need_ref_splitting:
                            cur_ref = ref_labels[mapping.ref]
                            # keep records whose mate maps to the same reference
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
            # Close the last open candidate at the end of its sequence.
            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                if cur_deletion.is_valid():
                    deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
            if need_trivial_deletions:
                logger.info(' Trivial deletions: %d found' % len(deletions))
                logger.info(' Saving to: ' + trivial_deletions_fpath)
                with open(trivial_deletions_fpath, 'w') as f:
                    for deletion in deletions:
                        f.write(str(deletion) + '\n')

        if isfile(config_manta_fpath):
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                # NOTE(review): bare except -- deliberate best-effort (falls
                # back to trivial deletions below), but it also swallows
                # KeyboardInterrupt; consider narrowing to Exception.
                pass
        if os.path.exists(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath
def run_processing_reads(contigs_fpaths, main_ref_fpath, meta_ref_fpaths, ref_labels, temp_output_dir, output_dir,
                         log_path, err_fpath):
    """Align reads to the assemblies (and to the reference, if given), then derive
    structural variations (BED) and coverage distributions from the reference alignment.

    Parameters:
        contigs_fpaths: list of assembly FASTA paths to align reads against.
        main_ref_fpath: reference FASTA path, or a falsy value when no reference is given.
        meta_ref_fpaths: per-reference FASTA paths in meta mode (SAM gets split per reference), or falsy.
        ref_labels: mapping of sequence name -> reference label, used when splitting the SAM.
        temp_output_dir: directory for intermediate SAM/BAM/BED files.
        output_dir: directory for final output files.
        log_path, err_fpath: log/stderr file paths passed through to external tools.

    Returns:
        (bed_fpath, cov_fpath, physical_cov_fpath) — each element may be None when the
        corresponding analysis was skipped or failed.

    Side effects: mutates qconfig (sam_fpaths, bam_fpaths, reference_sam, reference_bam,
    possibly no_sv) and the module-level ref_sam_fpaths dict; writes files on disk.
    """
    required_files = []
    bed_fpath, cov_fpath, physical_cov_fpath = None, None, None
    if main_ref_fpath:
        ref_name = qutils.name_from_fpath(main_ref_fpath)
        bed_fpath = qconfig.bed or join(output_dir, ref_name + '.bed')
        cov_fpath = qconfig.cov_fpath or join(output_dir, ref_name + '.cov')
        physical_cov_fpath = qconfig.phys_cov_fpath or join(output_dir, ref_name + '.physical.cov')
        required_files = [bed_fpath, cov_fpath, physical_cov_fpath]

        # Decide up front whether SV search is requested/possible.
        if qconfig.no_sv:
            logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
            bed_fpath = None
        elif is_non_empty_file(bed_fpath):
            logger.info(' Using existing BED-file: ' + bed_fpath)
        elif not qconfig.forward_reads and not qconfig.interlaced_reads:
            # SV calling needs paired-end reads unless a ready-made reference alignment was supplied.
            if not qconfig.reference_sam and not qconfig.reference_bam:
                logger.info(' Will not search Structural Variations (needs paired-end reads)')
                bed_fpath = None
                qconfig.no_sv = True
        if qconfig.create_icarus_html:
            if is_non_empty_file(cov_fpath):
                is_correct_file = check_cov_file(cov_fpath)
                if is_correct_file:
                    logger.info(' Using existing reads coverage file: ' + cov_fpath)
            if is_non_empty_file(physical_cov_fpath):
                logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
        else:
            logger.info(' Will not calculate coverage (--fast or --no-html, or --no-icarus, or --space-efficient is specified)')
            cov_fpath = None
            physical_cov_fpath = None
        # If every requested artifact already exists, the reference alignment itself is not needed.
        if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and \
                (not qconfig.create_icarus_html or
                 (is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath))):
            required_files = []

    # Align all assemblies (plus the reference, appended last) in parallel.
    n_jobs = min(qconfig.max_threads, len(contigs_fpaths) + 1)
    max_threads_per_job = max(1, qconfig.max_threads // n_jobs)
    sam_fpaths = qconfig.sam_fpaths or [None] * len(contigs_fpaths)
    bam_fpaths = qconfig.bam_fpaths or [None] * len(contigs_fpaths)
    parallel_align_args = [(contigs_fpath, output_dir, temp_output_dir, log_path, err_fpath, max_threads_per_job,
                            sam_fpaths[index], bam_fpaths[index], index)
                           for index, contigs_fpath in enumerate(contigs_fpaths)]
    if main_ref_fpath:
        parallel_align_args.append((main_ref_fpath, output_dir, temp_output_dir, log_path, err_fpath,
                                    max_threads_per_job, qconfig.reference_sam, qconfig.reference_bam, None,
                                    required_files, True))
    correct_chr_names, sam_fpaths, bam_fpaths = run_parallel(align_single_file, parallel_align_args, n_jobs)
    qconfig.sam_fpaths = sam_fpaths[:len(contigs_fpaths)]
    qconfig.bam_fpaths = bam_fpaths[:len(contigs_fpaths)]
    add_statistics_to_report(output_dir, contigs_fpaths, main_ref_fpath)
    save_reads(output_dir)
    if not main_ref_fpath:
        return None, None, None

    # The reference alignment results are the last elements (appended after the assemblies above).
    correct_chr_names = correct_chr_names[-1]
    sam_fpath, bam_fpath = sam_fpaths[-1], bam_fpaths[-1]
    qconfig.reference_sam = sam_fpath
    qconfig.reference_bam = bam_fpath
    if not required_files:  # everything requested already existed — nothing left to compute
        return bed_fpath, cov_fpath, physical_cov_fpath
    if not all([sam_fpath, bam_fpath]):
        logger.info(' Failed searching structural variations.')
        return None, None, None

    sam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(sam_fpath, 'sorted'))
    bam_mapped_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_fpath, 'mapped'))
    bam_sorted_fpath = get_safe_fpath(temp_output_dir, add_suffix(bam_mapped_fpath, 'sorted'))
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        if not is_non_empty_file(bam_sorted_fpath):
            # Drop unmapped reads, then sort by coordinate before converting back to SAM.
            sambamba_view(bam_fpath, bam_mapped_fpath, qconfig.max_threads, err_fpath, logger,
                          filter_rule='not unmapped')
            sort_bam(bam_mapped_fpath, bam_sorted_fpath, err_fpath, logger)
        sambamba_view(bam_sorted_fpath, sam_sorted_fpath, qconfig.max_threads, err_fpath, logger)
    if qconfig.create_icarus_html and (not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath)):
        cov_fpath, physical_cov_fpath = get_coverage(temp_output_dir, main_ref_fpath, ref_name, bam_fpath,
                                                     bam_sorted_fpath, log_path, err_fpath, correct_chr_names,
                                                     cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # Parse the SAM header: remember raw header lines and collect @SQ sequence lengths.
        headers = []
        seq_lengths = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_lengths[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        ref_files = {}
        if meta_ref_fpaths:
            # Prepare one SAM file per reference; reuse existing non-empty ones.
            global ref_sam_fpaths
            for cur_ref_fpath in meta_ref_fpaths:
                cur_ref_name = qutils.name_from_fpath(cur_ref_fpath)
                ref_sam_fpath = join(temp_output_dir, cur_ref_name + '.sam')
                ref_sam_fpaths[cur_ref_fpath] = ref_sam_fpath
                if is_non_empty_file(ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (cur_ref_name, ref_sam_fpath))
                    ref_files[cur_ref_name] = None  # None marks "already split, do not rewrite"
                else:
                    ref_sam_file = open(ref_sam_fpath, 'w')
                    if not headers[0].startswith('@SQ'):
                        ref_sam_file.write(headers[0] + '\n')
                    # Copy only the @SQ lines whose sequences belong to this reference.
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == cur_ref_name:
                            ref_sam_file.write(h + '\n')
                    ref_sam_file.write(headers[-1] + '\n')
                    ref_files[cur_ref_name] = ref_sam_file
                    need_ref_splitting = True
        trivial_deletions_fpath = \
            search_trivial_deletions(temp_output_dir, sam_sorted_fpath, ref_files, ref_labels, seq_lengths,
                                     need_ref_splitting)
        if get_gridss_fpath() and isfile(get_gridss_fpath()):
            # GRIDSS is best-effort: on any failure we still fall back to trivial deletions below.
            try:
                gridss_sv_fpath = search_sv_with_gridss(main_ref_fpath, bam_mapped_fpath, meta_ref_fpaths,
                                                        temp_output_dir, err_fpath)
                qutils.cat_files([gridss_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except Exception:  # was a bare "except:", which also swallowed SystemExit/KeyboardInterrupt
                pass
        if isfile(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    # Final reporting: downgrade result paths to None when the artifact is missing or empty.
    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        if not qconfig.create_icarus_html:
            logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath