def get_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, bam_sorted_fpath, log_path, err_path, cov_fpath, physical_cov_fpath):
    """Compute per-position read coverage of the reference (and, via
    get_physical_coverage, physical/fragment coverage).

    Results are cached: each step is skipped when its output file already
    exists and is non-empty.

    Returns the pair (cov_fpath, physical_cov_fpath).
    """
    raw_cov_fpath = cov_fpath + '_raw'
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating reads coverage...')
        if not is_non_empty_file(raw_cov_fpath):
            if not is_non_empty_file(bam_sorted_fpath):
                # coordinate-sort the BAM first: bedtools genomecov -ibam expects sorted input
                # (fix: close the log/err handles instead of leaking them)
                with open(log_path, 'a') as log_file, open(err_path, 'a') as err_file:
                    qutils.call_subprocess(
                        [sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads),
                         '-o', bam_sorted_fpath, bam_fpath],
                        stdout=log_file, stderr=err_file)
            chr_len_fpath = get_chr_len_fpath(ref_fpath)
            # -bga reports coverage for all positions, including zero-covered spans
            with open(raw_cov_fpath, 'w') as raw_cov_file, open(err_path, 'a') as err_file:
                qutils.call_subprocess(
                    [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-ibam', bam_sorted_fpath,
                     '-g', chr_len_fpath],
                    stdout=raw_cov_file, stderr=err_file)
            qutils.assert_file_exists(raw_cov_fpath, 'coverage file')
        proceed_cov_file(raw_cov_fpath, cov_fpath)
    if not is_non_empty_file(physical_cov_fpath):
        # may come back as None if bamToBed is unavailable
        physical_cov_fpath = get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath,
                                                   log_path, err_path, physical_cov_fpath)
    return cov_fpath, physical_cov_fpath
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run the Manta SV caller on one reference's alignments and convert the
    resulting VCF into a BED file of structural variations.

    Returns the BED file path, or None when the SAM file is too small for
    Manta or Manta's workflow script was not generated.

    NOTE(review): assumes <ref>.sam already exists in output_dirpath --
    os.path.getsize below would raise OSError otherwise; verify callers.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info(' SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info(' Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    # SAM -> BAM -> coordinate-sorted BAM (this samtools 'sort' form appends '.bam' itself)
    if not os.path.exists(ref_bamsorted_fpath + '.bam'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-bS', ref_sam_fpath],
                               stdout=open(ref_bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'sort', ref_bam_fpath, ref_bamsorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    # BAM index and reference FASTA index: both inputs Manta needs alongside the BAM
    if not is_non_empty_file(ref_bamsorted_fpath + '.bam.bai'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'index', ref_bamsorted_fpath + '.bam'],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(cur_ref_fpath + '.fai'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'faidx', cur_ref_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # remove any stale run directory before (re)configuring Manta
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        qutils.call_subprocess([config_manta_fpath, '--normalBam', ref_bamsorted_fpath + '.bam',
                                '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                               stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            # Manta configuration step failed to produce its workflow script
            return None
        qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local', '-j', str(qconfig.max_threads)],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(unpacked_SV_fpath):
        # decompress the gzipped VCF for the converter below
        cmd = 'gunzip -c %s' % found_SV_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_SV_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
    # convert Manta's VCF into BED(PE) format
    from manta import vcfToBedpe
    vcfToBedpe.vcfToBedpe(open(unpacked_SV_fpath), open(ref_bed_fpath, 'w'))
    return ref_bed_fpath
def get_physical_coverage(output_dirpath, ref_fpath, ref_name, bam_fpath, log_path, err_path, cov_fpath):
    """Compute "physical" (fragment) coverage: coverage by whole read-pair
    inserts rather than by the reads themselves.

    Pipeline: filter proper pairs -> name-sort -> bamToBed -bedpe ->
    collapse each pair to one interval (pair's chrom, leftmost start,
    rightmost end) -> sort -> bedtools genomecov.

    Returns cov_fpath, or None when bedtools' bamToBed is unavailable.
    """
    if not os.path.exists(bedtools_fpath('bamToBed')):
        logger.info(' Failed calculating physical coverage...')
        return None
    if not is_non_empty_file(cov_fpath):
        logger.info(' Calculating physical coverage...')
        ## keep properly mapped, unique, and non-duplicate read pairs only
        # (fix: close stdout/stderr handles via context managers instead of leaking them)
        bam_filtered_fpath = os.path.join(output_dirpath, ref_name + '.filtered.bam')
        with open(bam_filtered_fpath, 'w') as out_file, open(err_path, 'a') as err_file:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam',
                 '-F', 'proper_pair and not supplementary and not duplicate', bam_fpath],
                stdout=out_file, stderr=err_file)
        ## sort by read names
        bam_filtered_sorted_fpath = os.path.join(output_dirpath, ref_name + '.filtered.sorted.bam')
        with open(log_path, 'a') as log_file, open(err_path, 'a') as err_file:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-n',
                 '-o', bam_filtered_sorted_fpath, bam_filtered_fpath],
                stdout=log_file, stderr=err_file)
        bedpe_fpath = os.path.join(output_dirpath, ref_name + '.bedpe')
        with open(bedpe_fpath, 'w') as out_file, open(err_path, 'a') as err_file:
            qutils.call_subprocess(
                [bedtools_fpath('bamToBed'), '-i', bam_filtered_sorted_fpath, '-bedpe'],
                stdout=out_file, stderr=err_file)
        # collapse each BEDPE pair into a single fragment interval:
        # chrom of first mate (fs[0]), start of first mate (fs[1]), end of second mate (fs[5])
        raw_bed_fpath = os.path.join(output_dirpath, ref_name + '.bed')
        with open(bedpe_fpath, 'r') as bedpe:
            with open(raw_bed_fpath, 'w') as bed_file:
                for line in bedpe:
                    fs = line.split()
                    bed_file.write('\t'.join([fs[0], fs[1], fs[5] + '\n']))
        sorted_bed_fpath = os.path.join(output_dirpath, ref_name + '.sorted.bed')
        with open(sorted_bed_fpath, 'w') as out_file, open(err_path, 'a') as err_file:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'sort', '-i', raw_bed_fpath],
                stdout=out_file, stderr=err_file)
        chr_len_fpath = get_chr_len_fpath(ref_fpath)
        raw_cov_fpath = cov_fpath + '_raw'
        # -bga reports coverage for all positions, including zero-covered spans
        with open(raw_cov_fpath, 'w') as out_file, open(err_path, 'a') as err_file:
            qutils.call_subprocess(
                [bedtools_fpath('bedtools'), 'genomecov', '-bga', '-i', sorted_bed_fpath,
                 '-g', chr_len_fpath],
                stdout=out_file, stderr=err_file)
        proceed_cov_file(raw_cov_fpath, cov_fpath)
    return cov_fpath
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path):
    """Align paired reads to the reference with Bowtie2, scan sorted
    alignments for long zero-covered fragments ("trivial deletions"),
    optionally split the SAM by the individual references of a combined
    (meta) reference, run Manta when available, and write all found
    structural variations to a BED file.

    Returns the BED file path, or None if read alignment failed.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info(' ' + 'Pre-processing for searching structural variations...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info(' Running Bowtie2...')
        abs_reads_fpaths = []  # use absolute paths because we will change workdir
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        # build the Bowtie2 index inside the output dir, then align the pair
        cmd = [bin_fpath('bowtie2-build'), main_ref_fpath, ref_name]
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bin_fpath('bowtie2') + ' -x ' + ref_name + ' -1 ' + abs_reads_fpaths[0] + ' -2 ' + abs_reads_fpaths[1] + ' -S ' + \
              sam_fpath + ' --no-unal -a -p %s' % str(qconfig.max_threads)
        qutils.call_subprocess(shlex.split(cmd), stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running Bowtie2 for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None
    logger.info(' Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # SAM -> BAM -> coordinate-sorted BAM -> sorted SAM (needed for the deletion scan below)
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), '-bS', sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'sort', '-@', str(qconfig.max_threads), bam_fpath, bam_sorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), bam_sorted_fpath + '.bam'],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if meta_ref_fpaths:
        logger.info(' Splitting SAM-file by references...')
    # collect SAM header lines and each sequence's length from @SQ records
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())
    need_ref_splitting = False
    if meta_ref_fpaths:
        # prepare one per-reference SAM file (None marks an existing, reusable split)
        ref_files = {}
        for cur_ref_fpath in meta_ref_fpaths:
            ref = qutils.name_from_fpath(cur_ref_fpath)
            new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
            if is_non_empty_file(new_ref_sam_fpath):
                logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                ref_files[ref] = None
            else:
                new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                new_ref_sam_file.write(headers[0] + '\n')
                chrs = []
                # copy only the @SQ records belonging to this reference
                for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                    seq_name = h.split('\tSN:')[1].split('\t')[0]
                    if seq_name in ref_labels and ref_labels[seq_name] == ref:
                        new_ref_sam_file.write(h + '\n')
                        chrs.append(seq_name)
                new_ref_sam_file.write(headers[-1] + '\n')
                ref_files[ref] = new_ref_sam_file
                need_ref_splitting = True
    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
    logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
    need_trivial_deletions = True
    if os.path.exists(trivial_deletions_fpath):
        need_trivial_deletions = False
        logger.info(' Using existing file: ' + trivial_deletions_fpath)
    if need_trivial_deletions or need_ref_splitting:
        # single pass over coordinate-sorted mappings; cur_deletion is the
        # candidate deletion currently being extended
        with open(sam_sorted_fpath) as sam_file:
            cur_deletion = None
            for line in sam_file:
                mapping = Mapping.parse(line)
                if mapping:
                    # common case: continue current deletion (potential) on the same reference
                    if cur_deletion and cur_deletion.ref == mapping.ref:
                        if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                            # just passed 0-covered fragment
                            if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                cur_deletion.set_next_bad(mapping)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            # continue region BEFORE 0-covered fragment
                            elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_prev_good(mapping)
                            else:
                                cur_deletion.set_prev_bad(mapping)
                        else:  # previous mapping was in region AFTER 0-covered fragment
                            # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                            if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                            # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                            if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_next_good(mapping)
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            else:
                                cur_deletion.set_next_bad_end(mapping)
                    # special case: just started or just switched to the next reference
                    else:
                        if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                            cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                            if cur_deletion.is_valid():
                                deletions.append(cur_deletion)
                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                    if need_ref_splitting:
                        # route this mapping into its reference's split SAM
                        # (only when the mate maps to the same combined reference)
                        cur_ref = ref_labels[mapping.ref]
                        if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                            if ref_files[cur_ref] is not None:
                                ref_files[cur_ref].write(line)
            # flush the last open candidate at end of file
            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                if cur_deletion.is_valid():
                    deletions.append(cur_deletion)
        if need_ref_splitting:
            for ref_handler in ref_files.values():
                if ref_handler is not None:
                    ref_handler.close()
    if need_trivial_deletions:
        logger.info(' Trivial deletions: %d found' % len(deletions))
        logger.info(' Saving to: ' + trivial_deletions_fpath)
        with open(trivial_deletions_fpath, 'w') as f:
            for deletion in deletions:
                f.write(str(deletion) + '\n')
    # merge Manta's SVs (if Manta is installed) with the trivial deletions
    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info(' Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info(' Failed searching structural variations.')
        return None
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run the Manta SV caller on one reference's alignments and convert the
    resulting VCF into a BED file of structural variations.

    Returns the BED file path, or None when the SAM file is too small for
    Manta or Manta's workflow script was not generated.

    NOTE(review): assumes <ref>.sam already exists in output_dirpath --
    os.path.getsize below would raise OSError otherwise; verify callers.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info(' SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info(' Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    # SAM -> BAM -> coordinate-sorted BAM (this samtools 'sort' form appends '.bam' itself)
    if not os.path.exists(ref_bamsorted_fpath + '.bam'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-bS', ref_sam_fpath],
                               stdout=open(ref_bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'sort', ref_bam_fpath, ref_bamsorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    # BAM index and reference FASTA index: both inputs Manta needs alongside the BAM
    if not is_non_empty_file(ref_bamsorted_fpath + '.bam.bai'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'index', ref_bamsorted_fpath + '.bam'],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(cur_ref_fpath + '.fai'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'faidx', cur_ref_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        # remove any stale run directory before (re)configuring Manta
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        qutils.call_subprocess([config_manta_fpath, '--normalBam', ref_bamsorted_fpath + '.bam',
                                '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                               stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            # Manta configuration step failed to produce its workflow script
            return None
        qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local', '-j', str(qconfig.max_threads)],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(unpacked_SV_fpath):
        # decompress the gzipped VCF for the converter below
        cmd = 'gunzip -c %s' % found_SV_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_SV_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
    # convert Manta's VCF into BED(PE) format
    from manta import vcfToBedpe
    vcfToBedpe.vcfToBedpe(open(unpacked_SV_fpath), open(ref_bed_fpath, 'w'))
    return ref_bed_fpath
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path):
    """Align paired reads to the reference with Bowtie2, scan sorted
    alignments for long zero-covered fragments ("trivial deletions"),
    optionally split the SAM by the individual references of a combined
    (meta) reference, run Manta when available, and write all found
    structural variations to a BED file.

    Returns the BED file path, or None if read alignment failed.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info(' ' + 'Pre-processing for searching structural variations...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info(' Running Bowtie2...')
        abs_reads_fpaths = []  # use absolute paths because we will change workdir
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        # build the Bowtie2 index inside the output dir, then align the pair
        cmd = [bin_fpath('bowtie2-build'), main_ref_fpath, ref_name]
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bin_fpath('bowtie2') + ' -x ' + ref_name + ' -1 ' + abs_reads_fpaths[0] + ' -2 ' + abs_reads_fpaths[1] + ' -S ' + \
              sam_fpath + ' --no-unal -a -p %s' % str(qconfig.max_threads)
        qutils.call_subprocess(shlex.split(cmd), stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running Bowtie2 for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None
    logger.info(' Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # SAM -> BAM -> coordinate-sorted BAM -> sorted SAM (needed for the deletion scan below)
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), '-bS', sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'sort', '-@', str(qconfig.max_threads), bam_fpath, bam_sorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), bam_sorted_fpath + '.bam'],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if meta_ref_fpaths:
        logger.info(' Splitting SAM-file by references...')
    # collect SAM header lines and each sequence's length from @SQ records
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())
    need_ref_splitting = False
    if meta_ref_fpaths:
        # prepare one per-reference SAM file (None marks an existing, reusable split)
        ref_files = {}
        for cur_ref_fpath in meta_ref_fpaths:
            ref = qutils.name_from_fpath(cur_ref_fpath)
            new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
            if is_non_empty_file(new_ref_sam_fpath):
                logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                ref_files[ref] = None
            else:
                new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                new_ref_sam_file.write(headers[0] + '\n')
                chrs = []
                # copy only the @SQ records belonging to this reference
                for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                    seq_name = h.split('\tSN:')[1].split('\t')[0]
                    if seq_name in ref_labels and ref_labels[seq_name] == ref:
                        new_ref_sam_file.write(h + '\n')
                        chrs.append(seq_name)
                new_ref_sam_file.write(headers[-1] + '\n')
                ref_files[ref] = new_ref_sam_file
                need_ref_splitting = True
    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
    logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
    need_trivial_deletions = True
    if os.path.exists(trivial_deletions_fpath):
        need_trivial_deletions = False
        logger.info(' Using existing file: ' + trivial_deletions_fpath)
    if need_trivial_deletions or need_ref_splitting:
        # single pass over coordinate-sorted mappings; cur_deletion is the
        # candidate deletion currently being extended
        with open(sam_sorted_fpath) as sam_file:
            cur_deletion = None
            for line in sam_file:
                mapping = Mapping.parse(line)
                if mapping:
                    # common case: continue current deletion (potential) on the same reference
                    if cur_deletion and cur_deletion.ref == mapping.ref:
                        if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                            # just passed 0-covered fragment
                            if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                cur_deletion.set_next_bad(mapping)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            # continue region BEFORE 0-covered fragment
                            elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_prev_good(mapping)
                            else:
                                cur_deletion.set_prev_bad(mapping)
                        else:  # previous mapping was in region AFTER 0-covered fragment
                            # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                            if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                            # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                            if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_next_good(mapping)
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            else:
                                cur_deletion.set_next_bad_end(mapping)
                    # special case: just started or just switched to the next reference
                    else:
                        if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                            cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                            if cur_deletion.is_valid():
                                deletions.append(cur_deletion)
                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                    if need_ref_splitting:
                        # route this mapping into its reference's split SAM
                        # (only when the mate maps to the same combined reference)
                        cur_ref = ref_labels[mapping.ref]
                        if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                            if ref_files[cur_ref] is not None:
                                ref_files[cur_ref].write(line)
            # flush the last open candidate at end of file
            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                if cur_deletion.is_valid():
                    deletions.append(cur_deletion)
        if need_ref_splitting:
            for ref_handler in ref_files.values():
                if ref_handler is not None:
                    ref_handler.close()
    if need_trivial_deletions:
        logger.info(' Trivial deletions: %d found' % len(deletions))
        logger.info(' Saving to: ' + trivial_deletions_fpath)
        with open(trivial_deletions_fpath, 'w') as f:
            for deletion in deletions:
                f.write(str(deletion) + '\n')
    # merge Manta's SVs (if Manta is installed) with the trivial deletions
    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info(' Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info(' Failed searching structural variations.')
        return None
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path, sam_fpath=None, bam_fpath=None, bed_fpath=None):
    """Align paired reads to the reference with BWA (or reuse supplied
    SAM/BAM files), compute read and physical coverage, search for
    structural variations (trivial deletions + Manta when installed), and
    optionally split the SAM by the individual references of a combined
    (meta) reference.

    Returns the triple (bed_fpath, cov_fpath, physical_cov_fpath); each
    element may be None when the corresponding step failed or was skipped.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    # derive SAM/BAM paths from whichever of the two was supplied
    if not sam_fpath and bam_fpath:
        sam_fpath = get_safe_fpath(output_dirpath, bam_fpath[:-4] + '.sam')
    else:
        sam_fpath = sam_fpath or os.path.join(output_dirpath, ref_name + '.sam')
        bam_fpath = bam_fpath or get_safe_fpath(output_dirpath, sam_fpath[:-4] + '.bam')
    sam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(sam_fpath, 'sorted'))
    bam_sorted_fpath = get_safe_fpath(output_dirpath, add_suffix(bam_fpath, 'sorted'))

    bed_fpath = bed_fpath or os.path.join(res_path, ref_name + '.bed')
    cov_fpath = os.path.join(res_path, ref_name + '.cov')
    physical_cov_fpath = os.path.join(res_path, ref_name + '.physical.cov')

    if qconfig.no_sv:
        logger.info(' Will not search Structural Variations (--fast or --no-sv is specified)')
        bed_fpath = None
    elif is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)

    if is_non_empty_file(cov_fpath):
        is_correct_file = check_cov_file(cov_fpath)
        if is_correct_file:
            logger.info(' Using existing reads coverage file: ' + cov_fpath)
    if is_non_empty_file(physical_cov_fpath):
        logger.info(' Using existing physical coverage file: ' + physical_cov_fpath)
    # everything cached -- nothing to do
    if (is_non_empty_file(bed_fpath) or qconfig.no_sv) and is_non_empty_file(cov_fpath) and is_non_empty_file(physical_cov_fpath):
        return bed_fpath, cov_fpath, physical_cov_fpath

    logger.info(' ' + 'Pre-processing reads...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
    elif is_non_empty_file(bam_fpath):
        # regenerate the SAM from the supplied BAM
        logger.info(' Using existing BAM-file: ' + bam_fpath)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_fpath],
                               stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
    else:
        logger.info(' Running BWA...')
        # use absolute paths because we will change workdir
        sam_fpath = os.path.abspath(sam_fpath)
        abs_reads_fpaths = []
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        if len(abs_reads_fpaths) != 2:
            logger.error(' You should specify files with forward and reverse reads.')
            logger.info(' Failed searching structural variations.')
            return None, None, None

        if not qconfig.no_check:
            if not paired_reads_names_are_equal(reads_fpaths, logger):
                logger.info(' Read names are discordant, skipping reads analysis!')
                return None, None, None

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bwa_fpath('bwa'), 'index', '-p', ref_name, main_ref_fpath]
        if os.path.getsize(main_ref_fpath) > 2 * 1024 ** 3:  # if reference size bigger than 2GB
            cmd += ['-a', 'bwtsw']
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bwa_fpath('bwa') + ' mem -t ' + str(qconfig.max_threads) + ' ' + ref_name + ' ' + abs_reads_fpaths[0] + ' ' + abs_reads_fpaths[1]
        qutils.call_subprocess(shlex.split(cmd), stdout=open(sam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running BWA for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None, None, None

    logger.info(' Sorting SAM-file...')
    if (is_non_empty_file(sam_sorted_fpath) and all_read_names_correct(sam_sorted_fpath)) and is_non_empty_file(bam_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # sanitize read names, then SAM -> BAM -> sorted BAM -> sorted SAM
        correct_sam_fpath = os.path.join(output_dirpath, ref_name + '.sam.correct')  # write in output dir
        clean_read_names(sam_fpath, correct_sam_fpath)
        bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
        bam_sorted_fpath = add_suffix(bam_fpath, 'sorted')
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', '-f', 'bam', '-S', correct_sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'sort', '-t', str(qconfig.max_threads), '-o', bam_sorted_fpath, bam_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-t', str(qconfig.max_threads), '-h', bam_sorted_fpath],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if not is_non_empty_file(cov_fpath) or not is_non_empty_file(physical_cov_fpath):
        cov_fpath, physical_cov_fpath = get_coverage(output_dirpath, main_ref_fpath, ref_name, bam_fpath, bam_sorted_fpath,
                                                     log_path, err_path, cov_fpath, physical_cov_fpath)
    if not is_non_empty_file(bed_fpath) and not qconfig.no_sv:
        if meta_ref_fpaths:
            logger.info(' Splitting SAM-file by references...')
        # collect SAM header lines and each sequence's length from @SQ records
        headers = []
        seq_name_length = {}
        with open(sam_fpath) as sam_file:
            for line in sam_file:
                if not line.startswith('@'):
                    break
                if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                    seq_name = line.split('\tSN:')[1].split('\t')[0]
                    seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                    seq_name_length[seq_name] = seq_length
                headers.append(line.strip())
        need_ref_splitting = False
        if meta_ref_fpaths:
            # prepare one per-reference SAM file (None marks an existing, reusable split)
            ref_files = {}
            for cur_ref_fpath in meta_ref_fpaths:
                ref = qutils.name_from_fpath(cur_ref_fpath)
                new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
                if is_non_empty_file(new_ref_sam_fpath):
                    logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                    ref_files[ref] = None
                else:
                    new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                    new_ref_sam_file.write(headers[0] + '\n')
                    chrs = []
                    # copy only the @SQ records belonging to this reference
                    for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                        seq_name = h.split('\tSN:')[1].split('\t')[0]
                        if seq_name in ref_labels and ref_labels[seq_name] == ref:
                            new_ref_sam_file.write(h + '\n')
                            chrs.append(seq_name)
                    new_ref_sam_file.write(headers[-1] + '\n')
                    ref_files[ref] = new_ref_sam_file
                    need_ref_splitting = True
        deletions = []
        trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
        logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
        need_trivial_deletions = True
        if os.path.exists(trivial_deletions_fpath):
            need_trivial_deletions = False
            logger.info(' Using existing file: ' + trivial_deletions_fpath)
        if need_trivial_deletions or need_ref_splitting:
            # single pass over coordinate-sorted mappings; cur_deletion is the
            # candidate deletion currently being extended
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            # route this mapping into its reference's split SAM
                            # (only when the mate maps to the same combined reference)
                            cur_ref = ref_labels[mapping.ref]
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                # flush the last open candidate at end of file
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
            if need_ref_splitting:
                for ref_handler in ref_files.values():
                    if ref_handler is not None:
                        ref_handler.close()
        if need_trivial_deletions:
            logger.info(' Trivial deletions: %d found' % len(deletions))
            logger.info(' Saving to: ' + trivial_deletions_fpath)
            with open(trivial_deletions_fpath, 'w') as f:
                for deletion in deletions:
                    f.write(str(deletion) + '\n')
        # Manta is best-effort: any failure falls through to trivial deletions only.
        # NOTE(review): the bare 'except: pass' hides all errors, including
        # KeyboardInterrupt -- consider narrowing to 'except Exception'.
        if isfile(config_manta_fpath):
            try:
                manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
                qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
            except:
                pass
        if os.path.exists(trivial_deletions_fpath) and not is_non_empty_file(bed_fpath):
            shutil.copy(trivial_deletions_fpath, bed_fpath)

    if not qconfig.no_sv:
        if is_non_empty_file(bed_fpath):
            logger.main_info(' Structural variations are in ' + bed_fpath)
        else:
            # empty BED = searched but found nothing; missing BED = search failed
            if isfile(bed_fpath):
                logger.main_info(' No structural variations were found.')
            else:
                logger.main_info(' Failed searching structural variations.')
            bed_fpath = None
    if is_non_empty_file(cov_fpath):
        logger.main_info(' Coverage distribution along the reference genome is in ' + cov_fpath)
    else:
        logger.main_info(' Failed to calculate coverage distribution')
        cov_fpath = None
    return bed_fpath, cov_fpath, physical_cov_fpath