def parallel_blast(contigs_fpath, label, blast_res_fpath, err_fpath, blast_check_fpath, blast_threads):
    cmd = blast_fpath('blastn') + (' -query %s -db %s -outfmt 7 -num_threads %s' % (
        contigs_fpath, db_fpath, blast_threads))
    res_fpath = blast_res_fpath + '_' + label
    check_fpath = blast_check_fpath + '_' + label
    logger.info('  ' + 'processing ' + label)
    qutils.call_subprocess(shlex.split(cmd), stdout=open(res_fpath, 'w'),
                           stderr=open(err_fpath, 'a'), logger=logger)
    logger.info('  ' + 'BLAST results for %s are saved to %s...' % (label, res_fpath))
    # record the assembly and its size so a later run can detect stale results
    with open(check_fpath, 'w') as check_file:
        check_file.write('Assembly: %s size: %d\n' % (contigs_fpath, os.path.getsize(contigs_fpath)))
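# A minimal usage sketch (not part of the original module): dispatching
# parallel_blast() over several assemblies with joblib, mirroring the
# Parallel/delayed pattern used elsewhere in this codebase. The function
# name and the (contigs_fpaths, labels) arguments are assumptions made
# for illustration only.
def _run_blast_for_all_assemblies(contigs_fpaths, labels, blast_res_fpath,
                                  err_fpath, blast_check_fpath):
    from joblib import Parallel, delayed
    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    # give each BLAST process an equal share of the available threads
    threads_per_job = max(1, qconfig.max_threads // n_jobs)
    Parallel(n_jobs=n_jobs)(
        delayed(parallel_blast)(fpath, label, blast_res_fpath, err_fpath,
                                blast_check_fpath, threads_per_job)
        for fpath, label in zip(contigs_fpaths, labels))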
def download_blastdb():
    if os.path.isfile(db_fpath + '.nsq'):
        logger.info()
        logger.info('SILVA 16S rRNA database has already been downloaded, unpacked and BLAST database created. '
                    'If not, please remove %s and rerun MetaQUAST' % (db_fpath + '.nsq'))
        return 0
    log_fpath = os.path.join(blastdb_dirpath, 'blastdb.log')
    db_gz_fpath = os.path.join(blastdb_dirpath, silva_fname + '.gz')
    silva_fpath = os.path.join(blastdb_dirpath, silva_fname)

    logger.info()
    if os.path.isfile(db_gz_fpath):
        logger.info('SILVA 16S ribosomal RNA gene database has already been downloaded.')
    else:
        logger.info('Downloading SILVA 16S ribosomal RNA gene database...')
        if not os.path.isdir(blastdb_dirpath):
            os.mkdir(blastdb_dirpath)
        silva_download = urllib.URLopener()
        silva_remote_fpath = silva_db_path + silva_fname + '.gz'
        try:
            silva_download.retrieve(silva_remote_fpath, db_gz_fpath + '.download', show_progress)
        except Exception:
            logger.error(
                'Failed downloading SILVA 16S rRNA gene database (%s)! The search for reference genomes cannot be performed. '
                'Try to download it manually into %s and restart MetaQUAST.' % (silva_remote_fpath, blastdb_dirpath))
            return 1
        shutil.move(db_gz_fpath + '.download', db_gz_fpath)

    logger.info('Processing downloaded file. Logging to %s...' % log_fpath)
    if not os.path.isfile(silva_fpath):
        logger.info('Unpacking and replacing " " with "_"...')
        unpacked_fpath = silva_fpath + '.unpacked'
        cmd = 'gunzip -c %s' % db_gz_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_fpath, 'w'),
                               stderr=open(log_fpath, 'a'), logger=logger)
        # BLAST truncates sequence ids at the first space, so replace spaces
        # with underscores to keep the full SILVA header as the sequence id
        contents = open(unpacked_fpath).read()
        with open(unpacked_fpath, 'w') as out_file:
            out_file.write(contents.replace(' ', '_'))
        shutil.move(unpacked_fpath, silva_fpath)

    logger.info('Making BLAST database...')
    cmd = blast_fpath('makeblastdb') + (' -in %s -dbtype nucl -out %s' % (silva_fpath, db_fpath))
    qutils.call_subprocess(shlex.split(cmd), stdout=open(log_fpath, 'a'),
                           stderr=open(log_fpath, 'a'), logger=logger)
    if not os.path.exists(db_fpath + '.nsq') or os.path.getsize(db_fpath + '.nsq') < db_nsq_fsize:
        logger.error('Failed to make BLAST database ("' + blastdb_dirpath +
                     '"). See details in log. Try to make it manually: %s' % cmd)
        return 1
    elif not qconfig.debug:
        os.remove(db_gz_fpath)
        os.remove(silva_fpath)
    return 0
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    # num_threads is unused here; the signature matches the other gene finders
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
    fasta_name = qutils.name_from_fpath(fasta_fpath)
    err_file = open(err_fpath, 'w')
    return_code = qutils.call_subprocess(
        ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath, fasta_fpath],
        stdout=err_file, stderr=err_file, indent='  ' + qutils.index_to_str(index))
    err_file.close()  # close before reopening the same path in append mode below
    if return_code != 0:
        return

    genes = []
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
    sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
    out_fpath = sub_fasta_fpath + '.gmhmm'
    heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
    with open(err_fpath, 'a') as err_file:
        ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    return genes
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info('  ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stderr')
    logger.info('  ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' + os.path.basename(log_err_fpath) + '...')
    log_out_f = open(log_out_fpath, 'w')
    log_err_f = open(log_err_fpath, 'w')

    return_code = qutils.call_subprocess(
        ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
        stdout=log_out_f, stderr=log_err_f,
        indent='  ' + qutils.index_to_str(i), only_if_debug=False)

    if return_code != 0:
        logger.info('  ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info('  ' + qutils.index_to_str(i) + 'Done.')
    log_out_f.close()
    log_err_f.close()

    return return_code
def run(contig_path, tmp_path):
    # nested helper: tool_exec, trained_dir, err_path and index are closure
    # variables taken from the enclosing function
    with open(err_path, 'a') as err_file:
        return_code = qutils.call_subprocess(
            [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
            stdout=err_file, stderr=err_file,
            indent='  ' + qutils.index_to_str(index) + '  ')
    return return_code
def do(contigs_fpaths, gene_lengths, out_dirpath):
    logger.print_timestamp()
    logger.main_info('Running GlimmerHMM...')

    tool_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'glimmer')
    tool_src_dirpath = os.path.join(tool_dirpath, 'src')
    tool_exec_fpath = os.path.join(tool_dirpath, 'glimmerhmm')
    tmp_dirpath = os.path.join(out_dirpath, 'tmp')

    if not os.path.isfile(tool_exec_fpath):
        # making
        logger.main_info("Compiling GlimmerHMM...")
        return_code = qutils.call_subprocess(
            ['make', '-C', tool_src_dirpath],
            stdout=open(os.path.join(tool_src_dirpath, 'make.log'), 'w'),
            stderr=open(os.path.join(tool_src_dirpath, 'make.err'), 'w'), indent='  ')
        if return_code != 0 or not os.path.isfile(tool_exec_fpath):
            logger.error("Failed to compile GlimmerHMM (" + tool_src_dirpath +
                         ")!\nTry to compile it manually or do not use --gene-finding "
                         "option with --eukaryote.\nUse --debug option to see the command lines.")
            return

    if not os.path.isdir(out_dirpath):
        os.makedirs(out_dirpath)
    if not os.path.isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=n_jobs)(
        delayed(predict_genes)(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath)
        for index, contigs_fpath in enumerate(contigs_fpaths))

    # saving results
    for i, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        unique, cnt = results[i]
        if unique is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES_UNIQUE, unique)
        if cnt is not None:
            report.add_field(reporting.Fields.PREDICTED_GENES, cnt)
        if unique is None and cnt is None:
            logger.error('Failed running Glimmer for %s. ' % qutils.label_from_fpath(contigs_fpath) +
                         ('Run with the --debug option to see the command line.'
                          if not qconfig.debug else ''))

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.main_info('Done.')
def gmhmm_p(tool_exec, fasta_fpath, heu_fpath, out_fpath, err_file, index):
    """Run GeneMark.hmm with the given heuristic model (heu_fpath), e.g.:
        prompt> gmhmmp -m heu_11_45.mod sequence
        prompt> gm -m heu_11_45.mat sequence
    """
    return_code = qutils.call_subprocess(
        [tool_exec, '-d', '-p', '0', '-m', heu_fpath, '-o', out_fpath, fasta_fpath],
        stdout=err_file, stderr=err_file,
        indent='  ' + qutils.index_to_str(index))
    return return_code == 0 and os.path.isfile(out_fpath)
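# A hedged sketch (not in the original module) of how a heuristic model such
# as 'heu_11_45.mod' from the docstring above might be chosen: GeneMark ships
# one heuristic model per GC bin, so one can compute the GC content of the
# input and round to the nearest available bin. The directory layout, the
# 30..70 range and the 5% step are assumptions for illustration.
def _choose_heuristic_model(heu_dirpath, fasta_fpath):
    seq = ''.join(line.strip() for line in open(fasta_fpath)
                  if not line.startswith('>'))
    gc_percent = 100.0 * sum(seq.count(c) for c in 'GCgc') / max(1, len(seq))
    gc_bin = min(70, max(30, int(round(gc_percent / 5.0) * 5)))  # assumed bins
    return os.path.join(heu_dirpath, 'heu_11_%d.mod' % gc_bin)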
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')
    err_file = open(err_fpath, 'w')
    # per-assembly working subdirectory
    tmp_dirpath = os.path.join(tmp_dirpath, qutils.name_from_fpath(fasta_fpath))
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)
    return_code = qutils.call_subprocess(
        ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
         '--sequence', fasta_fpath, '--out', tmp_dirpath],
        stdout=err_file, stderr=err_file, indent='  ' + qutils.index_to_str(index))
    err_file.close()
    if return_code != 0:
        return

    genes = []
    _, _, fnames = next(os.walk(tmp_dirpath))  # next() works on Python 2 and 3
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def do(ref_fpath, contigs_fpaths, reads_fpaths, meta_ref_fpaths, output_dir, interleaved=False, external_logger=None):
    if external_logger:
        global logger
        logger = external_logger
    logger.print_timestamp()
    logger.main_info('Running Structural Variants caller...')

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if not all_required_binaries_exist(bowtie_dirpath, 'bowtie2-align-l'):
        # making
        logger.main_info('Compiling Bowtie2 (details are in ' + os.path.join(bowtie_dirpath, 'make.log') +
                         ' and make.err)')
        return_code = qutils.call_subprocess(
            ['make', '-C', bowtie_dirpath],
            stdout=open(os.path.join(bowtie_dirpath, 'make.log'), 'w'),
            stderr=open(os.path.join(bowtie_dirpath, 'make.err'), 'w'), logger=logger)
        if return_code != 0 or not all_required_binaries_exist(bowtie_dirpath, 'bowtie2-align-l'):
            logger.error('Failed to compile Bowtie2 (' + bowtie_dirpath + ')! '
                         'Try to compile it manually. ' +
                         ('You can restart QUAST with the --debug flag '
                          'to see the command line.' if not qconfig.debug else ''))
            logger.main_info('Failed searching structural variations')
            return None

    if not all_required_binaries_exist(samtools_dirpath, 'samtools'):
        # making
        logger.main_info('Compiling SAMtools (details are in ' + os.path.join(samtools_dirpath, 'make.log') +
                         ' and make.err)')
        return_code = qutils.call_subprocess(
            ['make', '-C', samtools_dirpath],
            stdout=open(os.path.join(samtools_dirpath, 'make.log'), 'w'),
            stderr=open(os.path.join(samtools_dirpath, 'make.err'), 'w'), logger=logger)
        if return_code != 0 or not all_required_binaries_exist(samtools_dirpath, 'samtools'):
            logger.error('Failed to compile SAMtools (' + samtools_dirpath + ')! '
                         'Try to compile it manually. ' +
                         ('You can restart QUAST with the --debug flag '
                          'to see the command line.' if not qconfig.debug else ''))
            logger.main_info('Failed searching structural variations')
            return None

    if not all_required_binaries_exist(manta_bin_dirpath, 'configManta.py'):
        # making
        if not os.path.exists(manta_build_dirpath):
            os.mkdir(manta_build_dirpath)
        if qconfig.platform_name == 'linux_64':
            logger.main_info('  Downloading binary distribution of Manta...')
            manta_downloaded_fpath = os.path.join(manta_build_dirpath, 'manta.tar.bz2')
            response = urllib2.urlopen(manta_download_path)
            content = response.read()
            if content:
                logger.main_info('  Manta successfully downloaded!')
                f = open(manta_downloaded_fpath + '.download', 'w')
                f.write(content)
                f.close()
            if os.path.exists(manta_downloaded_fpath + '.download'):
                logger.info('  Unpacking Manta...')
                shutil.move(manta_downloaded_fpath + '.download', manta_downloaded_fpath)
                import tarfile
                tar = tarfile.open(manta_downloaded_fpath, "r:bz2")
                tar.extractall(manta_build_dirpath)
                tar.close()
                manta_temp_dirpath = os.path.join(manta_build_dirpath, tar.members[0].name)
                from distutils.dir_util import copy_tree
                copy_tree(manta_temp_dirpath, manta_build_dirpath)
                shutil.rmtree(manta_temp_dirpath)
                os.remove(manta_downloaded_fpath)
                logger.main_info('  Done')
            else:
                logger.main_info('  Failed downloading Manta from %s!' % manta_download_path)

        if not all_required_binaries_exist(manta_bin_dirpath, 'configManta.py'):
            logger.main_info('Compiling Manta (details are in ' + os.path.join(manta_dirpath, 'make.log') +
                             ' and make.err)')
            prev_dir = os.getcwd()
            os.chdir(manta_build_dirpath)
            return_code = qutils.call_subprocess(
                [os.path.join(manta_dirpath, 'source', 'src', 'configure'),
                 '--prefix=' + os.path.join(manta_dirpath, 'build'),
                 '--jobs=' + str(qconfig.max_threads)],
                stdout=open(os.path.join(manta_dirpath, 'make.log'), 'w'),
                stderr=open(os.path.join(manta_dirpath, 'make.err'), 'w'), logger=logger)
            if return_code == 0:
                return_code = qutils.call_subprocess(
                    ['make', '-j' + str(qconfig.max_threads)],
                    stdout=open(os.path.join(manta_dirpath, 'make.log'), 'a'),
                    stderr=open(os.path.join(manta_dirpath, 'make.err'), 'a'), logger=logger)
                if return_code == 0:
                    return_code = qutils.call_subprocess(
                        ['make', 'install'],
                        stdout=open(os.path.join(manta_dirpath, 'make.log'), 'a'),
                        stderr=open(os.path.join(manta_dirpath, 'make.err'), 'a'), logger=logger)
            os.chdir(prev_dir)
            if return_code != 0 or not all_required_binaries_exist(manta_bin_dirpath, 'configManta.py'):
                logger.error('Failed to compile Manta (' + manta_dirpath + ')! '
                             'Try to compile it manually ' +
                             ('or download the binary distribution from '
                              'https://github.com/Illumina/manta/releases and unpack it into ' +
                              os.path.join(manta_dirpath, 'build/')
                              if qconfig.platform_name == 'linux_64' else '') +
                             ('. You can restart QUAST with the --debug flag '
                              'to see the command line.' if not qconfig.debug else '.'))
                logger.main_info('Failed searching structural variations. QUAST will search trivial deletions only.')

    temp_output_dir = os.path.join(output_dir, 'temp_output')
    if not os.path.isdir(temp_output_dir):
        os.mkdir(temp_output_dir)

    log_path = os.path.join(output_dir, 'sv_calling.log')
    err_path = os.path.join(output_dir, 'sv_calling.err')
    logger.info('  ' + 'Logging to files %s and %s...' % (log_path, err_path))
    try:
        bed_fpath = run_processing_reads(ref_fpath, meta_ref_fpaths, contigs_analyzer.ref_labels_by_chromosomes,
                                         reads_fpaths, temp_output_dir, output_dir, log_path, err_path)
    except Exception:
        bed_fpath = None
        logger.error('Failed searching structural variations! This function is experimental '
                     'and may work improperly. Sorry for the inconvenience.')
    if not qconfig.debug:
        shutil.rmtree(temp_output_dir, ignore_errors=True)

    logger.info('Done.')
    return bed_fpath
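# Minimal usage sketch of do() above (hypothetical paths, not from the
# original source): the caller passes paired-end reads as [reads_1, reads_2]
# (see the bowtie2 '-1'/'-2' invocation in run_processing_reads below) and
# receives the BED file with the found variations, or None on failure.
#
#   bed_fpath = do('reference.fasta', ['assembly.fasta'],
#                  ['reads_1.fastq', 'reads_2.fastq'],
#                  meta_ref_fpaths=None, output_dir='sv_output')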
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path,
                         log_path, err_path):
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info('  Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info('  ' + 'Pre-processing for searching structural variations...')
    logger.info('  ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info('  Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info('  Running Bowtie2...')
        abs_reads_fpaths = []  # use absolute paths because we will change workdir
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bin_fpath('bowtie2-build'), main_ref_fpath, ref_name]
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        cmd = bin_fpath('bowtie2') + ' -x ' + ref_name + ' -1 ' + abs_reads_fpaths[0] + ' -2 ' + \
              abs_reads_fpaths[1] + ' -S ' + sam_fpath + ' --no-unal -a -p %s' % str(qconfig.max_threads)
        qutils.call_subprocess(shlex.split(cmd), stdout=open(log_path, 'a'),
                               stderr=open(err_path, 'a'), logger=logger)
        logger.info('  Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error('  Failed running Bowtie2 for the reference. See ' + log_path + ' for information.')
            logger.info('  Failed searching structural variations.')
            return None
    logger.info('  Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info('  Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), '-bS', sam_fpath],
                               stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'sort', '-@', str(qconfig.max_threads),
                                bam_fpath, bam_sorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads),
                                bam_sorted_fpath + '.bam'],
                               stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if meta_ref_fpaths:
        logger.info('  Splitting SAM-file by references...')
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())
    need_ref_splitting = False
    if meta_ref_fpaths:
        ref_files = {}
        for cur_ref_fpath in meta_ref_fpaths:
            ref = qutils.name_from_fpath(cur_ref_fpath)
            new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
            if is_non_empty_file(new_ref_sam_fpath):
                logger.info('    Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                ref_files[ref] = None
            else:
                new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                new_ref_sam_file.write(headers[0] + '\n')
                chrs = []
                for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                    seq_name = h.split('\tSN:')[1].split('\t')[0]
                    if seq_name in ref_labels and ref_labels[seq_name] == ref:
                        new_ref_sam_file.write(h + '\n')
                        chrs.append(seq_name)
                new_ref_sam_file.write(headers[-1] + '\n')
                ref_files[ref] = new_ref_sam_file
                need_ref_splitting = True
    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
    logger.info('  Looking for trivial deletions (long zero-covered fragments)...')
    need_trivial_deletions = True
    if os.path.exists(trivial_deletions_fpath):
        need_trivial_deletions = False
        logger.info('    Using existing file: ' + trivial_deletions_fpath)
    if need_trivial_deletions or need_ref_splitting:
        with open(sam_sorted_fpath) as sam_file:
            cur_deletion = None
            for line in sam_file:
                mapping = Mapping.parse(line)
                if mapping:
                    # common case: continue current deletion (potential) on the same reference
                    if cur_deletion and cur_deletion.ref == mapping.ref:
                        if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                            # just passed 0-covered fragment
                            if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                cur_deletion.set_next_bad(mapping)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            # continue region BEFORE 0-covered fragment
                            elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_prev_good(mapping)
                            else:
                                cur_deletion.set_prev_bad(mapping)
                        else:  # previous mapping was in region AFTER 0-covered fragment
                            # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                            if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(
                                    position=cur_deletion.next_bad_end)
                            # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                            if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_next_good(mapping)
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                            else:
                                cur_deletion.set_next_bad_end(mapping)
                    # special case: just started or just switched to the next reference
                    else:
                        if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                            cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                            if cur_deletion.is_valid():
                                deletions.append(cur_deletion)
                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                    if need_ref_splitting:
                        cur_ref = ref_labels[mapping.ref]
                        if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                            if ref_files[cur_ref] is not None:
                                ref_files[cur_ref].write(line)
            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                if cur_deletion.is_valid():
                    deletions.append(cur_deletion)
    if need_ref_splitting:
        for ref_handler in ref_files.values():
            if ref_handler is not None:
                ref_handler.close()
    if need_trivial_deletions:
        logger.info('  Trivial deletions: %d found' % len(deletions))
        logger.info('    Saving to: ' + trivial_deletions_fpath)
        with open(trivial_deletions_fpath, 'w') as f:
            for deletion in deletions:
                f.write(str(deletion) + '\n')

    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info('  Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info('  Failed searching structural variations.')
        return None
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:  # TODO: make it better (small files will cause Manta crash -- "not enough reads...")
        logger.info('  SAM file is too small for Manta (%d Kb), skipping..' %
                    (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info('  Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath
    if not os.path.exists(ref_bamsorted_fpath + '.bam'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-bS', ref_sam_fpath],
                               stdout=open(ref_bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([samtools_fpath('samtools'), 'sort', ref_bam_fpath, ref_bamsorted_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bam.bai'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'index', ref_bamsorted_fpath + '.bam'],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(cur_ref_fpath + '.fai'):
        qutils.call_subprocess([samtools_fpath('samtools'), 'faidx', cur_ref_fpath],
                               stderr=open(err_path, 'a'), logger=logger)
    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'
    if not is_non_empty_file(found_SV_fpath):
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        qutils.call_subprocess([config_manta_fpath, '--normalBam', ref_bamsorted_fpath + '.bam',
                                '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                               stdout=open(err_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        if not os.path.exists(os.path.join(vcfoutput_dirpath, 'runWorkflow.py')):
            return None
        qutils.call_subprocess([os.path.join(vcfoutput_dirpath, 'runWorkflow.py'), '-m', 'local',
                                '-j', str(qconfig.max_threads)],
                               stderr=open(err_path, 'a'), logger=logger)
    if not is_non_empty_file(unpacked_SV_fpath):
        cmd = 'gunzip -c %s' % found_SV_fpath
        qutils.call_subprocess(shlex.split(cmd), stdout=open(unpacked_SV_fpath, 'w'),
                               stderr=open(err_path, 'a'), logger=logger)
    from manta import vcfToBedpe
    vcfToBedpe.vcfToBedpe(open(unpacked_SV_fpath), open(ref_bed_fpath, 'w'))
    return ref_bed_fpath
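# search_sv_with_manta() is called from run_processing_reads() but is not
# shown in this section. A hedged sketch of one plausible shape, assuming it
# runs process_one_ref() per reference and merges the per-reference BED files
# with qutils.cat_files(); the function name (suffixed '_sketch') and the
# merged-file name are assumptions, not the original implementation:
def _search_sv_with_manta_sketch(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path):
    ref_fpaths = meta_ref_fpaths if meta_ref_fpaths else [main_ref_fpath]
    bed_fpaths = []
    for cur_ref_fpath in ref_fpaths:
        ref_bed_fpath = process_one_ref(cur_ref_fpath, output_dirpath, err_path)
        if ref_bed_fpath:
            bed_fpaths.append(ref_bed_fpath)
    manta_sv_fpath = os.path.join(output_dirpath, 'manta_sv.bed')  # assumed name
    qutils.cat_files(bed_fpaths, manta_sv_fpath)
    return manta_sv_fpath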
def create_meta_report(results_dirpath, json_texts):
    html_fpath = os.path.join(results_dirpath, report_fname)
    if not os.path.isfile(html_fpath):
        init(html_fpath, is_meta=True)

    from libs import search_references_meta
    taxons_for_krona = search_references_meta.taxons_for_krona
    meta_log = get_logger(qconfig.LOGGER_META_NAME)
    if taxons_for_krona:
        meta_log.info('  Drawing interactive Krona plots...')
        krona_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'kronatools')
        krona_res_dirpath = os.path.join(results_dirpath, qconfig.krona_dirname)
        simplejson_error = False
        try:
            import json
        except ImportError:
            try:
                import simplejson as json
            except ImportError:
                meta_log.warning('Can\'t draw Krona charts - please install python-simplejson')
                simplejson_error = True
        if not simplejson_error:
            if not os.path.isdir(krona_res_dirpath):
                os.mkdir(krona_res_dirpath)
            json_data = json.loads(json_texts[0])
            assemblies = json_data['assembliesNames']
            krona_txt_ext = '_taxonomy.txt'
            krona_common_fpath = os.path.join(krona_res_dirpath, 'overall' + krona_txt_ext)
            krona_common_file = open(krona_common_fpath, 'w')
            # pre-create (truncate) per-assembly Krona input files
            for index, name in enumerate(assemblies):
                krona_file = open(os.path.join(krona_res_dirpath, name + krona_txt_ext), 'w')
                krona_file.close()
            for json_text in json_texts[1:]:
                json_data = json.loads(json_text)
                ref = json_data['referenceName']
                report = json_data['report'][0]
                for metric in report[1]:
                    if metric['metricName'] == 'Total length':
                        lengths = metric['values']
                        break
                cur_assemblies = json_data['assembliesNames']
                for index, name in enumerate(cur_assemblies):
                    krona_fpath = os.path.join(krona_res_dirpath, name + krona_txt_ext)
                    with open(krona_fpath, 'a') as f_krona:
                        if ref in taxons_for_krona:
                            f_krona.write(str(lengths[index]) + '\t' + taxons_for_krona[ref] + '\n')
                        else:
                            f_krona.write(str(lengths[index]) + '\n')
                if ref in taxons_for_krona:
                    krona_common_file.write(str(sum(lengths)) + '\t' + taxons_for_krona[ref] + '\n')
                else:
                    krona_common_file.write(str(sum(lengths)) + '\n')
            krona_common_file.close()
            krona_fpaths = []
            for index, name in enumerate(assemblies):
                krona_fpath = os.path.join(krona_res_dirpath, name + '_taxonomy_chart.html')
                krona_txt_fpath = os.path.join(krona_res_dirpath, name + krona_txt_ext)
                qutils.call_subprocess(
                    ['perl', '-I', krona_dirpath + '/lib', krona_dirpath + '/scripts/ImportText.pl',
                     krona_txt_fpath, '-o', krona_fpath, '-a'],
                    stdout=open(os.devnull, 'w'), stderr=open(os.devnull, 'w'))
                krona_fpaths.append(os.path.join(qconfig.krona_dirname, name + '_taxonomy_chart.html'))
                meta_log.main_info('  Krona chart for ' + name + ' is saved to ' + krona_fpath)
                os.remove(krona_txt_fpath)
            if len(assemblies) > 1:
                name = 'summary'
                krona_fpath = os.path.join(krona_res_dirpath, name + '_taxonomy_chart.html')
                qutils.call_subprocess(
                    ['perl', '-I', krona_dirpath + '/lib', krona_dirpath + '/scripts/ImportText.pl',
                     krona_common_fpath, '-o', krona_fpath, '-a'],
                    stdout=open(os.devnull, 'w'), stderr=open(os.devnull, 'w'))
                meta_log.main_info('  Summary Krona chart is saved to ' + krona_fpath)
                krona_fpaths.append(os.path.join(qconfig.krona_dirname, name + '_taxonomy_chart.html'))  # extra fpath!
            os.remove(krona_common_fpath)
            save_krona_paths(results_dirpath, krona_fpaths, assemblies)

    # reading html template file
    with open(html_fpath) as f_html:
        html_text = f_html.read()
    keyword = 'totalReport'
    html_text = re.sub('{{ ' + keyword + ' }}', '[' + ','.join(json_texts) + ']', html_text)
    html_text = re.sub(r'{{(\s+\S+\s+)}}', '{}', html_text)
    with open(html_fpath, 'w') as f_html:
        f_html.write(html_text)
    meta_log.main_info('  Extended version of HTML-report (for all references and assemblies) is saved to ' +
                       html_fpath)