def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Register one reference sequence in the corrected and combined FASTA files.

    The destination file is the last entry of ``corrected_ref_fpaths`` when
    ``total_references`` exceeds 1; otherwise a new unique path built from
    ``ref_name + ref_fasta_ext`` is created and appended to that list.

    Unless ``qconfig.no_check`` is set, an upper-cased copy of ``seq`` with the
    codes M, K, R, Y, W, S, V, B, H and D replaced by N is validated: if any
    character other than A, C, G, T or N remains, a warning is logged and
    nothing is written. The validated copy is used for the check only; the
    original ``seq`` is what gets written.

    Returns ``(corr_seq_name, corr_seq_fpath)``, or ``(None, None)`` when the
    sequence is skipped.
    """
    if total_references > 1:
        target_fpath = corrected_ref_fpaths[-1]
    else:
        target_fpath = qutils.unique_corrected_fpath(
            os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
        corrected_ref_fpaths.append(target_fpath)

    chrom_name = qutils.name_from_fpath(target_fpath) + '_' + qutils.correct_name(seq_name[:20])

    if not qconfig.no_check:
        # Every listed code maps to N, so a single character class suffices.
        checked_seq = re.sub('[MKRYWSVBHD]', 'N', seq.upper())
        if re.search(r'[^ACGTN]', checked_seq):
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
            return None, None

    record = [(chrom_name, seq)]
    fastaparser.write_fasta(target_fpath, record, 'a')
    fastaparser.write_fasta(combined_ref_fpath, record, 'a')

    contigs_analyzer.ref_labels_by_chromosomes[chrom_name] = qutils.name_from_fpath(target_fpath)
    chromosomes_by_refs[ref_name].append((chrom_name, len(seq)))
    return chrom_name, target_fpath
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Partition every assembly by reference, running one joblib task per assembly.

    Each ``parallel_partition_contigs`` task returns a pair whose first item is
    indexed by reference name and whose second item is collected as the
    "not aligned" assembly.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)`` where
    ``assemblies_by_ref`` is a list of ``(ref_fpath, assemblies)`` pairs in the
    order of ``ref_fpaths``; the assemblies of each pair follow the order of
    ``labels`` (at most one assembly is taken per label).
    """
    # Per-reference template handed to every worker.
    empty_by_ref = dict((qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths)

    worker_count = min(qconfig.max_threads, len(assemblies))
    from joblib import Parallel, delayed
    results = Parallel(n_jobs=worker_count)(
        delayed(parallel_partition_contigs)(asm, empty_by_ref, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)

    per_asm_by_ref = [result[0] for result in results]
    not_aligned_assemblies = [result[1] for result in results]

    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)

        flat = []
        for by_ref in per_asm_by_ref:
            flat.extend(by_ref[ref_name])
        candidates = set(flat)

        # Order by label, taking the first candidate that carries each label.
        ordered = []
        for label in labels:
            match = next((cand for cand in candidates if cand.label == label), None)
            if match is not None:
                ordered.append(match)
        assemblies_by_ref.append((ref_fpath, ordered))

    return assemblies_by_ref, not_aligned_assemblies
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template, labels):
    """Split all assemblies by reference in parallel and regroup the results.

    One ``parallel_partition_contigs`` job is scheduled per assembly (bounded by
    ``qconfig.max_threads``). Item 0 of each job result is looked up by
    reference name; item 1 is gathered into the "not aligned" list.

    Returns a pair: a list of ``(ref_fpath, assemblies)`` tuples following
    ``ref_fpaths`` (assemblies ordered as in ``labels``, one per label at
    most), and the list of not-aligned assemblies.
    """
    ref_template = {}
    for ref_fpath in ref_fpaths:
        ref_template[qutils.name_from_fpath(ref_fpath)] = []

    n_jobs = min(qconfig.max_threads, len(assemblies))
    from joblib import Parallel, delayed
    run_parallel = Parallel(n_jobs=n_jobs)
    job_results = run_parallel(
        delayed(parallel_partition_contigs)(
            asm, ref_template, corrected_dirpath, alignments_fpath_template)
        for asm in assemblies)

    assemblies_by_ref = []
    for ref_fpath in ref_fpaths:
        ref_name = qutils.name_from_fpath(ref_fpath)
        unsorted_assemblies = set(
            [asm for job_result in job_results for asm in job_result[0][ref_name]])

        # Remember the first assembly met for every label ...
        first_by_label = {}
        for asm in unsorted_assemblies:
            first_by_label.setdefault(asm.label, asm)
        # ... and emit them in the order of the requested labels.
        by_label_order = [first_by_label[label] for label in labels if label in first_by_label]
        assemblies_by_ref.append((ref_fpath, by_label_order))

    not_aligned_assemblies = [job_result[1] for job_result in job_results]
    return assemblies_by_ref, not_aligned_assemblies
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function):
    """Predict genes for one assembly with the supplied GeneMark runner.

    ``gmhmm_p_function`` is called with the tool directory, the contigs file,
    the stderr log path, ``index`` and ``tmp_dirpath``. When it yields no genes
    the function returns ``(None, None)``. Otherwise the genes are written to a
    GFF file (and to FASTA when ``OUTPUT_FASTA`` is set) and the function
    returns ``(unique_count, count)``: the number of distinct values found at
    position 4 of the gene records, and, for every threshold in
    ``gene_lengths``, how many genes satisfy ``gene[3] - gene[2] > threshold``.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, assembly_name + '_genemark.stderr')
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath)
    if not genes:
        return None, None

    tool_name = "genemark"
    out_gff_fpath = os.path.join(out_dirpath, assembly_name + '_' + tool_name + '_genes.gff')
    add_genes_to_gff(genes, out_gff_fpath)
    if OUTPUT_FASTA:
        out_fasta_fpath = os.path.join(out_dirpath, assembly_name + '_' + tool_name + '_genes.fasta')
        add_genes_to_fasta(genes, out_fasta_fpath)

    count = []
    for threshold in gene_lengths:
        count.append(sum(1 for gene in genes if gene[3] - gene[2] > threshold))
    unique_count = len(set(gene[4] for gene in genes))
    total_count = len(genes)

    logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' + str(total_count) + ' total')
    logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_fpath)
    return unique_count, count
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, arcs=False, similar=False, coverage_hist=None):
    """Draw the alignment plot of all assemblies against the reference.

    Reference chromosomes are laid out from longest to shortest on one virtual
    genome, separated by a gap of 10% of the total reference length. The nucmer
    contig report of every assembly is parsed against that layout.

    Returns the value of ``draw_alignment_plot``, or ``None`` as soon as one
    contig report yields ``None``.
    """
    chr_lengths = {}
    total_genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_len = len(seq)
        chr_lengths[name.split()[0]] = chr_len
        total_genome_size += chr_len

    gap = int(0.1 * total_genome_size)
    names_desc = sorted(chr_lengths, key=chr_lengths.get, reverse=True)
    lengths_desc = sorted(chr_lengths.values(), reverse=True)

    # Start offset of every chromosome on the virtual genome (plus a final end mark).
    offsets = [0]
    for chr_len in lengths_desc:
        offsets.append(offsets[-1] + gap + chr_len)
    virtual_genome_size = offsets[-1] - gap

    blocks_per_assembly = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = contig_report_fpath_pattern % qutils.name_from_fpath(contigs_fpath)
        blocks = parse_nucmer_contig_report(report_fpath, names_desc, offsets)
        if blocks is None:
            return None
        blocks_per_assembly.append(blocks)

    return draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, names_desc, lengths_desc, gap,
        output_dirpath, blocks_per_assembly, arcs, similar, coverage_hist)
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and return its exit code.

    stdout and stderr of the tool are written to ``gage_<assembly>.stdout`` and
    ``gage_<assembly>.stderr`` inside ``gage_results_dirpath``.

    A non-zero exit code is logged as 'Failed.' and zero as 'Done.' (the
    original test was inverted). The log files are closed even if launching the
    tool raises.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    # Nested `with` blocks guarantee both handles are released on any exit path.
    with open(log_out_fpath, 'w') as log_out_f:
        with open(log_err_fpath, 'w') as log_err_f:
            return_code = qutils.call_subprocess(
                ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
                stdout=log_out_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(i),
                only_if_debug=False)

    if return_code != 0:
        logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
    else:
        logger.info(' ' + qutils.index_to_str(i) + 'Done.')
    return return_code
def run_gage(i, contigs_fpath, gage_results_dirpath, gage_tool_path, reference, tmp_dir):
    """Run the GAGE shell script for one assembly and return its exit code.

    The tool's stdout and stderr go to ``gage_<assembly>.stdout`` and
    ``gage_<assembly>.stderr`` in ``gage_results_dirpath``. A non-zero exit
    code is logged as 'Failed.', zero as 'Done.'.

    Both log files are closed in ``finally`` blocks, so they are released even
    when ``qutils.call_subprocess`` raises.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(i) + assembly_label + '...')

    # run gage tool
    log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
    log_err_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stderr')
    logger.info(' ' + qutils.index_to_str(i) + 'Logging to files ' +
                os.path.basename(log_out_fpath) + ' and ' +
                os.path.basename(log_err_fpath) + '...')

    log_out_f = open(log_out_fpath, 'w')
    try:
        log_err_f = open(log_err_fpath, 'w')
        try:
            return_code = qutils.call_subprocess(
                ['sh', gage_tool_path, reference, contigs_fpath, tmp_dir, str(qconfig.min_contig)],
                stdout=log_out_f,
                stderr=log_err_f,
                indent=' ' + qutils.index_to_str(i),
                only_if_debug=False)
            if return_code != 0:
                logger.info(' ' + qutils.index_to_str(i) + 'Failed.')
            else:
                logger.info(' ' + qutils.index_to_str(i) + 'Done.')
        finally:
            log_err_f.close()
    finally:
        log_out_f.close()
    return return_code
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath):
    """Predict genes for one assembly with ``glimmerHMM``.

    Output goes to ``<out_dirpath>/<assembly>_glimmer`` and the tool's stderr
    to ``<out_dirpath>/<assembly>_glimmer.stderr``. When ``glimmerHMM`` reports
    a GFF path, the unique/total gene numbers and that path are logged.

    Returns ``(unique, cnt)`` exactly as produced by ``glimmerHMM``.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    out_fpath = os.path.join(out_dirpath, assembly_name + '_glimmer')
    err_fpath = os.path.join(out_dirpath, assembly_name + '_glimmer.stderr')

    result = glimmerHMM(tool_dirpath, contigs_fpath, out_fpath, gene_lengths,
                        err_fpath, tmp_dirpath, index)
    out_gff_path, unique, total, cnt = result

    if out_gff_path:
        summary = ' Genes = ' + str(unique) + ' unique, ' + str(total) + ' total'
        logger.info(' ' + qutils.index_to_str(index) + summary)
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_path)

    return unique, cnt
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmsn.pl`` and then ``gmhmmp`` on a FASTA file and return the parsed genes.

    A fresh temporary directory is created under ``tmp_dirpath`` and used as
    the ``--out`` directory of ``gmsn.pl``; stdout and stderr of both tools are
    written to ``err_fpath`` (truncated first, then appended to).

    Returns the list of genes parsed from the ``gmhmmp`` output (empty when
    ``gmhmm_p`` reports failure), or ``None`` when ``gmsn.pl`` exits with a
    non-zero code. ``num_threads`` is accepted but not used here.

    Compared with the previous version, the first log handle is now closed
    before the file is reopened for appending, and the temporary directory is
    removed on every exit path (unless ``qconfig.debug`` is set) rather than
    only after a successful ``gmsn.pl`` run.
    """
    tmp_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    try:
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmsn.pl')
        fasta_name = qutils.name_from_fpath(fasta_fpath)
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(
                ['perl', tool_exec_fpath, '--name', fasta_name, '--clean', '--out', tmp_dirpath, fasta_fpath],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            return None

        genes = []
        tool_exec_fpath = os.path.join(tool_dirpath, 'gmhmmp')
        sub_fasta_fpath = os.path.join(tmp_dirpath, fasta_name)
        out_fpath = sub_fasta_fpath + '.gmhmm'
        heu_fpath = os.path.join(tmp_dirpath, fasta_name + '_hmm_heuristic.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(tool_exec_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))
        return genes
    finally:
        # Keep the intermediate files only when debugging.
        if not qconfig.debug:
            shutil.rmtree(tmp_dirpath)
def gmhmm_p_everyGC(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmsn.pl`` followed by ``gmhmmp`` and return the genes that were parsed.

    ``gmsn.pl`` writes into a temporary directory created under
    ``tmp_dirpath``; ``gmhmmp`` is then run through ``gmhmm_p`` with the
    ``<name>_hmm_heuristic.mod`` file from that directory. Output of both tools
    is logged to ``err_fpath``.

    Returns a list of genes (possibly empty), or ``None`` if ``gmsn.pl`` exits
    with a non-zero code. ``num_threads`` is accepted but not used here.

    The log file is now handled with ``with`` blocks so no handle stays open,
    and the temporary directory is cleaned up on the failure path as well
    (still skipped when ``qconfig.debug`` is set).
    """
    work_dirpath = tempfile.mkdtemp(dir=tmp_dirpath)
    try:
        fasta_name = qutils.name_from_fpath(fasta_fpath)
        gmsn_cmd = ['perl', os.path.join(tool_dirpath, 'gmsn.pl'),
                    '--name', fasta_name, '--clean', '--out', work_dirpath, fasta_fpath]
        with open(err_fpath, 'w') as err_file:
            return_code = qutils.call_subprocess(
                gmsn_cmd,
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index))
        if return_code != 0:
            return None

        gmhmmp_fpath = os.path.join(tool_dirpath, 'gmhmmp')
        out_fpath = os.path.join(work_dirpath, fasta_name) + '.gmhmm'
        heu_fpath = os.path.join(work_dirpath, fasta_name + '_hmm_heuristic.mod')
        with open(err_fpath, 'a') as err_file:
            ok = gmhmm_p(gmhmmp_fpath, fasta_fpath, heu_fpath, out_fpath, err_file, index)

        genes = []
        if ok:
            genes.extend(parse_gmhmm_out(out_fpath))
        return genes
    finally:
        # Intermediate files are only kept for debugging.
        if not qconfig.debug:
            shutil.rmtree(work_dirpath)
def do(assemblies, labels, downloaded_dirpath, ref_txt_fpath=None):
    """Find reference genomes for the assemblies and return their sorted paths.

    The organism list comes from ``ref_txt_fpath`` when it is given; otherwise
    it is derived from ``process_blast`` scores, highest first. FASTA files
    already present under ``downloaded_dirpath`` are passed to ``process_refs``
    together with the organisms.

    The ``blast.err`` log is deleted afterwards unless ``qconfig.debug`` is set.
    Returns the result of ``process_refs`` after sorting it in place.
    """
    logger.print_timestamp()
    err_fpath = os.path.join(downloaded_dirpath, 'blast.err')
    contigs_names = [qutils.name_from_fpath(asm.fpath) for asm in assemblies]
    blast_check_fpath = os.path.join(downloaded_dirpath, 'blast.check')

    files_sizes = {}
    for asm in assemblies:
        files_sizes[asm.fpath] = os.path.getsize(asm.fpath)
    assemblies_fpaths = {}
    for asm in assemblies:
        assemblies_fpaths[asm.fpath] = asm

    blast_assemblies, downloaded_organisms, not_founded_organisms = check_blast(
        blast_check_fpath, files_sizes, assemblies_fpaths, assemblies, labels)

    organisms = []
    organisms_assemblies = None
    if ref_txt_fpath:
        organisms = parse_refs_list(ref_txt_fpath)
    else:
        scores_organisms, organisms_assemblies = process_blast(
            blast_assemblies, downloaded_dirpath, contigs_names, labels, blast_check_fpath, err_fpath)
        if scores_organisms:
            ranked = sorted(scores_organisms, reverse=True)
            organisms = [pair[1] for pair in ranked]

    # Every FASTA file found while walking the download directory; each file
    # name is joined to `downloaded_dirpath` itself, not to the walked folder.
    downloaded_ref_fpaths = []
    for _path, _dirs, fnames in os.walk(downloaded_dirpath):
        for fname in fnames:
            if qutils.check_is_fasta_file(fname):
                downloaded_ref_fpaths.append(os.path.join(downloaded_dirpath, fname))

    ref_fpaths = process_refs(organisms, labels, downloaded_dirpath, not_founded_organisms, contigs_names,
                              downloaded_ref_fpaths, blast_check_fpath, err_fpath, organisms_assemblies)
    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')
    if os.path.exists(err_fpath) if not qconfig.debug else False:
        os.remove(err_fpath)
    ref_fpaths.sort()
    return ref_fpaths
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
    """Append a reference sequence to its corrected FASTA and to the combined one.

    With more than one reference (``total_references > 1``) the sequence goes
    to the most recently registered file in ``corrected_ref_fpaths``; otherwise
    a unique path for ``ref_name + ref_fasta_ext`` is created and registered.

    When ``qconfig.no_check`` is not set, the upper-cased sequence is checked
    after mapping the codes M, K, R, Y, W, S, V, B, H, D to N: any character
    left outside ``ACGTN`` causes a warning and a ``(None, None)`` return
    without writing. The original ``seq`` (not the checked copy) is written.

    Returns ``(corr_seq_name, corr_seq_fpath)`` on success.
    """
    seq_fname = ref_name + ref_fasta_ext

    if total_references <= 1:
        corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
        corrected_ref_fpaths.append(corr_seq_fpath)
    else:
        corr_seq_fpath = corrected_ref_fpaths[-1]

    name_suffix = qutils.correct_name(seq_name[:20])
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + name_suffix

    if not qconfig.no_check:
        candidate = seq.upper()
        for ambiguity_code in 'MKRYWSVBHD':
            candidate = candidate.replace(ambiguity_code, 'N')
        has_foreign_chars = re.compile(r'[^ACGTN]').search(candidate) is not None
        if has_foreign_chars:
            logger.warning('Skipping ' + ref_fpath + ' because it contains non-ACGTN characters.', indent=' ')
            return None, None

    for out_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

    contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
    chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

    return corr_seq_name, corr_seq_fpath
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, cov_fpath=None, arcs=False, similar=False, coverage_hist=None):
    """Draw the contig alignment plot and, if enabled, generate the HTML viewer data.

    Reference chromosomes are placed on a virtual genome from longest to
    shortest with a fixed gap of 100 between neighbours. Each assembly's nucmer
    contig report is parsed against that layout and every parsed block is
    labelled with the assembly name.

    ``js_data_gen`` is called only when ``draw_alignment_plot`` returns
    assemblies and ``qconfig.create_contig_alignment_html`` is set.

    Returns the plot path from ``draw_alignment_plot``, or ``None`` as soon as
    one contig report yields ``None``.
    """
    make_output_dir(output_dirpath)

    chr_names = []
    reference_chromosomes = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_names.append(chr_name)
        reference_chromosomes[chr_name] = len(seq)

    virtual_genome_shift = 100
    sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Offsets of the chromosomes on the virtual genome, plus a final end mark.
    cumulative_ref_lengths = [0]
    for chr_len in sorted_ref_lengths:
        cumulative_ref_lengths.append(cumulative_ref_lengths[-1] + virtual_genome_shift + chr_len)
    virtual_genome_size = cumulative_ref_lengths[-1] - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_name = qutils.label_from_fpath_for_fname(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(
            contig_report_fpath_pattern % report_name, sorted_ref_names, cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        for block in aligned_blocks:
            block.label = qutils.name_from_fpath(contigs_fpath)
        lists_of_aligned_blocks.append(aligned_blocks)

    plot_fpath, assemblies = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names, sorted_ref_lengths,
        virtual_genome_shift, output_dirpath, lists_of_aligned_blocks, arcs, similar, coverage_hist)

    if assemblies and qconfig.create_contig_alignment_html:
        js_data_gen(assemblies, contigs_fpaths, chr_names, reference_chromosomes,
                    output_dirpath, cov_fpath, ref_fpath, virtual_genome_size)
    return plot_fpath
def _correct_reference(ref_fpath, corrected_dirpath):
    """Write a corrected copy of the reference into ``corrected_dirpath``.

    Returns the path of the corrected file, or an empty string when
    ``correct_fasta`` reports failure. On success the mapping from the original
    path to the corrected name is logged with ``logger.main_info``.
    """
    name, fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, name + fasta_ext))

    if correct_fasta(ref_fpath, corr_fpath, qconfig.min_contig, is_reference=True):
        logger.main_info(' %s ==> %s' % (ref_fpath, qutils.name_from_fpath(corr_fpath)))
        return corr_fpath
    return ''
def _correct_reference(ref_fpath, corrected_dirpath):
    """Create the corrected version of a reference FASTA.

    The corrected file gets a unique path in ``corrected_dirpath`` derived from
    the reference's base name and FASTA extension. Returns that path when
    ``correct_fasta`` succeeds (after logging the mapping via ``logger.info``),
    otherwise an empty string.
    """
    base_name, fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
    target_fpath = os.path.join(corrected_dirpath, base_name + fasta_ext)
    corr_fpath = qutils.unique_corrected_fpath(target_fpath)

    corrected_ok = correct_fasta(ref_fpath, corr_fpath, qconfig.min_contig, is_reference=True)
    if not corrected_ok:
        return ''

    logger.info(' %s ==> %s' % (ref_fpath, qutils.name_from_fpath(corr_fpath)))
    return corr_fpath
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split every assembly into per-reference FASTA files plus a "not aligned" rest.

    For each assembly the alignments file (``alignments_fpath_template %
    asm.name``) is read line by line: the first whitespace-separated token is a
    reference name and the remaining tokens are names of contigs aligned to it.
    Those contigs are appended to ``<asm.name>_to_<ref_name[:40]>.fasta`` in
    ``corrected_dirpath``; contigs listed for no reference go to
    ``<asm.name>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``: a dict mapping
    reference name to a list of ``Assembly`` objects, and a list with one
    not-aligned ``Assembly`` per input assembly.

    Changes against the previous version:
    * the assembly FASTA is parsed once per assembly instead of once per
      alignment line, and contig-name membership uses sets;
    * because of that, an alignments file without any line now sends all
      contigs to the not-aligned file (before, nothing had been read yet and
      the file came out empty);
    * blank lines in the alignments file are skipped instead of raising
      ``IndexError``.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict((qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths)

    for asm in assemblies:
        not_aligned_fname = asm.name + '_not_aligned_anywhere.fasta'
        not_aligned_fpath = os.path.join(corrected_dirpath, not_aligned_fname)

        # Parse the assembly once; for duplicated names the first record wins.
        contig_records = list(fastaparser.read_fasta(asm.fpath))
        contigs = {}
        for cont_name, seq in contig_records:
            if cont_name not in contigs:
                contigs[cont_name] = seq

        aligned_contig_names = set()
        with open(alignments_fpath_template % asm.name) as alignments_tsv_f:
            for line in alignments_tsv_f:
                values = line.split()
                if not values:
                    continue
                ref_name = values[0]
                ref_contigs_names = set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, asm.name + '_to_' + ref_name[:40] + '.fasta')

                for cont_name, seq in contig_records:
                    if cont_name in ref_contigs_names:
                        # Collect aligned names so the rest can be extracted afterwards.
                        aligned_contig_names.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                ref_asm = Assembly(ref_contigs_fpath, asm.label)
                assemblies_by_ref[ref_name].append(ref_asm)

        # Extract the contigs that were not aligned to any reference.
        not_aligned_contigs_names = set(contigs) - aligned_contig_names
        fastaparser.write_fasta(
            not_aligned_fpath,
            [(name, contigs[name]) for name in not_aligned_contigs_names])

        not_aligned_assemblies.append(Assembly(not_aligned_fpath, asm.label))

    return assemblies_by_ref, not_aligned_assemblies
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Save the total report data through ``save`` and return its result.

    The payload holds the current date, the assembly labels, the reference
    name (``qconfig.not_aligned_name`` when no reference is given), the
    assembly order as consecutive indices, the grouped report table and the
    minimal contig length. The target is ``output_dirpath`` concatenated with
    ``total_report_fname``.
    """
    from libs import reporting

    asm_names = [qutils.label_from_fpath(fpath) for fpath in reporting.assembly_fpaths]
    report = reporting.table(reporting.Fields.grouped_order)
    now = datetime.datetime.now()

    payload = {}
    payload['date'] = now.strftime('%d %B %Y, %A, %H:%M:%S')
    payload['assembliesNames'] = asm_names
    if ref_fpath:
        payload['referenceName'] = qutils.name_from_fpath(ref_fpath)
    else:
        payload['referenceName'] = qconfig.not_aligned_name
    payload['order'] = list(range(len(asm_names)))
    payload['report'] = report
    payload['minContig'] = min_contig

    return save(output_dirpath + total_report_fname, payload)
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Write one reference sequence to its own corrected FASTA and to the combined one.

    The file name is ``ref_name`` plus, when ``total_references`` exceeds 1, an
    underscore and the corrected first 20 characters of ``seq_name``, followed
    by ``ref_fasta_ext``. The resulting unique path is appended to
    ``corrected_ref_fpaths``.

    Returns the sequence name derived from the corrected file path.
    """
    if total_references > 1:
        seq_fname = ref_name + '_' + qutils.correct_name(seq_name[:20]) + ref_fasta_ext
    else:
        seq_fname = ref_name + ref_fasta_ext

    corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    record = [(corr_seq_name, seq)]
    fastaparser.write_fasta(corr_seq_fpath, record, 'a')
    fastaparser.write_fasta(combined_ref_fpath, record, 'a')
    return corr_seq_name
def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references):
    """Store a reference sequence in a corrected FASTA file and in the combined reference.

    The corrected file is named after ``ref_name``; if there is more than one
    reference, ``'_'`` and the corrected first 20 characters of ``seq_name``
    are appended before the extension. The unique path chosen for it is
    registered in ``corrected_ref_fpaths``.

    Returns the name under which the sequence was written.
    """
    name_parts = [ref_name]
    if total_references > 1:
        name_parts.append('_' + qutils.correct_name(seq_name[:20]))
    name_parts.append(ref_fasta_ext)
    seq_fname = ''.join(name_parts)

    corr_seq_fpath = qutils.unique_corrected_fpath(
        os.path.join(corrected_dirpath, seq_fname))
    corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
    corrected_ref_fpaths.append(corr_seq_fpath)

    # Same record goes to the per-reference file and to the combined reference.
    for out_fpath in (corr_seq_fpath, combined_ref_fpath):
        fastaparser.write_fasta(out_fpath, [(corr_seq_name, seq)], 'a')

    return corr_seq_name
def _run_logged(cmd, err_path, stdout_fpath=None):
    """Run ``cmd`` via ``qutils.call_subprocess`` with stderr appended to ``err_path``.

    When ``stdout_fpath`` is given, stdout is written to that file (truncating
    it); otherwise no ``stdout`` argument is passed. All files opened here are
    closed before returning.
    """
    with open(err_path, 'a') as err_f:
        if stdout_fpath is None:
            return qutils.call_subprocess(cmd, stderr=err_f, logger=logger)
        with open(stdout_fpath, 'w') as out_f:
            return qutils.call_subprocess(cmd, stdout=out_f, stderr=err_f, logger=logger)


def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run the Manta workflow for one reference and return the resulting BED path.

    Expects ``<ref>.sam`` in ``output_dirpath``. Intermediate products (sorted
    BAM, BAM index, FASTA index, Manta run directory, unpacked VCF) are reused
    when they already exist. Tool output is appended to ``err_path``.

    Returns ``bed_fpath`` (or ``<output_dirpath>/<ref>.bed`` when it is not
    given), or ``None`` when the SAM file is smaller than 1 MiB or the Manta
    configuration step did not produce ``runWorkflow.py``.

    Every file handed to a subprocess or to ``vcfToBedpe`` is opened in a
    ``with`` block so it is closed deterministically, and ``gunzip`` is invoked
    with an argument list so paths containing whitespace are passed intact.
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')

    # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
    if os.path.getsize(ref_sam_fpath) < 1024 * 1024:
        logger.info(' SAM file is too small for Manta (%d Kb), skipping..' % (os.path.getsize(ref_sam_fpath) // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info(' Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath

    # SAM -> BAM -> sorted BAM, then the BAM and FASTA indexes.
    if not os.path.exists(ref_bamsorted_fpath + '.bam'):
        _run_logged([samtools_fpath('samtools'), 'view', '-bS', ref_sam_fpath],
                    err_path, stdout_fpath=ref_bam_fpath)
        _run_logged([samtools_fpath('samtools'), 'sort', ref_bam_fpath, ref_bamsorted_fpath],
                    err_path)
    if not is_non_empty_file(ref_bamsorted_fpath + '.bam.bai'):
        _run_logged([samtools_fpath('samtools'), 'index', ref_bamsorted_fpath + '.bam'],
                    err_path)
    if not is_non_empty_file(cur_ref_fpath + '.fai'):
        _run_logged([samtools_fpath('samtools'), 'faidx', cur_ref_fpath], err_path)

    vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
    found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
    unpacked_SV_fpath = found_SV_fpath + '.unpacked'

    if not is_non_empty_file(found_SV_fpath):
        # Start from a clean run directory.
        if os.path.exists(vcfoutput_dirpath):
            shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
        os.makedirs(vcfoutput_dirpath)
        # The configuration step logs both stdout and stderr to the error log.
        with open(err_path, 'a') as err_f:
            qutils.call_subprocess(
                [config_manta_fpath, '--normalBam', ref_bamsorted_fpath + '.bam',
                 '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                stdout=err_f, stderr=err_f, logger=logger)
        workflow_fpath = os.path.join(vcfoutput_dirpath, 'runWorkflow.py')
        if not os.path.exists(workflow_fpath):
            return None
        _run_logged([workflow_fpath, '-m', 'local', '-j', str(qconfig.max_threads)], err_path)

    if not is_non_empty_file(unpacked_SV_fpath):
        _run_logged(['gunzip', '-c', found_SV_fpath], err_path, stdout_fpath=unpacked_SV_fpath)

    from manta import vcfToBedpe
    with open(unpacked_SV_fpath) as vcf_f:
        with open(ref_bed_fpath, 'w') as bed_f:
            vcfToBedpe.vcfToBedpe(vcf_f, bed_f)
    return ref_bed_fpath
def _partition_contigs(assemblies, ref_fpaths, corrected_dirpath, alignments_fpath_template):
    """Split every assembly into per-reference FASTA files plus a "not aligned" rest.

    Each line of an assembly's alignments file (``alignments_fpath_template %
    asm.name``) starts with a reference name followed by the names of the
    contigs aligned to it. Those contigs are appended to
    ``<asm.name>_to_<ref_name[:40]>.fasta`` in ``corrected_dirpath``; contigs
    that appear for no reference are written to
    ``<asm.name>_not_aligned_anywhere.fasta``.

    Returns ``(assemblies_by_ref, not_aligned_assemblies)``: a dict from
    reference name to a list of ``Assembly`` objects, and a list holding one
    not-aligned ``Assembly`` per input assembly.

    Changes against the previous version:
    * the alignments file is opened in a ``with`` block, so it is closed;
    * the assembly FASTA is parsed once per assembly instead of once per
      alignment line, and contig-name membership uses sets;
    * consequently an alignments file without any line now sends all contigs
      to the not-aligned file (before, no contig had been read at that point
      and the file came out empty);
    * blank lines in the alignments file are skipped instead of raising
      ``IndexError``.
    """
    not_aligned_assemblies = []
    # array of assemblies for each reference
    assemblies_by_ref = dict((qutils.name_from_fpath(ref_fpath), []) for ref_fpath in ref_fpaths)

    for asm in assemblies:
        not_aligned_fpath = os.path.join(
            corrected_dirpath, asm.name + '_not_aligned_anywhere.fasta')

        # Read the assembly a single time; keep the first record of a repeated name.
        records = list(fastaparser.read_fasta(asm.fpath))
        contigs = {}
        for cont_name, seq in records:
            contigs.setdefault(cont_name, seq)

        aligned_contig_names = set()
        with open(alignments_fpath_template % asm.name) as alignments_f:
            for line in alignments_f:
                values = line.split()
                if not values:
                    continue
                ref_name, ref_contigs_names = values[0], set(values[1:])
                ref_contigs_fpath = os.path.join(
                    corrected_dirpath, asm.name + '_to_' + ref_name[:40] + '.fasta')

                for cont_name, seq in records:
                    if cont_name in ref_contigs_names:
                        # Remember aligned names so the rest can be extracted afterwards.
                        aligned_contig_names.add(cont_name)
                        fastaparser.write_fasta(ref_contigs_fpath, [(cont_name, seq)], 'a')

                assemblies_by_ref[ref_name].append(Assembly(ref_contigs_fpath, asm.label))

        # Extract the contigs that were not aligned to any reference.
        not_aligned_contigs_names = set(contigs) - aligned_contig_names
        fastaparser.write_fasta(
            not_aligned_fpath,
            [(name, contigs[name]) for name in not_aligned_contigs_names])
        not_aligned_assemblies.append(Assembly(not_aligned_fpath, asm.label))

    return assemblies_by_ref, not_aligned_assemblies
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, arcs=False, similar=False, coverage_hist=None):
    """Produce the alignment plot for the given assemblies and reference.

    The chromosomes of the reference are ordered by decreasing length and
    concatenated into a virtual genome, with a spacer equal to 10% of the total
    reference length between consecutive chromosomes. Nucmer contig reports are
    parsed for each assembly using that coordinate system.

    Returns what ``draw_alignment_plot`` returns, or ``None`` if any contig
    report is parsed to ``None``.
    """
    reference_chromosomes = dict()
    total_genome_size = 0
    for header, sequence in fastaparser.read_fasta(ref_fpath):
        chr_name = header.split()[0]
        reference_chromosomes[chr_name] = len(sequence)
        total_genome_size += len(sequence)

    virtual_genome_shift = int(0.1 * total_genome_size)

    sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Running start coordinate of each chromosome on the virtual genome.
    position = 0
    cumulative_ref_lengths = [position]
    for ref_length in sorted_ref_lengths:
        position = position + virtual_genome_shift + ref_length
        cumulative_ref_lengths.append(position)
    virtual_genome_size = position - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(
            contig_report_fpath_pattern % assembly_name, sorted_ref_names, cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        lists_of_aligned_blocks.append(aligned_blocks)

    plot_fpath = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names, sorted_ref_lengths,
        virtual_genome_shift, output_dirpath, lists_of_aligned_blocks,
        arcs, similar, coverage_hist)
    return plot_fpath
def do(assemblies, labels, downloaded_dirpath, ref_txt_fpath=None):
    """Search for reference genomes and return the sorted list of their paths.

    Organisms are read from ``ref_txt_fpath`` if provided; otherwise they are
    taken from the ``process_blast`` results in descending score order.
    ``process_refs`` then receives those organisms along with the FASTA files
    already found under ``downloaded_dirpath``.

    Unless ``qconfig.debug`` is set, the ``blast.err`` file is removed at the
    end. The list returned by ``process_refs`` is sorted in place and returned.
    """
    logger.print_timestamp()

    def in_download_dir(fname):
        # All bookkeeping files and references live directly under this folder.
        return os.path.join(downloaded_dirpath, fname)

    err_fpath = in_download_dir('blast.err')
    contigs_names = [qutils.name_from_fpath(assembly.fpath) for assembly in assemblies]
    blast_check_fpath = in_download_dir('blast.check')
    files_sizes = dict([(assembly.fpath, os.path.getsize(assembly.fpath)) for assembly in assemblies])
    assemblies_fpaths = dict([(assembly.fpath, assembly) for assembly in assemblies])

    check_result = check_blast(blast_check_fpath, files_sizes, assemblies_fpaths, assemblies, labels)
    blast_assemblies, downloaded_organisms, not_founded_organisms = check_result

    if ref_txt_fpath:
        organisms = parse_refs_list(ref_txt_fpath)
        organisms_assemblies = None
    else:
        organisms = []
        scores_organisms, organisms_assemblies = process_blast(
            blast_assemblies, downloaded_dirpath, contigs_names, labels, blast_check_fpath, err_fpath)
        if scores_organisms:
            scores_organisms = sorted(scores_organisms, reverse=True)
            organisms = [organism for (_score, organism) in scores_organisms]

    downloaded_ref_fpaths = [
        in_download_dir(fname)
        for (_path, _dirs, fnames) in os.walk(downloaded_dirpath)
        for fname in fnames
        if qutils.check_is_fasta_file(fname)
    ]

    ref_fpaths = process_refs(
        organisms, labels, downloaded_dirpath, not_founded_organisms, contigs_names,
        downloaded_ref_fpaths, blast_check_fpath, err_fpath, organisms_assemblies)
    if not ref_fpaths:
        logger.main_info('Reference genomes are not found.')

    keep_err_log = qconfig.debug
    if not keep_err_log and os.path.exists(err_fpath):
        os.remove(err_fpath)

    ref_fpaths.sort()
    return ref_fpaths
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmes_petap.pl --ES`` on a FASTA file and return the genes from its GTF output.

    The tool writes into ``tmp_dirpath`` + the FASTA name (created if missing);
    its stdout and stderr are written to ``err_fpath``.

    Returns the list of genes parsed from every top-level file of that
    directory whose name ends with ``gtf``, or ``None`` when the tool exits
    with a non-zero code.

    The log file is now closed as soon as the tool finishes, and the directory
    listing uses the built-in ``next()`` instead of the Python-2-only
    ``.next()`` method.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')

    # NOTE(review): plain string concatenation, no path separator is inserted;
    # presumably callers pass a prefix that already ends as needed -- confirm.
    tmp_dirpath += qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(tmp_dirpath):
        os.mkdir(tmp_dirpath)

    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            ['perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
             '--sequence', fasta_fpath, '--out', tmp_dirpath],
            stdout=err_file,
            stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    genes = []
    # Only the files directly inside the output directory are inspected.
    _, _, fnames = next(os.walk(tmp_dirpath))
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(tmp_dirpath, fname)))
    return genes
def save_total_report(output_dirpath, min_contig, ref_fpath):
    """Build the total report payload and pass it to ``save``.

    The payload contains the formatted current date, the labels of all
    assemblies known to ``reporting``, the reference name (or
    ``qconfig.not_aligned_name`` without a reference), the index order of the
    assemblies, the grouped report table and ``min_contig``. It is saved under
    ``output_dirpath + total_report_fname``; the result of ``save`` is returned.
    """
    from libs import reporting

    asm_names = []
    for assembly_fpath in reporting.assembly_fpaths:
        asm_names.append(qutils.label_from_fpath(assembly_fpath))
    report = reporting.table(reporting.Fields.grouped_order)
    t = datetime.datetime.now()

    date_str = t.strftime('%d %B %Y, %A, %H:%M:%S')
    reference_name = qutils.name_from_fpath(ref_fpath) if ref_fpath else qconfig.not_aligned_name
    order = [position for position in range(len(asm_names))]

    return save(
        output_dirpath + total_report_fname,
        dict([
            ('date', date_str),
            ('assembliesNames', asm_names),
            ('referenceName', reference_name),
            ('order', order),
            ('report', report),
            ('minContig', min_contig),
        ]))
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath, cov_fpath=None, arcs=False, similar=False, coverage_hist=None):
    """Create the contig alignment plot and optionally the data for the HTML viewer.

    After ensuring the output directory exists, the reference chromosomes are
    arranged by decreasing length on a virtual genome with a constant spacer of
    100. For every assembly the nucmer contig report is parsed and each block
    receives the assembly name as its label.

    If ``draw_alignment_plot`` yields assemblies and
    ``qconfig.create_contig_alignment_html`` is enabled, ``js_data_gen`` is run.

    Returns the plot path, or ``None`` when a contig report is parsed to ``None``.
    """
    make_output_dir(output_dirpath)

    chr_names = []
    reference_chromosomes = dict()
    for header, sequence in fastaparser.read_fasta(ref_fpath):
        chr_names.append(header.split()[0])
        reference_chromosomes[chr_names[-1]] = len(sequence)

    virtual_genome_shift = 100
    sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
    sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)

    # Running start coordinate of every chromosome on the virtual genome.
    running_total = 0
    cumulative_ref_lengths = [running_total]
    for ref_length in sorted_ref_lengths:
        running_total = running_total + virtual_genome_shift + ref_length
        cumulative_ref_lengths.append(running_total)
    virtual_genome_size = running_total - virtual_genome_shift

    lists_of_aligned_blocks = []
    for contigs_fpath in contigs_fpaths:
        report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
        aligned_blocks = parse_nucmer_contig_report(report_fpath, sorted_ref_names, cumulative_ref_lengths)
        if aligned_blocks is None:
            return None
        for aligned_block in aligned_blocks:
            aligned_block.label = qutils.name_from_fpath(contigs_fpath)
        lists_of_aligned_blocks.append(aligned_blocks)

    plot_result = draw_alignment_plot(
        contigs_fpaths, virtual_genome_size, sorted_ref_names, sorted_ref_lengths,
        virtual_genome_shift, output_dirpath, lists_of_aligned_blocks,
        arcs, similar, coverage_hist)
    plot_fpath, assemblies = plot_result

    html_wanted = assemblies and qconfig.create_contig_alignment_html
    if html_wanted:
        js_data_gen(assemblies, contigs_fpaths, chr_names, reference_chromosomes,
                    output_dirpath, cov_fpath, ref_fpath, virtual_genome_size)
    return plot_fpath
def gm_es(tool_dirpath, fasta_fpath, err_fpath, index, tmp_dirpath, num_threads):
    """Run ``gmes_petap.pl`` in ``--ES`` mode and collect genes from the produced GTF files.

    Output is written to the directory ``tmp_dirpath`` + FASTA name, which is
    created when absent. stdout and stderr of the tool go to ``err_fpath``.

    Returns a list of genes parsed from the ``*gtf`` files found directly in
    that directory, or ``None`` if the tool's exit code is non-zero.

    The stderr log is opened in a ``with`` block so the handle is always
    closed, and ``next(os.walk(...))`` replaces the Python-2-only
    ``os.walk(...).next()``.
    """
    tool_exec_fpath = os.path.join(tool_dirpath, 'gmes_petap.pl')
    libs_dirpath = os.path.join(qconfig.LIBS_LOCATION, 'genemark-es', 'lib')

    # NOTE(review): the FASTA name is concatenated without a path separator;
    # verify against callers that `tmp_dirpath` is meant to be used as a prefix.
    out_dirpath = tmp_dirpath + qutils.name_from_fpath(fasta_fpath)
    if not os.path.isdir(out_dirpath):
        os.mkdir(out_dirpath)

    cmd = [
        'perl', '-I', libs_dirpath, tool_exec_fpath, '--ES', '--cores', str(num_threads),
        '--sequence', fasta_fpath, '--out', out_dirpath
    ]
    with open(err_fpath, 'w') as err_file:
        return_code = qutils.call_subprocess(
            cmd,
            stdout=err_file,
            stderr=err_file,
            indent=' ' + qutils.index_to_str(index))
    if return_code != 0:
        return None

    # First item yielded by os.walk describes the top-level directory only.
    _, _, fnames = next(os.walk(out_dirpath))
    genes = []
    for fname in fnames:
        if fname.endswith('gtf'):
            genes.extend(parse_gtf_out(os.path.join(out_dirpath, fname)))
    return genes
def main(args):
    """Run the MetaQUAST pipeline for the given command-line arguments.

    Visible flow:
      1. parse options and set up the output / corrected-files directories;
      2. correct the references (or, when none were given, try to obtain them
         through ``search_references_meta.do``) and the assemblies;
      3. run quast against the combined reference;
      4. for downloaded references, drop the ones filtered out by
         ``get_downloaded_refs_with_alignments`` and rerun the combined step;
      5. partition the contigs per reference and run quast for each bin, then
         for the contigs that were not aligned anywhere;
      6. build the summary reports and clean up.

    Returns 4 when ``correct_assemblies`` yields no assemblies, ``None`` when
    ``genome_info.txt`` is missing after the combined run, otherwise the value
    of ``logger.finish_up``. Exits the process with code 0 when no arguments
    are given or when no references are available after the search step.

    NOTE(review): another ``def main(args)`` appears later in this file; if
    both live in the same module the later definition replaces this one.
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error(
            'QUAST does not support spaces in paths. \n'
            'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
            'Please, put QUAST in a different directory, then try again.\n',
            to_stderr=True, exit_with_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    metaquast_path = [os.path.realpath(__file__)]
    quast_py_args, contigs_fpaths = parse_options(logger, metaquast_path + args, is_metaquast=True)
    output_dirpath, ref_fpaths, labels = qconfig.output_dirpath, qconfig.reference, qconfig.labels
    html_report = qconfig.html_report
    test_mode = qconfig.test

    # Directories
    output_dirpath, _, _ = qutils.set_up_output_dir(output_dirpath, None, not output_dirpath, save_json=False)
    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    qconfig.set_max_threads(logger)
    qutils.logger = logger

    ########################################################################

    from libs import reporting
    reload(reporting)
    from libs import plotter

    # Always start from an empty directory for corrected files
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')
        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            correct_meta_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    assemblies, labels = correct_assemblies(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold."
        )
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice(
                "Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled"
            )
        else:
            if qconfig.references_txt:
                logger.main_info(
                    "List of references was provided, starting to download reference genomes from NCBI..."
                )
            else:
                logger.main_info(
                    "No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                    "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, qconfig.references_txt)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                # References listed by the user are not treated as "downloaded",
                # so they skip the filtering step further below
                if not qconfig.references_txt:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    correct_meta_references(ref_fpaths, corrected_dirpath)
            elif test_mode and not ref_fpaths:
                logger.error(
                    'Failed to download or setup SILVA 16S rRNA database for working without '
                    'references on metagenome datasets!',
                    to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice(
            'No references are provided, starting regular QUAST with MetaGeneMark gene finder'
        )
        _start_quast_main(quast_py_args, assemblies=assemblies, output_dirpath=output_dirpath)
        exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)

    reads_fpaths = []
    if qconfig.forward_reads:
        reads_fpaths.append(qconfig.forward_reads)
    if qconfig.reverse_reads:
        reads_fpaths.append(qconfig.reverse_reads)
    if (reads_fpaths or qconfig.sam or qconfig.bam) and ref_fpaths:
        # NOTE(review): cov_fpath is assigned here but not used again in this function
        bed_fpath, cov_fpath, _ = reads_analyzer.do(
            combined_ref_fpath, contigs_fpaths, reads_fpaths, corrected_ref_fpaths,
            os.path.join(combined_output_dirpath, qconfig.variation_dirname),
            external_logger=logger, sam_fpath=qconfig.sam, bam_fpath=qconfig.bam, bed_fpath=qconfig.bed)
        qconfig.bed = bed_fpath

    # Forward the read-mapping inputs to every quast run started below
    if qconfig.bed:
        quast_py_args += ['--sv-bed']
        quast_py_args += [qconfig.bed]
    if qconfig.sam:
        quast_py_args += ['--sam']
        quast_py_args += [qconfig.sam]
    if qconfig.bam:
        quast_py_args += ['--bam']
        quast_py_args += [qconfig.bam]

    # The scaffolds option is removed from the arguments of the quast runs
    for arg in args:
        if arg in ('-s', "--scaffolds"):
            quast_py_args.remove(arg)

    quast_py_args += ['--combined-ref']
    if qconfig.draw_plots or qconfig.html_report:
        if plotter.dict_color_and_ls:
            # Pass the colour / line style of each assembly explicitly
            colors_and_ls = [
                plotter.dict_color_and_ls[asm.label] for asm in assemblies
            ]
            quast_py_args += ['--colors']
            quast_py_args += [','.join([style[0] for style in colors_and_ls])]
            quast_py_args += ['--ls']
            quast_py_args += [','.join([style[1] for style in colors_and_ls])]

    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    # json_texts collects one JSON text per quast run; None means no HTML report
    if qconfig.html_report:
        from libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    # With is_first_run=True the result is unpacked into four values
    return_code, total_num_notifications, assemblies, labels = \
        _start_quast_main(quast_py_args + ([] if qconfig.unique_mapping else ["--ambiguity-usage", 'one']),
                          assemblies=assemblies,
                          reference_fpath=combined_ref_fpath,
                          output_dirpath=combined_output_dirpath,
                          num_notifications_tuple=total_num_notifications,
                          is_first_run=True)
    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    # A missing genome_info.txt is treated as "nothing aligned": finish early
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        logger.main_info(
            'Failed aligning the contigs for all the references. ' +
            ('Try to restart MetaQUAST with another references.'
             if not downloaded_refs else
             'Try to use option --max-ref-number to change maximum number of references '
             '(per each assembly) to download.'))
        logger.main_info('')
        cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
        return

    if downloaded_refs:
        logger.main_info()
        logger.main_info(
            'Excluding downloaded references with low genome fraction from further analysis..'
        )
        corr_ref_fpaths = get_downloaded_refs_with_alignments(
            genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            # Some references were dropped: rebuild the combined reference and rerun
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names = \
                correct_meta_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = \
                _start_quast_main(quast_py_args + ([] if qconfig.unique_mapping else ["--ambiguity-usage", 'one']),
                                  assemblies=assemblies,
                                  reference_fpath=combined_ref_fpath,
                                  output_dirpath=combined_output_dirpath,
                                  num_notifications_tuple=total_num_notifications,
                                  is_first_run=True)
            if json_texts is not None:
                # Replace the JSON text of the first combined run with the new one
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info(
                'All downloaded references have genome fraction more than 10%. Nothing was excluded.'
            )
        else:
            logger.main_info(
                'All downloaded references have low genome fraction. Nothing was excluded for now.'
            )

    # Arguments for the per-reference runs
    quast_py_args += ['--no-check-meta']
    # Keep only the thresholds above --min-contig; qconfig.contig_thresholds
    # becomes a comma-separated string (or the literal 'None') from here on
    qconfig.contig_thresholds = ','.join([
        str(threshold) for threshold in qconfig.contig_thresholds
        if threshold > qconfig.min_contig
    ])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = remove_from_quast_py_args(quast_py_args, '--contig-thresholds', qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info(
        'Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'),
        labels)

    # ref_names is rebuilt: only references that received contigs are kept
    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' + ref_name + ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)
            # Without is_first_run the result is unpacked into two values
            return_code, total_num_notifications = _start_quast_main(
                quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(
                assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')
        return_code, total_num_notifications = _start_quast_main(
            quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            num_notifications_tuple=total_num_notifications)
        if return_code not in [0, 4]:
            logger.error(
                'Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Summary reports are only built when at least one reference got contigs
    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(
                output_dirpath)
        else:
            html_summary_report_fpath = None
        from libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [
            reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION,
            reporting.Fields.MIS_INVERTION, reporting.Fields.MIS_ISTRANSLOCATIONS
        ]
        # The "not aligned" bin is reported only when such a run took place
        create_meta_summary.do(
            html_summary_report_fpath, summary_output_dirpath,
            combined_output_dirpath, output_dirpath_per_ref, metrics_for_plots,
            misassembl_metrics,
            ref_names if no_unaligned_contigs else ref_names + [qconfig.not_aligned_name])
        if html_report and json_texts:
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls, meta=True)
            if qconfig.create_icarus_html:
                icarus_html_fpath = html_saver.create_meta_icarus(
                    output_dirpath, ref_names)
                logger.main_info(' Icarus (contig browser) is saved to %s' % icarus_html_fpath)
            html_saver.create_meta_report(output_dirpath, json_texts)

    cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    return logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
def _split_scaffold_at_n_runs(name, seq, min_run_length):
    """Split *seq* at every run of 'N' that is at least *min_run_length* long.

    Returns a list of ``(contig_name, contig_seq)`` tuples. Contig names are
    the first word of *name* followed by ``_1``, ``_2``, ... The list always
    has at least one entry (the tail after the last split point).
    """
    base_name = name.split()[0]
    seq_len = len(seq)
    pieces = []
    piece_start = 0
    search_from = 0
    while search_from < seq_len:
        run_start = seq.find('N', search_from)
        if run_start == -1:
            break
        run_end = run_start + 1
        while run_end != seq_len and seq[run_end] == 'N':
            run_end += 1
        # seq[run_end] is either past the end or not an 'N', so it can be skipped
        search_from = run_end + 1
        if run_end - run_start >= min_run_length:
            pieces.append((base_name + "_" + str(len(pieces) + 1), seq[piece_start:run_start]))
            piece_start = run_end
    pieces.append((base_name + "_" + str(len(pieces) + 1), seq[piece_start:]))
    return pieces


def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Choose the corrected path of one assembly and optionally break its scaffolds.

    :param file_counter: index of the assembly in *labels*
    :param contigs_fpath: path of the original assembly file
    :param corrected_dirpath: directory for corrected / broken files
    :param labels: labels of all assemblies
    :return: ``(corr_fpaths, broken_scaffolds, logs)`` where ``corr_fpaths`` is
        ``(contigs_fpath, corrected_fpath)``, ``broken_scaffolds`` is ``None``
        or a pair holding the path of the written ``_broken`` file twice, and
        ``logs`` is the list of messages collected for the caller

    When ``qconfig.scaffolds`` is set, every sequence is split at runs of 'N'
    of at least ``qconfig.Ns_break_threshold`` characters. The ``_broken``
    file is written only if at least one split happened; an input without
    sequences therefore produces the "nothing was broken" warning instead of
    an empty file.
    """
    broken_scaffolds = None
    contigs_fname = os.path.basename(contigs_fpath)
    _, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    index_str = qutils.index_to_str(file_counter, force=(len(labels) > 1))

    logs = [' ' + index_str + '%s ==> %s' % (contigs_fpath, label)]

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        logger.info(' ' + index_str + ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0
        # Counting from 1 keeps the total at 0 for an input without sequences
        scaffolds_total = 0
        for scaffolds_total, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath), 1):
            pieces = _split_scaffold_at_n_runs(name, seq, qconfig.Ns_break_threshold)
            broken_scaffolds_fasta.extend(pieces)
            contigs_counter += len(pieces)

        if scaffolds_total != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + index_str +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + index_str +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path):
    """Map reads to the reference and derive a BED file of structural variations.

    Steps, each skipped when its output file already exists and is non-empty:
    run Bowtie2, sort the SAM file with samtools, optionally split the SAM
    file per reference (when *meta_ref_fpaths* is given), scan the sorted SAM
    file for "trivial deletions", then combine them with the result of
    ``search_sv_with_manta`` when ``config_manta_fpath`` exists.

    :param main_ref_fpath: reference FASTA the reads are mapped to
    :param meta_ref_fpaths: individual reference files; falsy disables splitting
    :param ref_labels: mapping of sequence name -> reference name
    :param reads_fpaths: read files; indices 0 and 1 are passed as -1 / -2
    :param output_dirpath: working directory for intermediate files
    :param res_path: directory of the resulting ``<ref_name>.bed``
    :param log_path: file that receives stdout of the Bowtie2 commands
    :param err_path: file that receives stderr of the external tools
    :return: path of the BED file, or ``None`` on failure

    NOTE(review): a second ``def run_processing_reads`` with the same
    signature appears later in this file; if both live in the same module the
    later definition replaces this one.
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info(' ' + 'Pre-processing for searching structural variations...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info(' Running Bowtie2...')
        abs_reads_fpaths = [
        ]  # use absolute paths because we will change workdir
        for reads_fpath in reads_fpaths:
            abs_reads_fpaths.append(os.path.abspath(reads_fpath))

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        cmd = [bin_fpath('bowtie2-build'), main_ref_fpath, ref_name]
        qutils.call_subprocess(cmd, stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)

        # NOTE(review): exactly two read files are assumed here
        cmd = bin_fpath('bowtie2') + ' -x ' + ref_name + ' -1 ' + abs_reads_fpaths[0] + ' -2 ' + abs_reads_fpaths[1] + ' -S ' + \
              sam_fpath + ' --no-unal -a -p %s' % str(qconfig.max_threads)
        qutils.call_subprocess(shlex.split(cmd), stdout=open(log_path, 'a'), stderr=open(err_path, 'a'), logger=logger)
        logger.info(' Done.')
        os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running Bowtie2 for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None

    logger.info(' Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # SAM -> BAM -> sorted BAM -> sorted SAM
        qutils.call_subprocess([
            samtools_fpath('samtools'), 'view', '-@',
            str(qconfig.max_threads), '-bS', sam_fpath
        ], stdout=open(bam_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([
            samtools_fpath('samtools'), 'sort', '-@',
            str(qconfig.max_threads), bam_fpath, bam_sorted_fpath
        ], stderr=open(err_path, 'a'), logger=logger)
        qutils.call_subprocess([
            samtools_fpath('samtools'), 'view', '-@',
            str(qconfig.max_threads), bam_sorted_fpath + '.bam'
        ], stdout=open(sam_sorted_fpath, 'w'), stderr=open(err_path, 'a'), logger=logger)

    if meta_ref_fpaths:
        logger.info(' Splitting SAM-file by references...')

    # Read the header lines of the unsorted SAM file: all '@' lines are kept,
    # and sequence lengths are taken from the @SQ lines
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())

    need_ref_splitting = False
    if meta_ref_fpaths:
        # reference name -> open per-reference SAM file, or None when that
        # file already exists and must not be rewritten
        ref_files = {}
        for cur_ref_fpath in meta_ref_fpaths:
            ref = qutils.name_from_fpath(cur_ref_fpath)
            new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
            if is_non_empty_file(new_ref_sam_fpath):
                logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                ref_files[ref] = None
            else:
                # Header of the split file: first header line, the @SQ lines
                # of this reference, then the last header line
                new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                new_ref_sam_file.write(headers[0] + '\n')
                chrs = []
                for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                    seq_name = h.split('\tSN:')[1].split('\t')[0]
                    if seq_name in ref_labels and ref_labels[seq_name] == ref:
                        new_ref_sam_file.write(h + '\n')
                        chrs.append(seq_name)
                new_ref_sam_file.write(headers[-1] + '\n')
                ref_files[ref] = new_ref_sam_file
                need_ref_splitting = True

    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
    logger.info(
        ' Looking for trivial deletions (long zero-covered fragments)...')
    need_trivial_deletions = True
    if os.path.exists(trivial_deletions_fpath):
        need_trivial_deletions = False
        logger.info(' Using existing file: ' + trivial_deletions_fpath)

    # A single pass over the sorted SAM file serves both purposes
    if need_trivial_deletions or need_ref_splitting:
        with open(sam_sorted_fpath) as sam_file:
            cur_deletion = None
            for line in sam_file:
                mapping = Mapping.parse(line)
                if mapping:
                    # common case: continue current deletion (potential) on the same reference
                    if cur_deletion and cur_deletion.ref == mapping.ref:
                        if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                            # just passed 0-covered fragment
                            if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                cur_deletion.set_next_bad(mapping)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(
                                        mapping.ref).set_prev_good(mapping)
                            # continue region BEFORE 0-covered fragment
                            elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_prev_good(mapping)
                            else:
                                cur_deletion.set_prev_bad(mapping)
                        else:  # previous mapping was in region AFTER 0-covered fragment
                            # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                            if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                if cur_deletion.is_valid(
                                ):  # add previous fragment's deletion if needed
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(
                                    mapping.ref).set_prev_bad(
                                        position=cur_deletion.next_bad_end)
                            # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                            if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                cur_deletion.set_next_good(mapping)
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                                cur_deletion = QuastDeletion(
                                    mapping.ref).set_prev_good(mapping)
                            else:
                                cur_deletion.set_next_bad_end(mapping)
                    # special case: just started or just switched to the next reference
                    else:
                        if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                            cur_deletion.set_next_good(
                                position=seq_name_length[cur_deletion.ref])
                            if cur_deletion.is_valid():
                                deletions.append(cur_deletion)
                        cur_deletion = QuastDeletion(
                            mapping.ref).set_prev_good(mapping)

                    if need_ref_splitting:
                        # A record is copied only when its mate is on the same
                        # sequence ('=') or on a sequence of the same reference
                        cur_ref = ref_labels[mapping.ref]
                        if mapping.ref_next.strip(
                        ) == '=' or cur_ref == ref_labels[mapping.ref_next]:
                            if ref_files[cur_ref] is not None:
                                ref_files[cur_ref].write(line)
            # Close the deletion that is still open at the end of the file
            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                cur_deletion.set_next_good(
                    position=seq_name_length[cur_deletion.ref])
                if cur_deletion.is_valid():
                    deletions.append(cur_deletion)
        if need_ref_splitting:
            for ref_handler in ref_files.values():
                if ref_handler is not None:
                    ref_handler.close()
        if need_trivial_deletions:
            logger.info(' Trivial deletions: %d found' % len(deletions))
            logger.info(' Saving to: ' + trivial_deletions_fpath)
            with open(trivial_deletions_fpath, 'w') as f:
                for deletion in deletions:
                    f.write(str(deletion) + '\n')

    # Final BED: Manta results plus trivial deletions, or trivial deletions only
    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info(' Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info(' Failed searching structural variations.')
        return None
def process_one_ref(cur_ref_fpath, output_dirpath, err_path, bed_fpath=None):
    """Run Manta for one reference and convert its VCF output to a BED file.

    Expects ``<output_dirpath>/<ref>.sam`` to exist, where ``<ref>`` is
    ``qutils.name_from_fpath(cur_ref_fpath)``. Intermediate files (sorted BAM,
    BAM index, FASTA index, Manta results, unpacked VCF) are reused when they
    are already present.

    :param cur_ref_fpath: reference FASTA file
    :param output_dirpath: directory with the SAM file and all intermediate files
    :param err_path: file that receives the output of the external tools
        (opened in append mode)
    :param bed_fpath: path of the resulting BED file; defaults to
        ``<output_dirpath>/<ref>.bed``
    :return: path of the BED file, or ``None`` when the SAM file is smaller
        than 1 MiB or Manta's configuration step did not create
        ``runWorkflow.py``
    """
    ref = qutils.name_from_fpath(cur_ref_fpath)
    ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
    ref_bam_fpath = os.path.join(output_dirpath, ref + '.bam')
    ref_bamsorted_fpath = os.path.join(output_dirpath, ref + '.sorted')
    sorted_bam_fpath = ref_bamsorted_fpath + '.bam'
    ref_bed_fpath = bed_fpath if bed_fpath else os.path.join(output_dirpath, ref + '.bed')

    sam_size = os.path.getsize(ref_sam_fpath)
    if sam_size < 1024 * 1024:  # TODO: make it better (small files will cause Manta crush -- "not enough reads...")
        logger.info(' SAM file is too small for Manta (%d Kb), skipping..' % (sam_size // 1024))
        return None
    if is_non_empty_file(ref_bed_fpath):
        logger.info(' Using existing Manta BED-file: ' + ref_bed_fpath)
        return ref_bed_fpath

    # One append-mode handle is shared by all tools and is closed on every
    # exit path, including the early return below.
    with open(err_path, 'a') as err_file:
        if not os.path.exists(sorted_bam_fpath):
            with open(ref_bam_fpath, 'w') as bam_file:
                qutils.call_subprocess([samtools_fpath('samtools'), 'view', '-bS', ref_sam_fpath],
                                       stdout=bam_file, stderr=err_file, logger=logger)
            qutils.call_subprocess([samtools_fpath('samtools'), 'sort', ref_bam_fpath, ref_bamsorted_fpath],
                                   stderr=err_file, logger=logger)
        if not is_non_empty_file(sorted_bam_fpath + '.bai'):
            qutils.call_subprocess([samtools_fpath('samtools'), 'index', sorted_bam_fpath],
                                   stderr=err_file, logger=logger)
        if not is_non_empty_file(cur_ref_fpath + '.fai'):
            qutils.call_subprocess([samtools_fpath('samtools'), 'faidx', cur_ref_fpath],
                                   stderr=err_file, logger=logger)

        vcfoutput_dirpath = os.path.join(output_dirpath, ref + '_manta')
        found_SV_fpath = os.path.join(vcfoutput_dirpath, 'results/variants/diploidSV.vcf.gz')
        unpacked_SV_fpath = found_SV_fpath + '.unpacked'
        if not is_non_empty_file(found_SV_fpath):
            # Start from an empty run directory
            if os.path.exists(vcfoutput_dirpath):
                shutil.rmtree(vcfoutput_dirpath, ignore_errors=True)
            os.makedirs(vcfoutput_dirpath)
            # The configuration step is expected to create runWorkflow.py
            qutils.call_subprocess([config_manta_fpath, '--normalBam', sorted_bam_fpath,
                                    '--referenceFasta', cur_ref_fpath, '--runDir', vcfoutput_dirpath],
                                   stdout=err_file, stderr=err_file, logger=logger)
            workflow_fpath = os.path.join(vcfoutput_dirpath, 'runWorkflow.py')
            if not os.path.exists(workflow_fpath):
                return None
            qutils.call_subprocess([workflow_fpath, '-m', 'local', '-j', str(qconfig.max_threads)],
                                   stderr=err_file, logger=logger)

        if not is_non_empty_file(unpacked_SV_fpath):
            # The argument list is passed as-is, so paths with spaces stay intact
            with open(unpacked_SV_fpath, 'w') as unpacked_file:
                qutils.call_subprocess(['gunzip', '-c', found_SV_fpath],
                                       stdout=unpacked_file, stderr=err_file, logger=logger)

    from manta import vcfToBedpe
    # Closing the BED handle before returning guarantees the file is flushed
    with open(unpacked_SV_fpath) as vcf_file:
        with open(ref_bed_fpath, 'w') as bed_file:
            vcfToBedpe.vcfToBedpe(vcf_file, bed_file)
    return ref_bed_fpath
def _call_redirected(cmd, err_path, stdout_fpath=None, stdout_mode='a'):
    """Run *cmd* through qutils.call_subprocess and close the redirect files afterwards.

    stderr is appended to *err_path*; stdout goes to *stdout_fpath* (opened
    with *stdout_mode*) when given, otherwise it is not redirected.
    """
    with open(err_path, 'a') as err_file:
        if stdout_fpath is None:
            return qutils.call_subprocess(cmd, stderr=err_file, logger=logger)
        with open(stdout_fpath, stdout_mode) as out_file:
            return qutils.call_subprocess(cmd, stdout=out_file, stderr=err_file, logger=logger)


def run_processing_reads(main_ref_fpath, meta_ref_fpaths, ref_labels, reads_fpaths, output_dirpath, res_path, log_path, err_path):
    """Map reads to the reference and derive a BED file of structural variations.

    Steps, each skipped when its output file already exists and is non-empty:
    run Bowtie2, sort the SAM file with samtools, optionally split the SAM
    file per reference (when *meta_ref_fpaths* is given), scan the sorted SAM
    file for "trivial deletions", then combine them with the result of
    ``search_sv_with_manta`` when ``config_manta_fpath`` exists.

    :param main_ref_fpath: reference FASTA the reads are mapped to
    :param meta_ref_fpaths: individual reference files; falsy disables splitting
    :param ref_labels: mapping of sequence name -> reference name
    :param reads_fpaths: read files; the first two are passed to Bowtie2 as
        ``-1`` / ``-2``
    :param output_dirpath: working directory for intermediate files
    :param res_path: directory of the resulting ``<ref_name>.bed``
    :param log_path: file that receives stdout of the Bowtie2 commands
    :param err_path: file that receives stderr of the external tools
    :return: path of the BED file, or ``None`` on failure (including the case
        where Bowtie2 has to run but fewer than two read files were given)
    """
    ref_name = qutils.name_from_fpath(main_ref_fpath)
    sam_fpath = os.path.join(output_dirpath, ref_name + '.sam')
    bam_fpath = os.path.join(output_dirpath, ref_name + '.bam')
    bam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted')
    sam_sorted_fpath = os.path.join(output_dirpath, ref_name + '.sorted.sam')
    bed_fpath = os.path.join(res_path, ref_name + '.bed')

    if is_non_empty_file(bed_fpath):
        logger.info(' Using existing BED-file: ' + bed_fpath)
        return bed_fpath

    logger.info(' ' + 'Pre-processing for searching structural variations...')
    logger.info(' ' + 'Logging to %s...' % err_path)
    if is_non_empty_file(sam_fpath):
        logger.info(' Using existing SAM-file: ' + sam_fpath)
    else:
        logger.info(' Running Bowtie2...')
        # use absolute paths because we will change workdir
        abs_reads_fpaths = [os.path.abspath(reads_fpath) for reads_fpath in reads_fpaths]
        if len(abs_reads_fpaths) < 2:
            # Bowtie2 is run in paired-end mode (-1/-2), so both files are required
            logger.error(' Failed running Bowtie2 for the reference: two files with reads are required.')
            logger.info(' Failed searching structural variations.')
            return None

        prev_dir = os.getcwd()
        os.chdir(output_dirpath)
        try:
            _call_redirected([bin_fpath('bowtie2-build'), main_ref_fpath, ref_name],
                             err_path, stdout_fpath=log_path)
            # The argument list is passed as-is, so paths with spaces stay intact
            _call_redirected([bin_fpath('bowtie2'), '-x', ref_name,
                              '-1', abs_reads_fpaths[0], '-2', abs_reads_fpaths[1],
                              '-S', sam_fpath, '--no-unal', '-a', '-p', str(qconfig.max_threads)],
                             err_path, stdout_fpath=log_path)
            logger.info(' Done.')
        finally:
            # Restore the working directory even when a call raises
            os.chdir(prev_dir)
        if not os.path.exists(sam_fpath) or os.path.getsize(sam_fpath) == 0:
            logger.error(' Failed running Bowtie2 for the reference. See ' + log_path + ' for information.')
            logger.info(' Failed searching structural variations.')
            return None

    logger.info(' Sorting SAM-file...')
    if is_non_empty_file(sam_sorted_fpath):
        logger.info(' Using existing sorted SAM-file: ' + sam_sorted_fpath)
    else:
        # SAM -> BAM -> sorted BAM -> sorted SAM
        _call_redirected([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), '-bS', sam_fpath],
                         err_path, stdout_fpath=bam_fpath, stdout_mode='w')
        _call_redirected([samtools_fpath('samtools'), 'sort', '-@', str(qconfig.max_threads), bam_fpath, bam_sorted_fpath],
                         err_path)
        _call_redirected([samtools_fpath('samtools'), 'view', '-@', str(qconfig.max_threads), bam_sorted_fpath + '.bam'],
                         err_path, stdout_fpath=sam_sorted_fpath, stdout_mode='w')

    if meta_ref_fpaths:
        logger.info(' Splitting SAM-file by references...')

    # Read the header lines of the unsorted SAM file: all '@' lines are kept,
    # and sequence lengths are taken from the @SQ lines
    headers = []
    seq_name_length = {}
    with open(sam_fpath) as sam_file:
        for line in sam_file:
            if not line.startswith('@'):
                break
            if line.startswith('@SQ') and 'SN:' in line and 'LN:' in line:
                seq_name = line.split('\tSN:')[1].split('\t')[0]
                seq_length = int(line.split('\tLN:')[1].split('\t')[0])
                seq_name_length[seq_name] = seq_length
            headers.append(line.strip())

    need_ref_splitting = False
    # reference name -> open per-reference SAM file, or None when that file
    # already exists and must not be rewritten
    ref_files = {}
    if meta_ref_fpaths:
        for cur_ref_fpath in meta_ref_fpaths:
            ref = qutils.name_from_fpath(cur_ref_fpath)
            new_ref_sam_fpath = os.path.join(output_dirpath, ref + '.sam')
            if is_non_empty_file(new_ref_sam_fpath):
                logger.info(' Using existing split SAM-file for %s: %s' % (ref, new_ref_sam_fpath))
                ref_files[ref] = None
            else:
                # Header of the split file: first header line, the @SQ lines
                # of this reference, then the last header line
                new_ref_sam_file = open(new_ref_sam_fpath, 'w')
                ref_files[ref] = new_ref_sam_file
                new_ref_sam_file.write(headers[0] + '\n')
                for h in (h for h in headers if h.startswith('@SQ') and 'SN:' in h):
                    seq_name = h.split('\tSN:')[1].split('\t')[0]
                    if seq_name in ref_labels and ref_labels[seq_name] == ref:
                        new_ref_sam_file.write(h + '\n')
                new_ref_sam_file.write(headers[-1] + '\n')
                need_ref_splitting = True

    deletions = []
    trivial_deletions_fpath = os.path.join(output_dirpath, qconfig.trivial_deletions_fname)
    logger.info(' Looking for trivial deletions (long zero-covered fragments)...')
    need_trivial_deletions = True
    if os.path.exists(trivial_deletions_fpath):
        need_trivial_deletions = False
        logger.info(' Using existing file: ' + trivial_deletions_fpath)

    # A single pass over the sorted SAM file serves both purposes
    if need_trivial_deletions or need_ref_splitting:
        try:
            with open(sam_sorted_fpath) as sam_file:
                cur_deletion = None
                for line in sam_file:
                    mapping = Mapping.parse(line)
                    if mapping:
                        # common case: continue current deletion (potential) on the same reference
                        if cur_deletion and cur_deletion.ref == mapping.ref:
                            if cur_deletion.next_bad is None:  # previous mapping was in region BEFORE 0-covered fragment
                                # just passed 0-covered fragment
                                if mapping.start - cur_deletion.prev_bad > QuastDeletion.MIN_GAP:
                                    cur_deletion.set_next_bad(mapping)
                                    if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                        cur_deletion.set_next_good(mapping)
                                        if cur_deletion.is_valid():
                                            deletions.append(cur_deletion)
                                        cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                # continue region BEFORE 0-covered fragment
                                elif mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_prev_good(mapping)
                                else:
                                    cur_deletion.set_prev_bad(mapping)
                            else:  # previous mapping was in region AFTER 0-covered fragment
                                # just passed another 0-cov fragment between end of cur_deletion BAD region and this mapping
                                if mapping.start - cur_deletion.next_bad_end > QuastDeletion.MIN_GAP:
                                    if cur_deletion.is_valid():  # add previous fragment's deletion if needed
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_bad(position=cur_deletion.next_bad_end)
                                # continue region AFTER 0-covered fragment (old one or new/another one -- see "if" above)
                                if mapping.mapq >= Mapping.MIN_MAP_QUALITY:
                                    cur_deletion.set_next_good(mapping)
                                    if cur_deletion.is_valid():
                                        deletions.append(cur_deletion)
                                    cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)
                                else:
                                    cur_deletion.set_next_bad_end(mapping)
                        # special case: just started or just switched to the next reference
                        else:
                            if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                                cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                                if cur_deletion.is_valid():
                                    deletions.append(cur_deletion)
                            cur_deletion = QuastDeletion(mapping.ref).set_prev_good(mapping)

                        if need_ref_splitting:
                            # A record is copied only when its mate is on the same
                            # sequence ('=') or on a sequence of the same reference
                            cur_ref = ref_labels[mapping.ref]
                            if mapping.ref_next.strip() == '=' or cur_ref == ref_labels[mapping.ref_next]:
                                if ref_files[cur_ref] is not None:
                                    ref_files[cur_ref].write(line)
                # Close the deletion that is still open at the end of the file
                if cur_deletion and cur_deletion.ref in seq_name_length:  # switched to the next ref
                    cur_deletion.set_next_good(position=seq_name_length[cur_deletion.ref])
                    if cur_deletion.is_valid():
                        deletions.append(cur_deletion)
        finally:
            # Per-reference files are closed even when the scan fails
            for ref_handler in ref_files.values():
                if ref_handler is not None:
                    ref_handler.close()
        if need_trivial_deletions:
            logger.info(' Trivial deletions: %d found' % len(deletions))
            logger.info(' Saving to: ' + trivial_deletions_fpath)
            with open(trivial_deletions_fpath, 'w') as f:
                for deletion in deletions:
                    f.write(str(deletion) + '\n')

    # Final BED: Manta results plus trivial deletions, or trivial deletions only
    if os.path.exists(config_manta_fpath):
        manta_sv_fpath = search_sv_with_manta(main_ref_fpath, meta_ref_fpaths, output_dirpath, err_path)
        qutils.cat_files([manta_sv_fpath, trivial_deletions_fpath], bed_fpath)
    elif os.path.exists(trivial_deletions_fpath):
        shutil.copy(trivial_deletions_fpath, bed_fpath)

    if os.path.exists(bed_fpath):
        logger.main_info(' Structural variations saved to ' + bed_fpath)
        return bed_fpath
    else:
        logger.main_info(' Failed searching structural variations.')
        return None
def main(args):
    """Entry point of MetaQUAST: evaluate assemblies against several references.

    Steps, in order:
      1. parse the command line (``args`` is the argument list without the
         program name) and set the corresponding ``qconfig`` fields;
      2. correct the references and the contigs into the output directory;
      3. if no references were given, try to obtain them through
         ``search_references_meta.do`` (skipped when ``qconfig.max_references``
         is 0); if there are still none, run a single regular QUAST and exit;
      4. run QUAST for the combined reference (optionally preceded by read
         analysis when reads were supplied);
      5. for downloaded references, drop badly covered ones and re-run the
         combined analysis when the set changed;
      6. partition contigs by reference and run QUAST per reference, then for
         the contigs not aligned anywhere;
      7. build the summary (and the HTML report when enabled).

    Returns 4 when no assembly contains usable contigs, otherwise None.
    May terminate the process through ``sys.exit`` (usage/help/version, option
    errors, the no-reference run).
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error('QUAST does not support spaces in paths. \n'
                     'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
                     'Please, put QUAST in a different directory, then try again.\n',
                     to_stderr=True,
                     exit_with_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    genes = []
    operons = []
    html_report = qconfig.html_report
    make_latest_symlink = True
    ref_txt_fpath = None

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage(meta=True)
        sys.exit(2)

    # Arguments forwarded to the individual quast.py runs; MetaQUAST-only
    # options are stripped from this list while parsing.
    quast_py_args = args[:]
    test_mode = False

    # First pass: options that must take effect before anything else.
    # Iterate over a snapshot: the loop body removes from and appends to
    # `options`, and mutating a list while iterating it silently skips the
    # element that follows every removed one (e.g. "-d --test" lost "--test").
    for opt, arg in list(options):
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt == '--test' or opt == '--test-no-ref':
            options.remove((opt, arg))
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            options += [('-o', 'quast_test_output')]
            if opt == '--test':
                options += [('-R', ','.join([os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_1.fasta'),
                                             os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_2.fasta'),
                                             os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_3.fasta')]))]
            contigs_fpaths += [os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_contigs_1.fasta'),
                               os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_contigs_2.fasta')]
            test_mode = True

        elif opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", meta=True, short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version(meta=True)
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage(meta=True)
        sys.exit(2)

    ref_fpaths = []
    combined_ref_fpath = ''
    reads_fpath_f = ''
    reads_fpath_r = ''
    output_dirpath = None
    labels = None
    all_labels_from_dirs = False

    # Options that MetaQUAST itself ignores; they stay in quast_py_args and
    # are interpreted by the per-run quast.py invocations.
    passthrough_opts = (
        '-j', '--save-json',
        '-J', '--save-json-to',
        '--contig-thresholds',
        '-c', '--mincluster', '--min-cluster',
        '--est-ref-size',
        '--gene-thresholds',
        '-s', '--scaffolds',
        '--gage',
        '--debug',
        '-e', '--eukaryote',
        '-f', '--gene-finding',
        '-i', '--min-alignment',
        '-a', '--ambiguity-usage',
        '-u', '--use-all-alignments',
        '--strict-NA',
        '-x', '--extensive-mis-size',
        '--meta',
        '--glimmer',
        '--no-snps',
        '--no-check',
        '--no-gc',
        '--no-plots',
        '--plots-format',
        '--memory-efficient',
    )

    # Second pass: the remaining options.
    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            # Removing output dir arg in order to further
            # construct other quast calls from this options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            output_dirpath = os.path.abspath(arg)
            make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            assert_file_exists(arg, 'genes')
            # append, not "+=": extending a list with a string adds its characters
            genes.append(arg)

        elif opt in ('-O', "--operons"):
            assert_file_exists(arg, 'operons')
            operons.append(arg)

        elif opt in ('-R', "--reference"):
            # Removing reference args in order to further
            # construct quast calls from this args with other reference options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            if os.path.isdir(arg):
                # a directory: take every FASTA file found under it
                ref_fpaths = [os.path.join(path, fname)
                              for (path, dirs, files) in os.walk(arg)
                              for fname in files
                              if qutils.check_is_fasta_file(fname)]
                ref_fpaths.sort()
            else:
                # a comma-separated list of files
                ref_fpaths = arg.split(',')
                for i, ref_fpath in enumerate(ref_fpaths):
                    assert_file_exists(ref_fpath, 'reference')
                    ref_fpaths[i] = ref_fpath

        elif opt == '--max-ref-number':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            qconfig.max_references = int(arg)
            if qconfig.max_references < 0:
                qconfig.max_references = 0

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-l', '--labels'):
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            labels = quast.parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            all_labels_from_dirs = True

        elif opt in passthrough_opts:
            pass

        elif opt == '--references-list':
            ref_txt_fpath = arg

        elif opt == '--no-html':
            html_report = False

        elif opt == '--fast':  # --no-check, --no-gc, --no-snps will automatically set in QUAST runs
            html_report = False

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)

        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg), to_stderr=True, exit_with_code=2)

    for c_fpath in contigs_fpaths:
        assert_file_exists(c_fpath, 'contigs')

    labels = quast.process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    # Contig paths are passed to the quast runs as assemblies, not as arguments.
    for contigs_fpath in contigs_fpaths:
        if contigs_fpath in quast_py_args:
            quast_py_args.remove(contigs_fpath)

    # Directories
    output_dirpath, _, _ = quast._set_up_output_dir(
        output_dirpath, None, make_latest_symlink,
        save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # Rebuild the effective command line (after the option rewriting above) for the log.
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None)
    logger.start()

    qconfig.set_max_threads(logger)

    ########################################################################

    from libs import reporting
    reload(reporting)

    # Always start from an empty directory for corrected inputs.
    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            _correct_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    assemblies, correct_assemblies = _correct_contigs(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error("None of the assembly files contains correct contigs. "
                     "Please, provide different files or decrease --min-contig threshold.")
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice("Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled")
        else:
            if ref_txt_fpath:
                logger.main_info("List of references was provided, starting to download reference genomes from NCBI...")
            else:
                logger.main_info("No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                                 "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath, ref_txt_fpath)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not ref_txt_fpath:
                    # only references found automatically are filtered later
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    _correct_references(ref_fpaths, corrected_dirpath)
            elif test_mode and ref_fpaths is None:
                logger.error('Failed to download or setup SILVA 16S rRNA database for working without '
                             'references on metagenome datasets!', to_stderr=True, exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice('No references are provided, starting regular QUAST with MetaGeneMark gene finder')
        _start_quast_main(
            None,
            quast_py_args,
            assemblies=assemblies,
            output_dirpath=output_dirpath,
            exit_on_exception=True)
        # sys.exit rather than the site-provided exit(), which is not always available
        sys.exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(combined_ref_fpath, contigs_fpaths, reads_fpaths, corrected_ref_fpaths,
                                      os.path.join(combined_output_dirpath, qconfig.variation_dirname),
                                      external_logger=logger)
        if bed_fpath:
            quast_py_args += ['--bed-file']
            quast_py_args += [bed_fpath]

    quast_py_args += ['--combined-ref']
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    # json_texts collects one JSON text per quast run; None disables collection.
    if qconfig.html_report:
        from libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    return_code, total_num_notifications, assemblies, labels = _start_quast_main(
        run_name, quast_py_args + ["--ambiguity-usage"] + ['all'],
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications, is_first_run=True)
    # The scaffolds option is only applied to the first (combined) run.
    for arg in args:
        if arg in ('-s', "--scaffolds"):
            quast_py_args.remove(arg)
    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name, 'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        # The combined run produced no genome statistics: nothing more to do.
        logger.main_info('')
        logger.main_info('Failed aligning the contigs for all the references. ' +
                         ('Try to restart MetaQUAST with another references.'
                          if not downloaded_refs else
                          'Try to use option --max-ref-number to change maximum number of references '
                          '(per each assembly) to download.'))
        logger.main_info('')
        quast._cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
        return

    if downloaded_refs:
        logger.main_info()
        logger.main_info('Excluding downloaded references with low genome fraction from further analysis..')
        corr_ref_fpaths = remove_unaligned_downloaded_refs(genome_info_fpath, ref_fpaths, chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                _correct_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = _start_quast_main(
                run_name, quast_py_args + ["--ambiguity-usage"] + ['all'],
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications, is_first_run=True)
            if json_texts is not None:
                # replace the JSON of the first combined run with the re-run's
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info('All downloaded references have genome fraction more than 10%. Nothing was excluded.')
        else:
            logger.main_info('All downloaded references have low genome fraction. Nothing was excluded for now.')

    quast_py_args += ['--no-check-meta']
    # Keep only the contig thresholds above --min-contig and pass them on as one option.
    qconfig.contig_thresholds = ','.join([str(threshold) for threshold in qconfig.contig_thresholds
                                          if threshold > qconfig.min_contig])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = __remove_from_quast_py_args(quast_py_args, '--contig-thresholds', qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = _partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'), labels)

    # From here on ref_names lists only the references that got a per-reference run.
    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' + ref_name + ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(
                run_name, quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                exit_on_exception=False, num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')
        return_code, total_num_notifications = _start_quast_main(
            run_name, quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            exit_on_exception=False, num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [reporting.Fields.MIS_RELOCATION, reporting.Fields.MIS_TRANSLOCATION,
                              reporting.Fields.MIS_INVERTION, reporting.Fields.MIS_ISTRANSLOCATIONS]
        create_meta_summary.do(html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
                               output_dirpath_per_ref, metrics_for_plots, misassembl_metrics,
                               ref_names if no_unaligned_contigs else ref_names + [qconfig.not_aligned_name])
        if html_report and json_texts:
            from libs import plotter
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls, meta=True)
            html_saver.create_meta_report(output_dirpath, json_texts)

    quast._cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE for every assembly and store its metrics in the reports.

    ``run_gage`` is executed for each path in ``contigs_fpaths`` through
    joblib (at most ``qconfig.max_threads`` jobs).  Afterwards the stdout log
    of each assembly, ``<output_dirpath>/gage/gage_<assembly name>.stdout``,
    is scanned and the values found are added to that assembly's report.

    If none of the jobs returned 0, a warning is logged and the function
    returns without touching the reports.  Returns None in every case.
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    # directory for GAGE logs and temporary data
    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Metric names in the order they are searched for in the log.  The two
    # lists are parallel: metrics[k] is stored into metrics_in_reporting[k].
    # Names that occur twice ('Total units', 'Min', 'Max', 'N50') map to the
    # "corrected" fields on their second occurrence.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy', 'SNPs',
               'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
                            reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
                            reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
                            reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
                            reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
                            reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
                            reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
                            reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
                            reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
                            reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
                            reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
                            reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
                            reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path, ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single assembly was processed successfully.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s' %
                       os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(
            gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        # "with" guarantees the log is closed even if a line cannot be parsed.
        with open(log_out_fpath, 'r') as logfile_out:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # split on the full "N50:" tag rather than on the first colon
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break

    reporting.save_gage(output_dirpath)

    # temporary files are kept only in debug mode
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics (lengths, Nx/NGx, GC, N's) for every assembly.

    Parameters:
      ref_fpath        reference FASTA path, or a false value when there is none
      contigs_fpaths   paths of the assemblies to process
      output_dirpath   directory for the plots (created if missing)
      json_output_dir  when true, lengths and GC info are also saved via json_saver
      results_dir      directory passed to html_saver when qconfig.html_report is set

    The reference length is taken from the reference file, or from
    ``qconfig.estimated_reference_size`` when no reference is given; NG50/NG75
    and the NGx plot are produced only when a length is known.  Results are
    written into the per-assembly reports; the function returns None.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info(' Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        # One pass over the file collects both the contig lengths and the N count.
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        # the reference-based variants exist only when a reference length is known
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)

        # uncalled bases, formatted the same way for the log and for the report
        Ns_per_100_kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        # total_GC may be None (the check below relies on it); 0.0 is a valid
        # value and must not be treated as "missing"
        GC_str = '%.2f' % total_GC if total_GC is not None else None

        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + (GC_str if GC_str is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ' ' + Ns_per_100_kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        report.add_field(reporting.Fields.GC, GC_str)
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, Ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Work on a copy so that the per-assembly list is not extended in place.
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                            [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    Reads the Nucmer coords file of the assembly (``<name>.coords`` in
    ``nucmer_path_dirpath``, with the ``.filtered`` suffix unless
    ``qconfig.use_all_alignments`` is set), marks the covered reference
    positions, and writes ``<name>_gaps.txt`` plus, for every non-empty
    container, ``<name>_genes.txt`` / ``<name>_operons.txt`` into
    ``genome_stats_dirpath``.

    Parameters:
      contigs_fpath          path of the assembly FASTA file
      index                  assembly index, used only for log prefixes
      reference_chromosomes  mapping chromosome name -> length
      genes_container,
      operons_container      objects providing ``region_list`` and ``chr_names_dict``

    Returns a tuple ``(results, genes_in_contigs, operons_in_contigs)``:
    ``results`` holds "covered_bp", "gaps_count" and, per feature kind,
    ``<field>_full`` / ``<field>_partial`` (None when the container has no
    regions); the two lists give the number of fully found genes/operons per
    contig, contigs being ordered by decreasing length.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')

    # genome_mapping[chr][pos] == 1 when 1-based position pos is covered by an
    # alignment; index 0 is unused.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    # for cumulative plots: i-th element is the number of genes in i-th contig
    genes_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    # for gene finding: contig_name --> list of AlignedBlock
    aligned_blocks_by_contig_name = dict((name, []) for name in sorted_contigs_names)

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2| NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2| NODE_0_length_6088
    with open(nucmer_fpath, 'r') as coordfile:
        # skip the header, which ends with a line of '=' characters
        for line in coordfile:
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            columns = line.split('|')
            ref_coords = columns[0].split()
            contig_coords = columns[1].split()
            s1, e1 = int(ref_coords[0]), int(ref_coords[1])
            s2, e2 = int(contig_coords[0]), int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")

            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
            covered = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # special case: circular genome, contig starts on the end of a
                # chromosome and ends in the beginning
                for pos in range(s1, len(covered)):
                    covered[pos] = 1
                for pos in range(1, e1 + 1):
                    covered[pos] = 1
            else:
                for pos in range(s1, e1 + 1):
                    covered[pos] = 1

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.iteritems():
            print >>gaps_file, chr_name
            cur_gap_size = 0
            for pos in range(1, chr_len + 1):
                if genome_mapping[chr_name][pos] == 1:
                    # a covered position closes the current gap, if long enough
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        print >>gaps_file, pos - cur_gap_size, pos - 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            # gap reaching the end of the chromosome
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                print >>gaps_file, chr_len - cur_gap_size + 1, chr_len

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        with open(found_fpath, 'w') as found_file:
            print >>found_file, '%s\t\t%s\t%s' % ('ID or #', 'Start', 'End')
            print >>found_file, '============================'

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for region_idx, region in enumerate(container.region_list):
                found_list[region_idx] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                # no overlap at all
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                # the region lies completely inside the block
                                if found_list[region_idx] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[region_idx] = 1
                                total_full += 1
                                region_label = str(region.id)
                                if region_label == 'None':
                                    region_label = '# ' + str(region.number + 1)
                                print >>found_file, '%s\t\t%d\t%d' % (region_label, region.start, region.end)
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif found_list[region_idx] == 0 and \
                                    min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                # long enough partial overlap, counted once per region
                                found_list[region_idx] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    # a fully found region is not searched for in further contigs
                    if cur_feature_is_found:
                        break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic length, N50/L50, GC and N statistics for each assembly.

    Fills the per-assembly report objects returned by ``reporting.get``,
    optionally saves lengths and GC data for the JSON and HTML reports and,
    when ``qconfig.draw_plots`` is set, draws the cumulative, GC content and
    Nx/NGx plots into ``output_dirpath``.

    ref_fpath       -- path to the reference FASTA, or a false value
    contigs_fpaths  -- paths to the assembly FASTA files
    output_dirpath  -- directory for the plots (created if missing)
    json_output_dir -- directory for JSON output, or a false value to skip it
    results_dir     -- directory handed to html_saver for the HTML report
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    # The reference length comes from the reference file when there is one,
    # otherwise from the user-supplied estimate (if any).
    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info(' Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        logger.info(' ' + qutils.index_to_str(index) + qutils.label_from_fpath(contigs_fpath))

        # One pass over the file collects both contig lengths and the N count.
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            itertools.izip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)

        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)

        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)

        # Format once and reuse for both the log line and the report.
        # `is not None` (rather than truthiness) keeps a genuine 0.00 GC
        # content from being reported as missing.
        if total_GC is not None:
            gc_str = '%.2f' % total_GC
        else:
            gc_str = None
        ns_per_100_kbp = '%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))

        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + (gc_str if gc_str is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ' ' + ns_per_100_kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        report.add_field(reporting.Fields.GC, gc_str)
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100_kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter

        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot', 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Work on a copy so the per-assembly list is not extended with the
        # reference distribution as a side effect.
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                            [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA and LA/LGA statistics for the aligned contigs.

    For every assembly the NAx/LAx values (relative to the assembly length)
    are always reported; the NGAx/LGAx values (relative to the reference
    length) are computed and reported only when ``qconfig.is_combined_ref``
    is false.  Optionally saves assembly lengths for the JSON/HTML reports
    and draws the cumulative and NAx/NGAx plots.

    Returns a dict with a 'header' key and one key per assembly name, each
    mapped to an empty list.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = []
    for fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_lengths_from_fastafile(fpath)))

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)

        # Reference-based values make sense only for a single (non-combined) reference.
        with_reference = not qconfig.is_combined_ref
        if with_reference:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)

        message = ' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath)
        message += ', Largest alignment = ' + str(max(lens))
        message += ', NA50 = ' + str(na50)
        if with_reference and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if with_reference and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        fields_and_values = [
            (reporting.Fields.NA50, na50),
            (reporting.Fields.NA75, na75),
            (reporting.Fields.LA50, la50),
            (reporting.Fields.LA75, la75),
        ]
        if with_reference:
            fields_and_values += [
                (reporting.Fields.NGA50, nga50),
                (reporting.Fields.NGA75, nga75),
                (reporting.Fields.LGA50, lga50),
                (reporting.Fields.LGA75, lga75),
            ]
        for field, value in fields_and_values:
            report.add_field(field, value)

    ########################################################################
    # Largest number of aligned blocks among all assemblies.
    num_contigs = max([len(lengths) for lengths in aligned_lengths_lists])

    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    import plotter
    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    too_many_points = num_contigs > qconfig.max_points
    plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths,
                    aligned_lengths_lists, aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths,
                        aligned_lengths_lists, aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        [reference_length] * len(aligned_contigs_fpaths),
                        json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA and LA/LGA statistics for the aligned contigs.

    For every assembly the NAx/LAx values are computed against the assembly
    length and the NGAx/LGAx values against the reference length; all of
    them are logged and stored in the assembly report.  Optionally saves the
    aligned and assembly lengths for the JSON/HTML reports and draws the
    cumulative and NAx/NGAx plots.

    Returns a dict with a 'header' key and one key per assembly name, each
    mapped to an empty list.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
    assembly_lengths = []
    for fpath in aligned_contigs_fpaths:
        assembly_lengths.append(sum(fastaparser.get_lengths_from_fastafile(fpath)))

    import N50
    per_assembly = itertools.izip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        na50 = N50.NG50(lens, assembly_len)
        nga50 = N50.NG50(lens, reference_length)
        na75 = N50.NG50(lens, assembly_len, 75)
        nga75 = N50.NG50(lens, reference_length, 75)
        la50 = N50.LG50(lens, assembly_len)
        lga50 = N50.LG50(lens, reference_length)
        la75 = N50.LG50(lens, assembly_len, 75)
        lga75 = N50.LG50(lens, reference_length, 75)

        message = ' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath)
        message += ', Largest alignment = ' + str(max(lens))
        message += ', NA50 = ' + str(na50)
        message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        # Same field order as the report expects: each N-metric followed by its NG counterpart.
        for field, value in [
                (reporting.Fields.NA50, na50),
                (reporting.Fields.NGA50, nga50),
                (reporting.Fields.NA75, na75),
                (reporting.Fields.NGA75, nga75),
                (reporting.Fields.LA50, la50),
                (reporting.Fields.LGA50, lga50),
                (reporting.Fields.LA75, la75),
                (reporting.Fields.LGA75, lga75)]:
            report.add_field(field, value)

    ########################################################################
    # saving to JSON
    if json_output_dirpath:
        from libs.html_saver import json_saver
        json_saver.save_aligned_contigs_lengths(json_output_dirpath, aligned_contigs_fpaths,
                                                aligned_lengths_lists)
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_aligned_contigs_lengths(output_dirpath, aligned_contigs_fpaths,
                                                aligned_lengths_lists)
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths,
                                         assembly_lengths)

    if qconfig.draw_plots:
        import plotter

        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

        # Drawing NAx and NGAx plots...
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths)
        plotter.Nx_plot(aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        [reference_length] * len(aligned_contigs_fpaths))

    logger.info('Done.')
    return report_dict
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer: genome fraction, duplication ratio, gaps, genes and operons.

    Reads the reference chromosomes, loads gene/operon annotations (if any),
    processes every assembly in parallel with ``process_single_file`` and
    writes a summary table to ``<genome_stats_dirpath>/genome_info.txt``.
    Per-assembly values are also stored in the assembly reports and,
    depending on ``json_output_dirpath`` and ``qconfig``, saved for the
    JSON/HTML reports and drawn as plots.

    ref_fpath                        -- path to the reference FASTA
    aligned_contigs_fpaths           -- paths to the assemblies to analyze
    output_dirpath                   -- directory handed to html_saver
    json_output_dirpath              -- JSON output directory, or a false value
    genes_fpaths, operons_fpaths     -- annotation files handed to FeatureContainer
    detailed_contigs_reports_dirpath -- directory containing 'nucmer_output'
    genome_stats_dirpath             -- output directory (created if missing)
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Chromosome name is the first whitespace-separated word of the FASTA header.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    res_file = open(result_fpath, 'w')
    # try/finally guarantees the results file is closed even if annotation
    # parsing or one of the parallel jobs raises.
    try:
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.iteritems():
            res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, reference_chromosomes.keys())

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        # process all contig files
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
        from joblib import Parallel, delayed
        results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
                aligned_contigs_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # NOTE(review): the denominator is zero when covered_bp is 0; this
            # presumably cannot happen for assemblies listed as "aligned" -- confirm.
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) / \
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                              '%1.5f' % duplication_ratio, gaps_count))
            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                # Both values are None when no annotation of this kind was loaded.
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')
    finally:
        res_file.close()

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths,
                                                'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths,
                                                'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.info('Done.')
def _correct_references(ref_fpaths, corrected_dirpath):
    """Copy every reference sequence into per-reference and combined FASTA files.

    Each sequence gets a new name built from the corrected reference file
    name and the first 20 characters of the original sequence name.  Unless
    ``qconfig.no_check`` is set, a reference is abandoned at the first
    sequence that contains characters other than ACGTN or the IUPAC
    ambiguity codes M, K, R, Y, W, S, V, B, H, D.

    Returns a tuple
    ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths)``
    where ``chromosomes_by_refs`` maps a reference name to a list of
    ``(corrected sequence name, sequence length)`` pairs.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    chromosomes_by_refs = {}

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # The first sequence of a reference allocates the corrected file path;
        # the following sequences of the same reference reuse it.
        if total_references <= 1:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        else:
            corr_seq_fpath = corrected_ref_fpaths[-1]

        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name += '_' + qutils.correct_name(seq_name[:20])

        if not qconfig.no_check:
            # Ambiguity codes are tolerated: they are mapped to N before the
            # final "only ACGTN" check.
            ambiguity_codes = {
                'M': 'N', 'K': 'N', 'R': 'N', 'Y': 'N', 'W': 'N',
                'S': 'N', 'V': 'N', 'B': 'N', 'H': 'N', 'D': 'N',
            }
            pattern = "(%s)" % "|".join(map(re.escape, ambiguity_codes.keys()))
            normalized = re.sub(pattern, lambda match: ambiguity_codes[match.group()],
                                seq.upper())
            if re.search(r'[^ACGTN]', normalized):
                logger.warning('Skipping ' + ref_fpath +
                               ' because it contains non-ACGTN characters.', indent=' ')
                # NOTE(review): the path appended above stays in
                # corrected_ref_fpaths even though this reference is skipped.
                return None, None

        # NOTE(review): the original sequence is written, not the normalized
        # one; the normalized copy is used for validation only.
        record = [(corr_seq_name, seq)]
        fastaparser.write_fasta(corr_seq_fpath, record, 'a')
        fastaparser.write_fasta(combined_ref_fpath, record, 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
            qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Reference names that occur more than once get a label derived from the
    # parent directory as well (see below).
    base_names = []
    for fpath in ref_fpaths:
        base_name, unused_ext = qutils.splitext_for_fasta_file(os.path.basename(fpath))
        base_names.append(base_name)
    dupl_ref_names = [base_name for base_name in base_names
                      if base_names.count(base_name) > 1]

    for ref_fpath in ref_fpaths:
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if ref_name in dupl_ref_names:
            ref_name = get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []

        corr_seq_fpath = None
        total_references = 0
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            corr_seq_name, corr_seq_fpath = correct_seq(
                seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                # The whole reference is abandoned at the first rejected sequence.
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))

    logger.main_info(' All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def do(ref_fpath, contigs_fpaths, output_dirpath):
    """Run GAGE on every assembly and collect its metrics into the reports.

    GAGE is launched in parallel through ``run_gage``; its per-assembly
    stdout logs (``gage_<assembly name>.stdout`` in ``<output_dirpath>/gage``)
    are then scanned for the expected metrics in their fixed order of
    appearance.  If no run returned exit code 0, a warning is logged and the
    function returns without saving anything.

    ref_fpath      -- path to the reference FASTA
    contigs_fpaths -- paths to the assembly FASTA files
    output_dirpath -- base output directory; results go to its 'gage' subdirectory
    """
    gage_results_dirpath = os.path.join(output_dirpath, 'gage')

    if not os.path.isdir(gage_results_dirpath):
        os.mkdir(gage_results_dirpath)

    ########################################################################
    gage_tool_path = os.path.join(qconfig.LIBS_LOCATION, 'gage', 'getCorrectnessStats.sh')

    ########################################################################
    logger.print_timestamp()
    logger.info('Running GAGE...')

    # Metric captions in the order in which they are looked up in the GAGE
    # log.  Some captions occur twice and are distinguished by position only:
    # the second occurrences map to the *COR* (corrected) report fields.
    metrics = ['Total units', 'Min', 'Max', 'N50', 'Genome Size', 'Assembly Size', 'Chaff bases',
               'Missing Reference Bases', 'Missing Assembly Bases', 'Missing Assembly Contigs',
               'Duplicated Reference Bases', 'Compressed Reference Bases', 'Bad Trim', 'Avg Idy',
               'SNPs', 'Indels < 5bp', 'Indels >= 5', 'Inversions', 'Relocation', 'Translocation',
               'Total units', 'BasesInFasta', 'Min', 'Max', 'N50']
    metrics_in_reporting = [
        reporting.Fields.GAGE_NUMCONTIGS, reporting.Fields.GAGE_MINCONTIG,
        reporting.Fields.GAGE_MAXCONTIG, reporting.Fields.GAGE_N50,
        reporting.Fields.GAGE_GENOMESIZE, reporting.Fields.GAGE_ASSEMBLY_SIZE,
        reporting.Fields.GAGE_CHAFFBASES, reporting.Fields.GAGE_MISSINGREFBASES,
        reporting.Fields.GAGE_MISSINGASMBLYBASES, reporting.Fields.GAGE_MISSINGASMBLYCONTIGS,
        reporting.Fields.GAGE_DUPREFBASES, reporting.Fields.GAGE_COMPRESSEDREFBASES,
        reporting.Fields.GAGE_BADTRIM, reporting.Fields.GAGE_AVGIDY,
        reporting.Fields.GAGE_SNPS, reporting.Fields.GAGE_SHORTINDELS,
        reporting.Fields.GAGE_LONGINDELS, reporting.Fields.GAGE_INVERSIONS,
        reporting.Fields.GAGE_RELOCATION, reporting.Fields.GAGE_TRANSLOCATION,
        reporting.Fields.GAGE_NUMCORCONTIGS, reporting.Fields.GAGE_CORASMBLYSIZE,
        reporting.Fields.GAGE_MINCORCONTIG, reporting.Fields.GAGE_MAXCORCOTING,
        reporting.Fields.GAGE_CORN50]

    tmp_dirpath = os.path.join(gage_results_dirpath, 'tmp')
    if not os.path.exists(tmp_dirpath):
        os.makedirs(tmp_dirpath)

    n_jobs = min(len(contigs_fpaths), qconfig.max_threads)
    from joblib import Parallel, delayed
    return_codes = Parallel(n_jobs=n_jobs)(
        delayed(run_gage)(i, contigs_fpath, gage_results_dirpath, gage_tool_path,
                          ref_fpath, tmp_dirpath)
        for i, contigs_fpath in enumerate(contigs_fpaths))

    # Give up only when not a single run succeeded.
    if 0 not in return_codes:
        logger.warning('Error occurred while GAGE was processing assemblies.'
                       ' See GAGE error logs for details: %s'
                       % os.path.join(gage_results_dirpath, 'gage_*.stderr'))
        return

    ## find metrics for total report:
    for contigs_fpath in contigs_fpaths:
        assembly_name = qutils.name_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)

        log_out_fpath = os.path.join(gage_results_dirpath, 'gage_' + assembly_name + '.stdout')
        logfile_out = open(log_out_fpath, 'r')
        # try/finally closes the log even if a malformed line makes parsing raise.
        try:
            cur_metric_id = 0
            for line in logfile_out:
                metric = metrics[cur_metric_id]
                if metric not in line:
                    continue
                if metric.startswith('N50'):
                    # N50 lines are split on the full 'N50:' caption rather
                    # than on the first colon.
                    value = line.split(metric + ':')[1].strip()
                else:
                    value = line.split(':')[1].strip()
                report.add_field(metrics_in_reporting[cur_metric_id], value)
                cur_metric_id += 1
                if cur_metric_id == len(metrics):
                    break
        finally:
            logfile_out.close()

    reporting.save_gage(output_dirpath)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)

    logger.info('Done.')
def _correct_contigs(contigs_fpaths, corrected_dirpath, reporting, labels):
    """Prepare corrected copies of the assemblies and return their paths.

    For every input file a corrected path is derived from its label and the
    file is passed to ``_handle_fasta``; the path is kept only if that call
    returns a true value.  With ``qconfig.scaffolds`` set, every scaffold is
    additionally split at runs of at least ``qconfig.Ns_break_threshold``
    N characters and the resulting contigs are written to a
    ``<name>_broken`` file, which is handled the same way.

    contigs_fpaths    -- paths to the input assemblies
    corrected_dirpath -- directory for the corrected files
    reporting         -- passed through to ``_handle_fasta``
    labels            -- assembly labels, parallel to ``contigs_fpaths``

    Returns the list of corrected file paths (broken scaffolds first, then
    the assembly itself, for each input).
    """
    ## removing from contigs' names special characters because:
    ## 1) Some embedded tools can fail on some strings with "...", "+", "-", etc
    ## 2) Nucmer fails on names like "contig 1_bla_bla", "contig 2_bla_bla" (it interprets as a contig's name only the first word of caption and gets ambiguous contigs names)
    corrected_contigs_fpaths = []

    for file_index, contigs_fpath in enumerate(contigs_fpaths):
        contigs_fname = os.path.basename(contigs_fpath)
        fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

        label = labels[file_index]
        corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
        qconfig.assembly_labels_by_fpath[corr_fpath] = label
        logger.info(' %s ==> %s' % (contigs_fpath, label))

        # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
        if qconfig.scaffolds:
            logger.info(" breaking scaffolds into contigs:")
            corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
            broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
            broken_scaffolds_fasta = []
            contigs_counter = 0
            # Counted separately from the scan position below: reusing one
            # variable for both made the log report a sequence offset instead
            # of the number of scaffolds.
            scaffolds_counter = 0

            for name, seq in fastaparser.read_fasta(contigs_fpath):
                scaffolds_counter += 1
                scan_pos = 0
                cur_contig_number = 1
                cur_contig_start = 0
                while scan_pos < len(seq):
                    start = seq.find("N", scan_pos)
                    if start == -1:
                        break
                    # extend [start, end) over the whole run of N's
                    end = start + 1
                    while (end != len(seq)) and (seq[end] == 'N'):
                        end += 1

                    scan_pos = end + 1
                    if (end - start) >= qconfig.Ns_break_threshold:
                        broken_scaffolds_fasta.append(
                            (name.split()[0] + "_" + str(cur_contig_number),
                             seq[cur_contig_start:start]))
                        cur_contig_number += 1
                        cur_contig_start = end

                # the tail after the last break (or the whole scaffold if unbroken)
                broken_scaffolds_fasta.append(
                    (name.split()[0] + "_" + str(cur_contig_number),
                     seq[cur_contig_start:]))
                contigs_counter += cur_contig_number

            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            qconfig.assembly_labels_by_fpath[broken_scaffolds_fpath] = label + ' broken'
            logger.info(" %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_counter,
                         qutils.name_from_fpath(corr_fpath),
                         contigs_counter,
                         qutils.name_from_fpath(broken_scaffolds_fpath)))

            if _handle_fasta(broken_scaffolds_fpath, broken_scaffolds_fpath, reporting):
                corrected_contigs_fpaths.append(broken_scaffolds_fpath)
                qconfig.list_of_broken_scaffolds.append(qutils.name_from_fpath(broken_scaffolds_fpath))

        if _handle_fasta(contigs_fpath, corr_fpath, reporting):
            corrected_contigs_fpaths.append(corr_fpath)

    return corrected_contigs_fpaths
def _parallel_correct_contigs(file_counter, contigs_fpath, corrected_dirpath, labels):
    """Derive the corrected path for one assembly and optionally break its scaffolds.

    With ``qconfig.scaffolds`` set, every scaffold is split at runs of at
    least ``qconfig.Ns_break_threshold`` N characters.  The split version is
    written to ``<name>_broken`` only if at least one scaffold was actually
    broken; otherwise a warning is queued and no file is produced.

    file_counter      -- index of this assembly in ``labels``
    contigs_fpath     -- path to the input assembly
    corrected_dirpath -- directory for the corrected files
    labels            -- labels of all assemblies

    Returns ``(corr_fpaths, broken_scaffolds, logs)`` where ``corr_fpaths``
    is ``(contigs_fpath, corr_fpath)``, ``broken_scaffolds`` is ``None`` or a
    pair holding the broken-scaffolds path twice, and ``logs`` is a list of
    messages collected for the caller.
    """
    broken_scaffolds = None

    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = qutils.splitext_for_fasta_file(contigs_fname)

    label = labels[file_counter]
    corr_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, label + fasta_ext))
    logs = []
    logs.append(' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                '%s ==> %s' % (contigs_fpath, label))

    # if option --scaffolds is specified QUAST adds split version of assemblies to the comparison
    if qconfig.scaffolds:
        # NOTE(review): this message is logged directly while all the others
        # are returned in `logs` -- confirm this is intentional.
        logger.info(' ' + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                    ' breaking scaffolds into contigs:')
        corr_fpath_wo_ext = os.path.join(corrected_dirpath, qutils.name_from_fpath(corr_fpath))
        broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext
        broken_scaffolds_fasta = []
        contigs_counter = 0

        # A true count (0 for an empty file) so that an empty input is not
        # mistaken for "one scaffold broken into zero contigs".
        scaffolds_total = 0
        for name, seq in fastaparser.read_fasta(contigs_fpath):
            scaffolds_total += 1
            scan_pos = 0
            total_contigs_for_the_scaf = 1
            cur_contig_start = 0
            while scan_pos < len(seq):
                start = seq.find("N", scan_pos)
                if start == -1:
                    break
                # extend [start, end) over the whole run of N's
                end = start + 1
                while (end != len(seq)) and (seq[end] == 'N'):
                    end += 1

                scan_pos = end + 1
                if (end - start) >= qconfig.Ns_break_threshold:
                    broken_scaffolds_fasta.append(
                        (name.split()[0] + "_" + str(total_contigs_for_the_scaf),
                         seq[cur_contig_start:start]))
                    total_contigs_for_the_scaf += 1
                    cur_contig_start = end

            # the tail after the last break (or the whole scaffold if unbroken)
            broken_scaffolds_fasta.append(
                (name.split()[0] + "_" + str(total_contigs_for_the_scaf),
                 seq[cur_contig_start:]))
            contigs_counter += total_contigs_for_the_scaf

        # Equal counts mean every scaffold produced exactly one contig, i.e.
        # nothing was broken.
        if scaffolds_total != contigs_counter:
            fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
            logs.append(" " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                        " %d scaffolds (%s) were broken into %d contigs (%s)" %
                        (scaffolds_total, label, contigs_counter, label + ' broken'))
            broken_scaffolds = (broken_scaffolds_fpath, broken_scaffolds_fpath)
        else:
            logs.append(" " + qutils.index_to_str(file_counter, force=(len(labels) > 1)) +
                        " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)

    corr_fpaths = (contigs_fpath, corr_fpath)
    return corr_fpaths, broken_scaffolds, logs
def main(args):
    """Entry point of MetaQUAST.

    Parses the command line, corrects references and contigs, runs quast.py
    for the combined reference, then for the contigs aligned to each
    reference and for the contigs not aligned anywhere, and finally builds the
    summary reports.

    Args:
        args: command-line arguments without the program name.

    Returns:
        4 when none of the assembly files contains correct contigs, None
        otherwise.  Several paths terminate the process through ``sys.exit``
        or through ``logger.error(..., exit_with_code=...)``.
    """
    if ' ' in qconfig.QUAST_HOME:
        logger.error(
            'QUAST does not support spaces in paths. \n'
            'You are trying to run it from ' + str(qconfig.QUAST_HOME) + '\n'
            'Please, put QUAST in a different directory, then try again.\n',
            to_stderr=True,
            exit_with_code=3)

    if not args:
        qconfig.usage(meta=True)
        sys.exit(0)

    genes = []
    operons = []
    html_report = qconfig.html_report
    make_latest_symlink = True
    ref_txt_fpath = None

    try:
        options, contigs_fpaths = getopt.gnu_getopt(args, qconfig.short_options, qconfig.long_options)
    except getopt.GetoptError:
        _, exc_value, _ = sys.exc_info()
        print >> sys.stderr, exc_value
        print >> sys.stderr
        qconfig.usage(meta=True)
        sys.exit(2)

    quast_py_args = args[:]
    test_mode = False

    # Iterate over a snapshot: the loop body removes from and appends to
    # `options`, and mutating a list while iterating it skips elements.
    for opt, arg in options[:]:
        if opt in ('-d', '--debug'):
            options.remove((opt, arg))
            qconfig.debug = True
            logger.set_up_console_handler(debug=True)

        elif opt == '--test' or opt == '--test-no-ref':
            options.remove((opt, arg))
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            options += [('-o', 'quast_test_output')]
            if opt == '--test':
                options += [('-R', ','.join([
                    os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_1.fasta'),
                    os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_2.fasta'),
                    os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_ref_3.fasta')
                ]))]
            contigs_fpaths += [
                os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_contigs_1.fasta'),
                os.path.join(qconfig.QUAST_HOME, 'test_data', 'meta_contigs_2.fasta')
            ]
            test_mode = True

        elif opt.startswith('--help') or opt == '-h':
            qconfig.usage(opt == "--help-hidden", meta=True, short=False)
            sys.exit(0)

        elif opt.startswith('--version') or opt == '-v':
            qconfig.print_version(meta=True)
            sys.exit(0)

    if not contigs_fpaths:
        logger.error("You should specify at least one file with contigs!\n")
        qconfig.usage(meta=True)
        sys.exit(2)

    ref_fpaths = []
    combined_ref_fpath = ''
    reads_fpath_f = ''
    reads_fpath_r = ''

    output_dirpath = None

    labels = None
    all_labels_from_dirs = False

    # Options that need no handling here; they stay in quast_py_args and are
    # simply forwarded to the quast.py runs.
    forwarded_opts = frozenset([
        '-j', '--save-json',
        '-J', '--save-json-to',
        '--contig-thresholds',
        '-c', '--mincluster', '--min-cluster',
        '--est-ref-size',
        '--gene-thresholds',
        '-s', '--scaffolds',
        '--gage',
        '--debug',
        '-e', '--eukaryote',
        '-f', '--gene-finding',
        '-i', '--min-alignment',
        '-a', '--ambiguity-usage',
        '-u', '--use-all-alignments',
        '--strict-NA',
        '-x', '--extensive-mis-size',
        '--meta',
        '--glimmer',
        '--no-snps',
        '--no-check',
        '--no-gc',
        '--no-plots',
        '--plots-format',
        '--memory-efficient',
    ])

    for opt, arg in options:
        if opt in ('-o', "--output-dir"):
            # Removing output dir arg in order to further
            # construct other quast calls from this options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)

            output_dirpath = os.path.abspath(arg)
            make_latest_symlink = False

        elif opt in ('-G', "--genes"):
            assert_file_exists(arg, 'genes')
            # append the path itself; `+=` would add it character by character
            genes.append(arg)

        elif opt in ('-O', "--operons"):
            assert_file_exists(arg, 'operons')
            operons.append(arg)

        elif opt in ('-R', "--reference"):
            # Removing reference args in order to further
            # construct quast calls from this args with other reference options
            if opt in quast_py_args and arg in quast_py_args:
                quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            if os.path.isdir(arg):
                ref_fpaths = [
                    os.path.join(path, fname)
                    for (path, dirs, fnames) in os.walk(arg)
                    for fname in fnames
                    if qutils.check_is_fasta_file(fname)
                ]
                ref_fpaths.sort()
            else:
                ref_fpaths = arg.split(',')
                for ref_fpath in ref_fpaths:
                    assert_file_exists(ref_fpath, 'reference')

        elif opt == '--max-ref-number':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            qconfig.max_references = int(arg)
            if qconfig.max_references < 0:
                qconfig.max_references = 0

        elif opt in ('-m', "--min-contig"):
            qconfig.min_contig = int(arg)

        elif opt in ('-t', "--threads"):
            qconfig.max_threads = int(arg)
            if qconfig.max_threads < 1:
                qconfig.max_threads = 1

        elif opt in ('-l', '--labels'):
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)
            labels = quast.parse_labels(arg, contigs_fpaths)

        elif opt == '-L':
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt)
            all_labels_from_dirs = True

        elif opt == '--references-list':
            ref_txt_fpath = arg

        elif opt == '--no-html':
            html_report = False

        elif opt == '--fast':
            # --no-check, --no-gc, --no-snps will automatically set in QUAST runs
            html_report = False

        elif opt == '--silent':
            qconfig.silent = True

        elif opt in ('-1', '--reads1'):
            reads_fpath_f = arg
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)

        elif opt in ('-2', '--reads2'):
            reads_fpath_r = arg
            quast_py_args = __remove_from_quast_py_args(quast_py_args, opt, arg)

        elif opt == '--contig-alignment-html':
            qconfig.create_contig_alignment_html = True

        elif opt in forwarded_opts:
            pass

        else:
            logger.error('Unknown option: %s. Use -h for help.' % (opt + ' ' + arg),
                         to_stderr=True,
                         exit_with_code=2)

    for c_fpath in contigs_fpaths:
        assert_file_exists(c_fpath, 'contigs')

    labels = quast.process_labels(contigs_fpaths, labels, all_labels_from_dirs)

    for contigs_fpath in contigs_fpaths:
        if contigs_fpath in quast_py_args:
            quast_py_args.remove(contigs_fpath)

    # Directories
    output_dirpath, _, _ = quast._set_up_output_dir(output_dirpath, None, make_latest_symlink,
                                                    save_json=False)

    corrected_dirpath = os.path.join(output_dirpath, qconfig.corrected_dirname)

    logger.set_up_file_handler(output_dirpath)
    # From here on `args` is the reconstructed command line (script path,
    # parsed option pairs, contigs), not the raw arguments passed in.
    args = [os.path.realpath(__file__)]
    for k, v in options:
        args.extend([k, v])
    args.extend(contigs_fpaths)
    logger.print_command_line(args, wrap_after=None)
    logger.start()

    qconfig.set_max_threads(logger)

    ########################################################################

    from libs import reporting
    reload(reporting)

    if os.path.isdir(corrected_dirpath):
        shutil.rmtree(corrected_dirpath)
    os.mkdir(corrected_dirpath)

    # PROCESSING REFERENCES
    if ref_fpaths:
        logger.main_info()
        logger.main_info('Reference(s):')

        corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
            _correct_references(ref_fpaths, corrected_dirpath)

    # PROCESSING CONTIGS
    logger.main_info()
    logger.main_info('Contigs:')
    assemblies, correct_assemblies = _correct_contigs(contigs_fpaths, output_dirpath, labels)
    if not assemblies:
        logger.error(
            "None of the assembly files contains correct contigs. "
            "Please, provide different files or decrease --min-contig threshold."
        )
        return 4

    # Running QUAST(s)
    quast_py_args += ['--meta']
    downloaded_refs = False

    # SEARCHING REFERENCES
    if not ref_fpaths:
        logger.main_info()
        if qconfig.max_references == 0:
            logger.notice(
                "Maximum number of references (--max-ref-number) is set to 0, search in SILVA 16S rRNA database is disabled"
            )
        else:
            if ref_txt_fpath:
                logger.main_info(
                    "List of references was provided, starting to download reference genomes from NCBI..."
                )
            else:
                logger.main_info(
                    "No references are provided, starting to search for reference genomes in SILVA 16S rRNA database "
                    "and to download them from NCBI...")
            downloaded_dirpath = os.path.join(output_dirpath, qconfig.downloaded_dirname)
            if not os.path.isdir(downloaded_dirpath):
                os.mkdir(downloaded_dirpath)
            ref_fpaths = search_references_meta.do(assemblies, labels, downloaded_dirpath,
                                                   ref_txt_fpath)
            if ref_fpaths:
                search_references_meta.is_quast_first_run = True
                if not ref_txt_fpath:
                    downloaded_refs = True
                logger.main_info()
                logger.main_info('Downloaded reference(s):')
                corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                    _correct_references(ref_fpaths, corrected_dirpath)
            elif test_mode and ref_fpaths is None:
                logger.error(
                    'Failed to download or setup SILVA 16S rRNA database for working without '
                    'references on metagenome datasets!',
                    to_stderr=True,
                    exit_with_code=4)

    if not ref_fpaths:
        # No references, running regular quast with MetaGenemark gene finder
        logger.main_info()
        logger.notice(
            'No references are provided, starting regular QUAST with MetaGeneMark gene finder'
        )
        _start_quast_main(None,
                          quast_py_args,
                          assemblies=assemblies,
                          output_dirpath=output_dirpath,
                          exit_on_exception=True)
        sys.exit(0)

    # Running combined reference
    combined_output_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name)

    reads_fpaths = []
    if reads_fpath_f:
        reads_fpaths.append(reads_fpath_f)
    if reads_fpath_r:
        reads_fpaths.append(reads_fpath_r)
    if reads_fpaths:
        bed_fpath = reads_analyzer.do(combined_ref_fpath, contigs_fpaths, reads_fpaths,
                                      corrected_ref_fpaths,
                                      os.path.join(combined_output_dirpath,
                                                   qconfig.variation_dirname),
                                      external_logger=logger)
        if bed_fpath:
            quast_py_args += ['--bed-file']
            quast_py_args += [bed_fpath]

    quast_py_args += ['--combined-ref']
    run_name = 'for the combined reference'
    logger.main_info()
    logger.main_info('Starting quast.py ' + run_name + '...')
    total_num_notices = 0
    total_num_warnings = 0
    total_num_nf_errors = 0
    total_num_notifications = (total_num_notices, total_num_warnings, total_num_nf_errors)
    if qconfig.html_report:
        from libs.html_saver import json_saver
        json_texts = []
    else:
        json_texts = None
    return_code, total_num_notifications, assemblies, labels = _start_quast_main(
        run_name,
        quast_py_args + ["--ambiguity-usage"] + ['all'],
        assemblies=assemblies,
        reference_fpath=combined_ref_fpath,
        output_dirpath=combined_output_dirpath,
        num_notifications_tuple=total_num_notifications,
        is_first_run=True)
    # The scaffolds option is dropped for the following runs.  Guard the
    # removal: quast_py_args holds the raw spelling from the command line,
    # which may differ from the normalized option name in `args`.
    for arg in args:
        if arg in ('-s', "--scaffolds") and arg in quast_py_args:
            quast_py_args.remove(arg)
    if json_texts is not None:
        json_texts.append(json_saver.json_text)
    search_references_meta.is_quast_first_run = False

    genome_info_dirpath = os.path.join(output_dirpath, qconfig.combined_output_name,
                                       'genome_stats')
    genome_info_fpath = os.path.join(genome_info_dirpath, 'genome_info.txt')
    if not os.path.exists(genome_info_fpath):
        logger.main_info('')
        logger.main_info(
            'Failed aligning the contigs for all the references. ' +
            ('Try to restart MetaQUAST with another references.'
             if not downloaded_refs else
             'Try to use option --max-ref-number to change maximum number of references '
             '(per each assembly) to download.'))
        logger.main_info('')
        quast._cleanup(corrected_dirpath)
        logger.main_info('MetaQUAST finished.')
        logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
        return

    if downloaded_refs:
        logger.main_info()
        logger.main_info(
            'Excluding downloaded references with low genome fraction from further analysis..'
        )
        corr_ref_fpaths = remove_unaligned_downloaded_refs(genome_info_fpath, ref_fpaths,
                                                           chromosomes_by_refs)
        if corr_ref_fpaths and corr_ref_fpaths != ref_fpaths:
            logger.main_info()
            logger.main_info('Filtered reference(s):')
            os.remove(combined_ref_fpath)
            contigs_analyzer.ref_labels_by_chromosomes = {}
            corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_names =\
                _correct_references(corr_ref_fpaths, corrected_dirpath)
            run_name = 'for the corrected combined reference'
            logger.main_info()
            logger.main_info('Starting quast.py ' + run_name + '...')
            return_code, total_num_notifications, assemblies, labels = _start_quast_main(
                run_name,
                quast_py_args + ["--ambiguity-usage"] + ['all'],
                assemblies=assemblies,
                reference_fpath=combined_ref_fpath,
                output_dirpath=combined_output_dirpath,
                num_notifications_tuple=total_num_notifications,
                is_first_run=True)
            if json_texts is not None:
                # replace the JSON of the first combined run with the new one
                json_texts = json_texts[:-1]
                json_texts.append(json_saver.json_text)
        elif corr_ref_fpaths == ref_fpaths:
            logger.main_info(
                'All downloaded references have genome fraction more than 10%. Nothing was excluded.'
            )
        else:
            logger.main_info(
                'All downloaded references have low genome fraction. Nothing was excluded for now.'
            )

    quast_py_args += ['--no-check-meta']
    qconfig.contig_thresholds = ','.join([
        str(threshold) for threshold in qconfig.contig_thresholds
        if threshold > qconfig.min_contig
    ])
    if not qconfig.contig_thresholds:
        qconfig.contig_thresholds = 'None'
    quast_py_args = __remove_from_quast_py_args(quast_py_args, '--contig-thresholds',
                                                qconfig.contig_thresholds)
    quast_py_args += ['--contig-thresholds']
    quast_py_args += [qconfig.contig_thresholds]
    quast_py_args.remove('--combined-ref')

    logger.main_info()
    logger.main_info('Partitioning contigs into bins aligned to each reference..')

    assemblies_by_reference, not_aligned_assemblies = _partition_contigs(
        assemblies, corrected_ref_fpaths, corrected_dirpath,
        os.path.join(combined_output_dirpath, 'contigs_reports', 'alignments_%s.tsv'),
        labels)

    ref_names = []
    output_dirpath_per_ref = os.path.join(output_dirpath, qconfig.per_ref_dirname)
    for ref_fpath, ref_assemblies in assemblies_by_reference:
        ref_name = qutils.name_from_fpath(ref_fpath)
        logger.main_info('')
        if not ref_assemblies:
            logger.main_info('No contigs were aligned to the reference ' + ref_name +
                             ', skipping..')
        else:
            ref_names.append(ref_name)
            run_name = 'for the contigs aligned to ' + ref_name
            logger.main_info('Starting quast.py ' + run_name)

            return_code, total_num_notifications = _start_quast_main(
                run_name,
                quast_py_args,
                assemblies=ref_assemblies,
                reference_fpath=ref_fpath,
                output_dirpath=os.path.join(output_dirpath_per_ref, ref_name),
                exit_on_exception=False,
                num_notifications_tuple=total_num_notifications)
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    # Finally running for the contigs that has not been aligned to any reference
    no_unaligned_contigs = True
    for assembly in not_aligned_assemblies:
        if os.path.isfile(assembly.fpath) and os.stat(assembly.fpath).st_size != 0:
            no_unaligned_contigs = False
            break

    run_name = 'for the contigs not aligned anywhere'
    logger.main_info()
    if no_unaligned_contigs:
        logger.main_info('Skipping quast.py ' + run_name + ' (everything is aligned!)')
    else:
        logger.main_info('Starting quast.py ' + run_name + '...')

        return_code, total_num_notifications = _start_quast_main(
            run_name,
            quast_py_args,
            assemblies=not_aligned_assemblies,
            output_dirpath=os.path.join(output_dirpath, qconfig.not_aligned_name),
            exit_on_exception=False,
            num_notifications_tuple=total_num_notifications)

        if return_code not in [0, 4]:
            logger.error('Error running quast.py for the contigs not aligned anywhere')
        elif return_code == 4:  # no unaligned contigs, i.e. everything aligned
            no_unaligned_contigs = True
        if not no_unaligned_contigs:
            if json_texts is not None:
                json_texts.append(json_saver.json_text)

    if ref_names:
        logger.print_timestamp()
        logger.main_info("Summarizing results...")

        summary_output_dirpath = os.path.join(output_dirpath, qconfig.meta_summary_dir)
        if not os.path.isdir(summary_output_dirpath):
            os.makedirs(summary_output_dirpath)
        if html_report and json_texts:
            from libs.html_saver import html_saver
            html_summary_report_fpath = html_saver.init_meta_report(output_dirpath)
        else:
            html_summary_report_fpath = None
        from libs import create_meta_summary
        metrics_for_plots = reporting.Fields.main_metrics
        misassembl_metrics = [
            reporting.Fields.MIS_RELOCATION,
            reporting.Fields.MIS_TRANSLOCATION,
            reporting.Fields.MIS_INVERTION,
            reporting.Fields.MIS_ISTRANSLOCATIONS
        ]
        create_meta_summary.do(
            html_summary_report_fpath, summary_output_dirpath, combined_output_dirpath,
            output_dirpath_per_ref, metrics_for_plots, misassembl_metrics,
            ref_names if no_unaligned_contigs else ref_names + [qconfig.not_aligned_name])
        if html_report and json_texts:
            from libs import plotter
            html_saver.save_colors(output_dirpath, contigs_fpaths, plotter.dict_color_and_ls,
                                   meta=True)
            html_saver.create_meta_report(output_dirpath, json_texts)

    quast._cleanup(corrected_dirpath)
    logger.main_info('')
    logger.main_info('MetaQUAST finished.')
    logger.finish_up(numbers=tuple(total_num_notifications), check_test=test_mode)
def _correct_references(ref_fpaths, corrected_dirpath):
    """Copy reference sequences into per-reference files and one combined file.

    Every sequence of every reference is appended both to the corrected file
    of its reference and to the combined reference file, under a name made of
    the corrected file name and the first 20 characters of the original
    sequence name.  Unless ``qconfig.no_check`` is set, a reference is
    abandoned at the first sequence that still contains characters other than
    A, C, G, T and N after upper-casing and mapping the ambiguity codes
    M, K, R, Y, W, S, V, B, H, D to N.  Note that the sequence written out is
    the original one, not the upper-cased/mapped copy used for the check.

    Args:
        ref_fpaths: paths to the reference FASTA files.
        corrected_dirpath: directory receiving the corrected files.

    Returns:
        ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)`` where ``chromosomes_by_refs`` maps a reference name to a
        list of ``(corrected sequence name, sequence length)`` pairs and the
        last item is the ``ref_fpaths`` argument itself.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, COMBINED_REF_FNAME)
    chromosomes_by_refs = {}

    # Compiled once for all sequences instead of once per sequence.
    ambiguity_codes_re = re.compile(r'[MKRYWSVBHD]')
    non_acgtn_re = re.compile(r'[^ACGTN]')

    def correct_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # The first sequence of a reference allocates the corrected file;
        # the following ones are appended to that same file.
        if total_references > 1:
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath)
        corr_seq_name += '_' + qutils.correct_name(seq_name[:20])

        if not qconfig.no_check:
            corr_seq = ambiguity_codes_re.sub('N', seq.upper())
            if non_acgtn_re.search(corr_seq):
                logger.warning('Skipping ' + ref_fpath +
                               ' because it contains non-ACGTN characters.',
                               indent=' ')
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
            qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Reference names that occur more than once (single pass, no list.count).
    seen_ref_names = set()
    dupl_ref_names = set()
    for ref_fpath in ref_fpaths:
        ref_name, _ = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))
        if ref_name in seen_ref_names:
            dupl_ref_names.add(ref_name)
        else:
            seen_ref_names.add(ref_name)

    for ref_fpath in ref_fpaths:
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        if ref_name in dupl_ref_names:
            # disambiguate equal file names by their parent directory
            ref_name = get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            corr_seq_name, corr_seq_fpath = correct_seq(
                seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break
        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' +
                             qutils.name_from_fpath(corr_seq_fpath) + '')

    logger.main_info(' All references combined in ' + COMBINED_REF_FNAME)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
       genome_stats_dirpath):
    """Run the genome analyzer for all aligned assemblies.

    Reads the reference, loads genes/operons, processes every assembly in
    parallel with ``process_single_file``, writes 'genome_info.txt' into
    ``genome_stats_dirpath``, fills the per-assembly reports and saves the
    JSON/HTML data and plots that are enabled in ``qconfig``.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths of the assemblies to analyze.
        output_dirpath: directory used by the HTML saver.
        json_output_dirpath: directory for JSON output; falsy to skip it.
        genes_fpaths: files with gene annotations.
        operons_fpaths: files with operon annotations.
        detailed_contigs_reports_dirpath: directory that contains the
            'nucmer_output' directory.
        genome_stats_dirpath: directory for the results; created if missing.

    Returns:
        None.

    NOTE(review): each non-None worker result is indexed here as
    ``(ref_lengths, (results, genes_in_contigs, operons_in_contigs))`` --
    confirm that ``process_single_file`` returns this shape.  The name
    ``ref_lengths_by_contigs`` is not defined in this function and is
    presumably a module-level dict -- confirm.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file; the with-block closes it on every exit path
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.',
                              indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' +
                            container.kind + 's')
                res_file.write(container.kind + 's loaded: ' +
                               str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, reference_chromosomes.keys())

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS,
                                 len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
        from joblib import Parallel, delayed
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Keep every result paired with its own assembly: dropping failed
        # results from one list only would shift the remaining results onto
        # the wrong assemblies.
        successful = [(contigs_fpath, res)
                      for contigs_fpath, res in zip(aligned_contigs_fpaths, process_results)
                      if res]
        if not successful:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return
        analyzed_fpaths = [contigs_fpath for contigs_fpath, res in successful]
        ref_lengths = [res[0] for contigs_fpath, res in successful]
        results_genes_operons_tuples = [res[1] for contigs_fpath, res in successful]

        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.iteritems():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                           ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) +
                       '\n\n')

        # header
        res_file.write('\n\n')
        res_file.write(
            '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
            ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons',
             'partial'))
        res_file.write(
            '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
            ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write(
            '================================================================================================================\n'
        )

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
                analyzed_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|' %
                           (assembly_name[:24], '%3.5f%%' % genome_fraction,
                            '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, analyzed_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, analyzed_fpaths,
                                                'operons', files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_fpaths,
                                                'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, analyzed_fpaths,
                                                'operons', files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(
                len(genes_container.region_list), analyzed_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(
                analyzed_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(
                len(operons_container.region_list), analyzed_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(
                analyzed_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(analyzed_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %', top_value=100)

    logger.main_info('Done.')
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    Reads the Nucmer coords file of the assembly ('<assembly>.coords', or
    '<assembly>.coords.filtered' unless ``qconfig.use_all_alignments`` is set),
    marks the covered reference positions, writes '<assembly>_gaps.txt' and,
    for each non-empty feature container, '<assembly>_genes.txt' /
    '<assembly>_operons.txt' into ``genome_stats_dirpath``.

    Args:
        contigs_fpath: path to the assembly FASTA file.
        index: index of the assembly, used in log messages.
        nucmer_path_dirpath: directory containing the coords files.
        genome_stats_dirpath: directory for the output text files.
        reference_chromosomes: dict mapping chromosome name to its length.
        genes_container: container with the loaded genes.
        operons_container: container with the loaded operons.

    Returns:
        ``(results, genes_in_contigs, operons_in_contigs)``: ``results`` is a
        dict with "covered_bp", "gaps_count" and the full/partial counts per
        feature kind (None when the kind has no regions); the two lists give
        the number of features found in each contig, contigs being ordered by
        decreasing length.  None is returned, after logging an error, when the
        coords file is missing or mentions a chromosome that is not in
        ``reference_chromosomes``.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath +
                     ') not found! Try to restart QUAST.',
                     indent=' ')
        # Report the failure to the caller instead of crashing in open().
        return None

    # One flag per reference position; index 0 is never counted, the loops
    # below work on positions 1..chr_len.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.iteritems():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    with open(nucmer_fpath, 'r') as coordfile:
        # skip the header, which ends with a line of '=' characters
        for line in coordfile:
            if line.startswith('='):
                break

        # EXAMPLE:
        #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
        # =====================================================================================
        #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|  NODE_0_length_6088
        #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|  NODE_0_length_6088
        for line in coordfile:
            if line.strip() == '':
                break
            columns = line.split('|')
            ref_coords = columns[0].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            contig_coords = columns[1].split()
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                # The coverage map has no entry for this name; stop here
                # rather than fail with a KeyError below.
                return None

            aligned_blocks_by_contig_name[contig_name].append(
                AlignedBlock(seqname=chr_name, start=s1, end=e1))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for pos in range(s1, len(chr_mapping)):
                    chr_mapping[pos] = 1
                for pos in range(1, e1 + 1):
                    chr_mapping[pos] = 1
            else:
                for pos in range(s1, e1 + 1):
                    chr_mapping[pos] = 1

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.iteritems():
            print >> gaps_file, chr_name
            cur_gap_size = 0
            for pos in range(1, chr_len + 1):
                if genome_mapping[chr_name][pos] == 1:
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        print >> gaps_file, pos - cur_gap_size, pos - 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            # a gap that reaches the end of the chromosome
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                print >> gaps_file, chr_len - cur_gap_size + 1, chr_len

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        with open(found_fpath, 'w') as found_file:
            print >> found_file, '%s\t\t%s\t%s' % ('ID or #', 'Start', 'End')
            print >> found_file, '============================'

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            blocks = [
                                AlignedBlock(seqname=cur_block.seqname,
                                             start=cur_block.start,
                                             end=region.end + 1),
                                AlignedBlock(seqname=cur_block.seqname,
                                             start=1,
                                             end=cur_block.end)
                            ]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                region_id = str(region.id)
                                if region_id == 'None':
                                    region_id = '# ' + str(region.number + 1)
                                print >> found_file, '%s\t\t%d\t%d' % (
                                    region_id, region.start, region.end)
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and min(region.end, block.end) - max(
                                    region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return results, genes_in_contigs, operons_in_contigs