def get_assemblies_data(contigs_fpaths, icarus_dirpath, stdout_pattern, nx_marks):
    """Build the JavaScript snippets describing every assembly.

    For each path in *contigs_fpaths* the report fields TOTALLEN, CONTIGS and
    N50 are read and rendered as JavaScript assignments keyed by the assembly
    label. When *stdout_pattern* is truthy, a link to the per-assembly
    ``.stdout`` file (made relative to *icarus_dirpath*) is emitted as well.

    Returns a 3-tuple:
        * the variable declarations followed by the ``assemblies_links`` lines,
        * the ``assemblies_len`` / ``assemblies_contigs`` / ``assemblies_n50`` lines,
        * a ``defaultdict(dict)`` mapping label -> {nx mark -> report field value}.
    """
    n50_by_label = defaultdict(dict)
    declaration_lines = [
        'var assemblies_links = {};\n',
        'var assemblies_len = {};\n',
        'var assemblies_contigs = {};\n',
        'var assemblies_misassemblies = {};\n',
        'var assemblies_n50 = {};\n',
    ]
    size_lines = []

    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath(contigs_fpath)
        report = reporting.get(contigs_fpath)
        total_len = report.get_field(reporting.Fields.TOTALLEN)
        contigs_num = report.get_field(reporting.Fields.CONTIGS)
        n50_value = report.get_field(reporting.Fields.N50)

        if stdout_pattern:
            stdout_fpath = stdout_pattern % qutils.label_from_fpath_for_fname(contigs_fpath) + '.stdout'
            stdout_fpath = qutils.relpath(stdout_fpath, icarus_dirpath)
            declaration_lines.append('assemblies_links["' + label + '"] = "' + stdout_fpath + '";\n')

        size_lines.append('assemblies_len["' + label + '"] = ' + str(total_len) + ';\n')
        size_lines.append('assemblies_contigs["' + label + '"] = ' + str(contigs_num) + ';\n')
        # N50 is emitted as a quoted string, unlike the two numeric values above.
        size_lines.append('assemblies_n50["' + label + '"] = "' + str(n50_value) + '";\n')

        # Plain item assignment: a label gets an entry only if nx_marks is non-empty.
        for nx_mark in nx_marks:
            n50_by_label[label][nx_mark] = report.get_field(nx_mark)

    return ''.join(declaration_lines), ''.join(size_lines), n50_by_label
def create_labels(chr_lengths, assemblies, features_containers, coverage_fpath, output_dir):
    """Write ``labels.txt`` and ``label.conf`` for the Circos track labels.

    Tracks are numbered consecutively from 0: one per assembly
    (``assembly1``, ``assembly2``, ...), then one per feature container with a
    non-empty ``region_list`` (named by its ``kind``), then ``coverage`` when
    *coverage_fpath* is truthy.

    Returns ``(path to label.conf, [(track name, track index), ...])``.
    """
    track_names = ['assembly' + str(number) for number, _assembly in enumerate(assemblies, 1)]
    for container in features_containers:
        if len(container.region_list) > 0:
            track_names.append(container.kind)
    if coverage_fpath:
        track_names.append('coverage')
    track_labels = [(track_name, track_idx) for track_idx, track_name in enumerate(track_names)]

    # labels.txt: a single record anchored at the first chromosome, no trailing newline.
    labels_txt_fpath = join(output_dir, 'labels.txt')
    first_chr_name = list(chr_lengths.keys())[0]
    track_values = ','.join(['track%d=%s' % (track_idx, track_name) for track_name, track_idx in track_labels])
    with open(labels_txt_fpath, 'w') as labels_f:
        labels_f.write(first_chr_name + '\t0\t0\tnull\t' + track_values)

    labels_conf_fpath = join(output_dir, 'label.conf')
    conf_lines = [
        'z = 10\n',
        'type = text\n',
        'label_size = 30p\n',
        'label_font = bold\n',
        'label_parallel = yes\n',
        'file = ' + relpath(labels_txt_fpath, output_dir) + '\n',
        'r0 = eval(sprintf("%fr+5p", conf(conf(., track_idx)_pos)))\n',
        'r1 = eval(sprintf("%fr+500p", conf(conf(., track_idx)_pos)))\n',
        '<rules>\n',
        '<rule>\n',
        'condition = 1\n',
        'value = eval(var(conf(., track_idx)))\n',
        '</rule>\n',
        '</rules>\n',
    ]
    with open(labels_conf_fpath, 'w') as conf_f:
        conf_f.write(''.join(conf_lines))
    return labels_conf_fpath, track_labels
def create_housekeeping_file(chr_lengths, max_points, root_dir, output_dir, logger):
    """Create a customised copy of Circos' ``housekeeping.conf`` and return its include line.

    The template is looked for next to the ``circos`` executable, first in
    ``../libexec/etc`` and then in ``../etc``. Its ``max_points_per_track`` and
    ``max_ideograms`` lines are replaced with *max_points* and the number of
    entries in *chr_lengths*; every other line is copied unchanged into
    ``<output_dir>/housekeeping.conf``.

    Returns a ``'<<include ...>>\\n'`` line: pointing at the generated file
    (relative to *root_dir*), or at the stock ``etc/housekeeping.conf`` when
    Circos or its template cannot be found (a warning is logged in that case).
    """
    max_ideograms = len(chr_lengths)

    # Resolve the executable once; the result is reused for both the template
    # search and the choice of warning message.
    circos_bin_fpath = get_path_to_program('circos')
    template_fpath = None
    if circos_bin_fpath:
        circos_dirpath = dirname(realpath(circos_bin_fpath))
        candidate_fpaths = [
            join(circos_dirpath, '..', 'libexec', 'etc', 'housekeeping.conf'),
            join(circos_dirpath, '..', 'etc', 'housekeeping.conf'),
        ]
        for candidate_fpath in candidate_fpaths:
            if is_non_empty_file(candidate_fpath):
                template_fpath = candidate_fpath
                break

    if template_fpath is None:
        if not circos_bin_fpath:
            msg = 'Circos is not found.'
        else:
            msg = 'File etc/housekeeping.conf is not found.'
        logger.warning(msg + ' You will have to manually edit etc/housekeeping.conf: '
                       'set max_points_per_track to ' + str(max_points) +
                       ' and max_ideograms to ' + str(max_ideograms))
        return '<<include %s>>\n' % join('etc', 'housekeeping.conf')

    housekeeping_fpath = join(output_dir, 'housekeeping.conf')
    with open(template_fpath) as template_f, open(housekeeping_fpath, 'w') as out_f:
        for line in template_f:
            # Any line mentioning the setting name is replaced as a whole.
            if 'max_points_per_track' in line:
                out_f.write('max_points_per_track = %d\n' % max_points)
            elif 'max_ideograms' in line:
                out_f.write('max_ideograms = %d\n' % max_ideograms)
            else:
                out_f.write(line)
    return '<<include %s>>\n' % relpath(housekeeping_fpath, root_dir)
from quast_libs.qutils import compile_tool, check_prev_compilation_failed

# Directories of the helper tools, all located under qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')
bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
manta_dirpath = join(qconfig.LIBS_LOCATION, 'manta')
manta_build_dirpath = join(qconfig.LIBS_LOCATION, 'manta', 'build')
manta_bin_dirpath = join(qconfig.LIBS_LOCATION, 'manta', 'build', 'bin')
config_manta_fpath = join(manta_bin_dirpath, 'configManta.py')

# Manta archives under QUAST_HOME/external_tools and their URLs, formed by
# appending the QUAST_HOME-relative archive path to qconfig.GIT_ROOT_URL.
manta_external_dirpath = join(qconfig.QUAST_HOME, 'external_tools/manta')
manta_ext_linux_fpath = join(manta_external_dirpath, 'manta_linux.tar.bz2')
manta_ext_osx_fpath = join(manta_external_dirpath, 'manta_osx.tar.bz2')
manta_linux_url = qconfig.GIT_ROOT_URL + qutils.relpath(manta_ext_linux_fpath, qconfig.QUAST_HOME)
manta_osx_url = qconfig.GIT_ROOT_URL + qutils.relpath(manta_ext_osx_fpath, qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Return the path of *fname* inside ``bwa_dirpath`` (existence is not checked)."""
    return join(bwa_dirpath, fname)


def sambamba_fpath(fname):
    """Return the path of *fname* inside ``sambamba_dirpath`` with a platform suffix.

    The suffix is ``_osx`` when ``qconfig.platform_name`` is ``'macosx'`` and
    ``_linux`` for every other platform name.
    """
    platform_suffix = '_osx' if qconfig.platform_name == 'macosx' else '_linux'
    return join(sambamba_dirpath, fname + platform_suffix)


def bedtools_fpath(fname):
    """Return the path of *fname* inside ``bedtools_bin_dirpath`` (existence is not checked)."""
    return join(bedtools_bin_dirpath, fname)
from quast_libs.qutils import compile_tool, get_dir_for_download, relpath, get_path_to_program, download_file, \
    download_external_tool, is_non_empty_file, correct_name, get_free_memory

# Directories of the helper tools, all located under qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')
bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')

# GRIDSS jar: no directory is known at import time (None here);
# NOTE(review): presumably assigned later once the jar is located or downloaded — confirm.
gridss_dirpath = None
gridss_version = '1.4.1'
gridss_fname = 'gridss-' + gridss_version + '.jar'
gridss_external_fpath = join(qconfig.QUAST_HOME, 'external_tools/gridss', gridss_fname)
# URL formed by appending the QUAST_HOME-relative jar path to qconfig.GIT_ROOT_URL.
gridss_url = qconfig.GIT_ROOT_URL + relpath(gridss_external_fpath, qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Resolve the BWA program *fname* via ``get_path_to_program``, passing ``bwa_dirpath``."""
    return get_path_to_program(fname, bwa_dirpath)


def sambamba_fpath(fname):
    """Return the path of *fname* inside ``sambamba_dirpath`` with a platform suffix.

    The suffix is ``_osx`` when ``qconfig.platform_name`` is ``'macosx'`` and
    ``_linux`` for every other platform name.
    """
    platform_suffix = '_osx' if qconfig.platform_name == 'macosx' else '_linux'
    return join(sambamba_dirpath, fname + platform_suffix)


def bedtools_fpath(fname):
    """Resolve the bedtools program *fname* via ``get_path_to_program``, passing ``bedtools_bin_dirpath``."""
    return get_path_to_program(fname, bedtools_bin_dirpath)
def find_package_files(dirpath, package=quast_package):
    """List every file found under ``<package>/<dirpath>``, recursively.

    The returned paths are made relative to *package* with ``qutils.relpath``
    and appear in ``os.walk`` order.
    """
    walk_root = join(package, dirpath)
    return [
        qutils.relpath(join(current_dir, fname), package)
        for current_dir, _subdirs, fnames in os.walk(walk_root)
        for fname in fnames
    ]
import urllib.request as urllib
import xml.etree.ElementTree as ET
import socket

# Default timeout (in seconds) applied to every socket created afterwards.
socket.setdefaulttimeout(120)

# Header patterns with named groups: SILVA headers yield `taxons` and `seqname`,
# NCBI headers yield `id` and `seqname`.
silva_pattern = re.compile(r'\S+\_(?P<taxons>\S+);(?P<seqname>\S+)', re.I)
ncbi_pattern = re.compile(r'(?P<id>\S+\_[0-9.]+)[_ |](?P<seqname>\S+)', re.I)

# Location and file name of the SILVA release 123 export.
silva_db_url = 'http://www.arb-silva.de/fileadmin/silva_databases/release_123/Exports/'
silva_fname = 'SILVA_123_SSURef_Nr99_tax_silva.fasta'

# BLAST binaries: platform-specific directory under QUAST_HOME/external_tools and the
# URL formed by appending its QUAST_HOME-relative path to qconfig.GIT_ROOT_URL.
external_tools_dirpath = join(qconfig.QUAST_HOME, 'external_tools')
blast_external_tools_dirpath = join(external_tools_dirpath, 'blast', qconfig.platform_name)
blast_filenames = ['makeblastdb', 'blastn']
blast_dirpath_url = qconfig.GIT_ROOT_URL + qutils.relpath(blast_external_tools_dirpath, qconfig.QUAST_HOME)

# BLAST directory and database location under qconfig.LIBS_LOCATION.
blast_dirpath = join(qconfig.LIBS_LOCATION, 'blast')
blastdb_dirpath = join(qconfig.LIBS_LOCATION, 'blast', '16S_RNA_blastdb')
db_fpath = join(blastdb_dirpath, 'silva.db')
# NOTE(review): presumably the expected size in bytes of the database .nsq file — confirm against its users.
db_nsq_fsize = 194318557

# Module-level state; initial values only, updated elsewhere.
is_quast_first_run = False
taxons_for_krona = {}
connection_errors = 0


def get_blast_fpath(fname):
    """Return the path of *fname* inside ``blast_dirpath`` when that path exists.

    NOTE(review): as visible here the function falls through (returning None)
    when the file is missing; the definition may continue beyond this excerpt
    with a fallback lookup — confirm before relying on the None result.
    """
    blast_path = os.path.join(blast_dirpath, fname)
    if os.path.exists(blast_path):
        return blast_path
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Parameters:
        is_cyclic: passed through to ``analyze_contigs``.
        index: assembly index, used only to prefix log messages.
        contigs_fpath: assembly FASTA to align and analyze.
        output_dirpath: directory receiving the per-assembly reports.
        ref_fpath: reference FASTA, passed to ``align_contigs``.
        reference_chromosomes: mapping chromosome name -> length.
        ns_by_chromosomes: passed through to ``analyze_coverage``.
        old_contigs_fpath: passed through to ``align_contigs``.
        bed_fpath: not used by this function body.
        threads: passed through to ``align_contigs``.

    When ``qconfig.space_efficient`` is set, all report files are redirected
    to ``/dev/null``.

    Returns a 5-tuple ``(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)``. If the aligner
    does not report ``AlignerStatus.OK`` the tuple is ``(status, {}, [], [], [])``;
    if the analysis yields no reference alignments the status is
    ``AlignerStatus.NOT_ALIGNED``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath,
                                 qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath,
                                    qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' +
                                coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        # These reports were opened above and will not be handed to CAOutput on
        # this path, so close them here rather than leaking the handles.
        icarus_out_f.close()
        misassembly_f.close()
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Each reference chromosome counts as one region spanning its whole length.
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # The handles bundled here are released by close_handlers() at the end.
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'), icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, \
        aligned_lengths_by_contigs = analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                                                     aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes,
                                                        used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches, 'indels_list': total_indels_info.indels_list,
                 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # Output misassembled contigs to a separate file.
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        # Extracts length and coverage from contig names containing "_length_<x>_cov_<y>".
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name followed by the aligned contigs.
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # A contig is considered only for the first chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Report contigs aligned over more than 90% of their stated length.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference with Nucmer and analyze the alignments.

    Parameters:
        is_cyclic: passed through to ``analyze_contigs``.
        index: assembly index, used only to prefix log messages.
        contigs_fpath: assembly FASTA to align and analyze.
        output_dirpath: directory receiving the per-assembly reports.
        ref_fpath: reference FASTA; aligned against and loaded for its lengths.
        old_contigs_fpath: passed through to ``align_contigs``.
        bed_fpath: not used by this function body.
        parallel_by_chr, threads: passed through to ``align_contigs``.

    When ``qconfig.space_efficient`` is set, all report files are redirected
    to ``/dev/null``.

    Returns a 5-tuple ``(status, result, aligned_lengths,
    misassemblies_in_contigs, aligned_lengths_by_contigs)``. If the aligner
    does not report ``NucmerStatus.OK`` the tuple is ``(status, {}, [], [], [])``;
    if the analysis yields no reference alignments the status is
    ``NucmerStatus.NOT_ALIGNED``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath,
                                 qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath,
                                    qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info('  ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error('  ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' +
                                coords_fpath + ' doesn\'t exist.\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info('  ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        # These reports were opened above and will not be handed to CAOutput on
        # this path, so close them here rather than leaking the handles.
        icarus_out_f.close()
        misassembly_f.close()
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines of the coords file are copied verbatim to the filtered file.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            # The first blank line ends the alignment records.
            if line.strip() == '':
                break
            assert line[0] != '='
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls: snps[ref][ctg][ref_pos] -> list of SNP records.
    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')
        prev_fields = None
        # NOTE(review): the handle returned by open_gzipsafe is not closed here,
        # as in the original; confirm whether it needs an explicit close().
        for line in open_gzipsafe(show_snps_fpath):
            fields = line.split()
            # Skip lines that do not start with a numeric position.
            if not fields[0].isdigit():
                continue
            # Skip exact repeats of the previously accepted line.
            if prev_fields and fields == prev_fields:
                continue
            ref = fields[10]
            ctg = fields[11]
            pos = int(fields[0])
            loc = int(fields[3])
            snps.setdefault(ref, {}).setdefault(ctg, {}).setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=fields[1], ctg_nucl=fields[2]))
            prev_fields = fields
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Each reference sequence counts as one region spanning its whole length.
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # The handles bundled here are released by close_handlers() at the end.
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, \
        aligned_lengths_by_contigs = analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                                                     aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # Output misassembled contigs to a separate file.
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug('  ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        # Extracts length and coverage from contig names containing "_length_<x>_cov_<y>".
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        used_contigs = set()
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name followed by the aligned contigs.
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # A contig is considered only for the first chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Report contigs aligned over more than 90% of their stated length.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
from quast_libs.fastaparser import get_chr_lengths_from_fastafile
from quast_libs.qutils import compile_tool, get_dir_for_download, relpath, get_path_to_program, download_file, \
    download_external_tool, is_non_empty_file, correct_name, get_total_memory

# Directories of the helper tools, all located under qconfig.LIBS_LOCATION.
bwa_dirpath = join(qconfig.LIBS_LOCATION, 'bwa')
bedtools_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools')
bedtools_bin_dirpath = join(qconfig.LIBS_LOCATION, 'bedtools', 'bin')
lap_dirpath = join(qconfig.LIBS_LOCATION, 'LAP')
sambamba_dirpath = join(qconfig.LIBS_LOCATION, 'sambamba')

# GRIDSS jar: no directory is known at import time (None here);
# NOTE(review): presumably assigned later once the jar is located or downloaded — confirm.
gridss_dirpath = None
gridss_version = '1.4.1'
gridss_fname = 'gridss-' + gridss_version + '.jar'
gridss_external_fpath = join(qconfig.QUAST_HOME, 'external_tools/gridss', gridss_fname)
# URL formed by appending the QUAST_HOME-relative jar path to qconfig.GIT_ROOT_URL.
gridss_url = qconfig.GIT_ROOT_URL + relpath(gridss_external_fpath, qconfig.QUAST_HOME)


def bwa_fpath(fname):
    """Resolve the BWA program *fname* via ``get_path_to_program``, passing ``bwa_dirpath``."""
    return get_path_to_program(fname, bwa_dirpath)


def sambamba_fpath(fname):
    """Return the path of *fname* inside ``sambamba_dirpath`` with a platform suffix.

    The suffix is ``_osx`` when ``qconfig.platform_name`` is ``'macosx'`` and
    ``_linux`` for every other platform name.
    """
    platform_suffix = '_osx' if qconfig.platform_name == 'macosx' else '_linux'
    return join(sambamba_dirpath, fname + platform_suffix)


def bedtools_fpath(fname):
    """Resolve the bedtools program *fname* via ``get_path_to_program``, passing ``bedtools_bin_dirpath``."""
    return get_path_to_program(fname, bedtools_bin_dirpath)
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers,
                cov_fpath, logger):
    """Generate ``circos.conf`` in *output_dir* together with its data files in ``<output_dir>/data``.

    Track layout, from the outside in: one tile track per assembly (with an
    optional mismatch histogram drawn on the same track), one heatmap per
    feature file, an optional coverage histogram, and a GC heatmap at a fixed
    radius.

    Returns ``None`` when ``parse_alignments`` yields no assemblies, otherwise
    ``(path to circos.conf, path to the legend)``.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # Tick unit grows with the longest chromosome.
    chrom_units = 1000
    if max_len >= 10 ** 6:
        chrom_units = 10 ** 5
    elif max_len >= 10 ** 5:
        chrom_units = 10 ** 4
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = [create_alignment_plots(assembly, ref_len, data_dir) for assembly in assemblies]
    if not alignments_fpaths:
        return None

    gc_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = [create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir)
                         for assembly in assemblies]
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, ref_len, data_dir)
    max_points = max([MAX_POINTS, gc_points, gene_points, cov_points, contig_points])
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath, data_dir)

    conf_fpath = join(output_dir, 'circos.conf')
    radius = 0.95

    # Gap after each track; the last track of every group gets the bigger gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.extend([TRACK_INTERVAL] * len(feature_fpaths))
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL

    def in_output_dir(fpath):
        # Paths inside circos.conf are written relative to output_dir.
        return relpath(fpath, output_dir)

    def track_radii(track_no):
        # r0/r1 lines placing a plot on the given track.
        return ['r0 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos) - conf(track_width)))\n',
                'r1 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos)))\n']

    with open(conf_fpath, 'w') as out_f:
        out_f.writelines([
            '<<include etc/colors_fonts_patterns.conf>>\n',
            '<<include %s>>\n' % in_output_dir(ideogram_fpath),
            '<<include %s>>\n' % in_output_dir(ticks_fpath),
            'karyotype = %s\n' % in_output_dir(karyotype_fpath),
            'chromosomes_units = %d\n' % chrom_units,
            'chromosomes_display_default = yes\n',
            'track_width = ' + str(TRACK_WIDTH) + '\n',
        ])
        # One position per track plus a final one past the last interval.
        for track_no, interval in enumerate(track_intervals):
            out_f.write('track%d_pos = %f\n' % (track_no, radius))
            radius -= TRACK_WIDTH
            radius -= interval
        out_f.write('track%d_pos = %f\n' % (len(track_intervals), radius))

        out_f.writelines([
            '<image>\n',
            'dir = %s\n' % output_dir,
            'file = %s\n' % circos_png_fname,
            'png = yes\n',
            'svg = no\n',
            'radius = 1500p\n',
            'angle_offset = -90\n',
            'auto_alpha_colors = yes\n',
            'auto_alpha_steps = 5\n',
            'background = white\n',
            '</image>\n',
        ])

        if qconfig.is_combined_ref:
            out_f.write('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            out_f.writelines([
                '<highlight>\n',
                'file = %s\n' % in_output_dir(highlights_fpath),
                'r0 = 1r - 50p\n',
                'r1 = 1r - 30p\n',
                '</highlight>\n',
                '</highlights>\n',
            ])

        out_f.write(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        out_f.writelines(['<plots>\n', 'layers_overflow = collapse\n'])

        # Text labels, one plot per labelled track.
        for _label, track_no in track_labels:
            out_f.writelines([
                '<plot>\n',
                'track_idx = track%d\n' % track_no,
                '<<include %s>>\n' % in_output_dir(labels_fpath),
                '</plot>\n',
            ])

        # Assembly tracks occupy plot indices 0 .. len(alignments_fpaths) - 1.
        for plot_idx, alignments_conf in enumerate(alignments_fpaths):
            out_f.writelines([
                '<plot>\n',
                'type = tile\n',
                'thickness = 50p\n',
                'stroke_thickness = 0\n',
                'layers = 1\n',
                'file = %s\n' % in_output_dir(alignments_conf),
            ] + track_radii(plot_idx) + ['</plot>\n'])
            if mismatches_fpaths and mismatches_fpaths[plot_idx]:
                # Mismatch histogram shares the track of its assembly.
                out_f.writelines([
                    '<plot>\n',
                    'type = histogram\n',
                    'thickness = 1\n',
                    'fill_color = vlyellow\n',
                    'file = %s\n' % in_output_dir(mismatches_fpaths[plot_idx]),
                ] + track_radii(plot_idx) + ['</plot>\n'])

        # genes plots follow the assembly tracks
        first_feature_idx = len(alignments_fpaths)
        for plot_idx, feature_fpath in enumerate(feature_fpaths, first_feature_idx):
            out_f.writelines([
                '<plot>\n',
                'type = heatmap\n',
                'file = %s\n' % in_output_dir(feature_fpath),
                'color = ylorbr-9\n',
            ] + track_radii(plot_idx) + ['</plot>\n'])

        if cov_data_fpath:
            # coverage plot comes after all feature tracks
            coverage_idx = first_feature_idx + len(feature_fpaths)
            out_f.writelines([
                '<plot>\n',
                'type = histogram\n',
                'thickness = 1\n',
                'file = %s\n' % in_output_dir(cov_data_fpath),
                'fill_color = vlblue\n',
            ] + track_radii(coverage_idx) + ['</plot>\n'])

        # GC plot at a fixed radius
        out_f.writelines([
            '<plot>\n',
            'type = heatmap\n',
            'file = %s\n' % in_output_dir(gc_fpath),
            'color = greys-6\n',
            'scale_log_base = 1.5\n',
            'r0 = 1r - 29p\n',
            'r1 = 1r - 1p\n',
            '</plot>\n',
            '</plots>\n',
        ])

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath, output_dir)
    return conf_fpath, circos_legend_fpath
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers,
                cov_fpath, logger):
    """Generate ``circos.conf`` in *output_dir* together with its data files in ``<output_dir>/data``.

    Track layout, from the outside in: one tile track per assembly (with an
    optional mismatch histogram drawn on the same track), one heatmap per
    feature file, an optional coverage histogram, and a GC heatmap at a fixed
    radius. The coverage plot is built from the per-chromosome lengths.

    Returns ``None`` when ``parse_alignments`` yields no assemblies, otherwise
    ``(path to circos.conf, path to the legend)``.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # Tick unit grows with the longest chromosome.
    chrom_units = 1000
    if max_len >= 10 ** 6:
        chrom_units = 10 ** 5
    elif max_len >= 10 ** 5:
        chrom_units = 10 ** 4
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = [create_alignment_plots(assembly, ref_len, data_dir) for assembly in assemblies]
    if not alignments_fpaths:
        return None

    gc_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = [create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir)
                         for assembly in assemblies]
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, chr_lengths, data_dir)
    max_points = max([MAX_POINTS, gc_points, gene_points, cov_points, contig_points])
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath, data_dir)

    conf_fpath = join(output_dir, 'circos.conf')
    radius = 0.95

    # Gap after each track; the last track of every group gets the bigger gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.extend([TRACK_INTERVAL] * len(feature_fpaths))
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL

    def in_output_dir(fpath):
        # Paths inside circos.conf are written relative to output_dir.
        return relpath(fpath, output_dir)

    def track_radii(track_no):
        # r0/r1 lines placing a plot on the given track.
        return ['r0 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos) - conf(track_width)))\n',
                'r1 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos)))\n']

    with open(conf_fpath, 'w') as out_f:
        out_f.writelines([
            '<<include etc/colors_fonts_patterns.conf>>\n',
            '<<include %s>>\n' % in_output_dir(ideogram_fpath),
            '<<include %s>>\n' % in_output_dir(ticks_fpath),
            'karyotype = %s\n' % in_output_dir(karyotype_fpath),
            'chromosomes_units = %d\n' % chrom_units,
            'chromosomes_display_default = yes\n',
            'track_width = ' + str(TRACK_WIDTH) + '\n',
        ])
        # One position per track plus a final one past the last interval.
        for track_no, interval in enumerate(track_intervals):
            out_f.write('track%d_pos = %f\n' % (track_no, radius))
            radius -= TRACK_WIDTH
            radius -= interval
        out_f.write('track%d_pos = %f\n' % (len(track_intervals), radius))

        out_f.writelines([
            '<image>\n',
            'dir = %s\n' % output_dir,
            'file = %s\n' % circos_png_fname,
            'png = yes\n',
            'svg = no\n',
            'radius = 1500p\n',
            'angle_offset = -90\n',
            'auto_alpha_colors = yes\n',
            'auto_alpha_steps = 5\n',
            'background = white\n',
            '</image>\n',
        ])

        if qconfig.is_combined_ref:
            out_f.write('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            out_f.writelines([
                '<highlight>\n',
                'file = %s\n' % in_output_dir(highlights_fpath),
                'r0 = 1r - 50p\n',
                'r1 = 1r - 30p\n',
                '</highlight>\n',
                '</highlights>\n',
            ])

        out_f.write(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        out_f.writelines(['<plots>\n', 'layers_overflow = collapse\n'])

        # Text labels, one plot per labelled track.
        for _label, track_no in track_labels:
            out_f.writelines([
                '<plot>\n',
                'track_idx = track%d\n' % track_no,
                '<<include %s>>\n' % in_output_dir(labels_fpath),
                '</plot>\n',
            ])

        # Assembly tracks occupy plot indices 0 .. len(alignments_fpaths) - 1.
        for plot_idx, alignments_conf in enumerate(alignments_fpaths):
            out_f.writelines([
                '<plot>\n',
                'type = tile\n',
                'thickness = 50p\n',
                'stroke_thickness = 0\n',
                'layers = 1\n',
                'file = %s\n' % in_output_dir(alignments_conf),
            ] + track_radii(plot_idx) + ['</plot>\n'])
            if mismatches_fpaths and mismatches_fpaths[plot_idx]:
                # Mismatch histogram shares the track of its assembly.
                out_f.writelines([
                    '<plot>\n',
                    'type = histogram\n',
                    'thickness = 1\n',
                    'fill_color = vlyellow\n',
                    'file = %s\n' % in_output_dir(mismatches_fpaths[plot_idx]),
                ] + track_radii(plot_idx) + ['</plot>\n'])

        # genes plots follow the assembly tracks
        first_feature_idx = len(alignments_fpaths)
        for plot_idx, feature_fpath in enumerate(feature_fpaths, first_feature_idx):
            out_f.writelines([
                '<plot>\n',
                'type = heatmap\n',
                'file = %s\n' % in_output_dir(feature_fpath),
                'color = ylorbr-9\n',
            ] + track_radii(plot_idx) + ['</plot>\n'])

        if cov_data_fpath:
            # coverage plot comes after all feature tracks
            coverage_idx = first_feature_idx + len(feature_fpaths)
            out_f.writelines([
                '<plot>\n',
                'type = histogram\n',
                'thickness = 1\n',
                'file = %s\n' % in_output_dir(cov_data_fpath),
                'fill_color = vlblue\n',
            ] + track_radii(coverage_idx) + ['</plot>\n'])

        # GC plot at a fixed radius
        out_f.writelines([
            '<plot>\n',
            'type = heatmap\n',
            'file = %s\n' % in_output_dir(gc_fpath),
            'color = greys-6\n',
            'scale_log_base = 1.5\n',
            'r0 = 1r - 29p\n',
            'r1 = 1r - 1p\n',
            '</plot>\n',
            '</plots>\n',
        ])

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath, output_dir)
    return conf_fpath, circos_legend_fpath
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath,
                      bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the resulting alignments.

    The contigs are aligned with ``align_contigs``; the coords file it produces
    is parsed into ``Mapping`` objects and handed to ``analyze_contigs`` and
    ``analyze_coverage``.  Per-assembly log/report files are written into
    ``output_dirpath`` unless ``qconfig.space_efficient`` is set, in which case
    they are redirected to ``/dev/null``.

    Parameters:
        is_cyclic: forwarded unchanged to ``analyze_contigs``.
        index: assembly index, used only to prefix log messages
            (via ``qutils.index_to_str``).
        contigs_fpath: path to the assembly FASTA being analyzed.
        output_dirpath: directory receiving the reports and log files.
        ref_fpath: reference path, forwarded to ``align_contigs``.
        reference_chromosomes: mapping of chromosome name -> sequence length.
        ns_by_chromosomes: forwarded unchanged to ``analyze_coverage``.
        old_contigs_fpath: forwarded unchanged to ``align_contigs``.
        bed_fpath: not used by this function; kept for interface compatibility.
        threads: forwarded to ``align_contigs``.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``.  If the aligner does not report
        ``AlignerStatus.OK`` the tuple is ``(status, {}, [], [], [])``.
        Otherwise ``status`` is ``AlignerStatus.NOT_ALIGNED`` when
        ``ref_aligns`` is empty and ``AlignerStatus.OK`` when it is not.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(
            output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(
            output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(
            output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(
            output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(
            output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    # These files are created (and the Icarus header written) before the
    # aligner runs, so they exist even when the alignment fails.
    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = [
        'S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group'
    ]
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(
        out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath,
                           old_contigs_fpath, index, threads, log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        # We return early and never reach close_handlers(), so release the
        # handles opened above here instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(
                    ' ' + qutils.index_to_str(index) + 'Failed aligning contigs ' +
                    qutils.label_from_fpath(contigs_fpath) +
                    ' to the reference (non-zero exit code). ' +
                    ('Run with the --debug flag to see additional information.'
                     if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath +
                    ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(
                    qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        return status, {}, [], [], []

    # NOTE(review): log_out_f and the handles below are handed to CAOutput and
    # are presumably closed by close_handlers(ca_output) -- confirm.
    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files: group parsed mappings by contig name
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # The whole reference is used: every chromosome counts as one region.
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(
        ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {
        'SNPs': total_indels_info.mismatches,
        'indels_list': total_indels_info.indels_list,
        'total_aligned_bases': total_aligned_bases
    }
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # Output the misassembled contigs to a separate FASTA file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
            fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(
            output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(
            output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' +
                     qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once for all contigs; extracts the <len> and <cov> fields
        # from contig names containing "_length_<len>_cov_<cov>".
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: name, then every aligned contig
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first
                        # chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_matches = len_cov_pattern.findall(contig)
                        if len_cov_matches:
                            contig_len, contig_cov = len_cov_matches[0]
                            # Report contigs aligned over more than 90% of the
                            # length encoded in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) +
                                                       '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference with nucmer and analyze the result.

    The contigs are aligned with ``align_contigs``; the coords file is parsed
    into ``Mapping`` objects, reference lengths are read from ``ref_fpath``,
    SNPs are loaded when ``qconfig.show_snps`` is set, and everything is handed
    to ``analyze_contigs`` and ``analyze_coverage``.  Per-assembly log/report
    files are written into ``output_dirpath`` unless ``qconfig.space_efficient``
    is set, in which case they are redirected to ``/dev/null``.

    Parameters:
        is_cyclic: forwarded unchanged to ``analyze_contigs``.
        index: assembly index, used only to prefix log messages
            (via ``qutils.index_to_str``).
        contigs_fpath: path to the assembly FASTA being analyzed.
        output_dirpath: directory receiving the reports and log files.
        ref_fpath: reference FASTA; read here for sequence lengths and
            forwarded to ``align_contigs``.
        old_contigs_fpath: forwarded unchanged to ``align_contigs``.
        bed_fpath: not used by this function; kept for interface compatibility.
        parallel_by_chr: forwarded unchanged to ``align_contigs``.
        threads: forwarded to ``align_contigs``.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``.  If the aligner does not report
        ``NucmerStatus.OK`` the tuple is ``(status, {}, [], [], [])``.
        Otherwise ``status`` is ``NucmerStatus.NOT_ALIGNED`` when ``ref_aligns``
        is empty and ``NucmerStatus.OK`` when it is not.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    # These files are created (and the Icarus header written) before the
    # aligner runs, so they exist even when the alignment fails.
    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        # We return early and never reach close_handlers(), so release the
        # handles opened above here instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.'
                              if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath +
                                ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    # NOTE(review): log_out_f and the handles passed to CAOutput below are
    # presumably closed by close_handlers(ca_output) -- confirm.
    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines of the coords file are copied verbatim into the
        # filtered coords file; they are not parsed as alignments.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            # A blank line terminates the alignment records
            if line.strip() == '':
                break
            assert line[0] != '='
            # Group the parsed mappings by contig name
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}  # ref name -> contig name -> ref position -> list of SNP
    if qconfig.show_snps:
        prev_line = None
        # NOTE(review): the handle returned by open_gzipsafe() is never closed
        # explicitly here; left as in the original.
        for line in open_gzipsafe(show_snps_fpath):
            line = line.split()
            # Skip blank lines and anything not starting with a numeric position
            if not line or not line[0].isdigit():
                continue
            # Skip exact consecutive duplicates
            if prev_line and line == prev_line:
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])  # position on the reference
            loc = int(line[3])  # position on the contig
            ctg_snps = snps.setdefault(ref, {}).setdefault(ctg, {})
            ctg_snps.setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # Output the misassembled contigs to a separate FASTA file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once for all contigs; extracts the <len> and <cov> fields
        # from contig names containing "_length_<len>_cov_<cov>".
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: name, then every aligned contig
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first
                        # chromosome it is seen on.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_matches = len_cov_pattern.findall(contig)
                        if len_cov_matches:
                            contig_len, contig_cov = len_cov_matches[0]
                            # Report contigs aligned over more than 90% of the
                            # length encoded in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) +
                                                       '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
import xml.etree.ElementTree as ET import socket socket.setdefaulttimeout(120) silva_pattern = re.compile(r'\S+\_(?P<taxons>\S+);(?P<seqname>\S+)', re.I) ncbi_pattern = re.compile(r'(?P<id>\S+\_[0-9.]+)[_ |](?P<seqname>\S+)', re.I) silva_db_url = 'http://www.arb-silva.de/fileadmin/silva_databases/release_123/Exports/' silva_fname = 'SILVA_123_SSURef_Nr99_tax_silva.fasta' silva_id = '123' silva_downloaded_fname = 'silva.' + silva_id + '.db' external_tools_dirpath = join(qconfig.QUAST_HOME, 'external_tools') blast_external_tools_dirpath = join(external_tools_dirpath, 'blast', qconfig.platform_name) blast_filenames = ['makeblastdb', 'blastn'] blast_dirpath_url = qconfig.GIT_ROOT_URL + qutils.relpath(blast_external_tools_dirpath, qconfig.QUAST_HOME) blast_dirpath = None blastdb_dirpath = None db_fpath = None db_nsq_fsize = 194318557 is_quast_first_run = False taxons_for_krona = {} connection_errors = 0 def get_blast_fpath(fname): if blast_dirpath: blast_path = os.path.join(blast_dirpath, fname) if os.path.exists(blast_path):