def downsample_kmers(tmp_dirpath, ref_fpath, kmc_db_fpath, kmer_len, log_fpath, err_fpath):
    """Select a sparse subset of reference k-mers that survive ``filter_contigs``.

    For every sequence of ``ref_fpath`` all k-mers of length ``kmer_len`` are
    written to ``kmers_<chrom>.fasta`` (named by their start position), passed
    through ``filter_contigs`` together with ``kmc_db_fpath``, and then thinned
    so that two consecutive selected k-mers start at least ``KMERS_INTERVAL``
    positions apart.

    Returns ``(ref_kmers, downsampled_txt_fpath)``: ``ref_kmers`` maps the
    global k-mer index (position plus the number of k-mers in all previous
    sequences) to ``(chrom, position)``; the file contains the selected
    k-mers in FASTA format under the same global indices.
    """
    downsampled_txt_fpath = join(tmp_dirpath, 'kmc.downsampled.txt')
    # Start from an empty file: results are appended sequence by sequence.
    with open(downsampled_txt_fpath, 'w'):
        pass

    ref_kmers = dict()
    prev_kmer_idx = 0
    for chrom, seq in read_fasta(ref_fpath):
        kmc_fasta_fpath = join(tmp_dirpath, 'kmers_' + chrom + '.fasta')
        # A sequence shorter than kmer_len has no k-mers. Clamping keeps the
        # global offset from moving backwards (which could overwrite keys).
        num_kmers_in_seq = max(0, len(seq) - kmer_len + 1)
        with open(kmc_fasta_fpath, 'w') as out_f:
            for i in range(num_kmers_in_seq):
                out_f.write('>' + str(i) + '\n')
                out_f.write(seq[i: i + kmer_len] + '\n')

        filtered_fpath = join(tmp_dirpath, 'kmers_' + chrom + '.filtered.fasta')
        filter_contigs(kmc_fasta_fpath, filtered_fpath, kmc_db_fpath, log_fpath, err_fpath, min_kmers=1)
        filtered_kmers = set(idx for idx, _ in read_fasta(filtered_fpath))

        with open(downsampled_txt_fpath, 'a') as out_f:
            # None (not 0) marks "nothing selected yet": position 0 is a valid
            # selection and must still enforce the interval for the next k-mer.
            last_kmer_i = None
            for idx, kmer_seq in read_fasta(kmc_fasta_fpath):
                if idx not in filtered_kmers:
                    continue
                kmer_i = int(idx)
                if last_kmer_i is not None and kmer_i - last_kmer_i < KMERS_INTERVAL:
                    continue
                last_kmer_i = kmer_i
                out_f.write('>' + str(prev_kmer_idx + kmer_i) + '\n')
                out_f.write(kmer_seq + '\n')
                ref_kmers[prev_kmer_idx + kmer_i] = (chrom, kmer_i)

        prev_kmer_idx += num_kmers_in_seq
        if qconfig.space_efficient:
            os.remove(kmc_fasta_fpath)
    return ref_kmers, downsampled_txt_fpath
def fill_gaps_mate_pair(bam_fpath, ref_fpath, assembly_fpath, assembly_covered_regions, output_dir, uncovered_fpath, err_fpath):
    """Build the mate-pair polished assembly and return its path.

    For each reference sequence the regions returned by ``find_overlaps``
    (assembly-covered vs. read-covered regions) are emitted as contigs named
    ``<first word of the reference name>_<n>``. When the sequence has entries
    in the result of ``connect_with_matepairs``, consecutive regions for which
    ``is_overlapped`` is true are first combined with
    ``merge_fragments_with_ns``.

    The output path is ``assembly_fpath`` with ``mp_polished_suffix`` added.
    """
    matepair_reads_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    matepair_regions = connect_with_matepairs(bam_fpath, output_dir, err_fpath)
    final_assembly_fpath = add_suffix(assembly_fpath, mp_polished_suffix)

    final_fasta = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        covered_regions = list(find_overlaps(assembly_covered_regions[name],
                                             matepair_reads_covered_regions[name], overlap=50))
        # "<= 1" (rather than "== 1") also covers a sequence with no covered
        # regions at all, which used to crash on covered_regions.pop(0).
        if name not in matepair_regions or len(covered_regions) <= 1:
            contig_seqs = [seq[region[0]: region[1]] for region in covered_regions]
        else:
            contig_seqs = []
            sorted_mp_intervals = sorted(matepair_regions[name])
            frags_to_merge = [covered_regions[0]]
            for region in covered_regions[1:]:
                if is_overlapped(frags_to_merge[-1], region, sorted_mp_intervals):
                    frags_to_merge.append(region)
                else:
                    contig_seqs.append(merge_fragments_with_ns(seq, frags_to_merge))
                    frags_to_merge = [region]
            # The last group is never empty here, so it is always flushed.
            contig_seqs.append(merge_fragments_with_ns(seq, frags_to_merge))

        if contig_seqs:
            contig_prefix = name.split()[0] + "_"
            for contig_num, contig_seq in enumerate(contig_seqs, 1):
                final_fasta.append((contig_prefix + str(contig_num), contig_seq))

    fastaparser.write_fasta(final_assembly_fpath, final_fasta)
    return final_assembly_fpath
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Break the scaffolds of one assembly into contigs using ``split_by_ns``.

    Appends progress messages to ``logs``. Returns ``(broken_fpath, logs)``
    when the number of produced contigs exceeds the number of scaffolds
    (the broken FASTA is written in that case), otherwise ``(None, logs)``.
    """
    log_prefix = index_to_str(file_counter, force=(len(labels) > 1))
    logs.append(' ' + log_prefix + ' breaking scaffolds into contigs:')

    fname, fasta_ext = splitext_for_fasta_file(os.path.basename(contigs_fpath))
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    broken_scaffolds_fpath = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath)) + '_broken' + fasta_ext

    broken_entries = []
    num_contigs = 0
    # Stays at 1 for an empty input, mirroring "last enumerate index + 1".
    num_scaffolds = 1
    for scaffold_idx, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        num_scaffolds = scaffold_idx + 1
        num_contigs += split_by_ns(seq, name, broken_entries,
                                   qconfig.Ns_break_threshold, qconfig.min_contig)

    if num_contigs <= num_scaffolds:
        logs.append(" " + log_prefix +
                    " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
        return None, logs

    fastaparser.write_fasta(broken_scaffolds_fpath, broken_entries)
    logs.append(" " + log_prefix +
                " %d scaffolds (%s) were broken into %d contigs (%s)" %
                (num_scaffolds, label, num_contigs, label + '_broken'))
    return broken_scaffolds_fpath, logs
def correct_fasta(original_fpath, min_contig, corrected_fpath=None, is_reference=False):
    """Validate a FASTA file and optionally write its corrected copy.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is set. Names go through ``correct_name`` / ``get_uniq_name``; sequences go
    through ``correct_seq`` unless ``qconfig.no_check`` is set.

    Returns False when a record has an empty name, when ``correct_seq``
    returns a falsy value, or when no record is kept; True otherwise. The
    corrected records are written only if ``corrected_fpath`` is given.
    """
    name_usage = defaultdict(int)
    kept_entries = []
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.error('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent=' ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        fixed_name = correct_name(header)
        unique_name = get_uniq_name(fixed_name, name_usage)
        name_usage[fixed_name] += 1

        if qconfig.no_check:
            fixed_seq = sequence
        else:
            # correct_seq is expected to hand back the cleaned sequence,
            # or a falsy value when the sequence cannot be used.
            fixed_seq = correct_seq(sequence, original_fpath)
            if not fixed_seq:
                return False
        kept_entries.append((unique_name, fixed_seq))

    if not kept_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent=' ')
        return False
    if corrected_fpath:
        fastaparser.write_fasta(corrected_fpath, kept_entries)
    return True
def correct_fasta(original_fpath, min_contig, corrected_fpath=None, is_reference=False):
    """Validate a FASTA file and optionally write its corrected copy.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is set. Names go through ``correct_name`` / ``get_uniq_name``. Without
    ``qconfig.no_check`` sequences go through ``correct_seq``; with it they
    are kept verbatim but must consist of the characters A, C, G, T, N only.

    Returns False when a record has an empty name, when a sequence fails the
    check, or when no record is kept; True otherwise. The corrected records
    are written only if ``corrected_fpath`` is given.
    """
    name_usage = defaultdict(int)
    kept_entries = []
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.error('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent=' ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        fixed_name = correct_name(header)
        unique_name = get_uniq_name(fixed_name, name_usage)
        name_usage[fixed_name] += 1

        if qconfig.no_check:
            # No correction is applied in this mode, so anything outside the
            # expected alphabet is reported instead of being fixed.
            if re.search(r'[^ACGTN]', sequence):
                logger.error('File ' + original_fpath + ' contains non-ACGTN characters. '
                             'Please re-run QUAST without --no-check.', indent=' ', exit_with_code=1)
                return False
            fixed_seq = sequence
        else:
            fixed_seq = correct_seq(sequence, original_fpath)
            if not fixed_seq:
                return False
        kept_entries.append((unique_name, fixed_seq))

    if not kept_entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent=' ')
        return False
    if corrected_fpath:
        fastaparser.write_fasta(corrected_fpath, kept_entries)
    return True
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a corrected copy of a FASTA file and split oversized references.

    Sequences shorter than ``min_contig`` are dropped unless ``is_reference``
    is set; names go through ``correct_name`` / ``get_uniq_name`` and, unless
    ``qconfig.no_check`` is set, sequences go through ``correct_seq``.

    For a reference whose total length exceeds
    ``qconfig.MAX_REFERENCE_FILE_LENGTH`` the chromosomes are additionally
    distributed over ``split_ref/part_<n>`` files next to ``corrected_fpath``
    and the part paths are stored in ``qconfig.splitted_ref``.

    Returns False on an empty sequence name, a failed ``correct_seq`` or when
    every chromosome was too long to be placed into a part; True otherwise.
    """
    name_usage = defaultdict(int)
    entries = []
    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.warning('Skipping ' + original_fpath + ' because >sequence_name field is empty.', indent=' ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        fixed_name = correct_name(header)
        unique_name = get_uniq_name(fixed_name, name_usage)
        name_usage[fixed_name] += 1
        if qconfig.no_check:
            fixed_seq = sequence
        else:
            fixed_seq = correct_seq(sequence, original_fpath)
            if not fixed_seq:
                return False
        entries.append((unique_name, fixed_seq))

    fastaparser.write_fasta(corrected_fpath, entries)

    if not is_reference:
        return True
    ref_len = sum(len(chr_seq) for _, chr_seq in entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    fasta_ext = os.path.splitext(corrected_fpath)[1]
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    def part_fpath(part_num):
        # Path of the part file with the given 1-based number.
        return os.path.join(split_ref_dirpath, "part_%d" % part_num) + fasta_ext

    part_len_limit = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    part_len = 0
    part_num = 1
    cur_part_fpath = part_fpath(part_num)
    for chr_name, chr_seq in entries:
        chr_len = len(chr_seq)
        if chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        part_len += chr_len
        # Open a new part when the current one would overflow, unless this
        # chromosome is the only content of the part (part_len == chr_len).
        if part_len > part_len_limit and part_len != chr_len:
            qconfig.splitted_ref.append(cur_part_fpath)
            part_len = chr_len
            part_num += 1
            cur_part_fpath = part_fpath(part_num)
        fastaparser.write_fasta(cur_part_fpath, [(chr_name, chr_seq)], mode='a')

    if part_len > 0:
        qconfig.splitted_ref.append(cur_part_fpath)
    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Break scaffolds into contigs at long runs of N.

    A run of at least ``qconfig.Ns_break_threshold`` N's splits a scaffold;
    only pieces of at least ``qconfig.min_contig`` bases are kept and named
    ``<first word of the scaffold name>_<n>``.

    Appends progress messages to ``logs``. Returns ``(broken_fpath, logs)``
    if at least one breaking run was found (the broken FASTA is written),
    otherwise ``(None, logs)``.
    """
    log_prefix = index_to_str(file_counter, force=(len(labels) > 1))
    logs.append(' ' + log_prefix + ' breaking scaffolds into contigs:')

    fname, fasta_ext = splitext_for_fasta_file(os.path.basename(contigs_fpath))
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    broken_scaffolds_fpath = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath)) + '_broken' + fasta_ext

    broken_entries = []
    num_contigs = 0
    # Stays at 1 for an empty input, mirroring "last enumerate index + 1".
    num_scaffolds = 1
    found_break = False
    for scaffold_idx, (name, seq) in enumerate(fastaparser.read_fasta(contigs_fpath)):
        num_scaffolds = scaffold_idx + 1
        seq_len = len(seq)
        pieces_in_scaffold = 0
        piece_start = 0
        search_from = 0
        while search_from < seq_len:
            gap_start = seq.find('N', search_from)
            if gap_start == -1:
                break
            gap_end = gap_start + 1
            while gap_end != seq_len and seq[gap_end] == 'N':
                gap_end += 1
            # seq[gap_end] is known not to be 'N', so the search may skip it.
            search_from = gap_end + 1
            if gap_end - gap_start < qconfig.Ns_break_threshold:
                continue
            found_break = True
            if gap_start - piece_start >= qconfig.min_contig:
                broken_entries.append((name.split()[0] + "_" + str(pieces_in_scaffold + 1),
                                       seq[piece_start:gap_start]))
                pieces_in_scaffold += 1
            piece_start = gap_end

        # Trailing piece after the last breaking run (or the whole scaffold).
        if seq_len - piece_start >= qconfig.min_contig:
            broken_entries.append((name.split()[0] + "_" + str(pieces_in_scaffold + 1),
                                   seq[piece_start:]))
            pieces_in_scaffold += 1
        num_contigs += pieces_in_scaffold

    if not found_break:
        logs.append(" " + log_prefix +
                    " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
        return None, logs

    fastaparser.write_fasta(broken_scaffolds_fpath, broken_entries)
    logs.append(" " + log_prefix +
                " %d scaffolds (%s) were broken into %d contigs (%s)" %
                (num_scaffolds, label, num_contigs, label + '_broken'))
    return broken_scaffolds_fpath, logs
def save_circos_GC(ref_fpath, reference_length, gc_fpath):
    """Write a per-window GC track of the reference to ``gc_fpath``.

    The window size comes from ``set_window_size(reference_length)``. Every
    output line is ``name<TAB>window_start<TAB>window_end<TAB>GC`` where
    ``window_end`` is always ``window_start + window size`` (it is not
    clipped to the sequence length) and GC is whatever ``get_GC_percent``
    returns for the window.
    """
    step = set_window_size(reference_length)
    with open(gc_fpath, 'w') as track:
        for chrom_name, chrom_seq in fastaparser.read_fasta(ref_fpath):
            for win_start in range(0, len(chrom_seq), step):
                win_end = win_start + step
                gc_value = get_GC_percent(chrom_seq[win_start:win_end], step)
                fields = (chrom_name, str(win_start), str(win_end), str(gc_value))
                track.write('\t'.join(fields) + '\n')
def glimmerHMM(tool_dir, fasta_fpath, out_fpath, gene_lengths, err_path, tmp_dir, index):
    """Run GlimmerHMM on every contig of ``fasta_fpath`` and summarise the genes.

    Each contig is written to its own FASTA file in a temporary directory
    under ``tmp_dir`` and processed separately; tool output goes to
    ``err_path``. GFF files of the successful runs are merged into
    ``<out_fpath>_genes.gff`` (``.gz`` is appended unless ``qconfig.no_gzip``).

    Returns ``(out_gff_path, genes, num_unique, total, full_cnt, partial_cnt)``
    where ``full_cnt`` / ``partial_cnt`` hold, per threshold in
    ``gene_lengths``, the number of full / partial genes with
    ``end - start >= threshold``. If the tool did not succeed on any contig,
    every element of the tuple is None.
    """
    def run(contig_path, tmp_path):
        # Both stdout and stderr of the tool are appended to the error log.
        with open(err_path, 'a') as err_file:
            return qutils.call_subprocess(
                [tool_exec, contig_path, '-d', trained_dir, '-g', '-o', tmp_path],
                stdout=err_file,
                stderr=err_file,
                indent=' ' + qutils.index_to_str(index) + ' ')

    tool_exec = os.path.join(tool_dir, 'glimmerhmm')
    # Note: why arabidopsis? for no particular reason, really.
    trained_dir = os.path.join(tool_dir, 'trained', 'arabidopsis')

    contigs = {}
    gffs = []
    base_dir = tempfile.mkdtemp(dir=tmp_dir)
    for seq_num, (ind, seq) in enumerate(read_fasta(fasta_fpath)):
        seq_num = str(seq_num)
        ind = ind[:qutils.MAX_CONTIG_NAME_GLIMMER]
        contig_path = os.path.join(base_dir, seq_num + '.fasta')
        gff_path = os.path.join(base_dir, seq_num + '.gff')

        write_fasta(contig_path, [(ind, seq)])
        if run(contig_path, gff_path) == 0:
            gffs.append(gff_path)
            contigs[ind] = seq

    if not gffs:
        # Clean up like the normal path does, and keep the arity of the
        # result equal to the normal 6-tuple so callers can always unpack it.
        if not qconfig.debug:
            shutil.rmtree(base_dir)
        return None, None, None, None, None, None

    out_gff_fpath = out_fpath + '_genes.gff' + ('.gz' if not qconfig.no_gzip else '')
    out_gff_path = merge_gffs(gffs, out_gff_fpath)

    unique, total = set(), 0
    genes = []
    for contig, gene_id, start, end, strand in parse_gff(out_gff_path):
        total += 1
        gene_seq = contigs[contig][start - 1:end]
        if strand != '+':
            gene_seq = rev_comp(gene_seq)
        unique.add(gene_seq)
        gene = Gene(contig=contig, start=start, end=end, strand=strand, seq=gene_seq)
        # A gene touching either end of the contig is counted as partial.
        gene.is_full = gene.start > 1 and gene.end < len(contigs[contig])
        genes.append(gene)

    full_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if gene.is_full])
                for threshold in gene_lengths]
    partial_cnt = [sum([gene.end - gene.start >= threshold for gene in genes if not gene.is_full])
                   for threshold in gene_lengths]
    if OUTPUT_FASTA:
        out_fasta_fpath = out_fpath + '_genes.fasta'
        add_genes_to_fasta(genes, out_fasta_fpath)
    if not qconfig.debug:
        shutil.rmtree(base_dir)

    return out_gff_path, genes, len(unique), total, full_cnt, partial_cnt
def GC_content(contigs_fpath, skip=False):
    """Compute the total GC % of an assembly and two GC histograms.

    Returns ``(total_GC, (GC_distribution_x, GC_distribution_y),
    (GC_contigs_distribution_x, GC_contigs_distribution_y))``:

    * ``total_GC`` - GC percent over all non-N bases, or None when there are
      no such bases or ``skip`` is set;
    * the first pair - histogram over non-overlapping 100 bp windows with
      bins of ``qconfig.GC_bin_size`` percent;
    * the second pair - histogram over whole contigs with bins of
      ``qconfig.GC_contig_bin_size`` percent.

    With ``skip`` set the input is not read and both histograms are empty.
    """
    GC_contigs_bin_num = int(100 / qconfig.GC_contig_bin_size) + 1
    GC_contigs_distribution_x = [i * qconfig.GC_contig_bin_size for i in range(GC_contigs_bin_num)]  # GC %
    GC_contigs_distribution_y = [0] * GC_contigs_bin_num  # number of contigs with GC % = x

    GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
    GC_distribution_x = [i * qconfig.GC_bin_size for i in range(GC_bin_num)]  # GC %
    GC_distribution_y = [0] * GC_bin_num  # number of windows with GC % = x

    if skip:
        return None, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)

    total_GC_amount = 0
    total_contig_length = 0
    n = 100  # length of the non-overlapping windows
    for name, seq_full in fastaparser.read_fasta(contigs_fpath):
        contig_ACGT_len = len(seq_full) - seq_full.count("N")
        if not contig_ACGT_len:
            continue
        contig_GC_len = seq_full.count("G") + seq_full.count("C")
        contig_GC_percent = 100.0 * contig_GC_len / contig_ACGT_len
        GC_contigs_distribution_y[int(contig_GC_percent // qconfig.GC_contig_bin_size)] += 1

        for i in range(0, len(seq_full), n):
            GC_percent = get_GC_percent(seq_full[i:i + n], n)
            # NOTE(review): this also skips windows whose GC is exactly 0,
            # not only "no value" results - confirm whether that is intended.
            if not GC_percent:
                continue
            # Index by bin number, the same way as for contigs above. The
            # former int(int(p / size) * size) was the X coordinate of the
            # bin, which is only a valid index when the bin size equals 1.
            GC_distribution_y[int(GC_percent // qconfig.GC_bin_size)] += 1

        total_GC_amount += contig_GC_len
        total_contig_length += contig_ACGT_len

    if total_contig_length == 0:
        total_GC = None
    else:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)
def save_icarus_GC(ref_fpath, gc_fpath):
    """Write per-window GC values of the reference to ``gc_fpath``.

    For every reference sequence a header line ``#<name> <index>`` is written,
    followed by one ``<index> <GC>`` line per non-overlapping window. The
    index is the 0-based position of the sequence in the reference file; the
    window size is ``qconfig.GC_window_size_large`` for large genomes and
    ``qconfig.GC_window_size`` otherwise.
    """
    n = qconfig.GC_window_size_large if qconfig.large_genome else qconfig.GC_window_size
    with open(gc_fpath, 'w') as out_f:
        # enumerate() gives each sequence its own index; previously the
        # counter was never advanced and every sequence was written as 0.
        for chr_index, (name, seq_full) in enumerate(fastaparser.read_fasta(ref_fpath)):
            out_f.write('#' + name + ' ' + str(chr_index) + '\n')
            for i in range(0, len(seq_full), n):
                GC_percent = get_GC_percent(seq_full[i:i + n], n)
                out_f.write(str(chr_index) + ' ' + str(GC_percent) + '\n')
def remove_repeat_regions(ref_fpath, repeats_fpath, uncovered_fpath):
    # Returns a dict: sequence name -> list of [start, end] regions of the
    # reference that lie outside the repeats listed in repeats_fpath and
    # outside the intervals listed in uncovered_fpath (both read by parse_bed).
    #
    # NOTE(review): the block structure below was reconstructed from a
    # whitespace-mangled source; in particular the placement of the "else"
    # after the loop over uncovered intervals should be confirmed against
    # the upstream file.
    repeats_regions = parse_bed(repeats_fpath)
    uncovered_regions = parse_bed(uncovered_fpath)
    unique_regions = defaultdict(list)
    # Step 1: for every sequence, collect the stretches between repeats.
    for name, seq in fastaparser.read_fasta(ref_fpath):
        if name in repeats_regions:
            cur_contig_start = 0
            for start, end in repeats_regions[name]:
                if start > cur_contig_start:
                    unique_regions[name].append([cur_contig_start, start])
                else:
                    # The repeat starts at or before the current position:
                    # two zero-length placeholder regions are recorded.
                    unique_regions[name].append([cur_contig_start, cur_contig_start])
                    unique_regions[name].append([start, start])
                cur_contig_start = end + 1
            # Tail of the sequence after the last repeat.
            if cur_contig_start < len(seq):
                unique_regions[name].append([cur_contig_start, len(seq)])
        else:
            unique_regions[name].append([0, len(seq)])
    unique_covered_regions = defaultdict(list)
    # Step 2: walk the unique regions and the uncovered intervals together,
    # cutting the uncovered parts out of the unique regions.
    for name, regions in unique_regions.items():
        if name in uncovered_regions:
            cur_contig_idx = 0
            cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
            for uncov_start, uncov_end in uncovered_regions[name]:
                # Regions that end before this uncovered interval are kept as is.
                while cur_contig_end < uncov_start:
                    unique_covered_regions[name].append([cur_contig_start, cur_contig_end])
                    cur_contig_idx += 1
                    # NOTE(review): this break leaves only the while loop; the
                    # outer loop continues with the last region's coordinates.
                    if cur_contig_idx >= len(unique_regions[name]):
                        break
                    cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
                # The uncovered interval lies entirely before the current region.
                if uncov_end < cur_contig_start:
                    continue
                if uncov_start <= cur_contig_start and uncov_end >= cur_contig_end:
                    # The current region is fully uncovered: drop it.
                    cur_contig_idx += 1
                    if cur_contig_idx >= len(unique_regions[name]):
                        break
                    cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
                elif cur_contig_start <= uncov_start <= cur_contig_end or cur_contig_start <= uncov_end <= cur_contig_end:
                    # Partial overlap: keep the part before the uncovered interval ...
                    if uncov_start > cur_contig_start:
                        unique_covered_regions[name].append([cur_contig_start, uncov_start])
                    # ... and continue either with the remainder of this
                    # region or with the next region.
                    if uncov_end < cur_contig_end:
                        cur_contig_start = uncov_end
                    else:
                        cur_contig_idx += 1
                        if cur_contig_idx >= len(unique_regions[name]):
                            break
                        cur_contig_start, cur_contig_end = unique_regions[name][cur_contig_idx]
            else:
                # Reached only when the loop above was not left via "break",
                # i.e. the unique regions were not exhausted.
                unique_covered_regions[name].append([cur_contig_start, cur_contig_end])
                # NOTE(review): the slice starts at cur_contig_idx, so the
                # current region appears to be appended a second time (in its
                # original, untrimmed form) - confirm whether
                # cur_contig_idx + 1 was intended.
                for contig in unique_regions[name][cur_contig_idx:]:
                    unique_covered_regions[name].append(contig)
        else:
            # Nothing is uncovered on this sequence: keep all unique regions.
            unique_covered_regions[name] = unique_regions[name]
    return unique_covered_regions
def broke_scaffolds(file_counter, labels, contigs_fpath, corrected_dirpath, logs):
    """Break scaffolds into contigs at long runs of N.

    A run of at least ``qconfig.Ns_break_threshold`` N's splits a scaffold.
    The non-empty pieces are named ``<first word of the scaffold name>_<n>``.
    Empty pieces (a scaffold starting or ending with a breaking run) are not
    emitted.

    Appends progress messages to ``logs``. Returns ``(broken_fpath, logs)``
    if at least one breaking run was found and some sequence is left to
    write, otherwise ``(None, logs)``.
    """
    log_prefix = index_to_str(file_counter, force=(len(labels) > 1))
    logs.append(' ' + log_prefix + ' breaking scaffolds into contigs:')

    contigs_fname = os.path.basename(contigs_fpath)
    fname, fasta_ext = splitext_for_fasta_file(contigs_fname)
    label = labels[file_counter]
    corr_fpath = unique_corrected_fpath(os.path.join(corrected_dirpath, slugify(label) + fasta_ext))
    corr_fpath_wo_ext = os.path.join(corrected_dirpath, name_from_fpath(corr_fpath))
    broken_scaffolds_fpath = corr_fpath_wo_ext + '_broken' + fasta_ext

    broken_scaffolds_fasta = []
    contigs_counter = 0
    # Counted explicitly so that an empty input is reported as 0 scaffolds
    # instead of being mistaken for "1 scaffold broken into 0 contigs".
    num_scaffolds = 0
    is_broken = False
    for name, seq in fastaparser.read_fasta(contigs_fpath):
        num_scaffolds += 1
        contig_prefix = name.split()[0] + "_"
        seq_len = len(seq)

        fragments = []
        cur_contig_start = 0
        search_from = 0
        while search_from < seq_len:
            start = seq.find('N', search_from)
            if start == -1:
                break
            end = start + 1
            while end != seq_len and seq[end] == 'N':
                end += 1
            # seq[end] is known not to be 'N', so the search may skip it.
            search_from = end + 1
            if end - start >= qconfig.Ns_break_threshold:
                is_broken = True
                fragments.append(seq[cur_contig_start:start])
                cur_contig_start = end
        fragments.append(seq[cur_contig_start:])

        # Leading/trailing breaking runs produce empty pieces; an empty FASTA
        # record is useless downstream, so such pieces are dropped.
        fragments = [fragment for fragment in fragments if fragment]
        for contig_num, fragment in enumerate(fragments, 1):
            broken_scaffolds_fasta.append((contig_prefix + str(contig_num), fragment))
        contigs_counter += len(fragments)

    if is_broken and broken_scaffolds_fasta:
        fastaparser.write_fasta(broken_scaffolds_fpath, broken_scaffolds_fasta)
        logs.append(" " + log_prefix +
                    " %d scaffolds (%s) were broken into %d contigs (%s)" %
                    (num_scaffolds, label, contigs_counter, label + '_broken'))
        return broken_scaffolds_fpath, logs

    logs.append(" " + log_prefix +
                " WARNING: nothing was broken, skipping '%s broken' from further analysis" % label)
    return None, logs
def fill_gaps_single(ref_fpath, assembly_fpath, assembly_covered_regions, uncovered_fpath):
    """Build the single-reads polished assembly and return its path.

    For each reference sequence, every region yielded by ``find_overlaps``
    (assembly-covered vs. read-covered regions, ``overlap=50``) becomes a
    contig named ``<first word of the reference name>_<n>`` with ``n``
    starting at 1. The result is written to ``assembly_fpath`` with
    ``single_polished_suffix`` added.
    """
    read_covered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=True)
    polished_fpath = add_suffix(assembly_fpath, single_polished_suffix)

    polished_entries = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        overlaps = find_overlaps(assembly_covered_regions[name], read_covered_regions[name], overlap=50)
        for contig_num, (start, end) in enumerate(overlaps, 1):
            contig_name = name.split()[0] + "_" + str(contig_num)
            polished_entries.append((contig_name, seq[start: end]))

    fastaparser.write_fasta(polished_fpath, polished_entries)
    return polished_fpath
def preprocess_reference(ref_fpath, tmp_dir, uncovered_fpath):
    """Cut the uncovered intervals out of the reference and split the rest by N's.

    The stretches between the intervals read from ``uncovered_fpath`` are
    passed to ``split_by_ns`` one by one, threading its return value through
    as ``total_contigs``; sequences without uncovered intervals are passed
    whole. The collected records are written to ``tmp_dir`` under the
    reference's base name and that path is returned.
    """
    gaps_by_chrom = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=False)

    pieces = []
    for name, seq in fastaparser.read_fasta(ref_fpath):
        if name not in gaps_by_chrom:
            split_by_ns(seq, name, pieces)
            continue

        piece_start = 0
        contigs_so_far = 0
        for gap_start, gap_end in gaps_by_chrom[name]:
            contigs_so_far = split_by_ns(seq[piece_start: gap_start], name, pieces,
                                         total_contigs=contigs_so_far)
            piece_start = gap_end
        # Remainder of the sequence after the last uncovered interval.
        split_by_ns(seq[piece_start:], name, pieces, total_contigs=contigs_so_far)

    processed_ref_fpath = join(tmp_dir, basename(ref_fpath))
    fastaparser.write_fasta(processed_ref_fpath, pieces)
    return processed_ref_fpath
def remove_repeat_regions(ref_fpath, repeats_fpath, insert_size, tmp_dir, uncovered_fpath, err_fpath):
    """Return the reference regions left after removing the merged BED intervals.

    The intervals come from the file produced by ``merge_bed`` out of
    ``repeats_fpath`` and ``uncovered_fpath``. The result maps a sequence
    name to a list of ``[start, end]`` regions lying between those intervals;
    a sequence without any interval is returned as one whole region.
    """
    merged_fpath = merge_bed(repeats_fpath, uncovered_fpath, insert_size, tmp_dir, err_fpath)
    removed_by_chrom = parse_uncovered_fpath(merged_fpath, ref_fpath, return_covered_regions=False)

    unique_regions = defaultdict(list)
    for name, seq in fastaparser.read_fasta(ref_fpath):
        seq_len = len(seq)
        if name not in removed_by_chrom:
            unique_regions[name].append([0, seq_len])
            continue

        kept = []
        position = 0
        for removed_start, removed_end in removed_by_chrom[name]:
            if removed_start > position:
                kept.append([position, removed_start])
            position = removed_end + 1
        if position < seq_len:
            kept.append([position, seq_len])
        # Touch the result only when something is kept, so that a fully
        # removed sequence does not get an (empty) entry of its own.
        if kept:
            unique_regions[name].extend(kept)
    return unique_regions
def GC_content(contigs_fpath, skip=False):
    """Compute the total GC % of an assembly and two GC histograms.

    Returns ``(total_GC, (GC_distribution_x, GC_distribution_y),
    (GC_contigs_distribution_x, GC_contigs_distribution_y))``:

    * ``total_GC`` - GC percent over all non-N bases, or None when there are
      no such bases or ``skip`` is set;
    * the first pair - histogram over non-overlapping 100 bp windows with
      bins of ``qconfig.GC_bin_size`` percent;
    * the second pair - histogram over whole contigs with bins of
      ``qconfig.GC_contig_bin_size`` percent.

    With ``skip`` set the input is not read and both histograms are empty.
    """
    GC_contigs_bin_num = int(100 / qconfig.GC_contig_bin_size) + 1
    GC_contigs_distribution_x = [i * qconfig.GC_contig_bin_size for i in range(GC_contigs_bin_num)]  # GC %
    GC_contigs_distribution_y = [0] * GC_contigs_bin_num  # number of contigs with GC % = x

    GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
    GC_distribution_x = [i * qconfig.GC_bin_size for i in range(GC_bin_num)]  # GC %
    GC_distribution_y = [0] * GC_bin_num  # number of windows with GC % = x

    if skip:
        return None, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)

    total_GC_amount = 0
    total_contig_length = 0
    n = 100  # length of the non-overlapping windows
    for name, seq_full in fastaparser.read_fasta(contigs_fpath):
        contig_ACGT_len = len(seq_full) - seq_full.count("N")
        if not contig_ACGT_len:
            continue
        contig_GC_len = seq_full.count("G") + seq_full.count("C")
        contig_GC_percent = 100.0 * contig_GC_len / contig_ACGT_len
        GC_contigs_distribution_y[int(contig_GC_percent // qconfig.GC_contig_bin_size)] += 1

        for i in range(0, len(seq_full), n):
            GC_percent = get_GC_percent(seq_full[i:i + n], n)
            # NOTE(review): this also skips windows whose GC is exactly 0,
            # not only "no value" results - confirm whether that is intended.
            if not GC_percent:
                continue
            # Index by bin number, the same way as for contigs above. The
            # former int(int(p / size) * size) was the X coordinate of the
            # bin, which is only a valid index when the bin size equals 1.
            GC_distribution_y[int(GC_percent // qconfig.GC_bin_size)] += 1

        total_GC_amount += contig_GC_len
        total_contig_length += contig_ACGT_len

    if total_contig_length == 0:
        total_GC = None
    else:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y), (GC_contigs_distribution_x, GC_contigs_distribution_y)
def parse_uncovered_fpath(uncovered_fpath, fasta_fpath, return_covered_regions=True):
    """Read tab-separated ``chrom, start, end`` intervals of uncovered regions.

    With ``return_covered_regions=False`` the result maps each chromosome to
    the list of ``(start, end)`` intervals exactly as listed in the file.

    With ``return_covered_regions=True`` the result maps each sequence of
    ``fasta_fpath`` to the complementary ``(start, end)`` regions, i.e. the
    gaps between consecutive intervals plus the tail up to the sequence
    length; a sequence absent from the file is returned as one whole region.
    Intervals are assumed to be listed in increasing order per chromosome.

    A missing, empty or None ``uncovered_fpath`` is treated as "no uncovered
    intervals". The result is a ``defaultdict(list)``.
    """
    regions = defaultdict(list)
    prev_start = defaultdict(int)
    # The explicit truthiness test keeps exists() from being called with None.
    if uncovered_fpath and exists(uncovered_fpath):
        with open(uncovered_fpath) as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                # Only the first three columns are used; extra ones are ignored.
                chrom, start, end = line.rstrip('\n').split('\t')[:3]
                start, end = int(start), int(end)
                if return_covered_regions:
                    if prev_start[chrom] != start:
                        regions[chrom].append((prev_start[chrom], start))
                    prev_start[chrom] = end
                else:
                    regions[chrom].append((start, end))
    if return_covered_regions:
        for name, seq in fastaparser.read_fasta(fasta_fpath):
            # prev_start (not regions) tells whether the file mentioned this
            # sequence: a sequence whose uncovered intervals start at 0 has
            # no entry in regions yet but must not be reported as fully
            # covered.
            if name in prev_start:
                if prev_start[name] != len(seq):
                    regions[name].append((prev_start[name], len(seq)))
            else:
                regions[name].append((0, len(seq)))
    return regions
def parse_uncovered_fpath(uncovered_fpath, fasta_fpath, return_covered_regions=True):
    """Read tab-separated ``chrom, start, end`` intervals of uncovered regions.

    With ``return_covered_regions=False`` the result maps each chromosome to
    the list of ``(start, end)`` intervals exactly as listed in the file.

    With ``return_covered_regions=True`` the result maps each sequence of
    ``fasta_fpath`` to the complementary ``(start, end)`` regions, i.e. the
    gaps between consecutive intervals plus the tail up to the sequence
    length; a sequence absent from the file is returned as one whole region.
    Intervals are assumed to be listed in increasing order per chromosome.

    A missing, empty or None ``uncovered_fpath`` is treated as "no uncovered
    intervals". The result is a ``defaultdict(list)``.
    """
    regions = defaultdict(list)
    prev_start = defaultdict(int)
    if uncovered_fpath and exists(uncovered_fpath):
        with open(uncovered_fpath) as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines
                # Only the first three columns are used; extra ones are ignored.
                chrom, start, end = line.rstrip('\n').split('\t')[:3]
                start, end = int(start), int(end)
                if return_covered_regions:
                    if prev_start[chrom] != start:
                        regions[chrom].append((prev_start[chrom], start))
                    prev_start[chrom] = end
                else:
                    regions[chrom].append((start, end))
    if return_covered_regions:
        for name, seq in fastaparser.read_fasta(fasta_fpath):
            # prev_start (not regions) tells whether the file mentioned this
            # sequence: a sequence whose uncovered intervals start at 0 has
            # no entry in regions yet but must not be reported as fully
            # covered.
            if name in prev_start:
                if prev_start[name] != len(seq):
                    regions[name].append((prev_start[name], len(seq)))
            else:
                regions[name].append((0, len(seq)))
    return regions
def parse_contigs_fpath(contigs_fpath):
    """Return one ``Contig(name=..., size=...)`` per record of the FASTA file, in file order."""
    return [Contig(name=name, size=len(seq))
            for name, seq in fastaparser.read_fasta(contigs_fpath)]
REF_MARGINS = 300 REF_FNAME = "ref.fa" if len(sys.argv) != 4: print "Usage:", sys.argv[0], "reference pos1 pos2" sys.exit(0) pos1 = int(sys.argv[2]) pos2 = int(sys.argv[3]) if pos1 > pos2: pos = pos1 pos1 = pos2 pos2 = pos reference = fastaparser.read_fasta( sys.argv[1])[0][1] # Returns list of FASTA entries (in tuples: name, seq) if len(reference) < pos2: pos2 = len(reference) ref_file = open(REF_FNAME, 'w') ref_file.write(">reference\n") ref_file.write(reference[max(0, pos1 - 1 - REF_MARGINS):min(len(reference), pos2 + REF_MARGINS)] + "\n") ref_file.close() misassembled_site = reference[pos1 - 1:pos2] kmers = set() i = pos1 - 1 while i + KMER_SIZE <= pos2:
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Simulate the Optimal Assembly for a reference and return its path.

    An assembly already present in ``output_dirpath`` or next to
    ``original_ref_fpath`` (matched by file name, which includes the insert
    size) is reused. Otherwise the external tool Red is obtained, the unique
    covered regions of the reference are computed with
    ``get_unique_covered_regions`` and written out as contigs; when long
    reads or mate pairs are configured the regions are first scaffolded
    using the corresponding read alignments.

    Returns the path of the assembly, or None when it cannot be created
    (``linux_32`` platform, Red unavailable, or region detection failed).
    """
    logger.print_timestamp()
    logger.main_info("Simulating Optimal Assembly...")

    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    uncovered_fpath = None
    if qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam:
        sam_fpath, bam_fpath, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    insert_size = qconfig.optimal_assembly_insert_size
    if insert_size == 'auto' or not insert_size:
        insert_size = qconfig.optimal_assembly_default_IS

    # Name of the result inside the output directory.
    ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(ref_fpath))
    result_basename = '%s.%s.is%d.fasta' % (ref_basename, qconfig.optimal_assembly_basename, insert_size)
    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads
    if long_reads:
        result_basename = add_suffix(result_basename, long_reads_polished_suffix)
    elif qconfig.mate_pairs:
        result_basename = add_suffix(result_basename, mp_polished_suffix)
    result_fpath = os.path.join(output_dirpath, result_basename)

    # Name under which a previously prepared assembly may sit next to the
    # original reference.
    original_ref_basename, fasta_ext = splitext_for_fasta_file(os.path.basename(original_ref_fpath))
    prepared_optimal_assembly_basename = '%s.%s.is%d.fasta' % (
        original_ref_basename, qconfig.optimal_assembly_basename, insert_size)
    ref_prepared_optimal_assembly = os.path.join(os.path.dirname(original_ref_fpath),
                                                 prepared_optimal_assembly_basename)

    already_done_fpath = None
    if os.path.isfile(result_fpath):
        already_done_fpath = result_fpath
    elif os.path.isfile(ref_prepared_optimal_assembly):
        already_done_fpath = ref_prepared_optimal_assembly
    if already_done_fpath is not None:
        logger.notice(' Will reuse already generated Optimal Assembly with insert size %d (%s)' %
                      (insert_size, already_done_fpath))
        return already_done_fpath

    if qconfig.platform_name == 'linux_32':
        logger.warning(" Sorry, can't create Optimal Assembly on this platform, skipping...")
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red', platform_specific=True, is_executable=True)
    if not (binary_fpath and os.path.isfile(binary_fpath)):
        logger.warning(" Sorry, can't create Optimal Assembly, skipping...")
        return None

    log_fpath = os.path.join(output_dirpath, 'optimal_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    # Always start from a fresh temporary directory.
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath)
    if unique_covered_regions is None:
        logger.error(' Failed to create Optimal Assembly, see log for details: ' + log_fpath)
        return None

    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if long_reads or qconfig.mate_pairs:
        if not long_reads:
            join_reads = 'mp'
        elif qconfig.pacbio_reads:
            join_reads = 'pacbio'
        else:
            join_reads = 'nanopore'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath,
                              tmp_dir, log_fpath, join_reads)
        # Uncovered regions and the read length are only used for mate pairs.
        if join_reads == 'mp':
            uncovered_regions = parse_uncovered_fpath(uncovered_fpath, ref_fpath, return_covered_regions=False)
            mp_len = calculate_read_len(sam_fpath)
        else:
            uncovered_regions = defaultdict(list)
            mp_len = None

        for chrom, seq in reference:
            region_pairing = get_regions_pairing(unique_covered_regions[chrom], joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(unique_covered_regions[chrom], region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])
    else:
        for chrom, seq in reference:
            for idx, (region_start, region_end) in enumerate(
                    (region[0], region[1]) for region in unique_covered_regions[chrom]):
                if region_end - region_start < MIN_CONTIG_LEN:
                    continue
                result_fasta.append((chrom + '_' + str(idx), seq[region_start:region_end]))

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' Theoretically optimal Assembly saved to ' + result_fpath)
    logger.notice('You can copy it to ' + ref_prepared_optimal_assembly +
                  ' and QUAST will reuse it in further runs against the same reference (' +
                  original_ref_fpath + ')')

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, aligns, ref_features, ref_lens, cyclic=None):
    """Classify every contig of an assembly by how it aligns to the reference.

    Each contig read from ``contigs_fpath`` ends up with one of the types
    'unaligned', 'correct' or 'misassembled'.  Along the way the function
    updates a set of counters (unaligned / partially unaligned / ambiguous
    contigs and bases, misassemblies, ...) and writes:

    * a human-readable per-contig log to ``ca_output.stdout_f``;
    * Icarus report lines to ``ca_output.icarus_out_f``;
    * the alignments that were kept to ``ca_output.coords_filtered_f``
      (this handle is closed before returning);
    * the names of contigs without alignments to the file ``unaligned_fpath``.

    NOTE: the body uses Python 2 only constructs (``print >> f``, ``xrange``,
    ``dict.itervalues``).

    Args:
        ca_output: object exposing the ``stdout_f``, ``icarus_out_f`` and
            ``coords_filtered_f`` file handles; it is also passed on to
            ``process_misassembled_contig``.
        contigs_fpath: FASTA file with the contigs, read with
            ``fastaparser.read_fasta``.
        unaligned_fpath: path of the file that receives unaligned contig names.
        aligns: mapping from contig name to the list of its alignments.
            Alignments are indexed positionally here; judging by the local
            variable names, index 5 is the aligned length, 6 the identity and
            7 the reference name -- TODO confirm against the record layout.
        ref_features: passed through to ``process_misassembled_contig``.
        ref_lens: passed through to ``get_best_aligns_sets`` and
            ``process_misassembled_contig``.
        cyclic: passed through to the same two helpers.

    Returns:
        A tuple ``(result, ref_aligns, total_indels_info, aligned_lengths,
        misassembled_contigs)`` where ``result`` is a dict with all counters,
        ``ref_aligns`` maps a reference key to the alignments kept for it,
        ``aligned_lengths`` is a flat list of kept alignment lengths and
        ``misassembled_contigs`` maps contig name to contig length.
    """
    # A top alignment is considered to cover the whole contig if it is longer
    # than epsilon * contig length or leaves fewer than maxun bases uncovered.
    maxun = 10
    epsilon = 0.99
    umt = 0.5  # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Misassembled Threshold)

    # Counters reported in the `result` dict at the end.
    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    partially_unaligned_with_misassembly = 0
    partially_unaligned_with_significant_parts = 0
    misassembly_internal_overlap = 0
    contigs_with_istranslocations = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()

    region_struct_variations = find_all_sv(qconfig.bed)

    # Square table reference label x reference label, initialised with zeros;
    # it is handed to (and returned back by) process_misassembled_contig.
    references_misassemblies = {}
    for ref in ref_labels_by_chromosomes.values():
        references_misassemblies[ref] = dict(
            (key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    # NOTE(review): plain open(); the handle is closed only on the normal exit
    # path at the bottom of the function.
    unaligned_file = open(unaligned_fpath, 'w')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        #Recording contig stats
        ctg_len = len(seq)
        print >> ca_output.stdout_f, 'CONTIG: %s (%dbp)' % (contig, ctg_len)
        contig_type = 'unaligned'

        #Check if this contig aligned to the reference
        if contig in aligns:
            # NOTE(review): ns_pos is computed here but never read anywhere in
            # this function.
            for align in aligns[contig]:
                #sub_seq = seq[align.start(): align.end()]
                sub_seq = seq[_start(align):_end(align)]
                if 'N' in sub_seq:
                    ns_pos = [
                        pos for pos in xrange(_start(align), _end(align))
                        if seq[pos] == 'N'
                    ]
                    # ns_pos = [pos for pos in xrange(align.start(), align.end()) if seq[pos] == 'N']
            contig_type = 'correct'
            #Pull all aligns for this contig
            num_aligns = len(aligns[contig])

            #Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(aligns[contig],
                                   key=lambda x: (score_single_align(x), x[5]),
                                   reverse=True)
            top_len = sorted_aligns[0][5]
            top_id = sorted_aligns[0][6]
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            print >> ca_output.stdout_f, 'Top Length: %d Top ID: %.2f (Score: %.1f)' % (
                top_len, top_id, top_score)

            #Check that top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                #Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                #Continue grabbing alignments while length and identity are identical
                #while sorted_aligns and top_len == sorted_aligns[0][5] and top_id == sorted_aligns[0][6]:
                while sorted_aligns and (score_single_align(
                        sorted_aligns[0]) >= qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                #Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    print >> ca_output.stdout_f, '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):' % str(
                        qconfig.ambiguity_score)
                    for align in sorted_aligns:
                        print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align

                if len(top_aligns) == 1:
                    #There is only one top align, life is good
                    print >> ca_output.stdout_f, '\t\tOne align captures most of this contig: %s' % str(
                        top_aligns[0])
                    # print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        top_aligns[0])
                    ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0])
                    print >> ca_output.coords_filtered_f, str(top_aligns[0])
                    aligned_lengths.append(top_aligns[0][5])
                else:
                    #There is more than one top align
                    print >> ca_output.stdout_f, '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]' % len(
                        top_aligns)

                    #Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):'
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):'
                        print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                            top_aligns[0])
                        # print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str()
                        print >> ca_output.icarus_out_f, icarus_report_str(
                            top_aligns[0])
                        ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0][5])
                        print >> ca_output.coords_filtered_f, str(
                            top_aligns[0])
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', align
                    elif qconfig.ambiguity_usage == "all":
                        # Pre-subtract the first alignment: the loop below adds
                        # every alignment, so only the extra ones remain counted.
                        ambiguous_contigs_extra_bases -= top_aligns[0][5]
                        print >> ca_output.stdout_f, '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):'
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        while len(top_aligns):
                            print >> ca_output.stdout_f, '\t\t\tAlignment: %s' % str(
                                top_aligns[0])
                            # print >> ca_output.icarus_out_f, top_aligns[0].icarus_report_str(ambiguity=True)
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                top_aligns[0], ambiguity=True)
                            ref_aligns.setdefault(top_aligns[0][7], []).append(top_aligns[0])
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0][5])
                            ambiguous_contigs_extra_bases += top_aligns[0][5]
                            print >> ca_output.coords_filtered_f, str(
                                top_aligns[0]), "ambiguous"
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens,
                    cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                used_indexes = range(
                    len(sorted_aligns)
                ) if too_much_best_sets else get_used_indexes(best_sets)
                if len(used_indexes) < len(sorted_aligns):
                    print >> ca_output.stdout_f, '\t\t\tSkipping redundant alignments after choosing the best set of alignments'
                    # NOTE(review): the set difference needs used_indexes to be
                    # a set here; this branch is only reachable with the
                    # get_used_indexes() result, whose type is not visible in
                    # this chunk -- verify.
                    for idx in set(range(len(sorted_aligns))) - used_indexes:
                        print >> ca_output.stdout_f, '\t\tSkipping redundant alignment', sorted_aligns[
                            idx]

                if is_ambiguous:
                    print >> ca_output.stdout_f, '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]'
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len

                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (
                            ctg_len - the_best_set.uncovered)
                        print >> ca_output.stdout_f, '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):'
                        for idx in used_indexes:
                            print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                idx]
                        # Jumps to the next contig: the per-contig CONTIG line
                        # at the bottom of the loop is not written in this case.
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        ambiguous_contigs_extra_bases += 0
                        print >> ca_output.stdout_f, '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").'
                        if len(the_best_set.indexes) < len(used_indexes):
                            print >> ca_output.stdout_f, '\t\tSo, skipping alignments from other sets:'
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.stdout_f, '\t\t\tSkipping alignment ', sorted_aligns[
                                        idx]
                    elif qconfig.ambiguity_usage == "all":
                        print >> ca_output.stdout_f, '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):'
                        print >> ca_output.stdout_f, '\t\t\tThe very best set is shown in details below, the rest are:'
                        for idx, cur_set in enumerate(best_sets[1:]):
                            print >> ca_output.stdout_f, '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                                (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered)
                        if too_much_best_sets:
                            print >> ca_output.stdout_f, '\t\t\t\tetc...'
                        if len(the_best_set.indexes) < len(used_indexes):
                            # Subtract the bases covered by the best set once,
                            # then add the length of every used alignment.
                            ambiguous_contigs_extra_bases -= (
                                ctg_len - the_best_set.uncovered)
                            print >> ca_output.stdout_f, '\t\t\tList of alignments used in the sets above:'
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                    align)
                                ref_aligns.setdefault(align[7], []).append(align)
                                ambiguous_contigs_extra_bases += align[5]
                                print >> ca_output.coords_filtered_f, str(
                                    align), "ambiguous"
                                if idx not in the_best_set.indexes:
                                    print >> ca_output.icarus_out_f, icarus_report_str(
                                        align, is_best=False)
                                    # print >> ca_output.icarus_out_f, align.icarus_report_str(is_best=False)

                print >> ca_output.stdout_f, '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d' % \
                    (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered)
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    #There is only one alignment of this contig to the reference
                    print >> ca_output.coords_filtered_f, str(the_only_align)
                    aligned_lengths.append(the_only_align[5])

                    # begin, end = the_only_align.start(), the_only_align.end()
                    begin, end = _start(the_only_align), _end(the_only_align)
                    unaligned_bases = 0
                    # (begin - 1) and (ctg_len - end) are the unaligned prefix
                    # and suffix lengths, as the messages below print them.
                    if (begin - 1) or (ctg_len - end):
                        partially_unaligned += 1
                        unaligned_bases = (begin - 1) + (ctg_len - end)
                        partially_unaligned_bases += unaligned_bases
                        print >> ca_output.stdout_f, '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)' % (
                            top_len, ctg_len)
                    print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                        the_only_align)
                    # print >> ca_output.icarus_out_f, the_only_align.icarus_report_str()
                    print >> ca_output.icarus_out_f, icarus_report_str(
                        the_only_align)
                    if begin - 1:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: 1 to %d (%d)' % (
                            begin - 1, begin - 1)
                    if ctg_len - end:
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d to %d (%d)' % (
                            end + 1, ctg_len, ctg_len - end)
                    # check if both parts (aligned and unaligned) have significant length
                    if (unaligned_bases >= qconfig.significant_part_size) and (
                            ctg_len - unaligned_bases >= qconfig.significant_part_size):
                        print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        partially_unaligned_with_significant_parts += 1
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, real_aligns, ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align[7], []).append(the_only_align)
                else:
                    #Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns,
                                           key=lambda x: (_end(x), _start(x)))
                    # sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                    #There is more than one alignment of this contig to the reference
                    print >> ca_output.stdout_f, '\t\tThis contig is misassembled. %d total aligns.' % num_aligns
                    aligned_bases_in_contig = ctg_len - the_best_set.uncovered

                    # Less than umt of the contig is aligned: count it as
                    # partially unaligned and skip the misassembly analysis.
                    if aligned_bases_in_contig < umt * ctg_len:
                        print >> ca_output.stdout_f, '\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                            'Contig length is %d and total length of all aligns is %d' % (ctg_len, aligned_bases_in_contig)
                        for align in sorted_aligns:
                            print >> ca_output.stdout_f, '\t\tAlignment: %s' % str(
                                align)
                            # print >> ca_output.icarus_out_f, align.icarus_report_str()
                            print >> ca_output.icarus_out_f, icarus_report_str(
                                align)
                            print >> ca_output.coords_filtered_f, str(align)
                            aligned_lengths.append(align[5])
                            ref_aligns.setdefault(align[7], []).append(align)

                        partially_unaligned_with_misassembly += 1
                        partially_unaligned += 1
                        partially_unaligned_bases += ctg_len - aligned_bases_in_contig
                        print >> ca_output.stdout_f, '\t\tUnaligned bases: %d' % (
                            ctg_len - aligned_bases_in_contig)
                        # check if both parts (aligned and unaligned) have significant length
                        if (aligned_bases_in_contig >= qconfig.significant_part_size) and (
                                ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size):
                            print >> ca_output.stdout_f, '\t\tThis contig has both significant aligned and unaligned parts ' \
                                                         '(of length >= %d)!' % (qconfig.significant_part_size)
                            partially_unaligned_with_significant_parts += 1
                            if qconfig.meta:
                                contigs_with_istranslocations += check_for_potential_translocation(
                                    seq, ctg_len, sorted_aligns, ca_output.stdout_f)
                        contig_type = 'misassembled'
                        # The CONTIG line is written here because `continue`
                        # skips the common write at the end of the loop body.
                        print >> ca_output.icarus_out_f, '\t'.join(
                            ['CONTIG', contig, str(ctg_len), contig_type])
                        print >> ca_output.stdout_f
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, references_misassemblies, indels_info, misassemblies_matched_sv = \
                        process_misassembled_contig(sorted_aligns, cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, references_misassemblies,
                                                    region_struct_variations, misassemblies_matched_sv, ca_output, is_ambiguous)
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                    if ctg_len - aligned_bases_in_contig >= qconfig.significant_part_size:
                        print >> ca_output.stdout_f, '\t\tThis contig has significant unaligned parts ' \
                                                     '(of length >= %d)!' % (qconfig.significant_part_size)
                        if qconfig.meta:
                            contigs_with_istranslocations += check_for_potential_translocation(
                                seq, ctg_len, sorted_aligns, ca_output.stdout_f)
        else:
            #No aligns to this contig
            print >> ca_output.stdout_f, '\t\tThis contig is unaligned. (%d bp)' % ctg_len
            print >> unaligned_file, contig

            #Increment unaligned contig count and bases
            unaligned += 1
            fully_unaligned_bases += ctg_len
            print >> ca_output.stdout_f, '\t\tUnaligned bases: %d total: %d' % (
                ctg_len, fully_unaligned_bases)

        # Common per-contig trailer: final contig type for Icarus and a blank
        # separator line in the log.
        print >> ca_output.icarus_out_f, '\t'.join(
            ['CONTIG', contig, str(ctg_len), contig_type])
        print >> ca_output.stdout_f

    ca_output.coords_filtered_f.close()
    unaligned_file.close()
    # Total length of all contigs flagged as misassembled.
    misassembled_bases = sum(misassembled_contigs.itervalues())

    result = {
        'region_misassemblies': region_misassemblies,
        'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None,
        'misassemblies_matched_sv': misassemblies_matched_sv,
        'misassembled_contigs': misassembled_contigs,
        'misassembled_bases': misassembled_bases,
        'misassembly_internal_overlap': misassembly_internal_overlap,
        'unaligned': unaligned,
        'partially_unaligned': partially_unaligned,
        'partially_unaligned_bases': partially_unaligned_bases,
        'fully_unaligned_bases': fully_unaligned_bases,
        'ambiguous_contigs': ambiguous_contigs,
        'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len': ambiguous_contigs_len,
        'partially_unaligned_with_misassembly': partially_unaligned_with_misassembly,
        'partially_unaligned_with_significant_parts': partially_unaligned_with_significant_parts,
        'contigs_with_istranslocations': contigs_with_istranslocations,
        'istranslocations_by_refs': references_misassemblies
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs
def _add_kmer_scaffold_fields(report, len_map_to_one_chrom, len_map_to_multi_chrom,
                              len_map_to_none_chrom, total_len):
    """Add the three k-mer-based scaffold length fractions (in percent) to *report*.

    *total_len* must be non-zero; the caller is responsible for checking it.
    """
    for field, length in (
            (reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM, len_map_to_one_chrom),
            (reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM, len_map_to_multi_chrom),
            (reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM, len_map_to_none_chrom)):
        report.add_field(field, '%.2f' % (length * 100.0 / total_len))


def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based analysis (KMC) for every assembly.

    For each assembly the k-mer completeness against the reference is stored
    in its report; unless the assembly or the reference is too fragmented, the
    share of assembly length that maps to one / several / no reference
    sequences is stored as well.  Results found in ``<label>.stat`` files from
    a previous run (as validated by ``check_kmc_successful_check``) are reused
    instead of being recomputed.

    Args:
        output_dir: directory for the KMC logs, the per-assembly stats files
            and the temporary ``tmp`` sub-directory.
        ref_fpath: path to the reference FASTA file.
        contigs_fpaths: paths to the assembly FASTA files.
        logger: logger used for progress messages and warnings.

    Returns:
        None.  All results are delivered through ``reporting`` and the stats
        files written by ``create_kmc_stats_file``.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    def stat_value(line):
        # Every stats line is read as '<label>: <value>'.
        return line.strip().split(': ')[-1]

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if not check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            continue
        kmc_stats_fpath = join(output_dir, label + '.stat')
        with open(kmc_stats_fpath) as stats_f:
            stats_content = stats_f.read().split('\n')
        # str.split() never returns an empty list, so an empty stats file shows
        # up as a single empty line: treat it as "no cached result" and let the
        # assembly be recomputed below.
        if not stats_content[0].strip():
            continue
        logger.info(' Using existing results for ' + label + '... ')
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % float(stat_value(stats_content[0])))
        if len(stats_content) >= 5:
            len_map_to_one_chrom = int(stat_value(stats_content[1]))
            len_map_to_multi_chrom = int(stat_value(stats_content[2]))
            len_map_to_none_chrom = int(stat_value(stats_content[3]))
            total_len = int(stat_value(stats_content[4]))
            # A zero total length would make the percentages undefined.
            if total_len:
                _add_kmer_scaffold_fields(report, len_map_to_one_chrom, len_map_to_multi_chrom,
                                          len_map_to_none_chrom, total_len)
        checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if not contigs_fpaths:
        logger.info('Done.')
        return None

    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath):
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Truncate the logs of a previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return None

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath],
                                              log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # K-mers shared by the reference and all assemblies serve as markers.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    # A different kmer_fraction is used for references of 500 MiB and more.
    kmer_fraction = 100 if getsize(ref_fpath) < 500 * 1024 ** 2 else 1000
    shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, shared_kmc_db, log_fpath, err_fpath,
                                                 kmer_fraction=kmer_fraction)

    shared_kmers_fpath = join(tmp_dirpath, 'shared_kmers.txt')
    ref_contigs = dict(read_fasta(ref_fpath))
    with open(shared_kmers_fpath, 'w') as out_f:
        for seq in ref_contigs.values():
            seq_kmers = get_string_kmers(tmp_dirpath, log_fpath, err_fpath, seq=seq,
                                         intersect_with=shared_downsampled_kmc_db)
            for kmer_i, kmer in enumerate(seq_kmers):
                out_f.write('>' + str(kmer_i) + '\n')
                out_f.write(kmer + '\n')

    shared_kmc_db = count_kmers(tmp_dirpath, shared_kmers_fpath, log_fpath, err_fpath)
    # One marker database per reference sequence.
    ref_kmc_dbs = []
    for ref_name, ref_seq in ref_contigs.items():
        ref_contig_fpath = join(tmp_dirpath, ref_name + '.fa')
        if not is_non_empty_file(ref_contig_fpath):
            with open(ref_contig_fpath, 'w') as out_f:
                out_f.write(ref_seq)
        ref_kmc_db = count_kmers(tmp_dirpath, ref_contig_fpath, log_fpath, err_fpath)
        ref_shared_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, shared_kmc_db],
                                            log_fpath, err_fpath)
        ref_kmc_dbs.append((ref_name, ref_shared_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        # These stay None whenever the scaffolding accuracy is not assessed.
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None

        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)
            if len(seq) >= MIN_CONTIGS_LEN:
                long_contigs.append(len(seq))

        if not total_len:
            # Without this guard an empty assembly would reach the percentage
            # computation below and divide by zero.
            logger.warning('Assembly is empty. Scaffolding accuracy will not be assessed.')
        elif len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_kmc_dbs) > MAX_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            for name, seq in read_fasta(contigs_fpath):
                if len(seq) < MIN_CONTIGS_LEN:
                    continue

                tmp_contig_fpath = join(tmp_dirpath, name + '.fa')
                with open(tmp_contig_fpath, 'w') as out_tmp_f:
                    out_tmp_f.write(seq)
                contig_kmc_db = count_kmers(tmp_dirpath, tmp_contig_fpath, log_fpath, err_fpath)
                intersect_all_ref_kmc_db = intersect_kmers(tmp_dirpath, [contig_kmc_db, shared_kmc_db],
                                                           log_fpath, err_fpath)
                kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_all_ref_kmc_db, log_fpath, err_fpath)
                # Contigs with too few markers are not assigned to any reference.
                if kmers_cnt < MIN_MARKERS:
                    continue
                for ref_name, ref_kmc_db in ref_kmc_dbs:
                    intersect_kmc_db = intersect_kmers(tmp_dirpath, [ref_kmc_db, intersect_all_ref_kmc_db],
                                                       log_fpath, err_fpath)
                    kmers_cnt = get_kmers_cnt(tmp_dirpath, intersect_kmc_db, log_fpath, err_fpath)
                    if kmers_cnt:
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            _add_kmer_scaffold_fields(report, len_map_to_one_chrom, len_map_to_multi_chrom,
                                      len_map_to_none_chrom, total_len)

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom,
                              total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def do(ref_fpath, original_ref_fpath, output_dirpath):
    """Create the theoretical Upper Bound Assembly of a reference, or reuse one.

    The assembly is built from the unique covered regions of the reference
    (as returned by ``get_unique_covered_regions``); when long reads or mate
    pairs are configured the regions are additionally joined into scaffolds.
    An assembly prepared earlier, either in ``output_dirpath`` or next to the
    original reference, is returned as is when
    ``check_prepared_optimal_assembly`` finds it.

    Args:
        ref_fpath: reference FASTA used for the computations.
        original_ref_fpath: reference path as given by the user; a reusable
            copy of the result is looked up next to this file.
        output_dirpath: directory for the result, the log and the temporary
            ``tmp`` sub-directory.

    Returns:
        Path to the Upper Bound Assembly FASTA, or None when the required
        tools are unavailable or the computation failed.
    """
    logger.print_timestamp()
    logger.main_info("Generating Upper Bound Assembly...")

    # --- prerequisites -----------------------------------------------------
    tools_compiled = reads_analyzer.compile_reads_analyzer_tools(logger)
    if not tools_compiled:
        logger.warning(
            " Sorry, can't create Upper Bound Assembly "
            "(failed to compile necessary third-party read processing tools [bwa, bedtools, minimap2]), skipping...")
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(
            " Sorry, can't create Upper Bound Assembly on this platform "
            "(only linux64 and macOS are supported), skipping...")
        return None

    red_dirpath = get_dir_for_download('red', 'Red', ['Red'], logger)
    binary_fpath = download_external_tool('Red', red_dirpath, 'red',
                                          platform_specific=True, is_executable=True)
    if not (binary_fpath and os.path.isfile(binary_fpath)):
        logger.warning(
            " Sorry, can't create Upper Bound Assembly "
            "(failed to install/download third-party repeat finding tool [Red]), skipping...")
        return None

    # --- result file names ---------------------------------------------------
    insert_size = qconfig.optimal_assembly_insert_size
    if not insert_size or insert_size == 'auto':
        insert_size = qconfig.optimal_assembly_default_IS

    long_reads = qconfig.pacbio_reads or qconfig.nanopore_reads

    def assembly_basename(fasta_fpath):
        # '<reference name>.<basename>.is<insert size>.fasta', plus a suffix
        # telling which kind of reads was used for joining the regions.
        stem, _ = splitext_for_fasta_file(os.path.basename(fasta_fpath))
        basename = '%s.%s.is%d.fasta' % (stem, qconfig.optimal_assembly_basename, insert_size)
        if long_reads:
            return add_suffix(basename, long_reads_polished_suffix)
        if qconfig.mate_pairs:
            return add_suffix(basename, mp_polished_suffix)
        return basename

    result_fpath = os.path.join(output_dirpath, assembly_basename(ref_fpath))
    prepared_optimal_assembly_basename = assembly_basename(original_ref_fpath)
    original_ref_dirpath = os.path.dirname(original_ref_fpath)
    ref_prepared_optimal_assembly = os.path.join(original_ref_dirpath,
                                                 prepared_optimal_assembly_basename)

    already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath,
                                                         ref_prepared_optimal_assembly)
    if already_done_fpath:
        return already_done_fpath

    # --- reads alignment -------------------------------------------------------
    uncovered_fpath = None
    reads_analyzer_dir = join(dirname(output_dirpath), qconfig.reads_stats_dirname)
    have_reads = qconfig.reads_fpaths or qconfig.reference_sam or qconfig.reference_bam
    if have_reads:
        _, _, uncovered_fpath = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads='all', calculate_coverage=True)

    # The configured insert size may differ from the one used for the file
    # names above; if so, rename the outputs and look for a prepared assembly
    # once more.
    configured_insert_size = qconfig.optimal_assembly_insert_size
    if configured_insert_size != 'auto' and configured_insert_size != insert_size:
        old_tag = 'is' + str(insert_size)
        new_tag = 'is' + str(configured_insert_size)
        result_fpath = result_fpath.replace(old_tag, new_tag)
        prepared_optimal_assembly_basename = prepared_optimal_assembly_basename.replace(old_tag, new_tag)
        insert_size = configured_insert_size
        ref_prepared_optimal_assembly = os.path.join(original_ref_dirpath,
                                                     prepared_optimal_assembly_basename)
        already_done_fpath = check_prepared_optimal_assembly(insert_size, result_fpath,
                                                             ref_prepared_optimal_assembly)
        if already_done_fpath:
            return already_done_fpath

    # --- unique covered regions --------------------------------------------------
    log_fpath = os.path.join(output_dirpath, 'upper_bound_assembly.log')
    tmp_dir = os.path.join(output_dirpath, 'tmp')
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)  # always start from an empty temporary directory
    os.makedirs(tmp_dir)

    unique_covered_regions, repeats_regions = get_unique_covered_regions(
        ref_fpath, tmp_dir, log_fpath, binary_fpath, insert_size, uncovered_fpath,
        use_long_reads=long_reads)
    if unique_covered_regions is None:
        logger.error(' Failed to create Upper Bound Assembly, see log for details: ' + log_fpath)
        return None

    # --- building the sequences --------------------------------------------------
    reference = list(fastaparser.read_fasta(ref_fpath))
    result_fasta = []

    if not (long_reads or qconfig.mate_pairs):
        # No joining reads: every long enough unique region is a contig.
        for chrom, seq in reference:
            result_fasta.extend(
                (chrom + '_' + str(idx), seq[region[0]:region[1]])
                for idx, region in enumerate(unique_covered_regions[chrom])
                if region[1] - region[0] >= MIN_CONTIG_LEN)
    else:
        if not long_reads:
            join_reads = 'mp'
        elif qconfig.pacbio_reads:
            join_reads = 'pacbio'
        else:
            join_reads = 'nanopore'
        sam_fpath, bam_fpath, _ = reads_analyzer.align_reference(
            ref_fpath, reads_analyzer_dir, using_reads=join_reads)
        joiners = get_joiners(qutils.name_from_fpath(ref_fpath), sam_fpath, bam_fpath,
                              tmp_dir, log_fpath, join_reads)
        if join_reads == 'mp':
            uncovered_regions = parse_bed(uncovered_fpath)
            mp_len = calculate_read_len(sam_fpath)
        else:
            uncovered_regions = defaultdict(list)
            mp_len = None

        for chrom, seq in reference:
            chrom_regions = unique_covered_regions[chrom]
            region_pairing = get_regions_pairing(chrom_regions, joiners[chrom], mp_len)
            ref_coords_to_output = scaffolding(chrom_regions, region_pairing)
            get_fasta_entries_from_coords(result_fasta, (chrom, seq), ref_coords_to_output,
                                          repeats_regions[chrom], uncovered_regions[chrom])

    fastaparser.write_fasta(result_fpath, result_fasta)
    logger.info(' ' + 'Theoretical Upper Bound Assembly is saved to ' + result_fpath)

    notice_parts = [
        '(on reusing *this* Upper Bound Assembly in the *future* evaluations on *the same* dataset)\n',
        '\tThe next time, you can simply provide this file as an additional assembly (you could also rename it to UpperBound.fasta for the clarity). ',
        'In this case, you do not need to specify --upper-bound-assembly and provide files with reads (--pe1/pe2, etc).\n',
        '\t\tOR\n',
        '\tYou can copy ', result_fpath, ' to ', ref_prepared_optimal_assembly, '. ',
        'The next time you evaluate assemblies with --upper-bound-assembly option and against the same reference (',
        original_ref_fpath, ') and ',
        'the same reads (or if you specify the insert size of the paired-end reads explicitly with --est-insert-size ',
        str(insert_size), '), ',
        'QUAST will reuse this Upper Bound Assembly.\n',
    ]
    logger.notice(''.join(notice_parts))

    if not qconfig.debug:
        shutil.rmtree(tmp_dir)

    logger.main_info('Done.')
    return result_fpath
" <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)" ) sys.exit() BREAK_SCAFFOLDS = False if len(sys.argv) == 4: BREAK_SCAFFOLDS = True N_NUMBER = None counter = 0 if BREAK_SCAFFOLDS: N_NUMBER = int(sys.argv[2]) sizes_of_Ns_regions = dict() new_fasta = [] for id, (name, seq) in enumerate(fastaparser.read_fasta(sys.argv[1])): i = 0 cur_contig_number = 1 cur_contig_start = 0 while (i < len(seq)) and (seq.find("N", i) != -1): start = seq.find("N", i) end = start + 1 while (end != len(seq)) and (seq[end] == 'N'): end += 1 i = end + 1 if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER: new_fasta.append((name.split()[0] + "_" + str(cur_contig_number), seq[cur_contig_start:start])) cur_contig_number += 1 cur_contig_start = end
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly against the reference and analyze the alignments.

    The contigs are aligned with ``align_contigs``; on success the coords file
    is parsed into ``Mapping`` objects, passed through ``analyze_contigs`` and
    ``analyze_coverage``, and the combined statistics are printed with
    ``print_results``.

    Parameters:
        is_cyclic: forwarded unchanged to ``analyze_contigs``.
        index: assembly index, used only for log prefixes and passed to the aligner.
        contigs_fpath: path to the assembly FASTA being analyzed.
        output_dirpath: directory receiving the report files.
        ref_fpath: path to the reference FASTA.
        reference_chromosomes: mapping of reference sequence name -> length.
        ns_by_chromosomes: forwarded unchanged to ``analyze_coverage``.
        old_contigs_fpath: forwarded unchanged to ``align_contigs``.
        bed_fpath: accepted for interface compatibility; not used in this function.
        threads: number of threads handed to the aligner.

    Returns:
        A 5-tuple ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``. When the aligner does not return
        ``AlignerStatus.OK`` the tuple is ``(status, {}, [], [], [])``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_basename = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_basename + '.stdout')
        log_err_fpath = join(output_dirpath, report_basename + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_basename + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_basename + '.unaligned.info')
    else:
        # Space-efficient mode: every auxiliary report is discarded.
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath, index, threads,
                           log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                # Separators added so the two paths and the verb do not run together.
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ': ' +
                                coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        # These two handles were opened above and are otherwise only closed via
        # close_handlers() at the end of the successful path.
        icarus_out_f.close()
        misassembly_f.close()
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Loading the regions (if any); currently every reference sequence is one whole region.
    # TODO: gff
    regions = {}
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, \
        aligned_lengths_by_contigs = analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                                                     aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(ref_aligns, reference_chromosomes, ns_by_chromosomes,
                                                        used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches,
                 'indels_list': total_indels_info.indels_list,
                 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        # Output misassembled contigs to a separate FASTA file.
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'),
                                fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once instead of once per contig.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per reference sequence: its name, then the contigs aligned to it.
                    alignment_tsv_f.write(chr_name)
                    contigs = {align.contig for align in chr_aligns}
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # Each contig is considered only for the first reference it appears under.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        name_match = len_cov_pattern.search(contig)
                        if name_match:
                            contig_len, contig_cov = name_match.groups()
                            # Report the contig only when more than 90% of the length
                            # stated in its name is aligned to this reference.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens,
                    is_cyclic=None):
    """Classify every contig of an assembly by how it aligns to the reference.

    Each contig read from ``contigs_fpath`` is assigned one ``contig_type``
    ('unaligned', 'correct', 'ambiguous', 'correct_unaligned', 'mis_unaligned'
    or 'misassembled') based on the alignments found for it in ``aligns``.
    Along the way the function writes a human-readable log to
    ``ca_output.stdout_f``, alignment rows to ``ca_output.icarus_out_f`` and
    ``ca_output.coords_filtered_f``, unaligned contig names to
    ``unaligned_fpath`` and per-contig unaligned details (via
    ``save_unaligned_info``) to ``unaligned_info_fpath``.

    Parameters:
        ca_output: object exposing the open handles ``stdout_f``,
            ``icarus_out_f`` and ``coords_filtered_f``; also forwarded to
            ``process_misassembled_contig``.
        contigs_fpath: path of the assembly FASTA to iterate over.
        unaligned_fpath: path created/overwritten with unaligned contig names.
        unaligned_info_fpath: path created/overwritten with a TSV of unaligned info.
        aligns: mapping of contig name -> list of alignment objects.
        ref_features: forwarded unchanged to ``process_misassembled_contig``.
        ref_lens: forwarded to ``get_best_aligns_sets`` and ``process_misassembled_contig``.
        is_cyclic: forwarded to ``get_best_aligns_sets`` and ``process_misassembled_contig``.

    Returns:
        A 7-tuple ``(result, ref_aligns, total_indels_info, aligned_lengths,
        misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths)``
        where ``result`` is a dict of the counters accumulated below and
        ``ref_aligns`` maps reference name -> list of accepted alignments.
    """
    # A top alignment "captures most of the contig" when it is longer than
    # epsilon * contig length, or leaves fewer than maxun bases uncovered.
    maxun = 10
    epsilon = 0.99

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    half_unaligned_with_misassembly = 0
    misassembly_internal_overlap = 0

    ref_aligns = dict()
    # The two per-contig lists below get one entry appended per contig (initially 0)
    # and are updated in place through index [-1].
    contigs_aligned_lengths = []
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()
    misassemblies_in_contigs = []

    region_struct_variations = find_all_sv(qconfig.bed)

    # Square table (reference label x reference label) of counters, all starting at 0.
    istranslocations_by_ref = dict()
    misassemblies_by_ref = defaultdict(list)
    for ref in ref_labels_by_chromosomes.values():
        istranslocations_by_ref[ref] = dict((key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from local misassemblies)
    total_indels_info = IndelsInfo()

    unaligned_file = open(unaligned_fpath, 'w')
    unaligned_info_file = open(unaligned_info_fpath, 'w')
    unaligned_info_file.write('\t'.join(['Contig', 'Total_length', 'Unaligned_length', 'Unaligned_type', 'Unaligned_parts']) + '\n')
    for contig, seq in fastaparser.read_fasta(contigs_fpath):
        # Recording contig stats
        ctg_len = len(seq)
        ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len))
        contig_type = 'unaligned'
        misassemblies_in_contigs.append(0)
        contigs_aligned_lengths.append(0)
        # Alignments shorter than qconfig.min_alignment are ignored entirely.
        filtered_aligns = []
        if contig in aligns:
            filtered_aligns = [align for align in aligns[contig] if align.len2 >= qconfig.min_alignment]

        # Check if this contig aligned to the reference
        if filtered_aligns:
            contig_type = 'correct'
            # Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
            sorted_aligns = sorted(filtered_aligns, key=lambda x: (score_single_align(x), x.len2), reverse=True)
            top_len = sorted_aligns[0].len2
            top_id = sorted_aligns[0].idy
            top_score = score_single_align(sorted_aligns[0])
            top_aligns = []
            ca_output.stdout_f.write('Best alignment score: %.1f (LEN: %d, IDY: %.2f), Total number of alignments: %d\n'
                                     % (top_score, top_len, top_id, len(sorted_aligns)))

            # Check that top hit captures most of the contig
            if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                # Reset top aligns: aligns that share the same value of longest and highest identity
                top_aligns.append(sorted_aligns[0])
                sorted_aligns = sorted_aligns[1:]

                # Continue grabbing alignments while their score stays within
                # qconfig.ambiguity_score of the best score
                # (previously: while length and identity are identical)
                #while sorted_aligns and top_len == sorted_aligns[0].len2 and top_id == sorted_aligns[0].idy:
                while sorted_aligns and (score_single_align(sorted_aligns[0]) >= qconfig.ambiguity_score * top_score):
                    top_aligns.append(sorted_aligns[0])
                    sorted_aligns = sorted_aligns[1:]

                # Mark other alignments as insignificant (former ambiguous)
                if sorted_aligns:
                    ca_output.stdout_f.write('\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n' % str(qconfig.ambiguity_score))
                    for align in sorted_aligns:
                        ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')

                if len(top_aligns) == 1:
                    # There is only one top align, life is good
                    ca_output.stdout_f.write('\t\tOne align captures most of this contig: %s\n' % str(top_aligns[0]))
                    ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
                    ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
                    ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
                    aligned_lengths.append(top_aligns[0].len2)
                    contigs_aligned_lengths[-1] = top_aligns[0].len2
                else:
                    # There is more than one top align
                    ca_output.stdout_f.write('\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n' %
                                             len(top_aligns))

                    # Increment count of ambiguously mapped contigs and bases in them
                    ambiguous_contigs += 1
                    # we count only extra bases, so we shouldn't include bases in the first alignment
                    # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                    ambiguous_contigs_len += ctg_len

                    # Alex: skip all alignments or count them as normal (just different aligns of one repeat). Depend on --allow-ambiguity option
                    if qconfig.ambiguity_usage == "none":
                        # Nothing is recorded in ref_aligns; contig_type stays 'correct'.
                        ambiguous_contigs_extra_bases -= top_aligns[0].len2
                        ca_output.stdout_f.write('\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n')
                        for align in top_aligns:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                    elif qconfig.ambiguity_usage == "one":
                        # No-op, kept for symmetry with the other branches.
                        ambiguous_contigs_extra_bases += 0
                        ca_output.stdout_f.write('\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n')
                        ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
                        ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str() + '\n')
                        ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
                        aligned_lengths.append(top_aligns[0].len2)
                        contigs_aligned_lengths[-1] = top_aligns[0].len2
                        ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + '\n')
                        top_aligns = top_aligns[1:]
                        for align in top_aligns:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                    elif qconfig.ambiguity_usage == "all":
                        # Pre-subtract the first alignment: the loop below adds every
                        # alignment's length, so the net effect counts only the extras.
                        ambiguous_contigs_extra_bases -= top_aligns[0].len2
                        ca_output.stdout_f.write('\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n')
                        # we count only extra bases, so we shouldn't include bases in the first alignment
                        first_alignment = True
                        contig_type = 'ambiguous'
                        while len(top_aligns):
                            ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_aligns[0]))
                            ca_output.icarus_out_f.write(top_aligns[0].icarus_report_str(ambiguity=True) + '\n')
                            ref_aligns.setdefault(top_aligns[0].ref, []).append(top_aligns[0])
                            # Only the first alignment contributes to the aligned-length statistics.
                            if first_alignment:
                                first_alignment = False
                                aligned_lengths.append(top_aligns[0].len2)
                                contigs_aligned_lengths[-1] = top_aligns[0].len2
                            ambiguous_contigs_extra_bases += top_aligns[0].len2
                            ca_output.coords_filtered_f.write(top_aligns[0].coords_str() + ' ambiguous\n')
                            top_aligns = top_aligns[1:]
            else:
                # choose appropriate alignments (to maximize total size of contig alignment and reduce # misassemblies)
                is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                    sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, is_cyclic, region_struct_variations)
                the_best_set = best_sets[0]
                # When too_much_best_sets is set, every alignment is treated as used.
                used_indexes = list(range(len(sorted_aligns)) if too_much_best_sets else get_used_indexes(best_sets))
                if len(used_indexes) < len(sorted_aligns):
                    ca_output.stdout_f.write('\t\t\tSkipping redundant alignments after choosing the best set of alignments\n')
                    for idx in set([idx for idx in range(len(sorted_aligns)) if idx not in used_indexes]):
                        ca_output.stdout_f.write('\t\tSkipping redundant alignment ' + str(sorted_aligns[idx]) + '\n')

                if is_ambiguous:
                    ca_output.stdout_f.write('\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n')
                    # similar to regular ambiguous contigs, see above
                    ambiguous_contigs += 1
                    ambiguous_contigs_len += ctg_len

                    if qconfig.ambiguity_usage == "none":
                        ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                        ca_output.stdout_f.write('\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n')
                        for idx in used_indexes:
                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n')
                        # NOTE: this jumps to the next contig, so the per-contig 'CONTIG' row
                        # and blank log line written at the end of the loop body are not emitted here.
                        continue
                    elif qconfig.ambiguity_usage == "one":
                        # No-op, kept for symmetry with the other branches.
                        ambiguous_contigs_extra_bases += 0
                        ca_output.stdout_f.write('\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n')
                        if len(the_best_set.indexes) < len(used_indexes):
                            ca_output.stdout_f.write('\t\tSo, skipping alignments from other sets:\n')
                            for idx in used_indexes:
                                if idx not in the_best_set.indexes:
                                    ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(sorted_aligns[idx]) + '\n')
                    elif qconfig.ambiguity_usage == "all":
                        ca_output.stdout_f.write('\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n')
                        ca_output.stdout_f.write('\t\t\tThe very best set is shown in details below, the rest are:\n')
                        # Groups are numbered from 2 because group #1 is the best set itself.
                        for idx, cur_set in enumerate(best_sets[1:]):
                            ca_output.stdout_f.write('\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                                (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered))
                        if too_much_best_sets:
                            ca_output.stdout_f.write('\t\t\t\tetc...\n')
                        if len(the_best_set.indexes) < len(used_indexes):
                            # Subtract the bases covered by the best set, then add every used
                            # alignment's length: the remainder is the "extra" ambiguous bases.
                            ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                            ca_output.stdout_f.write('\t\t\tList of alignments used in the sets above:\n')
                            for idx in used_indexes:
                                align = sorted_aligns[idx]
                                ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align))
                                ref_aligns.setdefault(align.ref, []).append(align)
                                ambiguous_contigs_extra_bases += align.len2
                                ca_output.coords_filtered_f.write(align.coords_str() + " ambiguous\n")
                                if idx not in the_best_set.indexes:
                                    ca_output.icarus_out_f.write(align.icarus_report_str(is_best=False) + '\n')

                ca_output.stdout_f.write('\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' % \
                    (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered))
                real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                # main processing part
                if len(real_aligns) == 1:
                    the_only_align = real_aligns[0]

                    # There is only one alignment of this contig to the reference
                    ca_output.coords_filtered_f.write(the_only_align.coords_str() + '\n')
                    aligned_lengths.append(the_only_align.len2)
                    contigs_aligned_lengths[-1] = the_only_align.len2

                    # begin/end are used as 1-based inclusive contig coordinates below:
                    # (begin - 1) bases precede the alignment, (ctg_len - end) follow it.
                    begin, end = the_only_align.start(), the_only_align.end()
                    unaligned_bases = (begin - 1) + (ctg_len - end)
                    number_unaligned_ns = seq[:begin - 1].count('N') + seq[end:].count('N')
                    aligned_bases_in_contig = ctg_len - unaligned_bases
                    acgt_ctg_len = ctg_len - seq.count('N')
                    is_partially_unaligned = check_partially_unaligned(seq, real_aligns, ctg_len)
                    if is_partially_unaligned:
                        partially_unaligned += 1
                        # N bases in the unaligned flanks are not counted as unaligned bases.
                        partially_unaligned_bases += unaligned_bases - number_unaligned_ns
                        if aligned_bases_in_contig < qconfig.unaligned_mis_threshold * acgt_ctg_len:
                            contig_type = 'correct_unaligned'
                        ca_output.stdout_f.write('\t\tThis contig is partially unaligned. '
                                                 '(Aligned %d out of %d non-N bases (%.2f%%))\n'
                                                 % (aligned_bases_in_contig, acgt_ctg_len,
                                                    100.0 * aligned_bases_in_contig / acgt_ctg_len))
                        save_unaligned_info(real_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file)
                    ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(the_only_align))
                    ca_output.icarus_out_f.write(the_only_align.icarus_report_str() + '\n')
                    if is_partially_unaligned:
                        if begin - 1:
                            ca_output.stdout_f.write('\t\tUnaligned bases: 1 to %d (%d)\n' % (begin - 1, begin - 1))
                        if ctg_len - end:
                            ca_output.stdout_f.write('\t\tUnaligned bases: %d to %d (%d)\n' % (end + 1, ctg_len, ctg_len - end))
                    if qconfig.is_combined_ref:
                        check_for_potential_translocation(seq, ctg_len, real_aligns, region_misassemblies, misassemblies_by_ref, ca_output.stdout_f)
                    ref_aligns.setdefault(the_only_align.ref, []).append(the_only_align)
                else:
                    # Sort real alignments by position on the contig
                    sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                    # There is more than one alignment of this contig to the reference
                    ca_output.stdout_f.write('\t\tThis contig is misassembled.\n')
                    unaligned_bases = the_best_set.uncovered
                    # Count the Ns lying in the gaps before, between and after the alignments.
                    number_unaligned_ns, prev_pos = 0, 0
                    for align in sorted_aligns:
                        number_unaligned_ns += seq[prev_pos: align.start() - 1].count('N')
                        prev_pos = align.end()
                    number_unaligned_ns += seq[prev_pos:].count('N')

                    aligned_bases_in_contig = ctg_len - unaligned_bases
                    number_ns = seq.count('N')
                    acgt_ctg_len = ctg_len - number_ns
                    is_partially_unaligned = check_partially_unaligned(seq, sorted_aligns, ctg_len)
                    if is_partially_unaligned:
                        partially_unaligned += 1
                        partially_unaligned_bases += unaligned_bases - number_unaligned_ns
                        ca_output.stdout_f.write('\t\tThis contig is partially unaligned. '
                                                 '(Aligned %d out of %d non-N bases (%.2f%%))\n'
                                                 % (aligned_bases_in_contig, acgt_ctg_len,
                                                    100.0 * aligned_bases_in_contig / acgt_ctg_len))
                        save_unaligned_info(sorted_aligns, contig, ctg_len, unaligned_bases, unaligned_info_file)

                    # Contigs aligned below the threshold fraction are reported as
                    # 'mis_unaligned' and are NOT passed to the misassembly analysis.
                    if aligned_bases_in_contig < qconfig.unaligned_mis_threshold * acgt_ctg_len:
                        ca_output.stdout_f.write('\t\t\tWarning! This contig is more unaligned than misassembled. ' + \
                            'Contig length is %d (number of Ns: %d) and total length of all aligns is %d\n' %
                            (ctg_len, number_ns, aligned_bases_in_contig))
                        contigs_aligned_lengths[-1] = sum(align.len2 for align in sorted_aligns)
                        for align in sorted_aligns:
                            ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align))
                            ca_output.icarus_out_f.write(align.icarus_report_str() + '\n')
                            ca_output.icarus_out_f.write('unknown\n')
                            ca_output.coords_filtered_f.write(align.coords_str() + '\n')
                            aligned_lengths.append(align.len2)
                            ref_aligns.setdefault(align.ref, []).append(align)

                        half_unaligned_with_misassembly += 1
                        ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases)
                        contig_type = 'mis_unaligned'
                        # The per-contig footer is written here because the `continue`
                        # below skips the shared footer at the end of the loop body.
                        ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type + '\n']))
                        ca_output.stdout_f.write('\n')
                        continue

                    ### processing misassemblies
                    is_misassembled, current_mio, indels_info, cnt_misassemblies, contig_aligned_length = \
                        process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths, region_misassemblies,
                                                    ref_lens, ref_aligns, ref_features, seq, misassemblies_by_ref,
                                                    istranslocations_by_ref, region_struct_variations, ca_output)
                    contigs_aligned_lengths[-1] = contig_aligned_length
                    misassembly_internal_overlap += current_mio
                    total_indels_info += indels_info
                    if is_misassembled:
                        misassembled_contigs[contig] = ctg_len
                        contig_type = 'misassembled'
                        misassemblies_in_contigs[-1] = cnt_misassemblies
                    if is_partially_unaligned:
                        ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases)
                    if qconfig.is_combined_ref:
                        check_for_potential_translocation(seq, ctg_len, sorted_aligns, region_misassemblies,
                                                          misassemblies_by_ref, ca_output.stdout_f)
        else:
            # No aligns to this contig
            ca_output.stdout_f.write('\t\tThis contig is unaligned. (%d bp)\n' % ctg_len)
            unaligned_file.write(contig + '\n')

            # Increment unaligned contig count and bases (Ns are excluded from the base count)
            unaligned += 1
            number_ns = seq.count('N')
            fully_unaligned_bases += ctg_len - number_ns
            ca_output.stdout_f.write('\t\tUnaligned bases: %d (number of Ns: %d)\n' % (ctg_len, number_ns))
            save_unaligned_info([], contig, ctg_len, ctg_len, unaligned_info_file)

        # Shared per-contig footer: one 'CONTIG' row for Icarus and a blank line in the log.
        ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type]) + '\n')
        ca_output.stdout_f.write('\n')

    unaligned_file.close()
    unaligned_info_file.close()
    misassembled_bases = sum(misassembled_contigs.values())

    # special case: --skip-unaligned-mis-contigs is specified
    if qconfig.unaligned_mis_threshold == 0.0:
        half_unaligned_with_misassembly = None

    result = {'region_misassemblies': region_misassemblies,
              'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None,
              'misassembled_contigs': misassembled_contigs, 'misassembled_bases': misassembled_bases,
              'misassembly_internal_overlap': misassembly_internal_overlap,
              'unaligned': unaligned, 'partially_unaligned': partially_unaligned,
              'partially_unaligned_bases': partially_unaligned_bases, 'fully_unaligned_bases': fully_unaligned_bases,
              'ambiguous_contigs': ambiguous_contigs, 'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases,
              'ambiguous_contigs_len': ambiguous_contigs_len,
              'half_unaligned_with_misassembly': half_unaligned_with_misassembly,
              'misassemblies_by_ref': misassemblies_by_ref,
              'istranslocations_by_refs': istranslocations_by_ref}

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, contigs_aligned_lengths
return qutils.correct_name(name) # return re.sub(r'\W', '', re.sub(r'\s', '_', name)) # MAIN if len(sys.argv) != 3: print("Usage: " + sys.argv[0] + " <input fasta> <contig id or file with list of contig ids>") sys.exit() if os.path.isfile(sys.argv[2]): list_of_ids = [] for line in open(sys.argv[2]): list_of_ids.append(line.strip()) else: list_of_ids = [sys.argv[2]] origin_fasta = fastaparser.read_fasta(sys.argv[1]) dict_of_all_contigs = dict() selected_contigs = [] for (name, seq) in origin_fasta: corr_name = get_corr_name(name) dict_of_all_contigs[corr_name] = seq for name in list_of_ids: corr_name = get_corr_name(name) if corr_name in dict_of_all_contigs: selected_contigs.append((name, dict_of_all_contigs[corr_name])) else: print >> sys.stderr, "Contig", name, "(cor name:", corr_name, ") not found!" for (name, seq) in selected_contigs: print '>' + name
def _parse_kmc_stat_value(line):
    """Return the text following the last ': ' separator of a stats-file line."""
    return line.strip().split(': ')[-1]


def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the k-mer based analysis (KMC) for every assembly and fill the reports.

    Assemblies for which ``check_kmc_successful_check`` reports valid cached
    results are restored from their ``<label>.stat`` file; the remaining ones
    are processed with KMC: k-mer completeness against the reference is always
    computed, and the fractions of assembly length mapping to one / several /
    no reference sequences are computed unless the assembly or the reference
    is too fragmented.

    Parameters:
        output_dir: directory holding the ``.stat`` files, KMC logs and ``tmp`` dir.
        ref_fpath: path to the reference FASTA.
        contigs_fpaths: list of assembly FASTA paths.
        logger: logger used for all progress and warning messages.

    Returns:
        None on every path; results are stored through ``reporting`` and
        ``create_kmc_stats_file``.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique ' + str(KMERS_LEN) + '-mers...')

    # Reuse cached results where available.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            with open(kmc_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            # str.split never returns an empty list, so an empty stats file must be
            # detected by its (blank) first line; such an assembly is re-analyzed below.
            if not stats_content[0].strip():
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(_parse_kmc_stat_value(stats_content[0])))
            # Scaffolding statistics are present only when at least five lines were saved.
            if len(stats_content) >= 5:
                len_map_to_one_chrom = int(_parse_kmc_stat_value(stats_content[1]))
                len_map_to_multi_chrom = int(_parse_kmc_stat_value(stats_content[2]))
                len_map_to_none_chrom = int(_parse_kmc_stat_value(stats_content[3]))
                total_len = int(_parse_kmc_stat_value(stats_content[4]))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                                 '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                                 '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                                 '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if not contigs_fpaths:
        logger.info('Done.')
        return None

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    global kmc_bin_fpath
    global kmc_tools_fpath
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC', platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC', platform_specific=True,
                                             is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None

    logger.info('Running KMC on reference...')
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Truncate logs left over from a previous run.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        return None

    logger.info('Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath], log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info('Analyzing assemblies accuracy...')
    # k-mers shared by the reference and every assembly.
    if len(kmc_out_fpaths) > 1:
        shared_kmc_db = intersect_kmers(tmp_dirpath, kmc_out_fpaths, log_fpath, err_fpath)
    else:
        shared_kmc_db = kmc_out_fpaths[0]

    kmer_fraction = 0.001

    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    ref_kmc_dbs = []

    # Per-sequence marker databases are built only for references that are not too fragmented.
    if len(ref_contigs) <= MAX_REF_CONTIGS_NUM:
        shared_downsampled_kmc_db = downsample_kmers(tmp_dirpath, ref_fpath, shared_kmc_db, log_fpath, err_fpath,
                                                     kmer_fraction=kmer_fraction)
        for name, seq in read_fasta(ref_fpath):
            seq_kmc_db = seq_to_kmc_db(tmp_dirpath, log_fpath, err_fpath, seq=seq, name=name, is_ref=True,
                                       intersect_with=shared_downsampled_kmc_db)
            ref_kmc_dbs.append((name, seq_kmc_db))

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = None
        len_map_to_multi_chrom = None
        len_map_to_none_chrom = None
        total_len = 0
        long_contigs = []
        contig_lens = dict()
        contig_markers = defaultdict(list)
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        list_files_fpath = join(tmp_dirpath, label + '_files.txt')
        with open(list_files_fpath, 'w') as list_files:
            for name, seq in read_fasta(contigs_fpath):
                total_len += len(seq)
                contig_lens[name] = len(seq)
                # Each sufficiently long contig is dumped to its own FASTA file
                # and listed in the file-of-files.
                if len(seq) >= MIN_CONTIGS_LEN:
                    long_contigs.append(len(seq))
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    with open(tmp_contig_fpath, 'w') as out_f:
                        out_f.write('>%s\n' % name)
                        out_f.write('%s\n' % seq)
                    list_files.write(tmp_contig_fpath + '\n')

        if len(long_contigs) > MAX_CONTIGS_NUM or sum(long_contigs) < total_len * 0.5:
            logger.warning('Assembly is too fragmented. Scaffolding accuracy will not be assessed.')
        elif len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            len_map_to_one_chrom = 0
            len_map_to_multi_chrom = 0
            filtered_fpath = join(tmp_dirpath, label + '.filtered.fasta')
            filter_contigs(list_files_fpath, filtered_fpath, shared_kmc_db, log_fpath, err_fpath,
                           min_kmers=MIN_MARKERS)
            filtered_list_files_fpath = join(tmp_dirpath, label + '_files.filtered.txt')
            with open(filtered_list_files_fpath, 'w') as list_files:
                for name, _ in read_fasta(filtered_fpath):
                    tmp_contig_fpath = join(tmp_dirpath, name + '.fasta')
                    list_files.write(tmp_contig_fpath + '\n')
            # Record, for each contig, every reference sequence it shares enough markers with.
            for ref_name, ref_kmc_db in ref_kmc_dbs:
                tmp_filtered_fpath = join(tmp_dirpath, ref_name + '.filtered.fasta')
                filter_contigs(filtered_list_files_fpath, tmp_filtered_fpath, ref_kmc_db, log_fpath, err_fpath,
                               min_kmers=MIN_MISJOIN_MARKERS)
                if exists(tmp_filtered_fpath):
                    for name, _ in read_fasta(tmp_filtered_fpath):
                        contig_markers[name].append(ref_name)
            for name, chr_markers in contig_markers.items():
                if len(chr_markers) == 1:
                    len_map_to_one_chrom += contig_lens[name]
                else:
                    len_map_to_multi_chrom += contig_lens[name]
            len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_kmc_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom, total_len)

    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
import sys
import os

sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '../'))
import quast_libs
from quast_libs import fastaparser

# Command-line tool: print the START..END fragment (1-based, inclusive) of every
# entry of a FASTA file, optionally reverse-complemented.
if len(sys.argv) <= 3 or len(sys.argv) >= 6:
    print("Returns [reverse-complement] sequence from START to END position from each entry of input fasta")
    print("Usage: " + sys.argv[0] + " <input fasta> <START> <END, -1 for the end> [any string -- optional parameter for reverse-complement]")
    sys.exit()

inp = sys.argv[1]
start = int(sys.argv[2])
end = int(sys.argv[3])
# The mere presence of a fourth argument (whatever its value) requests reverse-complement.
reverse = len(sys.argv) == 5

for name, seq in fastaparser.read_fasta(inp):
    seq_len = len(seq)
    # Positions are 1-based: clamp START into [1, seq_len] (0 only for an empty
    # sequence). Without the lower bound, START <= 0 made the slice index negative,
    # so Python sliced from the END of the sequence instead of from its beginning.
    cur_start = min(max(start, 1), seq_len)
    if end == -1:
        cur_end = seq_len
    else:
        cur_end = min(end, seq_len)
    print(">" + name + "_cropped_" + str(cur_start) + "_" + str(cur_end))
    fragment = seq[max(cur_start - 1, 0):cur_end]
    if reverse:
        print(fastaparser.rev_comp(fragment))
    else:
        print(fragment)
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic length/GC statistics for every assembly and plot them.

    For each file in ``contigs_fpaths`` the contig lengths, number of ``N``
    characters and (when the contig names carry a ``_cov_<number>`` tag)
    a coverage histogram are collected; N50/L50, N75/L75 and, if a reference
    length is known, NG50/LG50 and NG75/LG75 are stored in the assembly's
    report.  Nx/NGx plots are always requested from ``plotter``; cumulative,
    GC and coverage plots only when ``qconfig.draw_plots`` is set.

    :param ref_fpath: path to the reference FASTA, or a false value.
    :param contigs_fpaths: paths to the assembly FASTA files.
    :param output_dirpath: directory for plots and GC text files
        (created if it does not exist).
    :param results_dir: directory handed to ``html_saver`` and ``plotter.Nx_plot``.
    :return: ``(icarus_gc_fpath, circos_gc_fpath)``; each is ``None`` unless
        the corresponding file was written.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info(' Reference genome:')
        # The conditional must be evaluated on its own: without the separate
        # variable (or parentheses) it swallows the whole concatenation and
        # only 'undefined' is logged when the GC content is unknown.
        reference_GC_str = '%.2f' % reference_GC if reference_GC is not None else 'undefined'
        logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' + reference_GC_str)
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        # coverage_hist[c] accumulates the total length of contigs whose
        # name reports an (integer-truncated) coverage of c.
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(idx) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                if len(coverage_hist) <= cov:
                    coverage_hist += [0] * (cov - len(coverage_hist) + 1)
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many contigs to save one point each: merge every `multiplicator`
        # consecutive lengths into a single summed point.
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
                                  for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)]
                                 for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths) else
                                 sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + \
                                [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last point of every assembly collects whatever was not merged above.
        for corr_lengths, lengths in zip(corr_lists_of_lengths, lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lengths[last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for idx, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        # Each optional value is formatted separately so that a missing one
        # replaces only its own field, not the whole log line.
        total_GC_str = '%.2f' % total_GC if total_GC is not None else 'undefined'
        if total_length != 0:
            uncalled_str = ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            uncalled_str = 'undefined'
        logger.info(' ' + qutils.index_to_str(idx) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + total_GC_str +
                    ', # N\'s per 100 kbp = ' + uncalled_str)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # NOTE: this is an alias, not a copy -- appending the reference below
    # also extends list_of_GC_distributions.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for _ in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution,
                                                join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    Runs ``align_contigs`` and, if it reports ``NucmerStatus.OK``, parses the
    coords file, optionally loads SNP calls (``qconfig.show_snps``), and feeds
    everything to ``analyze_contigs`` / ``analyze_coverage``.  When
    ``qconfig.space_efficient`` is set all log/report paths are ``/dev/null``.

    :param is_cyclic: forwarded to ``analyze_contigs``.
    :param index: assembly index, used for log-message prefixes and by ``align_contigs``.
    :param contigs_fpath: path to the assembly FASTA.
    :param output_dirpath: directory for reports and logs.
    :param ref_fpath: path to the reference FASTA.
    :param old_contigs_fpath: forwarded to ``align_contigs``.
    :param bed_fpath: not used by this function.
    :param parallel_by_chr: forwarded to ``align_contigs``.
    :param threads: forwarded to ``align_contigs``.
    :return: 5-tuple ``(status, result, aligned_lengths,
        misassemblies_in_contigs, aligned_lengths_by_contigs)``.  If the
        aligner did not finish with ``OK`` the tuple is
        ``(status, {}, [], [], [])``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        log_out_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stdout')
        log_err_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, qconfig.contig_report_fname_pattern % corr_assembly_label + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath + ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' + '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' + '\'' + assembly_label + '\'.')
        # These handles are normally closed through close_handlers(ca_output),
        # which is never reached on this early exit.
        icarus_out_f.close()
        misassembly_f.close()
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')
    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines of the coords file are copied through verbatim.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            if line.strip() == '':
                break
            assert line[0] != '='
            # Store each alignment line grouped by contig
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_lens = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        ref_lens[name] = len(seq)
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}  # snps[ref][ctg][ref_pos] -> list of SNP
    if qconfig.show_snps:
        prev_line = None
        for line in open_gzipsafe(show_snps_fpath):
            line = line.split()
            if not line[0].isdigit():
                continue
            if prev_line and line == prev_line:
                # skip a record identical to the one just stored
                continue
            ref = line[10]
            ctg = line[11]
            pos = int(line[0])
            loc = int(line[3])

            ctg_snps = snps.setdefault(ref, {}).setdefault(ctg, {})
            ctg_snps.setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=line[1], ctg_nucl=line[2]))
            prev_line = line
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    regions = {}
    total_reg_len = 0
    total_regions = 0
    # # TODO: gff
    # log_out_f.write('Loading regions...\n')
    # log_out_f.write('\tNo regions given, using whole reference.\n')
    for name, seq_len in ref_lens.items():
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f, coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, misassemblies_in_contigs, aligned_lengths_by_contigs =\
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens, is_cyclic)

    # if qconfig.large_genome:
    #     log_out_f.write('Analyzing large blocks...\n')
    #     large_misassembly_fpath = add_suffix(misassembly_fpath, 'large_blocks') if not qconfig.space_efficient else '/dev/null'
    #     ca_large_output = CAOutput(stdout_f=log_out_f, misassembly_f=open(large_misassembly_fpath, 'w'),
    #                                coords_filtered_f=coords_filtered_file, used_snps_f=open('/dev/null', 'w'), icarus_out_f=open('/dev/null', 'w'))
    #     min_alignment, extensive_mis_threshold = qconfig.min_alignment, qconfig.extensive_misassembly_threshold
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = qconfig.LARGE_MIN_ALIGNMENT, qconfig.LARGE_EXTENSIVE_MIS_THRESHOLD
    #     result.update(analyze_contigs(ca_large_output, contigs_fpath, '/dev/null', '/dev/null',
    #                                   aligns, ref_features, ref_lens, is_cyclic, large_misassemblies_search=True)[0])
    #     qconfig.min_alignment, qconfig.extensive_misassembly_threshold = min_alignment, extensive_mis_threshold

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs]
        fastaparser.write_fasta(join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once here instead of once per contig inside the loops.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    # One TSV row per chromosome: its name, then every contig aligned to it.
                    alignment_tsv_f.write(chr_name)
                    contigs = set([align.contig for align in chr_aligns])
                    for contig in contigs:
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # A contig is considered only for the first chromosome it shows up in.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        len_cov_match = len_cov_pattern.search(contig)
                        if len_cov_match:
                            contig_len, contig_cov = len_cov_match.groups()
                            # Report contigs aligned over more than 90% of the length stated in their name.
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)
    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       cov_fpath=None, physical_cov_fpath=None, gc_fpath=None, stdout_pattern=None,
       find_similar=True, features=None, json_output_dir=None, genes_by_labels=None):
    """Collect per-assembly alignment data and build the Icarus viewer / SVG plot.

    Reads chromosome lengths from ``ref_fpath`` (if given), parses each
    assembly either from its contig report (when
    ``contig_report_fpath_pattern`` is set) or straight from the FASTA file,
    and passes the gathered data to ``js_data_gen`` and, when
    ``qconfig.draw_svg`` is set, ``draw_alignment_plot``.

    :return: ``(icarus_html_fpath, plot_fpath)``, either of which may be
        ``None``; a bare ``None`` is returned if a contig report yields no
        aligned blocks object.
    """
    make_output_dir(output_dirpath)

    lists_of_aligned_blocks = []
    contigs_by_assemblies = OrderedDict()
    structures_by_labels = {}
    ambiguity_alignments_by_labels = {}
    total_genome_size = 0
    reference_chromosomes = OrderedDict()
    contig_names_by_refs = None
    assemblies = None
    chr_names = []
    features_data = None
    plot_fpath = None

    if ref_fpath:
        for header, sequence in fastaparser.read_fasta(ref_fpath):
            chrom = header.split()[0]
            chrom_len = len(sequence)
            chr_names.append(chrom)
            total_genome_size += chrom_len
            reference_chromosomes[chrom] = chrom_len

        virtual_genome_shift = 100
        sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
        sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)
        cumulative_ref_lengths = [0]

        if ref_labels_by_chromosomes:
            contig_names_by_refs = ref_labels_by_chromosomes
        elif sum(reference_chromosomes.values()) > qconfig.MAX_SIZE_FOR_COMB_PLOT:
            contig_names_by_refs = dict()
            if len(chr_names) > qconfig.ICARUS_MAX_CHROMOSOMES:
                # Pack chromosomes into numbered viewer parts; a new part is
                # started once the running length reaches the size limit.
                part_len = 0
                part_no = 1
                for chrom, chrom_len in reference_chromosomes.items():
                    part_len += chrom_len
                    contig_names_by_refs[chrom] = qconfig.alignment_viewer_part_name + str(part_no)
                    if part_len >= qconfig.MAX_SIZE_FOR_COMB_PLOT:
                        part_len = 0
                        part_no += 1
            else:
                # Every chromosome gets a viewer page of its own.
                contig_names_by_refs.update((chrom, chrom) for chrom in chr_names)

        last_pos = len(chr_names) - 1
        for pos, chrom in enumerate(chr_names):
            running_len = cumulative_ref_lengths[-1] + reference_chromosomes[chrom]
            # The cumulative offset restarts whenever the next chromosome
            # belongs to a different viewer page.
            if (contig_names_by_refs and pos < last_pos
                    and contig_names_by_refs[chrom] != contig_names_by_refs[chr_names[pos + 1]]):
                running_len = 0
            cumulative_ref_lengths.append(running_len)

        gaps_total = virtual_genome_shift * (len(reference_chromosomes) - 1)
        virtual_genome_size = sum(reference_chromosomes.values()) + gaps_total

    for contigs_fpath in contigs_fpaths:
        label = qconfig.assembly_labels_by_fpath[contigs_fpath]
        if contig_report_fpath_pattern:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
            parsed_report = parse_aligner_contig_report(
                report_fpath, list(reference_chromosomes.keys()), cumulative_ref_lengths)
            aligned_blocks, misassembled_id_to_structure, contigs, ambiguity_alignments = parsed_report
            if not contigs:
                contigs = parse_contigs_fpath(contigs_fpath)
            if aligned_blocks is None:
                return None
            for block in aligned_blocks:
                block.label = label
            aligned_blocks = check_misassembled_blocks(aligned_blocks, misassembled_id_to_structure)
            lists_of_aligned_blocks.append(aligned_blocks)
            structures_by_labels[label] = misassembled_id_to_structure
            if qconfig.ambiguity_usage == 'all':
                ambiguity_alignments_by_labels[label] = ambiguity_alignments
        else:
            contigs = parse_contigs_fpath(contigs_fpath)
        contigs_by_assemblies[label] = contigs

    if ref_fpath:
        features_data = parse_features_data(features, cumulative_ref_lengths, chr_names)
    if contigs_fpaths and qconfig.gene_finding:
        parse_genes_data(contigs_by_assemblies, genes_by_labels)

    if reference_chromosomes and lists_of_aligned_blocks:
        assemblies = get_assemblies(contigs_fpaths, lists_of_aligned_blocks, virtual_genome_size, find_similar)
        if qconfig.draw_svg:
            plot_fpath = draw_alignment_plot(assemblies, virtual_genome_size, output_dirpath,
                                             sorted_ref_names, sorted_ref_lengths, virtual_genome_shift)

    icarus_html_fpath = None
    if (assemblies or contigs_by_assemblies) and qconfig.create_icarus_html:
        icarus_html_fpath = js_data_gen(
            assemblies, contigs_fpaths, reference_chromosomes, output_dirpath, structures_by_labels,
            contig_names_by_refs=contig_names_by_refs,
            ref_fpath=ref_fpath,
            stdout_pattern=stdout_pattern,
            ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
            contigs_by_assemblies=contigs_by_assemblies,
            features_data=features_data,
            gc_fpath=gc_fpath,
            cov_fpath=cov_fpath,
            physical_cov_fpath=physical_cov_fpath,
            json_output_dir=json_output_dir)

    return icarus_html_fpath, plot_fpath
def do(contigs_fpaths, contig_report_fpath_pattern, output_dirpath, ref_fpath,
       cov_fpath=None, physical_cov_fpath=None, stdout_pattern=None,
       find_similar=True, features=None, json_output_dir=None, genes_by_labels=None):
    """Collect per-assembly alignment data and build the Icarus viewer / SVG plot.

    Reads chromosome lengths from ``ref_fpath`` (if given), parses each
    assembly either from its contig report (when
    ``contig_report_fpath_pattern`` is set) or straight from the FASTA file,
    and passes the gathered data to ``js_data_gen`` and, when
    ``qconfig.draw_svg`` is set, ``draw_alignment_plot``.

    :return: ``(icarus_html_fpath, plot_fpath)``, either of which may be
        ``None``; a bare ``None`` is returned if a contig report yields no
        aligned blocks object.
    """
    make_output_dir(output_dirpath)

    lists_of_aligned_blocks = []
    contigs_by_assemblies = OrderedDict()
    structures_by_labels = {}
    ambiguity_alignments_by_labels = {}
    total_genome_size = 0
    reference_chromosomes = OrderedDict()
    contig_names_by_refs = None
    assemblies = None
    chr_names = []
    features_data = None
    plot_fpath = None
    max_small_chromosomes = 10

    if ref_fpath:
        for header, sequence in fastaparser.read_fasta(ref_fpath):
            chrom = header.split()[0]
            chrom_len = len(sequence)
            chr_names.append(chrom)
            total_genome_size += chrom_len
            reference_chromosomes[chrom] = chrom_len

        virtual_genome_shift = 100
        sorted_ref_names = sorted(reference_chromosomes, key=reference_chromosomes.get, reverse=True)
        sorted_ref_lengths = sorted(reference_chromosomes.values(), reverse=True)
        cumulative_ref_lengths = [0]

        if ref_labels_by_chromosomes:
            contig_names_by_refs = ref_labels_by_chromosomes
        elif sum(reference_chromosomes.values()) > qconfig.MAX_SIZE_FOR_COMB_PLOT:
            contig_names_by_refs = dict()
            if len(chr_names) > max_small_chromosomes:
                # Pack chromosomes into numbered viewer parts; a new part is
                # started once the running length reaches the size limit.
                part_len = 0
                part_no = 1
                for chrom, chrom_len in reference_chromosomes.items():
                    part_len += chrom_len
                    contig_names_by_refs[chrom] = qconfig.alignment_viewer_part_name + str(part_no)
                    if part_len >= qconfig.MAX_SIZE_FOR_COMB_PLOT:
                        part_len = 0
                        part_no += 1
            else:
                # Every chromosome gets a viewer page of its own.
                contig_names_by_refs.update((chrom, chrom) for chrom in chr_names)

        last_pos = len(chr_names) - 1
        for pos, chrom in enumerate(chr_names):
            running_len = cumulative_ref_lengths[-1] + reference_chromosomes[chrom]
            # The cumulative offset restarts whenever the next chromosome
            # belongs to a different viewer page.
            if (contig_names_by_refs and pos < last_pos
                    and contig_names_by_refs[chrom] != contig_names_by_refs[chr_names[pos + 1]]):
                running_len = 0
            cumulative_ref_lengths.append(running_len)

        gaps_total = virtual_genome_shift * (len(reference_chromosomes) - 1)
        virtual_genome_size = sum(reference_chromosomes.values()) + gaps_total

    for contigs_fpath in contigs_fpaths:
        label = qconfig.assembly_labels_by_fpath[contigs_fpath]
        if contig_report_fpath_pattern:
            report_fpath = contig_report_fpath_pattern % qutils.label_from_fpath_for_fname(contigs_fpath)
            parsed_report = parse_nucmer_contig_report(
                report_fpath, list(reference_chromosomes.keys()), cumulative_ref_lengths)
            aligned_blocks, misassembled_id_to_structure, contigs, ambiguity_alignments = parsed_report
            if not contigs:
                contigs = parse_contigs_fpath(contigs_fpath)
            if aligned_blocks is None:
                return None
            for block in aligned_blocks:
                block.label = label
            aligned_blocks = check_misassembled_blocks(aligned_blocks, misassembled_id_to_structure)
            lists_of_aligned_blocks.append(aligned_blocks)
            structures_by_labels[label] = misassembled_id_to_structure
            if qconfig.ambiguity_usage == 'all':
                ambiguity_alignments_by_labels[label] = ambiguity_alignments
        else:
            contigs = parse_contigs_fpath(contigs_fpath)
        contigs_by_assemblies[label] = contigs

    if contigs_fpaths and ref_fpath and features:
        features_data = parse_features_data(features, cumulative_ref_lengths, chr_names)
    if contigs_fpaths and qconfig.gene_finding:
        parse_genes_data(contigs_by_assemblies, genes_by_labels)

    if reference_chromosomes and lists_of_aligned_blocks:
        assemblies = get_assemblies(contigs_fpaths, virtual_genome_size, lists_of_aligned_blocks, find_similar)
        if qconfig.draw_svg:
            plot_fpath = draw_alignment_plot(assemblies, virtual_genome_size, output_dirpath,
                                             sorted_ref_names, sorted_ref_lengths, virtual_genome_shift)

    icarus_html_fpath = None
    if (assemblies or contigs_by_assemblies) and qconfig.create_icarus_html:
        icarus_html_fpath = js_data_gen(
            assemblies, contigs_fpaths, reference_chromosomes, output_dirpath, structures_by_labels,
            contig_names_by_refs=contig_names_by_refs,
            ref_fpath=ref_fpath,
            stdout_pattern=stdout_pattern,
            ambiguity_alignments_by_labels=ambiguity_alignments_by_labels,
            contigs_by_assemblies=contigs_by_assemblies,
            features_data=features_data,
            cov_fpath=cov_fpath,
            physical_cov_fpath=physical_cov_fpath,
            json_output_dir=json_output_dir)

    return icarus_html_fpath, plot_fpath
def analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath, aligns, ref_features, ref_lens,
                    is_cyclic=None):
    """Classify every contig of ``contigs_fpath`` by its alignments and collect the statistics.

    For each FASTA entry the function looks the contig name up in ``aligns`` and then:

    * reports it as fully unaligned when there is no entry,
    * takes the single best alignment (or handles the group of equally good,
      "ambiguous" alignments according to ``qconfig.ambiguity_usage``) when the
      top alignment covers almost the whole contig,
    * otherwise asks ``get_best_aligns_sets`` for the best set of alignments and
      processes it as a correct, partially unaligned or misassembled contig.

    Human-readable logs go to ``ca_output.stdout_f``, Icarus records to
    ``ca_output.icarus_out_f`` and the kept alignments to
    ``ca_output.coords_filtered_f``.

    :param ca_output: object exposing ``stdout_f``, ``icarus_out_f`` and ``coords_filtered_f`` writable handles
    :param contigs_fpath: FASTA file with the contigs of one assembly
    :param unaligned_fpath: output file, receives one fully unaligned contig name per line
    :param unaligned_info_fpath: output TSV with details about unaligned contigs/parts
    :param aligns: mapping contig name -> list of alignment objects
    :param ref_features: passed through to ``process_misassembled_contig``
    :param ref_lens: passed through to ``get_best_aligns_sets`` / ``process_misassembled_contig``
    :param is_cyclic: passed through to ``get_best_aligns_sets`` / ``process_misassembled_contig``
    :return: tuple ``(result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs,
             misassemblies_in_contigs, contigs_aligned_lengths)`` where ``result`` is a dict of counters
    """
    maxun = 10
    epsilon = 0.99
    # threshold for misassembled contigs with aligned less than $umt * 100% (Unaligned Missassembled Threshold)
    umt = 0.5

    unaligned = 0
    partially_unaligned = 0
    fully_unaligned_bases = 0
    partially_unaligned_bases = 0
    ambiguous_contigs = 0
    ambiguous_contigs_extra_bases = 0
    ambiguous_contigs_len = 0
    half_unaligned_with_misassembly = 0
    misassembly_internal_overlap = 0
    misassemblies_matched_sv = 0

    ref_aligns = dict()
    contigs_aligned_lengths = []
    aligned_lengths = []
    region_misassemblies = []
    misassembled_contigs = dict()
    misassemblies_in_contigs = []

    region_struct_variations = find_all_sv(qconfig.bed)

    istranslocations_by_ref = dict()
    misassemblies_by_ref = defaultdict(list)
    for ref in ref_labels_by_chromosomes.values():
        istranslocations_by_ref[ref] = dict((key, 0) for key in ref_labels_by_chromosomes.values())

    # for counting SNPs and indels (both original (.all_snps) and corrected from Nucmer's local misassemblies)
    total_indels_info = IndelsInfo()

    # Context managers guarantee both report files are closed even if a helper raises mid-loop.
    with open(unaligned_fpath, 'w') as unaligned_file:
        with open(unaligned_info_fpath, 'w') as unaligned_info_file:
            unaligned_info_file.write('\t'.join(['Contig', 'Total_length', 'Unaligned_length',
                                                 'Unaligned_type', 'Unaligned_parts']) + '\n')
            for contig, seq in fastaparser.read_fasta(contigs_fpath):
                # Recording contig stats
                ctg_len = len(seq)
                ca_output.stdout_f.write('CONTIG: %s (%dbp)\n' % (contig, ctg_len))
                contig_type = 'unaligned'
                misassemblies_in_contigs.append(0)
                contigs_aligned_lengths.append(0)

                # Check if this contig aligned to the reference
                if contig in aligns:
                    # (the former per-alignment scan for 'N' positions was removed: its result was never used)
                    contig_type = 'correct'
                    # Pull all aligns for this contig
                    num_aligns = len(aligns[contig])

                    # Sort aligns by aligned_length * identity - unaligned_length (as we do in BSS)
                    sorted_aligns = sorted(aligns[contig], key=lambda x: (score_single_align(x), x.len2),
                                           reverse=True)
                    top_len = sorted_aligns[0].len2
                    top_id = sorted_aligns[0].idy
                    top_score = score_single_align(sorted_aligns[0])
                    top_aligns = []
                    ca_output.stdout_f.write('Best alignment score: %.1f (LEN: %d, IDY: %.2f)\n' %
                                             (top_score, top_len, top_id))

                    # Check that top hit captures most of the contig
                    if top_len > ctg_len * epsilon or ctg_len - top_len < maxun:
                        # Reset top aligns: aligns that share the same value of longest and highest identity
                        top_aligns.append(sorted_aligns[0])
                        sorted_aligns = sorted_aligns[1:]

                        # Continue grabbing alignments while their score is close enough to the top score
                        while sorted_aligns and (score_single_align(sorted_aligns[0]) >=
                                                 qconfig.ambiguity_score * top_score):
                            top_aligns.append(sorted_aligns[0])
                            sorted_aligns = sorted_aligns[1:]

                        # Mark other alignments as insignificant (former ambiguous)
                        if sorted_aligns:
                            ca_output.stdout_f.write(
                                '\t\tSkipping these alignments as insignificant (option --ambiguity-score is set to "%s"):\n' %
                                str(qconfig.ambiguity_score))
                            for align in sorted_aligns:
                                ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')

                        top_align = top_aligns[0]
                        if len(top_aligns) == 1:
                            # There is only one top align, life is good
                            ca_output.stdout_f.write('\t\tOne align captures most of this contig: %s\n' %
                                                     str(top_align))
                            ca_output.icarus_out_f.write(top_align.icarus_report_str() + '\n')
                            ref_aligns.setdefault(top_align.ref, []).append(top_align)
                            ca_output.coords_filtered_f.write(str(top_align) + '\n')
                            aligned_lengths.append(top_align.len2)
                            contigs_aligned_lengths[-1] = top_align.len2
                        else:
                            # There is more than one top align
                            ca_output.stdout_f.write(
                                '\t\tThis contig has %d significant alignments. [An ambiguously mapped contig]\n' %
                                len(top_aligns))

                            # Increment count of ambiguously mapped contigs and bases in them
                            ambiguous_contigs += 1
                            # we count only extra bases, so we shouldn't include bases in the first alignment
                            # if --ambiguity-usage is 'none', the number of extra bases will be negative!
                            ambiguous_contigs_len += ctg_len

                            # Alex: skip all alignments or count them as normal (just different aligns of one
                            # repeat). Depend on --allow-ambiguity option
                            if qconfig.ambiguity_usage == "none":
                                ambiguous_contigs_extra_bases -= top_align.len2
                                ca_output.stdout_f.write(
                                    '\t\tSkipping these alignments (option --ambiguity-usage is set to "none"):\n')
                                for align in top_aligns:
                                    ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                            elif qconfig.ambiguity_usage == "one":
                                ambiguous_contigs_extra_bases += 0
                                ca_output.stdout_f.write(
                                    '\t\tUsing only first of these alignment (option --ambiguity-usage is set to "one"):\n')
                                ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(top_align))
                                ca_output.icarus_out_f.write(top_align.icarus_report_str() + '\n')
                                ref_aligns.setdefault(top_align.ref, []).append(top_align)
                                aligned_lengths.append(top_align.len2)
                                contigs_aligned_lengths[-1] = top_align.len2
                                ca_output.coords_filtered_f.write(str(top_align) + '\n')
                                for align in top_aligns[1:]:
                                    ca_output.stdout_f.write('\t\t\tSkipping alignment ' + str(align) + '\n')
                            elif qconfig.ambiguity_usage == "all":
                                ambiguous_contigs_extra_bases -= top_align.len2
                                ca_output.stdout_f.write(
                                    '\t\tUsing all these alignments (option --ambiguity-usage is set to "all"):\n')
                                contig_type = 'ambiguous'
                                for align_idx, align in enumerate(top_aligns):
                                    ca_output.stdout_f.write('\t\t\tAlignment: %s\n' % str(align))
                                    ca_output.icarus_out_f.write(align.icarus_report_str(ambiguity=True) + '\n')
                                    ref_aligns.setdefault(align.ref, []).append(align)
                                    # we count only extra bases, so only the first alignment
                                    # contributes to the aligned lengths
                                    if align_idx == 0:
                                        aligned_lengths.append(align.len2)
                                        contigs_aligned_lengths[-1] = align.len2
                                    ambiguous_contigs_extra_bases += align.len2
                                    ca_output.coords_filtered_f.write(str(align) + ' ambiguous\n')
                    else:
                        # choose appropriate alignments (to maximize total size of contig alignment and reduce
                        # misassemblies)
                        is_ambiguous, too_much_best_sets, sorted_aligns, best_sets = get_best_aligns_sets(
                            sorted_aligns, ctg_len, ca_output.stdout_f, seq, ref_lens, is_cyclic,
                            region_struct_variations)
                        the_best_set = best_sets[0]
                        used_indexes = list(range(len(sorted_aligns)) if too_much_best_sets
                                            else get_used_indexes(best_sets))
                        if len(used_indexes) < len(sorted_aligns):
                            ca_output.stdout_f.write(
                                '\t\t\tSkipping redundant alignments after choosing the best set of alignments\n')
                            used_lookup = set(used_indexes)
                            for idx in range(len(sorted_aligns)):
                                if idx not in used_lookup:
                                    ca_output.stdout_f.write('\t\tSkipping redundant alignment ' +
                                                             str(sorted_aligns[idx]) + '\n')

                        if is_ambiguous:
                            ca_output.stdout_f.write(
                                '\t\tThis contig has several significant sets of alignments. [An ambiguously mapped contig]\n')
                            # similar to regular ambiguous contigs, see above
                            ambiguous_contigs += 1
                            ambiguous_contigs_len += ctg_len

                            if qconfig.ambiguity_usage == "none":
                                ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                                ca_output.stdout_f.write(
                                    '\t\tSkipping all alignments in these sets (option --ambiguity-usage is set to "none"):\n')
                                for idx in used_indexes:
                                    ca_output.stdout_f.write('\t\t\tSkipping alignment ' +
                                                             str(sorted_aligns[idx]) + '\n')
                                # nothing else is reported for this contig (no CONTIG record either)
                                continue
                            elif qconfig.ambiguity_usage == "one":
                                ambiguous_contigs_extra_bases += 0
                                ca_output.stdout_f.write(
                                    '\t\tUsing only the very best set (option --ambiguity-usage is set to "one").\n')
                                if len(the_best_set.indexes) < len(used_indexes):
                                    ca_output.stdout_f.write('\t\tSo, skipping alignments from other sets:\n')
                                    for idx in used_indexes:
                                        if idx not in the_best_set.indexes:
                                            ca_output.stdout_f.write('\t\t\tSkipping alignment ' +
                                                                     str(sorted_aligns[idx]) + '\n')
                            elif qconfig.ambiguity_usage == "all":
                                ca_output.stdout_f.write(
                                    '\t\tUsing all alignments in these sets (option --ambiguity-usage is set to "all"):\n')
                                ca_output.stdout_f.write(
                                    '\t\t\tThe very best set is shown in details below, the rest are:\n')
                                for idx, cur_set in enumerate(best_sets[1:]):
                                    ca_output.stdout_f.write(
                                        '\t\t\t\tGroup #%d. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' %
                                        (idx + 2, cur_set.score, len(cur_set.indexes), cur_set.uncovered))
                                if too_much_best_sets:
                                    ca_output.stdout_f.write('\t\t\t\tetc...\n')
                                if len(the_best_set.indexes) < len(used_indexes):
                                    ambiguous_contigs_extra_bases -= (ctg_len - the_best_set.uncovered)
                                    ca_output.stdout_f.write('\t\t\tList of alignments used in the sets above:\n')
                                    for idx in used_indexes:
                                        align = sorted_aligns[idx]
                                        ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align))
                                        ref_aligns.setdefault(align.ref, []).append(align)
                                        ambiguous_contigs_extra_bases += align.len2
                                        ca_output.coords_filtered_f.write(str(align) + " ambiguous\n")
                                        if idx not in the_best_set.indexes:
                                            ca_output.icarus_out_f.write(
                                                align.icarus_report_str(is_best=False) + '\n')

                        ca_output.stdout_f.write(
                            '\t\t\tThe best set is below. Score: %.1f, number of alignments: %d, unaligned bases: %d\n' %
                            (the_best_set.score, len(the_best_set.indexes), the_best_set.uncovered))
                        real_aligns = [sorted_aligns[i] for i in the_best_set.indexes]

                        # main processing part
                        if len(real_aligns) == 1:
                            the_only_align = real_aligns[0]

                            # There is only one alignment of this contig to the reference
                            ca_output.coords_filtered_f.write(str(the_only_align) + '\n')
                            aligned_lengths.append(the_only_align.len2)
                            contigs_aligned_lengths[-1] = the_only_align.len2

                            begin, end = the_only_align.start(), the_only_align.end()
                            unaligned_bases = (begin - 1) + (ctg_len - end)
                            aligned_bases_in_contig = ctg_len - unaligned_bases
                            is_partially_unaligned = check_partially_unaligned(real_aligns, ctg_len)
                            if is_partially_unaligned:
                                partially_unaligned += 1
                                partially_unaligned_bases += unaligned_bases
                                if aligned_bases_in_contig < umt * ctg_len:
                                    contig_type = 'correct_unaligned'
                                ca_output.stdout_f.write(
                                    '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n' %
                                    (aligned_bases_in_contig, ctg_len))
                                save_unaligned_info(real_aligns, contig, ctg_len, unaligned_bases,
                                                    unaligned_info_file)
                            ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(the_only_align))
                            ca_output.icarus_out_f.write(the_only_align.icarus_report_str() + '\n')
                            if is_partially_unaligned:
                                if begin - 1:
                                    ca_output.stdout_f.write('\t\tUnaligned bases: 1 to %d (%d)\n' %
                                                             (begin - 1, begin - 1))
                                if ctg_len - end:
                                    ca_output.stdout_f.write('\t\tUnaligned bases: %d to %d (%d)\n' %
                                                             (end + 1, ctg_len, ctg_len - end))
                                if qconfig.is_combined_ref and aligned_bases_in_contig >= umt * ctg_len:
                                    check_for_potential_translocation(seq, ctg_len, real_aligns,
                                                                      region_misassemblies, misassemblies_by_ref,
                                                                      ca_output.stdout_f)
                            ref_aligns.setdefault(the_only_align.ref, []).append(the_only_align)
                        else:
                            # Sort real alignments by position on the contig
                            sorted_aligns = sorted(real_aligns, key=lambda x: (x.end(), x.start()))

                            # There is more than one alignment of this contig to the reference
                            ca_output.stdout_f.write('\t\tThis contig is misassembled. %d total aligns.\n' %
                                                     num_aligns)
                            unaligned_bases = the_best_set.uncovered
                            aligned_bases_in_contig = ctg_len - unaligned_bases
                            is_partially_unaligned = check_partially_unaligned(sorted_aligns, ctg_len)
                            if is_partially_unaligned:
                                partially_unaligned += 1
                                partially_unaligned_bases += unaligned_bases
                                if aligned_bases_in_contig >= umt * ctg_len:
                                    ca_output.stdout_f.write(
                                        '\t\tThis contig is partially unaligned. (Aligned %d out of %d bases)\n' %
                                        (aligned_bases_in_contig, ctg_len))
                                save_unaligned_info(sorted_aligns, contig, ctg_len, unaligned_bases,
                                                    unaligned_info_file)

                            if aligned_bases_in_contig < umt * ctg_len:
                                ca_output.stdout_f.write(
                                    '\t\t\tWarning! This contig is more unaligned than misassembled. ' +
                                    'Contig length is %d and total length of all aligns is %d\n' %
                                    (ctg_len, aligned_bases_in_contig))
                                contigs_aligned_lengths[-1] = sum(align.len2 for align in sorted_aligns)
                                for align in sorted_aligns:
                                    ca_output.stdout_f.write('\t\tAlignment: %s\n' % str(align))
                                    ca_output.icarus_out_f.write(align.icarus_report_str() + '\n')
                                    ca_output.icarus_out_f.write('unknown\n')
                                    ca_output.coords_filtered_f.write(str(align) + '\n')
                                    aligned_lengths.append(align.len2)
                                    ref_aligns.setdefault(align.ref, []).append(align)

                                half_unaligned_with_misassembly += 1
                                ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases)
                                contig_type = 'mis_unaligned'
                                ca_output.icarus_out_f.write(
                                    '\t'.join(['CONTIG', contig, str(ctg_len), contig_type + '\n']))
                                ca_output.stdout_f.write('\n')
                                continue

                            ### processing misassemblies
                            is_misassembled, current_mio, indels_info, misassemblies_matched_sv, \
                                cnt_misassemblies, contig_aligned_length = \
                                process_misassembled_contig(sorted_aligns, is_cyclic, aligned_lengths,
                                                            region_misassemblies, ref_lens, ref_aligns,
                                                            ref_features, seq, misassemblies_by_ref,
                                                            istranslocations_by_ref, region_struct_variations,
                                                            misassemblies_matched_sv, ca_output)
                            contigs_aligned_lengths[-1] = contig_aligned_length
                            misassembly_internal_overlap += current_mio
                            total_indels_info += indels_info
                            if is_misassembled:
                                misassembled_contigs[contig] = ctg_len
                                contig_type = 'misassembled'
                                misassemblies_in_contigs[-1] = cnt_misassemblies
                            if is_partially_unaligned:
                                ca_output.stdout_f.write('\t\tUnaligned bases: %d\n' % unaligned_bases)
                                if qconfig.is_combined_ref:
                                    check_for_potential_translocation(seq, ctg_len, sorted_aligns,
                                                                      region_misassemblies, misassemblies_by_ref,
                                                                      ca_output.stdout_f)
                else:
                    # No aligns to this contig
                    ca_output.stdout_f.write('\t\tThis contig is unaligned. (%d bp)\n' % ctg_len)
                    # one contig name per line; without the separator all names would run together
                    unaligned_file.write(contig + '\n')

                    # Increment unaligned contig count and bases
                    unaligned += 1
                    fully_unaligned_bases += ctg_len
                    ca_output.stdout_f.write('\t\tUnaligned bases: %d total: %d\n' %
                                             (ctg_len, fully_unaligned_bases))
                    save_unaligned_info([], contig, ctg_len, ctg_len, unaligned_info_file)

                ca_output.icarus_out_f.write('\t'.join(['CONTIG', contig, str(ctg_len), contig_type]) + '\n')
                ca_output.stdout_f.write('\n')

    misassembled_bases = sum(misassembled_contigs.values())

    result = {
        'region_misassemblies': region_misassemblies,
        'region_struct_variations': region_struct_variations.get_count() if region_struct_variations else None,
        'misassemblies_matched_sv': misassemblies_matched_sv,
        'misassembled_contigs': misassembled_contigs,
        'misassembled_bases': misassembled_bases,
        'misassembly_internal_overlap': misassembly_internal_overlap,
        'unaligned': unaligned,
        'partially_unaligned': partially_unaligned,
        'partially_unaligned_bases': partially_unaligned_bases,
        'fully_unaligned_bases': fully_unaligned_bases,
        'ambiguous_contigs': ambiguous_contigs,
        'ambiguous_contigs_extra_bases': ambiguous_contigs_extra_bases,
        'ambiguous_contigs_len': ambiguous_contigs_len,
        'half_unaligned_with_misassembly': half_unaligned_with_misassembly,
        'misassemblies_by_ref': misassemblies_by_ref,
        'istranslocations_by_refs': istranslocations_by_ref,
    }

    return result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, contigs_aligned_lengths
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    Reads the assembly's coords file (``<label>.coords`` or ``<label>.coords.filtered``
    depending on ``qconfig.use_all_alignments``), marks every aligned reference
    position, optionally writes the gaps report and, for every container with a
    non-empty ``region_list``, writes a ``*_genomic_features_<kind>.txt`` report.

    :param contigs_fpath: FASTA file of the assembly
    :param index: assembly index, only used for log prefixes
    :param coords_dirpath: directory holding the coords files
    :param genome_stats_dirpath: directory for the gaps / genomic features reports
    :param reference_chromosomes: mapping chromosome name -> length
    :param ns_by_chromosomes: mapping chromosome name -> collection of positions excluded from coverage
    :param containers: feature containers exposing ``region_list`` and ``kind``
    :return: ``(ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
             unsorted_operons_in_contigs, operons_in_contigs))``; ``(None, None)`` if the coords
             file is missing or mentions a chromosome unknown to the reference
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    results = dict()
    ref_lengths = defaultdict(int)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None, None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # genome_mapping[chr][i] == 1 means 1-based reference position i is covered by some alignment
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    # for cumulative plots: i-th element is the number of genes in i-th contig
    features_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    with open(coords_fpath) as coordfile:
        for line in coordfile:
            # split the line once instead of re-splitting it for every field
            columns = line.split('|')
            ref_span = columns[0].split()
            ctg_span = columns[1].split()
            s1, e1 = int(ref_span[0]), int(ref_span[1])
            s2, e2 = int(ctg_span[0]), int(ctg_span[1])
            tokens = line.split()
            contig_name = tokens[12]
            chr_name = tokens[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                # same shape as the other early exit, so callers can always unpack two values
                return None, None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1, contig=contig_name,
                                 start_in_contig=s2, end_in_contig=e2))
            for i in range(s1, e1 + 1):
                genome_mapping[chr_name][i] = 1

    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        if qconfig.space_efficient:
            # os.devnull is the portable spelling of the null device
            gaps_fpath = os.devnull
        else:
            gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt')
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # a gap may run up to the very end of the chromosome
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   corr_assembly_label + '_genomic_features_' + container.kind.lower() + '.txt')
        # the context manager closes the report even if formatting a region raises
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
            found_file.write('=' * 50 + '\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                gene_blocks = []
                if region.id is None:
                    region.id = '# ' + str(region.number + 1)
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if cur_block.seqname != region.seqname:
                            continue
                        if region.end <= cur_block.start or cur_block.end <= region.start:
                            continue
                        elif cur_block.start <= region.start and region.end <= cur_block.end:
                            if found_list[i] == 2:  # already found as partial gene
                                total_partial -= 1
                            found_list[i] = 1
                            total_full += 1
                            contig_info = cur_block.format_gene_info(region)
                            found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' %
                                             (region.id, region.start, region.end, contig_info))
                            # inc number of found genes/operons in id-th contig
                            if container.kind == 'operon':
                                operons_in_contigs[contig_id] += 1
                            else:
                                features_in_contigs[contig_id] += 1
                            cur_feature_is_found = True
                            break
                        elif min(region.end, cur_block.end) - max(region.start, cur_block.start) >= \
                                qconfig.min_gene_overlap:
                            if found_list[i] == 0:
                                found_list[i] = 2
                                total_partial += 1
                            gene_blocks.append(cur_block)
                    # a complete hit ends the search over the remaining contigs as well
                    if cur_feature_is_found:
                        break

                # adding info about partially found genes/operons
                if found_list[i] == 2:  # partial gene/operon
                    contig_info = ','.join([block.format_gene_info(region)
                                            for block in sorted(gene_blocks, key=lambda block: block.start)])
                    found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' %
                                     (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
                         unsorted_operons_in_contigs, operons_in_contigs)
# Number of reference bases kept on each side of the requested [pos1, pos2] window.
REF_MARGINS = 300
# The extracted reference window is written to this file in the current directory.
REF_FNAME = "ref.fa"

if len(sys.argv) != 4:
    # sys.stdout.write prints the same text under Python 2 and Python 3
    # (the former `print` statement is a syntax error on Python 3)
    sys.stdout.write("Usage: %s reference pos1 pos2\n" % sys.argv[0])
    sys.exit(0)

pos1 = int(sys.argv[2])
pos2 = int(sys.argv[3])
if pos1 > pos2:
    # make sure pos1 <= pos2
    pos1, pos2 = pos2, pos1

# only the sequence of the first FASTA entry is used
reference = fastaparser.read_fasta(sys.argv[1])[0][1]  # Returns list of FASTA entries (in tuples: name, seq)
if len(reference) < pos2:
    pos2 = len(reference)

with open(REF_FNAME, 'w') as ref_file:
    ref_file.write(">reference\n")
    ref_file.write(reference[max(0, pos1 - 1 - REF_MARGINS): min(len(reference), pos2 + REF_MARGINS)] + "\n")

# positions are 1-based and inclusive, hence the `pos1 - 1` slice start
misassembled_site = reference[pos1 - 1: pos2]

# every KMER_SIZE-long substring that lies entirely inside [pos1, pos2]
kmers = set()
i = pos1 - 1
while i + KMER_SIZE <= pos2:
    kmers.add(reference[i: i + KMER_SIZE])
    i += 1
def correct_fasta(original_fpath, corrected_fpath, min_contig, is_reference=False):
    """Write a cleaned copy of ``original_fpath`` to ``corrected_fpath``.

    Entries shorter than ``min_contig`` are dropped unless ``is_reference`` is set.
    Names go through ``correct_name`` / ``get_uniq_name``; sequences go through
    ``correct_seq`` unless ``qconfig.no_check`` is on. For a reference longer than
    ``qconfig.MAX_REFERENCE_FILE_LENGTH`` the chromosomes are additionally spread
    over ``split_ref/part_<N>`` files whose paths are stored in ``qconfig.splitted_ref``.

    :return: ``True`` on success; ``False`` if an entry has an empty name, a sequence
             is rejected by ``correct_seq``, nothing is left to write, or every
             chromosome of a split reference exceeds ``qconfig.MAX_REFERENCE_LENGTH``
    """
    entries = []
    name_counts = defaultdict(int)

    for header, sequence in fastaparser.read_fasta(original_fpath):
        if not header:
            logger.warning('Skipping ' + original_fpath + ' because >sequence_name field is empty.',
                           indent=' ')
            return False
        if len(sequence) < min_contig and not is_reference:
            continue

        base_name = correct_name(header)
        unique_name = get_uniq_name(base_name, name_counts)
        name_counts[base_name] += 1

        if qconfig.no_check:
            checked_seq = sequence
        else:
            # seq to uppercase, because we later looking only uppercase letters
            checked_seq = correct_seq(sequence, original_fpath)
            if not checked_seq:
                return False
        entries.append((unique_name, checked_seq))

    if not entries:
        logger.warning('Skipping ' + original_fpath + ' because file is empty.', indent=' ')
        return False

    fastaparser.write_fasta(corrected_fpath, entries)

    if not is_reference:
        return True

    ref_len = sum(len(chr_seq) for (chr_name, chr_seq) in entries)
    if ref_len <= qconfig.MAX_REFERENCE_FILE_LENGTH:
        return True

    # The reference is too large for a single file: split it into parts.
    qconfig.splitted_ref = []  # important for MetaQUAST which runs QUAST multiple times
    fasta_ext = os.path.splitext(corrected_fpath)[1]
    split_ref_dirpath = os.path.join(os.path.dirname(corrected_fpath), 'split_ref')
    if os.path.exists(split_ref_dirpath):
        shutil.rmtree(split_ref_dirpath, ignore_errors=True)
    os.makedirs(split_ref_dirpath)

    def _part_fpath(part_num):
        # path of the part_num-th piece of the split reference
        return os.path.join(split_ref_dirpath, "part_%d" % part_num) + fasta_ext

    max_len = min(ref_len/qconfig.max_threads, qconfig.MAX_REFERENCE_LENGTH)
    part_len = 0
    part_num = 1
    part_fpath = _part_fpath(part_num)

    for chr_name, chr_seq in entries:
        chr_len = len(chr_seq)
        if chr_len > qconfig.MAX_REFERENCE_LENGTH:
            logger.warning("Skipping chromosome " + chr_name + " because its length is greater than " +
                           str(qconfig.MAX_REFERENCE_LENGTH) + " (Nucmer's constraint).")
            continue

        part_len += chr_len
        # start a new part once the current one would overflow, unless this
        # chromosome is the only thing in the part
        if part_len > max_len and part_len != chr_len:
            qconfig.splitted_ref.append(part_fpath)
            part_len = chr_len
            part_num += 1
            part_fpath = _part_fpath(part_num)
        fastaparser.write_fasta(part_fpath, [(chr_name, chr_seq)], mode='a')

    if part_len > 0:
        qconfig.splitted_ref.append(part_fpath)
    if not qconfig.splitted_ref:
        logger.warning("Skipping reference because all of its chromosomes exceeded Nucmer's constraint.")
        return False
    return True
def process_single_file(contigs_fpath, index, coords_dirpath, genome_stats_dirpath,
                        reference_chromosomes, ns_by_chromosomes, containers):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    Reads the assembly's coords file (``<label>.coords`` or ``<label>.coords.filtered``
    depending on ``qconfig.use_all_alignments``), marks every aligned reference
    position (including alignments flagged as wrapping around a circular
    chromosome), optionally writes the gaps report and, for every container with
    a non-empty ``region_list``, writes a ``*_genomic_features_<kind>.txt`` report.

    :param contigs_fpath: FASTA file of the assembly
    :param index: assembly index, only used for log prefixes
    :param coords_dirpath: directory holding the coords files
    :param genome_stats_dirpath: directory for the gaps / genomic features reports
    :param reference_chromosomes: mapping chromosome name -> length
    :param ns_by_chromosomes: mapping chromosome name -> collection of positions excluded from coverage
    :param containers: feature containers exposing ``region_list`` and ``kind``
    :return: ``(ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
             unsorted_operons_in_contigs, operons_in_contigs))``, or a bare ``None`` if the coords
             file is missing or mentions a chromosome unknown to the reference
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    results = dict()
    ref_lengths = defaultdict(int)
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    coords_base_fpath = os.path.join(coords_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        coords_fpath = coords_base_fpath
    else:
        coords_fpath = coords_base_fpath + '.filtered'

    if not os.path.isfile(coords_fpath):
        logger.error('File with alignment coords (' + coords_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        # NOTE(review): success returns a 2-tuple; callers that unpack must handle this bare None
        return None

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # genome_mapping[chr][i] == 1 means 1-based reference position i is covered by some alignment
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # FASTA entries (in tuples: name, seq)
    sorted_contig_tuples = sorted(enumerate(contig_tuples), key=lambda x: len(x[1][1]), reverse=True)
    sorted_contigs_names = []
    contigs_order = []
    for idx, (name, _) in sorted_contig_tuples:
        sorted_contigs_names.append(name)
        contigs_order.append(idx)

    # for cumulative plots: i-th element is the number of genes in i-th contig
    features_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(containers)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    with open(coords_fpath) as coordfile:
        for line in coordfile:
            # split the line once instead of re-splitting it for every field
            columns = line.split('|')
            ref_span = columns[0].split()
            ctg_span = columns[1].split()
            s1, e1 = int(ref_span[0]), int(ref_span[1])
            s2, e2 = int(ctg_span[0]), int(ctg_span[1])
            tokens = line.split()
            contig_name = tokens[12]
            chr_name = tokens[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + coords_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1, contig=contig_name,
                                 start_in_contig=s2, end_in_contig=e2))
            if s2 == 0 and e2 == 0:
                # special case: circular genome, contig starts on the end of a chromosome
                # and ends in the beginning
                for i in range(s1, len(genome_mapping[chr_name])):
                    genome_mapping[chr_name][i] = 1
                for i in range(1, e1 + 1):
                    genome_mapping[chr_name][i] = 1
            else:
                for i in range(s1, e1 + 1):
                    genome_mapping[chr_name][i] = 1

    for chr_name in genome_mapping.keys():
        for i in ns_by_chromosomes[chr_name]:
            genome_mapping[chr_name][i] = 0
        ref_lengths[chr_name] = sum(genome_mapping[chr_name])

    if qconfig.space_efficient and coords_fpath.endswith('.filtered'):
        os.remove(coords_fpath)

    # counting genome coverage and gaps number
    gaps_count = 0
    if qconfig.analyze_gaps:
        if qconfig.space_efficient:
            # os.devnull is the portable spelling of the null device
            gaps_fpath = os.devnull
        else:
            gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt')
        with open(gaps_fpath, 'w') as gaps_file:
            for chr_name, chr_len in reference_chromosomes.items():
                gaps_file.write(chr_name + '\n')
                cur_gap_size = 0
                for i in range(1, chr_len + 1):
                    if genome_mapping[chr_name][i] == 1 or i in ns_by_chromosomes[chr_name]:
                        if cur_gap_size >= qconfig.min_gap_size:
                            gaps_count += 1
                            gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                        cur_gap_size = 0
                    else:
                        cur_gap_size += 1
                # a gap may run up to the very end of the chromosome
                if cur_gap_size >= qconfig.min_gap_size:
                    gaps_count += 1
                    gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["gaps_count"] = gaps_count
    results[reporting.Fields.GENES + "_full"] = None
    results[reporting.Fields.GENES + "_partial"] = None
    results[reporting.Fields.OPERONS + "_full"] = None
    results[reporting.Fields.OPERONS + "_partial"] = None

    # finding genes and operons
    for container in containers:
        if not container.region_list:
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   corr_assembly_label + '_genomic_features_' + container.kind.lower() + '.txt')
        # the context manager closes the report even if processing a region raises
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type', 'Contig'))
            found_file.write('=' * 50 + '\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                gene_blocks = []
                if region.id is None:
                    region.id = '# ' + str(region.number + 1)
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if cur_block.seqname != region.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            # NOTE(review): blocks above are built with `contig=`; confirm that
                            # AlignedBlock really exposes a `contig_name` attribute
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start,
                                                   end=region.end + 1, contig=cur_block.contig_name,
                                                   start_in_contig=cur_block.start_in_contig),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end,
                                                   contig=cur_block.contig_name,
                                                   end_in_contig=cur_block.end_in_contig)]
                            if cur_block.start_in_contig < cur_block.end_in_contig:
                                blocks[0].end_in_contig = blocks[0].start_in_contig + \
                                    (blocks[0].end - blocks[0].start)
                                blocks[1].start_in_contig = blocks[0].end_in_contig + 1
                            else:
                                blocks[0].end_in_contig = blocks[0].start_in_contig - \
                                    (blocks[1].end - blocks[1].start)
                                blocks[1].start_in_contig = blocks[0].end_in_contig - 1
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                contig_info = block.format_gene_info(region)
                                found_file.write('%s\t\t%d\t%d\tcomplete\t%s\n' %
                                                 (region.id, region.start, region.end, contig_info))
                                # inc number of found genes/operons in id-th contig
                                if container.kind == 'operon':
                                    operons_in_contigs[contig_id] += 1
                                else:
                                    features_in_contigs[contig_id] += 1
                                cur_feature_is_found = True
                                break
                            elif min(region.end, block.end) - max(region.start, block.start) >= \
                                    qconfig.min_gene_overlap:
                                if found_list[i] == 0:
                                    found_list[i] = 2
                                    total_partial += 1
                                gene_blocks.append(block)
                        # a complete hit ends the search over remaining blocks ...
                        if cur_feature_is_found:
                            break
                    # ... and over remaining contigs
                    if cur_feature_is_found:
                        break

                # adding info about partially found genes/operons
                if found_list[i] == 2:  # partial gene/operon
                    contig_info = ','.join([block.format_gene_info(region)
                                            for block in sorted(gene_blocks, key=lambda block: block.start)])
                    found_file.write('%s\t\t%d\t%d\tpartial\t%s\n' %
                                     (region.id, region.start, region.end, contig_info))

        if container.kind == 'operon':
            results[reporting.Fields.OPERONS + "_full"] = total_full
            results[reporting.Fields.OPERONS + "_partial"] = total_partial
        else:
            if results[reporting.Fields.GENES + "_full"] is None:
                results[reporting.Fields.GENES + "_full"] = 0
                results[reporting.Fields.GENES + "_partial"] = 0
            results[reporting.Fields.GENES + "_full"] += total_full
            results[reporting.Fields.GENES + "_partial"] += total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    unsorted_features_in_contigs = [features_in_contigs[idx] for idx in contigs_order]
    unsorted_operons_in_contigs = [operons_in_contigs[idx] for idx in contigs_order]

    return ref_lengths, (results, unsorted_features_in_contigs, features_in_contigs,
                         unsorted_operons_in_contigs, operons_in_contigs)
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of all references and append them to the combined reference.

    Every sequence of every reference file is renamed to
    ``<name of corrected file>_<unique sequence name>`` and appended to a
    per-reference corrected FASTA file inside ``corrected_dirpath``.  Each
    reference that was processed successfully is then appended to the
    combined reference file (``qconfig.combined_ref_name``).

    :param ref_fpaths: list of reference FASTA paths.  Paths of skipped
        references are removed from this list in place.
    :param corrected_dirpath: directory that receives the corrected files.
    :param downloaded_refs: when true, a reference that yields no usable
        sequence is skipped with a warning; otherwise it is reported through
        ``logger.error(..., exit_with_code=1)``.
    :return: ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)``; ``chromosomes_by_refs`` maps a reference name to a list
        of ``(corrected sequence name, sequence length)`` tuples.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Handles a single sequence; returns (None, None) if the sequence check fails.
        if total_references > 1:
            # Not the first sequence of this reference: keep appending to its file.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        if not qconfig.no_check:
            # NOTE(review): the result of correct_seq() is only used as a validity
            # flag here; the unmodified `seq` is what gets written below -- confirm
            # that this is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
            qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A corrected path is registered only when the first sequence of this
            # reference is processed.  For a file without any record nothing was
            # appended, so popping would drop the previous reference's path (or
            # raise IndexError on an empty list).
            if total_references:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)

    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if chromosomes_by_refs:
        logger.main_info(' All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def correct_meta_references(ref_fpaths, corrected_dirpath, downloaded_refs=False):
    """Write corrected copies of all references and append them to the combined reference.

    Every sequence of every reference file is renamed to
    ``<name of corrected file>_<unique sequence name>`` and appended to a
    per-reference corrected FASTA file inside ``corrected_dirpath``.  Each
    reference that was processed successfully is then appended to the
    combined reference file (``qconfig.combined_ref_name``).

    :param ref_fpaths: list of reference FASTA paths.  Paths of skipped
        references are removed from this list in place.
    :param corrected_dirpath: directory that receives the corrected files.
    :param downloaded_refs: when true, a reference that yields no usable
        sequence is skipped with a warning; otherwise it is reported through
        ``logger.error(..., exit_with_code=1)``.
    :return: ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
        ref_fpaths)``; ``chromosomes_by_refs`` maps a reference name to a list
        of ``(corrected sequence name, sequence length)`` tuples.
    """
    corrected_ref_fpaths = []
    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)
    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Handles a single sequence; returns (None, None) if the sequence check fails.
        if total_references > 1:
            # Not the first sequence of this reference: keep appending to its file.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(
                os.path.join(corrected_dirpath, ref_name + ref_fasta_ext))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name

        if not qconfig.no_check:
            # NOTE(review): the result of correct_seq() is only used as a validity
            # flag here; the unmodified `seq` is what gets written below -- confirm
            # that this is intended.
            if not correct_seq(seq, ref_fpath):
                return None, None

        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = \
            qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    excluded_ref_fpaths = []
    ref_names = qutils.process_labels(ref_fpaths)
    for ref_fpath, ref_name in zip(ref_fpaths, ref_names):
        total_references = 0
        _, ref_fasta_ext = qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                break

        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))
            fastaparser.write_fasta(combined_ref_fpath, fastaparser.read_fasta(corr_seq_fpath), 'a')
        elif downloaded_refs:
            logger.warning('Skipping ' + ref_fpath + ' because it'
                           ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!')
            # cleaning
            for corr_seq_name, _ in chromosomes_by_refs[ref_name]:
                del contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name]
            del chromosomes_by_refs[ref_name]
            # A corrected path is registered only when the first sequence of this
            # reference is processed.  For a file without any record nothing was
            # appended, so popping would drop the previous reference's path (or
            # raise IndexError on an empty list).
            if total_references:
                corrected_ref_fpaths.pop()
            excluded_ref_fpaths.append(ref_fpath)
        else:
            logger.error('Reference file ' + ref_fpath +
                         ' is empty or contains incorrect sequences (header-only or with non-ACGTN characters)!',
                         exit_with_code=1)

    for excluded in excluded_ref_fpaths:
        ref_fpaths.remove(excluded)

    if chromosomes_by_refs:
        logger.main_info(' All references were combined in ' + qconfig.combined_ref_name)
    else:
        logger.warning('All references were skipped!')

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      old_contigs_fpath, bed_fpath, parallel_by_chr=False, threads=1):
    """Align one assembly to the reference with nucmer and analyze the alignments.

    :param is_cyclic: forwarded to ``analyze_contigs``.
    :param index: assembly index, used for log prefixes and forwarded to ``align_contigs``.
    :param contigs_fpath: path to the assembly FASTA file.
    :param output_dirpath: directory for the reports; the nucmer output directory
        is created inside it by ``create_nucmer_output_dir``.
    :param ref_fpath: path to the reference FASTA file.
    :param old_contigs_fpath: forwarded to ``align_contigs``.
    :param bed_fpath: not used by this function.
    :param parallel_by_chr: forwarded to ``align_contigs``.
    :param threads: forwarded to ``align_contigs``.
    :return: ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``.  When the aligner does not report
        ``NucmerStatus.OK`` the tuple is ``(status, {}, [], [], [])``.
    """
    nucmer_output_dirpath = create_nucmer_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    nucmer_fpath = join(nucmer_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_basename = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_basename + '.stdout')
        log_err_fpath = join(output_dirpath, report_basename + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_basename + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_basename + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, show_snps_fpath, used_snps_fpath = \
        get_nucmer_aux_out_fpaths(nucmer_fpath)

    nucmer_status = align_contigs(nucmer_fpath, ref_fpath, contigs_fpath, old_contigs_fpath, index,
                                  parallel_by_chr, threads, log_out_fpath, log_err_fpath)
    if nucmer_status != NucmerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if nucmer_status == NucmerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif nucmer_status == NucmerStatus.FAILED:
                # The separating space before "doesn't" was missing, gluing the
                # message to the file path.
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath +
                                ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif nucmer_status == NucmerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        # Nothing else is written on this path: release the handles opened above
        # instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        clean_tmp_files(nucmer_fpath)
        return nucmer_status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        # coords_filtered_file stays open: it is handed over to CAOutput below.
        coords_filtered_file = open(coords_filtered_fpath, 'w')
        # The first two lines of the coords file are copied over verbatim.
        coords_filtered_file.write(coords_file.readline())
        coords_filtered_file.write(coords_file.readline())
        for line in coords_file:
            if line.strip() == '':
                break
            assert line[0] != '='
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    references = {}
    ref_features = {}
    for name, seq in fastaparser.read_fasta(ref_fpath):
        name = name.split()[0]  # no spaces in reference header
        references[name] = seq
        log_out_f.write('\tLoaded [%s]\n' % name)

    # Loading the SNP calls
    if qconfig.show_snps:
        log_out_f.write('Loading SNPs...\n')

    used_snps_file = None
    snps = {}
    if qconfig.show_snps:
        prev_fields = None
        for line in open_gzipsafe(show_snps_fpath):
            fields = line.split()
            # Skip blank lines (previously an IndexError) and non-data lines.
            if not fields or not fields[0].isdigit():
                continue
            if prev_fields and fields == prev_fields:
                continue
            ref = fields[10]
            ctg = fields[11]
            pos = int(fields[0])
            loc = int(fields[3])

            # snps[ref][ctg][pos] -> list of SNP records at that reference position
            ctg_snps = snps.setdefault(ref, {}).setdefault(ctg, {})
            ctg_snps.setdefault(pos, []).append(
                SNP(ref_pos=pos, ctg_pos=loc, ref_nucl=fields[1], ctg_nucl=fields[2]))
            prev_fields = fields
        used_snps_file = open_gzipsafe(used_snps_fpath, 'w')

    # Loading the regions (if any)
    # TODO: gff -- no region file is read here, one region spans each whole reference sequence
    regions = {}
    ref_lens = {}
    total_reg_len = 0
    total_regions = 0
    for name, seq in references.items():
        regions.setdefault(name, []).append([1, len(seq)])
        ref_lens[name] = len(seq)
        total_regions += 1
        total_reg_len += ref_lens[name]
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=coords_filtered_file,
                         used_snps_f=used_snps_file, icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, ref_lens, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    result.update(analyze_coverage(ca_output, regions, ref_aligns, ref_features, snps, total_indels_info))
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once instead of once per contig.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    for contig in set(align.contig for align in chr_aligns):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # A contig is considered only for the first chromosome it shows up in.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        name_match = len_cov_pattern.search(contig)
                        if name_match:
                            contig_len, contig_cov = name_match.group(1), name_match.group(2)
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    clean_tmp_files(nucmer_fpath)
    if not qconfig.no_gzip:
        compress_nucmer_output(logger, nucmer_fpath)
    if not ref_aligns:
        return NucmerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return NucmerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique k-mer based analysis (KMC) for all assemblies.

    Assemblies for which ``check_kmc_successful_check`` succeeds are filled
    from their saved ``<label>.stat`` files.  For the remaining ones the
    k-mer completeness is computed, and, unless the reference has more than
    ``MAX_REF_CONTIGS_NUM`` sequences, the k-mer based correctness and
    misjoin statistics as well.  Results are stored in the per-assembly
    reports and written out with ``create_kmc_stats_file``.

    :param output_dir: directory for ``kmc.log``, ``kmc.err``, the stat files
        and the temporary ``tmp`` sub-directory.
    :param ref_fpath: path to the reference FASTA file.
    :param contigs_fpaths: paths to the assembly FASTA files.
    :param logger: logger used for all messages.
    :return: always ``None``.
    """
    global kmc_bin_fpath
    global kmc_tools_fpath

    def _stat_value(stat_line):
        # A stat line looks like "<description>: <value>"; keep the value part.
        return stat_line.strip().split(': ')[-1]

    logger.print_timestamp()
    kmer_len = qconfig.unique_kmer_len
    logger.main_info('Running analysis based on unique ' + str(kmer_len) + '-mers...')

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_kmc_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            kmc_stats_fpath = join(output_dir, label + '.stat')
            with open(kmc_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            # split() always returns at least one item, so a length test cannot
            # detect an empty file; look at the first line itself.
            if not stats_content[0].strip():
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(_stat_value(stats_content[0])))
            if len(stats_content) >= 7:
                corr_len = int(_stat_value(stats_content[1]))
                mis_len = int(_stat_value(stats_content[2]))
                undef_len = int(_stat_value(stats_content[3]))
                total_len = int(_stat_value(stats_content[4]))
                translocations = int(_stat_value(stats_content[5]))
                relocations = int(_stat_value(stats_content[6]))
                report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len))
                report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
                report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
                report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations)
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        save_kmers(output_dir)
        logger.info('Done.')
        return

    if qconfig.platform_name == 'linux_32':
        logger.warning(' Sorry, can\'t run KMC on this platform, skipping...')
        return None

    kmc_dirpath = get_dir_for_download(kmc_dirname, 'KMC', ['kmc', 'kmc_tools'], logger)
    kmc_bin_fpath = download_external_tool('kmc', kmc_dirpath, 'KMC',
                                           platform_specific=True, is_executable=True)
    kmc_tools_fpath = download_external_tool('kmc_tools', kmc_dirpath, 'KMC',
                                             platform_specific=True, is_executable=True)
    if not exists(kmc_bin_fpath) or not exists(kmc_tools_fpath) or not compile_minimap(logger):
        logger.warning(' Sorry, can\'t run KMC, skipping...')
        return None

    logger.info(' Running KMC on reference...')
    if not isdir(output_dir):
        os.makedirs(output_dir)
    log_fpath = join(output_dir, 'kmc.log')
    err_fpath = join(output_dir, 'kmc.err')
    # Create/truncate both log files.
    open(log_fpath, 'w').close()
    open(err_fpath, 'w').close()

    tmp_dirpath = join(output_dir, 'tmp')
    if not isdir(tmp_dirpath):
        os.makedirs(tmp_dirpath)
    ref_kmc_out_fpath = count_kmers(tmp_dirpath, ref_fpath, kmer_len, log_fpath, err_fpath)
    unique_kmers = get_kmers_cnt(tmp_dirpath, ref_kmc_out_fpath, log_fpath, err_fpath)
    if not unique_kmers:
        logger.warning('KMC failed, check ' + log_fpath + ' and ' + err_fpath + '. Skipping...')
        return

    logger.info(' Analyzing assemblies completeness...')
    kmc_out_fpaths = []
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(idx) + assembly_label)

        report = reporting.get(contigs_fpath)
        kmc_out_fpath = count_kmers(tmp_dirpath, contigs_fpath, kmer_len, log_fpath, err_fpath)
        intersect_out_fpath = intersect_kmers(tmp_dirpath, [ref_kmc_out_fpath, kmc_out_fpath],
                                              log_fpath, err_fpath)
        matched_kmers = get_kmers_cnt(tmp_dirpath, intersect_out_fpath, log_fpath, err_fpath)
        completeness = matched_kmers * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)
        kmc_out_fpaths.append(intersect_out_fpath)

    logger.info(' Analyzing assemblies correctness...')
    ref_contigs = [name for name, _ in read_fasta(ref_fpath)]
    logger.info(' Downsampling k-mers...')
    ref_kmers, downsampled_kmers_fpath = downsample_kmers(
        tmp_dirpath, ref_fpath, ref_kmc_out_fpath, kmer_len, log_fpath, err_fpath)
    for idx, (contigs_fpath, kmc_db_fpath) in enumerate(zip(contigs_fpaths, kmc_out_fpaths)):
        assembly_label = qutils.label_from_fpath(contigs_fpath)
        logger.info(' ' + qutils.index_to_str(idx) + assembly_label)

        report = reporting.get(contigs_fpath)
        # These stay None when the reference is too fragmented to be assessed.
        corr_len = None
        mis_len = None
        undef_len = None
        translocations, relocations = None, None
        total_len = 0
        contig_lens = dict()
        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            contig_lens[name] = len(seq)

        if len(ref_contigs) > MAX_REF_CONTIGS_NUM:
            logger.warning('Reference is too fragmented. Scaffolding accuracy will not be assessed.')
        else:
            corr_len = 0
            mis_len = 0
            kmers_by_contig, kmers_pos_by_contig = align_kmers(
                tmp_dirpath, contigs_fpath, downsampled_kmers_fpath, err_fpath, qconfig.max_threads)
            is_cyclic = qconfig.prokaryote and not qconfig.check_for_fragmented_ref
            cyclic_ref_lens = report.get_field(reporting.Fields.REFLEN) if is_cyclic else None
            translocations = 0
            relocations = 0
            misjoins_fpath = join(tmp_dirpath,
                                  qutils.label_from_fpath_for_fname(contigs_fpath) + '.misjoins.txt')
            with open(misjoins_fpath, 'w') as out:
                for contig in kmers_by_contig.keys():
                    # Pass 1: a k-mer hit becomes a marker when its distance to the
                    # previous hit agrees (within 5%) between contig and reference.
                    contig_markers = []
                    prev_pos, prev_ref_pos, prev_chrom, marker = None, None, None, None
                    for pos, kmer in sorted(zip(kmers_pos_by_contig[contig], kmers_by_contig[contig]),
                                            key=lambda x: x[0]):
                        ref_chrom, ref_pos = ref_kmers[kmer]
                        if prev_pos and prev_chrom:
                            ctg_dist = abs(pos - prev_pos)
                            ref_dist = abs(ref_pos - prev_ref_pos)
                            # A zero reference distance can never be consistent and
                            # would divide by zero; float() keeps the ratio exact
                            # under Python 2 integer division as well.
                            if prev_chrom == ref_chrom and ref_dist and \
                                    abs(ctg_dist / float(ref_dist) - 1) <= 0.05:
                                marker = (pos, ref_pos, ref_chrom)
                            elif marker:
                                contig_markers.append(marker)
                                pos, ref_pos, ref_chrom, marker = None, None, None, None
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if marker:
                        contig_markers.append(marker)

                    # Pass 2: compare consecutive markers to detect misjoins.
                    prev_pos, prev_ref_pos, prev_chrom = None, None, None
                    is_misassembled = False
                    for marker in contig_markers:
                        pos, ref_pos, ref_chrom = marker
                        if prev_pos and prev_chrom:
                            if ref_chrom != prev_chrom:
                                translocations += 1
                                out.write('Translocation in %s: %s %d | %s %d\n' %
                                          (contig, prev_chrom, prev_pos, ref_chrom, pos))
                                is_misassembled = True
                            elif _get_dist_inconstistency(pos, prev_pos, ref_pos, prev_ref_pos,
                                                          cyclic_ref_lens) > EXT_RELOCATION_SIZE:
                                relocations += 1
                                out.write('Relocation in %s: %d (%d) | %d (%d)\n' %
                                          (contig, prev_pos, prev_ref_pos, pos, ref_pos))
                                is_misassembled = True
                        prev_pos, prev_ref_pos, prev_chrom = pos, ref_pos, ref_chrom
                    if is_misassembled:
                        mis_len += contig_lens[contig]
                    elif len(contig_markers) > 0:
                        corr_len += contig_lens[contig]
            undef_len = total_len - corr_len - mis_len
            report.add_field(reporting.Fields.KMER_CORR_LENGTH, '%.2f' % (corr_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_MIS_LENGTH, '%.2f' % (mis_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_UNDEF_LENGTH, '%.2f' % (undef_len * 100.0 / total_len))
            report.add_field(reporting.Fields.KMER_TRANSLOCATIONS, translocations)
            report.add_field(reporting.Fields.KMER_RELOCATIONS, relocations)
            report.add_field(reporting.Fields.KMER_MISASSEMBLIES, translocations + relocations)

        create_kmc_stats_file(output_dir, contigs_fpath, ref_fpath,
                              report.get_field(reporting.Fields.KMER_COMPLETENESS),
                              corr_len, mis_len, undef_len, total_len, translocations, relocations)
    save_kmers(output_dir)
    if not qconfig.debug:
        shutil.rmtree(tmp_dirpath)
    logger.info('Done.')
def align_and_analyze(is_cyclic, index, contigs_fpath, output_dirpath, ref_fpath,
                      reference_chromosomes, ns_by_chromosomes, old_contigs_fpath, bed_fpath, threads=1):
    """Align one assembly to the reference and analyze the alignments.

    :param is_cyclic: forwarded to ``analyze_contigs``.
    :param index: assembly index, used for log prefixes and forwarded to ``align_contigs``.
    :param contigs_fpath: path to the assembly FASTA file.
    :param output_dirpath: directory for the reports; the aligner output directory
        is created inside it by ``create_minimap_output_dir``.
    :param ref_fpath: path to the reference FASTA file.
    :param reference_chromosomes: mapping of reference sequence name to its length.
    :param ns_by_chromosomes: forwarded to ``analyze_coverage``.
    :param old_contigs_fpath: forwarded to ``align_contigs``.
    :param bed_fpath: not used by this function.
    :param threads: forwarded to ``align_contigs``.
    :return: ``(status, result, aligned_lengths, misassemblies_in_contigs,
        aligned_lengths_by_contigs)``.  When the aligner does not report
        ``AlignerStatus.OK`` the tuple is ``(status, {}, [], [], [])``.
    """
    tmp_output_dirpath = create_minimap_output_dir(output_dirpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    out_basename = join(tmp_output_dirpath, corr_assembly_label)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    if not qconfig.space_efficient:
        report_basename = qconfig.contig_report_fname_pattern % corr_assembly_label
        log_out_fpath = join(output_dirpath, report_basename + '.stdout')
        log_err_fpath = join(output_dirpath, report_basename + '.stderr')
        icarus_out_fpath = join(output_dirpath, qconfig.icarus_report_fname_pattern % corr_assembly_label)
        misassembly_fpath = join(output_dirpath, report_basename + '.mis_contigs.info')
        unaligned_info_fpath = join(output_dirpath, report_basename + '.unaligned.info')
    else:
        log_out_fpath = '/dev/null'
        log_err_fpath = '/dev/null'
        icarus_out_fpath = '/dev/null'
        misassembly_fpath = '/dev/null'
        unaligned_info_fpath = '/dev/null'

    icarus_out_f = open(icarus_out_fpath, 'w')
    icarus_header_cols = ['S1', 'E1', 'S2', 'E2', 'Reference', 'Contig', 'IDY', 'Ambiguous', 'Best_group']
    icarus_out_f.write('\t'.join(icarus_header_cols) + '\n')
    misassembly_f = open(misassembly_fpath, 'w')

    if not qconfig.space_efficient:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging to files ' + log_out_fpath +
                    ' and ' + os.path.basename(log_err_fpath) + '...')
    else:
        logger.info(' ' + qutils.index_to_str(index) + 'Logging is disabled.')

    coords_fpath, coords_filtered_fpath, unaligned_fpath, used_snps_fpath = get_aux_out_fpaths(out_basename)
    status = align_contigs(coords_fpath, out_basename, ref_fpath, contigs_fpath, old_contigs_fpath,
                           index, threads, log_out_fpath, log_err_fpath)
    if status != AlignerStatus.OK:
        with open(log_err_fpath, 'a') as log_err_f:
            if status == AlignerStatus.ERROR:
                logger.error(' ' + qutils.index_to_str(index) +
                             'Failed aligning contigs ' + qutils.label_from_fpath(contigs_fpath) +
                             ' to the reference (non-zero exit code). ' +
                             ('Run with the --debug flag to see additional information.' if not qconfig.debug else ''))
            elif status == AlignerStatus.FAILED:
                # The separating space before "doesn't" was missing, gluing the
                # message to the file path.
                log_err_f.write(qutils.index_to_str(index) + 'Alignment failed for ' + contigs_fpath +
                                ':' + coords_fpath + ' doesn\'t exist.\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Alignment failed for ' +
                            '\'' + assembly_label + '\'.')
            elif status == AlignerStatus.NOT_ALIGNED:
                log_err_f.write(qutils.index_to_str(index) + 'Nothing aligned for ' + contigs_fpath + '\n')
                logger.info(' ' + qutils.index_to_str(index) + 'Nothing aligned for ' +
                            '\'' + assembly_label + '\'.')
        # Nothing else is written on this path: release the handles opened above
        # instead of leaking them.
        icarus_out_f.close()
        misassembly_f.close()
        return status, {}, [], [], []

    log_out_f = open(log_out_fpath, 'a')

    # Loading the alignment files
    log_out_f.write('Parsing coords...\n')
    aligns = {}
    with open(coords_fpath) as coords_file:
        for line in coords_file:
            mapping = Mapping.from_line(line)
            aligns.setdefault(mapping.contig, []).append(mapping)

    # Loading the reference sequences
    log_out_f.write('Loading reference...\n')  # TODO: move up
    ref_features = {}

    # Loading the regions (if any)
    # TODO: gff -- no region file is read here, one region spans each whole reference sequence
    regions = {}
    total_reg_len = 0
    total_regions = 0
    for name, seq_len in reference_chromosomes.items():
        log_out_f.write('\tLoaded [%s]\n' % name)
        regions.setdefault(name, []).append([1, seq_len])
        total_regions += 1
        total_reg_len += seq_len
    log_out_f.write('\tTotal Regions: %d\n' % total_regions)
    log_out_f.write('\tTotal Region Length: %d\n' % total_reg_len)

    # The filtered-coords handle is owned by ca_output from here on.
    ca_output = CAOutput(stdout_f=log_out_f, misassembly_f=misassembly_f,
                         coords_filtered_f=open(coords_filtered_fpath, 'w'),
                         icarus_out_f=icarus_out_f)

    log_out_f.write('Analyzing contigs...\n')
    result, ref_aligns, total_indels_info, aligned_lengths, misassembled_contigs, \
        misassemblies_in_contigs, aligned_lengths_by_contigs = \
        analyze_contigs(ca_output, contigs_fpath, unaligned_fpath, unaligned_info_fpath,
                        aligns, ref_features, reference_chromosomes, is_cyclic)

    log_out_f.write('Analyzing coverage...\n')
    if qconfig.show_snps:
        log_out_f.write('Writing SNPs into ' + used_snps_fpath + '\n')
    total_aligned_bases, indels_info = analyze_coverage(
        ref_aligns, reference_chromosomes, ns_by_chromosomes, used_snps_fpath)
    total_indels_info += indels_info
    cov_stats = {'SNPs': total_indels_info.mismatches,
                 'indels_list': total_indels_info.indels_list,
                 'total_aligned_bases': total_aligned_bases}
    result.update(cov_stats)
    result = print_results(contigs_fpath, log_out_f, used_snps_fpath, total_indels_info, result)

    if not qconfig.space_efficient:
        ## outputting misassembled contigs to separate file
        fasta = [(name, seq) for name, seq in fastaparser.read_fasta(contigs_fpath)
                 if name in misassembled_contigs.keys()]
        fastaparser.write_fasta(
            join(output_dirpath, qutils.name_from_fpath(contigs_fpath) + '.mis_contigs.fa'), fasta)

    if qconfig.is_combined_ref:
        alignment_tsv_fpath = join(output_dirpath, "alignments_" + corr_assembly_label + '.tsv')
        unique_contigs_fpath = join(output_dirpath, qconfig.unique_contigs_fname_pattern % corr_assembly_label)
        logger.debug(' ' + qutils.index_to_str(index) + 'Alignments: ' + qutils.relpath(alignment_tsv_fpath))
        used_contigs = set()
        # Compiled once instead of once per contig.
        len_cov_pattern = re.compile(r'_length_([\d\.]+)_cov_([\d\.]+)')
        with open(unique_contigs_fpath, 'w') as unique_contigs_f:
            with open(alignment_tsv_fpath, 'w') as alignment_tsv_f:
                for chr_name, chr_aligns in ref_aligns.items():
                    alignment_tsv_f.write(chr_name)
                    for contig in set(align.contig for align in chr_aligns):
                        alignment_tsv_f.write('\t' + contig)

                    ref_name = ref_labels_by_chromosomes[chr_name]
                    align_by_contigs = defaultdict(int)
                    for align in chr_aligns:
                        align_by_contigs[align.contig] += align.len2
                    for contig, aligned_len in align_by_contigs.items():
                        # A contig is considered only for the first chromosome it shows up in.
                        if contig in used_contigs:
                            continue
                        used_contigs.add(contig)
                        name_match = len_cov_pattern.search(contig)
                        if name_match:
                            contig_len, contig_cov = name_match.group(1), name_match.group(2)
                            if aligned_len / float(contig_len) > 0.9:
                                unique_contigs_f.write(ref_name + '\t' + str(aligned_len) + '\t' + contig_cov + '\n')
                    alignment_tsv_f.write('\n')

    close_handlers(ca_output)

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')
    logger.debug('')
    if not ref_aligns:
        return AlignerStatus.NOT_ALIGNED, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
    else:
        return AlignerStatus.OK, result, aligned_lengths, misassemblies_in_contigs, aligned_lengths_by_contigs
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique 101-mer based analysis (Jellyfish) for all assemblies.

    Assemblies for which ``check_jf_successful_check`` succeeds are filled
    from their saved ``<label>.stat`` files.  For the remaining ones the
    k-mer completeness is computed against the reference, and the total
    length of sequences whose shared k-mers point to one, several or no
    reference sequence is reported.  Results are stored in the per-assembly
    reports and written out with ``create_jf_stats_file``.

    :param output_dir: directory for the Jellyfish output and the stat files.
    :param ref_fpath: path to the reference FASTA file.
    :param contigs_fpaths: paths to the assembly FASTA files.
    :param logger: logger used for all messages.
    :return: always ``None``.
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        # `imp` was removed in Python 3.12 and the builtin reload() exists only in
        # Python 2, so prefer importlib.reload and keep the old ways as fallbacks.
        try:
            from importlib import reload as reload_module
        except ImportError:
            try:
                from imp import reload as reload_module
            except ImportError:
                reload_module = reload  # Python 2 builtin
        reload_module(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except (Exception, SystemExit):
        # Best effort: any failure to set up Jellyfish only disables this analysis.
        # NOTE(review): SystemExit is kept here because the original bare `except`
        # also swallowed it -- confirm whether the helpers can exit on failure.
        logger.warning('Failed unique 101-mers analysis.')
        return

    def _stat_value(stat_line):
        # A stat line looks like "<description>: <value>"; keep the value part.
        return stat_line.strip().split(': ')[-1]

    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath, contigs_fpaths, ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            with open(jf_stats_fpath) as stats_f:
                stats_content = stats_f.read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(reporting.Fields.KMER_COMPLETENESS,
                             '%.2f' % float(_stat_value(stats_content[0])))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                             '%.2f' % float(_stat_value(stats_content[1])))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                             '%.2f' % float(_stat_value(stats_content[2])))
            report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                             '%.2f' % float(_stat_value(stats_content[3])))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [fpath for fpath in contigs_fpaths if fpath not in checked_assemblies]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    qutils.call_subprocess([jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1',
                            '-s', str(getsize(ref_fpath)), '-o', jf_out_fpath,
                            '-t', str(qconfig.max_threads), ref_fpath])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1',
                                '-s', str(getsize(contigs_fpath)), '-o', jf_out_fpath,
                                '-t', str(qconfig.max_threads), contigs_fpath])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        for idx, assembly_kmers in enumerate(contigs_kmers):
            if assembly_kmers[kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            # Keep every 100th k-mer that is present in all assemblies.
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    if not unique_kmers:
        # No k-mer was read for the reference: completeness would divide by zero.
        logger.warning('Failed unique 101-mers analysis.')
        return

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS, '%.2f' % completeness)

    # Map every sampled shared k-mer to the reference sequence it occurs in.
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        for kmer in jellyfish.string_mers(seq):
            kmer_str = str(kmer)
            if kmer_str in shared_kmers:
                shared_kmers_by_chrom[kmer_str] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            chrom_markers = []
            for kmer in jellyfish.string_mers(seq):
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom_markers.append(shared_kmers_by_chrom[kmer_str])
            # Sequences with fewer than MIN_MARKERS markers stay unclassified.
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = total_len - len_map_to_one_chrom - len_map_to_multi_chrom
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_jf_stats_file(output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
                             report.get_field(reporting.Fields.KMER_COMPLETENESS),
                             len_map_to_one_chrom, len_map_to_multi_chrom, len_map_to_none_chrom)

    logger.info('Done.')
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Compute genome fraction, duplication ratio, gaps and gene/operon hits per assembly.

    Reads chromosome lengths from ``ref_fpath``, loads gene/operon regions from
    ``genes_fpaths`` / ``operons_fpaths``, runs ``process_single_file`` for every
    assembly through joblib, writes a text summary to
    ``<genome_stats_dirpath>/genome_info.txt``, fills the per-assembly reports and,
    depending on ``qconfig``, saves JSON/HTML data and plots.

    Returns ``[genes_container, operons_container]``, or ``None`` when the
    analysis failed for every assembly.
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')
    from quast_libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    # Chromosome name (first word of the FASTA header) -> chromosome length.
    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file. The context manager closes it on every exit path,
    # including the early return below and any exception.
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent=' ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent=' ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info(' Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list, list(reference_chromosomes.keys()))

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
        if is_python2():
            from joblib import Parallel, delayed
        else:
            from joblib3 import Parallel, delayed
        process_results = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
            contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
            reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Pair every assembly with its own result BEFORE dropping failures.
        # Filtering the results alone and zipping them with the full path list
        # would attribute results to the wrong assembly after the first failure.
        analyzed = [(contigs_fpath, res)
                    for contigs_fpath, res in zip(aligned_contigs_fpaths, process_results)
                    if res]
        if not analyzed:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return

        analyzed_fpaths = [contigs_fpath for contigs_fpath, _ in analyzed]
        ref_lengths = [res[0] for _, res in analyzed]
        results_genes_operons_tuples = [res[1] for _, res in analyzed]

        # NOTE(review): ref_lengths_by_contigs is not defined in this function;
        # presumably a module-level dict -- confirm.
        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [lengths[ref] for lengths in ref_lengths]

        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' + str(chr_len) +
                           ' bp, maximal covered length: ' + str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
                       % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
                analyzed_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
                           % (assembly_name[:24], '%3.5f%%' % genome_fraction,
                              '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                                        (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes',
                                                files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons',
                                                files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        from . import plotter
        # Histograms take a list of values parallel to the list of paths, so
        # they get only the assemblies that actually produced a value.
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths,
                                       files_genes_in_contigs,
                                       genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(analyzed_fpaths, full_found_genes,
                              genome_stats_dirpath + '/complete_genes_histogram', '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths,
                                       files_operons_in_contigs,
                                       genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(analyzed_fpaths, full_found_operons,
                              genome_stats_dirpath + '/complete_operons_histogram', '# complete operons')
        plotter.histogram(analyzed_fpaths, genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram', 'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
    return [genes_container, operons_container]
print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)") print("Usage: " + sys.argv[0] + " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)") sys.exit() BREAK_SCAFFOLDS = False if len(sys.argv) == 4: BREAK_SCAFFOLDS = True N_NUMBER = None counter = 0 if BREAK_SCAFFOLDS: N_NUMBER = int(sys.argv[2]) sizes_of_Ns_regions = dict() new_fasta = [] for id, (name, seq) in enumerate(fastaparser.read_fasta(sys.argv[1])): i = 0 cur_contig_number = 1 cur_contig_start = 0 while (i < len(seq)) and (seq.find("N", i) != -1): start = seq.find("N", i) end = start + 1 while (end != len(seq)) and (seq[end] == 'N'): end += 1 i = end + 1 if BREAK_SCAFFOLDS and (end - start) >= N_NUMBER: new_fasta.append((name.split()[0] + "_" + str(cur_contig_number), seq[cur_contig_start:start])) cur_contig_number += 1 cur_contig_start = end
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute reference coverage, gaps and found genes/operons for one assembly.

    Reads the assembly's Nucmer coords file from ``nucmer_path_dirpath``, marks
    every reference position covered by an alignment, writes the gaps to
    ``<label>_gaps.txt`` and the found features to ``<label>_genes.txt`` /
    ``<label>_operons.txt`` under ``genome_stats_dirpath``.

    Returns ``(ref_lengths, (results, genes_in_contigs, operons_in_contigs))``
    where ``ref_lengths`` maps chromosome name to the number of covered bases,
    or ``None`` when the coords file is missing or names a chromosome that is
    not in ``reference_chromosomes``.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, corr_assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
                     indent=' ')
        return None

    def _region_label(region):
        # Regions without an id are labelled by their 1-based ordinal number.
        region_id = str(region.id)
        if region_id == 'None':
            region_id = '# ' + str(region.number + 1)
        return region_id

    # One flag per reference position; index 0 is unused (positions are 1-based).
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names)  # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {}  # for gene finding: contig_name --> list of AlignedBlock

    gene_searching_enabled = len(genes_container.region_list) or len(operons_container.region_list)
    if qconfig.memory_efficient and gene_searching_enabled:
        logger.warning('Run QUAST without genes and operons files to reduce memory consumption.')
    if gene_searching_enabled:
        for name in sorted_contigs_names:
            aligned_blocks_by_contig_name[name] = []

    # The context manager closes the coords file on the early return below too.
    with open(nucmer_fpath, 'r') as coordfile:
        # Skip the header: everything up to and including the '=====' line.
        for line in coordfile:
            if line.startswith('='):
                break

        # EXAMPLE:
        #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
        # =====================================================================================
        #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
        #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
        for line in coordfile:
            if line.strip() == '':
                break
            pipe_columns = line.split('|')
            ref_span = pipe_columns[0].split()
            contig_span = pipe_columns[1].split()
            s1 = int(ref_span[0])
            e1 = int(ref_span[1])
            s2 = int(contig_span[0])
            e2 = int(contig_span[1])
            tokens = line.split()
            contig_name = tokens[12]
            chr_name = tokens[11]

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") "
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            if gene_searching_enabled:
                aligned_blocks_by_contig_name[contig_name].append(
                    AlignedBlock(seqname=chr_name, start=s1, end=e1))

            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for i in range(s1, len(chr_mapping)):
                    chr_mapping[i] = 1
                for i in range(1, e1 + 1):
                    chr_mapping[i] = 1
            else:
                for i in range(s1, e1 + 1):
                    chr_mapping[i] = 1

    if qconfig.space_efficient and nucmer_fpath.endswith('.filtered'):
        os.remove(nucmer_fpath)

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + '_gaps.txt') \
        if not qconfig.space_efficient else '/dev/null'
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(chr_name + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            aligned_len = 0
            for i in range(1, chr_len + 1):
                if chr_mapping[i] == 1:
                    # A covered base ends the current gap; report it if long enough.
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write(str(i - cur_gap_size) + ' ' + str(i - 1) + '\n')
                    aligned_len += 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            ref_lengths[chr_name] = aligned_len
            # Gap that runs to the end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write(str(chr_len - cur_gap_size + 1) + ' ' + str(chr_len) + '\n')

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
            (genes_container, genes_in_contigs, reporting.Fields.GENES, '_genes.txt'),
            (operons_container, operons_in_contigs, reporting.Fields.OPERONS, '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, corr_assembly_label + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End', 'Type'))
            found_file.write('=========================================\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start,
                                                   end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1,
                                                   end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                found_file.write('%s\t\t%d\t%d\tcomplete\n'
                                                 % (_region_label(region), region.start, region.end))
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and \
                                    min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break
                # adding info about partially found genes/operons
                if found_list[i] == 2:  # partial gene/operon
                    found_file.write('%s\t\t%d\t%d\tpartial\n'
                                     % (_region_label(region), region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info(' ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic length/GC statistics for every assembly and fill the reports.

    For each file in ``contigs_fpaths`` the contig lengths, number of N's and
    (when contig names carry a ``_cov_<number>`` tag) a coverage histogram are
    collected; N50/L50, NG50/LG50, N75/L75, GC content and related fields are
    added to the assembly's report. Reference figures come from ``ref_fpath``
    or, without a reference, from ``qconfig.estimated_reference_size``.
    Depending on ``qconfig`` the data is also saved for the HTML report under
    ``results_dir`` and plots are written to ``output_dirpath``.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        # The conditional is parenthesized on purpose: without the parentheses it
        # applies to the whole concatenation and the message degrades to the bare
        # word 'undefined' whenever the GC value is missing.
        logger.info(' ' + os.path.basename(ref_fpath) +
                    ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) +
                    ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(
                ' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        # coverage[c] accumulates the total length of contigs whose name reports coverage c.
        coverage = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_matches = cov_pattern.findall(name)
            if cov_matches:
                cov = int(float(cov_matches[0]))
                if len(coverage) <= cov:
                    coverage.extend([0] * (cov - len(coverage) + 1))
                coverage[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    # With very many contigs, every `multiplicator` consecutive lengths are
    # summed into one point to keep the saved data small.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [
            [sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
             for i in range(1, max_points)
             if (i * multiplicator) < len(list_of_length)]
            for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                if (i * multiplicator) < len(reference_lengths)
                else sum(reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The tail that did not fill a whole group becomes the last point.
        for num_list, corr_lengths in enumerate(corr_lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        # Both conditionals are parenthesized so that only the affected value,
        # not the whole message, falls back to 'undefined'.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
                     if total_length != 0 else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # NOTE(review): the nesting of the fields below under this guard was
        # reconstructed; it keeps max() and the division away from an empty
        # assembly -- confirm against the upstream layout.
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC,
                                 ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC,
                                 ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions: the reference distribution
    # is appended to it in place.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            ########################################################################
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
def correct_meta_references(ref_fpaths, corrected_dirpath):
    """Copy all reference sequences into per-reference files and one combined file.

    Every sequence of every file in ``ref_fpaths`` gets a unique, corrected
    name prefixed with the name of its corrected reference file, and is
    appended both to that file and to the combined reference
    (``qconfig.combined_ref_name``) inside ``corrected_dirpath``.

    Returns ``(corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs,
    ref_fpaths)`` where ``chromosomes_by_refs`` maps a reference name to a list
    of ``(sequence name, sequence length)`` tuples.
    """
    from collections import Counter

    corrected_ref_fpaths = []

    combined_ref_fpath = os.path.join(corrected_dirpath, qconfig.combined_ref_name)

    chromosomes_by_refs = {}

    def _proceed_seq(seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath):
        # Append one sequence to its reference file and to the combined file.
        # Returns (None, None) when the sequence does not pass the check.
        seq_fname = ref_name
        seq_fname += ref_fasta_ext

        if total_references > 1:
            # Not the first sequence of this reference: reuse the file created for the first one.
            corr_seq_fpath = corrected_ref_fpaths[-1]
        else:
            corr_seq_fpath = qutils.unique_corrected_fpath(os.path.join(corrected_dirpath, seq_fname))
            corrected_ref_fpaths.append(corr_seq_fpath)
        corr_seq_name = qutils.name_from_fpath(corr_seq_fpath) + '_' + seq_name
        if not qconfig.no_check:
            corr_seq = correct_seq(seq, ref_fpath)
            if not corr_seq:
                return None, None

        # NOTE(review): the value returned by correct_seq is only used as a
        # validity check; the unmodified `seq` is what gets written. Confirm
        # this is intended.
        fastaparser.write_fasta(corr_seq_fpath, [(corr_seq_name, seq)], 'a')
        fastaparser.write_fasta(combined_ref_fpath, [(corr_seq_name, seq)], 'a')

        contigs_analyzer.ref_labels_by_chromosomes[corr_seq_name] = qutils.name_from_fpath(corr_seq_fpath)
        chromosomes_by_refs[ref_name].append((corr_seq_name, len(seq)))

        return corr_seq_name, corr_seq_fpath

    # Reference names shared by more than one input file; counted in a single
    # pass instead of calling list.count() for every name.
    ref_name_counts = Counter(
        qutils.splitext_for_fasta_file(os.path.basename(ref_fpath))[0] for ref_fpath in ref_fpaths)
    dupl_ref_names = set(name for name, count in ref_name_counts.items() if count > 1)

    for ref_fpath in ref_fpaths:
        total_references = 0
        ref_fname = os.path.basename(ref_fpath)
        ref_name, ref_fasta_ext = qutils.splitext_for_fasta_file(ref_fname)
        if ref_name in dupl_ref_names:
            ref_name = qutils.get_label_from_par_dir_and_fname(ref_fpath)

        chromosomes_by_refs[ref_name] = []
        used_seq_names = defaultdict(int)

        corr_seq_fpath = None
        for seq_name, seq in fastaparser.read_fasta(ref_fpath):
            total_references += 1
            seq_name = correct_name(seq_name, qutils.MAX_CONTIG_NAME - len(ref_name) - 1)
            uniq_seq_name = get_uniq_name(seq_name, used_seq_names)
            used_seq_names[seq_name] += 1
            corr_seq_name, corr_seq_fpath = _proceed_seq(
                uniq_seq_name, seq, ref_name, ref_fasta_ext, total_references, ref_fpath)
            if not corr_seq_name:
                # Stop reading this reference at the first rejected sequence.
                break
        if corr_seq_fpath:
            logger.main_info(' ' + ref_fpath + ' ==> ' + qutils.name_from_fpath(corr_seq_fpath))

    logger.main_info(' All references combined in ' + qconfig.combined_ref_name)

    return corrected_ref_fpaths, combined_ref_fpath, chromosomes_by_refs, ref_fpaths