def get_chr_len_fpath(ref_fpath):
    """Return the path of the chromosome-length index stored next to ``ref_fpath``.

    The index path is ``ref_fpath + '.fai'``.  When ``is_non_empty_file``
    reports that this file is missing or empty, it is (re)generated with one
    ``name<TAB>length`` line per entry returned by
    ``fastaparser.get_chr_lengths_from_fastafile(ref_fpath)``.

    :param ref_fpath: path to the reference FASTA file.
    :return: path to the ``.fai`` file.
    """
    chr_len_fpath = ref_fpath + '.fai'
    if not is_non_empty_file(chr_len_fpath):
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
        with open(chr_len_fpath, 'w') as out_f:
            # dict.iteritems() exists only on Python 2; items() works on both
            # Python 2 and 3.
            for chr_name, chr_len in chr_lengths.items():
                out_f.write(chr_name + '\t' + str(chr_len) + '\n')
    return chr_len_fpath
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    """Run the supplied gene finder on one assembly and summarise its predictions.

    :param index: assembly index, used for log prefixes and forwarded to the gene finder.
    :param contigs_fpath: path to the assembly FASTA file.
    :param gene_lengths: iterable of length thresholds to count genes against.
    :param out_dirpath: directory receiving the stderr log and the GFF/FASTA outputs.
    :param tool_dirpath: forwarded to ``gmhmm_p_function``.
    :param tmp_dirpath: forwarded to ``gmhmm_p_function``.
    :param gmhmm_p_function: callable that performs the prediction and returns the genes.
    :param prokaryote: forwarded to ``add_genes_to_gff``.
    :param num_threads: forwarded to ``gmhmm_p_function``.
    :return: ``(genes, unique_count, full_cnt, partial_cnt)``; the three counters
        are ``None`` when no genes were predicted.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)
    contig_lengths = get_chr_lengths_from_fastafile(contigs_fpath)

    # Nothing predicted: no files are written and no counters are computed.
    if not genes:
        return genes, None, None, None

    # A gene is "full" when it touches neither end of its contig.
    for gene in genes:
        gene.is_full = gene.start > 1 and gene.end < contig_lengths[gene.contig]

    tool_name = "genemark"
    fname_prefix = corr_assembly_label + '_' + tool_name
    gz_suffix = '' if qconfig.no_gzip else '.gz'
    out_gff_fpath = os.path.join(out_dirpath, fname_prefix + '_genes.gff' + gz_suffix)
    add_genes_to_gff(genes, out_gff_fpath, prokaryote)
    if OUTPUT_FASTA:
        out_fasta_fpath = os.path.join(out_dirpath, fname_prefix + '_genes.fasta')
        add_genes_to_fasta(genes, out_fasta_fpath)

    # Per threshold: number of genes whose (end - start) reaches the threshold.
    full_spans = [gene.end - gene.start for gene in genes if gene.is_full]
    partial_spans = [gene.end - gene.start for gene in genes if not gene.is_full]
    full_cnt = [sum(span >= threshold for span in full_spans) for threshold in gene_lengths]
    partial_cnt = [sum(span >= threshold for span in partial_spans) for threshold in gene_lengths]

    # Genes are de-duplicated by sequence, falling back to the name when there is no sequence.
    unique_count = len(set(gene.seq or gene.name for gene in genes))
    total_count = len(genes)

    logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' +
                str(total_count) + ' total')
    logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_fpath)

    return genes, unique_count, full_cnt, partial_cnt
def get_lengths_from_fasta(contigs_fpath, label):
    """Return the sequence lengths found in ``contigs_fpath``.

    :param contigs_fpath: path to the FASTA file to measure.
    :param label: name used in the warning message.
    :return: list of lengths, or ``None`` (after logging a warning) when the
        lengths that are at least ``qconfig.min_contig`` sum to zero.
    """
    all_lengths = fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values()
    usable_total = sum(length for length in all_lengths if length >= qconfig.min_contig)
    if usable_total:
        return list(all_lengths)

    logger.warning("Skipping %s because it doesn't contain contigs >= %d bp." % (label, qconfig.min_contig))
    return None
def get_lengths_from_fasta(contigs_fpath, label):
    """Collect the sequence lengths of a FASTA file.

    :param contigs_fpath: path to the FASTA file.
    :param label: assembly label reported in the warning.
    :return: the lengths as a list; ``None`` if nothing of at least
        ``qconfig.min_contig`` bp contributes to the total (a warning is logged).
    """
    length_values = fastaparser.get_chr_lengths_from_fastafile(contigs_fpath).values()

    total_above_cutoff = 0
    for contig_len in length_values:
        if contig_len >= qconfig.min_contig:
            total_above_cutoff += contig_len

    if not total_above_cutoff:
        logger.warning("Skipping %s because it doesn't contain contigs >= %d bp." % (label, qconfig.min_contig))
        return None
    return list(length_values)
def get_chr_len_fpath(ref_fpath, correct_chr_names=None):
    """Return the path of the ``.fai`` length index for ``ref_fpath``, creating it if needed.

    :param ref_fpath: path to the reference FASTA file.
    :param correct_chr_names: optional mapping; when given (and non-empty) each
        name from the FASTA is written as the key whose value equals that name,
        i.e. the mapping is applied in reverse.
    :return: ``ref_fpath + '.fai'``.
    """
    chr_len_fpath = ref_fpath + '.fai'

    # Invert the mapping so it can be looked up by the names used in the FASTA.
    if correct_chr_names:
        raw_chr_names = {raw_name: correct_name for correct_name, raw_name in correct_chr_names.items()}
    else:
        raw_chr_names = None

    # An existing non-empty index is reused as is.
    if is_non_empty_file(chr_len_fpath):
        return chr_len_fpath

    chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    with open(chr_len_fpath, 'w') as out_f:
        for name, length in chr_lengths.items():
            if correct_chr_names:
                name = raw_chr_names[name]
            out_f.write(name + '\t' + str(length) + '\n')
    return chr_len_fpath
def get_chr_len_fpath(ref_fpath, correct_chr_names=None):
    """Make sure a ``name<TAB>length`` index exists beside ``ref_fpath`` and return its path.

    :param ref_fpath: path to the reference FASTA file.
    :param correct_chr_names: optional mapping applied in reverse (value -> key)
        to every sequence name before it is written.
    :return: path of the index file (``ref_fpath + '.fai'``).
    """
    chr_len_fpath = ref_fpath + '.fai'

    raw_chr_names = None
    if correct_chr_names:
        raw_chr_names = {}
        for correct_name, raw_name in correct_chr_names.items():
            raw_chr_names[raw_name] = correct_name

    if not is_non_empty_file(chr_len_fpath):
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
        with open(chr_len_fpath, 'w') as out_f:
            for fasta_name, chr_len in chr_lengths.items():
                written_name = raw_chr_names[fasta_name] if correct_chr_names else fasta_name
                out_f.write(written_name + '\t' + str(chr_len) + '\n')

    return chr_len_fpath
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Match the sequence names in a SAM header against the reference FASTA.

    The SAM header is dumped with ``sambamba view -H -S`` into
    ``<output_dirpath>/<basename(sam_fpath)>.header`` and its ``@SQ`` records
    are compared, position by position, with the reference sequences.

    :param output_dirpath: directory that receives the extracted header file.
    :param ref_fpath: path to the reference FASTA file.
    :param sam_fpath: path to the SAM file.
    :param err_path: file that receives sambamba's stderr (overwritten).
    :param reads_fpaths: when truthy, a mismatch is logged as a warning
        instead of an error.
    :return: dict mapping SAM sequence name -> reference sequence name, or
        ``None`` when the number of sequences, a length or a name differs.
    """
    from collections import OrderedDict

    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    # Header entries are paired with the reference by position below, so their
    # order must be preserved on every Python version.
    sam_chr_lengths = OrderedDict()
    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')
    # Context managers make sure both handles are closed (previously leaked)
    # before the header file is read back.
    with open(sam_header_fpath, 'w') as header_out, open(err_path, 'w') as err_out:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                               stdout=header_out, stderr=err_out, logger=logger)

    # Raw strings: '\S' and '\d' are invalid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            # The SAM name must be a prefix of the reference name once corrected.
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and \
                    sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]:
                correct_chr_names[sam_chr] = ref_chr
            elif sam_chr_lengths[sam_chr] != ref_chr_lengths[ref_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in reference and SAM file do not match. ' +
                           'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(inconsistency + ' in reference and SAM file do not match. ' +
                         'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath, title):
    """Draw cumulative-length curves, one per assembly plus an optional reference curve.

    Does nothing when ``can_draw_plots`` is false.

    :param reference: path to the reference FASTA, or a falsy value for no reference curve.
    :param contigs_fpaths: assembly paths (used for colours and legend labels).
    :param lists_of_lengths: one list of lengths per assembly, aligned with ``contigs_fpaths``.
    :param plot_fpath: forwarded to ``create_plot``.
    :param title: forwarded to ``create_plot``.
    """
    if not can_draw_plots:
        return

    logger.info(' Drawing cumulative plot...')

    def running_totals(lengths):
        # Cumulative sums over the lengths taken longest first, starting at 0.
        totals = [0]
        for length in sorted(lengths, reverse=True):
            totals.append(totals[-1] + length)
        return totals

    plots = []
    max_x = 0

    for contigs_fpath, lengths in zip(contigs_fpaths, lists_of_lengths):
        y_vals = running_totals(lengths)
        x_vals = list(range(len(y_vals)))
        # y_vals always holds the leading 0, so x_vals is never empty here.
        max_x = max(x_vals[-1], max_x)
        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))

    if reference:
        reference_lengths = fastaparser.get_chr_lengths_from_fastafile(reference).values()
        y_vals = running_totals(reference_lengths)
        x_vals = list(range(len(y_vals)))
        # extend reference curve to the max X-axis point
        max_x = max(max_x, x_vals[-1])
        y_vals.append(y_vals[-1])
        x_vals.append(max_x)
        plots.append(Plot(x_vals, y_vals, reference_color, reference_ls))

    legend_list = [label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference:
        legend_list.append('Reference')

    create_plot(plot_fpath, title, plots, legend_list,
                x_label='Contig index', y_label='Cumulative length', x_limit=[0, max_x])
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger,
                                 is_reference=False):
    """Match the sequence names in a SAM header against a FASTA file.

    If ``sam_fpath`` exists, its header is dumped with ``sambamba view -H -S``
    into ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header``; otherwise
    a header file already present at that path is used.  The ``@SQ`` records
    are then compared, position by position, with the FASTA sequences.

    :param output_dirpath: the header file is placed in this path's parent directory.
    :param fasta_fpath: path to the FASTA file the SAM should correspond to.
    :param sam_fpath: path to the SAM file.
    :param err_path: file that sambamba's stderr is appended to.
    :param reads_fpaths: when truthy, a mismatch is logged as a warning
        instead of an error.
    :param logger: logger used for messages and passed to ``qutils.call_subprocess``.
    :param is_reference: selects the wording of the mismatch message.
    :return: dict mapping SAM sequence name -> FASTA sequence name, or ``None``
        when neither SAM nor header file exists or when the two disagree.
    """
    from collections import OrderedDict

    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    # Header entries are paired with the FASTA by position below, so their
    # order must be preserved on every Python version.
    sam_chr_lengths = OrderedDict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        # Context managers make sure both handles are closed (previously
        # leaked) before the header file is read back.
        with open(sam_header_fpath, 'w') as header_out, open(err_path, 'a') as err_out:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_out, stderr=err_out, logger=logger)

    # Raw strings: '\S' and '\d' are invalid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            # The SAM name must be a prefix of the FASTA name once corrected.
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and \
                    sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]:
                correct_chr_names[sam_chr] = fasta_chr
            elif sam_chr_lengths[sam_chr] != fasta_chr_lengths[fasta_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath +
                           ' do not match. ' + 'QUAST will try to realign reads to ' +
                           ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath +
                         ' do not match. ' + 'Use SAM file obtained by aligning reads to ' +
                         ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
def parse_kmer_coords(kmers_coords, ref_fpath, kmer_fraction):
    """Pick a position-ordered subsample of the k-mers listed in a coords file.

    Each tab-separated line with at least 10 fields contributes one entry:
    field 1 is read as the integer identifier, field 6 as the chromosome and
    field 8 as the integer position.  Per chromosome the identifiers are
    ordered by position and every ``interval``-th one is kept, where
    ``interval = int(genome_size / (number_of_entries * kmer_fraction))``.

    :param kmers_coords: path to the tab-separated coords file.
    :param ref_fpath: path to the reference FASTA; only read when there is
        something to downsample.
    :param kmer_fraction: fraction of the parsed entries to aim for.
    :return: set of selected integer identifiers; empty when no entries were
        parsed or the requested fraction is not positive.
    """
    kmers_pos_by_chrom = defaultdict(list)
    kmers_by_chrom = defaultdict(list)
    with open(kmers_coords) as f:
        for line in f:
            fs = line.split('\t')
            if len(fs) < 10:
                continue
            contig, chrom, pos = fs[0], fs[5], fs[7]
            kmers_pos_by_chrom[chrom].append(int(pos))
            kmers_by_chrom[chrom].append(int(contig))

    total_kmers = sum(len(kmers) for kmers in kmers_by_chrom.values())
    downsampled_kmers_cnt = total_kmers * kmer_fraction
    # Nothing to sample: avoids the division by zero below.
    if downsampled_kmers_cnt <= 0:
        return set()

    genome_size = sum(get_chr_lengths_from_fastafile(ref_fpath).values())
    # A slice step of 0 raises ValueError, so never let the stride drop below 1.
    interval = max(1, int(genome_size / downsampled_kmers_cnt))

    downsampled_kmers = set()
    for chrom, kmers in kmers_by_chrom.items():
        by_position = sorted(zip(kmers_pos_by_chrom[chrom], kmers))
        downsampled_kmers.update(kmer for _, kmer in by_position[::interval])
    return downsampled_kmers
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Build a SAM-name -> reference-name mapping after checking both agree.

    The SAM header is extracted with ``sambamba view -H -S`` into
    ``<output_dirpath>/<basename(sam_fpath)>.header``; its ``@SQ`` records are
    then compared pairwise, in order, with the reference sequences.

    :param output_dirpath: directory that receives the extracted header file.
    :param ref_fpath: path to the reference FASTA file.
    :param sam_fpath: path to the SAM file.
    :param err_path: file that receives sambamba's stderr (overwritten).
    :param reads_fpaths: when truthy, a mismatch is logged as a warning
        instead of an error.
    :return: the mapping, or ``None`` when the number of sequences, a length
        or a name differs.
    """
    from collections import OrderedDict

    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')

    # Both handles used to be opened inline and never closed; the with-block
    # closes them before the header is parsed.
    with open(sam_header_fpath, 'w') as header_out, open(err_path, 'w') as err_out:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                               stdout=header_out, stderr=err_out, logger=logger)

    # Raw strings: '\S' and '\d' are invalid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    # Ordered, because the entries are zipped with the reference by position.
    sam_chr_lengths = OrderedDict()
    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    correct_chr_names = dict()
    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            # The SAM name must be a prefix of the reference name once corrected.
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and \
                    sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]:
                correct_chr_names[sam_chr] = ref_chr
            elif sam_chr_lengths[sam_chr] != ref_chr_lengths[ref_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in reference and SAM file do not match. ' +
                           'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(inconsistency + ' in reference and SAM file do not match. ' +
                         'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    """Predict genes for one assembly, write them out and count them per length threshold.

    :param index: assembly index, used for log prefixes and forwarded to the gene finder.
    :param contigs_fpath: path to the assembly FASTA file.
    :param gene_lengths: iterable of length thresholds.
    :param out_dirpath: directory receiving the stderr log and the GFF/FASTA outputs.
    :param tool_dirpath: forwarded to ``gmhmm_p_function``.
    :param tmp_dirpath: forwarded to ``gmhmm_p_function``.
    :param gmhmm_p_function: callable that runs the prediction and returns the genes.
    :param prokaryote: forwarded to ``add_genes_to_gff``.
    :param num_threads: forwarded to ``gmhmm_p_function``.
    :return: ``(genes, unique_count, full_cnt, partial_cnt)``; the last three
        are ``None`` when the gene finder returned nothing.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(' ' + qutils.index_to_str(index) + assembly_label)

    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')
    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)
    contig_lengths = get_chr_lengths_from_fastafile(contigs_fpath)

    unique_count = full_cnt = partial_cnt = None
    if genes:
        # A gene that starts at position 1 or reaches the contig end is partial.
        for gene in genes:
            gene.is_full = gene.start > 1 and gene.end < contig_lengths[gene.contig]

        tool_name = "genemark"
        gff_fname = corr_assembly_label + '_' + tool_name + '_genes.gff'
        if not qconfig.no_gzip:
            gff_fname += '.gz'
        out_gff_fpath = os.path.join(out_dirpath, gff_fname)
        add_genes_to_gff(genes, out_gff_fpath, prokaryote)
        if OUTPUT_FASTA:
            fasta_fname = corr_assembly_label + '_' + tool_name + '_genes.fasta'
            add_genes_to_fasta(genes, os.path.join(out_dirpath, fasta_fname))

        full_cnt = []
        partial_cnt = []
        for threshold in gene_lengths:
            n_full = 0
            n_partial = 0
            for gene in genes:
                reaches = gene.end - gene.start >= threshold
                if gene.is_full:
                    n_full += reaches
                else:
                    n_partial += reaches
            full_cnt.append(n_full)
            partial_cnt.append(n_partial)

        # Uniqueness is judged on the sequence, or on the name when there is no sequence.
        gene_ids = set()
        for gene in genes:
            gene_ids.add(gene.seq if gene.seq else gene.name)
        unique_count = len(gene_ids)
        total_count = len(genes)

        logger.info(' ' + qutils.index_to_str(index) + ' Genes = ' + str(unique_count) + ' unique, ' +
                    str(total_count) + ' total')
        logger.info(' ' + qutils.index_to_str(index) + ' Predicted genes (GFF): ' + out_gff_fpath)

    return genes, unique_count, full_cnt, partial_cnt
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers,
                cov_fpath, logger):
    """Prepare the Circos data files and write ``circos.conf`` into ``output_dir``.

    Track data (alignments, mismatches, features, coverage, GC, labels) are
    produced by the ``create_*`` helpers under ``<output_dir>/data``; this
    function then writes the configuration that references them.

    :param ref_fpath: path to the reference FASTA file.
    :param contigs_fpaths: assembly paths, forwarded to ``parse_alignments``.
    :param contig_report_fpath_pattern: forwarded to ``parse_alignments``.
    :param output_dir: directory that receives ``circos.conf`` and the ``data`` folder.
    :param gc_fpath: forwarded to ``create_gc_plot``.
    :param features_containers: forwarded to the genes/labels/legend helpers.
    :param cov_fpath: forwarded to ``create_coverage_plot``.
    :param logger: forwarded to ``create_housekeeping_file``.
    :return: ``(conf_fpath, circos_legend_fpath)``, or ``None`` when there are
        no alignment plot files.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # The tick unit follows the magnitude of max_len.
    chrom_units = 1000
    if max_len >= 10 ** 6:
        chrom_units = 10 ** 5
    elif max_len >= 10 ** 5:
        chrom_units = 10 ** 4
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = [create_alignment_plots(assembly, ref_len, data_dir) for assembly in assemblies]
    if not alignments_fpaths:
        return None

    gc_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = [create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir)
                         for assembly in assemblies]
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, chr_lengths, data_dir)
    max_points = max([MAX_POINTS, gc_points, gene_points, cov_points, contig_points])
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath,
                                               data_dir)

    conf_fpath = join(output_dir, 'circos.conf')
    radius = 0.95
    plot_idx = 0

    # One gap per track; the last track of every group gets the big gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.extend([TRACK_INTERVAL] * len(feature_fpaths))
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL

    with open(conf_fpath, 'w') as out_f:
        emit = out_f.write

        def emit_track_radii(track_no):
            # r0/r1 refer to the track<N>_pos and track_width values written to this file.
            emit('r0 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos) - conf(track_width)))\n')
            emit('r1 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos)))\n')

        emit('<<include etc/colors_fonts_patterns.conf>>\n')
        emit('<<include %s>>\n' % relpath(ideogram_fpath, output_dir))
        emit('<<include %s>>\n' % relpath(ticks_fpath, output_dir))
        emit('karyotype = %s\n' % relpath(karyotype_fpath, output_dir))
        emit('chromosomes_units = %d\n' % chrom_units)
        emit('chromosomes_display_default = yes\n')
        emit('track_width = ' + str(TRACK_WIDTH) + '\n')

        # Track positions walk inwards from the starting radius.
        for track_no, interval in enumerate(track_intervals):
            emit('track%d_pos = %f\n' % (track_no, radius))
            radius -= TRACK_WIDTH
            radius -= interval
        emit('track%d_pos = %f\n' % (len(track_intervals), radius))

        emit('<image>\n')
        emit('dir = %s\n' % output_dir)
        emit('file = %s\n' % circos_png_fname)
        emit('png = yes\n')
        emit('svg = no\n')
        emit('radius = 1500p\n')
        emit('angle_offset = -90\n')
        emit('auto_alpha_colors = yes\n')
        emit('auto_alpha_steps = 5\n')
        emit('background = white\n')
        emit('</image>\n')

        if qconfig.is_combined_ref:
            emit('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            emit('<highlight>\n')
            emit('file = %s\n' % relpath(highlights_fpath, output_dir))
            emit('r0 = 1r - 50p\n')
            emit('r1 = 1r - 30p\n')
            emit('</highlight>\n')
            emit('</highlights>\n')

        emit(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        emit('<plots>\n')
        emit('layers_overflow = collapse\n')

        for label, track_no in track_labels:
            emit('<plot>\n')
            emit('track_idx = track%d\n' % track_no)
            emit('<<include %s>>\n' % relpath(labels_fpath, output_dir))
            emit('</plot>\n')

        # One tile plot per assembly, optionally overlaid with its mismatch histogram.
        for assembly_no, alignments_conf in enumerate(alignments_fpaths):
            emit('<plot>\n')
            emit('type = tile\n')
            emit('thickness = 50p\n')
            emit('stroke_thickness = 0\n')
            emit('layers = 1\n')
            emit('file = %s\n' % relpath(alignments_conf, output_dir))
            emit_track_radii(plot_idx)
            emit('</plot>\n')
            if mismatches_fpaths and mismatches_fpaths[assembly_no]:
                emit('<plot>\n')
                emit('type = histogram\n')
                emit('thickness = 1\n')
                emit('fill_color = vlyellow\n')
                emit('file = %s\n' % relpath(mismatches_fpaths[assembly_no], output_dir))
                emit_track_radii(plot_idx)
                emit('</plot>\n')
            plot_idx += 1

        for feature_fpath in feature_fpaths:
            # genes plot
            emit('<plot>\n')
            emit('type = heatmap\n')
            emit('file = %s\n' % relpath(feature_fpath, output_dir))
            emit('color = ylorbr-9\n')
            emit_track_radii(plot_idx)
            emit('</plot>\n')
            plot_idx += 1

        if cov_data_fpath:
            # coverage plot
            emit('<plot>\n')
            emit('type = histogram\n')
            emit('thickness = 1\n')
            emit('file = %s\n' % relpath(cov_data_fpath, output_dir))
            emit('fill_color = vlblue\n')
            emit_track_radii(plot_idx)
            emit('</plot>\n')
            plot_idx += 1

        # GC plot
        emit('<plot>\n')
        emit('type = heatmap\n')
        emit('file = %s\n' % relpath(gc_fpath, output_dir))
        emit('color = greys-6\n')
        emit('scale_log_base = 1.5\n')
        emit('r0 = 1r - 29p\n')
        emit('r1 = 1r - 1p\n')
        emit('</plot>\n')
        emit('</plots>\n')

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath,
                                        output_dir)
    return conf_fpath, circos_legend_fpath
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger,
                                 is_reference=False):
    """Build a SAM-name -> FASTA-name mapping after checking both files agree.

    If ``sam_fpath`` exists, its header is extracted with
    ``sambamba view -H -S`` into
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header``; otherwise a
    header file already present at that path is reused.  The ``@SQ`` records
    are compared pairwise, in order, with the FASTA sequences.

    :param output_dirpath: the header file is placed in this path's parent directory.
    :param fasta_fpath: path to the FASTA file the SAM should correspond to.
    :param sam_fpath: path to the SAM file.
    :param err_path: file that sambamba's stderr is appended to.
    :param reads_fpaths: when truthy, a mismatch is logged as a warning
        instead of an error.
    :param logger: logger used for messages and passed to ``qutils.call_subprocess``.
    :param is_reference: selects the wording of the mismatch message.
    :return: the mapping, or ``None`` when neither SAM nor header file exists
        or when the two files disagree.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    # Ordered, because the entries are zipped with the FASTA by position.
    sam_chr_lengths = OrderedDict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        # Both handles used to be opened inline and never closed; the
        # with-block closes them before the header is parsed.
        with open(sam_header_fpath, 'w') as header_out, open(err_path, 'a') as err_out:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_out, stderr=err_out, logger=logger)

    # Raw strings: '\S' and '\d' are invalid escapes in ordinary literals.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            # The SAM name must be a prefix of the FASTA name once corrected.
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and \
                    sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]:
                correct_chr_names[sam_chr] = fasta_chr
            elif sam_chr_lengths[sam_chr] != fasta_chr_lengths[fasta_chr]:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break

    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath +
                           ' do not match. ' + 'QUAST will try to realign reads to ' +
                           ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath +
                         ' do not match. ' + 'Use SAM file obtained by aligning reads to ' +
                         ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers,
                cov_fpath, logger):
    """Generate the Circos input files and the ``circos.conf`` that ties them together.

    The ``create_*`` helpers write the per-track data under
    ``<output_dir>/data``; the configuration itself is written to
    ``<output_dir>/circos.conf``.

    :param ref_fpath: path to the reference FASTA file.
    :param contigs_fpaths: assembly paths, forwarded to ``parse_alignments``.
    :param contig_report_fpath_pattern: forwarded to ``parse_alignments``.
    :param output_dir: directory that receives ``circos.conf`` and the ``data`` folder.
    :param gc_fpath: forwarded to ``create_gc_plot``.
    :param features_containers: forwarded to the genes/labels/legend helpers.
    :param cov_fpath: forwarded to ``create_coverage_plot``.
    :param logger: forwarded to ``create_housekeeping_file``.
    :return: ``(conf_fpath, circos_legend_fpath)``, or ``None`` when there are
        no alignment plot files.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # The tick unit follows the magnitude of max_len.
    chrom_units = 1000
    if max_len >= 10 ** 6:
        chrom_units = 10 ** 5
    elif max_len >= 10 ** 5:
        chrom_units = 10 ** 4
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = [create_alignment_plots(assembly, ref_len, data_dir) for assembly in assemblies]
    if not alignments_fpaths:
        return None

    gc_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = [create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir)
                         for assembly in assemblies]
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, ref_len, data_dir)
    max_points = max([MAX_POINTS, gc_points, gene_points, cov_points, contig_points])
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath,
                                               data_dir)

    conf_fpath = join(output_dir, 'circos.conf')
    radius = 0.95
    plot_idx = 0

    # One gap per track; the last track of every group gets the big gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.extend([TRACK_INTERVAL] * len(feature_fpaths))
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL

    with open(conf_fpath, 'w') as out_f:
        emit = out_f.write

        def emit_track_radii(track_no):
            # r0/r1 refer to the track<N>_pos and track_width values written to this file.
            emit('r0 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos) - conf(track_width)))\n')
            emit('r1 = eval(sprintf("%.3fr",conf(track' + str(track_no) + '_pos)))\n')

        emit('<<include etc/colors_fonts_patterns.conf>>\n')
        emit('<<include %s>>\n' % relpath(ideogram_fpath, output_dir))
        emit('<<include %s>>\n' % relpath(ticks_fpath, output_dir))
        emit('karyotype = %s\n' % relpath(karyotype_fpath, output_dir))
        emit('chromosomes_units = %d\n' % chrom_units)
        emit('chromosomes_display_default = yes\n')
        emit('track_width = ' + str(TRACK_WIDTH) + '\n')

        # Track positions walk inwards from the starting radius.
        for track_no, interval in enumerate(track_intervals):
            emit('track%d_pos = %f\n' % (track_no, radius))
            radius -= TRACK_WIDTH
            radius -= interval
        emit('track%d_pos = %f\n' % (len(track_intervals), radius))

        emit('<image>\n')
        emit('dir = %s\n' % output_dir)
        emit('file = %s\n' % circos_png_fname)
        emit('png = yes\n')
        emit('svg = no\n')
        emit('radius = 1500p\n')
        emit('angle_offset = -90\n')
        emit('auto_alpha_colors = yes\n')
        emit('auto_alpha_steps = 5\n')
        emit('background = white\n')
        emit('</image>\n')

        if qconfig.is_combined_ref:
            emit('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            emit('<highlight>\n')
            emit('file = %s\n' % relpath(highlights_fpath, output_dir))
            emit('r0 = 1r - 50p\n')
            emit('r1 = 1r - 30p\n')
            emit('</highlight>\n')
            emit('</highlights>\n')

        emit(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        emit('<plots>\n')
        emit('layers_overflow = collapse\n')

        for label, track_no in track_labels:
            emit('<plot>\n')
            emit('track_idx = track%d\n' % track_no)
            emit('<<include %s>>\n' % relpath(labels_fpath, output_dir))
            emit('</plot>\n')

        # One tile plot per assembly, optionally overlaid with its mismatch histogram.
        for assembly_no, alignments_conf in enumerate(alignments_fpaths):
            emit('<plot>\n')
            emit('type = tile\n')
            emit('thickness = 50p\n')
            emit('stroke_thickness = 0\n')
            emit('layers = 1\n')
            emit('file = %s\n' % relpath(alignments_conf, output_dir))
            emit_track_radii(plot_idx)
            emit('</plot>\n')
            if mismatches_fpaths and mismatches_fpaths[assembly_no]:
                emit('<plot>\n')
                emit('type = histogram\n')
                emit('thickness = 1\n')
                emit('fill_color = vlyellow\n')
                emit('file = %s\n' % relpath(mismatches_fpaths[assembly_no], output_dir))
                emit_track_radii(plot_idx)
                emit('</plot>\n')
            plot_idx += 1

        for feature_fpath in feature_fpaths:
            # genes plot
            emit('<plot>\n')
            emit('type = heatmap\n')
            emit('file = %s\n' % relpath(feature_fpath, output_dir))
            emit('color = ylorbr-9\n')
            emit_track_radii(plot_idx)
            emit('</plot>\n')
            plot_idx += 1

        if cov_data_fpath:
            # coverage plot
            emit('<plot>\n')
            emit('type = histogram\n')
            emit('thickness = 1\n')
            emit('file = %s\n' % relpath(cov_data_fpath, output_dir))
            emit('fill_color = vlblue\n')
            emit_track_radii(plot_idx)
            emit('</plot>\n')
            plot_idx += 1

        # GC plot
        emit('<plot>\n')
        emit('type = heatmap\n')
        emit('file = %s\n' % relpath(gc_fpath, output_dir))
        emit('color = greys-6\n')
        emit('scale_log_base = 1.5\n')
        emit('r0 = 1r - 29p\n')
        emit('r1 = 1r - 1p\n')
        emit('</plot>\n')
        emit('</plots>\n')

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath,
                                        output_dir)
    return conf_fpath, circos_legend_fpath
def do(ref_fpath, contigs_fpaths, aligned_contigs_fpaths, output_dirpath, aligned_lengths_lists, aligned_stats_dirpath):
    """Compute alignment-based length statistics and the related plots.

    For every assembly in ``aligned_contigs_fpaths`` the aligned lengths are
    fed to ``N50.NG50_and_LG50`` / ``N50.E_size`` and the results are stored
    in the assembly's report (obtained via ``reporting.get``).  Statistics
    relative to the reference length are skipped when
    ``qconfig.is_combined_ref`` is set.

    :param ref_fpath: path to the reference FASTA file.
    :param contigs_fpaths: not used in this function.
    :param aligned_contigs_fpaths: paths of the assemblies to process.
    :param output_dirpath: forwarded to the HTML saver and the plotting functions.
    :param aligned_lengths_lists: one list of aligned lengths per assembly, in
        the same order as ``aligned_contigs_fpaths``.
    :param aligned_stats_dirpath: directory for the plots; created if missing.
    :return: dict with a ``'header'`` key and one key per assembly name, each
        mapped to an empty list (nothing is appended to them here).
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    for contigs_fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(contigs_fpath)] = []

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    ref_chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    reference_length = sum(ref_chr_lengths.values())
    # Total length of each assembly, in the order of aligned_contigs_fpaths.
    assembly_lengths = []
    for contigs_fpath in aligned_contigs_fpaths:
        assembly_lengths.append(
            sum(
                fastaparser.get_chr_lengths_from_fastafile(
                    contigs_fpath).values()))

    for i, (contigs_fpath, lens, assembly_len) in enumerate(
            zip(aligned_contigs_fpaths, aligned_lengths_lists,
                assembly_lengths)):
        sorted_lengths = sorted(lens, reverse=True)
        # NA*/LA* are computed against the assembly's own total length.
        na50, la50 = N50.NG50_and_LG50(sorted_lengths, assembly_len)
        na75, la75 = N50.NG50_and_LG50(sorted_lengths, assembly_len, 75)
        ea_size = N50.E_size(sorted_lengths)
        # NGA*/LGA* are computed against the reference length and are only
        # defined when the reference is not a combined one.
        if not qconfig.is_combined_ref:
            nga50, lga50 = N50.NG50_and_LG50(sorted_lengths, reference_length)
            nga75, lga75 = N50.NG50_and_LG50(sorted_lengths, reference_length,
                                             75)

        # The NGA50/LGA50 parts are omitted when they were not computed or are falsy.
        logger.info(
            ' ' + qutils.index_to_str(i) +
            qutils.label_from_fpath(contigs_fpath) +
            ', Largest alignment = ' + str(max(lens)) + ', NA50 = ' +
            str(na50) +
            (', NGA50 = ' +
             str(nga50) if not qconfig.is_combined_ref and nga50 else '') +
            ', LA50 = ' + str(la50) +
            (', LGA50 = ' +
             str(lga50) if not qconfig.is_combined_ref and lga50 else ''))
        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        report.add_field(reporting.Fields.NA50, na50)
        report.add_field(reporting.Fields.NA75, na75)
        report.add_field(reporting.Fields.LA50, la50)
        report.add_field(reporting.Fields.LA75, la75)
        report.add_field(reporting.Fields.EA_SIZE, ea_size)
        if not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NGA50, nga50)
            report.add_field(reporting.Fields.NGA75, nga75)
            report.add_field(reporting.Fields.LGA50, lga50)
            report.add_field(reporting.Fields.LGA75, lga75)

    ########################################################################
    # Largest number of aligned blocks in any assembly; compared with
    # qconfig.max_points when calling Nx_plot below.
    num_contigs = max([
        len(aligned_lengths_lists[i])
        for i in range(len(aligned_lengths_lists))
    ])

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
            'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    # NOTE(review): the plotter calls below are assumed to run regardless of
    # qconfig.draw_plots — confirm against the original indentation.
    logger.info("Making plots...")
    plotter.Nx_plot(output_dirpath, num_contigs > qconfig.max_points,
                    aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths)
    # The E_SIZE_MAX field of the first assembly's report is used for the EAxmax plot.
    ea_x_max = reporting.get(aligned_contigs_fpaths[0]).get_field(
        reporting.Fields.E_SIZE_MAX)
    plotter.EAxmax_plot(output_dirpath, False, aligned_contigs_fpaths,
                        aligned_stats_dirpath + '/EAxmax_plot', 'EAxmax',
                        ea_x_max)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(
            output_dirpath, num_contigs > qconfig.max_points,
            aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
            [reference_length for i in range(len(aligned_contigs_fpaths))])

    logger.main_info('Done.')
    return report_dict
def frc_plot(results_dir, ref_fpath, contigs_fpaths, contigs_aligned_lengths, features_in_contigs_by_file,
             plot_fpath, title):
    """Build feature-response curves and dump the per-contig data as TSV.

    For every assembly that has both aligned lengths and feature counts, the
    ``(length, features)`` pairs with a positive length are sorted in
    descending order and accumulated into a curve of cumulative length (x)
    against cumulative feature count (y).  The unsorted pairs are also written
    to ``<results_dir>/metaquast_frc.tsv``.

    :param results_dir: directory receiving the TSV file; also forwarded to the HTML saver.
    :param ref_fpath: unused; kept so existing callers keep working.
    :param contigs_fpaths: assembly paths to process, in plotting order.
    :param contigs_aligned_lengths: mapping assembly path -> list of aligned lengths.
    :param features_in_contigs_by_file: mapping assembly path -> list of feature
        counts, parallel to the aligned lengths.
    :param plot_fpath: forwarded to ``create_plot``.
    :param title: feature type; used in log messages, the TSV, the HTML
        coordinate name and the plot title.
    """
    if can_draw_plots:
        logger.info(' Drawing ' + title + ' FRCurve plot...')

    plots = []
    max_y = 0
    max_x = 0
    json_vals_x = []  # coordinates for Nx-like plots in HTML-report
    json_vals_y = []
    aligned_contigs_fpaths = []

    # create TSV file for metaquast features; the with-block guarantees the
    # file is flushed and closed (the handle used to be leaked).
    with open(os.path.join(results_dir, "metaquast_frc.tsv"), 'w') as outf:
        outf.write("Assembly\tContig_ID\tContig_Length\tFeature_Count\tFeature_Type\n")

        for contigs_fpath in contigs_fpaths:
            aligned_lengths = contigs_aligned_lengths[contigs_fpath]
            feature_in_contigs = features_in_contigs_by_file[contigs_fpath]
            if not aligned_lengths or not feature_in_contigs:
                continue

            aligned_contigs_fpaths.append(contigs_fpath)
            # Take the label from the path itself: a counter that is only
            # advanced for processed assemblies picks the wrong label as soon
            # as an earlier assembly is skipped.
            assembly_label = label_from_fpath(contigs_fpath)

            # create unsorted TSV; Contig_ID is the 1-based position in the
            # input lists, zero-length entries keep their slot but are not reported.
            length_feature_pairs = []
            for ctg_idx, (l, feature) in enumerate(zip(aligned_lengths, feature_in_contigs), 1):
                if l > 0:
                    outf.write("%s\t%s\t%d\t%d\t%s\n" % (assembly_label, ctg_idx, l, feature, title))
                    length_feature_pairs.append((l, feature))

            # Longest first (ties broken by the higher feature count).
            length_feature_pairs.sort(reverse=True)

            x_vals = [0]
            y_vals = [0]
            cumulative_len = 0
            cumulative_features = 0
            for l, feature in length_feature_pairs:
                cumulative_len += l
                cumulative_features += feature
                y_vals.append(cumulative_features)
                x_vals.append(cumulative_len)

            # NOTE(review): feature counts are stored as the HTML x values and
            # lengths as the y values (the reverse of the drawn plot); kept as
            # in the original — confirm this is what the HTML report expects.
            json_vals_x.append(y_vals)
            json_vals_y.append(x_vals)
            max_y = max(max_y, max(y_vals))
            max_x = max(max_x, max(x_vals))

            color, ls = get_color_and_ls(contigs_fpath)
            plots.append(Plot(x_vals, y_vals, color, ls))

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_coord(results_dir, json_vals_x, json_vals_y, 'coord' + title, aligned_contigs_fpaths)

    if can_draw_plots:
        title = 'FRCurve (' + title + ')'
        legend_list = [label_from_fpath(fpath) for fpath in aligned_contigs_fpaths]
        create_plot(plot_fpath, title, plots, legend_list,
                    x_label='Cumulative length', y_label='Cumulative features',
                    y_limit=[0, max_y], x_limit=[0, max_x])
def frc_plot(results_dir, ref_fpath, contigs_fpaths, contigs_aligned_lengths, features_in_contigs_by_file, plot_fpath, title):
    """Draw a feature-response curve: genome coverage (%) per feature budget.

    For each assembly the contigs without features always count towards the
    covered length.  The remaining contigs are ranked by their
    length-per-feature ratio and, for every feature budget from 0 up to the
    largest per-assembly feature total, greedily added while the budget
    allows.  Each budget contributes one horizontal step to the curve.

    Args:
        results_dir: directory that receives the HTML-report coordinates.
        ref_fpath: reference FASTA; its total length is the 100 % mark.
        contigs_fpaths: assembly file paths, in plotting order.
        contigs_aligned_lengths: mapping from assembly path to aligned
            lengths, one per contig.
        features_in_contigs_by_file: mapping from assembly path to feature
            counts, parallel to the aligned lengths.
        plot_fpath: output path handed to ``create_plot``.
        title: feature type; used in log messages and the plot title.
    """
    if can_draw_plots:
        logger.info(' Drawing ' + title + ' FRCurve plot...')

    plots = []
    max_y = 0
    chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    ref_length = sum(chr_lengths.values())
    json_vals_x = []  # coordinates for Nx-like plots in HTML-report
    json_vals_y = []
    max_features = 1 + max(sum(counts) for counts in features_in_contigs_by_file.values())

    aligned_contigs_fpaths = []
    for contigs_fpath in contigs_fpaths:
        aligned_lengths = contigs_aligned_lengths[contigs_fpath]
        feature_in_contigs = features_in_contigs_by_file[contigs_fpath]
        if not (aligned_lengths and feature_in_contigs):
            continue
        aligned_contigs_fpaths.append(contigs_fpath)

        # Split the contigs: feature-free length is always counted, the rest
        # competes for the feature budget.
        feature_free_len = 0
        ranked = []
        for contig_len, n_features in zip(aligned_lengths, feature_in_contigs):
            if n_features != 0:
                ranked.append((contig_len, n_features))
            else:
                feature_free_len += contig_len
        # sort by len/features ratio, best ratio first
        ranked.sort(key=lambda pair: pair[0] * 1.0 / pair[1], reverse=True)

        x_vals = []
        y_vals = []
        for budget in range(max_features):
            used = 0
            covered_len = feature_free_len
            for contig_len, n_features in ranked:
                if used + n_features > budget:
                    continue
                used += n_features
                covered_len += contig_len
                if used == budget:
                    break
            coverage = covered_len * 100.0 / ref_length
            # one horizontal step spanning [budget, budget + 1]
            x_vals.extend([budget, budget + 1])
            y_vals.extend([coverage, coverage])

        json_vals_x.append(x_vals)
        json_vals_y.append(y_vals)
        max_y = max(max_y, max(y_vals))

        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_coord(results_dir, json_vals_x, json_vals_y, 'coord' + title, aligned_contigs_fpaths)

    if can_draw_plots:
        title = 'FRCurve (' + title + ')'
        legend_list = [label_from_fpath(fpath) for fpath in aligned_contigs_fpaths]
        create_plot(plot_fpath, title, plots, legend_list,
                    x_label='Feature space', y_label='Genome coverage (%)',
                    x_limit=[0, max_features], y_limit=[0, max(100, max_y)])
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath, title):
    """Draw the cumulative-length plot for the assemblies (and the reference).

    Each assembly is drawn as cumulative length versus contig index, with
    contigs taken from longest to shortest.  When ``reference`` is given, the
    cumulative length of its sequences is drawn as well and extended
    horizontally to the right edge of the plot.

    The function works on both Python 2 and Python 3: it uses the built-in
    ``zip`` and materialises ``range``/label sequences as lists before
    appending to them.

    Args:
        reference: path to the reference FASTA, or a falsy value for none.
        contigs_fpaths: assembly file paths, in plotting order.
        lists_of_lengths: contig lengths per assembly, parallel to
            ``contigs_fpaths``.
        plot_fpath: output path without extension; ``qconfig.plot_extension``
            is appended.
        title: plot title, shown only when ``with_title`` is set.
    """
    if not can_draw_plots:
        return

    logger.info(' Drawing cumulative plot...')
    import matplotlib.pyplot
    import matplotlib.ticker

    figure = matplotlib.pyplot.figure()
    matplotlib.pyplot.rc('font', **font)
    max_x = 0
    max_y = 0

    for contigs_fpath, lengths in zip(contigs_fpaths, lists_of_lengths):
        vals_length = [0]
        for contig_len in sorted(lengths, reverse=True):
            vals_length.append(vals_length[-1] + contig_len)
        vals_contig_index = list(range(len(vals_length)))
        if vals_contig_index:
            max_x = max(vals_contig_index[-1], max_x)
        max_y = max(max_y, vals_length[-1])
        color, ls = get_color_and_ls(contigs_fpath)
        matplotlib.pyplot.plot(vals_contig_index, vals_length, color=color, lw=line_width, ls=ls)

    if reference:
        y_vals = []
        for chr_len in sorted(fastaparser.get_chr_lengths_from_fastafile(reference).values(), reverse=True):
            if y_vals:
                y_vals.append(y_vals[-1] + chr_len)
            else:
                y_vals = [chr_len]
        # Unlike the assembly curves, the reference curve has no leading zero:
        # its point at index 0 already equals the longest sequence length.
        # The list() is required so the extra point can be appended below.
        x_vals = list(range(len(y_vals)))
        # extend reference curve to the max X-axis point
        reference_length = y_vals[-1]
        max_x = max(max_x, x_vals[-1])
        max_y = max(max_y, reference_length)
        y_vals.append(reference_length)
        x_vals.append(max_x)
        matplotlib.pyplot.plot(x_vals, y_vals, color=reference_color, lw=line_width, ls=reference_ls)

    if with_title:
        matplotlib.pyplot.title(title)
    matplotlib.pyplot.grid(with_grid)
    ax = matplotlib.pyplot.gca()
    # Shrink current axis's height by 20% on the bottom
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.2, box.width, box.height * 0.8])

    # A real list is needed here: the 'Reference' label is appended in place.
    legend_list = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference:
        legend_list += ['Reference']

    # Put a legend below current axis.  Deliberately best-effort: the original
    # code notes that this call may fail on old matplotlib releases
    # ("matplotlib <= 2009-12-09"), in which case the plot goes out without
    # a legend.
    try:
        ax.legend(legend_list, loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True,
                  shadow=True, ncol=n_columns if n_columns < 3 else 3)
    except Exception:
        pass

    ylabel = 'Cumulative length '
    ylabel, mkfunc = y_formatter(ylabel, max_y)
    matplotlib.pyplot.xlabel('Contig index', fontsize=axes_fontsize)
    matplotlib.pyplot.ylabel(ylabel, fontsize=axes_fontsize)

    mkformatter = matplotlib.ticker.FuncFormatter(mkfunc)
    ax.yaxis.set_major_formatter(mkformatter)

    xLocator, yLocator = get_locators()
    ax.yaxis.set_major_locator(yLocator)
    ax.xaxis.set_major_locator(xLocator)
    if logarithmic_x_scale:
        ax.set_xscale('log')

    plot_fpath += '.' + qconfig.plot_extension
    matplotlib.pyplot.savefig(plot_fpath, bbox_inches='tight')
    logger.info(' saved to ' + plot_fpath)
    pdf_plots_figures.append(figure)
    matplotlib.pyplot.close()
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic assembly statistics, fill the reports and draw basic plots.

    Reads every assembly once to collect contig lengths, the number of ``N``
    characters and (when contig names carry a ``_cov_<number>`` tag) a
    per-coverage length histogram.  It then derives N50/L50-style metrics,
    stores them in the per-assembly reports, saves the data needed by the HTML
    report and draws the Nx/NGx, cumulative, GC-content and coverage plots.

    Args:
        ref_fpath: path to the reference FASTA, or a falsy value when there is
            none (``qconfig.estimated_reference_size`` is used then, if set).
        contigs_fpaths: assembly file paths.
        output_dirpath: directory for plots and GC tracks; created if missing.
        results_dir: directory handed to the HTML savers and ``Nx_plot``.

    Returns:
        A tuple ``(icarus_gc_fpath, circos_gc_fpath)``; each element is
        ``None`` when the corresponding GC track was not written.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info(' Reference genome:')
        # The conditional must be parenthesised: without the parentheses it
        # binds looser than '+', so the whole message collapsed to
        # 'undefined' whenever the GC content was unknown.
        logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                # Grow the histogram in place so that index `cov` exists.
                if len(coverage_hist) <= cov:
                    coverage_hist += [0] * (cov - len(coverage_hist) + 1)
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    # When there are too many contigs for the HTML report, merge every
    # `multiplicator` consecutive contigs into one point.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
                                  for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)]
                                 for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths)
                                 else sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + \
                                [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last point of every assembly collects whatever is left over.
        for num_list, corr_lengths in enumerate(corr_lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # Undefined (None) for an assembly with zero total length.
        ns_per_100_kbp = None
        if total_length != 0:
            ns_per_100_kbp = float(number_of_Ns) * 100000.0 / float(total_length)

        # As in the reference message above, the conditional is parenthesised
        # so that only the last value, not the whole message, becomes
        # 'undefined'.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100_kbp if ns_per_100_kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % ns_per_100_kbp if ns_per_100_kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC,
                                 ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # NOTE: this is an alias, not a copy -- the reference distribution is
    # appended to the very same list that holds the assembly distributions.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath, aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA-style statistics of aligned blocks and draw their plots.

    For every assembly the NA50/NA75/LA50/LA75 values are computed against the
    assembly length and, unless ``qconfig.is_combined_ref`` is set, the
    NGA50/NGA75/LGA50/LGA75 values against the reference length.  The values
    are logged and stored in the per-assembly report, the assembly lengths are
    saved for the JSON/HTML outputs, and the cumulative and NAx/NGAx plots are
    produced.

    Args:
        ref_fpath: path to the reference FASTA.
        aligned_contigs_fpaths: assembly file paths.
        output_dirpath: directory handed to the HTML saver and ``Nx_plot``.
        json_output_dirpath: directory for JSON output; skipped when falsy.
        aligned_lengths_lists: lengths of aligned blocks per assembly,
            parallel to ``aligned_contigs_fpaths``.
        aligned_stats_dirpath: directory for the plots; created if missing.

    Returns:
        A dict with an empty list under ``'header'`` and under the name of
        every assembly.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    ########################################################################
    report_dict = {'header': []}
    report_dict.update((qutils.name_from_fpath(fpath), []) for fpath in aligned_contigs_fpaths)

    ########################################################################
    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    assembly_lengths = [sum(fastaparser.get_chr_lengths_from_fastafile(fpath).values())
                        for fpath in aligned_contigs_fpaths]

    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists, assembly_lengths)
    for i, (contigs_fpath, lens, assembly_len) in enumerate(per_assembly):
        # Metrics relative to the assembly length.
        na50 = N50.NG50(lens, assembly_len)
        na75 = N50.NG50(lens, assembly_len, 75)
        la50 = N50.LG50(lens, assembly_len)
        la75 = N50.LG50(lens, assembly_len, 75)

        # Metrics relative to the reference length; not available for a
        # combined reference.
        nga_msg = ''
        lga_msg = ''
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(lens, reference_length)
            nga75 = N50.NG50(lens, reference_length, 75)
            lga50 = N50.LG50(lens, reference_length)
            lga75 = N50.LG50(lens, reference_length, 75)
            if nga50:
                nga_msg = ', NGA50 = ' + str(nga50)
            if lga50:
                lga_msg = ', LGA50 = ' + str(lga50)

        largest_alignment = max(lens)
        logger.info(' ' + qutils.index_to_str(i) + qutils.label_from_fpath(contigs_fpath) +
                    ', Largest alignment = ' + str(largest_alignment) +
                    ', NA50 = ' + str(na50) + nga_msg +
                    ', LA50 = ' + str(la50) + lga_msg)

        report = reporting.get(contigs_fpath)
        report.add_field(reporting.Fields.LARGALIGN, max(lens))
        report.add_field(reporting.Fields.TOTAL_ALIGNED_LEN, sum(lens))
        for field, value in ((reporting.Fields.NA50, na50),
                             (reporting.Fields.NA75, na75),
                             (reporting.Fields.LA50, la50),
                             (reporting.Fields.LA75, la75)):
            report.add_field(field, value)
        if not qconfig.is_combined_ref:
            for field, value in ((reporting.Fields.NGA50, nga50),
                                 (reporting.Fields.NGA75, nga75),
                                 (reporting.Fields.LGA50, lga50),
                                 (reporting.Fields.LGA75, lga75)):
                report.add_field(field, value)

    ########################################################################
    num_contigs = max([len(lens) for lens in aligned_lengths_lists])
    too_many_points = num_contigs > qconfig.max_points

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath, aligned_contigs_fpaths, assembly_lengths)

    if qconfig.draw_plots:
        # Drawing cumulative plot (aligned contigs)...
        plotter.cumulative_plot(ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
                                os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
                                'Cumulative length (aligned contigs)')

    # Drawing NAx and NGAx plots...
    plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx', assembly_lengths,
                    json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        reference_lengths = [reference_length for _ in range(len(aligned_contigs_fpaths))]
        plotter.Nx_plot(output_dirpath, too_many_points, aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx', reference_lengths,
                        json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic assembly statistics, fill the reports and draw basic plots.

    Reads every assembly once to collect contig lengths, the number of ``N``
    characters and (when contig names carry a ``_cov_<number>`` tag) a
    per-coverage length histogram.  It then derives N50/L50-style metrics,
    stores them in the per-assembly reports, saves the data needed by the HTML
    report and draws the Nx/NGx, cumulative, GC-content and coverage plots.

    Args:
        ref_fpath: path to the reference FASTA, or a falsy value when there is
            none (``qconfig.estimated_reference_size`` is used then, if set).
        contigs_fpaths: assembly file paths.
        output_dirpath: directory for the plots; created if missing.
        results_dir: directory handed to the HTML savers and ``Nx_plot``.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)

        logger.info(' Reference genome:')
        # The conditional must be parenthesised: without the parentheses it
        # binds looser than '+', so the whole message collapsed to
        # 'undefined' whenever the GC content was unknown.
        logger.info(' ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(' Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info(' Estimated reference length = ' + str(reference_length))

    logger.info(' Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        coverage_hist = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info(' ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                # Grow the histogram in place so that index `cov` exists.
                if len(coverage_hist) <= cov:
                    coverage_hist += [0] * (cov - len(coverage_hist) + 1)
                coverage_hist[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    # When there are too many contigs for the HTML report, merge every
    # `multiplicator` consecutive contigs into one point.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
                                  for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)]
                                 for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths)
                                 else sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + \
                                [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        # The last point of every assembly collects whatever is left over.
        for num_list, corr_lengths in enumerate(corr_lists_of_lengths):
            last_index = len(corr_lengths)
            corr_lengths.append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info(' Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # Undefined (None) for an assembly with zero total length.
        ns_per_100_kbp = None
        if total_length != 0:
            ns_per_100_kbp = float(number_of_Ns) * 100000.0 / float(total_length)

        # As in the reference message above, the conditional is parenthesised
        # so that only the last value, not the whole message, becomes
        # 'undefined'.
        logger.info(' ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100_kbp if ns_per_100_kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT,
                             ('%.2f' % ns_per_100_kbp if ns_per_100_kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC,
                                 ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # NOTE: this is an alias, not a copy -- the reference distribution is
    # appended to the very same list that holds the assembly distributions.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath, title):
    """Plot cumulative length against contig index for every assembly.

    Contigs of each assembly are taken from longest to shortest.  If
    ``reference`` is given, the cumulative length of its sequences is plotted
    too and stretched horizontally up to the largest x value.  The figure is
    saved to ``plot_fpath`` plus ``qconfig.plot_extension`` and registered in
    ``pdf_plots_figures``.  Nothing happens when plots cannot be drawn.

    Args:
        reference: path to the reference FASTA, or a falsy value for none.
        contigs_fpaths: assembly file paths, in plotting order.
        lists_of_lengths: contig lengths per assembly, parallel to
            ``contigs_fpaths``.
        plot_fpath: output path without extension.
        title: plot title, shown only when ``with_title`` is set.
    """
    if not can_draw_plots:
        return

    logger.info(' Drawing cumulative plot...')
    import matplotlib.pyplot
    import matplotlib.ticker
    plt = matplotlib.pyplot

    figure = plt.figure()
    plt.rc('font', **font)

    max_x = 0
    max_y = 0
    for contigs_fpath, lengths in zip(contigs_fpaths, lists_of_lengths):
        # Running totals, starting from the origin.
        vals_length = [0]
        for contig_len in sorted(lengths, reverse=True):
            vals_length.append(vals_length[-1] + contig_len)
        vals_contig_index = list(range(len(vals_length)))

        if vals_contig_index:
            max_x = max(vals_contig_index[-1], max_x)
        max_y = max(max_y, vals_length[-1])

        color, ls = get_color_and_ls(contigs_fpath)
        plt.plot(vals_contig_index, vals_length, color=color, lw=line_width, ls=ls)

    if reference:
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(reference).values()
        y_vals = []
        for chr_len in sorted(chr_lengths, reverse=True):
            if not y_vals:
                y_vals = [chr_len]
            else:
                y_vals.append(y_vals[-1] + chr_len)
        # No leading zero here: the point at index 0 is the longest sequence.
        x_vals = list(range(len(y_vals)))

        # extend reference curve to the max X-axis point
        reference_length = y_vals[-1]
        max_x = max(max_x, x_vals[-1])
        max_y = max(max_y, reference_length)
        x_vals.append(max_x)
        y_vals.append(reference_length)
        plt.plot(x_vals, y_vals, color=reference_color, lw=line_width, ls=reference_ls)

    if with_title:
        plt.title(title)
    plt.grid(with_grid)

    ax = plt.gca()
    # Shrink current axis's height by 20% on the bottom
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.2, box.width, box.height * 0.8])

    legend_list = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference:
        legend_list += ['Reference']

    # Put a legend below current axis; failures are ignored on purpose
    # (original note: "for matplotlib <= 2009-12-09").
    if n_columns < 3:
        legend_columns = n_columns
    else:
        legend_columns = 3
    try:
        ax.legend(legend_list, loc='upper center', bbox_to_anchor=(0.5, -0.1),
                  fancybox=True, shadow=True, ncol=legend_columns)
    except Exception:
        pass

    ylabel, mkfunc = y_formatter('Cumulative length ', max_y)
    plt.xlabel('Contig index', fontsize=axes_fontsize)
    plt.ylabel(ylabel, fontsize=axes_fontsize)

    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(mkfunc))

    xLocator, yLocator = get_locators()
    ax.yaxis.set_major_locator(yLocator)
    ax.xaxis.set_major_locator(xLocator)
    if logarithmic_x_scale:
        ax.set_xscale('log')

    plot_fpath += '.' + qconfig.plot_extension
    plt.savefig(plot_fpath, bbox_inches='tight')
    logger.info(' saved to ' + plot_fpath)
    pdf_plots_figures.append(figure)
    plt.close()