Example #1
0
def get_chr_len_fpath(ref_fpath):
    """Return the path of the sequence-length table stored next to ``ref_fpath``.

    The table lives at ``ref_fpath + '.fai'``.  When ``is_non_empty_file``
    reports false for that path, the table is (re)generated with one line per
    sequence: name, a tab, and the length, as returned by
    ``fastaparser.get_chr_lengths_from_fastafile``.

    :param ref_fpath: path to the reference FASTA file.
    :return: path to the ``.fai`` file.
    """
    chr_len_fpath = ref_fpath + '.fai'
    if not is_non_empty_file(chr_len_fpath):
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
        with open(chr_len_fpath, 'w') as out_f:
            # .items() exists on both Python 2 and 3; .iteritems() is
            # Python 2 only and raises AttributeError on Python 3.
            for chr_name, chr_len in chr_lengths.items():
                out_f.write(chr_name + '\t' + str(chr_len) + '\n')
    return chr_len_fpath
Example #2
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath,
                  tool_dirpath, tmp_dirpath, gmhmm_p_function, prokaryote,
                  num_threads):
    """Run gene prediction on one assembly and summarise the result.

    ``gmhmm_p_function`` performs the actual prediction; its stderr goes to
    ``<label>_genemark.stderr`` inside ``out_dirpath``.  When genes are found
    they are written to a GFF file (and to FASTA if ``OUTPUT_FASTA`` is set).

    :param index: assembly index, used for log prefixes and passed to the
        prediction function.
    :param contigs_fpath: FASTA file with the assembly contigs.
    :param gene_lengths: thresholds; one count is produced per threshold.
    :param out_dirpath: directory for the stderr, GFF and FASTA outputs.
    :param tool_dirpath: forwarded to ``gmhmm_p_function``.
    :param tmp_dirpath: forwarded to ``gmhmm_p_function``.
    :param gmhmm_p_function: callable that returns the predicted genes.
    :param prokaryote: forwarded to ``add_genes_to_gff``.
    :param num_threads: forwarded to ``gmhmm_p_function``.
    :return: ``(genes, unique_count, full_cnt, partial_cnt)``; the last three
        are ``None`` when no genes were predicted.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(''.join(['  ', qutils.index_to_str(index), assembly_label]))

    err_fname = corr_assembly_label + '_genemark.stderr'
    err_fpath = os.path.join(out_dirpath, err_fname)

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index,
                             tmp_dirpath, num_threads)
    contig_lengths = get_chr_lengths_from_fastafile(contigs_fpath)

    # Nothing predicted: no statistics to compute.
    if not genes:
        return genes, None, None, None

    # A gene is "full" when it touches neither end of its contig.
    for gene in genes:
        if gene.start > 1:
            gene.is_full = gene.end < contig_lengths[gene.contig]
        else:
            gene.is_full = False

    tool_name = "genemark"
    gz_suffix = '' if qconfig.no_gzip else '.gz'
    gff_fname = corr_assembly_label + '_' + tool_name + '_genes.gff' + gz_suffix
    out_gff_fpath = os.path.join(out_dirpath, gff_fname)
    add_genes_to_gff(genes, out_gff_fpath, prokaryote)
    if OUTPUT_FASTA:
        fasta_fname = corr_assembly_label + '_' + tool_name + '_genes.fasta'
        out_fasta_fpath = os.path.join(out_dirpath, fasta_fname)
        add_genes_to_fasta(genes, out_fasta_fpath)

    # Spans are computed once per gene, then counted against each threshold.
    full_spans = [gene.end - gene.start for gene in genes if gene.is_full]
    partial_spans = [gene.end - gene.start for gene in genes
                     if not gene.is_full]
    full_cnt = []
    partial_cnt = []
    for threshold in gene_lengths:
        full_cnt.append(sum(span >= threshold for span in full_spans))
        partial_cnt.append(sum(span >= threshold for span in partial_spans))

    # Genes are identified by sequence when available, otherwise by name.
    gene_ids = set()
    for gene in genes:
        gene_ids.add(gene.seq if gene.seq else gene.name)
    unique_count = len(gene_ids)
    total_count = len(genes)

    logger.info(''.join(['  ', qutils.index_to_str(index), '  Genes = ',
                         str(unique_count), ' unique, ', str(total_count),
                         ' total']))
    logger.info(''.join(['  ', qutils.index_to_str(index),
                         '  Predicted genes (GFF): ', out_gff_fpath]))

    return genes, unique_count, full_cnt, partial_cnt
Example #3
0
def get_lengths_from_fasta(contigs_fpath, label):
    """Return the sequence lengths of a FASTA file, or ``None`` to skip it.

    The file is skipped (with a warning mentioning ``label``) when the summed
    length of sequences of at least ``qconfig.min_contig`` bp is zero.

    :param contigs_fpath: FASTA file to measure.
    :param label: name used for the file in the warning message.
    :return: list of sequence lengths, or ``None`` if the file is skipped.
    """
    chr_lengths = fastaparser.get_chr_lengths_from_fastafile(contigs_fpath)
    lengths = chr_lengths.values()

    usable_total = sum(length for length in lengths
                       if length >= qconfig.min_contig)
    if usable_total:
        return list(lengths)

    logger.warning("Skipping %s because it doesn't contain contigs >= %d bp."
                   % (label, qconfig.min_contig))
    return None
Example #4
0
def get_lengths_from_fasta(contigs_fpath, label):
    """Collect the sequence lengths of ``contigs_fpath``.

    If the sequences that are at least ``qconfig.min_contig`` bp long add up
    to zero, a warning naming ``label`` is logged and ``None`` is returned.

    :param contigs_fpath: FASTA file to measure.
    :param label: name used for the file in the warning message.
    :return: list of sequence lengths, or ``None`` if the file is skipped.
    """
    lengths = fastaparser.get_chr_lengths_from_fastafile(
        contigs_fpath).values()

    total_above_min = 0
    for length in lengths:
        if length >= qconfig.min_contig:
            total_above_min += length

    if total_above_min:
        return list(lengths)

    logger.warning("Skipping %s because it doesn't contain contigs >= %d bp."
                   % (label, qconfig.min_contig))
    return None
Example #5
0
def get_chr_len_fpath(ref_fpath, correct_chr_names=None):
    """Return the path of the sequence-length table stored next to ``ref_fpath``.

    The table lives at ``ref_fpath + '.fai'`` and is generated only when
    ``is_non_empty_file`` reports false for it.  Each line holds a sequence
    name, a tab, and the sequence length.

    :param ref_fpath: path to the reference FASTA file.
    :param correct_chr_names: optional mapping.  When given, each FASTA
        sequence name is looked up among the mapping's *values* and the
        corresponding *key* is written instead of the FASTA name.
    :return: path to the ``.fai`` file.
    """
    chr_len_fpath = ref_fpath + '.fai'

    # Invert the mapping so it can be indexed by the names found in the FASTA.
    if correct_chr_names:
        raw_chr_names = {raw_name: correct_name
                         for correct_name, raw_name
                         in correct_chr_names.items()}
    else:
        raw_chr_names = None

    if is_non_empty_file(chr_len_fpath):
        return chr_len_fpath

    chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
    with open(chr_len_fpath, 'w') as out_f:
        for chr_name, chr_len in chr_lengths.items():
            if correct_chr_names:
                chr_name = raw_chr_names[chr_name]
            out_f.write(chr_name + '\t' + str(chr_len) + '\n')
    return chr_len_fpath
Example #6
0
def get_chr_len_fpath(ref_fpath, correct_chr_names=None):
    """Ensure ``ref_fpath + '.fai'`` exists and return its path.

    The file is written only when ``is_non_empty_file`` reports false for it;
    every line is ``name``, a tab, and the sequence length.

    :param ref_fpath: path to the reference FASTA file.
    :param correct_chr_names: optional mapping.  When given, the name written
        for a sequence is the mapping key whose value equals the FASTA name.
    :return: path to the ``.fai`` file.
    """
    chr_len_fpath = ref_fpath + '.fai'

    # Reverse lookup table: value of correct_chr_names -> its key.
    raw_chr_names = None
    if correct_chr_names:
        raw_chr_names = {}
        for correct_name, raw_name in correct_chr_names.items():
            raw_chr_names[raw_name] = correct_name

    if not is_non_empty_file(chr_len_fpath):
        chr_lengths = fastaparser.get_chr_lengths_from_fastafile(ref_fpath)
        with open(chr_len_fpath, 'w') as out_f:
            for chr_name, chr_len in chr_lengths.items():
                if correct_chr_names:
                    out_name = raw_chr_names[chr_name]
                else:
                    out_name = chr_name
                out_f.write(out_name + '\t' + str(chr_len) + '\n')
    return chr_len_fpath
Example #7
0
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath,
                                 err_path, reads_fpaths):
    """Map sequence names used in a SAM file to the reference FASTA names.

    The SAM header is dumped with ``sambamba view -H -S`` into
    ``<output_dirpath>/<basename(sam_fpath)>.header``.  Its ``@SQ`` records
    are then compared pairwise, in iteration order, with the sequences of
    ``ref_fpath``.

    :param output_dirpath: directory that receives the ``.header`` file.
    :param ref_fpath: reference FASTA file.
    :param sam_fpath: SAM file whose header is inspected.
    :param err_path: file that receives the subprocess stderr (opened with
        mode ``'w'``, so previous content is discarded).
    :param reads_fpaths: when truthy, a mismatch is logged as a warning;
        otherwise it is logged as an error.
    :return: dict mapping each SAM sequence name to the reference name, or
        ``None`` when the number of sequences, a length, or a name differs.
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = os.path.join(output_dirpath,
                                    os.path.basename(sam_fpath) + '.header')
    # Context managers guarantee both handles are closed (and the header is
    # fully on disk) before the header file is read back below.
    with open(sam_header_fpath, 'w') as header_out, \
            open(err_path, 'w') as err_out:
        qutils.call_subprocess(
            [sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
            stdout=header_out,
            stderr=err_out,
            logger=logger)
    # Raw strings: '\S' and '\d' are not valid escapes in a plain literal.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # Pairs are formed positionally, so this relies on both mappings
        # iterating in file order.
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(),
                                    sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = ref_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(
                inconsistency + ' in reference and SAM file do not match. ' +
                'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(
                inconsistency + ' in reference and SAM file do not match. ' +
                'Use SAM file obtained by aligning reads to the reference genome.'
            )
        return None
    return correct_chr_names
Example #8
0
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath,
                    title):
    """Draw the cumulative-length plot for a set of assemblies.

    For every assembly the lengths are sorted in decreasing order and
    accumulated; point ``i`` of a curve is the total length of the ``i``
    longest sequences.  If ``reference`` is given, its curve is added and
    extended horizontally to the largest X value.  Does nothing when
    ``can_draw_plots`` is false.

    :param reference: reference FASTA path, or a falsy value for none.
    :param contigs_fpaths: assembly paths (used for colours and legend).
    :param lists_of_lengths: one iterable of lengths per assembly.
    :param plot_fpath: forwarded to ``create_plot``.
    :param title: forwarded to ``create_plot``.
    """
    if not can_draw_plots:
        return

    logger.info('  Drawing cumulative plot...')

    def running_totals(lengths):
        # [0, l1, l1 + l2, ...] over lengths sorted from longest to shortest.
        totals = [0]
        for length in sorted(lengths, reverse=True):
            totals.append(totals[-1] + length)
        return totals

    plots = []
    max_x = 0

    for contigs_fpath, lengths in zip(contigs_fpaths, lists_of_lengths):
        y_vals = running_totals(lengths)
        # y_vals always holds at least the leading 0, so x_vals is non-empty.
        x_vals = list(range(len(y_vals)))
        max_x = max(x_vals[-1], max_x)
        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))

    if reference:
        ref_lengths = fastaparser.get_chr_lengths_from_fastafile(
            reference).values()
        y_vals = running_totals(ref_lengths)
        x_vals = list(range(len(y_vals)))
        reference_length = y_vals[-1]
        max_x = max(max_x, x_vals[-1])
        # Flat tail so the reference curve reaches the right edge of the plot.
        y_vals.append(reference_length)
        x_vals.append(max_x)
        plots.append(Plot(x_vals, y_vals, reference_color, reference_ls))

    legend_list = []
    for fpath in contigs_fpaths:
        legend_list.append(label_from_fpath(fpath))
    if reference:
        legend_list.append('Reference')

    create_plot(plot_fpath, title, plots, legend_list,
                x_label='Contig index', y_label='Cumulative length',
                x_limit=[0, max_x])
Example #9
0
def get_correct_names_for_chroms(output_dirpath, fasta_fpath, sam_fpath, err_path, reads_fpaths, logger, is_reference=False):
    """Map sequence names used in a SAM file to the names in a FASTA file.

    The SAM header is expected at
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header``.  It is
    regenerated with ``sambamba view -H -S`` whenever ``sam_fpath`` exists;
    otherwise an already existing header file is reused.  Its ``@SQ`` records
    are compared pairwise, in iteration order, with the FASTA sequences.

    :param output_dirpath: path whose parent directory holds the header file.
    :param fasta_fpath: FASTA file the SAM file should correspond to.
    :param sam_fpath: SAM file whose header is inspected.
    :param err_path: file to which the subprocess stderr is appended.
    :param reads_fpaths: when truthy, a mismatch is logged as a warning;
        otherwise it is logged as an error.
    :param logger: logger used for messages and passed to the subprocess call.
    :param is_reference: selects the wording of the log message.
    :return: dict mapping each SAM sequence name to the FASTA name, or
        ``None`` when neither the SAM file nor its header exists, or when the
        number of sequences, a length, or a name differs.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = join(dirname(output_dirpath), basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        # Context managers guarantee both handles are closed (and the header
        # is fully on disk) before the header file is read back below.
        with open(sam_header_fpath, 'w') as header_out, open(err_path, 'a') as err_out:
            qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                                   stdout=header_out, stderr=err_out, logger=logger)
    # Raw strings: '\S' and '\d' are not valid escapes in a plain literal.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # Pairs are formed positionally, so this relies on both mappings
        # iterating in file order.
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == fasta_chr_lengths[fasta_chr]
            if correct_name(sam_chr) == fasta_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = fasta_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                           'QUAST will try to realign reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(inconsistency + ' in ' + fasta_fpath + ' and corresponding SAM file ' + sam_fpath + ' do not match. ' +
                         'Use SAM file obtained by aligning reads to ' + ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
Example #10
0
def parse_kmer_coords(kmers_coords, ref_fpath, kmer_fraction):
    """Pick an evenly spaced subset of k-mer ids from a coordinates file.

    Every tab-separated line with at least 10 fields contributes the integer
    id in field 0, the chromosome name in field 5 and the integer position in
    field 7; shorter lines are ignored.  Per chromosome, ids are ordered by
    position and every ``interval``-th one is kept, where ``interval`` is the
    total reference length divided by ``kmer_fraction`` times the number of
    parsed k-mers.

    :param kmers_coords: path to the coordinates file.
    :param ref_fpath: reference FASTA file, used for the total genome length.
    :param kmer_fraction: fraction of the parsed k-mers that sets the
        sampling density.
    :return: set of selected k-mer ids; empty when nothing was parsed or
        ``kmer_fraction`` is not positive.
    """
    kmers_pos_by_chrom = defaultdict(list)
    kmers_by_chrom = defaultdict(list)
    with open(kmers_coords) as f:
        for line in f:
            fs = line.split('\t')
            if len(fs) < 10:
                continue
            contig, chrom, pos = fs[0], fs[5], fs[7]
            kmers_pos_by_chrom[chrom].append(int(pos))
            kmers_by_chrom[chrom].append(int(contig))

    total_kmers = sum(len(kmers) for kmers in kmers_by_chrom.values())
    downsampled_kmers_cnt = total_kmers * kmer_fraction
    # Without this guard the division below raises ZeroDivisionError.
    if downsampled_kmers_cnt <= 0:
        return set()

    genome_size = sum(get_chr_lengths_from_fastafile(ref_fpath).values())
    # A step of 0 is illegal for slicing; clamp so that at worst every k-mer
    # is kept when more k-mers are requested than there are reference bases.
    interval = max(1, int(genome_size / downsampled_kmers_cnt))

    downsampled_kmers = set()
    for chrom, chrom_kmers in kmers_by_chrom.items():
        by_position = sorted(zip(kmers_pos_by_chrom[chrom], chrom_kmers))
        downsampled_kmers.update(kmer for _, kmer in by_position[::interval])
    return downsampled_kmers
Example #11
0
def get_correct_names_for_chroms(output_dirpath, ref_fpath, sam_fpath, err_path, reads_fpaths):
    """Map sequence names used in a SAM file to the reference FASTA names.

    The SAM header is dumped with ``sambamba view -H -S`` into
    ``<output_dirpath>/<basename(sam_fpath)>.header``.  Its ``@SQ`` records
    are then compared pairwise, in iteration order, with the sequences of
    ``ref_fpath``.

    :param output_dirpath: directory that receives the ``.header`` file.
    :param ref_fpath: reference FASTA file.
    :param sam_fpath: SAM file whose header is inspected.
    :param err_path: file that receives the subprocess stderr (opened with
        mode ``'w'``, so previous content is discarded).
    :param reads_fpaths: when truthy, a mismatch is logged as a warning;
        otherwise it is logged as an error.
    :return: dict mapping each SAM sequence name to the reference name, or
        ``None`` when the number of sequences, a length, or a name differs.
    """
    correct_chr_names = dict()
    ref_chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    sam_chr_lengths = dict()
    sam_header_fpath = os.path.join(output_dirpath, os.path.basename(sam_fpath) + '.header')
    # Context managers guarantee both handles are closed (and the header is
    # fully on disk) before the header file is read back below.
    with open(sam_header_fpath, 'w') as header_out, open(err_path, 'w') as err_out:
        qutils.call_subprocess([sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                               stdout=header_out, stderr=err_out, logger=logger)
    # Raw strings: '\S' and '\d' are not valid escapes in a plain literal.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(ref_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # Pairs are formed positionally, so this relies on both mappings
        # iterating in file order.
        for ref_chr, sam_chr in zip(ref_chr_lengths.keys(), sam_chr_lengths.keys()):
            same_length = sam_chr_lengths[sam_chr] == ref_chr_lengths[ref_chr]
            if correct_name(sam_chr) == ref_chr[:len(sam_chr)] and same_length:
                correct_chr_names[sam_chr] = ref_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(inconsistency + ' in reference and SAM file do not match. ' +
                           'QUAST will try to realign reads to the reference genome.')
        else:
            logger.error(inconsistency + ' in reference and SAM file do not match. ' +
                         'Use SAM file obtained by aligning reads to the reference genome.')
        return None
    return correct_chr_names
Example #12
0
def predict_genes(index, contigs_fpath, gene_lengths, out_dirpath, tool_dirpath, tmp_dirpath, gmhmm_p_function,
                  prokaryote, num_threads):
    """Predict genes for one assembly, write them out and count them.

    Prediction itself is delegated to ``gmhmm_p_function``, whose stderr goes
    to ``<label>_genemark.stderr`` in ``out_dirpath``.  Predicted genes are
    saved as GFF (plus FASTA when ``OUTPUT_FASTA`` is set).

    :param index: assembly index, used for log prefixes and passed to the
        prediction function.
    :param contigs_fpath: FASTA file with the assembly contigs.
    :param gene_lengths: thresholds; one count is produced per threshold.
    :param out_dirpath: directory for the stderr, GFF and FASTA outputs.
    :param tool_dirpath: forwarded to ``gmhmm_p_function``.
    :param tmp_dirpath: forwarded to ``gmhmm_p_function``.
    :param gmhmm_p_function: callable that returns the predicted genes.
    :param prokaryote: forwarded to ``add_genes_to_gff``.
    :param num_threads: forwarded to ``gmhmm_p_function``.
    :return: ``(genes, unique_count, full_cnt, partial_cnt)``; the last three
        are ``None`` when no genes were predicted.
    """
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    corr_assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)

    logger.info(''.join(['  ', qutils.index_to_str(index), assembly_label]))

    err_fpath = os.path.join(out_dirpath, corr_assembly_label + '_genemark.stderr')

    genes = gmhmm_p_function(tool_dirpath, contigs_fpath, err_fpath, index, tmp_dirpath, num_threads)
    contig_lengths = get_chr_lengths_from_fastafile(contigs_fpath)

    # Nothing predicted: no statistics to compute.
    if not genes:
        return genes, None, None, None

    # A gene is "full" when it touches neither end of its contig.
    for gene in genes:
        if gene.start > 1:
            gene.is_full = gene.end < contig_lengths[gene.contig]
        else:
            gene.is_full = False

    tool_name = "genemark"
    gz_suffix = '' if qconfig.no_gzip else '.gz'
    out_gff_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.gff' + gz_suffix)
    add_genes_to_gff(genes, out_gff_fpath, prokaryote)
    if OUTPUT_FASTA:
        out_fasta_fpath = os.path.join(out_dirpath, corr_assembly_label + '_' + tool_name + '_genes.fasta')
        add_genes_to_fasta(genes, out_fasta_fpath)

    # Spans are computed once per gene, then counted against each threshold.
    full_spans = [gene.end - gene.start for gene in genes if gene.is_full]
    partial_spans = [gene.end - gene.start for gene in genes if not gene.is_full]
    full_cnt = [sum(span >= threshold for span in full_spans) for threshold in gene_lengths]
    partial_cnt = [sum(span >= threshold for span in partial_spans) for threshold in gene_lengths]

    # Genes are identified by sequence when available, otherwise by name.
    gene_ids = set()
    for gene in genes:
        gene_ids.add(gene.seq if gene.seq else gene.name)
    unique_count = len(gene_ids)
    total_count = len(genes)

    logger.info(''.join(['  ', qutils.index_to_str(index), '  Genes = ', str(unique_count), ' unique, ',
                         str(total_count), ' total']))
    logger.info(''.join(['  ', qutils.index_to_str(index), '  Predicted genes (GFF): ', out_gff_fpath]))

    return genes, unique_count, full_cnt, partial_cnt
Example #13
0
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers, cov_fpath, logger):
    """Generate the data files and the ``circos.conf`` file for a Circos plot.

    All auxiliary data files are produced by helper functions and placed in
    ``<output_dir>/data``; the configuration itself is written to
    ``<output_dir>/circos.conf`` and references the data files by paths
    relative to ``output_dir``.

    :param ref_fpath: reference FASTA file; its sequence lengths drive the
        ideogram, labels and housekeeping settings.
    :param contigs_fpaths: assembly paths, forwarded to ``parse_alignments``.
    :param contig_report_fpath_pattern: forwarded to ``parse_alignments``.
    :param output_dir: directory that receives ``circos.conf`` and ``data``.
    :param gc_fpath: forwarded to ``create_gc_plot``.
    :param features_containers: forwarded to the genes plot, labels and legend
        helpers.
    :param cov_fpath: forwarded to ``create_coverage_plot``.
    :param logger: forwarded to ``create_housekeeping_file``.
    :return: ``(conf_fpath, circos_legend_fpath)``, or ``None`` when no
        alignment plot files were produced.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # Pick the chromosome unit from the value returned by create_ideogram.
    if max_len >= 10 ** 6:
        chrom_units = 10 ** 5
    elif max_len >= 10 ** 5:
        chrom_units = 10 ** 4
    else:
        chrom_units = 1000
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = [create_alignment_plots(assembly, ref_len, data_dir) for assembly in assemblies]
    # One entry per assembly, so an empty list also means `assemblies` is empty.
    if not alignments_fpaths:
        return None

    gc_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = [create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir) for assembly in assemblies]
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, chr_lengths, data_dir)
    max_points = max([MAX_POINTS, gc_points, gene_points, cov_points, contig_points])
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath, data_dir)

    conf_fpath = join(output_dir, 'circos.conf')
    radius = 0.95
    plot_idx = 0
    # One gap per track (assemblies, then features, then coverage). The last
    # track of each group is followed by the bigger gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals += [TRACK_INTERVAL] * len(feature_fpaths)
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL
    with open(conf_fpath, 'w') as out_f:
        out_f.write('<<include etc/colors_fonts_patterns.conf>>\n')
        out_f.write('<<include %s>>\n' % relpath(ideogram_fpath, output_dir))
        out_f.write('<<include %s>>\n' % relpath(ticks_fpath, output_dir))
        out_f.write('karyotype = %s\n' % relpath(karyotype_fpath, output_dir))
        out_f.write('chromosomes_units = %d\n' % chrom_units)
        out_f.write('chromosomes_display_default = yes\n')
        out_f.write('track_width = ' + str(TRACK_WIDTH) + '\n')
        # Define trackN_pos for every track, moving inwards from radius 0.95
        # by the track width plus the gap that follows the track.
        for i in range(len(track_intervals)):
            out_f.write('track%d_pos = %f\n' % (i, radius))
            radius -= TRACK_WIDTH
            radius -= track_intervals[i]
        # One extra position past the last track.
        out_f.write('track%d_pos = %f\n' % (len(track_intervals), radius))
        out_f.write('<image>\n')
        out_f.write('dir = %s\n' % output_dir)
        out_f.write('file = %s\n' % circos_png_fname)
        out_f.write('png = yes\n')
        out_f.write('svg = no\n')
        out_f.write('radius = 1500p\n')
        out_f.write('angle_offset = -90\n')
        out_f.write('auto_alpha_colors = yes\n')
        out_f.write('auto_alpha_steps = 5\n')
        out_f.write('background = white\n')
        out_f.write('</image>\n')
        # Highlights block is emitted only when qconfig.is_combined_ref is set.
        if qconfig.is_combined_ref:
            out_f.write('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            out_f.write('<highlight>\n')
            out_f.write('file = %s\n' % relpath(highlights_fpath, output_dir))
            out_f.write('r0 = 1r - 50p\n')
            out_f.write('r1 = 1r - 30p\n')
            out_f.write('</highlight>\n')
            out_f.write('</highlights>\n')
        # The helper's return value is written verbatim into the config.
        out_f.write(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        out_f.write('<plots>\n')
        out_f.write('layers_overflow = collapse\n')
        # Label plots: one per (label, track index) pair.
        for label, i in track_labels:
            out_f.write('<plot>\n')
            out_f.write('track_idx = track%d\n' % i)
            out_f.write('<<include %s>>\n' % relpath(labels_fpath, output_dir))
            out_f.write('</plot>\n')
        # Alignment tiles, one track per assembly; a mismatches histogram is
        # drawn on the same track when a file was produced for the assembly.
        for i, alignments_conf in enumerate(alignments_fpaths):
            out_f.write('<plot>\n')
            out_f.write('type = tile\n')
            out_f.write('thickness = 50p\n')
            out_f.write('stroke_thickness = 0\n')
            out_f.write('layers = 1\n')
            out_f.write('file = %s\n' % relpath(alignments_conf, output_dir))
            out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
            out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
            out_f.write('</plot>\n')
            if mismatches_fpaths and mismatches_fpaths[i]:
                out_f.write('<plot>\n')
                out_f.write('type = histogram\n')
                out_f.write('thickness = 1\n')
                out_f.write('fill_color = vlyellow\n')
                out_f.write('file = %s\n' % relpath(mismatches_fpaths[i], output_dir))
                out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
                out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
                out_f.write('</plot>\n')
            plot_idx += 1
        for feature_fpath in feature_fpaths:
            # genes plot
            out_f.write('<plot>\n')
            out_f.write('type = heatmap\n')
            out_f.write('file = %s\n' % relpath(feature_fpath, output_dir))
            out_f.write('color = ylorbr-9\n')
            out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
            out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
            out_f.write('</plot>\n')
            plot_idx += 1
        if cov_data_fpath:
            # coverage plot
            out_f.write('<plot>\n')
            out_f.write('type = histogram\n')
            out_f.write('thickness = 1\n')
            out_f.write('file = %s\n' % relpath(cov_data_fpath, output_dir))
            out_f.write('fill_color = vlblue\n')
            out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
            out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
            out_f.write('</plot>\n')
            plot_idx += 1
        # GC plot (fixed radii, not tied to a trackN_pos)
        out_f.write('<plot>\n')
        out_f.write('type = heatmap\n')
        out_f.write('file = %s\n' % relpath(gc_fpath, output_dir))
        out_f.write('color = greys-6\n')
        out_f.write('scale_log_base = 1.5\n')
        out_f.write('r0 = 1r - 29p\n')
        out_f.write('r1 = 1r - 1p\n')
        out_f.write('</plot>\n')
        out_f.write('</plots>\n')

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath, output_dir)
    return conf_fpath, circos_legend_fpath
Example #14
0
def get_correct_names_for_chroms(output_dirpath,
                                 fasta_fpath,
                                 sam_fpath,
                                 err_path,
                                 reads_fpaths,
                                 logger,
                                 is_reference=False):
    """Map sequence names used in a SAM file to the names in a FASTA file.

    The SAM header is expected at
    ``<dirname(output_dirpath)>/<basename(sam_fpath)>.header``.  It is
    regenerated with ``sambamba view -H -S`` whenever ``sam_fpath`` exists;
    otherwise an already existing header file is reused.  Its ``@SQ`` records
    are compared pairwise, in order, with the FASTA sequences.

    :param output_dirpath: path whose parent directory holds the header file.
    :param fasta_fpath: FASTA file the SAM file should correspond to.
    :param sam_fpath: SAM file whose header is inspected.
    :param err_path: file to which the subprocess stderr is appended.
    :param reads_fpaths: when truthy, a mismatch is logged as a warning;
        otherwise it is logged as an error.
    :param logger: logger used for messages and passed to the subprocess call.
    :param is_reference: selects the wording of the log message.
    :return: dict mapping each SAM sequence name to the FASTA name, or
        ``None`` when neither the SAM file nor its header exists, or when the
        number of sequences, a length, or a name differs.
    """
    correct_chr_names = dict()
    fasta_chr_lengths = get_chr_lengths_from_fastafile(fasta_fpath)
    # Ordered so that @SQ records keep their header order for the zip below.
    sam_chr_lengths = OrderedDict()
    sam_header_fpath = join(dirname(output_dirpath),
                            basename(sam_fpath) + '.header')
    if not isfile(sam_fpath) and not isfile(sam_header_fpath):
        return None
    if isfile(sam_fpath):
        # Context managers guarantee both handles are closed (and the header
        # is fully on disk) before the header file is read back below.
        with open(sam_header_fpath, 'w') as header_out, \
                open(err_path, 'a') as err_out:
            qutils.call_subprocess(
                [sambamba_fpath('sambamba'), 'view', '-H', '-S', sam_fpath],
                stdout=header_out,
                stderr=err_out,
                logger=logger)
    # Raw strings: '\S' and '\d' are not valid escapes in a plain literal.
    chr_name_pattern = r'SN:(\S+)'
    chr_len_pattern = r'LN:(\d+)'

    with open(sam_header_fpath) as sam_in:
        for l in sam_in:
            if l.startswith('@SQ'):
                chr_name = re.findall(chr_name_pattern, l)[0]
                chr_len = re.findall(chr_len_pattern, l)[0]
                sam_chr_lengths[chr_name] = int(chr_len)

    inconsistency = ''
    if len(fasta_chr_lengths) != len(sam_chr_lengths):
        inconsistency = 'Number of chromosomes'
    else:
        # Pairs are formed positionally from the two mappings.
        for fasta_chr, sam_chr in zip(fasta_chr_lengths.keys(),
                                      sam_chr_lengths.keys()):
            same_length = (sam_chr_lengths[sam_chr] ==
                           fasta_chr_lengths[fasta_chr])
            if (correct_name(sam_chr) == fasta_chr[:len(sam_chr)]
                    and same_length):
                correct_chr_names[sam_chr] = fasta_chr
            elif not same_length:
                inconsistency = 'Chromosome lengths'
                break
            else:
                inconsistency = 'Chromosome names'
                break
    if inconsistency:
        if reads_fpaths:
            logger.warning(
                inconsistency + ' in ' + fasta_fpath +
                ' and corresponding SAM file ' + sam_fpath +
                ' do not match. ' + 'QUAST will try to realign reads to ' +
                ('the reference genome' if is_reference else fasta_fpath))
        else:
            logger.error(
                inconsistency + ' in ' + fasta_fpath +
                ' and corresponding SAM file ' + sam_fpath +
                ' do not match. ' +
                'Use SAM file obtained by aligning reads to ' +
                ('the reference genome' if is_reference else fasta_fpath))
        return None
    return correct_chr_names
Example #15
0
def create_conf(ref_fpath, contigs_fpaths, contig_report_fpath_pattern, output_dir, gc_fpath, features_containers, cov_fpath, logger):
    """Generate the data files and the ``circos.conf`` file for a Circos plot.

    All auxiliary data files are produced by helper functions and placed in
    ``<output_dir>/data``; the configuration itself is written to
    ``<output_dir>/circos.conf`` and references the data files by paths
    relative to ``output_dir``.

    :param ref_fpath: reference FASTA file; its sequence lengths drive the
        ideogram, labels and housekeeping settings.
    :param contigs_fpaths: assembly paths, forwarded to ``parse_alignments``.
    :param contig_report_fpath_pattern: forwarded to ``parse_alignments``.
    :param output_dir: directory that receives ``circos.conf`` and ``data``.
    :param gc_fpath: forwarded to ``create_gc_plot``.
    :param features_containers: forwarded to the genes plot, labels and legend
        helpers.
    :param cov_fpath: forwarded to ``create_coverage_plot``.
    :param logger: forwarded to ``create_housekeeping_file``.
    :return: ``(conf_fpath, circos_legend_fpath)``, or ``None`` when no
        alignment plot files were produced.
    """
    data_dir = join(output_dir, 'data')
    if not exists(data_dir):
        os.makedirs(data_dir)

    chr_lengths = get_chr_lengths_from_fastafile(ref_fpath)
    max_len, karyotype_fpath, ideogram_fpath = create_ideogram(chr_lengths, data_dir)
    # Pick the chromosome unit from the value returned by create_ideogram.
    if max_len >= 10 ** 6:
        chrom_units = 10 ** 5
    elif max_len >= 10 ** 5:
        chrom_units = 10 ** 4
    else:
        chrom_units = 1000
    ticks_fpath = create_ticks_conf(chrom_units, data_dir)
    ref_len = sum(chr_lengths.values())
    window_size = set_window_size(ref_len)

    assemblies, contig_points = parse_alignments(contigs_fpaths, contig_report_fpath_pattern)
    alignments_fpaths = [create_alignment_plots(assembly, ref_len, data_dir) for assembly in assemblies]
    # One entry per assembly, so an empty list also means `assemblies` is empty.
    if not alignments_fpaths:
        return None

    gc_fpath, min_gc, max_gc, gc_points = create_gc_plot(gc_fpath, data_dir)
    feature_fpaths, gene_points = create_genes_plot(features_containers, window_size, ref_len, data_dir)
    mismatches_fpaths = [create_mismatches_plot(assembly, window_size, ref_len, output_dir, data_dir) for assembly in assemblies]
    # NOTE(review): this variant passes the total reference length here; confirm
    # that this matches what create_coverage_plot expects.
    cov_data_fpath, cov_points = create_coverage_plot(cov_fpath, window_size, ref_len, data_dir)
    max_points = max([MAX_POINTS, gc_points, gene_points, cov_points, contig_points])
    labels_fpath, track_labels = create_labels(chr_lengths, assemblies, features_containers, cov_data_fpath, data_dir)

    conf_fpath = join(output_dir, 'circos.conf')
    radius = 0.95
    plot_idx = 0
    # One gap per track (assemblies, then features, then coverage). The last
    # track of each group is followed by the bigger gap.
    track_intervals = [TRACK_INTERVAL] * len(assemblies)
    if feature_fpaths:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals += [TRACK_INTERVAL] * len(feature_fpaths)
    if cov_data_fpath:
        track_intervals[-1] = BIG_TRACK_INTERVAL
        track_intervals.append(TRACK_INTERVAL)
    track_intervals[-1] = BIG_TRACK_INTERVAL
    with open(conf_fpath, 'w') as out_f:
        out_f.write('<<include etc/colors_fonts_patterns.conf>>\n')
        out_f.write('<<include %s>>\n' % relpath(ideogram_fpath, output_dir))
        out_f.write('<<include %s>>\n' % relpath(ticks_fpath, output_dir))
        out_f.write('karyotype = %s\n' % relpath(karyotype_fpath, output_dir))
        out_f.write('chromosomes_units = %d\n' % chrom_units)
        out_f.write('chromosomes_display_default = yes\n')
        out_f.write('track_width = ' + str(TRACK_WIDTH) + '\n')
        # Define trackN_pos for every track, moving inwards from radius 0.95
        # by the track width plus the gap that follows the track.
        for i in range(len(track_intervals)):
            out_f.write('track%d_pos = %f\n' % (i, radius))
            radius -= TRACK_WIDTH
            radius -= track_intervals[i]
        # One extra position past the last track.
        out_f.write('track%d_pos = %f\n' % (len(track_intervals), radius))
        out_f.write('<image>\n')
        out_f.write('dir = %s\n' % output_dir)
        out_f.write('file = %s\n' % circos_png_fname)
        out_f.write('png = yes\n')
        out_f.write('svg = no\n')
        out_f.write('radius = 1500p\n')
        out_f.write('angle_offset = -90\n')
        out_f.write('auto_alpha_colors = yes\n')
        out_f.write('auto_alpha_steps = 5\n')
        out_f.write('background = white\n')
        out_f.write('</image>\n')
        # Highlights block is emitted only when qconfig.is_combined_ref is set.
        if qconfig.is_combined_ref:
            out_f.write('<highlights>\n')
            highlights_fpath = create_meta_highlights(chr_lengths, data_dir)
            out_f.write('<highlight>\n')
            out_f.write('file = %s\n' % relpath(highlights_fpath, output_dir))
            out_f.write('r0 = 1r - 50p\n')
            out_f.write('r1 = 1r - 30p\n')
            out_f.write('</highlight>\n')
            out_f.write('</highlights>\n')
        # The helper's return value is written verbatim into the config.
        out_f.write(create_housekeeping_file(chr_lengths, max_points, output_dir, data_dir, logger))
        out_f.write('<plots>\n')
        out_f.write('layers_overflow = collapse\n')
        # Label plots: one per (label, track index) pair.
        for label, i in track_labels:
            out_f.write('<plot>\n')
            out_f.write('track_idx = track%d\n' % i)
            out_f.write('<<include %s>>\n' % relpath(labels_fpath, output_dir))
            out_f.write('</plot>\n')
        # Alignment tiles, one track per assembly; a mismatches histogram is
        # drawn on the same track when a file was produced for the assembly.
        for i, alignments_conf in enumerate(alignments_fpaths):
            out_f.write('<plot>\n')
            out_f.write('type = tile\n')
            out_f.write('thickness = 50p\n')
            out_f.write('stroke_thickness = 0\n')
            out_f.write('layers = 1\n')
            out_f.write('file = %s\n' % relpath(alignments_conf, output_dir))
            out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
            out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
            out_f.write('</plot>\n')
            if mismatches_fpaths and mismatches_fpaths[i]:
                out_f.write('<plot>\n')
                out_f.write('type = histogram\n')
                out_f.write('thickness = 1\n')
                out_f.write('fill_color = vlyellow\n')
                out_f.write('file = %s\n' % relpath(mismatches_fpaths[i], output_dir))
                out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
                out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
                out_f.write('</plot>\n')
            plot_idx += 1
        for feature_fpath in feature_fpaths:
            # genes plot
            out_f.write('<plot>\n')
            out_f.write('type = heatmap\n')
            out_f.write('file = %s\n' % relpath(feature_fpath, output_dir))
            out_f.write('color = ylorbr-9\n')
            out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
            out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
            out_f.write('</plot>\n')
            plot_idx += 1
        if cov_data_fpath:
            # coverage plot
            out_f.write('<plot>\n')
            out_f.write('type = histogram\n')
            out_f.write('thickness = 1\n')
            out_f.write('file = %s\n' % relpath(cov_data_fpath, output_dir))
            out_f.write('fill_color = vlblue\n')
            out_f.write('r0 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos) - conf(track_width)))\n')
            out_f.write('r1 = eval(sprintf("%.3fr",conf(track' + str(plot_idx) + '_pos)))\n')
            out_f.write('</plot>\n')
            plot_idx += 1
        # GC plot (fixed radii, not tied to a trackN_pos)
        out_f.write('<plot>\n')
        out_f.write('type = heatmap\n')
        out_f.write('file = %s\n' % relpath(gc_fpath, output_dir))
        out_f.write('color = greys-6\n')
        out_f.write('scale_log_base = 1.5\n')
        out_f.write('r0 = 1r - 29p\n')
        out_f.write('r1 = 1r - 1p\n')
        out_f.write('</plot>\n')
        out_f.write('</plots>\n')

    circos_legend_fpath = create_legend(assemblies, min_gc, max_gc, features_containers, cov_data_fpath, output_dir)
    return conf_fpath, circos_legend_fpath
Example #16
0
def do(ref_fpath, contigs_fpaths, aligned_contigs_fpaths, output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA-style statistics for aligned contigs and draw the plots.

    For every assembly in ``aligned_contigs_fpaths`` the matching entry of
    ``aligned_lengths_lists`` is sorted in descending order and passed to the
    ``N50`` helpers: once against the assembly's own total length (NA50,
    NA75, LA50, LA75) and, unless ``qconfig.is_combined_ref`` is set, once
    against the total reference length (NGA50, NGA75, LGA50, LGA75).  The
    values are logged and stored in the per-assembly report.

    ``contigs_fpaths`` is accepted but not used by this function.

    Returns:
        A dict holding an empty list under ``'header'`` and under the name
        of every aligned assembly.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(
        fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    assembly_lengths = [
        sum(fastaparser.get_chr_lengths_from_fastafile(fpath).values())
        for fpath in aligned_contigs_fpaths
    ]

    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists,
                       assembly_lengths)
    for idx, (fpath, aligned_lens, total_len) in enumerate(per_assembly):
        desc_lens = sorted(aligned_lens, reverse=True)
        na50, la50 = N50.NG50_and_LG50(desc_lens, total_len)
        na75, la75 = N50.NG50_and_LG50(desc_lens, total_len, 75)
        ea_size = N50.E_size(desc_lens)
        if not qconfig.is_combined_ref:
            nga50, lga50 = N50.NG50_and_LG50(desc_lens, reference_length)
            nga75, lga75 = N50.NG50_and_LG50(desc_lens, reference_length, 75)

        largest_alignment = max(aligned_lens)

        # Reference-based values are only mentioned when they were computed
        # and are truthy.
        message = ('  ' + qutils.index_to_str(idx) +
                   qutils.label_from_fpath(fpath) +
                   ', Largest alignment = ' + str(largest_alignment) +
                   ', NA50 = ' + str(na50))
        if not qconfig.is_combined_ref and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if not qconfig.is_combined_ref and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(fpath)
        fields = reporting.Fields
        field_values = [
            (fields.LARGALIGN, largest_alignment),
            (fields.TOTAL_ALIGNED_LEN, sum(aligned_lens)),
            (fields.NA50, na50),
            (fields.NA75, na75),
            (fields.LA50, la50),
            (fields.LA75, la75),
            (fields.EA_SIZE, ea_size),
        ]
        if not qconfig.is_combined_ref:
            field_values += [
                (fields.NGA50, nga50),
                (fields.NGA75, nga75),
                (fields.LGA50, lga50),
                (fields.LGA75, lga75),
            ]
        for field, value in field_values:
            report.add_field(field, value)

    # The largest number of aligned blocks decides whether the Nx plots are
    # told that there are more points than qconfig.max_points.
    num_contigs = max(len(aligned_lens) for aligned_lens in aligned_lengths_lists)
    too_many_points = num_contigs > qconfig.max_points

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    if qconfig.draw_plots:
        # Cumulative plot (aligned contigs)
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
            'Cumulative length (aligned contigs)')

    # NAx, EAxmax and NGAx plots are requested regardless of
    # qconfig.draw_plots.
    logger.info("Making plots...")
    plotter.Nx_plot(output_dirpath, too_many_points,
                    aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths)
    first_report = reporting.get(aligned_contigs_fpaths[0])
    ea_x_max = first_report.get_field(reporting.Fields.E_SIZE_MAX)
    plotter.EAxmax_plot(output_dirpath, False, aligned_contigs_fpaths,
                        aligned_stats_dirpath + '/EAxmax_plot', 'EAxmax',
                        ea_x_max)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(
            output_dirpath, too_many_points,
            aligned_contigs_fpaths, aligned_lengths_lists,
            aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
            [reference_length] * len(aligned_contigs_fpaths))

    logger.main_info('Done.')
    return report_dict
Example #17
0
def frc_plot(results_dir, ref_fpath, contigs_fpaths, contigs_aligned_lengths,
             features_in_contigs_by_file, plot_fpath, title):
    """Draw a cumulative-features vs. cumulative-length curve per assembly.

    In addition to the plot, a ``metaquast_frc.tsv`` file is written into
    ``results_dir`` with one row per contig of positive aligned length.

    Args:
        results_dir: directory receiving the TSV file and, when
            ``qconfig.html_report`` is set, the HTML coordinates.
        ref_fpath: accepted for interface compatibility; not used.
        contigs_fpaths: assembly file paths, in plotting order.
        contigs_aligned_lengths: mapping of assembly path to its per-contig
            aligned lengths.
        features_in_contigs_by_file: mapping of assembly path to its
            per-contig feature counts (parallel to the aligned lengths).
        plot_fpath: output path handed to ``create_plot``.
        title: feature type; used in the TSV rows, the coordinates name and
            the plot title.
    """
    if can_draw_plots:
        logger.info('  Drawing ' + title + ' FRCurve plot...')

    plots = []
    max_y = 0
    max_x = 0
    json_vals_x = []  # coordinates for Nx-like plots in HTML-report
    json_vals_y = []
    aligned_contigs_fpaths = []

    # The context manager guarantees the TSV is flushed and closed even if
    # plotting data preparation raises.
    with open(results_dir + os.sep + "metaquast_frc.tsv", 'w') as outf:
        outf.write(
            "Assembly\tContig_ID\tContig_Length\tFeature_Count\tFeature_Type\n")

        for contigs_fpath in contigs_fpaths:
            aligned_lengths = contigs_aligned_lengths[contigs_fpath]
            feature_in_contigs = features_in_contigs_by_file[contigs_fpath]
            if not aligned_lengths or not feature_in_contigs:
                continue

            aligned_contigs_fpaths.append(contigs_fpath)
            # Derive the label from the current path: indexing a label list
            # with a counter that skips assemblies would mislabel the rows.
            assembly_label = label_from_fpath(contigs_fpath)

            # Rows are written in input (unsorted) order; contigs without a
            # positive aligned length are left out of both TSV and curve.
            contig_pairs = []
            ctg_idx = 1
            for length, feature in zip(aligned_lengths, feature_in_contigs):
                if length > 0:
                    outf.write("%s\t%s\t%d\t%d\t%s\n" %
                               (assembly_label, ctg_idx, length, feature,
                                title))
                    ctg_idx += 1
                    contig_pairs.append((length, feature))

            # Longest contigs first (ties broken by feature count).
            contig_pairs.sort(reverse=True)

            x_vals = [0]
            y_vals = [0]
            cumulative_len = 0
            cumulative_features = 0
            for length, feature in contig_pairs:
                cumulative_len += length
                cumulative_features += feature
                x_vals.append(cumulative_len)
                y_vals.append(cumulative_features)

            # NOTE(review): the HTML coordinates store features as x and
            # length as y, i.e. transposed relative to the drawn plot --
            # confirm this is what the report expects.
            json_vals_x.append(y_vals)
            json_vals_y.append(x_vals)
            max_y = max(max_y, max(y_vals))
            max_x = max(max_x, max(x_vals))

            color, ls = get_color_and_ls(contigs_fpath)
            plots.append(Plot(x_vals, y_vals, color, ls))

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_coord(results_dir, json_vals_x, json_vals_y,
                              'coord' + title, aligned_contigs_fpaths)

    if can_draw_plots:
        title = 'FRCurve (' + title + ')'
        legend_list = [
            label_from_fpath(fpath) for fpath in aligned_contigs_fpaths
        ]
        create_plot(plot_fpath,
                    title,
                    plots,
                    legend_list,
                    x_label='Cumulative length',
                    y_label='Cumulative features',
                    y_limit=[0, max_y],
                    x_limit=[0, max_x])
Example #18
0
def frc_plot(results_dir, ref_fpath, contigs_fpaths, contigs_aligned_lengths,
             features_in_contigs_by_file, plot_fpath, title):
    """Draw a feature-response curve (genome coverage per feature budget).

    For each assembly and each feature budget ``0 .. max_features - 1`` the
    contigs carrying features are scanned in descending length/feature-ratio
    order and greedily accepted while the budget is not exceeded.  Their
    lengths, plus the length of all feature-free contigs, are reported as a
    percentage of the reference length.  Every budget contributes two points
    so the curve is drawn as steps.
    """
    if can_draw_plots:
        logger.info('  Drawing ' + title + ' FRCurve plot...')

    ref_length = sum(
        fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    max_features = 1 + max(
        sum(counts) for counts in features_in_contigs_by_file.values())

    plots = []
    json_vals_x = []  # coordinates for Nx-like plots in HTML-report
    json_vals_y = []
    aligned_contigs_fpaths = []
    max_y = 0

    for contigs_fpath in contigs_fpaths:
        aligned_lengths = contigs_aligned_lengths[contigs_fpath]
        feature_in_contigs = features_in_contigs_by_file[contigs_fpath]
        if not (aligned_lengths and feature_in_contigs):
            continue
        aligned_contigs_fpaths.append(contigs_fpath)

        # Feature-free contigs always count towards coverage; the others
        # compete for the feature budget.
        featureless_len = 0
        featured = []
        for length, feature in zip(aligned_lengths, feature_in_contigs):
            if feature == 0:
                featureless_len += length
            else:
                featured.append((length, feature))

        # sort by len/features ratio
        ranked = sorted(featured,
                        key=lambda pair: pair[0] * 1.0 / pair[1],
                        reverse=True)

        x_vals = []
        y_vals = []
        for budget in range(max_features):
            used = 0
            covered = featureless_len
            for length, feature in ranked:
                if used + feature > budget:
                    continue
                used += feature
                covered += length
                if used == budget:
                    break

            percent = covered * 100.0 / ref_length
            x_vals.extend((budget, budget + 1))
            y_vals.extend((percent, percent))

        json_vals_x.append(x_vals)
        json_vals_y.append(y_vals)
        max_y = max(max_y, max(y_vals))

        color, ls = get_color_and_ls(contigs_fpath)
        plots.append(Plot(x_vals, y_vals, color, ls))

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_coord(results_dir, json_vals_x, json_vals_y,
                              'coord' + title, aligned_contigs_fpaths)

    if can_draw_plots:
        labels = [label_from_fpath(fpath) for fpath in aligned_contigs_fpaths]
        create_plot(plot_fpath,
                    'FRCurve (' + title + ')',
                    plots,
                    labels,
                    x_label='Feature space',
                    y_label='Genome coverage (%)',
                    x_limit=[0, max_features],
                    y_limit=[0, max(100, max_y)])
Example #19
0
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath, title):
    """Draw the cumulative contig length plot and save it to disk.

    Does nothing when ``can_draw_plots`` is false.

    Args:
        reference: path to the reference FASTA, or a falsy value when there
            is no reference; when given, a reference curve is added.
        contigs_fpaths: assembly file paths (used for colours and legend).
        lists_of_lengths: per-assembly contig lengths, parallel to
            ``contigs_fpaths``.
        plot_fpath: output path without extension;
            ``qconfig.plot_extension`` is appended.
        title: plot title, shown only when ``with_title`` is set.
    """
    if not can_draw_plots:
        return

    logger.info('  Drawing cumulative plot...')
    import matplotlib.pyplot
    import matplotlib.ticker

    figure = matplotlib.pyplot.figure()
    matplotlib.pyplot.rc('font', **font)
    max_x = 0
    max_y = 0

    # Built-in zip works on both Python 2 and 3 (itertools.izip is 2-only).
    for contigs_fpath, lengths in zip(contigs_fpaths, lists_of_lengths):
        # Running sum of contig lengths, largest contig first; the leading
        # 0 anchors the curve at the origin.
        vals_length = [0]
        for l in sorted(lengths, reverse=True):
            vals_length.append(vals_length[-1] + l)
        vals_contig_index = list(range(len(vals_length)))
        max_x = max(vals_contig_index[-1], max_x)
        max_y = max(max_y, vals_length[-1])
        color, ls = get_color_and_ls(contigs_fpath)
        matplotlib.pyplot.plot(vals_contig_index, vals_length, color=color, lw=line_width, ls=ls)

    if reference:
        y_vals = []
        for l in sorted(fastaparser.get_chr_lengths_from_fastafile(reference).values(), reverse=True):
            y_vals.append(y_vals[-1] + l if y_vals else l)
        # A real list is required because a point is appended below
        # (a Python 3 range object has no append()).
        x_vals = list(range(len(y_vals)))
        # extend reference curve to the max X-axis point
        reference_length = y_vals[-1]
        max_x = max(max_x, x_vals[-1])
        max_y = max(max_y, reference_length)
        y_vals.append(reference_length)
        x_vals.append(max_x)
        matplotlib.pyplot.plot(x_vals, y_vals,
                               color=reference_color, lw=line_width, ls=reference_ls)

    if with_title:
        matplotlib.pyplot.title(title)
    matplotlib.pyplot.grid(with_grid)
    ax = matplotlib.pyplot.gca()
    # Shrink current axis's height by 20% on the bottom
    box = ax.get_position()
    ax.set_position([box.x0, box.y0 + box.height * 0.2, box.width, box.height * 0.8])

    # A list (not a Python 3 map object) so that 'Reference' can be added.
    legend_list = [qutils.label_from_fpath(fpath) for fpath in contigs_fpaths]
    if reference:
        legend_list += ['Reference']

    # Put a legend below current axis
    try: # for matplotlib <= 2009-12-09
        ax.legend(legend_list, loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True,
            shadow=True, ncol=n_columns if n_columns<3 else 3)
    except Exception: # ZeroDivisionError: ValueError:
        # Best effort: a missing legend must not prevent saving the plot.
        pass

    ylabel = 'Cumulative length '
    ylabel, mkfunc = y_formatter(ylabel, max_y)
    matplotlib.pyplot.xlabel('Contig index', fontsize=axes_fontsize)
    matplotlib.pyplot.ylabel(ylabel, fontsize=axes_fontsize)

    mkformatter = matplotlib.ticker.FuncFormatter(mkfunc)
    ax.yaxis.set_major_formatter(mkformatter)

    xLocator, yLocator = get_locators()
    ax.yaxis.set_major_locator(yLocator)
    ax.xaxis.set_major_locator(xLocator)
    if logarithmic_x_scale:
        ax.set_xscale('log')

    plot_fpath += '.' + qconfig.plot_extension
    matplotlib.pyplot.savefig(plot_fpath, bbox_inches='tight')
    logger.info('    saved to ' + plot_fpath)
    # The figure is kept so it can be reused later by whoever consumes
    # pdf_plots_figures.
    pdf_plots_figures.append(figure)
    matplotlib.pyplot.close()
Example #20
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic assembly statistics, fill the reports and draw plots.

    For the reference (if any) and for every assembly this gathers contig
    lengths, N counts, GC content and, when contig names carry a
    ``_cov_<number>`` tag, a coverage histogram.  N50/L50-style metrics are
    stored in the per-assembly reports; data for the HTML report and the
    plots are emitted according to ``qconfig``.

    Side effects visible here: ``output_dirpath`` is created if missing,
    ``qconfig.min_difference`` is set, and labels of assemblies that look
    like scaffolds are appended to
    ``qconfig.potential_scaffolds_assemblies``.

    Args:
        ref_fpath: path to the reference FASTA, or a falsy value.
        contigs_fpaths: assembly FASTA paths.
        output_dirpath: directory for plots and GC helper files.
        results_dir: directory passed to the HTML saver and the Nx plots.

    Returns:
        Tuple ``(icarus_gc_fpath, circos_gc_fpath)``; each is ``None``
        unless the corresponding reference GC file was written.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    icarus_gc_fpath = None
    circos_gc_fpath = None
    if ref_fpath:
        reference_lengths = sorted(fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(), reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(ref_fpath)
        if qconfig.create_icarus_html or qconfig.draw_plots:
            icarus_gc_fpath = join(output_dirpath, 'gc.icarus.txt')
            save_icarus_GC(ref_fpath, icarus_gc_fpath)
        if qconfig.draw_plots:
            circos_gc_fpath = join(output_dirpath, 'gc.circos.txt')
            save_circos_GC(ref_fpath, reference_length, circos_gc_fpath)

        logger.info('  Reference genome:')
        # The conditional is parenthesised on purpose: without the
        # parentheses it would apply to the whole concatenation and the
        # message would collapse to just 'undefined'.
        logger.info('    ' + os.path.basename(ref_fpath) + ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) + ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning('  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                           ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).')
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for idx, contigs_fpath in enumerate(contigs_fpaths):
        coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(idx) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(seq):
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                # Histogram indexed by integer coverage; the value is the
                # total sequence length observed at that coverage.
                cov = int(float(cov_match.group(1)))
                histogram = coverage_dict[contigs_fpath]
                if len(histogram) <= cov:
                    histogram += [0] * (cov - len(histogram) + 1)
                histogram[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many points for the HTML report: merge every `multiplicator`
        # consecutive lengths into one value; the remainder goes into a
        # final bucket.
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)]) for i in range(1, max_points)
                                  if (i * multiplicator) < len(list_of_length)] for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [sum(reference_lengths[((i - 1) * multiplicator):(i * multiplicator)])
                                 if (i * multiplicator) < len(reference_lengths) else
                                 sum(reference_lengths[((i - 1) * multiplicator):])
                                 for i in range(1, max_points)] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for num_list in range(len(corr_lists_of_lengths)):
            last_index = len(corr_lists_of_lengths[num_list])
            corr_lists_of_lengths[num_list].append(sum(lists_of_lengths[num_list][last_index * multiplicator:]))
    else:
        corr_lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]

    if reference_lengths:
        # Saving for an HTML report
        if qconfig.html_report:
            from quast_libs.html_saver import html_saver
            html_saver.save_reference_lengths(results_dir, reference_lengths)

    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for idx, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)
        # Both conditionals are parenthesised so that only the value they
        # guard falls back to 'undefined', not the whole message.
        logger.info('    ' + qutils.index_to_str(idx) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)) if total_length != 0 else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    # Same list object as list_of_GC_distributions: the reference
    # distribution, if any, is appended as its last element.
    list_of_GC_distributions_with_ref = list_of_GC_distributions
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions_with_ref, list_of_GC_contigs_distributions, reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))])

    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, join(output_dirpath, 'cumulative_plot'), 'Cumulative length')
        if not qconfig.no_gc:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(contigs_fpath, GC_distribution,
                                                join(output_dirpath, qutils.label_from_fpath(contigs_fpath) + '_GC_content_plot'))

        if any(coverage_dict[contigs_fpath] for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths, output_dirpath)

    logger.main_info('Done.')
    return icarus_gc_fpath, circos_gc_fpath
Example #21
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       aligned_lengths_lists, aligned_stats_dirpath):
    """Compute NA/NGA-style statistics for aligned contigs and draw the plots.

    For every assembly the aligned lengths are passed to ``N50.NG50`` /
    ``N50.LG50``: against the assembly's own total length (NA50, NA75, LA50,
    LA75) and, unless ``qconfig.is_combined_ref`` is set, against the total
    reference length (NGA50, NGA75, LGA50, LGA75).  Results are logged and
    stored in the per-assembly report; assembly lengths are additionally
    saved as JSON when ``json_output_dirpath`` is given and for the HTML
    report when ``qconfig.html_report`` is set.

    Returns:
        A dict holding an empty list under ``'header'`` and under the name
        of every aligned assembly.
    """
    if not os.path.isdir(aligned_stats_dirpath):
        os.mkdir(aligned_stats_dirpath)

    report_dict = {'header': []}
    for fpath in aligned_contigs_fpaths:
        report_dict[qutils.name_from_fpath(fpath)] = []

    logger.print_timestamp()
    logger.main_info('Running NA-NGA calculation...')

    reference_length = sum(
        fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values())
    assembly_lengths = [
        sum(fastaparser.get_chr_lengths_from_fastafile(fpath).values())
        for fpath in aligned_contigs_fpaths
    ]

    per_assembly = zip(aligned_contigs_fpaths, aligned_lengths_lists,
                       assembly_lengths)
    for idx, (fpath, aligned_lens, total_len) in enumerate(per_assembly):
        na50 = N50.NG50(aligned_lens, total_len)
        na75 = N50.NG50(aligned_lens, total_len, 75)
        la50 = N50.LG50(aligned_lens, total_len)
        la75 = N50.LG50(aligned_lens, total_len, 75)
        if not qconfig.is_combined_ref:
            nga50 = N50.NG50(aligned_lens, reference_length)
            nga75 = N50.NG50(aligned_lens, reference_length, 75)
            lga50 = N50.LG50(aligned_lens, reference_length)
            lga75 = N50.LG50(aligned_lens, reference_length, 75)

        largest_alignment = max(aligned_lens)

        # Reference-based values are only mentioned when they were computed
        # and are truthy.
        message = ('  ' + qutils.index_to_str(idx) +
                   qutils.label_from_fpath(fpath) +
                   ', Largest alignment = ' + str(largest_alignment) +
                   ', NA50 = ' + str(na50))
        if not qconfig.is_combined_ref and nga50:
            message += ', NGA50 = ' + str(nga50)
        message += ', LA50 = ' + str(la50)
        if not qconfig.is_combined_ref and lga50:
            message += ', LGA50 = ' + str(lga50)
        logger.info(message)

        report = reporting.get(fpath)
        fields = reporting.Fields
        field_values = [
            (fields.LARGALIGN, largest_alignment),
            (fields.TOTAL_ALIGNED_LEN, sum(aligned_lens)),
            (fields.NA50, na50),
            (fields.NA75, na75),
            (fields.LA50, la50),
            (fields.LA75, la75),
        ]
        if not qconfig.is_combined_ref:
            field_values += [
                (fields.NGA50, nga50),
                (fields.NGA75, nga75),
                (fields.LGA50, lga50),
                (fields.LGA75, lga75),
            ]
        for field, value in field_values:
            report.add_field(field, value)

    # The largest number of aligned blocks decides whether the Nx plots are
    # told that there are more points than qconfig.max_points.
    num_contigs = max(len(aligned_lens) for aligned_lens in aligned_lengths_lists)
    too_many_points = num_contigs > qconfig.max_points

    if json_output_dirpath:
        from quast_libs.html_saver import json_saver
        json_saver.save_assembly_lengths(json_output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    # saving to html
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        html_saver.save_assembly_lengths(output_dirpath,
                                         aligned_contigs_fpaths,
                                         assembly_lengths)

    if qconfig.draw_plots:
        # Cumulative plot (aligned contigs)
        plotter.cumulative_plot(
            ref_fpath, aligned_contigs_fpaths, aligned_lengths_lists,
            os.path.join(aligned_stats_dirpath, 'cumulative_plot'),
            'Cumulative length (aligned contigs)')

    # NAx and NGAx plots are requested regardless of qconfig.draw_plots.
    plotter.Nx_plot(output_dirpath, too_many_points,
                    aligned_contigs_fpaths, aligned_lengths_lists,
                    aligned_stats_dirpath + '/NAx_plot', 'NAx',
                    assembly_lengths, json_output_dir=json_output_dirpath)
    if not qconfig.is_combined_ref:
        plotter.Nx_plot(output_dirpath, too_many_points,
                        aligned_contigs_fpaths, aligned_lengths_lists,
                        aligned_stats_dirpath + '/NGAx_plot', 'NGAx',
                        [reference_length] * len(aligned_contigs_fpaths),
                        json_output_dir=json_output_dirpath)

    logger.main_info('Done.')
    return report_dict
Example #22
0
def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
    """Compute basic statistics for every assembly and draw the basic plots.

    For each file in ``contigs_fpaths`` the contig lengths, the number of
    ``N`` characters and (via ``GC_content``) the GC statistics are collected,
    N50/L50/N75/L75 (and the NG*/LG* variants when a reference length is
    known) are computed, and everything is stored in the report returned by
    ``reporting.get(contigs_fpath)``.  Length and GC data are additionally
    handed to ``html_saver`` when ``qconfig.html_report`` is set, and the
    Nx/NGx, cumulative, GC-content and coverage plots are requested from
    ``plotter``.

    Args:
        ref_fpath: path to the reference FASTA file, or a falsy value when
            there is no reference (then ``qconfig.estimated_reference_size``
            is used as the reference length, if set).
        contigs_fpaths: paths of the assembly FASTA files to process.
        output_dirpath: directory for the plots; created if it is missing.
        results_dir: directory passed to ``html_saver`` and ``plotter.Nx_plot``.

    Returns:
        None.  Results are recorded through ``reporting``, ``html_saver``,
        ``plotter`` and ``qconfig`` (``potential_scaffolds_assemblies`` and
        ``min_difference`` are updated).
    """
    import math

    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    reference_lengths = []
    reference_fragments = None
    if ref_fpath:
        reference_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(ref_fpath).values(),
            reverse=True)
        reference_fragments = len(reference_lengths)
        reference_length = sum(reference_lengths)
        reference_GC, reference_GC_distribution, reference_GC_contigs_distribution = GC_content(
            ref_fpath)

        logger.info('  Reference genome:')
        # The conditional is parenthesised so that only the GC value (and not
        # the whole message) is replaced by 'undefined' when GC is unknown.
        logger.info('    ' + os.path.basename(ref_fpath) +
                    ', length = ' + str(reference_length) +
                    ', num fragments = ' + str(reference_fragments) +
                    ', GC % = ' +
                    ('%.2f' % reference_GC if reference_GC is not None else 'undefined'))
        if reference_fragments > 30 and not qconfig.check_for_fragmented_ref:
            logger.warning(
                '  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option.'
                ' QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).'
            )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        reference_lengths = [reference_length]
        logger.info('  Estimated reference length = ' + str(reference_length))

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    # contigs_fpath -> list indexed by integer coverage, holding the total
    # length of contigs whose name carries that "_cov_<value>" coverage.
    coverage_dict = dict()
    cov_pattern = re.compile(r'_cov_(\d+\.?\d*)')
    for index, contigs_fpath in enumerate(contigs_fpaths):
        coverage = coverage_dict[contigs_fpath] = []
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        is_potential_scaffold = False
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')
            if not qconfig.scaffolds and not is_potential_scaffold and qutils.is_scaffold(
                    seq):
                # Remember each assembly at most once.
                is_potential_scaffold = True
                qconfig.potential_scaffolds_assemblies.append(assembly_label)
            cov_match = cov_pattern.search(name)
            if cov_match:
                cov = int(float(cov_match.group(1)))
                if len(coverage) <= cov:
                    # Grow the histogram so that index `cov` exists.
                    coverage.extend([0] * (cov - len(coverage) + 1))
                coverage[cov] += len(seq)

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    lists_of_lengths = [
        sorted(lengths, reverse=True) for lengths in lists_of_lengths
    ]
    num_contigs = max(
        [len(list_of_length) for list_of_length in lists_of_lengths])
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        # Too many points for the HTML report: sum every `multiplicator`
        # consecutive (sorted) lengths into one value; the final entry of
        # each list collects whatever remains.
        multiplicator = int(num_contigs / qconfig.max_points)
        max_points = num_contigs // multiplicator
        corr_lists_of_lengths = [[
            sum(list_of_length[((i - 1) * multiplicator):(i * multiplicator)])
            for i in range(1, max_points)
            if (i * multiplicator) < len(list_of_length)
        ] for list_of_length in lists_of_lengths]
        if len(reference_lengths) > 1:
            reference_lengths = [
                sum(reference_lengths[(
                    (i - 1) * multiplicator):(i * multiplicator)]) if
                (i * multiplicator) < len(reference_lengths) else sum(
                    reference_lengths[((i - 1) * multiplicator):])
                for i in range(1, max_points)
            ] + [sum(reference_lengths[(max_points - 1) * multiplicator:])]
        for corr_lengths, lengths in zip(corr_lists_of_lengths,
                                         lists_of_lengths):
            # len(corr_lengths) is evaluated before the append takes place.
            corr_lengths.append(
                sum(lengths[len(corr_lengths) * multiplicator:]))
    else:
        # The lists are already sorted; hand out independent copies.
        corr_lists_of_lengths = [
            list(lengths) for lengths in lists_of_lengths
        ]

    # Saving for an HTML report
    if qconfig.html_report:
        from quast_libs.html_saver import html_saver
        if reference_lengths:
            html_saver.save_reference_lengths(results_dir, reference_lengths)
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    list_of_GC_contigs_distributions = []
    largest_contig = 0
    from . import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution, GC_contigs_distribution = GC_content(
            contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)
        list_of_GC_contigs_distributions.append(GC_contigs_distribution)

        # None when the assembly has no bases, to avoid dividing by zero.
        if total_length != 0:
            ns_per_100kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        else:
            ns_per_100kbp = None

        # Each conditional is parenthesised so that it replaces only its own
        # value, never the whole log line.
        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' +
                    (' %.2f' % ns_per_100kbp if ns_per_100kbp is not None else 'undefined'))

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            # The lists are sorted, but max() keeps this independent of that.
            longest = max(lengths_list)
            report.add_field(reporting.Fields.LARGCONTIG, longest)
            largest_contig = max(largest_contig, longest)
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.GC,
                    ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(
                reporting.Fields.UNCALLED_PERCENT,
                ('%.2f' % ns_per_100kbp if ns_per_100kbp is not None else None))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REF_FRAGMENTS,
                             reference_fragments)
            if not qconfig.is_combined_ref:
                report.add_field(
                    reporting.Fields.REFGC,
                    ('%.2f' %
                     reference_GC if reference_GC is not None else None))
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    qconfig.min_difference = math.ceil(
        (largest_contig / 1000) / 600)  # divide on height of plot

    # Work on a copy so that appending the reference distribution does not
    # silently modify the per-assembly list.
    list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
    reference_index = None
    if ref_fpath:
        reference_index = len(list_of_GC_distributions_with_ref)
        list_of_GC_distributions_with_ref.append(reference_GC_distribution)

    if qconfig.html_report and not qconfig.no_gc:
        from quast_libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                list_of_GC_contigs_distributions,
                                reference_index)

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                    contigs_fpaths, lists_of_lengths,
                    join(output_dirpath, 'Nx_plot'), 'Nx', [])
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points,
                        contigs_fpaths, lists_of_lengths,
                        join(output_dirpath, 'NGx_plot'), 'NGx',
                        [reference_length for _ in contigs_fpaths])

    if qconfig.draw_plots:
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                join(output_dirpath, 'cumulative_plot'),
                                'Cumulative length')
        if not qconfig.no_gc:
            # Drawing GC content plot...
            plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                    list_of_GC_distributions_with_ref,
                                    join(output_dirpath, 'GC_content_plot'))
            for contigs_fpath, GC_distribution in zip(
                    contigs_fpaths, list_of_GC_contigs_distributions):
                plotter.contigs_GC_content_plot(
                    contigs_fpath, GC_distribution,
                    join(
                        output_dirpath,
                        qutils.label_from_fpath(contigs_fpath) +
                        '_GC_content_plot'))

        # Coverage histograms only make sense if at least one contig name
        # carried a "_cov_" value.
        if any(coverage_dict[contigs_fpath]
               for contigs_fpath in contigs_fpaths):
            draw_coverage_histograms(coverage_dict, contigs_fpaths,
                                     output_dirpath)

    logger.main_info('Done.')
Example #23
0
def cumulative_plot(reference, contigs_fpaths, lists_of_lengths, plot_fpath, title):
    """Draw the cumulative-length curve of every assembly and save it to disk.

    For each assembly the contig lengths are sorted in descending order and
    accumulated, so the curve starts at (0, 0).  When ``reference`` is given,
    the accumulated chromosome lengths read from that FASTA file are drawn as
    an extra curve that is extended horizontally to the largest X value.

    The figure is written to ``plot_fpath + '.' + qconfig.plot_extension`` and
    appended to ``pdf_plots_figures``.  Nothing happens when
    ``can_draw_plots`` is false.
    """
    if not can_draw_plots:
        return

    logger.info('  Drawing cumulative plot...')
    import matplotlib.pyplot
    import matplotlib.ticker

    figure = matplotlib.pyplot.figure()
    matplotlib.pyplot.rc('font', **font)
    max_x = 0
    max_y = 0

    for assembly_fpath, contig_lengths in zip(contigs_fpaths, lists_of_lengths):
        # Running totals over the contigs, longest first, with a leading 0.
        running_total = 0
        cumulative = [0]
        for contig_len in sorted(contig_lengths, reverse=True):
            running_total = running_total + contig_len
            cumulative.append(running_total)
        indices = list(range(len(cumulative)))
        # `cumulative` always holds the leading 0, so `indices` is never empty.
        max_x = max(indices[-1], max_x)
        max_y = max(max_y, cumulative[-1])
        color, ls = get_color_and_ls(assembly_fpath)
        matplotlib.pyplot.plot(indices, cumulative, color=color, lw=line_width, ls=ls)

    if reference:
        chr_lengths = sorted(
            fastaparser.get_chr_lengths_from_fastafile(reference).values(),
            reverse=True)
        ref_cumulative = []
        for chr_len in chr_lengths:
            if not ref_cumulative:
                ref_cumulative = [chr_len]
            else:
                ref_cumulative.append(ref_cumulative[-1] + chr_len)
        ref_indices = list(range(len(ref_cumulative)))
        reference_length = ref_cumulative[-1]
        max_x = max(max_x, ref_indices[-1])
        max_y = max(max_y, reference_length)
        # Extend the reference curve to the right-most X value of the plot.
        ref_cumulative.append(reference_length)
        ref_indices.append(max_x)
        matplotlib.pyplot.plot(ref_indices, ref_cumulative,
                               color=reference_color, lw=line_width, ls=reference_ls)

    if with_title:
        matplotlib.pyplot.title(title)
    matplotlib.pyplot.grid(with_grid)
    ax = matplotlib.pyplot.gca()
    # Shrink the axes' height by 20% from the bottom to make room for the legend.
    position = ax.get_position()
    ax.set_position([position.x0,
                     position.y0 + position.height * 0.2,
                     position.width,
                     position.height * 0.8])

    legend_list = []
    for fpath in contigs_fpaths:
        legend_list.append(qutils.label_from_fpath(fpath))
    if reference:
        legend_list.append('Reference')

    # Put the legend below the axes; drawing it is best-effort because some
    # matplotlib versions (<= 2009-12-09) fail here.
    try:
        ax.legend(legend_list, loc='upper center', bbox_to_anchor=(0.5, -0.1), fancybox=True,
                  shadow=True, ncol=n_columns if n_columns < 3 else 3)
    except Exception:  # e.g. ZeroDivisionError, ValueError
        pass

    ylabel, mkfunc = y_formatter('Cumulative length ', max_y)
    matplotlib.pyplot.xlabel('Contig index', fontsize=axes_fontsize)
    matplotlib.pyplot.ylabel(ylabel, fontsize=axes_fontsize)

    ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(mkfunc))

    xLocator, yLocator = get_locators()
    ax.yaxis.set_major_locator(yLocator)
    ax.xaxis.set_major_locator(xLocator)
    if logarithmic_x_scale:
        ax.set_xscale('log')

    plot_fpath = plot_fpath + '.' + qconfig.plot_extension
    matplotlib.pyplot.savefig(plot_fpath, bbox_inches='tight')
    logger.info('    saved to ' + plot_fpath)
    pdf_plots_figures.append(figure)
    matplotlib.pyplot.close()